From 9a9d42efd7c75d08324e4b4b3d78e4024683f87f Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 31 Oct 2025 10:37:45 +0100 Subject: [PATCH 001/230] initial sebs cloudflare infra, functions, config, triggers. readme in sebs/cloudflare folder for comprehensive cloudflare doc and next steps --- sebs/cloudflare/README.md | 338 ++++++++++++++++ sebs/cloudflare/__init__.py | 4 + sebs/cloudflare/cloudflare.py | 723 ++++++++++++++++++++++++++++++++++ sebs/cloudflare/config.py | 242 ++++++++++++ sebs/cloudflare/function.py | 64 +++ sebs/cloudflare/resources.py | 68 ++++ sebs/cloudflare/triggers.py | 113 ++++++ sebs/sebs.py | 4 + 8 files changed, 1556 insertions(+) create mode 100644 sebs/cloudflare/README.md create mode 100644 sebs/cloudflare/__init__.py create mode 100644 sebs/cloudflare/cloudflare.py create mode 100644 sebs/cloudflare/config.py create mode 100644 sebs/cloudflare/function.py create mode 100644 sebs/cloudflare/resources.py create mode 100644 sebs/cloudflare/triggers.py diff --git a/sebs/cloudflare/README.md b/sebs/cloudflare/README.md new file mode 100644 index 000000000..f40793f87 --- /dev/null +++ b/sebs/cloudflare/README.md @@ -0,0 +1,338 @@ +# Cloudflare Workers Implementation for SeBS + +This directory contains the implementation of Cloudflare Workers support for the SeBS (Serverless Benchmarking Suite). + +## Key Components + +### 1. 
`cloudflare.py` - Main System Implementation + +This file implements the core Cloudflare Workers platform integration, including: + +- **`create_function()`** - Creates a new Cloudflare Worker + - Checks if worker already exists + - Uploads worker script via Cloudflare API + - Adds HTTP and Library triggers + - Returns a `CloudflareWorker` instance + +- **`cached_function()`** - Handles cached functions + - Refreshes triggers and logging handlers for functions retrieved from cache + +- **`update_function()`** - Updates an existing worker + - Uploads new script content + - Updates worker configuration + +- **`update_function_configuration()`** - Updates worker configuration + - Note: Cloudflare Workers have limited runtime configuration compared to AWS Lambda or Azure Functions + - Memory and CPU time limits are managed by Cloudflare + +- **`package_code()`** - Prepares code for deployment + - Packages JavaScript/Node.js code for worker deployment + - Returns package path and size + +### 2. `function.py` - CloudflareWorker Class + +Represents a Cloudflare Worker function with: +- Worker name and script ID +- Runtime information +- Serialization/deserialization for caching +- Account ID association + +### 3. `config.py` - Configuration Classes + +Contains three main classes: + +- **`CloudflareCredentials`** - Authentication credentials + - Supports API token or email + API key + - Requires account ID + - Can be loaded from environment variables or config file + +- **`CloudflareResources`** - Platform resources + - KV namespace IDs + - Storage bucket mappings + - Resource ID management + +- **`CloudflareConfig`** - Overall configuration + - Combines credentials and resources + - Handles serialization to/from cache + +### 4. 
`triggers.py` - Trigger Implementations + +- **`LibraryTrigger`** - Programmatic invocation via Cloudflare API +- **`HTTPTrigger`** - HTTP invocation via worker URLs + - Workers are automatically accessible at `https://{name}.{account}.workers.dev` + +These triggers define how SeBS invokes serverless functions, using either the library trigger or the HTTP trigger. + +### 5. `resources.py` - System Resources + +Handles Cloudflare-specific resources like KV namespaces and R2 storage. This defines how SeBS uploads benchmarking resources and cleans up before/after the benchmark. It is different from the benchmark wrapper, which provides the functions for the benchmark itself to perform storage operations. + +## Usage +### Environment Variables + +Set the following environment variables: + +```bash +# Option 1: Using API Token (recommended) +export CLOUDFLARE_API_TOKEN="your-api-token" +export CLOUDFLARE_ACCOUNT_ID="your-account-id" + +# Option 2: Using Email + API Key +export CLOUDFLARE_EMAIL="your-email@example.com" +export CLOUDFLARE_API_KEY="your-global-api-key" +export CLOUDFLARE_ACCOUNT_ID="your-account-id" +``` + +### Configuration File + +Alternatively, create a configuration file: + +```json +{ + "cloudflare": { + "credentials": { + "api_token": "your-api-token", + "account_id": "your-account-id" + }, + "resources": { + "resources_id": "unique-resource-id" + } + } +} +``` + +### Current Limitations + +- **Container Deployment**: Not currently implemented + - *Note*: Cloudflare recently added container support (October 2024) + - Current implementation only supports script-based deployment + - Container support would require: + - Creating `CloudflareContainer` class (similar to AWS ECR) + - Container registry integration + - Dockerfile templates for each language + - Updates to `package_code()` and `create_function()` methods +- **Cold Start Enforcement**: Not available (Workers are instantiated on-demand at edge locations) +- **Per-Invocation Metrics**: Limited 
(Cloudflare provides aggregated analytics) +- **Language Support**: Currently JavaScript/Node.js (Python support via Pyodide is experimental) + - Container support would enable any containerized language +- **Memory/Timeout Configuration**: Fixed by Cloudflare (128MB memory, 50ms CPU time on free tier) + +### Future Enhancements + +#### High Priority +- [ ] **Container Deployment Support** + - Cloudflare now supports container-based Workers (as of October 2024) + - Would enable multi-language support (Python, Java, Go, Rust, etc.) + - Requires implementing `CloudflareContainer` class + - Need Cloudflare container registry integration + - See [implementation notes](#container-support-architecture) below +- [ ] **Add Storage Resources** + - SeBS needs two levels of storage resources: main storage and NoSQL storage. + - For main storage, Cloudflare R2 comes to mind. + - For NoSQL storage, either D1 or Durable Objects come to mind. Whichever is chosen must also be used by the benchmark wrapper, so that both sides stay consistent. + +## Metrics Collection with Analytics Engine + +### Overview + +Cloudflare Workers metrics are collected using **Analytics Engine**, which provides **per-invocation performance data** similar to AWS CloudWatch Logs or Azure Application Insights. Unlike the GraphQL Analytics API (which only provides aggregated metrics), Analytics Engine allows workers to write custom data points during execution that can be queried later. + +### Why Analytics Engine? + +| Feature | Analytics Engine | GraphQL Analytics API | +|---------|-----------------|----------------------| +| **Data Granularity** | ✅ Per-invocation | ❌ Aggregated only | +| **Request ID Matching** | ✅ Direct correlation | ❌ Not possible | +| **Cold Start Detection** | ✅ Per-request | ❌ Average only | +| **SeBS Compatibility** | ✅ Full support | ❌ Limited | +| **Cost** | Free (10M writes/month) | Free | +| **Plan Requirement** | Paid plan ($5/month) | Any plan | + +### How It Works + +1. 
**Worker Execution**: During each invocation, the worker writes a data point to Analytics Engine with: + - Request ID (for correlation with SeBS) + - CPU time and wall time + - Cold/warm start indicator + - Success/error status + +2. **Metrics Query**: After benchmark execution, SeBS queries Analytics Engine using SQL: + - Retrieves all data points for the time period + - Matches request IDs to `ExecutionResult` objects + - Populates provider metrics (CPU time, cold starts, etc.) + +3. **Data Enrichment**: Each `ExecutionResult` is enriched with: + - `provider_times.execution` - CPU time in microseconds + - `stats.cold_start` - True/False for cold start + - `billing.billed_time` - Billable CPU time + - `billing.gb_seconds` - GB-seconds for cost calculation + +### Implementation Requirements + +#### 1. Analytics Engine Binding + +```python +# In cloudflare.py - automatically configured +self._bind_analytics_engine(worker_name, account_id) +``` + +#### 2. Benchmark Wrapper + +Benchmark wrappers must write data points during execution. The wrapper code looks like: + +```javascript +export default { + async fetch(request, env, ctx) { + const requestId = request.headers.get('x-request-id') || crypto.randomUUID(); + const startTime = Date.now(); + const startCpu = performance.now(); + + try { + // Execute benchmark + const result = await benchmarkHandler(request, env, ctx); + + // Write metrics to Analytics Engine + if (env.ANALYTICS) { + env.ANALYTICS.writeDataPoint({ + indexes: [requestId, result.is_cold ? 
'cold' : 'warm'], + doubles: [Date.now() - startTime, performance.now() - startCpu, 0, 0], + blobs: [request.url, 'success', '', ''] + }); + } + + return new Response(JSON.stringify({...result, request_id: requestId})); + } catch (error) { + // Write error metrics + if (env.ANALYTICS) { + env.ANALYTICS.writeDataPoint({ + indexes: [requestId, 'error'], + doubles: [Date.now() - startTime, performance.now() - startCpu, 0, 0], + blobs: [request.url, 'error', error.message, ''] + }); + } + throw error; + } + } +}; +``` + +#### 3. Data Schema + +Analytics Engine data points use this schema: + +| Field | Type | Purpose | Example | +|-------|------|---------|---------| +| `index1` | String | Request ID | `"req-abc-123"` | +| `index2` | String | Cold/Warm (or `"error"` when the handler throws) | `"cold"`, `"warm"` or `"error"` | +| `double1` | Float | Wall time (ms) | `45.2` | +| `double2` | Float | CPU time (ms) | `12.8` | +| `blob1` | String | Request URL | `"https://worker.dev"` | +| `blob2` | String | Status | `"success"` or `"error"` | +| `blob3` | String | Error message | `""` or error text | + +### Query Process + +When `download_metrics()` is called, SeBS: + +1. **Builds SQL Query**: Creates a ClickHouse SQL query for the time range +2. **Executes Query**: POSTs to Analytics Engine SQL API +3. **Parses Results**: Parses newline-delimited JSON response +4. **Matches Request IDs**: Correlates data points with tracked invocations +5. **Populates Metrics**: Enriches `ExecutionResult` objects with provider data + +Example SQL query: + +```sql +SELECT + index1 as request_id, + index2 as cold_warm, + double1 as wall_time_ms, + double2 as cpu_time_ms, + blob2 as status, + timestamp +FROM ANALYTICS_DATASET +WHERE timestamp >= toDateTime('2025-10-27 10:00:00') + AND timestamp <= toDateTime('2025-10-27 11:00:00') + AND blob1 LIKE '%worker-name%' +ORDER BY timestamp ASC +``` + +### Limitations + +1. **Delay**: Typically 30-60 seconds for data to appear in Analytics Engine +2. 
**Wrapper Updates**: All benchmark wrappers must be updated to write data points + +### Troubleshooting + +**Missing Metrics**: +- Check that worker has Analytics Engine binding configured +- Verify wrapper is writing data points (check `env.ANALYTICS`) +- Wait 60+ seconds after invocation for ingestion +- Check SQL query matches worker URL pattern + +**Unmatched Request IDs**: +- Ensure wrapper returns `request_id` in response +- Verify SeBS is tracking request IDs correctly +- Check timestamp range covers all invocations + +**Query Failures**: +- Verify account has Analytics Engine enabled (Paid plan) +- Check API token has analytics read permissions +- Validate SQL syntax (ClickHouse format) + +### References + +- [Analytics Engine Documentation](https://developers.cloudflare.com/analytics/analytics-engine/) +- [Analytics Engine SQL API](https://developers.cloudflare.com/analytics/analytics-engine/sql-api/) +- [Workers Bindings](https://developers.cloudflare.com/workers/configuration/bindings/) +- See `ANALYTICS_ENGINE_IMPLEMENTATION.md` for complete implementation details + +#### Standard Priority +- [ ] Support for Cloudflare Workers KV (key-value storage) +- [ ] Support for Cloudflare R2 (object storage) +- [ ] Support for Durable Objects +- [ ] Wrangler CLI integration for better bundling +- [ ] WebAssembly/Rust worker support + +--- + +## Container Support Architecture + +### Overview + +Cloudflare recently introduced container support for Workers, enabling deployment of containerized applications. Adding this to SeBS would require the following components: + +### Required Components + +1. **Container Client** (`container.py`) + - Extends `sebs.faas.container.DockerContainer` + - Manages container image builds and registry operations + - Similar to `sebs/aws/container.py` for ECR + +2. **Registry Integration** + - Cloudflare Container Registry authentication + - Image push/pull operations + - Support for external registries (Docker Hub, etc.) + +3. 
**Dockerfile Templates** + - Create `/dockerfiles/cloudflare/{language}/Dockerfile.function` + - Support for Node.js, Python, and other languages + +4. **Updated Methods** + - `package_code()`: Add container build path alongside script packaging + - `create_function()`: Handle both script and container deployments + - `update_function()`: Support updating container-based workers + +### Benefits + +- **Multi-language Support**: Deploy Python, Java, Go, Rust workers +- **Complex Dependencies**: Support system libraries and compiled extensions +- **Larger Code Packages**: Overcome script size limitations +- **Consistent Environments**: Same container locally and in production + + +## References + +- [Cloudflare Workers Documentation](https://developers.cloudflare.com/workers/) +- [Cloudflare API Documentation](https://api.cloudflare.com/) +- [Workers API Reference](https://developers.cloudflare.com/workers/runtime-apis/) diff --git a/sebs/cloudflare/__init__.py b/sebs/cloudflare/__init__.py new file mode 100644 index 000000000..5a2c557d3 --- /dev/null +++ b/sebs/cloudflare/__init__.py @@ -0,0 +1,4 @@ +from sebs.cloudflare.cloudflare import Cloudflare +from sebs.cloudflare.config import CloudflareConfig + +__all__ = ["Cloudflare", "CloudflareConfig"] diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py new file mode 100644 index 000000000..ce2cc25ea --- /dev/null +++ b/sebs/cloudflare/cloudflare.py @@ -0,0 +1,723 @@ +import os +import shutil +import json +from typing import cast, Dict, List, Optional, Tuple, Type + +import docker +import requests + +from sebs.cloudflare.config import CloudflareConfig +from sebs.cloudflare.function import CloudflareWorker +from sebs.cloudflare.resources import CloudflareSystemResources +from sebs.benchmark import Benchmark +from sebs.cache import Cache +from sebs.config import SeBSConfig +from sebs.utils import LoggingHandlers +from sebs.faas.function import Function, ExecutionResult, Trigger, FunctionConfig +from 
sebs.faas.system import System + + +class Cloudflare(System): + """ + Cloudflare Workers serverless platform implementation. + + Cloudflare Workers run on Cloudflare's edge network, providing + low-latency serverless execution globally. + """ + + _config: CloudflareConfig + + @staticmethod + def name(): + return "cloudflare" + + @staticmethod + def typename(): + return "Cloudflare" + + @staticmethod + def function_type() -> "Type[Function]": + return CloudflareWorker + + @property + def config(self) -> CloudflareConfig: + return self._config + + def __init__( + self, + sebs_config: SeBSConfig, + config: CloudflareConfig, + cache_client: Cache, + docker_client: docker.client, + logger_handlers: LoggingHandlers, + ): + super().__init__( + sebs_config, + cache_client, + docker_client, + CloudflareSystemResources(config, cache_client, docker_client, logger_handlers), + ) + self.logging_handlers = logger_handlers + self._config = config + self._api_base_url = "https://api.cloudflare.com/client/v4" + + def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): + """ + Initialize the Cloudflare Workers platform. 
+ + Args: + config: Additional configuration parameters + resource_prefix: Prefix for resource naming + """ + # Verify credentials are valid + self._verify_credentials() + self.initialize_resources(select_prefix=resource_prefix) + + def _verify_credentials(self): + """Verify that the Cloudflare API credentials are valid.""" + headers = self._get_auth_headers() + response = requests.get(f"{self._api_base_url}/user/tokens/verify", headers=headers) + + if response.status_code != 200: + raise RuntimeError( + f"Failed to verify Cloudflare credentials: {response.status_code} - {response.text}" + ) + + self.logging.info("Cloudflare credentials verified successfully") + + def _get_auth_headers(self) -> Dict[str, str]: + """Get authentication headers for Cloudflare API requests.""" + if self.config.credentials.api_token: + return { + "Authorization": f"Bearer {self.config.credentials.api_token}", + "Content-Type": "application/json", + } + elif self.config.credentials.email and self.config.credentials.api_key: + return { + "X-Auth-Email": self.config.credentials.email, + "X-Auth-Key": self.config.credentials.api_key, + "Content-Type": "application/json", + } + else: + raise RuntimeError("Invalid Cloudflare credentials configuration") + + def package_code( + self, + directory: str, + language_name: str, + language_version: str, + architecture: str, + benchmark: str, + is_cached: bool, + container_deployment: bool, + ) -> Tuple[str, int, str]: + """ + Package code for Cloudflare Workers deployment. + + Cloudflare Workers support JavaScript/TypeScript and use a bundler + to create a single JavaScript file for deployment. 
+ + Args: + directory: Path to the code directory + language_name: Programming language name + language_version: Programming language version + architecture: Target architecture (not used for Workers) + benchmark: Benchmark name + is_cached: Whether the code is cached + container_deployment: Whether to deploy as container (not supported) + + Returns: + Tuple of (package_path, package_size, container_uri) + """ + if container_deployment: + raise NotImplementedError( + "Container deployment is not supported for Cloudflare Workers" + ) + + # For now, we'll create a simple package structure + # In a full implementation, you'd use a bundler like esbuild or webpack + + CONFIG_FILES = { + "nodejs": ["handler.js", "package.json", "node_modules"], + # Python support via Python Workers is limited + "python": ["handler.py", "requirements.txt"], + } + + if language_name not in CONFIG_FILES: + raise NotImplementedError( + f"Language {language_name} is not yet supported for Cloudflare Workers" + ) + + package_config = CONFIG_FILES[language_name] + + # Create a worker directory with the necessary files + worker_dir = os.path.join(directory, "worker") + os.makedirs(worker_dir, exist_ok=True) + + # Copy all files to worker directory + for file in os.listdir(directory): + if file not in package_config and file != "worker": + src = os.path.join(directory, file) + dst = os.path.join(worker_dir, file) + if os.path.isfile(src): + shutil.copy2(src, dst) + elif os.path.isdir(src): + shutil.copytree(src, dst, dirs_exist_ok=True) + + # For now, return the main handler file as the package + handler_file = "handler.js" if language_name == "nodejs" else "handler.py" + package_path = os.path.join(directory, handler_file) + + if not os.path.exists(package_path): + raise RuntimeError(f"Handler file {handler_file} not found in {directory}") + + bytes_size = os.path.getsize(package_path) + mbytes = bytes_size / 1024.0 / 1024.0 + self.logging.info(f"Worker package size: {mbytes:.2f} MB") + + return 
(package_path, bytes_size, "") + + def create_function( + self, + code_package: Benchmark, + func_name: str, + container_deployment: bool, + container_uri: str, + ) -> CloudflareWorker: + """ + Create a new Cloudflare Worker. + + If a worker with the same name already exists, it will be updated. + + Args: + code_package: Benchmark containing the function code + func_name: Name of the worker + container_deployment: Whether to deploy as container (not supported) + container_uri: URI of container image (not used) + + Returns: + CloudflareWorker instance + """ + if container_deployment: + raise NotImplementedError( + "Container deployment is not supported for Cloudflare Workers" + ) + + package = code_package.code_location + benchmark = code_package.benchmark + language = code_package.language_name + language_runtime = code_package.language_version + function_cfg = FunctionConfig.from_benchmark(code_package) + + func_name = self.format_function_name(func_name) + account_id = self.config.credentials.account_id + + if not account_id: + raise RuntimeError("Cloudflare account ID is required to create workers") + + # Check if worker already exists + existing_worker = self._get_worker(func_name, account_id) + + if existing_worker: + self.logging.info(f"Worker {func_name} already exists, updating it") + worker = CloudflareWorker( + func_name, + code_package.benchmark, + func_name, # script_id is the same as name + code_package.hash, + language_runtime, + function_cfg, + account_id, + ) + self.update_function(worker, code_package, container_deployment, container_uri) + worker.updated_code = True + else: + self.logging.info(f"Creating new worker {func_name}") + + # Read the worker script + with open(package, 'r') as f: + script_content = f.read() + + # Create the worker + self._create_or_update_worker(func_name, script_content, account_id) + + worker = CloudflareWorker( + func_name, + code_package.benchmark, + func_name, + code_package.hash, + language_runtime, + function_cfg, 
+ account_id, + ) + + # Add LibraryTrigger and HTTPTrigger + from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger + + library_trigger = LibraryTrigger(func_name, self) + library_trigger.logging_handlers = self.logging_handlers + worker.add_trigger(library_trigger) + + # Cloudflare Workers are automatically accessible via HTTPS + worker_url = f"https://{func_name}.{account_id}.workers.dev" + http_trigger = HTTPTrigger(func_name, worker_url) + http_trigger.logging_handlers = self.logging_handlers + worker.add_trigger(http_trigger) + + return worker + + def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: + """Get information about an existing worker.""" + headers = self._get_auth_headers() + url = f"{self._api_base_url}/accounts/{account_id}/workers/scripts/{worker_name}" + + response = requests.get(url, headers=headers) + + if response.status_code == 200: + return response.json().get("result") + elif response.status_code == 404: + return None + else: + raise RuntimeError( + f"Failed to check worker existence: {response.status_code} - {response.text}" + ) + + def _create_or_update_worker( + self, worker_name: str, script_content: str, account_id: str + ) -> dict: + """Create or update a Cloudflare Worker.""" + headers = self._get_auth_headers() + # Remove Content-Type as we're sending form data + headers.pop("Content-Type", None) + + url = f"{self._api_base_url}/accounts/{account_id}/workers/scripts/{worker_name}" + + # Cloudflare Workers API expects the script as form data + files = { + 'script': ('worker.js', script_content, 'application/javascript'), + } + + response = requests.put(url, headers=headers, files=files) + + if response.status_code not in [200, 201]: + raise RuntimeError( + f"Failed to create/update worker: {response.status_code} - {response.text}" + ) + + return response.json().get("result", {}) + + def cached_function(self, function: Function): + """ + Handle a function retrieved from cache. 
+ + Refreshes triggers and logging handlers. + + Args: + function: The cached function + """ + from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger + + for trigger in function.triggers(Trigger.TriggerType.LIBRARY): + trigger.logging_handlers = self.logging_handlers + cast(LibraryTrigger, trigger).deployment_client = self + + for trigger in function.triggers(Trigger.TriggerType.HTTP): + trigger.logging_handlers = self.logging_handlers + + def update_function( + self, + function: Function, + code_package: Benchmark, + container_deployment: bool, + container_uri: str, + ): + """ + Update an existing Cloudflare Worker. + + Args: + function: Existing function instance to update + code_package: New benchmark containing the function code + container_deployment: Whether to deploy as container (not supported) + container_uri: URI of container image (not used) + """ + if container_deployment: + raise NotImplementedError( + "Container deployment is not supported for Cloudflare Workers" + ) + + worker = cast(CloudflareWorker, function) + package = code_package.code_location + + # Read the updated script + with open(package, 'r') as f: + script_content = f.read() + + # Update the worker + account_id = worker.account_id or self.config.credentials.account_id + if not account_id: + raise RuntimeError("Account ID is required to update worker") + + self._create_or_update_worker(worker.name, script_content, account_id) + self.logging.info(f"Updated worker {worker.name}") + + # Update configuration if needed + self.update_function_configuration(worker, code_package) + + def update_function_configuration( + self, cached_function: Function, benchmark: Benchmark + ): + """ + Update the configuration of a Cloudflare Worker. + + Note: Cloudflare Workers have limited configuration options compared + to traditional FaaS platforms. Memory and timeout are managed by Cloudflare. 
+ + Args: + cached_function: The function to update + benchmark: The benchmark with new configuration + """ + # Cloudflare Workers have fixed resource limits: + # - CPU time: 50ms (free), 50ms-30s (paid) + # - Memory: 128MB + # Most configuration is handled via wrangler.toml or API settings + + worker = cast(CloudflareWorker, cached_function) + + # For environment variables or KV namespaces, we would use the API here + # For now, we'll just log that configuration update was requested + self.logging.info( + f"Configuration update requested for worker {worker.name}. " + "Note: Cloudflare Workers have limited runtime configuration options." + ) + + def default_function_name(self, code_package: Benchmark, resources=None) -> str: + """ + Generate a default function name for Cloudflare Workers. + + Args: + code_package: The benchmark package + resources: Optional resources (not used) + + Returns: + Default function name + """ + # Cloudflare Worker names must be lowercase and can contain hyphens + return ( + f"{code_package.benchmark}-{code_package.language_name}-" + f"{code_package.language_version.replace('.', '')}" + ).lower() + + @staticmethod + def format_function_name(name: str) -> str: + """ + Format a function name to comply with Cloudflare Worker naming rules. + + Worker names must: + - Be lowercase + - Contain only alphanumeric characters and hyphens + - Not start or end with a hyphen + + Args: + name: The original name + + Returns: + Formatted name + """ + # Convert to lowercase and replace invalid characters + formatted = name.lower().replace('_', '-').replace('.', '-') + # Remove any characters that aren't alphanumeric or hyphen + formatted = ''.join(c for c in formatted if c.isalnum() or c == '-') + # Remove leading/trailing hyphens + formatted = formatted.strip('-') + return formatted + + def enforce_cold_start(self, functions: List[Function], code_package: Benchmark): + """ + Enforce cold start for Cloudflare Workers. 
def download_metrics(
    self,
    function_name: str,
    start_time: int,
    end_time: int,
    requests: Dict[str, "ExecutionResult"],
    metrics: dict,
):
    """
    Download per-invocation metrics from Cloudflare Analytics Engine.

    Asks ``self._query_analytics_engine`` for the data points recorded in the
    given time window, copies CPU time, cold/warm state and billing figures
    into the matching ``ExecutionResult`` objects, and stores an aggregated
    summary under ``metrics['cloudflare']``.

    Every failure is logged and swallowed on purpose: provider metrics are
    best-effort and must not abort an experiment.

    Note: ``requests`` is the mapping of tracked invocations; inside this
    method it shadows any module of the same name.

    Args:
        function_name: Name of the worker
        start_time: Start time (Unix timestamp in seconds)
        end_time: End time (Unix timestamp in seconds)
        requests: Dict mapping request_id -> ExecutionResult (updated in place)
        metrics: Dict to store aggregated metrics (updated in place)
    """
    if not requests:
        self.logging.warning("No requests to download metrics for")
        return

    account_id = self.config.credentials.account_id
    if not account_id:
        self.logging.error("Account ID required to download metrics")
        return

    self.logging.info(
        f"Downloading Analytics Engine metrics for {len(requests)} invocations "
        f"of worker {function_name}"
    )

    try:
        # Query Analytics Engine for per-invocation metrics
        metrics_data = self._query_analytics_engine(
            account_id, start_time, end_time, function_name
        )

        if not metrics_data:
            self.logging.warning(
                "No metrics data returned from Analytics Engine. "
                "Ensure the worker has Analytics Engine binding configured "
                "and is writing data points during execution."
            )
            return

        # A set (not a counter) so that several data points carrying the same
        # request id cannot make "matched" exceed the number of requests.
        matched_ids = set()
        unmatched_metrics = 0

        for row in metrics_data:
            request_id = row.get('request_id')
            if not request_id:
                continue

            result = requests.get(request_id)
            if result is None:
                unmatched_metrics += 1
                continue

            # A missing or null value counts as 0 instead of raising and
            # discarding the metrics of every other invocation.
            cpu_time_ms = float(row.get('cpu_time_ms') or 0)
            cpu_time_us = int(cpu_time_ms * 1000)  # ms -> μs

            result.provider_times.execution = cpu_time_us
            result.provider_times.initialization = 0  # Not separately tracked

            result.stats.cold_start = (row.get('cold_warm') == 'cold')
            result.stats.memory_used = 128.0  # Cloudflare Workers: fixed 128MB

            # Cloudflare billing: $0.50 per million requests +
            # $12.50 per million GB-seconds of CPU time
            result.billing.memory = 128
            result.billing.billed_time = cpu_time_us

            # GB-seconds calculation: (128MB / 1024MB/GB) * (cpu_time_ms / 1000ms/s)
            gb_seconds = (128.0 / 1024.0) * (cpu_time_ms / 1000.0)
            result.billing.gb_seconds = int(gb_seconds * 1000000)  # micro GB-seconds

            matched_ids.add(request_id)

        matched = len(matched_ids)

        if matched > 0:
            # Aggregate over matched invocations only: unmatched requests may
            # carry values that did not come from Analytics Engine.
            cpu_times = [
                requests[rid].provider_times.execution
                for rid in matched_ids
                if requests[rid].provider_times.execution > 0
            ]
            cold_starts = sum(
                1 for rid in matched_ids if requests[rid].stats.cold_start
            )

            metrics['cloudflare'] = {
                'total_invocations': len(metrics_data),
                'matched_invocations': matched,
                'unmatched_invocations': len(requests) - matched,
                'unmatched_metrics': unmatched_metrics,
                'cold_starts': cold_starts,
                'warm_starts': matched - cold_starts,
                'data_source': 'analytics_engine',
                'note': 'Per-invocation metrics from Analytics Engine'
            }

            if cpu_times:
                metrics['cloudflare']['avg_cpu_time_us'] = sum(cpu_times) // len(cpu_times)
                metrics['cloudflare']['min_cpu_time_us'] = min(cpu_times)
                metrics['cloudflare']['max_cpu_time_us'] = max(cpu_times)

        self.logging.info(
            f"Analytics Engine metrics: matched {matched}/{len(requests)} invocations"
        )

        if matched < len(requests):
            missing = len(requests) - matched
            self.logging.warning(
                f"{missing} invocations not found in Analytics Engine. "
                "This may be due to:\n"
                "  - Analytics Engine ingestion delay (typically <60s)\n"
                "  - Worker not writing data points correctly\n"
                "  - Analytics Engine binding not configured"
            )

        if unmatched_metrics > 0:
            self.logging.warning(
                f"{unmatched_metrics} metrics found in Analytics Engine "
                "that don't match tracked request IDs (possibly from other sources)"
            )

    except Exception as e:
        # Deliberate best-effort boundary: client-side timings remain usable.
        self.logging.error(f"Failed to download metrics: {e}")
        self.logging.warning(
            "Continuing without Analytics Engine metrics. "
            "Client-side timing data is still available."
        )
+ The worker must write data points with the following schema: + - index1: request_id (unique identifier) + - index2: cold_warm ("cold" or "warm") + - double1: wall_time_ms (wall clock time in milliseconds) + - double2: cpu_time_ms (CPU time in milliseconds) + - blob1: url (request URL) + - blob2: status ("success" or "error") + - blob3: error_message (if applicable) + + Args: + account_id: Cloudflare account ID + start_time: Unix timestamp (seconds) + end_time: Unix timestamp (seconds) + script_name: Worker script name + + Returns: + List of metric data points, one per invocation + """ + headers = self._get_auth_headers() + url = f"{self._api_base_url}/accounts/{account_id}/analytics_engine/sql" + + # Convert Unix timestamps to DateTime format for ClickHouse + from datetime import datetime + start_dt = datetime.utcfromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S') + end_dt = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S') + + # SQL query for Analytics Engine + # Note: Analytics Engine uses ClickHouse SQL syntax + sql_query = f""" + SELECT + index1 as request_id, + index2 as cold_warm, + double1 as wall_time_ms, + double2 as cpu_time_ms, + blob1 as url, + blob2 as status, + blob3 as error_message, + timestamp + FROM ANALYTICS_DATASET + WHERE timestamp >= toDateTime('{start_dt}') + AND timestamp <= toDateTime('{end_dt}') + AND blob1 LIKE '%{script_name}%' + ORDER BY timestamp ASC + """ + + try: + # Analytics Engine SQL API returns newline-delimited JSON + response = requests.post( + url, + headers=headers, + data=sql_query, + timeout=30 + ) + + if response.status_code == 200: + # Parse newline-delimited JSON response + results = [] + for line in response.text.strip().split('\n'): + if line: + try: + results.append(json.loads(line)) + except json.JSONDecodeError: + self.logging.warning(f"Failed to parse Analytics Engine line: {line}") + + self.logging.info(f"Retrieved {len(results)} data points from Analytics Engine") + return results + else: + 
class CloudflareCredentials(Credentials):
    """
    Credentials used to authenticate against the Cloudflare API.

    Holds either an API token, or an email address together with a global
    API key, plus the account ID. Only the account ID is ever serialized or
    written to the cache; the secrets stay in memory.
    """

    def __init__(self, api_token: Optional[str] = None, email: Optional[str] = None,
                 api_key: Optional[str] = None, account_id: Optional[str] = None):
        super().__init__()

        self._api_token = api_token
        self._email = email
        self._api_key = api_key
        self._account_id = account_id

    @staticmethod
    def typename() -> str:
        return "Cloudflare.Credentials"

    @property
    def api_token(self) -> Optional[str]:
        return self._api_token

    @property
    def email(self) -> Optional[str]:
        return self._email

    @property
    def api_key(self) -> Optional[str]:
        return self._api_key

    @property
    def account_id(self) -> Optional[str]:
        return self._account_id

    @staticmethod
    def initialize(dct: dict) -> "CloudflareCredentials":
        """Build credentials from a plain dictionary; absent keys become None."""
        token = dct.get("api_token")
        email = dct.get("email")
        key = dct.get("api_key")
        account = dct.get("account_id")
        return CloudflareCredentials(token, email, key, account)

    @staticmethod
    def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Credentials:
        """
        Resolve credentials from user config or environment variables.

        Lookup order: the ``credentials`` entry of ``config``, then
        ``CLOUDFLARE_API_TOKEN``, then the ``CLOUDFLARE_EMAIL`` /
        ``CLOUDFLARE_API_KEY`` pair. The cached account ID is used only to
        detect a mismatch with the resolved credentials.

        Raises:
            RuntimeError: no credentials were found, or the resolved account
                ID differs from the one stored in the cache.
        """
        cached = cache.get_config("cloudflare")

        # Account ID remembered from an earlier run, if there was one
        known_account: Optional[str] = None
        if cached and "credentials" in cached:
            known_account = cached["credentials"].get("account_id")

        env = os.environ
        creds: CloudflareCredentials
        if "credentials" in config:
            creds = CloudflareCredentials.initialize(config["credentials"])
        elif "CLOUDFLARE_API_TOKEN" in env:
            creds = CloudflareCredentials(
                api_token=env["CLOUDFLARE_API_TOKEN"],
                account_id=env.get("CLOUDFLARE_ACCOUNT_ID"),
            )
        elif "CLOUDFLARE_EMAIL" in env and "CLOUDFLARE_API_KEY" in env:
            creds = CloudflareCredentials(
                email=env["CLOUDFLARE_EMAIL"],
                api_key=env["CLOUDFLARE_API_KEY"],
                account_id=env.get("CLOUDFLARE_ACCOUNT_ID"),
            )
        else:
            raise RuntimeError(
                "Cloudflare login credentials are missing! Please set "
                "up environmental variables CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID, "
                "or CLOUDFLARE_EMAIL, CLOUDFLARE_API_KEY, and CLOUDFLARE_ACCOUNT_ID"
            )

        # Refuse to mix a cache created for one account with another account
        both_known = known_account is not None and creds.account_id is not None
        if both_known and known_account != creds.account_id:
            creds.logging.error(
                f"The account id {creds.account_id} from provided credentials is different "
                f"from the account id {known_account} found in the cache! Please change "
                "your cache directory or create a new one!"
            )
            raise RuntimeError(
                f"Cloudflare login credentials do not match the account {known_account} in cache!"
            )

        creds.logging_handlers = handlers
        return creds

    def update_cache(self, cache: Cache):
        """Store the account ID in the cache; nothing is written when it is unset."""
        if not self._account_id:
            return
        cache.update_config(
            val=self._account_id,
            keys=["cloudflare", "credentials", "account_id"],
        )

    def serialize(self) -> dict:
        """Return the serializable part of the credentials (account ID only)."""
        return {"account_id": self._account_id} if self._account_id else {}
class CloudflareConfig(Config):
    """
    Top-level configuration of the Cloudflare Workers platform.

    Bundles the credentials and resources objects and records the region.
    """

    def __init__(self, credentials: CloudflareCredentials, resources: CloudflareResources):
        super().__init__(name="cloudflare")
        self._credentials = credentials
        self._resources = resources

    @staticmethod
    def typename() -> str:
        return "Cloudflare.Config"

    @property
    def credentials(self) -> CloudflareCredentials:
        return self._credentials

    @property
    def resources(self) -> CloudflareResources:
        return self._resources

    @staticmethod
    def initialize(cfg: Config, dct: dict):
        """Copy the region from ``dct`` into ``cfg``, defaulting to "global"."""
        target = cast(CloudflareConfig, cfg)
        # Workers are deployed globally, so a missing region is not an error
        target._region = dct.get("region", "global")

    @staticmethod
    def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config:
        """
        Build the configuration from the cache when present, else from ``config``.

        Credentials and resources are deserialized by their own classes; the
        resulting region is also assigned to the resources object.
        """
        cached = cache.get_config("cloudflare")
        creds = cast(
            CloudflareCredentials,
            CloudflareCredentials.deserialize(config, cache, handlers),
        )
        res = cast(
            CloudflareResources,
            CloudflareResources.deserialize(config, cache, handlers),
        )

        result = CloudflareConfig(creds, res)
        result.logging_handlers = handlers

        # A cached configuration takes precedence over the user-provided one
        if cached:
            message, source = "Using cached config for Cloudflare", cached
        else:
            message, source = "Using user-provided config for Cloudflare", config
        result.logging.info(message)
        CloudflareConfig.initialize(result, source)

        res.region = result.region
        return result

    def update_cache(self, cache: Cache):
        """Write the region, then credentials and resources, to the cache."""
        cache.update_config(val=self.region, keys=["cloudflare", "region"])
        for part in (self.credentials, self.resources):
            part.update_cache(cache)

    def serialize(self) -> dict:
        """Return the configuration as a plain dictionary."""
        return {
            "name": "cloudflare",
            "region": self._region,
            "credentials": self._credentials.serialize(),
            "resources": self._resources.serialize(),
        }
class CloudflareWorker(Function):
    """
    A function deployed as a Cloudflare Worker.

    In addition to the state kept by ``Function``, a worker stores its script
    ID, its runtime and, optionally, the owning account ID.
    """

    def __init__(
        self,
        name: str,
        benchmark: str,
        script_id: str,
        code_package_hash: str,
        runtime: str,
        cfg: FunctionConfig,
        account_id: Optional[str] = None,
    ):
        super().__init__(benchmark, name, code_package_hash, cfg)
        self.script_id = script_id
        self.runtime = runtime
        self.account_id = account_id

    @staticmethod
    def typename() -> str:
        return "Cloudflare.Worker"

    def serialize(self) -> dict:
        """Extend the base serialization with the worker-specific fields."""
        data = dict(super().serialize())
        data["script_id"] = self.script_id
        data["runtime"] = self.runtime
        data["account_id"] = self.account_id
        return data

    @staticmethod
    def deserialize(cached_config: dict) -> "CloudflareWorker":
        """
        Recreate a worker, including its triggers, from a cached dictionary.

        Only "Library" and "HTTP" trigger types are recognized; any other
        type fails the assertion below.
        """
        # Imported here rather than at module level, as in the original.
        from sebs.faas.function import Trigger
        from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger

        function_cfg = FunctionConfig.deserialize(cached_config["config"])
        worker = CloudflareWorker(
            cached_config["name"],
            cached_config["benchmark"],
            cached_config["script_id"],
            cached_config["hash"],
            cached_config["runtime"],
            function_cfg,
            cached_config.get("account_id"),
        )

        known_triggers = {"Library": LibraryTrigger, "HTTP": HTTPTrigger}
        for entry in cached_config["triggers"]:
            trigger_cls = cast(Trigger, known_triggers.get(entry["type"]))
            assert trigger_cls, "Unknown trigger type {}".format(entry["type"])
            worker.add_trigger(trigger_cls.deserialize(entry))

        return worker
class CloudflareSystemResources(SystemResources):
    """
    System resources for the Cloudflare Workers platform.

    Neither persistent storage nor NoSQL storage is implemented yet; both
    accessors raise ``NotImplementedError``.
    """

    def __init__(
        self,
        config: CloudflareConfig,
        cache_client: Cache,
        docker_client: docker.client,
        logging_handlers: LoggingHandlers,
    ):
        super().__init__(config, cache_client, docker_client, logging_handlers)
        # Kept separately so the ``config`` property can return the narrowed type
        self._config = config

    @property
    def config(self) -> CloudflareConfig:
        return self._config

    def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStorage:
        """
        Placeholder for the Cloudflare R2 storage accessor.

        Args:
            replace_existing: Whether to replace existing files in storage
                (currently ignored)

        Raises:
            NotImplementedError: always; R2 support is not implemented
        """
        message = (
            "Cloudflare R2 storage is not yet implemented. "
            "To add support, implement a PersistentStorage subclass for R2 "
            "similar to sebs/aws/s3.py or sebs/azure/blob_storage.py"
        )
        raise NotImplementedError(message)

    def get_nosql_storage(self) -> NoSQLStorage:
        """
        Placeholder for the Cloudflare NoSQL storage accessor.

        Raises:
            NotImplementedError: always; NoSQL support is not implemented
        """
        message = (
            "Cloudflare NoSQL storage (D1/Durable Objects) is not yet implemented. "
            "To add support, implement a NoSQLStorage subclass "
            "similar to sebs/aws/dynamodb.py or sebs/azure/cosmosdb.py"
        )
        raise NotImplementedError(message)
+ """ + + def __init__(self, worker_name: str, url: Optional[str] = None): + super().__init__(worker_name) + self._url = url + + @staticmethod + def typename() -> str: + return "Cloudflare.HTTPTrigger" + + @staticmethod + def trigger_type() -> Trigger.TriggerType: + return Trigger.TriggerType.HTTP + + @property + def url(self) -> str: + assert self._url is not None, "HTTP trigger URL has not been set" + return self._url + + @url.setter + def url(self, url: str): + self._url = url + + def sync_invoke(self, payload: dict) -> Optional[str]: + """ + Synchronously invoke a Cloudflare Worker via HTTP. + + Args: + payload: The payload to send to the worker + + Returns: + The response from the worker + """ + import requests + + response = requests.post(self.url, json=payload) + response.raise_for_status() + return response.text + + def async_invoke(self, payload: dict) -> object: + """ + Asynchronously invoke a Cloudflare Worker via HTTP. + Not typically needed for Cloudflare Workers. + """ + raise NotImplementedError("Cloudflare Workers do not support async HTTP invocation") + + def serialize(self) -> dict: + return { + **super().serialize(), + "url": self._url, + } + + @staticmethod + def deserialize(obj: dict) -> "HTTPTrigger": + trigger = HTTPTrigger(obj["name"]) + if "url" in obj: + trigger.url = obj["url"] + return trigger diff --git a/sebs/sebs.py b/sebs/sebs.py index a3dd89a95..febfeb24a 100644 --- a/sebs/sebs.py +++ b/sebs/sebs.py @@ -214,6 +214,10 @@ def get_deployment( from sebs.openwhisk import OpenWhisk implementations["openwhisk"] = OpenWhisk + if has_platform("cloudflare"): + from sebs.cloudflare import Cloudflare + + implementations["cloudflare"] = Cloudflare # Validate deployment platform if name not in implementations: From aa24a07a7aa4318469281d74cb1f367957372433 Mon Sep 17 00:00:00 2001 From: MisterMM23 Date: Sun, 2 Nov 2025 17:31:46 +0100 Subject: [PATCH 002/230] systems.json cloudflare config --- configs/systems.json | 27 +++++++++++++++++++++++++++ 1 
const path = require('path'), fs = require('fs');

/**
 * Cloudflare Workers entry point for SeBS Node.js benchmarks.
 *
 * Runs the benchmark's `handler` with the JSON body of the request and
 * answers with a JSON document containing timestamps, the measured compute
 * time in microseconds, the benchmark output and a cold-start flag.
 *
 * The handler works with the Workers `Request`/`Response` objects; the
 * Express-style `req`/`res` names used previously do not exist in
 * `fetch(request, env, ctx)`.
 *
 * NOTE(review): `require`, `process.hrtime` and `fs` are kept from the
 * original and presumably rely on the bundler / Node.js compatibility
 * mode - confirm against the deployment configuration.
 */
export default {
  async fetch(request, env, ctx) {
    // An empty body is treated as an empty payload instead of a parse error.
    const body = await request.text();
    const payload = body ? JSON.parse(body) : {};

    const begin = Date.now() / 1000;
    const start = process.hrtime();
    const func = require('./function');

    // A rejection propagates to the runtime, as the original rethrow did.
    const result = await func.handler(payload);

    const elapsed = process.hrtime(start);
    const end = Date.now() / 1000;
    const micro = elapsed[1] / 1e3 + elapsed[0] * 1e6;

    // The first invocation that finds no marker file is reported as cold.
    let is_cold = false;
    const fname = path.join('/tmp', 'cold_run');
    if (!fs.existsSync(fname)) {
      is_cold = true;
      fs.closeSync(fs.openSync(fname, 'w'));
    }

    const reply = {
      begin: begin,
      end: end,
      compute_time: micro,
      results_time: 0,
      result: {output: result},
      is_cold: is_cold,
      // Headers is not a plain object; values must be read with get().
      request_id: request.headers.get('function-execution-id')
    };

    return new Response(JSON.stringify(reply), {
      status: 200,
      headers: {'Content-Type': 'application/json'}
    });
  }
};
class Default(WorkerEntrypoint):
    """
    Cloudflare Workers entry point for SeBS Python benchmarks.

    Decodes the request body as JSON, runs the benchmark's ``handler`` and
    returns its output wrapped in the SeBS result document.
    """

    async def fetch(self, request, env):
        """
        Handle one invocation.

        Args:
            request: incoming request; its body is expected to be a JSON object
            env: worker bindings (unused here; storage reads them from ``self``)

        Returns:
            Response whose body is the JSON-encoded result document
        """
        # Decode the raw text ourselves: json.loads needs a str, and an empty
        # body is treated as an empty event instead of a parse error.
        req_text = await request.text()
        event = json.loads(req_text) if req_text else {}

        ## we might need more data in self.env to know this ID
        req_id = 0
        ## note: time fixed in worker
        income_timestamp = datetime.datetime.now().timestamp()

        event['request-id'] = req_id
        event['income-timestamp'] = income_timestamp

        # init_instance is a static method of the ``storage`` class that lives
        # inside the ``storage`` module, not a module-level function.
        from . import storage
        storage.storage.init_instance(self)

        from . import function
        ret = function.handler(event)

        log_data = {
            'output': ret['result']
        }
        if 'measurement' in ret:
            log_data['measurement'] = ret['measurement']
        if 'logs' in event:
            log_data['time'] = 0

        # Timing and identity fields are placeholders in this wrapper.
        return Response(json.dumps({
            'begin': "0",
            'end': "0",
            'results_time': "0",
            'result': log_data,
            'is_cold': False,
            'is_cold_worker': False,
            'container_id': "0",
            # CONTAINER_NAME may be unset; do not fail the invocation over it
            'environ_container_id': os.environ.get('CONTAINER_NAME', ''),
            'request_id': "0"
        }))
## all filesystem calls will rely on the node:fs flag
class storage:
    """
    Singleton wrapper around the worker's R2 bucket binding.

    The binding is read from ``entry.env.R2`` in ``init_instance``. Its
    ``put``/``get``/``list`` calls are awaited, so every method that reaches
    the bucket is a coroutine and has to be awaited by the caller.

    NOTE(review): assumes the binding returns awaitables and signals a missing
    object with ``None`` - confirm against the Python Workers runtime.
    The bucket argument of every method is accepted but unused: the binding
    already identifies one bucket.
    """

    instance = None
    handle = None

    @staticmethod
    def unique_name(name):
        """Insert a random 8-character hex tag between the name and its extension."""
        name, extension = os.path.splitext(name)
        return '{name}.{random}{extension}'.format(
            name=name,
            extension=extension,
            random=str(uuid.uuid4()).split('-')[0]
        )

    @staticmethod
    def init_instance(entry: "WorkerEntrypoint"):
        """Create the singleton and attach the R2 binding of ``entry``."""
        storage.instance = storage()
        storage.instance.handle = entry.env.R2

    async def upload(self, __bucket, key, filepath):
        """
        Upload the file at ``filepath`` under a unique variant of ``key``.

        Returns:
            The key the object was actually stored under.
        """
        with open(filepath, "rb") as f:
            data = f.read()
        return await self.upload_stream(__bucket, key, data)

    async def download(self, __bucket, key, filepath):
        """Fetch object ``key`` and write its content to ``filepath``."""
        data = await self.download_stream(__bucket, key)
        # download_stream returns text; the file is opened in binary mode
        if isinstance(data, str):
            data = data.encode("utf-8")
        with open(filepath, "wb") as f:
            f.write(data)

    async def download_directory(self, __bucket, prefix, out_path):
        """Download every listed object whose key starts with ``prefix`` into ``out_path``."""
        list_res = await self.handle.list(prefix=prefix)  ## gives only first 1000?
        for obj in list_res.objects:
            file_name = obj.key
            path_to_file = os.path.dirname(file_name)
            os.makedirs(os.path.join(out_path, path_to_file), exist_ok=True)
            await self.download(__bucket, file_name, os.path.join(out_path, file_name))

    async def upload_stream(self, __bucket, key, data):
        """
        Store ``data`` under a unique variant of ``key``.

        Returns:
            The key the object was stored under.
        """
        unique_key = storage.unique_name(key)
        await self.handle.put(unique_key, data)
        return unique_key

    async def download_stream(self, __bucket, key):
        """
        Return the content of object ``key`` as text.

        Raises:
            FileNotFoundError: the bucket has no object under ``key``.
        """
        get_res = await self.handle.get(key)
        if get_res is None:
            raise FileNotFoundError(key)
        return await get_res.text()

    @staticmethod
    def get_instance():
        """
        Return the singleton created by ``init_instance``.

        Raises:
            RuntimeError: ``init_instance`` has not been called yet.
        """
        if storage.instance is None:
            raise RuntimeError("must init storage singleton first")
        return storage.instance
fetch(self, request, env): - req_json = await request.json() - event = json.loads(req_json) + if "favicon" in request.url: return Response("None") + + req_text = await request.text() + event = json.loads(req_text) if req_text is None else {} + + if True: # dirty url parameters parsing, for testing + tmp = request.url.split("?") + if len(tmp) > 1: + urlparams = tmp[1] + urlparams = [chunk.split("=") for chunk in urlparams.split("&")] + for param in urlparams: + try: + event[param[0]] = int(param[1]) + except ValueError: + event[param[0]] = param[1] + except IndexError: + event[param[0]] = None + + + + + ## we might need more data in self.env to know this ID req_id = 0 @@ -19,12 +39,13 @@ async def fetch(self, request, env): event['request-id'] = req_id event['income-timestamp'] = income_timestamp - from . import storage + from storage import storage storage.init_instance(self) + print("event:", event) - from . import function - ret = function.handler(event) + from function import handler + ret = handler(event) log_data = { 'output': ret['result'] @@ -34,14 +55,18 @@ async def fetch(self, request, env): if 'logs' in event: log_data['time'] = 0 - return Response(json.dumps({ - 'begin': "0", - 'end': "0", - 'results_time': "0", - 'result': log_data, - 'is_cold': False, - 'is_cold_worker': False, - 'container_id': "0", - 'environ_container_id': os.environ['CONTAINER_NAME'], - 'request_id': "0" - })) \ No newline at end of file + if "html" in event: + headers = {"Content-Type" : "text/html; charset=utf-8"} + return Response(ret["result"], headers = headers) + else: + return Response(json.dumps({ + 'begin': "0", + 'end': "0", + 'results_time': "0", + 'result': log_data, + 'is_cold': False, + 'is_cold_worker': False, + 'container_id': "0", + 'environ_container_id': "no_id", + 'request_id': "0" + })) diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index 5f31c759c..da090be6c 100644 --- 
a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -2,7 +2,20 @@ import os import uuid +from workers import WorkerEntrypoint + ## all filesystem calls will rely on the node:fs flag +""" layout +/bundle +└── (one file for each module in your Worker bundle) +/tmp +└── (empty, but you can write files, create directories, symlinks, etc) +/dev +├── null +├── random +├── full +└── zero +""" class storage: instance = None handle = None @@ -20,20 +33,27 @@ def unique_name(name): def init_instance(entry: WorkerEntrypoint): storage.instance = storage() storage.instance.handle = entry.env.R2 + storage.instance.written_files = set() def upload(self, __bucket, key, filepath): + if filepath in self.written_files: + filepath = "/tmp" + os.path.abspath(filepath) with open(filepath, "rb") as f: self.upload_stream(__bucket, key, f.read()) return def download(self, __bucket, key, filepath): data = self.download_stream(__bucket, key) - with open(filepath, "wb") as f: + # should only allow writes to tmp dir. so do have to edit the filepath here? + tmp_fp = "/tmp" + os.path.abspath(filepath) + self.written_files.append(filepath) + with open(tmp_fp, "wb") as f: f.write(data) return def download_directory(self, __bucket, prefix, out_path): - list_res = await self.handle.list(prefix = prefix) ## gives only first 1000? + print(self.handle, type(self.handle)) + list_res = self.handle.list(prefix = prefix) ## gives only first 1000? 
for obj in list_res.objects: file_name = obj.key path_to_file = os.path.dirname(file_name) @@ -43,13 +63,13 @@ def download_directory(self, __bucket, prefix, out_path): def upload_stream(self, __bucket, key, data): unique_key = storage.unique_name(key) - put_res = await self.handle.put(unique_key, data) + put_res = self.handle.put(unique_key, data) return unique_key def download_stream(self, __bucket, key): - get_res = await self.handle.get(key) - assert get_res not None - data = await get_res.text() + get_res = self.handle.get(key) + assert get_res is not None + data = get_res.text() return data def get_instance(): From eaa42a1e2a93f3ca9484c0c415d4f1a88a3c8ebe Mon Sep 17 00:00:00 2001 From: ldzgch Date: Mon, 10 Nov 2025 00:09:58 +0100 Subject: [PATCH 005/230] just some changes. storage still not properly tested... --- .../wrappers/cloudflare/python/handler.py | 53 ++++++++++++------- .../wrappers/cloudflare/python/storage.py | 7 ++- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 37d44595e..8e37efee4 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -4,28 +4,40 @@ ## sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) +""" +currently assumed file structure: +handler.py +function/ + function.py + .py + storage.py + nosql.py + +""" class Default(WorkerEntrypoint): async def fetch(self, request, env): if "favicon" in request.url: return Response("None") req_text = await request.text() - event = json.loads(req_text) if req_text is None else {} - - if True: # dirty url parameters parsing, for testing - tmp = request.url.split("?") - if len(tmp) > 1: - urlparams = tmp[1] - urlparams = [chunk.split("=") for chunk in urlparams.split("&")] - for param in urlparams: - try: - event[param[0]] = int(param[1]) - except ValueError: - 
event[param[0]] = param[1] - except IndexError: - event[param[0]] = None - + + event = json.loads(req_text) if len(req_text) > 0 else {} + print(event) + + # dirty url parameters parsing, for testing + tmp = request.url.split("?") + if len(tmp) > 1: + urlparams = tmp[1] + urlparams = [chunk.split("=") for chunk in urlparams.split("&")] + for param in urlparams: + try: + event[param[0]] = int(param[1]) + except ValueError: + event[param[0]] = param[1] + except IndexError: + event[param[0]] = None + @@ -39,13 +51,14 @@ async def fetch(self, request, env): event['request-id'] = req_id event['income-timestamp'] = income_timestamp - from storage import storage - storage.init_instance(self) + from function import storage + + storage.storage.init_instance(self) print("event:", event) - from function import handler - ret = handler(event) + from function import function + ret = function.handler(event) log_data = { 'output': ret['result'] @@ -57,7 +70,7 @@ async def fetch(self, request, env): if "html" in event: headers = {"Content-Type" : "text/html; charset=utf-8"} - return Response(ret["result"], headers = headers) + return Response(str(ret["result"]), headers = headers) else: return Response(json.dumps({ 'begin': "0", diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index da090be6c..de2a4642e 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -45,9 +45,12 @@ def upload(self, __bucket, key, filepath): def download(self, __bucket, key, filepath): data = self.download_stream(__bucket, key) # should only allow writes to tmp dir. so do have to edit the filepath here? 
- tmp_fp = "/tmp" + os.path.abspath(filepath) + real_fp = filepath + if not filepath.startswith("/tmp"): + real_fp = "/tmp" + os.path.abspath(filepath) + self.written_files.append(filepath) - with open(tmp_fp, "wb") as f: + with open(real_fp, "wb") as f: f.write(data) return From 57452fa6f80d5cead282a9c2d536223c92592d33 Mon Sep 17 00:00:00 2001 From: MisterMM23 Date: Mon, 10 Nov 2025 12:02:55 +0100 Subject: [PATCH 006/230] concept for r2 storage --- sebs/cloudflare/r2.py | 146 +++++++++++++++++++++++++++++++++++ sebs/cloudflare/resources.py | 43 ++++++++--- 2 files changed, 178 insertions(+), 11 deletions(-) create mode 100644 sebs/cloudflare/r2.py diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py new file mode 100644 index 000000000..36e2d7a0c --- /dev/null +++ b/sebs/cloudflare/r2.py @@ -0,0 +1,146 @@ +import json + +import requests +from sebs.cloudflare.config import CloudflareCredentials +from sebs.faas.storage import PersistentStorage +from sebs.faas.config import Resources +from sebs.cache import Cache + + +class R2(PersistentStorage): + @staticmethod + def typename() -> str: + return "Cloudlfare.R2" + + @staticmethod + def deployment_name() -> str: + return "cloudflare" + + @property + def replace_existing(self) -> bool: + return self._replace_existing + + @replace_existing.setter + def replace_existing(self, val: bool): + self._replace_existing = val + + def __init__( + self, + region: str, + cache_client: Cache, + resources: Resources, + replace_existing: bool, + credentials: CloudflareCredentials, + ): + super().__init__(region, cache_client, resources, replace_existing) + self._credentials = credentials + + def _get_auth_headers(self) -> dict[str, str]: + """Get authentication headers for Cloudflare API requests.""" + if self._credentials.api_token: + return { + "Authorization": f"Bearer {self._credentials.api_token}", + "Content-Type": "application/json", + } + elif self._credentials.email and self._credentials.api_key: + return { + 
"X-Auth-Email": self._credentials.email, + "X-Auth-Key": self._credentials.api_key, + "Content-Type": "application/json", + } + else: + raise RuntimeError("Invalid Cloudflare credentials configuration") + + def correct_name(self, name: str) -> str: + return name + + def _create_bucket( + self, name: str, buckets: list[str] = [], randomize_name: bool = False + ) -> str: + for bucket_name in buckets: + if name in bucket_name: + self.logging.info( + "Bucket {} for {} already exists, skipping.".format( + bucket_name, name + ) + ) + return bucket_name + + account_id = self._credentials.account_id + + get_bucket_uri = ( + f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" + ) + + params = {"name": "cloudflare_bucket", "locationHint": self._region} + + create_bucket_response = requests.post( + get_bucket_uri, json=params, headers=self._get_auth_headers() + ) + bucket_info = create_bucket_response.content.decode("utf-8") + bucket_info_json = json.load(bucket_info) # pyright: ignore + + return bucket_info_json.name + + """ + Download a file from a bucket. + + :param bucket_name: + :param key: storage source filepath + :param filepath: local destination filepath + """ + + def download(self, bucket_name: str, key: str, filepath: str) -> None: + pass + + """ + Upload a file to a bucket with by passing caching. + Useful for uploading code package to storage (when required). + + :param bucket_name: + :param filepath: local source filepath + :param key: storage destination filepath + """ + + def upload(self, bucket_name: str, filepath: str, key: str): + pass + + """ + Retrieves list of files in a bucket. 
+ + :param bucket_name: + :return: list of files in a given bucket + """ + + def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: + pass + + def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: + pass + + def exists_bucket(self, bucket_name: str) -> bool: + pass + + def clean_bucket(self, bucket_name: str): + pass + + def remove_bucket(self, bucket: str): + pass + + """ + Allocate a set of input/output buckets for the benchmark. + The routine checks the cache first to verify that buckets have not + been allocated first. + + :param benchmark: benchmark name + :param buckets: number of input and number of output buckets + """ + + def uploader_func(self, bucket_idx: int, file: str, filepath: str) -> None: + pass + + """ + Download all files in a storage bucket. + Warning: assumes flat directory in a bucket! Does not handle bucket files + with directory marks in a name, e.g. 'dir1/dir2/file' + """ diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index f9f9743d2..0ef145343 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -4,21 +4,23 @@ from sebs.cache import Cache from sebs.cloudflare.config import CloudflareConfig +from sebs.cloudflare.r2 import R2 from sebs.faas.resources import SystemResources from sebs.faas.storage import PersistentStorage from sebs.faas.nosql import NoSQLStorage from sebs.utils import LoggingHandlers +import json class CloudflareSystemResources(SystemResources): """ System resources for Cloudflare Workers. - + Cloudflare Workers have a different resource model compared to traditional cloud platforms. This class handles Cloudflare-specific resources like KV namespaces and R2 storage. 
""" - + def __init__( self, config: CloudflareConfig, @@ -33,31 +35,50 @@ def __init__( def config(self) -> CloudflareConfig: return self._config + def _get_auth_headers(self) -> dict[str, str]: + """Get authentication headers for Cloudflare API requests.""" + if self._config.credentials.api_token: + return { + "Authorization": f"Bearer {self._config.credentials.api_token}", + "Content-Type": "application/json", + } + elif self._config.credentials.email and self._config.credentials.api_key: + return { + "X-Auth-Email": self._config.credentials.email, + "X-Auth-Key": self._config.credentials.api_key, + "Content-Type": "application/json", + } + else: + raise RuntimeError("Invalid Cloudflare credentials configuration") + def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStorage: """ Get Cloudflare R2 storage instance. - + R2 is Cloudflare's S3-compatible object storage service. This method will create a client for managing benchmark input/output data. - + Args: replace_existing: Whether to replace existing files in storage - + Raises: NotImplementedError: R2 storage support not yet implemented """ - raise NotImplementedError( - "Cloudflare R2 storage is not yet implemented. " - "To add support, implement a PersistentStorage subclass for R2 " - "similar to sebs/aws/s3.py or sebs/azure/blob_storage.py" + + return R2( + region=self._config.region, + cache_client=None, + resources=self._config.resources, + replace_existing=replace_existing, + credentials=self._config.credentials, ) def get_nosql_storage(self) -> NoSQLStorage: """ Get Cloudflare NoSQL storage instance. - + This could use Cloudflare D1 (SQLite) or Durable Objects for NoSQL storage. 
- + Raises: NotImplementedError: NoSQL storage support not yet implemented """ From 9e47e0fd1758cfe1d70529b58c6b7bf8da40a172 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 10 Nov 2025 12:01:15 +0100 Subject: [PATCH 007/230] translated wrapper to js --- .../wrappers/cloudflare/nodejs/handler.js | 167 +++++++++++---- .../wrappers/cloudflare/nodejs/storage.js | 192 +++++++++++++----- 2 files changed, 268 insertions(+), 91 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 2c63947fd..a5a309dbc 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -1,37 +1,134 @@ -const path = require('path'), fs = require('fs'); - export default { - async fetch(request, env, ctx) { - var begin = Date.now()/1000; - var start = process.hrtime(); - var func = require('./function') - var ret = func.handler(req.body); - return ret.then( - (result) => { - var elapsed = process.hrtime(start); - var end = Date.now()/1000; - var micro = elapsed[1] / 1e3 + elapsed[0] * 1e6; - - var is_cold = false; - var fname = path.join('/tmp','cold_run'); - if(!fs.existsSync(fname)) { - is_cold = true; - fs.closeSync(fs.openSync(fname, 'w')); - } - - res.status(200).json({ - begin: begin, - end: end, - compute_time: micro, - results_time: 0, - result: {output: result}, - is_cold: is_cold, - request_id: req.headers["function-execution-id"] - }); - }, - (error) => { - throw(error); - } - ); -} + async fetch(request, env) { + // Match behavior of the Python handler: parse body, parse URL params, + // set request-id and income timestamp, call the benchmark function, + // and return a JSON response with the same fields. 
+ + if (request.url.includes('favicon')) { + return new Response('None'); + } + + const req_text = await request.text(); + let event = {}; + if (req_text && req_text.length > 0) { + try { + event = JSON.parse(req_text); + } catch (e) { + // If body isn't JSON, keep event empty + event = {}; + } + } + + // Parse query string into event (simple parsing, mirrors Python logic) + const urlParts = request.url.split('?'); + if (urlParts.length > 1) { + const query = urlParts[1]; + const pairs = query.split('&'); + for (const p of pairs) { + const [k, v] = p.split('='); + try { + if (v === undefined) { + event[k] = null; + } else if (!Number.isNaN(Number(v)) && Number.isFinite(Number(v))) { + // mirror Python attempt to convert to int + const n = Number(v); + event[k] = Number.isInteger(n) ? parseInt(v, 10) : n; + } else { + event[k] = decodeURIComponent(v); + } + } catch (e) { + event[k] = v; + } + } + } + + // Set request id and timestamps (Python used 0 for request id) + const req_id = 0; + const income_timestamp = Math.floor(Date.now() / 1000); + event['request-id'] = req_id; + event['income-timestamp'] = income_timestamp; + + // Load the benchmark function module and initialize storage if available + let funcModule; + try { + // dynamic import to work in Workers ESM runtime + funcModule = await import('./function.js'); + } catch (e) { + try { + // fallback without .js + funcModule = await import('./function'); + } catch (e2) { + throw new Error('Failed to import benchmark function module: ' + e2.message); + } + } + + // If the function module exposes a storage initializer, call it + try { + if (funcModule && funcModule.storage && typeof funcModule.storage.init_instance === 'function') { + try { + funcModule.storage.init_instance({ env, request }); + } catch (ignore) {} + } + } catch (e) { + // don't fail the request if storage init isn't available + } + + // Execute the benchmark handler + let ret; + try { + if (funcModule && typeof funcModule.handler === 'function') 
{ + // handler may be sync or return a promise + ret = await Promise.resolve(funcModule.handler(event)); + } else if (funcModule && funcModule.default && typeof funcModule.default.handler === 'function') { + ret = await Promise.resolve(funcModule.default.handler(event)); + } else { + throw new Error('benchmark handler function not found'); + } + } catch (err) { + // Mirror Python behavior: return structured error payload + const errorPayload = JSON.stringify({ + begin: '0', + end: '0', + results_time: '0', + result: { output: null }, + is_cold: false, + is_cold_worker: false, + container_id: '0', + environ_container_id: 'no_id', + request_id: '0', + error: String(err && err.message ? err.message : err), + }); + return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); + } + + // Build log_data similar to Python handler + const log_data = { output: ret && ret.result !== undefined ? ret.result : ret }; + if (ret && ret.measurement !== undefined) { + log_data.measurement = ret.measurement; + } + if (event.logs !== undefined) { + log_data.time = 0; + } + + if (event.html) { + return new Response(String(ret && ret.result !== undefined ? 
ret.result : ''), { + headers: { 'Content-Type': 'text/html; charset=utf-8' }, + }); + } + + const responseBody = JSON.stringify({ + begin: '0', + end: '0', + results_time: '0', + result: log_data, + is_cold: false, + is_cold_worker: false, + container_id: '0', + environ_container_id: 'no_id', + request_id: '0', + }); + + return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); + }, +}; diff --git a/benchmarks/wrappers/cloudflare/nodejs/storage.js b/benchmarks/wrappers/cloudflare/nodejs/storage.js index 134192089..01fca6803 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/storage.js @@ -1,64 +1,144 @@ -const { Storage } = - fs = require('fs'), - path = require('path'), - uuid = require('uuid'), - util = require('util'), - stream = require('stream'); +const fs = require('fs'); +const path = require('path'); +const uuid = require('uuid'); -class cf_storage { +// Storage wrapper compatible with the Python storage implementation. +// Supports Cloudflare R2 (via env.R2) when available; falls back to +// filesystem-based operations when running in Node.js (for local tests). +class storage { constructor() { + this.handle = null; // R2 binding + this.written_files = new Set(); + } + + static unique_name(name) { + const parsed = path.parse(name); + const uuid_name = uuid.v4().split('-')[0]; + return path.join(parsed.dir, `${parsed.name}.${uuid_name}${parsed.ext}`); + } + + // entry is expected to be an object with `env` (Workers) or nothing for Node + static init_instance(entry) { + storage.instance = new storage(); + if (entry && entry.env && entry.env.R2) { + storage.instance.handle = entry.env.R2; + } + storage.instance.written_files = new Set(); + } + + // Upload a file given a local filepath. In Workers env this is not available + // so callers should use upload_stream or pass raw data. For Node.js we read + // the file from disk and put it into R2 if available, otherwise throw. 
+ async upload(__bucket, key, filepath) { + // If file was previously written during this invocation, use /tmp absolute + let realPath = filepath; + if (this.written_files.has(filepath)) { + realPath = path.join('/tmp', path.resolve(filepath)); + } + + // Read file content + if (fs && fs.existsSync(realPath)) { + const data = fs.readFileSync(realPath); + return await this.upload_stream(__bucket, key, data); + } + + // If running in Workers (no fs) and caller provided Buffer/Stream, they + // should call upload_stream directly. Otherwise, throw. + throw new Error('upload(): file not found on disk and no R2 handle provided'); + } + + async download(__bucket, key, filepath) { + const data = await this.download_stream(__bucket, key); + + let real_fp = filepath; + if (!filepath.startsWith('/tmp')) { + real_fp = path.join('/tmp', path.resolve(filepath)); + } + + this.written_files.add(filepath); + + // Write data to file if we have fs + if (fs) { + fs.mkdirSync(path.dirname(real_fp), { recursive: true }); + if (Buffer.isBuffer(data)) { + fs.writeFileSync(real_fp, data); + } else { + fs.writeFileSync(real_fp, Buffer.from(String(data))); + } + return; + } + + // In Workers environment, callers should use stream APIs directly. 
+ return; + } + + async download_directory(__bucket, prefix, out_path) { + if (!this.handle) { + throw new Error('download_directory requires R2 binding (env.R2)'); + } + + const list_res = await this.handle.list({ prefix }); + const objects = list_res.objects || []; + for (const obj of objects) { + const file_name = obj.key; + const path_to_file = path.dirname(file_name); + fs.mkdirSync(path.join(out_path, path_to_file), { recursive: true }); + await this.download(__bucket, file_name, path.join(out_path, file_name)); + } + } + + async upload_stream(__bucket, key, data) { + const unique_key = storage.unique_name(key); + if (this.handle) { + // R2 put accepts ArrayBuffer, ReadableStream, or string + await this.handle.put(unique_key, data); + return unique_key; + } + + // If no R2, write to local fs as fallback + if (fs) { + const outPath = path.join('/tmp', unique_key); + fs.mkdirSync(path.dirname(outPath), { recursive: true }); + if (Buffer.isBuffer(data)) fs.writeFileSync(outPath, data); + else fs.writeFileSync(outPath, Buffer.from(String(data))); + return unique_key; + } + + throw new Error('upload_stream(): no storage backend available'); + } + + async download_stream(__bucket, key) { + if (this.handle) { + const obj = await this.handle.get(key); + if (!obj) return null; + // R2 object provides arrayBuffer()/text() helpers in Workers + if (typeof obj.arrayBuffer === 'function') { + const ab = await obj.arrayBuffer(); + return Buffer.from(ab); + } + if (typeof obj.text === 'function') { + return await obj.text(); + } + // Fallback: return null + return null; + } + + // Fallback to local filesystem + const localPath = path.join('/tmp', key); + if (fs && fs.existsSync(localPath)) { + return fs.readFileSync(localPath); + } + throw new Error('download_stream(): object not found'); } - unique_name(file) { - let name = path.parse(file); - let uuid_name = uuid.v4().split('-')[0]; - return path.join(name.dir, util.format('%s.%s%s', name.name, uuid_name, name.ext)); + 
static get_instance() { + if (!storage.instance) { + throw new Error('must init storage singleton first'); + } + return storage.instance; } +} - upload(container, file, filepath) { - let bucket = this.storage.bucket(container); - let uniqueName = this.unique_name(file); - let options = {destination: uniqueName, resumable: false}; - return [uniqueName, bucket.upload(filepath, options)]; - }; - - download(container, file, filepath) { - let bucket = this.storage.bucket(container); - var file = bucket.file(file); - file.download({destination: filepath}); - }; - - uploadStream(container, file) { - let bucket = this.storage.bucket(container); - let uniqueName = this.unique_name(file); - var file = bucket.file(uniqueName); - let upload = file.createWriteStream(); - var write_stream = new stream.PassThrough(); - - write_stream.pipe(upload); - - const promise = new Promise((resolve, reject) => { - upload.on('error', err => { - upload.end(); - reject(err); - }); - - upload.on('finish', () => { - upload.end(); - resolve(file.name); - }); - }); - return [write_stream, promise, uniqueName]; - }; - - downloadStream(container, file) { - let bucket = this.storage.bucket(container); - var file = bucket.file(file); - let downloaded = file.createReadStream(); - return Promise.resolve(downloaded); - }; -}; - -exports.storage = cf_storage; +module.exports.storage = storage; From 822a9d951bcc561ac4e6ac1668762bede9d0f134 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 10 Nov 2025 15:06:47 +0100 Subject: [PATCH 008/230] used output from workers as analytics measurements in sebs --- sebs/cli.py | 2 +- sebs/cloudflare/cloudflare.py | 279 ++++++++++------------------------ sebs/cloudflare/triggers.py | 60 ++++++-- sebs/sebs.py | 7 +- 4 files changed, 133 insertions(+), 215 deletions(-) diff --git a/sebs/cli.py b/sebs/cli.py index 4e1cc558c..f65c5eb6e 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -113,7 +113,7 @@ def common_params(func): @click.option( "--deployment", default=None, - 
type=click.Choice(["azure", "aws", "gcp", "local", "openwhisk"]), + type=click.Choice(["azure", "aws", "gcp", "local", "openwhisk", "cloudflare"]), help="Cloud deployment to use.", ) @click.option( diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index ce2cc25ea..366ea59f0 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -457,226 +457,107 @@ def download_metrics( metrics: dict, ): """ - Download per-invocation metrics from Cloudflare Analytics Engine. + Extract per-invocation metrics from ExecutionResult objects. - Queries Analytics Engine SQL API to retrieve performance data for each - invocation and enriches the ExecutionResult objects with provider metrics. - - Note: Requires Analytics Engine binding to be configured on the worker - and benchmark code to write data points during execution. + The metrics are extracted from the 'measurement' field in the benchmark + response, which is populated by the Cloudflare Worker handler during execution. + This approach avoids dependency on Analytics Engine and provides immediate, + accurate metrics for each invocation. 
Args: function_name: Name of the worker - start_time: Start time (Unix timestamp in seconds) - end_time: End time (Unix timestamp in seconds) + start_time: Start time (Unix timestamp in seconds) - not used + end_time: End time (Unix timestamp in seconds) - not used requests: Dict mapping request_id -> ExecutionResult metrics: Dict to store aggregated metrics """ if not requests: - self.logging.warning("No requests to download metrics for") - return - - account_id = self.config.credentials.account_id - if not account_id: - self.logging.error("Account ID required to download metrics") + self.logging.warning("No requests to extract metrics from") return self.logging.info( - f"Downloading Analytics Engine metrics for {len(requests)} invocations " + f"Extracting metrics from {len(requests)} invocations " f"of worker {function_name}" ) - try: - # Query Analytics Engine for per-invocation metrics - metrics_data = self._query_analytics_engine( - account_id, start_time, end_time, function_name - ) + # Aggregate statistics from all requests + total_invocations = len(requests) + cold_starts = 0 + warm_starts = 0 + cpu_times = [] + wall_times = [] + memory_values = [] + + for request_id, result in requests.items(): + # Count cold/warm starts + if result.stats.cold_start: + cold_starts += 1 + else: + warm_starts += 1 - if not metrics_data: - self.logging.warning( - "No metrics data returned from Analytics Engine. " - "Ensure the worker has Analytics Engine binding configured " - "and is writing data points during execution." 
- ) - return + # Collect CPU times + if result.provider_times.execution > 0: + cpu_times.append(result.provider_times.execution) - # Match metrics with invocation requests - matched = 0 - unmatched_metrics = 0 + # Collect wall times (benchmark times) + if result.times.benchmark > 0: + wall_times.append(result.times.benchmark) - for row in metrics_data: - request_id = row.get('request_id') - - if request_id and request_id in requests: - result = requests[request_id] - - # Populate provider times (convert ms to microseconds) - wall_time_ms = row.get('wall_time_ms', 0) - cpu_time_ms = row.get('cpu_time_ms', 0) - - result.provider_times.execution = int(cpu_time_ms * 1000) # μs - result.provider_times.initialization = 0 # Not separately tracked - - # Populate stats - result.stats.cold_start = (row.get('cold_warm') == 'cold') - result.stats.memory_used = 128.0 # Cloudflare Workers: fixed 128MB - - # Populate billing info - # Cloudflare billing: $0.50 per million requests + - # $12.50 per million GB-seconds of CPU time - result.billing.memory = 128 - result.billing.billed_time = int(cpu_time_ms * 1000) # μs - - # GB-seconds calculation: (128MB / 1024MB/GB) * (cpu_time_ms / 1000ms/s) - gb_seconds = (128.0 / 1024.0) * (cpu_time_ms / 1000.0) - result.billing.gb_seconds = int(gb_seconds * 1000000) # micro GB-seconds - - matched += 1 - elif request_id: - unmatched_metrics += 1 + # Collect memory usage + if result.stats.memory_used > 0: + memory_values.append(result.stats.memory_used) - # Calculate statistics from matched metrics - if matched > 0: - cpu_times = [ - requests[rid].provider_times.execution - for rid in requests - if requests[rid].provider_times.execution > 0 - ] - cold_starts = sum( - 1 for rid in requests if requests[rid].stats.cold_start - ) + # Set billing info for Cloudflare Workers + # Cloudflare billing: $0.50 per million requests + + # $12.50 per million GB-seconds of CPU time + if result.provider_times.execution > 0: + result.billing.memory = 128 # 
Cloudflare Workers: fixed 128MB + result.billing.billed_time = result.provider_times.execution # μs - metrics['cloudflare'] = { - 'total_invocations': len(metrics_data), - 'matched_invocations': matched, - 'unmatched_invocations': len(requests) - matched, - 'unmatched_metrics': unmatched_metrics, - 'cold_starts': cold_starts, - 'warm_starts': matched - cold_starts, - 'data_source': 'analytics_engine', - 'note': 'Per-invocation metrics from Analytics Engine' - } - - if cpu_times: - metrics['cloudflare']['avg_cpu_time_us'] = sum(cpu_times) // len(cpu_times) - metrics['cloudflare']['min_cpu_time_us'] = min(cpu_times) - metrics['cloudflare']['max_cpu_time_us'] = max(cpu_times) - - self.logging.info( - f"Analytics Engine metrics: matched {matched}/{len(requests)} invocations" - ) - - if matched < len(requests): - missing = len(requests) - matched - self.logging.warning( - f"{missing} invocations not found in Analytics Engine. " - "This may be due to:\n" - " - Analytics Engine ingestion delay (typically <60s)\n" - " - Worker not writing data points correctly\n" - " - Analytics Engine binding not configured" - ) - - if unmatched_metrics > 0: - self.logging.warning( - f"{unmatched_metrics} metrics found in Analytics Engine " - "that don't match tracked request IDs (possibly from other sources)" - ) - - except Exception as e: - self.logging.error(f"Failed to download metrics: {e}") - self.logging.warning( - "Continuing without Analytics Engine metrics. " - "Client-side timing data is still available." - ) - - def _query_analytics_engine( - self, - account_id: str, - start_time: int, - end_time: int, - script_name: str - ) -> List[dict]: - """ - Query Analytics Engine SQL API for worker metrics. - - Retrieves per-invocation metrics written by the worker during execution. 
- The worker must write data points with the following schema: - - index1: request_id (unique identifier) - - index2: cold_warm ("cold" or "warm") - - double1: wall_time_ms (wall clock time in milliseconds) - - double2: cpu_time_ms (CPU time in milliseconds) - - blob1: url (request URL) - - blob2: status ("success" or "error") - - blob3: error_message (if applicable) + # GB-seconds calculation: (128MB / 1024MB/GB) * (cpu_time_us / 1000000 us/s) + cpu_time_seconds = result.provider_times.execution / 1_000_000.0 + gb_seconds = (128.0 / 1024.0) * cpu_time_seconds + result.billing.gb_seconds = int(gb_seconds * 1_000_000) # micro GB-seconds + + # Calculate statistics + metrics['cloudflare'] = { + 'total_invocations': total_invocations, + 'cold_starts': cold_starts, + 'warm_starts': warm_starts, + 'data_source': 'response_measurements', + 'note': 'Per-invocation metrics extracted from benchmark response' + } - Args: - account_id: Cloudflare account ID - start_time: Unix timestamp (seconds) - end_time: Unix timestamp (seconds) - script_name: Worker script name - - Returns: - List of metric data points, one per invocation - """ - headers = self._get_auth_headers() - url = f"{self._api_base_url}/accounts/{account_id}/analytics_engine/sql" - - # Convert Unix timestamps to DateTime format for ClickHouse - from datetime import datetime - start_dt = datetime.utcfromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S') - end_dt = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S') - - # SQL query for Analytics Engine - # Note: Analytics Engine uses ClickHouse SQL syntax - sql_query = f""" - SELECT - index1 as request_id, - index2 as cold_warm, - double1 as wall_time_ms, - double2 as cpu_time_ms, - blob1 as url, - blob2 as status, - blob3 as error_message, - timestamp - FROM ANALYTICS_DATASET - WHERE timestamp >= toDateTime('{start_dt}') - AND timestamp <= toDateTime('{end_dt}') - AND blob1 LIKE '%{script_name}%' - ORDER BY timestamp ASC - """ + if cpu_times: + 
metrics['cloudflare']['avg_cpu_time_us'] = sum(cpu_times) // len(cpu_times) + metrics['cloudflare']['min_cpu_time_us'] = min(cpu_times) + metrics['cloudflare']['max_cpu_time_us'] = max(cpu_times) + metrics['cloudflare']['cpu_time_measurements'] = len(cpu_times) - try: - # Analytics Engine SQL API returns newline-delimited JSON - response = requests.post( - url, - headers=headers, - data=sql_query, - timeout=30 - ) - - if response.status_code == 200: - # Parse newline-delimited JSON response - results = [] - for line in response.text.strip().split('\n'): - if line: - try: - results.append(json.loads(line)) - except json.JSONDecodeError: - self.logging.warning(f"Failed to parse Analytics Engine line: {line}") - - self.logging.info(f"Retrieved {len(results)} data points from Analytics Engine") - return results - else: - raise RuntimeError( - f"Analytics Engine query failed: {response.status_code} - {response.text}" - ) - - except requests.exceptions.Timeout: - self.logging.error("Analytics Engine query timed out") - return [] - except Exception as e: - self.logging.error(f"Analytics Engine query error: {e}") - return [] + if wall_times: + metrics['cloudflare']['avg_wall_time_us'] = sum(wall_times) // len(wall_times) + metrics['cloudflare']['min_wall_time_us'] = min(wall_times) + metrics['cloudflare']['max_wall_time_us'] = max(wall_times) + metrics['cloudflare']['wall_time_measurements'] = len(wall_times) + + if memory_values: + metrics['cloudflare']['avg_memory_mb'] = sum(memory_values) / len(memory_values) + metrics['cloudflare']['min_memory_mb'] = min(memory_values) + metrics['cloudflare']['max_memory_mb'] = max(memory_values) + metrics['cloudflare']['memory_measurements'] = len(memory_values) + + self.logging.info( + f"Extracted metrics from {total_invocations} invocations: " + f"{cold_starts} cold starts, {warm_starts} warm starts" + ) + + if cpu_times: + avg_cpu_ms = sum(cpu_times) / len(cpu_times) / 1000.0 + self.logging.info(f"Average CPU time: {avg_cpu_ms:.2f} 
ms") + + if wall_times: + avg_wall_ms = sum(wall_times) / len(wall_times) / 1000.0 + self.logging.info(f"Average wall time: {avg_wall_ms:.2f} ms") def create_trigger( self, function: Function, trigger_type: Trigger.TriggerType diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index cb038b63a..310f1fa94 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -1,6 +1,7 @@ from typing import Optional +import concurrent.futures -from sebs.faas.function import Trigger +from sebs.faas.function import Trigger, ExecutionResult class LibraryTrigger(Trigger): @@ -21,7 +22,7 @@ def typename() -> str: def trigger_type() -> Trigger.TriggerType: return Trigger.TriggerType.LIBRARY - def sync_invoke(self, payload: dict) -> Optional[str]: + def sync_invoke(self, payload: dict) -> ExecutionResult: """ Synchronously invoke a Cloudflare Worker. @@ -29,12 +30,12 @@ def sync_invoke(self, payload: dict) -> Optional[str]: payload: The payload to send to the worker Returns: - The response from the worker + ExecutionResult with performance metrics """ # This will be implemented when we have the deployment client raise NotImplementedError("Cloudflare Worker invocation not yet implemented") - def async_invoke(self, payload: dict) -> object: + def async_invoke(self, payload: dict) -> concurrent.futures.Future: """ Asynchronously invoke a Cloudflare Worker. Not typically supported for Cloudflare Workers. @@ -76,7 +77,7 @@ def url(self) -> str: def url(self, url: str): self._url = url - def sync_invoke(self, payload: dict) -> Optional[str]: + def sync_invoke(self, payload: dict) -> ExecutionResult: """ Synchronously invoke a Cloudflare Worker via HTTP. 
@@ -84,20 +85,53 @@ def sync_invoke(self, payload: dict) -> Optional[str]: payload: The payload to send to the worker Returns: - The response from the worker + ExecutionResult with performance metrics extracted from the response """ - import requests + self.logging.debug(f"Invoke function {self.url}") + result = self._http_invoke(payload, self.url) - response = requests.post(self.url, json=payload) - response.raise_for_status() - return response.text + # Extract measurement data from the response if available + if result.output and 'result' in result.output: + result_data = result.output['result'] + if isinstance(result_data, dict) and 'measurement' in result_data: + measurement = result_data['measurement'] + + # Extract timing metrics if provided by the benchmark + if isinstance(measurement, dict): + # CPU time in microseconds + if 'cpu_time_us' in measurement: + result.provider_times.execution = measurement['cpu_time_us'] + elif 'cpu_time_ms' in measurement: + result.provider_times.execution = int(measurement['cpu_time_ms'] * 1000) + + # Wall time in microseconds + if 'wall_time_us' in measurement: + result.times.benchmark = measurement['wall_time_us'] + elif 'wall_time_ms' in measurement: + result.times.benchmark = int(measurement['wall_time_ms'] * 1000) + + # Cold/warm start detection + if 'is_cold' in measurement: + result.stats.cold_start = measurement['is_cold'] + + # Memory usage if available + if 'memory_used_mb' in measurement: + result.stats.memory_used = measurement['memory_used_mb'] + + # Store the full measurement for later analysis + result.output['measurement'] = measurement + + self.logging.debug(f"Extracted measurements: {measurement}") + + return result - def async_invoke(self, payload: dict) -> object: + def async_invoke(self, payload: dict) -> concurrent.futures.Future: """ Asynchronously invoke a Cloudflare Worker via HTTP. - Not typically needed for Cloudflare Workers. 
""" - raise NotImplementedError("Cloudflare Workers do not support async HTTP invocation") + pool = concurrent.futures.ThreadPoolExecutor() + fut = pool.submit(self.sync_invoke, payload) + return fut def serialize(self) -> dict: return { diff --git a/sebs/sebs.py b/sebs/sebs.py index febfeb24a..d90512159 100644 --- a/sebs/sebs.py +++ b/sebs/sebs.py @@ -214,10 +214,13 @@ def get_deployment( from sebs.openwhisk import OpenWhisk implementations["openwhisk"] = OpenWhisk - if has_platform("cloudflare"): + + # Cloudflare is available by default (like local) + try: from sebs.cloudflare import Cloudflare - implementations["cloudflare"] = Cloudflare + except ImportError: + pass # Validate deployment platform if name not in implementations: From f7bb950e9123c6487f4b0dc2284ed9166a8f88e4 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 10 Nov 2025 15:40:16 +0100 Subject: [PATCH 009/230] last changes necessary for sebs to run cloudflare. now just the storage is needed and the wrappers have to work --- configs/systems.json | 24 +++++++++++++++++++++++- sebs/cloudflare/cloudflare.py | 24 +++++++++++++++++++++++- sebs/cloudflare/r2.py | 2 +- sebs/cloudflare/resources.py | 3 ++- sebs/faas/config.py | 8 ++++++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/configs/systems.json b/configs/systems.json index a6f8ac186..94190320e 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -450,7 +450,29 @@ "packages": [], "module_packages": {} } + }, + "nodejs": { + "base_images": { + "x64": { + "18": "ubuntu:22.04", + "20": "ubuntu:22.04" + } + }, + "images": [ + "build" + ], + "deployment": { + "files": [ + "handler.js", + "storage.js" + ], + "packages": { + "uuid": "3.4.0" + } + } } - } + }, + "architecture": ["x64"], + "deployments": ["package"] } } diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 366ea59f0..f09e3d89c 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -75,12 +75,34 @@ def 
initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] def _verify_credentials(self): """Verify that the Cloudflare API credentials are valid.""" + # Check if credentials are set + if not self.config.credentials.api_token and not (self.config.credentials.email and self.config.credentials.api_key): + raise RuntimeError( + "Cloudflare API credentials are not set. Please set CLOUDFLARE_API_TOKEN " + "and CLOUDFLARE_ACCOUNT_ID environment variables." + ) + + if not self.config.credentials.account_id: + raise RuntimeError( + "Cloudflare Account ID is not set. Please set CLOUDFLARE_ACCOUNT_ID " + "environment variable." + ) + headers = self._get_auth_headers() + + # Log credential type being used (without exposing the actual token) + if self.config.credentials.api_token: + token_preview = self.config.credentials.api_token[:8] + "..." if len(self.config.credentials.api_token) > 8 else "***" + self.logging.info(f"Using API Token authentication (starts with: {token_preview})") + else: + self.logging.info(f"Using Email + API Key authentication (email: {self.config.credentials.email})") + response = requests.get(f"{self._api_base_url}/user/tokens/verify", headers=headers) if response.status_code != 200: raise RuntimeError( - f"Failed to verify Cloudflare credentials: {response.status_code} - {response.text}" + f"Failed to verify Cloudflare credentials: {response.status_code} - {response.text}\n" + f"Please check that your CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID are correct." 
) self.logging.info("Cloudflare credentials verified successfully") diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 36e2d7a0c..60265ca0d 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -6,7 +6,7 @@ from sebs.faas.config import Resources from sebs.cache import Cache - +from typing import List, Optional class R2(PersistentStorage): @staticmethod def typename() -> str: diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index 0ef145343..fa181ad71 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -28,8 +28,9 @@ def __init__( docker_client: docker.client, logging_handlers: LoggingHandlers, ): - super().__init__(config, cache_client, docker_client, logging_handlers) + super().__init__(config, cache_client, docker_client) self._config = config + self.logging_handlers = logging_handlers @property def config(self) -> CloudflareConfig: diff --git a/sebs/faas/config.py b/sebs/faas/config.py index 8e4e6784b..d1fa07a93 100644 --- a/sebs/faas/config.py +++ b/sebs/faas/config.py @@ -431,6 +431,14 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> "Confi from sebs.openwhisk.config import OpenWhiskConfig implementations["openwhisk"] = OpenWhiskConfig.deserialize + + # Cloudflare is available by default (like local) + try: + from sebs.cloudflare.config import CloudflareConfig + implementations["cloudflare"] = CloudflareConfig.deserialize + except ImportError: + pass + func = implementations.get(name) assert func, "Unknown config type!" return func(config[name] if name in config else config, cache, handlers) From 1f0a9795423d5731f187705d094573e854438c6e Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 11 Nov 2025 17:48:49 +0100 Subject: [PATCH 010/230] javascript wrapper with polyfills reading from r2. 
r2 implementation, resources.py --- .../wrappers/cloudflare/nodejs/fs-polyfill.js | 156 +++++ .../wrappers/cloudflare/nodejs/handler.js | 85 ++- .../cloudflare/nodejs/path-polyfill.js | 54 ++ configs/cloudflare-test.json | 19 + configs/systems.json | 12 +- sebs/cloudflare/cloudflare.py | 556 ++++++++++++++++-- sebs/cloudflare/config.py | 26 +- sebs/cloudflare/durable_objects.py | 201 +++++++ sebs/cloudflare/r2.py | 288 +++++++-- sebs/cloudflare/resources.py | 27 +- sebs/cloudflare/triggers.py | 25 +- 11 files changed, 1300 insertions(+), 149 deletions(-) create mode 100644 benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js create mode 100644 benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js create mode 100644 configs/cloudflare-test.json create mode 100644 sebs/cloudflare/durable_objects.py diff --git a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js new file mode 100644 index 000000000..dac01a115 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js @@ -0,0 +1,156 @@ +/** + * fs polyfill for Cloudflare Workers that reads files from R2 bucket + * + * This polyfill provides a subset of Node.js fs API for reading files + * stored in an R2 bucket. The R2 bucket binding is accessed via + * globalThis.R2_BUCKET which is set by the handler. 
+ */ + +/** + * Read a file from R2 bucket + * @param {string} path - File path in R2 bucket (e.g., 'templates/index.html') + * @param {string|object|function} encoding - Encoding or options or callback + * @param {function} callback - Callback function (err, data) + */ +export async function readFile(path, encoding, callback) { + // Handle overloaded arguments: readFile(path, callback) or readFile(path, encoding, callback) + let actualEncoding = 'utf8'; + let actualCallback = callback; + + if (typeof encoding === 'function') { + actualCallback = encoding; + actualEncoding = 'utf8'; + } else if (typeof encoding === 'string') { + actualEncoding = encoding; + } else if (typeof encoding === 'object' && encoding !== null && encoding.encoding) { + actualEncoding = encoding.encoding; + } + + try { + // Check if R2 bucket is available + if (!globalThis.R2_BUCKET) { + throw new Error('R2 bucket not available. Ensure R2 binding is configured in wrangler.toml'); + } + + // Normalize path: remove leading './' or '/' + let normalizedPath = path.replace(/^\.?\//, ''); + + // Prepend benchmark name if available + if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + } + + // Get object from R2 + const object = await globalThis.R2_BUCKET.get(normalizedPath); + + if (!object) { + throw new Error(`ENOENT: no such file or directory, open '${path}' (R2 key: ${normalizedPath})`); + } + + // Read the content + let content; + if (actualEncoding === 'utf8' || actualEncoding === 'utf-8') { + content = await object.text(); + } else if (actualEncoding === 'buffer' || actualEncoding === null) { + content = await object.arrayBuffer(); + } else { + // For other encodings, get text and let caller handle conversion + content = await object.text(); + } + + if (actualCallback) { + actualCallback(null, content); + } + return content; + } catch (err) { + if (actualCallback) { + 
actualCallback(err, null); + } else { + throw err; + } + } +} + +/** + * Synchronous version of readFile (not truly sync in Workers, but returns a Promise) + * Note: This is a compatibility shim - it still returns a Promise + */ +export function readFileSync(path, encoding) { + return new Promise((resolve, reject) => { + readFile(path, encoding || 'utf8', (err, data) => { + if (err) reject(err); + else resolve(data); + }); + }); +} + +/** + * Check if a file exists in R2 + */ +export async function exists(path, callback) { + try { + if (!globalThis.R2_BUCKET) { + if (callback) callback(false); + return false; + } + + let normalizedPath = path.replace(/^\.?\//, ''); + + if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + } + + const object = await globalThis.R2_BUCKET.head(normalizedPath); + + const result = object !== null; + if (callback) callback(result); + return result; + } catch (err) { + if (callback) callback(false); + return false; + } +} + +/** + * Get file stats from R2 + */ +export async function stat(path, callback) { + try { + if (!globalThis.R2_BUCKET) { + throw new Error('R2 bucket not available'); + } + + let normalizedPath = path.replace(/^\.?\//, ''); + + if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + } + + const object = await globalThis.R2_BUCKET.head(normalizedPath); + + if (!object) { + throw new Error(`ENOENT: no such file or directory, stat '${path}'`); + } + + const stats = { + size: object.size, + isFile: () => true, + isDirectory: () => false, + mtime: object.uploaded, + }; + + if (callback) callback(null, stats); + return stats; + } catch (err) { + if (callback) callback(err, null); + else throw err; + } +} + +// Export default object with all methods for CommonJS-style usage +export default { + readFile, + 
readFileSync, + exists, + stat, +}; diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index a5a309dbc..463bbd8e2 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -1,13 +1,57 @@ +// Simple CommonJS polyfill for Cloudflare Workers +// This allows us to load CommonJS modules that use require() and module.exports +const moduleCache = {}; + +function createRequire(currentModule) { + return function require(modulePath) { + if (moduleCache[modulePath]) { + return moduleCache[modulePath].exports; + } + + // Create module object + const module = { exports: {} }; + moduleCache[modulePath] = module; + + // This is a placeholder - actual module loading would happen here + // For our use case, we'll manually register modules below + throw new Error(`Module ${modulePath} not found in polyfill cache`); + }; +} + +// Polyfill for __dirname and __filename if not available +if (typeof globalThis.__dirname === 'undefined') { + globalThis.__dirname = '.'; +} + +if (typeof globalThis.__filename === 'undefined') { + globalThis.__filename = './handler.js'; +} + +if (typeof globalThis.require === 'undefined') { + globalThis.require = createRequire(globalThis); +} + + + export default { async fetch(request, env) { - // Match behavior of the Python handler: parse body, parse URL params, - // set request-id and income timestamp, call the benchmark function, - // and return a JSON response with the same fields. 
+ try { + // Store R2 bucket binding and benchmark name in globals for fs-polyfill access + if (env.R2) { + globalThis.R2_BUCKET = env.R2; + } + if (env.BENCHMARK_NAME) { + globalThis.BENCHMARK_NAME = env.BENCHMARK_NAME; + } - if (request.url.includes('favicon')) { - return new Response('None'); - } + // Match behavior of the Python handler: parse body, parse URL params, + // set request-id and income timestamp, call the benchmark function, + // and return a JSON response with the same fields. + + if (request.url.includes('favicon')) { + return new Response('None'); + } const req_text = await request.text(); let event = {}; @@ -50,17 +94,13 @@ export default { event['income-timestamp'] = income_timestamp; // Load the benchmark function module and initialize storage if available + // With nodejs_compat enabled, we can use require() for CommonJS modules let funcModule; try { - // dynamic import to work in Workers ESM runtime + // Fallback to dynamic import for ES modules funcModule = await import('./function.js'); - } catch (e) { - try { - // fallback without .js - funcModule = await import('./function'); - } catch (e2) { - throw new Error('Failed to import benchmark function module: ' + e2.message); - } + } catch (e2) { + throw new Error('Failed to import benchmark function module: ' + e2.message); } // If the function module exposes a storage initializer, call it @@ -130,5 +170,22 @@ export default { }); return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); + } catch (topLevelError) { + // Catch any uncaught errors (module loading, syntax errors, etc.) + const errorPayload = JSON.stringify({ + begin: '0', + end: '0', + results_time: '0', + result: { output: null }, + is_cold: false, + is_cold_worker: false, + container_id: '0', + environ_container_id: 'no_id', + request_id: '0', + error: `Top-level error: ${topLevelError && topLevelError.message ? 
topLevelError.message : String(topLevelError)}`, + stack: topLevelError && topLevelError.stack ? topLevelError.stack : undefined, + }); + return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); + } }, }; diff --git a/benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js new file mode 100644 index 000000000..4c77c2be9 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js @@ -0,0 +1,54 @@ +/** + * Minimal POSIX-like path.resolve polyfill for Cloudflare Workers / browser. + * Always returns an absolute path starting with '/'. + * Does not use process.cwd(); root '/' is the base. + */ + +function normalizeSegments(segments) { + const out = []; + for (const seg of segments) { + if (!seg || seg === '.') continue; + if (seg === '..') { + if (out.length && out[out.length - 1] !== '..') out.pop(); + continue; + } + out.push(seg); + } + return out; +} + +/** + * Resolve path segments into an absolute path. 
+ * @param {...string} input + * @returns {string} + */ +export function resolve(...input) { + if (!input.length) return '/'; + let absoluteFound = false; + const segments = []; + + for (let i = input.length - 1; i >= 0; i--) { + let part = String(input[i]); + if (part === '') continue; + // Normalize backslashes to forward slashes (basic win compatibility) + part = part.replace(/\\/g, '/'); + + if (part[0] === '/') { + absoluteFound = true; + part = part.slice(1); // drop leading '/' to just collect segments + } + const split = part.split('/'); + for (let j = split.length - 1; j >= 0; j--) { + const seg = split[j]; + if (seg) segments.push(seg); + } + if (absoluteFound) break; + } + + const normalized = normalizeSegments(segments.reverse()); + return '/' + normalized.join('/'); +} + +// Optional convenience exports similar to Node's path.posix interface +const path = { resolve }; +export default path; \ No newline at end of file diff --git a/configs/cloudflare-test.json b/configs/cloudflare-test.json new file mode 100644 index 000000000..af98daff4 --- /dev/null +++ b/configs/cloudflare-test.json @@ -0,0 +1,19 @@ +{ + "experiments": { + "deployment": "cloudflare", + "update_code": false, + "update_storage": false, + "download_results": false, + "architecture": "x64", + "container_deployment": false, + "runtime": { + "language": "nodejs", + "version": "18" + } + }, + "deployment": { + "name": "cloudflare", + "cloudflare": { + } + } +} diff --git a/configs/systems.json b/configs/systems.json index 94190320e..073f564d1 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -438,9 +438,7 @@ "3.12": "ubuntu:22.04" } }, - "images": [ - "build" - ], + "images": [], "deployment": { "files": [ "handler.py", @@ -458,13 +456,13 @@ "20": "ubuntu:22.04" } }, - "images": [ - "build" - ], + "images": [], "deployment": { "files": [ "handler.js", - "storage.js" + "storage.js", + "fs-polyfill.js", + "path-polyfill.js" ], "packages": { "uuid": "3.4.0" diff --git 
a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index f09e3d89c..404acf1eb 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1,6 +1,8 @@ import os import shutil import json +import uuid +import subprocess from typing import cast, Dict, List, Optional, Tuple, Type import docker @@ -15,6 +17,7 @@ from sebs.utils import LoggingHandlers from sebs.faas.function import Function, ExecutionResult, Trigger, FunctionConfig from sebs.faas.system import System +from sebs.faas.config import Resources class Cloudflare(System): @@ -60,6 +63,10 @@ def __init__( self.logging_handlers = logger_handlers self._config = config self._api_base_url = "https://api.cloudflare.com/client/v4" + # cached workers.dev subdomain for the account (e.g. 'marcin-copik') + # This is different from the account ID and is required to build + # public worker URLs like ..workers.dev + self._workers_dev_subdomain: Optional[str] = None def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): """ @@ -72,6 +79,45 @@ def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] # Verify credentials are valid self._verify_credentials() self.initialize_resources(select_prefix=resource_prefix) + + def initialize_resources(self, select_prefix: Optional[str] = None): + """ + Initialize Cloudflare resources. + + Overrides the base class method to handle R2 storage gracefully. + Cloudflare Workers can operate without R2 storage for many benchmarks. 
+ + Args: + select_prefix: Optional prefix for resource naming + """ + deployments = self.find_deployments() + + # Check if we have an existing deployment + if deployments: + res_id = deployments[0] + self.config.resources.resources_id = res_id + self.logging.info(f"Using existing resource deployment {res_id}") + return + + # Create new resource ID + if select_prefix is not None: + res_id = f"{select_prefix}-{str(uuid.uuid1())[0:8]}" + else: + res_id = str(uuid.uuid1())[0:8] + + self.config.resources.resources_id = res_id + self.logging.info(f"Generating unique resource name {res_id}") + + # Try to create R2 bucket, but don't fail if R2 is not enabled + try: + self.system_resources.get_storage().get_bucket(Resources.StorageBucketType.BENCHMARKS) + self.logging.info("R2 storage initialized successfully") + except Exception as e: + self.logging.warning( + f"R2 storage initialization failed: {e}. " + f"R2 must be enabled in your Cloudflare dashboard to use storage-dependent benchmarks. " + f"Continuing without R2 storage - only benchmarks that don't require storage will work." 
+ ) def _verify_credentials(self): """Verify that the Cloudflare API credentials are valid.""" @@ -107,6 +153,117 @@ def _verify_credentials(self): self.logging.info("Cloudflare credentials verified successfully") + def _ensure_wrangler_installed(self): + """Ensure Wrangler CLI is installed and available.""" + try: + result = subprocess.run( + ["wrangler", "--version"], + capture_output=True, + text=True, + check=True, + timeout=10 + ) + version = result.stdout.strip() + self.logging.info(f"Wrangler is installed: {version}") + except (subprocess.CalledProcessError, FileNotFoundError): + self.logging.info("Wrangler not found, installing globally via npm...") + try: + result = subprocess.run( + ["npm", "install", "-g", "wrangler"], + capture_output=True, + text=True, + check=True, + timeout=120 + ) + self.logging.info("Wrangler installed successfully") + if result.stdout: + self.logging.debug(f"npm install wrangler output: {result.stdout}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to install Wrangler: {e.stderr}") + except FileNotFoundError: + raise RuntimeError( + "npm not found. Please install Node.js and npm to use Wrangler for deployment." + ) + except subprocess.TimeoutExpired: + raise RuntimeError("Wrangler version check timed out") + + def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None) -> str: + """ + Generate a wrangler.toml configuration file for the worker. 
+ + Args: + worker_name: Name of the worker + package_dir: Directory containing the worker code + language: Programming language (nodejs or python) + account_id: Cloudflare account ID + benchmark_name: Optional benchmark name for R2 file path prefix + + Returns: + Path to the generated wrangler.toml file + """ + main_file = "handler.js" if language == "nodejs" else "handler.py" + + # Build wrangler.toml content + toml_content = f"""name = "{worker_name}" +main = "{main_file}" +compatibility_date = "2024-11-01" +account_id = "{account_id}" + +""" + + # Add environment variable for benchmark name (used by fs-polyfill for R2 paths) + if benchmark_name: + toml_content += f"""# Benchmark name used for R2 file path prefix +[vars] +BENCHMARK_NAME = "{benchmark_name}" + +""" + + # Add R2 bucket binding for benchmarking files (required for fs/path polyfills) + r2_bucket_configured = False + try: + storage = self.system_resources.get_storage() + bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + if bucket_name: + toml_content += f"""# R2 bucket binding for benchmarking files +# This bucket is used by fs and path polyfills to read benchmark data +[[r2_buckets]] +binding = "R2" +bucket_name = "{bucket_name}" + +""" + r2_bucket_configured = True + self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") + except Exception as e: + self.logging.warning( + f"R2 bucket binding not configured: {e}. " + f"Benchmarks requiring file access will not work properly." 
+ ) + + # Add compatibility flags based on language + if language == "nodejs": + toml_content += """# Custom polyfills for fs and path that read from R2 bucket +[alias] +"fs" = "./fs-polyfill" +"path" = "./path-polyfill" +""" + if r2_bucket_configured: + self.logging.info( + "fs and path polyfills configured to use R2 bucket for file operations" + ) + elif language == "python": + toml_content += """# Enable Python Workers runtime +compatibility_flags = ["python_workers"] +""" + + # Write wrangler.toml to package directory + toml_path = os.path.join(package_dir, "wrangler.toml") + with open(toml_path, 'w') as f: + f.write(toml_content) + + self.logging.info(f"Generated wrangler.toml at {toml_path}") + return toml_path + def _get_auth_headers(self) -> Dict[str, str]: """Get authentication headers for Cloudflare API requests.""" if self.config.credentials.api_token: @@ -123,6 +280,112 @@ def _get_auth_headers(self) -> Dict[str, str]: else: raise RuntimeError("Invalid Cloudflare credentials configuration") + def _convert_templates_to_modules(self, directory: str): + """ + Convert template files to JavaScript modules for bundling. + + Searches for template directories and converts HTML/text files + to JavaScript modules that can be imported. 
+ + Args: + directory: Package directory to search for templates + """ + templates_dir = os.path.join(directory, "templates") + if not os.path.exists(templates_dir): + return + + self.logging.info(f"Converting template files in {templates_dir} to JavaScript modules") + + for root, dirs, files in os.walk(templates_dir): + for file in files: + if file.endswith(('.html', '.txt', '.xml', '.csv')): + file_path = os.path.join(root, file) + rel_path = os.path.relpath(file_path, directory) + + # Read the template content + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Escape for JavaScript string + content_escaped = (content + .replace('\\', '\\\\') + .replace('`', '\\`') + .replace('$', '\\$')) + + # Create a .js module file next to the template + module_path = file_path + '.js' + with open(module_path, 'w', encoding='utf-8') as f: + f.write(f'export default `{content_escaped}`;\n') + + self.logging.debug(f"Created template module: {module_path}") + + def _upload_benchmark_files_to_r2(self, directory: str, benchmark_name: str) -> int: + """ + Upload benchmark data files to R2 bucket for fs-polyfill access. + + This allows the fs-polyfill to read files from R2 instead of trying + to bundle them with the worker code. 
+ + Args: + directory: Package directory containing files to upload + benchmark_name: Name of the benchmark (used as prefix in R2) + + Returns: + Number of files uploaded + """ + try: + storage = self.system_resources.get_storage() + bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + + if not bucket_name: + self.logging.warning("R2 bucket not configured, skipping file upload") + return 0 + + uploaded_count = 0 + + # Upload template files + templates_dir = os.path.join(directory, "templates") + if os.path.exists(templates_dir): + for root, dirs, files in os.walk(templates_dir): + for file in files: + # Skip the .js module files we created + if file.endswith('.js'): + continue + + file_path = os.path.join(root, file) + # Create R2 key: benchmark_name/templates/filename + rel_path = os.path.relpath(file_path, directory) + r2_key = f"{benchmark_name}/{rel_path}" + + try: + with open(file_path, 'rb') as f: + file_content = f.read() + + self.logging.info(f"Uploading {rel_path} to R2 as {r2_key}...") + storage.upload_bytes( + bucket_name, + r2_key, + file_content + ) + uploaded_count += 1 + self.logging.info(f"✓ Uploaded {rel_path} ({len(file_content)} bytes)") + except Exception as e: + self.logging.error(f"✗ Failed to upload {rel_path} to R2: {e}") + + if uploaded_count > 0: + self.logging.info( + f"Uploaded {uploaded_count} benchmark files to R2 bucket '{bucket_name}'" + ) + + return uploaded_count + + except Exception as e: + self.logging.warning( + f"Could not upload benchmark files to R2: {e}. " + f"fs-polyfill will not be able to read files from R2." + ) + return 0 + def package_code( self, directory: str, @@ -134,10 +397,9 @@ def package_code( container_deployment: bool, ) -> Tuple[str, int, str]: """ - Package code for Cloudflare Workers deployment. + Package code for Cloudflare Workers deployment using Wrangler. - Cloudflare Workers support JavaScript/TypeScript and use a bundler - to create a single JavaScript file for deployment. 
+ Uses Wrangler CLI to bundle dependencies and prepare for deployment. Args: directory: Path to the code directory @@ -156,13 +418,82 @@ def package_code( "Container deployment is not supported for Cloudflare Workers" ) - # For now, we'll create a simple package structure - # In a full implementation, you'd use a bundler like esbuild or webpack + # Ensure Wrangler is installed + self._ensure_wrangler_installed() + + # Upload benchmark files to R2 for fs-polyfill access + if language_name == "nodejs": + uploaded = self._upload_benchmark_files_to_r2(directory, benchmark) + if uploaded > 0: + self.logging.info(f"Successfully uploaded {uploaded} files to R2") + else: + self.logging.warning( + "No files were uploaded to R2. Benchmarks requiring file access may fail. " + "Ensure R2 API credentials are configured." + ) + + # Install dependencies + if language_name == "nodejs": + package_file = os.path.join(directory, "package.json") + node_modules = os.path.join(directory, "node_modules") + + # Only install if package.json exists and node_modules doesn't + if os.path.exists(package_file) and not os.path.exists(node_modules): + self.logging.info(f"Installing Node.js dependencies in {directory}") + try: + result = subprocess.run( + ["npm", "install", "--production"], + cwd=directory, + capture_output=True, + text=True, + check=True, + timeout=120 + ) + self.logging.info("npm install completed successfully") + if result.stdout: + self.logging.debug(f"npm output: {result.stdout}") + except subprocess.TimeoutExpired: + self.logging.error("npm install timed out") + raise RuntimeError("Failed to install Node.js dependencies: timeout") + except subprocess.CalledProcessError as e: + self.logging.error(f"npm install failed: {e.stderr}") + raise RuntimeError(f"Failed to install Node.js dependencies: {e.stderr}") + except FileNotFoundError: + raise RuntimeError( + "npm not found. Please install Node.js and npm to deploy Node.js benchmarks." 
+ ) + elif os.path.exists(node_modules): + self.logging.info(f"Node.js dependencies already installed in {directory}") + + elif language_name == "python": + requirements_file = os.path.join(directory, "requirements.txt") + if os.path.exists(requirements_file): + self.logging.info(f"Installing Python dependencies in {directory}") + try: + # Install to a local directory that can be bundled + target_dir = os.path.join(directory, "python_modules") + result = subprocess.run( + ["pip", "install", "-r", "requirements.txt", "-t", target_dir], + cwd=directory, + capture_output=True, + text=True, + check=True + ) + self.logging.info("pip install completed successfully") + if result.stdout: + self.logging.debug(f"pip output: {result.stdout}") + except subprocess.CalledProcessError as e: + self.logging.error(f"pip install failed: {e.stderr}") + raise RuntimeError(f"Failed to install Python dependencies: {e.stderr}") + except FileNotFoundError: + raise RuntimeError( + "pip not found. Please install Python and pip to deploy Python benchmarks." 
+ ) + # Create package structure CONFIG_FILES = { "nodejs": ["handler.js", "package.json", "node_modules"], - # Python support via Python Workers is limited - "python": ["handler.py", "requirements.txt"], + "python": ["handler.py", "requirements.txt", "python_modules"], } if language_name not in CONFIG_FILES: @@ -170,34 +501,32 @@ def package_code( f"Language {language_name} is not yet supported for Cloudflare Workers" ) - package_config = CONFIG_FILES[language_name] - - # Create a worker directory with the necessary files - worker_dir = os.path.join(directory, "worker") - os.makedirs(worker_dir, exist_ok=True) - - # Copy all files to worker directory - for file in os.listdir(directory): - if file not in package_config and file != "worker": - src = os.path.join(directory, file) - dst = os.path.join(worker_dir, file) - if os.path.isfile(src): - shutil.copy2(src, dst) - elif os.path.isdir(src): - shutil.copytree(src, dst, dirs_exist_ok=True) - - # For now, return the main handler file as the package + # Verify the handler exists handler_file = "handler.js" if language_name == "nodejs" else "handler.py" package_path = os.path.join(directory, handler_file) if not os.path.exists(package_path): - raise RuntimeError(f"Handler file {handler_file} not found in {directory}") + if not os.path.exists(directory): + raise RuntimeError( + f"Package directory {directory} does not exist. " + "The benchmark build process may have failed to create the deployment package." + ) + raise RuntimeError( + f"Handler file {handler_file} not found in {directory}. 
" + f"Available files: {', '.join(os.listdir(directory)) if os.path.exists(directory) else 'none'}" + ) - bytes_size = os.path.getsize(package_path) - mbytes = bytes_size / 1024.0 / 1024.0 + # Calculate total size of the package directory + total_size = 0 + for dirpath, dirnames, filenames in os.walk(directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + total_size += os.path.getsize(filepath) + + mbytes = total_size / 1024.0 / 1024.0 self.logging.info(f"Worker package size: {mbytes:.2f} MB") - return (package_path, bytes_size, "") + return (directory, total_size, "") def create_function( self, @@ -256,12 +585,8 @@ def create_function( else: self.logging.info(f"Creating new worker {func_name}") - # Read the worker script - with open(package, 'r') as f: - script_content = f.read() - - # Create the worker - self._create_or_update_worker(func_name, script_content, account_id) + # Create the worker with all package files + self._create_or_update_worker(func_name, package, account_id, language, benchmark) worker = CloudflareWorker( func_name, @@ -279,9 +604,10 @@ def create_function( library_trigger = LibraryTrigger(func_name, self) library_trigger.logging_handlers = self.logging_handlers worker.add_trigger(library_trigger) - - # Cloudflare Workers are automatically accessible via HTTPS - worker_url = f"https://{func_name}.{account_id}.workers.dev" + + # Build worker URL using the account's workers.dev subdomain when possible. + # Falls back to account_id-based host or plain workers.dev with warnings. 
+ worker_url = self._build_workers_dev_url(func_name, account_id) http_trigger = HTTPTrigger(func_name, worker_url) http_trigger.logging_handlers = self.logging_handlers worker.add_trigger(http_trigger) @@ -296,37 +622,141 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: response = requests.get(url, headers=headers) if response.status_code == 200: - return response.json().get("result") + try: + return response.json().get("result") + except: + return None elif response.status_code == 404: return None else: - raise RuntimeError( - f"Failed to check worker existence: {response.status_code} - {response.text}" - ) + self.logging.warning(f"Unexpected response checking worker: {response.status_code}") + return None def _create_or_update_worker( - self, worker_name: str, script_content: str, account_id: str + self, worker_name: str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None ) -> dict: - """Create or update a Cloudflare Worker.""" - headers = self._get_auth_headers() - # Remove Content-Type as we're sending form data - headers.pop("Content-Type", None) + """Create or update a Cloudflare Worker using Wrangler CLI. 
- url = f"{self._api_base_url}/accounts/{account_id}/workers/scripts/{worker_name}" + Args: + worker_name: Name of the worker + package_dir: Directory containing handler and all benchmark files + account_id: Cloudflare account ID + language: Programming language (nodejs or python) + benchmark_name: Optional benchmark name for R2 file path prefix - # Cloudflare Workers API expects the script as form data - files = { - 'script': ('worker.js', script_content, 'application/javascript'), - } + Returns: + Worker deployment result + """ + # Generate wrangler.toml for this worker + self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name) + + # Set up environment for Wrangler + env = os.environ.copy() + if self.config.credentials.api_token: + env['CLOUDFLARE_API_TOKEN'] = self.config.credentials.api_token + elif self.config.credentials.email and self.config.credentials.api_key: + env['CLOUDFLARE_EMAIL'] = self.config.credentials.email + env['CLOUDFLARE_API_KEY'] = self.config.credentials.api_key - response = requests.put(url, headers=headers, files=files) + env['CLOUDFLARE_ACCOUNT_ID'] = account_id - if response.status_code not in [200, 201]: - raise RuntimeError( - f"Failed to create/update worker: {response.status_code} - {response.text}" - ) + # Deploy using Wrangler + self.logging.info(f"Deploying worker {worker_name} using Wrangler...") - return response.json().get("result", {}) + try: + result = subprocess.run( + ["wrangler", "deploy"], + cwd=package_dir, + env=env, + capture_output=True, + text=True, + check=True, + timeout=180 # 3 minutes for deployment + ) + + self.logging.info(f"Worker {worker_name} deployed successfully") + if result.stdout: + self.logging.debug(f"Wrangler deploy output: {result.stdout}") + + # Parse the output to get worker URL + # Wrangler typically outputs: "Published {worker_name} ({elapsed_time})" + # and "https://{worker_name}.{subdomain}.workers.dev" + + return {"success": True, "output": result.stdout} + + except subprocess.TimeoutExpired: + raise
RuntimeError(f"Wrangler deployment timed out for worker {worker_name}") + except subprocess.CalledProcessError as e: + error_msg = f"Wrangler deployment failed for worker {worker_name}" + if e.stderr: + error_msg += f": {e.stderr}" + self.logging.error(error_msg) + raise RuntimeError(error_msg) + + def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: + """Fetch the workers.dev subdomain for the given account. + + Cloudflare exposes an endpoint that returns the account-level workers + subdomain (the readable name used in *.workers.dev), e.g. + GET /accounts/{account_id}/workers/subdomain + + Returns the subdomain string (e.g. 'marcin-copik') or None on failure. + """ + if self._workers_dev_subdomain: + return self._workers_dev_subdomain + + try: + headers = self._get_auth_headers() + url = f"{self._api_base_url}/accounts/{account_id}/workers/subdomain" + resp = requests.get(url, headers=headers) + if resp.status_code == 200: + body = resp.json() + sub = None + # result may contain 'subdomain' or nested structure + if isinstance(body, dict): + sub = body.get("result", {}).get("subdomain") + + if sub: + self._workers_dev_subdomain = sub + return sub + else: + self.logging.warning( + "Could not find workers.dev subdomain in API response; " + "please enable the workers.dev subdomain in your Cloudflare dashboard." + ) + return None + else: + self.logging.warning( + f"Failed to fetch workers.dev subdomain: {resp.status_code} - {resp.text}" + ) + return None + except Exception as e: + self.logging.warning(f"Error fetching workers.dev subdomain: {e}") + return None + + def _build_workers_dev_url(self, worker_name: str, account_id: Optional[str]) -> str: + """Build a best-effort public URL for a worker. + + Prefer using the account's readable workers.dev subdomain when available + (e.g. {worker_name}.{subdomain}.workers.dev). If we can't obtain that, fall + back to using the account_id as a last resort and log a warning.
+ """ + if account_id: + sub = self._get_workers_dev_subdomain(account_id) + if sub: + return f"https://{worker_name}.{sub}.workers.dev" + else: + # fallback: some code historically used account_id in the host + self.logging.warning( + "Using account ID in workers.dev URL as a fallback. " + "Enable the workers.dev subdomain in Cloudflare for proper URLs." + ) + return f"https://{worker_name}.{account_id}.workers.dev" + # Last fallback: plain workers.dev (may not resolve without a subdomain) + self.logging.warning( + "No account ID available; using https://{name}.workers.dev which may not be reachable." + ) + return f"https://{worker_name}.workers.dev" def cached_function(self, function: Function): """ @@ -369,17 +799,15 @@ def update_function( worker = cast(CloudflareWorker, function) package = code_package.code_location + language = code_package.language_name + benchmark = code_package.benchmark - # Read the updated script - with open(package, 'r') as f: - script_content = f.read() - - # Update the worker + # Update the worker with all package files account_id = worker.account_id or self.config.credentials.account_id if not account_id: raise RuntimeError("Account ID is required to update worker") - self._create_or_update_worker(worker.name, script_content, account_id) + self._create_or_update_worker(worker.name, package, account_id, language, benchmark) self.logging.info(f"Updated worker {worker.name}") # Update configuration if needed @@ -604,7 +1032,7 @@ def create_trigger( return trigger elif trigger_type == Trigger.TriggerType.HTTP: account_id = worker.account_id or self.config.credentials.account_id - worker_url = f"https://{worker.name}.{account_id}.workers.dev" + worker_url = self._build_workers_dev_url(worker.name, account_id) trigger = HTTPTrigger(worker.name, worker_url) trigger.logging_handlers = self.logging_handlers return trigger diff --git a/sebs/cloudflare/config.py b/sebs/cloudflare/config.py index 4e04ee137..b75c52ad8 100644 --- 
a/sebs/cloudflare/config.py +++ b/sebs/cloudflare/config.py @@ -13,16 +13,20 @@ class CloudflareCredentials(Credentials): Requires: - API token or email + global API key - Account ID + - Optional: R2 S3-compatible credentials for file uploads """ def __init__(self, api_token: Optional[str] = None, email: Optional[str] = None, - api_key: Optional[str] = None, account_id: Optional[str] = None): + api_key: Optional[str] = None, account_id: Optional[str] = None, + r2_access_key_id: Optional[str] = None, r2_secret_access_key: Optional[str] = None): super().__init__() self._api_token = api_token self._email = email self._api_key = api_key self._account_id = account_id + self._r2_access_key_id = r2_access_key_id + self._r2_secret_access_key = r2_secret_access_key @staticmethod def typename() -> str: @@ -44,13 +48,23 @@ def api_key(self) -> Optional[str]: def account_id(self) -> Optional[str]: return self._account_id + @property + def r2_access_key_id(self) -> Optional[str]: + return self._r2_access_key_id + + @property + def r2_secret_access_key(self) -> Optional[str]: + return self._r2_secret_access_key + @staticmethod def initialize(dct: dict) -> "CloudflareCredentials": return CloudflareCredentials( dct.get("api_token"), dct.get("email"), dct.get("api_key"), - dct.get("account_id") + dct.get("account_id"), + dct.get("r2_access_key_id"), + dct.get("r2_secret_access_key") ) @staticmethod @@ -69,13 +83,17 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden elif "CLOUDFLARE_API_TOKEN" in os.environ: ret = CloudflareCredentials( api_token=os.environ["CLOUDFLARE_API_TOKEN"], - account_id=os.environ.get("CLOUDFLARE_ACCOUNT_ID") + account_id=os.environ.get("CLOUDFLARE_ACCOUNT_ID"), + r2_access_key_id=os.environ.get("CLOUDFLARE_R2_ACCESS_KEY_ID"), + r2_secret_access_key=os.environ.get("CLOUDFLARE_R2_SECRET_ACCESS_KEY") ) elif "CLOUDFLARE_EMAIL" in os.environ and "CLOUDFLARE_API_KEY" in os.environ: ret = CloudflareCredentials( 
email=os.environ["CLOUDFLARE_EMAIL"], api_key=os.environ["CLOUDFLARE_API_KEY"], - account_id=os.environ.get("CLOUDFLARE_ACCOUNT_ID") + account_id=os.environ.get("CLOUDFLARE_ACCOUNT_ID"), + r2_access_key_id=os.environ.get("CLOUDFLARE_R2_ACCESS_KEY_ID"), + r2_secret_access_key=os.environ.get("CLOUDFLARE_R2_SECRET_ACCESS_KEY") ) else: raise RuntimeError( diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py new file mode 100644 index 000000000..6e0b05c2a --- /dev/null +++ b/sebs/cloudflare/durable_objects.py @@ -0,0 +1,201 @@ +import json +import requests +from typing import Dict, Optional, Tuple + +from sebs.cloudflare.config import CloudflareCredentials +from sebs.faas.nosql import NoSQLStorage +from sebs.faas.config import Resources +from sebs.cache import Cache + + +class DurableObjects(NoSQLStorage): + """ + Cloudflare Durable Objects implementation for NoSQL storage. + + Note: Durable Objects are not a traditional NoSQL database like DynamoDB or CosmosDB. + They are stateful Workers with persistent storage. This implementation provides + a minimal interface to satisfy SeBS requirements, but full table operations + are not supported. 
+ """ + + @staticmethod + def typename() -> str: + return "Cloudflare.DurableObjects" + + @staticmethod + def deployment_name() -> str: + return "cloudflare" + + def __init__( + self, + region: str, + cache_client: Cache, + resources: Resources, + credentials: CloudflareCredentials, + ): + super().__init__(region, cache_client, resources) + self._credentials = credentials + self._tables: Dict[str, str] = {} + + def _get_auth_headers(self) -> dict[str, str]: + """Get authentication headers for Cloudflare API requests.""" + if self._credentials.api_token: + return { + "Authorization": f"Bearer {self._credentials.api_token}", + "Content-Type": "application/json", + } + elif self._credentials.email and self._credentials.api_key: + return { + "X-Auth-Email": self._credentials.email, + "X-Auth-Key": self._credentials.api_key, + "Content-Type": "application/json", + } + else: + raise RuntimeError("Invalid Cloudflare credentials configuration") + + def get_tables(self, benchmark: str) -> Dict[str, str]: + """ + Get all tables for a benchmark. + + :param benchmark: benchmark name + :return: dictionary mapping table names to their IDs + """ + # For Durable Objects, we don't have traditional tables + # Return cached tables if any + return self._tables.copy() + + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + """ + Get the full table name for a benchmark table. + + :param benchmark: benchmark name + :param table: table name + :return: full table name or None if not found + """ + key = f"{benchmark}:{table}" + return self._tables.get(key) + + def retrieve_cache(self, benchmark: str) -> bool: + """ + Retrieve cached table information. 
+ + :param benchmark: benchmark name + :return: True if cache was found and loaded + """ + cache_key = f"cloudflare.durable_objects.{benchmark}" + cached = self.cache_client.get(cache_key) + + if cached: + self._tables.update(cached) + self.logging.info(f"Retrieved cached Durable Objects tables for {benchmark}") + return True + + return False + + def update_cache(self, benchmark: str): + """ + Update cache with current table information. + + :param benchmark: benchmark name + """ + cache_key = f"cloudflare.durable_objects.{benchmark}" + + # Filter tables for this benchmark + benchmark_tables = { + k: v for k, v in self._tables.items() if k.startswith(f"{benchmark}:") + } + + self.cache_client.update(cache_key, benchmark_tables) + self.logging.info(f"Updated cache for Durable Objects tables for {benchmark}") + + def create_table( + self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + ) -> str: + """ + Create a table (Durable Object namespace). + + Note: Durable Objects don't have traditional table creation via API. + They are defined in the Worker code and wrangler.toml. + This method just tracks the table name. + + :param benchmark: benchmark name + :param name: table name + :param primary_key: primary key field name + :param secondary_key: optional secondary key field name + :return: table name + """ + resource_id = self._cloud_resources.get_resource_id() + table_name = f"sebs-benchmarks-{resource_id}-{benchmark}-{name}" + + key = f"{benchmark}:{name}" + self._tables[key] = table_name + + self.logging.info( + f"Registered Durable Objects table {table_name} for benchmark {benchmark}" + ) + + return table_name + + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + """ + Write data to a table (Durable Object). + + Note: This would require HTTP requests to the Durable Object endpoints. 
+ For now, this is not fully implemented. + + :param benchmark: benchmark name + :param table: table name + :param data: data to write + :param primary_key: primary key (field_name, value) + :param secondary_key: optional secondary key (field_name, value) + """ + table_name = self._get_table_name(benchmark, table) + + if not table_name: + raise ValueError(f"Table {table} not found for benchmark {benchmark}") + + self.logging.warning( + f"write_to_table not fully implemented for Durable Objects table {table_name}" + ) + + def clear_table(self, name: str) -> str: + """ + Clear all data from a table. + + :param name: table name + :return: table name + """ + self.logging.warning(f"clear_table not fully implemented for Durable Objects table {name}") + return name + + def remove_table(self, name: str) -> str: + """ + Remove a table. + + :param name: table name + :return: table name + """ + # Remove from internal tracking + keys_to_remove = [k for k, v in self._tables.items() if v == name] + for key in keys_to_remove: + del self._tables[key] + + self.logging.info(f"Removed Durable Objects table {name} from tracking") + return name + + def envs(self) -> dict: + """ + Get environment variables for accessing Durable Objects. 
+ + :return: dictionary of environment variables + """ + # Durable Objects are accessed via bindings in the Worker + # No additional environment variables needed + return {} diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 60265ca0d..73e047fb7 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -68,42 +68,161 @@ def _create_bucket( account_id = self._credentials.account_id - get_bucket_uri = ( + create_bucket_uri = ( f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" ) - params = {"name": "cloudflare_bucket", "locationHint": self._region} + # R2 API only accepts "name" parameter - locationHint is optional and must be one of: + # "apac", "eeur", "enam", "weur", "wnam" + # For now, just send the name without locationHint + params = {"name": name} - create_bucket_response = requests.post( - get_bucket_uri, json=params, headers=self._get_auth_headers() - ) - bucket_info = create_bucket_response.content.decode("utf-8") - bucket_info_json = json.load(bucket_info) # pyright: ignore + try: + create_bucket_response = requests.post( + create_bucket_uri, json=params, headers=self._get_auth_headers() + ) + + # Log the response for debugging + if create_bucket_response.status_code >= 400: + try: + error_data = create_bucket_response.json() + self.logging.error( + f"R2 bucket creation failed. Status: {create_bucket_response.status_code}, " + f"Response: {error_data}" + ) + except: + self.logging.error( + f"R2 bucket creation failed. 
Status: {create_bucket_response.status_code}, " + f"Response: {create_bucket_response.text}" + ) + + create_bucket_response.raise_for_status() + + bucket_info_json = create_bucket_response.json() - return bucket_info_json.name + if not bucket_info_json.get("success"): + self.logging.error(f"Failed to create R2 bucket: {bucket_info_json.get('errors')}") + raise RuntimeError(f"Failed to create R2 bucket {name}") - """ + bucket_name = bucket_info_json.get("result", {}).get("name", name) + self.logging.info(f"Created R2 bucket {bucket_name}") + return bucket_name + + except requests.exceptions.RequestException as e: + self.logging.error(f"Error creating R2 bucket {name}: {e}") + raise + + def download(self, bucket_name: str, key: str, filepath: str) -> None: + """ Download a file from a bucket. :param bucket_name: :param key: storage source filepath :param filepath: local destination filepath - """ - - def download(self, bucket_name: str, key: str, filepath: str) -> None: + """ + # R2 requires S3-compatible access for object operations + # For now, this is not fully implemented + self.logging.warning(f"download not fully implemented for R2 bucket {bucket_name}") pass - """ - Upload a file to a bucket with by passing caching. - Useful for uploading code package to storage (when required). + def upload(self, bucket_name: str, filepath: str, key: str): + """ + Upload a file to R2 bucket using the S3-compatible API. + + Requires S3 credentials to be configured for the R2 bucket. 
- :param bucket_name: + :param bucket_name: R2 bucket name :param filepath: local source filepath - :param key: storage destination filepath - """ - - def upload(self, bucket_name: str, filepath: str, key: str): - pass + :param key: R2 destination key/path + """ + try: + import boto3 + from botocore.config import Config + + account_id = self._credentials.account_id + + # R2 uses S3-compatible API, but requires special configuration + # The endpoint is: https://{account_id}.r2.cloudflarestorage.com + # You need to create R2 API tokens in the Cloudflare dashboard + + # Check if we have S3-compatible credentials + if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: + self.logging.warning( + "R2 upload requires S3-compatible API credentials (r2_access_key_id, r2_secret_access_key). " + "File upload skipped. Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY." + ) + return + + s3_client = boto3.client( + 's3', + endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com', + aws_access_key_id=self._credentials.r2_access_key_id, + aws_secret_access_key=self._credentials.r2_secret_access_key, + config=Config(signature_version='s3v4'), + region_name='auto' + ) + + with open(filepath, 'rb') as f: + s3_client.put_object( + Bucket=bucket_name, + Key=key, + Body=f + ) + + self.logging.debug(f"Uploaded {filepath} to R2 bucket {bucket_name} as {key}") + + except ImportError: + self.logging.warning( + "boto3 not available. Install with: pip install boto3. " + "File upload to R2 skipped." + ) + except Exception as e: + self.logging.warning(f"Failed to upload {filepath} to R2: {e}") + + def upload_bytes(self, bucket_name: str, key: str, data: bytes): + """ + Upload bytes directly to R2 bucket using the S3-compatible API.
+ + :param bucket_name: R2 bucket name + :param key: R2 destination key/path + :param data: bytes to upload + """ + try: + import boto3 + from botocore.config import Config + + account_id = self._credentials.account_id + + if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: + self.logging.warning( + "R2 upload requires S3-compatible API credentials (r2_access_key_id, r2_secret_access_key). " + "Upload skipped. Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY environment variables." + ) + return + + s3_client = boto3.client( + 's3', + endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com', + aws_access_key_id=self._credentials.r2_access_key_id, + aws_secret_access_key=self._credentials.r2_secret_access_key, + config=Config(signature_version='s3v4'), + region_name='auto' + ) + + s3_client.put_object( + Bucket=bucket_name, + Key=key, + Body=data + ) + + self.logging.debug(f"Uploaded {len(data)} bytes to R2 bucket {bucket_name} as {key}") + + except ImportError: + self.logging.warning( + "boto3 not available. Install with: pip install boto3" + ) + except Exception as e: + self.logging.warning(f"Failed to upload bytes to R2: {e}") """ Retrieves list of files in a bucket. @@ -113,34 +232,123 @@ def upload(self, bucket_name: str, filepath: str, key: str): """ def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: - pass + """ + Retrieves list of files in a bucket. + + :param bucket_name: + :param prefix: optional prefix filter + :return: list of files in a given bucket + """ + account_id = self._credentials.account_id + + # R2 uses S3-compatible API for listing objects + # For now, return empty list as listing objects requires S3 credentials + self.logging.warning(f"list_bucket not fully implemented for R2 bucket {bucket_name}") + return [] def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: - pass + """ + List all R2 buckets in the account. 
+ + :param bucket_name: optional filter (not used for R2) + :return: list of bucket names + """ + account_id = self._credentials.account_id + + list_buckets_uri = ( + f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" + ) + + try: + response = requests.get(list_buckets_uri, headers=self._get_auth_headers()) + + # Log detailed error information + if response.status_code == 403: + try: + error_data = response.json() + self.logging.error( + f"403 Forbidden accessing R2 buckets. " + f"Response: {error_data}. " + f"Your API token may need 'R2 Read and Write' permissions." + ) + except: + self.logging.error( + f"403 Forbidden accessing R2 buckets. " + f"Your API token may need 'R2 Read and Write' permissions." + ) + return [] + + response.raise_for_status() + + data = response.json() + + if not data.get("success"): + self.logging.error(f"Failed to list R2 buckets: {data.get('errors')}") + return [] + + # Extract bucket names from response + buckets = data.get("result", {}).get("buckets", []) + bucket_names = [bucket["name"] for bucket in buckets] + + self.logging.info(f"Found {len(bucket_names)} R2 buckets") + return bucket_names + + except requests.exceptions.RequestException as e: + self.logging.error(f"Error listing R2 buckets: {e}") + return [] def exists_bucket(self, bucket_name: str) -> bool: - pass + """ + Check if a bucket exists. + + :param bucket_name: + :return: True if bucket exists + """ + buckets = self.list_buckets() + return bucket_name in buckets def clean_bucket(self, bucket_name: str): + """ + Remove all objects from a bucket. + + :param bucket_name: + """ + self.logging.warning(f"clean_bucket not fully implemented for R2 bucket {bucket_name}") pass def remove_bucket(self, bucket: str): - pass - - """ - Allocate a set of input/output buckets for the benchmark. - The routine checks the cache first to verify that buckets have not - been allocated first. 
- - :param benchmark: benchmark name - :param buckets: number of input and number of output buckets - """ + """ + Delete a bucket. + + :param bucket: + """ + account_id = self._credentials.account_id + + delete_bucket_uri = ( + f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets/{bucket}" + ) + + try: + response = requests.delete(delete_bucket_uri, headers=self._get_auth_headers()) + response.raise_for_status() + + data = response.json() + + if data.get("success"): + self.logging.info(f"Successfully deleted R2 bucket {bucket}") + else: + self.logging.error(f"Failed to delete R2 bucket {bucket}: {data.get('errors')}") + + except requests.exceptions.RequestException as e: + self.logging.error(f"Error deleting R2 bucket {bucket}: {e}") def uploader_func(self, bucket_idx: int, file: str, filepath: str) -> None: + """ + Upload a file to a bucket (used for parallel uploads). + + :param bucket_idx: index of the bucket to upload to + :param file: destination file name + :param filepath: source file path + """ + self.logging.warning(f"uploader_func not fully implemented for R2") pass - - """ - Download all files in a storage bucket. - Warning: assumes flat directory in a bucket! Does not handle bucket files - with directory marks in a name, e.g. 
'dir1/dir2/file' - """ diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index fa181ad71..1b3d9dbc7 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -5,6 +5,7 @@ from sebs.cache import Cache from sebs.cloudflare.config import CloudflareConfig from sebs.cloudflare.r2 import R2 +from sebs.cloudflare.durable_objects import DurableObjects from sebs.faas.resources import SystemResources from sebs.faas.storage import PersistentStorage from sebs.faas.nosql import NoSQLStorage @@ -62,13 +63,15 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStor Args: replace_existing: Whether to replace existing files in storage - Raises: - NotImplementedError: R2 storage support not yet implemented + Returns: + R2 storage instance """ + if replace_existing is None: + replace_existing = False return R2( region=self._config.region, - cache_client=None, + cache_client=self._cache_client, resources=self._config.resources, replace_existing=replace_existing, credentials=self._config.credentials, @@ -76,15 +79,17 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStor def get_nosql_storage(self) -> NoSQLStorage: """ - Get Cloudflare NoSQL storage instance. + Get Cloudflare Durable Objects storage instance. - This could use Cloudflare D1 (SQLite) or Durable Objects for NoSQL storage. + Durable Objects provide stateful storage for Workers. + Note: This is a minimal implementation to satisfy SeBS requirements. - Raises: - NotImplementedError: NoSQL storage support not yet implemented + Returns: + DurableObjects storage instance """ - raise NotImplementedError( - "Cloudflare NoSQL storage (D1/Durable Objects) is not yet implemented. 
" - "To add support, implement a NoSQLStorage subclass " - "similar to sebs/aws/dynamodb.py or sebs/azure/cosmosdb.py" + return DurableObjects( + region=self._config.region, + cache_client=self._cache_client, + resources=self._config.resources, + credentials=self._config.credentials, ) diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 310f1fa94..f4b926379 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -11,7 +11,8 @@ class LibraryTrigger(Trigger): """ def __init__(self, worker_name: str, deployment_client=None): - super().__init__(worker_name) + super().__init__() + self.worker_name = worker_name self.deployment_client = deployment_client @staticmethod @@ -43,11 +44,17 @@ def async_invoke(self, payload: dict) -> concurrent.futures.Future: raise NotImplementedError("Cloudflare Workers do not support async invocation") def serialize(self) -> dict: - return {**super().serialize()} + """Serialize the LibraryTrigger.""" + return { + "type": self.typename(), + "worker_name": self.worker_name, + } @staticmethod - def deserialize(obj: dict) -> "LibraryTrigger": - return LibraryTrigger(obj["name"]) + def deserialize(cached_config: dict) -> "LibraryTrigger": + """Deserialize a LibraryTrigger from cached config.""" + from sebs.cloudflare.triggers import LibraryTrigger + return LibraryTrigger(cached_config["worker_name"]) class HTTPTrigger(Trigger): @@ -57,7 +64,8 @@ class HTTPTrigger(Trigger): """ def __init__(self, worker_name: str, url: Optional[str] = None): - super().__init__(worker_name) + super().__init__() + self.worker_name = worker_name self._url = url @staticmethod @@ -135,13 +143,12 @@ def async_invoke(self, payload: dict) -> concurrent.futures.Future: def serialize(self) -> dict: return { - **super().serialize(), + "type": self.typename(), + "worker_name": self.worker_name, "url": self._url, } @staticmethod def deserialize(obj: dict) -> "HTTPTrigger": - trigger = HTTPTrigger(obj["name"]) - if "url" in obj: - 
trigger.url = obj["url"] + trigger = HTTPTrigger(obj["worker_name"], obj.get("url")) return trigger From b117e753a91fe359594832b9eb6ddf994fdaa93b Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 11 Nov 2025 18:11:39 +0100 Subject: [PATCH 011/230] adapted handler to measure invocation time --- .../wrappers/cloudflare/nodejs/handler.js | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 463bbd8e2..03038672d 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -53,6 +53,10 @@ export default { return new Response('None'); } + // Start timing measurements + const begin = Date.now() / 1000; + const start = performance.now(); + const req_text = await request.text(); let event = {}; if (req_text && req_text.length > 0) { @@ -126,11 +130,17 @@ export default { throw new Error('benchmark handler function not found'); } } catch (err) { + // Calculate timing even for errors + const end = Date.now() / 1000; + const elapsed = performance.now() - start; + const micro = elapsed * 1000; // Convert milliseconds to microseconds + // Mirror Python behavior: return structured error payload const errorPayload = JSON.stringify({ - begin: '0', - end: '0', - results_time: '0', + begin: begin, + end: end, + compute_time: micro, + results_time: 0, result: { output: null }, is_cold: false, is_cold_worker: false, @@ -142,6 +152,11 @@ export default { return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); } + // Calculate elapsed time + const end = Date.now() / 1000; + const elapsed = performance.now() - start; + const micro = elapsed * 1000; // Convert milliseconds to microseconds + // Build log_data similar to Python handler const log_data = { output: ret && ret.result !== undefined ? 
ret.result : ret }; if (ret && ret.measurement !== undefined) { @@ -158,9 +173,10 @@ export default { } const responseBody = JSON.stringify({ - begin: '0', - end: '0', - results_time: '0', + begin: begin, + end: end, + compute_time: micro, + results_time: 0, result: log_data, is_cold: false, is_cold_worker: false, @@ -172,10 +188,26 @@ export default { return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); } catch (topLevelError) { // Catch any uncaught errors (module loading, syntax errors, etc.) + // Try to include timing if available + let errorBegin = 0; + let errorEnd = 0; + let errorMicro = 0; + try { + errorEnd = Date.now() / 1000; + if (typeof begin !== 'undefined' && typeof start !== 'undefined') { + errorBegin = begin; + const elapsed = performance.now() - start; + errorMicro = elapsed * 1000; + } + } catch (e) { + // Ignore timing errors in error handler + } + const errorPayload = JSON.stringify({ - begin: '0', - end: '0', - results_time: '0', + begin: errorBegin, + end: errorEnd, + compute_time: errorMicro, + results_time: 0, result: { output: null }, is_cold: false, is_cold_worker: false, From d42b157038b00914529b4c1f161bd2b64990ae5e Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 12 Nov 2025 13:21:13 +0100 Subject: [PATCH 012/230] fixed the fs polyfill to also support write operationst to r2 storage, at this point the other benchmarks may be translated to nodejs --- .../wrappers/cloudflare/nodejs/fs-polyfill.js | 100 ++++++++++++++++-- .../cloudflare/nodejs/path-polyfill.js | 54 ---------- configs/systems.json | 3 +- sebs/cloudflare/cloudflare.py | 35 +++--- 4 files changed, 111 insertions(+), 81 deletions(-) delete mode 100644 benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js diff --git a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js index dac01a115..6e06493f1 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js +++ 
b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js @@ -1,10 +1,5 @@ -/** - * fs polyfill for Cloudflare Workers that reads files from R2 bucket - * - * This polyfill provides a subset of Node.js fs API for reading files - * stored in an R2 bucket. The R2 bucket binding is accessed via - * globalThis.R2_BUCKET which is set by the handler. - */ +import * as nodeFs from 'node:fs'; +import { Writable } from 'node:stream'; /** * Read a file from R2 bucket @@ -147,10 +142,99 @@ export async function stat(path, callback) { } } -// Export default object with all methods for CommonJS-style usage +/** + * Create a write stream (memory-buffered, writes to R2 on close) + */ +export function createWriteStream(path, options) { + const chunks = []; + + const stream = new Writable({ + write(chunk, encoding, callback) { + chunks.push(chunk); + callback(); + }, + final(callback) { + // Write to R2 when stream is closed + (async () => { + try { + if (!globalThis.R2_BUCKET) { + throw new Error('R2 bucket not available'); + } + + let normalizedPath = path.replace(/^\.?\//, ''); + + if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + } + + const buffer = Buffer.concat(chunks); + await globalThis.R2_BUCKET.put(normalizedPath, buffer); + callback(); + } catch (err) { + callback(err); + } + })(); + } + }); + + return stream; +} + +/** + * Write file to R2 + */ +export async function writeFile(path, data, options, callback) { + let actualCallback = callback; + let actualOptions = options; + + if (typeof options === 'function') { + actualCallback = options; + actualOptions = {}; + } + + try { + if (!globalThis.R2_BUCKET) { + throw new Error('R2 bucket not available'); + } + + let normalizedPath = path.replace(/^\.?\//, ''); + + if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + normalizedPath = globalThis.BENCHMARK_NAME + '/' + 
normalizedPath; + } + + await globalThis.R2_BUCKET.put(normalizedPath, data); + + if (actualCallback) actualCallback(null); + } catch (err) { + if (actualCallback) actualCallback(err); + else throw err; + } +} + +/** + * Synchronous write file to R2 + */ +export function writeFileSync(path, data, options) { + return new Promise((resolve, reject) => { + writeFile(path, data, options, (err) => { + if (err) reject(err); + else resolve(); + }); + }); +} + +// Export everything from node:fs (what's available), but override specific methods export default { + ...nodeFs, readFile, readFileSync, exists, stat, + createWriteStream, + writeFile, + writeFileSync, }; + +// Also re-export all named exports from node:fs +export * from 'node:fs'; \ No newline at end of file diff --git a/benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js deleted file mode 100644 index 4c77c2be9..000000000 --- a/benchmarks/wrappers/cloudflare/nodejs/path-polyfill.js +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Minimal POSIX-like path.resolve polyfill for Cloudflare Workers / browser. - * Always returns an absolute path starting with '/'. - * Does not use process.cwd(); root '/' is the base. - */ - -function normalizeSegments(segments) { - const out = []; - for (const seg of segments) { - if (!seg || seg === '.') continue; - if (seg === '..') { - if (out.length && out[out.length - 1] !== '..') out.pop(); - continue; - } - out.push(seg); - } - return out; -} - -/** - * Resolve path segments into an absolute path. 
- * @param {...string} input - * @returns {string} - */ -export function resolve(...input) { - if (!input.length) return '/'; - let absoluteFound = false; - const segments = []; - - for (let i = input.length - 1; i >= 0; i--) { - let part = String(input[i]); - if (part === '') continue; - // Normalize backslashes to forward slashes (basic win compatibility) - part = part.replace(/\\/g, '/'); - - if (part[0] === '/') { - absoluteFound = true; - part = part.slice(1); // drop leading '/' to just collect segments - } - const split = part.split('/'); - for (let j = split.length - 1; j >= 0; j--) { - const seg = split[j]; - if (seg) segments.push(seg); - } - if (absoluteFound) break; - } - - const normalized = normalizeSegments(segments.reverse()); - return '/' + normalized.join('/'); -} - -// Optional convenience exports similar to Node's path.posix interface -const path = { resolve }; -export default path; \ No newline at end of file diff --git a/configs/systems.json b/configs/systems.json index 073f564d1..ca75db5c0 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -461,8 +461,7 @@ "files": [ "handler.js", "storage.js", - "fs-polyfill.js", - "path-polyfill.js" + "fs-polyfill.js" ], "packages": { "uuid": "3.4.0" diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 404acf1eb..297349b2e 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -211,6 +211,22 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: """ + + + + # Add compatibility flags based on language + if language == "nodejs": + toml_content += """# Custom polyfills for fs and path that read from R2 bucket +compatibility_flags = ["nodejs_compat"] +[alias] +"fs" = "./fs-polyfill" +""" + elif language == "python": + toml_content += """# Enable Python Workers runtime +compatibility_flags = ["python_workers"] +""" + + # Add environment variable for benchmark name (used by fs-polyfill for R2 paths) if benchmark_name: 
toml_content += f"""# Benchmark name used for R2 file path prefix @@ -218,8 +234,8 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: BENCHMARK_NAME = "{benchmark_name}" """ - - # Add R2 bucket binding for benchmarking files (required for fs/path polyfills) + + # Add R2 bucket binding for benchmarking files (required for fs/path polyfills) r2_bucket_configured = False try: storage = self.system_resources.get_storage() @@ -240,21 +256,6 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: f"Benchmarks requiring file access will not work properly." ) - # Add compatibility flags based on language - if language == "nodejs": - toml_content += """# Custom polyfills for fs and path that read from R2 bucket -[alias] -"fs" = "./fs-polyfill" -"path" = "./path-polyfill" -""" - if r2_bucket_configured: - self.logging.info( - "fs and path polyfills configured to use R2 bucket for file operations" - ) - elif language == "python": - toml_content += """# Enable Python Workers runtime -compatibility_flags = ["python_workers"] -""" # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") From ffd3f786ba9d4d9e9d3ec8cc5882089e6a660bd9 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 12 Nov 2025 15:45:02 +0100 Subject: [PATCH 013/230] added compatibility for benchmarks 100 in nodejs. translated all 100 and 000 to nodejs. fixed support for nosql in wrapper and sebs. 
fixed durable objects implementation for sebs --- .../020.network-benchmark/config.json | 2 +- .../020.network-benchmark/nodejs/function.js | 94 ++++++++++++++ .../020.network-benchmark/nodejs/package.json | 9 ++ .../030.clock-synchronization/config.json | 2 +- .../nodejs/function.js | 115 +++++++++++++++++ .../nodejs/package.json | 9 ++ .../040.server-reply/nodejs/function.js | 31 +++++ .../040.server-reply/nodejs/package.json | 9 ++ .../130.crud-api/nodejs/function.js | 78 ++++++++++++ .../130.crud-api/nodejs/package.json | 9 ++ .../wrappers/cloudflare/nodejs/fs-polyfill.js | 95 ++++++++++---- .../wrappers/cloudflare/nodejs/handler.js | 9 +- .../wrappers/cloudflare/nodejs/nosql.js | 114 +++++++++++++++++ .../cloudflare/nodejs/request-polyfill.js | 96 +++++++++++++++ .../wrappers/cloudflare/nodejs/storage.js | 116 +++++++++++++++++- configs/systems.json | 4 +- sebs/cloudflare/cloudflare.py | 1 + sebs/cloudflare/durable_objects.py | 58 +++++---- 18 files changed, 794 insertions(+), 57 deletions(-) create mode 100644 benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js create mode 100644 benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json create mode 100644 benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js create mode 100644 benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json create mode 100644 benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js create mode 100644 benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json create mode 100644 benchmarks/100.webapps/130.crud-api/nodejs/function.js create mode 100644 benchmarks/100.webapps/130.crud-api/nodejs/package.json create mode 100644 benchmarks/wrappers/cloudflare/nodejs/nosql.js create mode 100644 benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/config.json b/benchmarks/000.microbenchmarks/020.network-benchmark/config.json 
index c3c2c73b1..455933282 100644 --- a/benchmarks/000.microbenchmarks/020.network-benchmark/config.json +++ b/benchmarks/000.microbenchmarks/020.network-benchmark/config.json @@ -1,6 +1,6 @@ { "timeout": 30, "memory": 128, - "languages": ["python"], + "languages": ["python", "nodejs"], "modules": [] } diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js b/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js new file mode 100644 index 000000000..431ccbe39 --- /dev/null +++ b/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js @@ -0,0 +1,94 @@ +const dgram = require('dgram'); +const fs = require('fs'); +const path = require('path'); +const storage = require('./storage'); + +const storage_handler = new storage.storage(); + +function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +exports.handler = async function(event) { + const requestId = event['request-id']; + const address = event['server-address']; + const port = event['server-port']; + const repetitions = event['repetitions']; + const outputBucket = event.bucket.bucket; + const outputPrefix = event.bucket.output; + + const times = []; + let i = 0; + const client = dgram.createSocket('udp4'); + client.bind(); + + const message = Buffer.from(String(requestId)); + let consecutiveFailures = 0; + let key = null; + + while (i < repetitions + 1) { + try { + const sendBegin = Date.now() / 1000; + + await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + reject(new Error('Socket timeout')); + }, 3000); + + client.send(message, port, address, (err) => { + if (err) { + clearTimeout(timeout); + reject(err); + } + }); + + client.once('message', (msg, rinfo) => { + clearTimeout(timeout); + const recvEnd = Date.now() / 1000; + resolve(recvEnd); + }); + }).then((recvEnd) => { + if (i > 0) { + times.push([i, sendBegin, recvEnd]); + } + i++; + consecutiveFailures = 0; + }); + } catch (err) { + i++; + 
consecutiveFailures++; + if (consecutiveFailures === 5) { + console.log("Can't setup the connection"); + break; + } + continue; + } + } + + client.close(); + + if (consecutiveFailures !== 5) { + // Write CSV file using stream + const csvPath = '/tmp/data.csv'; + let csvContent = 'id,client_send,client_rcv\n'; + times.forEach(row => { + csvContent += row.join(',') + '\n'; + }); + + // Use createWriteStream and wait for it to finish + await new Promise((resolve, reject) => { + const writeStream = fs.createWriteStream(csvPath); + writeStream.write(csvContent); + writeStream.end(); + writeStream.on('finish', resolve); + writeStream.on('error', reject); + }); + + const filename = `results-${requestId}.csv`; + let uploadPromise; + [key, uploadPromise] = storage_handler.upload(outputBucket, path.join(outputPrefix, filename), csvPath); + await uploadPromise; + } + + return { result: key }; +}; diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json b/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json new file mode 100644 index 000000000..57264db28 --- /dev/null +++ b/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json @@ -0,0 +1,9 @@ +{ + "name": "network-benchmark", + "version": "1.0.0", + "description": "Network benchmark function", + "author": "", + "license": "", + "dependencies": { + } +} diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json b/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json index c3c2c73b1..455933282 100644 --- a/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json +++ b/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json @@ -1,6 +1,6 @@ { "timeout": 30, "memory": 128, - "languages": ["python"], + "languages": ["python", "nodejs"], "modules": [] } diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js 
b/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js new file mode 100644 index 000000000..8adb53f66 --- /dev/null +++ b/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js @@ -0,0 +1,115 @@ +const dgram = require('dgram'); +const fs = require('fs'); +const path = require('path'); +const storage = require('./storage'); + +const storage_handler = new storage.storage(); + +exports.handler = async function(event) { + const requestId = event['request-id']; + const address = event['server-address']; + const port = event['server-port']; + const repetitions = event['repetitions']; + const outputBucket = event.bucket.bucket; + const outputPrefix = event.bucket.output; + + const times = []; + console.log(`Starting communication with ${address}:${port}`); + + let i = 0; + const client = dgram.createSocket('udp4'); + client.bind(); + + let message = Buffer.from(String(requestId)); + let consecutiveFailures = 0; + let measurementsNotSmaller = 0; + let curMin = 0; + let key = null; + + while (i < 1000) { + try { + const sendBegin = Date.now() / 1000; + + const recvEnd = await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + reject(new Error('Socket timeout')); + }, 4000); + + client.send(message, port, address, (err) => { + if (err) { + clearTimeout(timeout); + reject(err); + } + }); + + client.once('message', (msg, rinfo) => { + clearTimeout(timeout); + const recvEnd = Date.now() / 1000; + resolve(recvEnd); + }); + }); + + if (i > 0) { + times.push([i, sendBegin, recvEnd]); + } + + const curTime = recvEnd - sendBegin; + console.log(`Time ${curTime} Min Time ${curMin} NotSmaller ${measurementsNotSmaller}`); + + if (curTime > curMin && curMin > 0) { + measurementsNotSmaller++; + if (measurementsNotSmaller === repetitions) { + message = Buffer.from('stop'); + client.send(message, port, address); + break; + } + } else { + curMin = curTime; + measurementsNotSmaller = 0; + } + + i++; + consecutiveFailures = 0; 
+ } catch (err) { + i++; + consecutiveFailures++; + if (consecutiveFailures === 7) { + console.log("Can't setup the connection"); + break; + } + continue; + } + } + + client.close(); + + if (consecutiveFailures !== 5) { + // Write CSV file using stream + const csvPath = '/tmp/data.csv'; + let csvContent = 'id,client_send,client_rcv\n'; + times.forEach(row => { + csvContent += row.join(',') + '\n'; + }); + + // Use createWriteStream and wait for it to finish + await new Promise((resolve, reject) => { + const writeStream = fs.createWriteStream(csvPath); + writeStream.write(csvContent); + writeStream.end(); + writeStream.on('finish', resolve); + writeStream.on('error', reject); + }); + + const filename = `results-${requestId}.csv`; + let uploadPromise; + [key, uploadPromise] = storage_handler.upload(outputBucket, path.join(outputPrefix, filename), csvPath); + await uploadPromise; + } + + return { + result: { + 'bucket-key': key, + 'timestamp': event['income-timestamp'] + } + }; +}; diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json b/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json new file mode 100644 index 000000000..20dbe9c5f --- /dev/null +++ b/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json @@ -0,0 +1,9 @@ +{ + "name": "clock-synchronization", + "version": "1.0.0", + "description": "Clock synchronization benchmark", + "author": "", + "license": "", + "dependencies": { + } +} diff --git a/benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js b/benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js new file mode 100644 index 000000000..45a0ea8f8 --- /dev/null +++ b/benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js @@ -0,0 +1,31 @@ +const net = require('net'); + +exports.handler = async function(event) { + const address = event['ip-address']; + const port = event['port']; + + return new Promise((resolve, reject) => { + const 
client = new net.Socket(); + + client.setTimeout(20000); + + client.connect(port, address, () => { + console.log('Connected to server'); + }); + + client.on('data', (data) => { + const msg = data.toString(); + client.destroy(); + resolve({ result: msg }); + }); + + client.on('timeout', () => { + client.destroy(); + reject(new Error('Connection timeout')); + }); + + client.on('error', (err) => { + reject(err); + }); + }); +}; diff --git a/benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json b/benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json new file mode 100644 index 000000000..ad419b23f --- /dev/null +++ b/benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json @@ -0,0 +1,9 @@ +{ + "name": "server-reply", + "version": "1.0.0", + "description": "Server reply benchmark", + "author": "", + "license": "", + "dependencies": { + } +} diff --git a/benchmarks/100.webapps/130.crud-api/nodejs/function.js b/benchmarks/100.webapps/130.crud-api/nodejs/function.js new file mode 100644 index 000000000..807b8c5f9 --- /dev/null +++ b/benchmarks/100.webapps/130.crud-api/nodejs/function.js @@ -0,0 +1,78 @@ +const nosql = require('./nosql'); + +const nosqlClient = nosql.nosql.get_instance(); +const nosqlTableName = "shopping_cart"; + +function addProduct(cartId, productId, productName, price, quantity) { + nosqlClient.insert( + nosqlTableName, + ["cart_id", cartId], + ["product_id", productId], + { price: price, quantity: quantity, name: productName } + ); +} + +function getProducts(cartId, productId) { + return nosqlClient.get( + nosqlTableName, + ["cart_id", cartId], + ["product_id", productId] + ); +} + +function queryProducts(cartId) { + const res = nosqlClient.query( + nosqlTableName, + ["cart_id", cartId], + "product_id" + ); + + const products = []; + let priceSum = 0; + let quantitySum = 0; + + for (const product of res) { + products.push(product.name); + priceSum += product.price; + quantitySum += product.quantity; + } + + const 
avgPrice = quantitySum > 0 ? priceSum / quantitySum : 0.0; + + return { + products: products, + total_cost: priceSum, + avg_price: avgPrice + }; +} + +exports.handler = async function(event) { + const results = []; + + for (const request of event.requests) { + const route = request.route; + const body = request.body; + let res; + + if (route === "PUT /cart") { + addProduct( + body.cart, + body.product_id, + body.name, + body.price, + body.quantity + ); + res = {}; + } else if (route === "GET /cart/{id}") { + res = getProducts(body.cart, request.path.id); + } else if (route === "GET /cart") { + res = queryProducts(body.cart); + } else { + throw new Error(`Unknown request route: ${route}`); + } + + results.push(res); + } + + return { result: results }; +}; diff --git a/benchmarks/100.webapps/130.crud-api/nodejs/package.json b/benchmarks/100.webapps/130.crud-api/nodejs/package.json new file mode 100644 index 000000000..e00c83ddf --- /dev/null +++ b/benchmarks/100.webapps/130.crud-api/nodejs/package.json @@ -0,0 +1,9 @@ +{ + "name": "crud-api", + "version": "1.0.0", + "description": "CRUD API benchmark", + "author": "", + "license": "", + "dependencies": { + } +} diff --git a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js index 6e06493f1..747cc15a9 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js +++ b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js @@ -1,6 +1,25 @@ import * as nodeFs from 'node:fs'; import { Writable } from 'node:stream'; +// Global cache for files written/read during this request +// This allows sync operations to work within a single request context +if (!globalThis.FS_CACHE) { + globalThis.FS_CACHE = new Map(); +} + +/** + * Normalize a file path for R2 + */ +function normalizePath(path) { + let normalizedPath = path.replace(/^\.?\//, '').replace(/^tmp\//, ''); + + if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + 
normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + } + + return normalizedPath; +} + /** * Read a file from R2 bucket * @param {string} path - File path in R2 bucket (e.g., 'templates/index.html') @@ -27,13 +46,7 @@ export async function readFile(path, encoding, callback) { throw new Error('R2 bucket not available. Ensure R2 binding is configured in wrangler.toml'); } - // Normalize path: remove leading './' or '/' - let normalizedPath = path.replace(/^\.?\//, ''); - - // Prepend benchmark name if available - if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { - normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; - } + const normalizedPath = normalizePath(path); // Get object from R2 const object = await globalThis.R2_BUCKET.get(normalizedPath); @@ -47,11 +60,17 @@ export async function readFile(path, encoding, callback) { if (actualEncoding === 'utf8' || actualEncoding === 'utf-8') { content = await object.text(); } else if (actualEncoding === 'buffer' || actualEncoding === null) { - content = await object.arrayBuffer(); + const arrayBuffer = await object.arrayBuffer(); + content = Buffer.from(arrayBuffer); } else { // For other encodings, get text and let caller handle conversion content = await object.text(); } + + // Store in cache for potential synchronous access + if (globalThis.FS_CACHE) { + globalThis.FS_CACHE.set(normalizedPath, content); + } if (actualCallback) { actualCallback(null, content); @@ -67,16 +86,26 @@ export async function readFile(path, encoding, callback) { } /** - * Synchronous version of readFile (not truly sync in Workers, but returns a Promise) - * Note: This is a compatibility shim - it still returns a Promise + * Synchronous version of readFile + * Reads from cache if available, otherwise throws error */ export function readFileSync(path, encoding) { - return new Promise((resolve, reject) => { - readFile(path, encoding || 'utf8', (err, data) => { - if (err) 
reject(err); - else resolve(data); - }); - }); + const normalizedPath = normalizePath(path); + + // Check cache first + if (globalThis.FS_CACHE && globalThis.FS_CACHE.has(normalizedPath)) { + const data = globalThis.FS_CACHE.get(normalizedPath); + + if (encoding === 'utf8' || encoding === 'utf-8') { + return typeof data === 'string' ? data : Buffer.from(data).toString('utf8'); + } else if (encoding === null || encoding === 'buffer') { + return Buffer.isBuffer(data) ? data : Buffer.from(data); + } + return data; + } + + // File not in cache - in Workers we can't do sync I/O + throw new Error(`ENOENT: no such file or directory, open '${path}'. File not in cache. In Cloudflare Workers, files must be written in the same request before being read synchronously.`); } /** @@ -106,6 +135,26 @@ export async function exists(path, callback) { } } +/** + * Synchronous version of exists + * Checks cache for file existence + */ +export function existsSync(path) { + if (!globalThis.R2_BUCKET) { + return false; + } + + const normalizedPath = normalizePath(path); + + // Check if file is in cache + if (globalThis.FS_CACHE && globalThis.FS_CACHE.has(normalizedPath)) { + return true; + } + + // File not in cache + return false; +} + /** * Get file stats from R2 */ @@ -161,13 +210,15 @@ export function createWriteStream(path, options) { throw new Error('R2 bucket not available'); } - let normalizedPath = path.replace(/^\.?\//, ''); + const normalizedPath = normalizePath(path); - if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { - normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + const buffer = Buffer.concat(chunks); + + // Store in cache for synchronous access + if (globalThis.FS_CACHE) { + globalThis.FS_CACHE.set(normalizedPath, buffer); } - const buffer = Buffer.concat(chunks); await globalThis.R2_BUCKET.put(normalizedPath, buffer); callback(); } catch (err) { @@ -230,11 +281,9 @@ export default { readFile, 
readFileSync, exists, + existsSync, stat, createWriteStream, writeFile, writeFileSync, }; - -// Also re-export all named exports from node:fs -export * from 'node:fs'; \ No newline at end of file diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 03038672d..0b4a827c1 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -1,4 +1,3 @@ - // Simple CommonJS polyfill for Cloudflare Workers // This allows us to load CommonJS modules that use require() and module.exports const moduleCache = {}; @@ -33,7 +32,6 @@ if (typeof globalThis.require === 'undefined') { } - export default { async fetch(request, env) { try { @@ -57,6 +55,7 @@ export default { const begin = Date.now() / 1000; const start = performance.now(); + // Parse JSON body first (similar to Azure handler which uses req.body) const req_text = await request.text(); let event = {}; if (req_text && req_text.length > 0) { @@ -68,7 +67,8 @@ export default { } } - // Parse query string into event (simple parsing, mirrors Python logic) + // Parse query string into event (URL parameters override/merge with body) + // This makes it compatible with both input formats const urlParts = request.url.split('?'); if (urlParts.length > 1) { const query = urlParts[1]; @@ -148,6 +148,9 @@ export default { environ_container_id: 'no_id', request_id: '0', error: String(err && err.message ? err.message : err), + stack: err && err.stack ? 
err.stack : undefined, + event: event, + env: env, }); return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); } diff --git a/benchmarks/wrappers/cloudflare/nodejs/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/nosql.js new file mode 100644 index 000000000..2841d3942 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/nosql.js @@ -0,0 +1,114 @@ +// NoSQL wrapper for Cloudflare Workers +// Supports Cloudflare KV or Durable Objects when available + +class nosql { + constructor() { + this.handle = null; // KV or Durable Object binding + this._tables = {}; + } + + static init_instance(entry) { + nosql.instance = new nosql(); + if (entry && entry.env) { + nosql.instance.env = entry.env; + } + } + + _get_table(tableName) { + if (!(tableName in this._tables)) { + const envName = `NOSQL_STORAGE_TABLE_${tableName}`; + + if (this.env && this.env[envName]) { + this._tables[tableName] = this.env[envName]; + } else if (this.env && this.env[tableName]) { + // Try direct table name + this._tables[tableName] = this.env[tableName]; + } else { + throw new Error( + `Couldn't find an environment variable ${envName} for table ${tableName}` + ); + } + } + + return this._tables[tableName]; + } + + async insert(tableName, primaryKey, secondaryKey, data) { + const keyData = { ...data }; + keyData[primaryKey[0]] = primaryKey[1]; + keyData[secondaryKey[0]] = secondaryKey[1]; + + const table = this._get_table(tableName); + const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; + + // For KV binding + if (table && typeof table.put === 'function') { + await table.put(compositeKey, JSON.stringify(keyData)); + } else { + throw new Error('NoSQL table binding not properly configured'); + } + } + + async get(tableName, primaryKey, secondaryKey) { + const table = this._get_table(tableName); + const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; + + if (table && typeof table.get === 'function') { + const result = await 
table.get(compositeKey); + if (result) { + return JSON.parse(result); + } + return null; + } + + throw new Error('NoSQL table binding not properly configured'); + } + + async update(tableName, primaryKey, secondaryKey, updates) { + // For simple KV, update is same as put with merged data + const existing = await this.get(tableName, primaryKey, secondaryKey) || {}; + const merged = { ...existing, ...updates }; + await this.insert(tableName, primaryKey, secondaryKey, merged); + } + + async query(tableName, primaryKey, secondaryKeyName) { + const table = this._get_table(tableName); + const prefix = `${primaryKey[1]}#`; + + if (table && typeof table.list === 'function') { + const list = await table.list({ prefix }); + const results = []; + + for (const key of list.keys) { + const value = await table.get(key.name); + if (value) { + results.push(JSON.parse(value)); + } + } + + return results; + } + + throw new Error('NoSQL table binding not properly configured'); + } + + async delete(tableName, primaryKey, secondaryKey) { + const table = this._get_table(tableName); + const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; + + if (table && typeof table.delete === 'function') { + await table.delete(compositeKey); + } else { + throw new Error('NoSQL table binding not properly configured'); + } + } + + static get_instance() { + if (!nosql.instance) { + nosql.instance = new nosql(); + } + return nosql.instance; + } +} + +module.exports.nosql = nosql; diff --git a/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js new file mode 100644 index 000000000..5b86d18c4 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js @@ -0,0 +1,96 @@ +/** + * Minimal request polyfill for Cloudflare Workers + * Provides a subset of the 'request' npm package API using fetch() + */ + +class RequestStream { + constructor(url, options = {}) { + this.url = url; + this.options = options; + this.pipeDestination 
= null; + } + + pipe(destination) { + this.pipeDestination = destination; + + // Start the fetch and pipe to destination + (async () => { + try { + const response = await fetch(this.url, this.options); + + if (!response.ok) { + this.pipeDestination.emit('error', new Error(`HTTP ${response.status}: ${response.statusText}`)); + return; + } + + // Read the response body and write to destination + const reader = response.body.getReader(); + + while (true) { + const { done, value } = await reader.read(); + + if (done) { + this.pipeDestination.end(); + break; + } + + // Write chunk to destination stream + if (!this.pipeDestination.write(value)) { + // Backpressure - wait for drain + await new Promise(resolve => { + this.pipeDestination.once('drain', resolve); + }); + } + } + } catch (error) { + if (this.pipeDestination) { + this.pipeDestination.emit('error', error); + } + } + })(); + + return this.pipeDestination; + } +} + +/** + * Main request function - creates a request stream + * @param {string|object} urlOrOptions - URL string or options object + * @param {object} options - Additional options if first param is URL + * @returns {RequestStream} + */ +function request(urlOrOptions, options) { + let url, opts; + + if (typeof urlOrOptions === 'string') { + url = urlOrOptions; + opts = options || {}; + } else { + url = urlOrOptions.url || urlOrOptions.uri; + opts = urlOrOptions; + } + + // Add default headers to avoid 403 errors from some servers + const defaultHeaders = { + 'User-Agent': 'Mozilla/5.0 (compatible; Cloudflare-Workers/1.0)', + 'Accept': '*/*', + }; + + const headers = { ...defaultHeaders, ...(opts.headers || {}) }; + + return new RequestStream(url, { + method: opts.method || 'GET', + headers: headers, + body: opts.body, + }); +} + +// Add common HTTP method shortcuts +request.get = (url, options) => request(url, { ...options, method: 'GET' }); +request.post = (url, options) => request(url, { ...options, method: 'POST' }); +request.put = (url, options) => 
request(url, { ...options, method: 'PUT' }); +request.delete = (url, options) => request(url, { ...options, method: 'DELETE' }); + +// Export as CommonJS module +module.exports = request; +module.exports.default = request; diff --git a/benchmarks/wrappers/cloudflare/nodejs/storage.js b/benchmarks/wrappers/cloudflare/nodejs/storage.js index 01fca6803..72e71e288 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/storage.js @@ -30,17 +30,49 @@ class storage { // Upload a file given a local filepath. In Workers env this is not available // so callers should use upload_stream or pass raw data. For Node.js we read // the file from disk and put it into R2 if available, otherwise throw. - async upload(__bucket, key, filepath) { + upload(__bucket, key, filepath) { // If file was previously written during this invocation, use /tmp absolute let realPath = filepath; if (this.written_files.has(filepath)) { realPath = path.join('/tmp', path.resolve(filepath)); } - // Read file content + // In Workers environment with R2, check if file exists in R2 + // (it may have been written by fs-polyfill's createWriteStream) + if (this.handle) { + // Normalize the path to match what fs-polyfill would use + let normalizedPath = realPath.replace(/^\.?\//, '').replace(/^tmp\//, ''); + + // Add benchmark name prefix if available (matching fs-polyfill behavior) + if (typeof globalThis !== 'undefined' && globalThis.BENCHMARK_NAME && + !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { + normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; + } + + const unique_key = storage.unique_name(key); + + // Read from R2 and re-upload with unique key + const uploadPromise = this.handle.get(normalizedPath).then(async (obj) => { + if (obj) { + const data = await obj.arrayBuffer(); + return this.handle.put(unique_key, data); + } else { + throw new Error(`File not found in R2: ${normalizedPath} (original path: ${filepath})`); + } + 
}); + + return [unique_key, uploadPromise]; + } + + // Fallback: Read file content from local filesystem (Node.js environment) if (fs && fs.existsSync(realPath)) { const data = fs.readFileSync(realPath); - return await this.upload_stream(__bucket, key, data); + const unique_key = storage.unique_name(key); + + // Return [uniqueName, promise] to match Azure storage API + const uploadPromise = Promise.resolve(); + + return [unique_key, uploadPromise]; } // If running in Workers (no fs) and caller provided Buffer/Stream, they @@ -133,6 +165,84 @@ class storage { throw new Error('download_stream(): object not found'); } + // Additional stream methods for compatibility with Azure storage API + // These provide a stream-based interface similar to Azure's uploadStream/downloadStream + uploadStream(__bucket, key) { + const unique_key = storage.unique_name(key); + + if (this.handle) { + // For R2, we create a PassThrough stream that collects data + // then uploads when ended + const stream = require('stream'); + const passThrough = new stream.PassThrough(); + const chunks = []; + + passThrough.on('data', (chunk) => chunks.push(chunk)); + + const upload = new Promise((resolve, reject) => { + passThrough.on('end', async () => { + try { + const buffer = Buffer.concat(chunks); + await this.handle.put(unique_key, buffer); + resolve(); + } catch (err) { + reject(err); + } + }); + passThrough.on('error', reject); + }); + + return [passThrough, upload, unique_key]; + } + + // Fallback to filesystem + if (fs) { + const stream = require('stream'); + const outPath = path.join('/tmp', unique_key); + fs.mkdirSync(path.dirname(outPath), { recursive: true }); + const writeStream = fs.createWriteStream(outPath); + const upload = new Promise((resolve, reject) => { + writeStream.on('finish', resolve); + writeStream.on('error', reject); + }); + return [writeStream, upload, unique_key]; + } + + throw new Error('uploadStream(): no storage backend available'); + } + + async 
downloadStream(__bucket, key) { + if (this.handle) { + const obj = await this.handle.get(key); + if (!obj) return null; + + // R2 object has a body ReadableStream + if (obj.body) { + return obj.body; + } + + // Fallback: convert to buffer then to stream + if (typeof obj.arrayBuffer === 'function') { + const stream = require('stream'); + const ab = await obj.arrayBuffer(); + const buffer = Buffer.from(ab); + const readable = new stream.PassThrough(); + readable.end(buffer); + return readable; + } + + return null; + } + + // Fallback to local filesystem + const localPath = path.join('/tmp', key); + if (fs && fs.existsSync(localPath)) { + return fs.createReadStream(localPath); + } + + throw new Error('downloadStream(): object not found'); + } + static get_instance() { if (!storage.instance) { throw new Error('must init storage singleton first'); diff --git a/configs/systems.json b/configs/systems.json index ca75db5c0..52eabb497 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -461,7 +461,9 @@ "files": [ "handler.js", "storage.js", - "fs-polyfill.js" + "nosql.js", + "fs-polyfill.js", + "request-polyfill.js" ], "packages": { "uuid": "3.4.0" diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 297349b2e..d299be793 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -220,6 +220,7 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: compatibility_flags = ["nodejs_compat"] [alias] "fs" = "./fs-polyfill" +"request" = "./request-polyfill" """ elif language == "python": toml_content += """# Enable Python Workers runtime diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py index 6e0b05c2a..03d36c179 100644 --- a/sebs/cloudflare/durable_objects.py +++ b/sebs/cloudflare/durable_objects.py @@ -1,5 +1,6 @@ import json import requests +from collections import defaultdict from typing import Dict, Optional, Tuple from sebs.cloudflare.config import 
CloudflareCredentials @@ -35,7 +36,7 @@ def __init__( ): super().__init__(region, cache_client, resources) self._credentials = credentials - self._tables: Dict[str, str] = {} + self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) def _get_auth_headers(self) -> dict[str, str]: """Get authentication headers for Cloudflare API requests.""" @@ -60,9 +61,7 @@ def get_tables(self, benchmark: str) -> Dict[str, str]: :param benchmark: benchmark name :return: dictionary mapping table names to their IDs """ - # For Durable Objects, we don't have traditional tables - # Return cached tables if any - return self._tables.copy() + return self._tables[benchmark] def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: """ @@ -72,8 +71,13 @@ def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: :param table: table name :return: full table name or None if not found """ - key = f"{benchmark}:{table}" - return self._tables.get(key) + if benchmark not in self._tables: + return None + + if table not in self._tables[benchmark]: + return None + + return self._tables[benchmark][table] def retrieve_cache(self, benchmark: str) -> bool: """ @@ -82,11 +86,12 @@ def retrieve_cache(self, benchmark: str) -> bool: :param benchmark: benchmark name :return: True if cache was found and loaded """ - cache_key = f"cloudflare.durable_objects.{benchmark}" - cached = self.cache_client.get(cache_key) - - if cached: - self._tables.update(cached) + if benchmark in self._tables: + return True + + cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) + if cached_storage is not None: + self._tables[benchmark] = cached_storage["tables"] self.logging.info(f"Retrieved cached Durable Objects tables for {benchmark}") return True @@ -98,14 +103,13 @@ def update_cache(self, benchmark: str): :param benchmark: benchmark name """ - cache_key = f"cloudflare.durable_objects.{benchmark}" - - # Filter tables for this benchmark - benchmark_tables = { - 
k: v for k, v in self._tables.items() if k.startswith(f"{benchmark}:") - } - - self.cache_client.update(cache_key, benchmark_tables) + self._cache_client.update_nosql( + self.deployment_name(), + benchmark, + { + "tables": self._tables[benchmark], + }, + ) self.logging.info(f"Updated cache for Durable Objects tables for {benchmark}") def create_table( @@ -124,11 +128,10 @@ def create_table( :param secondary_key: optional secondary key field name :return: table name """ - resource_id = self._cloud_resources.get_resource_id() + resource_id = self._cloud_resources.resources_id table_name = f"sebs-benchmarks-{resource_id}-{benchmark}-{name}" - key = f"{benchmark}:{name}" - self._tables[key] = table_name + self._tables[benchmark][name] = table_name self.logging.info( f"Registered Durable Objects table {table_name} for benchmark {benchmark}" @@ -183,9 +186,14 @@ def remove_table(self, name: str) -> str: :return: table name """ # Remove from internal tracking - keys_to_remove = [k for k, v in self._tables.items() if v == name] - for key in keys_to_remove: - del self._tables[key] + for benchmark, tables in self._tables.items(): + if name in tables.values(): + # Find the table key + for table_key, table_name in tables.items(): + if table_name == name: + del self._tables[benchmark][table_key] + break + break self.logging.info(f"Removed Durable Objects table {name} from tracking") return name From 272a372376b8102aeb8dd94ffcd5ffe1905be579 Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Thu, 13 Nov 2025 22:59:33 +0100 Subject: [PATCH 014/230] current situation where asyncio cannot run the async function --- .../wrappers/cloudflare/python/handler.py | 31 +++++++---- .../wrappers/cloudflare/python/storage.py | 54 ++++++++++++------- 2 files changed, 56 insertions(+), 29 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 8e37efee4..9d220a043 100644 --- 
a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -1,5 +1,6 @@ import datetime, io, json, os, uuid, sys +import traceback from workers import WorkerEntrypoint, Response ## sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) @@ -18,13 +19,21 @@ class Default(WorkerEntrypoint): async def fetch(self, request, env): + try: + return await self.fetch2(request, env) + except Exception as e: + t = traceback.format_exc() + print(t) + return Response(t) + + async def fetch2(self, request, env): if "favicon" in request.url: return Response("None") - + req_text = await request.text() - + event = json.loads(req_text) if len(req_text) > 0 else {} - print(event) - + ## print(event) + # dirty url parameters parsing, for testing tmp = request.url.split("?") if len(tmp) > 1: @@ -37,11 +46,11 @@ async def fetch(self, request, env): event[param[0]] = param[1] except IndexError: event[param[0]] = None - - - - - + + + + + ## we might need more data in self.env to know this ID req_id = 0 @@ -52,13 +61,13 @@ async def fetch(self, request, env): event['income-timestamp'] = income_timestamp from function import storage - + storage.storage.init_instance(self) print("event:", event) from function import function - ret = function.handler(event) + ret = await function.handler(event) log_data = { 'output': ret['result'] diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index de2a4642e..47fd0395e 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -1,6 +1,9 @@ import io import os import uuid +import asyncio +from pyodide.ffi import to_js, jsnull +from pyodide.webloop import WebLoop from workers import WorkerEntrypoint @@ -18,7 +21,6 @@ """ class storage: instance = None - handle = None @staticmethod def unique_name(name): @@ -28,51 +30,67 @@ def unique_name(name): 
extension=extension, random=str(uuid.uuid4()).split('-')[0] ) + def get_bucket(self, bucket): + return getattr(self.entry_env, bucket) @staticmethod def init_instance(entry: WorkerEntrypoint): storage.instance = storage() - storage.instance.handle = entry.env.R2 + storage.instance.entry_env = entry.env storage.instance.written_files = set() + ## should think of a way to del the runner at program end + storage.instance.runner = asyncio.Runner(loop_factory=None) - def upload(self, __bucket, key, filepath): + def upload(self, bucket, key, filepath): if filepath in self.written_files: filepath = "/tmp" + os.path.abspath(filepath) with open(filepath, "rb") as f: - self.upload_stream(__bucket, key, f.read()) - return + unique_key = self.upload_stream(bucket, key, f.read()) + return unique_key - def download(self, __bucket, key, filepath): - data = self.download_stream(__bucket, key) + def download(self, bucket, key, filepath): + data = self.download_stream(bucket, key) # should only allow writes to tmp dir. so do have to edit the filepath here? real_fp = filepath if not filepath.startswith("/tmp"): real_fp = "/tmp" + os.path.abspath(filepath) - + self.written_files.append(filepath) with open(real_fp, "wb") as f: f.write(data) return - def download_directory(self, __bucket, prefix, out_path): - print(self.handle, type(self.handle)) - list_res = self.handle.list(prefix = prefix) ## gives only first 1000? + def download_directory(self, bucket, prefix, out_path): + bobj = self.get_bucket(bucket) + list_res = self.runner,run(bobj.list(prefix = prefix)) ## gives only first 1000? 
for obj in list_res.objects: file_name = obj.key path_to_file = os.path.dirname(file_name) os.makedirs(os.path.join(path, path_to_file), exist_ok=True) - self.download(__bucket, file_name, os.path.join(out_path, file_name)) + self.download(bucket, file_name, os.path.join(out_path, file_name)) return - def upload_stream(self, __bucket, key, data): + def upload_stream(self, bucket, key, data): + return self.runner.run(selfaupload_stream(bucket, key, data)) + + async def aupload_stream(self, bucket, key, data): unique_key = storage.unique_name(key) - put_res = self.handle.put(unique_key, data) + data_js = to_js(data) + bobj = self.get_bucket(bucket) + put_res = await bobj.put(unique_key, data_js) + ##print(put_res) return unique_key - def download_stream(self, __bucket, key): - get_res = self.handle.get(key) - assert get_res is not None - data = get_res.text() + def download_stream(self, bucket, key): + return self.runner.run(self.adownload_stream(bucket, key)) + + async def adownload_stream(self, bucket, key): + bobj = self.get_bucket(bucket) + get_res = bobj.get(key) + if get_res == jsnull: + print("key not stored in bucket") + return b'' + data = await get_res.text() return data def get_instance(): From 556d799179fe63f2d4d1af7cb34db94a39b24381 Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Sun, 16 Nov 2025 16:49:43 +0100 Subject: [PATCH 015/230] dynamically add async to benchmark function *shrug* --- .../wrappers/cloudflare/python/handler.py | 54 +++++++++++++++++-- .../wrappers/cloudflare/python/storage.py | 32 +++++------ 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 9d220a043..7d1111792 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -1,5 +1,5 @@ -import datetime, io, json, os, uuid, sys - +import datetime, io, json, os, uuid, sys, ast +import importlib.util 
import traceback from workers import WorkerEntrypoint, Response @@ -17,6 +17,52 @@ """ + +def import_from_path(module_name, file_path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +working_dir = os.path.dirname(__file__) + +class MakeAsync(ast.NodeTransformer): + def visit_FunctionDef(self, node): + if node.name != "handler": + return node + return ast.AsyncFunctionDef( + name=node.name, + args=node.args, + body=node.body, + decorator_list=node.decorator_list, + returns=node.returns, + type_params=node.type_params) + +class AddAwait(ast.NodeTransformer): + to_find = ["upload_stream", "download_stream", "upload", "download", "download_directory"] + + def visit_Call(self, node): + if isinstance(node.func, ast.Attribute) and node.func.attr in self.to_find: + #print(ast.dump(node.func, indent=2)) + return ast.Await(value=node) + + return node + +def make_benchmark_func(): + with open(working_dir +"/function/function.py") as f: + module = ast.parse(f.read()) + module = ast.fix_missing_locations(MakeAsync().visit(module)) + module = ast.fix_missing_locations(AddAwait().visit(module)) + new_source = ast.unparse(module) + print("new_source:") + print(new_source) + print() + with open("/tmp/function.py", "w") as wf: + wf.write(new_source) + + class Default(WorkerEntrypoint): async def fetch(self, request, env): try: @@ -66,8 +112,10 @@ async def fetch2(self, request, env): print("event:", event) - from function import function + make_benchmark_func() + function = import_from_path("function.function", "/tmp/function.py") ret = await function.handler(event) + log_data = { 'output': ret['result'] diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index 47fd0395e..dccf2e1d7 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ 
b/benchmarks/wrappers/cloudflare/python/storage.py @@ -38,18 +38,16 @@ def init_instance(entry: WorkerEntrypoint): storage.instance = storage() storage.instance.entry_env = entry.env storage.instance.written_files = set() - ## should think of a way to del the runner at program end - storage.instance.runner = asyncio.Runner(loop_factory=None) - def upload(self, bucket, key, filepath): + async def upload(self, bucket, key, filepath): if filepath in self.written_files: filepath = "/tmp" + os.path.abspath(filepath) with open(filepath, "rb") as f: - unique_key = self.upload_stream(bucket, key, f.read()) + unique_key = await self.upload_stream(bucket, key, f.read()) return unique_key - def download(self, bucket, key, filepath): - data = self.download_stream(bucket, key) + async def download(self, bucket, key, filepath): + data = await self.download_stream(bucket, key) # should only allow writes to tmp dir. so do have to edit the filepath here? real_fp = filepath if not filepath.startswith("/tmp"): @@ -60,20 +58,17 @@ def download(self, bucket, key, filepath): f.write(data) return - def download_directory(self, bucket, prefix, out_path): + async def download_directory(self, bucket, prefix, out_path): bobj = self.get_bucket(bucket) - list_res = self.runner,run(bobj.list(prefix = prefix)) ## gives only first 1000? + list_res = await bobj.list(prefix = prefix) ## gives only first 1000? 
for obj in list_res.objects: - file_name = obj.key + file_nameß = obj.key path_to_file = os.path.dirname(file_name) os.makedirs(os.path.join(path, path_to_file), exist_ok=True) - self.download(bucket, file_name, os.path.join(out_path, file_name)) + await self.download(bucket, file_name, os.path.join(out_path, file_name)) return - def upload_stream(self, bucket, key, data): - return self.runner.run(selfaupload_stream(bucket, key, data)) - - async def aupload_stream(self, bucket, key, data): + async def upload_stream(self, bucket, key, data): unique_key = storage.unique_name(key) data_js = to_js(data) bobj = self.get_bucket(bucket) @@ -81,16 +76,13 @@ async def aupload_stream(self, bucket, key, data): ##print(put_res) return unique_key - def download_stream(self, bucket, key): - return self.runner.run(self.adownload_stream(bucket, key)) - - async def adownload_stream(self, bucket, key): + async def download_stream(self, bucket, key): bobj = self.get_bucket(bucket) - get_res = bobj.get(key) + get_res = await bobj.get(key) if get_res == jsnull: print("key not stored in bucket") return b'' - data = await get_res.text() + data = await get_res.bytes() return data def get_instance(): From 93c8a73160ee2ebce0e1ccde8e4e8df2304b18eb Mon Sep 17 00:00:00 2001 From: ldzgch Date: Sun, 16 Nov 2025 21:27:20 +0100 Subject: [PATCH 016/230] nosql updates --- .../wrappers/cloudflare/python/handler.py | 14 ++++- .../wrappers/cloudflare/python/nosql.py | 60 ++++++++++++++----- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 7d1111792..5ead82496 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -56,9 +56,9 @@ def make_benchmark_func(): module = ast.fix_missing_locations(MakeAsync().visit(module)) module = ast.fix_missing_locations(AddAwait().visit(module)) new_source = ast.unparse(module) - 
print("new_source:") - print(new_source) - print() + ##print("new_source:") + ##print(new_source) + ##print() with open("/tmp/function.py", "w") as wf: wf.write(new_source) @@ -106,10 +106,18 @@ async def fetch2(self, request, env): event['request-id'] = req_id event['income-timestamp'] = income_timestamp + + from function import storage storage.storage.init_instance(self) + from function import nosql + + nosql.nosql.init_instance(self) + + + print("event:", event) make_benchmark_func() diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index 75bf0f09d..48b523675 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -1,54 +1,86 @@ from typing import List, Optional, Tuple - +import json +from pyodide.ffi import to_js +from workers import WorkerEntrypoint class nosql: instance: Optional["nosql"] = None @staticmethod - def init_instance(entry: WorkerEntryPoint): + def init_instance(entry: WorkerEntrypoint): nosql.instance = nosql() nosql.instance.env = entry.env - def insert( + def key_maker(self, key1, key2): + return f"({key1[0]},{str(key1[1])})+({key2[0]},{key2[1]})" + + def key_maker_partial(self, key1, key2): + return f"({key1[0]},{str(key1[1])})+({key2[0]}" + + def get_table(self, table_name): + return getattr(self.env, (table_name)) + + async def insert( self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str], data: dict, ): - put_res = await self.env.getattr(table_name).put(primary_key, data) + put_res = await self.get_table(table_name).put( + self.key_maker(primary_key, secondary_key), + json.dumps(data)) return - def update( + async def update( self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str], data: dict, ): - await self.env.getattr(table_name).put(primary_key, data) + put_res = await self.get_table(table_name).put( + self.key_maker(primary_key, secondary_key), + data) return - 
def get( + async def get( self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] ) -> Optional[dict]: - get_res = await self.env.getattr(table_name).get(primary_key) - return get_res.json() + get_res = await self.get_table(table_name).get(self.key_maker(primary_key, secondary_key)) + return get_res """ This query must involve partition key - it does not scan across partitions. """ - def query( + async def query( self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str ) -> List[dict]: - list_res = await self.env.getattr(table_name).list() + _options = {"prefix" : self.key_maker_partial(primary_key, (secondary_key_name,) )} + list_res = await self.get_table(table_name).list(options=_options) - return + keys = [] + for key in list_res.keys: + keys.append(key.name) + print("keys", keys) + assert len(keys) <= 100 + + + # todo: please use bulk sometime (it didn't work when i tried it) + res = [] + for key in keys: + + get_res = await self.get_table(table_name).get(key) + get_res = get_res.replace("\'", "\"") + print("gr", get_res) + + res.append(json.loads(get_res)) + return res - def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): - self.env.getattr(table_name).delete(primary_key) + async def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + self.get_table(table_name).delete(self.key_maker(primary_key, secondary_key)) return From e17982f9a1d77fcef06849f494e3a1b64b8ca908 Mon Sep 17 00:00:00 2001 From: MisterMM23 Date: Mon, 17 Nov 2025 11:54:38 +0100 Subject: [PATCH 017/230] idea for cicrumvention of asyncio When running the handler as a thread and then awaiting said thread (asyncio.to_thread), we can run asyncio.run() in the subsequent call stack. 
--- benchmarks/wrappers/cloudflare/python/handler.py | 7 +++++-- benchmarks/wrappers/cloudflare/python/storage.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 5ead82496..8ffe6f448 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -1,4 +1,5 @@ import datetime, io, json, os, uuid, sys, ast +import asyncio import importlib.util import traceback from workers import WorkerEntrypoint, Response @@ -122,8 +123,10 @@ async def fetch2(self, request, env): make_benchmark_func() function = import_from_path("function.function", "/tmp/function.py") - ret = await function.handler(event) - + async def run_handler(): + return function.handler(event) + ret = await asyncio.to_thread(run_handler()) + log_data = { 'output': ret['result'] diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index dccf2e1d7..75bb197b3 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -46,8 +46,8 @@ async def upload(self, bucket, key, filepath): unique_key = await self.upload_stream(bucket, key, f.read()) return unique_key - async def download(self, bucket, key, filepath): - data = await self.download_stream(bucket, key) + def download(self, bucket, key, filepath): + data = asyncio.run(self.download_stream(bucket, key)) # should only allow writes to tmp dir. so do have to edit the filepath here? 
real_fp = filepath if not filepath.startswith("/tmp"): From 214c947d9814afb121af076ce871a8b7ef62b897 Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Mon, 17 Nov 2025 14:12:12 +0100 Subject: [PATCH 018/230] wrappers - run_sync for storage.py --- .../wrappers/cloudflare/python/handler.py | 11 ++++---- .../wrappers/cloudflare/python/storage.py | 26 ++++++++++++------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 8ffe6f448..60497331c 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -121,12 +121,13 @@ async def fetch2(self, request, env): print("event:", event) - make_benchmark_func() - function = import_from_path("function.function", "/tmp/function.py") - async def run_handler(): - return function.handler(event) - ret = await asyncio.to_thread(run_handler()) +## make_benchmark_func() +## function = import_from_path("function.function", "/tmp/function.py") + + from function import function + + ret = function.handler(event) log_data = { 'output': ret['result'] diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index 75bb197b3..3f6ebc31d 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -2,7 +2,7 @@ import os import uuid import asyncio -from pyodide.ffi import to_js, jsnull +from pyodide.ffi import to_js, jsnull, run_sync from pyodide.webloop import WebLoop from workers import WorkerEntrypoint @@ -38,16 +38,16 @@ def init_instance(entry: WorkerEntrypoint): storage.instance = storage() storage.instance.entry_env = entry.env storage.instance.written_files = set() - - async def upload(self, bucket, key, filepath): + + def upload(self, bucket, key, filepath): if filepath in self.written_files: filepath = "/tmp" + os.path.abspath(filepath) with 
open(filepath, "rb") as f: - unique_key = await self.upload_stream(bucket, key, f.read()) + unique_key = self.upload_stream(bucket, key, f.read()) return unique_key def download(self, bucket, key, filepath): - data = asyncio.run(self.download_stream(bucket, key)) + data = self.download_stream(bucket, key) # should only allow writes to tmp dir. so do have to edit the filepath here? real_fp = filepath if not filepath.startswith("/tmp"): @@ -58,17 +58,20 @@ def download(self, bucket, key, filepath): f.write(data) return - async def download_directory(self, bucket, prefix, out_path): + def download_directory(self, bucket, prefix, out_path): bobj = self.get_bucket(bucket) - list_res = await bobj.list(prefix = prefix) ## gives only first 1000? + list_res = run_sync(bobj.list(prefix = prefix)) ## gives only first 1000? for obj in list_res.objects: file_nameß = obj.key path_to_file = os.path.dirname(file_name) os.makedirs(os.path.join(path, path_to_file), exist_ok=True) - await self.download(bucket, file_name, os.path.join(out_path, file_name)) + self.download(bucket, file_name, os.path.join(out_path, file_name)) return - async def upload_stream(self, bucket, key, data): + def upload_stream(self, bucket, key, data): + return run_sync(self.aupload_stream(bucket, key, data)) + + async def aupload_stream(self, bucket, key, data): unique_key = storage.unique_name(key) data_js = to_js(data) bobj = self.get_bucket(bucket) @@ -76,7 +79,10 @@ async def upload_stream(self, bucket, key, data): ##print(put_res) return unique_key - async def download_stream(self, bucket, key): + def download_stream(self, bucket, key): + return run_sync(self.adownload_stream(bucket, key)) + + async def adownload_stream(self, bucket, key): bobj = self.get_bucket(bucket) get_res = await bobj.get(key) if get_res == jsnull: From b8f7c5cbe04f91211a08dd59172bad99fff6f9ae Mon Sep 17 00:00:00 2001 From: ldzgch Date: Wed, 19 Nov 2025 11:34:42 +0100 Subject: [PATCH 019/230] nosql wrapper uses run_sync --- 
.../wrappers/cloudflare/python/nosql.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index 48b523675..bb1f94633 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -1,6 +1,6 @@ from typing import List, Optional, Tuple import json -from pyodide.ffi import to_js +from pyodide.ffi import to_js, run_sync from workers import WorkerEntrypoint class nosql: @@ -21,50 +21,57 @@ def key_maker_partial(self, key1, key2): def get_table(self, table_name): return getattr(self.env, (table_name)) - async def insert( + def insert( self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str], data: dict, ): - put_res = await self.get_table(table_name).put( - self.key_maker(primary_key, secondary_key), - json.dumps(data)) + put_res = ( + run_sync(self.get_table(table_name).put( + self.key_maker(primary_key, secondary_key), + json.dumps(data)) + )) return - async def update( + def update( self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str], data: dict, ): - put_res = await self.get_table(table_name).put( - self.key_maker(primary_key, secondary_key), - data) + put_res = run_sync( + self.get_table(table_name).put( + self.key_maker(primary_key, secondary_key), + json.dumps(data) + )) return - async def get( + def get( self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] ) -> Optional[dict]: - get_res = await self.get_table(table_name).get(self.key_maker(primary_key, secondary_key)) + get_res = run_sync( + self.get_table(table_name).get( + self.key_maker(primary_key, secondary_key) + )) return get_res """ This query must involve partition key - it does not scan across partitions. 
""" - async def query( + def query( self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str ) -> List[dict]: _options = {"prefix" : self.key_maker_partial(primary_key, (secondary_key_name,) )} - list_res = await self.get_table(table_name).list(options=_options) + list_res = run_sync(self.get_table(table_name).list(options=_options)) keys = [] for key in list_res.keys: keys.append(key.name) - print("keys", keys) + ##print("keys", keys) assert len(keys) <= 100 @@ -72,15 +79,15 @@ async def query( res = [] for key in keys: - get_res = await self.get_table(table_name).get(key) + get_res = run_sync(self.get_table(table_name).get(key)) get_res = get_res.replace("\'", "\"") - print("gr", get_res) + ##print("gr", get_res) res.append(json.loads(get_res)) return res - async def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): - self.get_table(table_name).delete(self.key_maker(primary_key, secondary_key)) + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + run_sync(self.get_table(table_name).delete(self.key_maker(primary_key, secondary_key))) return From dba29926c8c0390d14b613008e3ddb9ce89f6d6a Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 19 Nov 2025 19:42:57 +0100 Subject: [PATCH 020/230] cloudflare nodejs wrapper without r2 as fs polyfill, just node_compat modules --- .../wrappers/cloudflare/nodejs/build.js | 161 ++++++++++ .../wrappers/cloudflare/nodejs/fs-polyfill.js | 289 ------------------ .../wrappers/cloudflare/nodejs/handler.js | 36 --- .../cloudflare/nodejs/request-polyfill.js | 96 ------ configs/systems.json | 3 +- experiments.json | 76 +++++ package-lock.json | 6 + package.json | 1 + sebs/cloudflare/cloudflare.py | 82 ++++- test-fs.js | 7 + 10 files changed, 321 insertions(+), 436 deletions(-) create mode 100644 benchmarks/wrappers/cloudflare/nodejs/build.js delete mode 100644 benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js delete mode 100644 
benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js create mode 100644 experiments.json create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 test-fs.js diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js new file mode 100644 index 000000000..a9d7ebcb0 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -0,0 +1,161 @@ +const { build } = require('esbuild'); +const fs = require('fs'); +const { join, extname, dirname, relative } = require('path'); + +function getAllFiles(dir, fileList = []) { + const files = fs.readdirSync(dir, { withFileTypes: true }); + for (const file of files) { + const filePath = join(dir, file.name); + if (file.isDirectory()) { + if (file.name !== 'node_modules' && + file.name !== 'test' && + file.name !== 'tests' && + file.name !== '__tests__' && + file.name !== 'dist' && + !file.name.startsWith('.')) { + getAllFiles(filePath, fileList); + } + } else { + if (!file.name.includes('.test.') && + !file.name.includes('.spec.') && + file.name !== 'build.js' && + file.name !== 'wrangler.toml') { + fileList.push(filePath); + } + } + } + return fileList; +} + +function copyFile(src, dest) { + const destDir = dirname(dest); + if (!fs.existsSync(destDir)) { + fs.mkdirSync(destDir, { recursive: true }); + } + fs.copyFileSync(src, dest); +} + +const nodeBuiltinsPlugin = { + name: 'node-builtins-external', + setup(build) { + // Keep node: prefixed modules external + build.onResolve({ filter: /^node:/ }, (args) => { + return { path: args.path, external: true }; + }); + + // Map bare node built-in names to node: versions and keep external + build.onResolve({ filter: /^(fs|querystring|path|crypto|stream|buffer|util|events|http|https|net|tls|zlib|os|child_process|tty|assert|url)$/ }, (args) => { + return { path: 'node:' + args.path, external: true }; + }); + } +}; + +async function customBuild() { + const srcDir = './'; + const outDir = './dist'; + 
+ if (fs.existsSync(outDir)) { + fs.rmSync(outDir, { recursive: true }); + } + fs.mkdirSync(outDir, { recursive: true }); + + try { + const files = getAllFiles(srcDir); + + const jsFiles = files.filter(f => + ['.js', '.ts', '.jsx', '.tsx'].includes(extname(f)) + ); + + const otherFiles = files.filter(f => + !['.js', '.ts', '.jsx', '.tsx'].includes(extname(f)) + ); + + console.log('Building JS files:', jsFiles); + + if (jsFiles.length > 0) { + await build({ + entryPoints: jsFiles, + bundle: true, + format: 'esm', + outdir: outDir, + outbase: srcDir, + platform: 'neutral', + target: 'es2020', + sourcemap: true, + allowOverwrite: true, + plugins: [nodeBuiltinsPlugin], + define: { + 'process.env.NODE_ENV': '"production"', + 'global': 'globalThis', + '__dirname': '"/bundle"' + }, + mainFields: ['module', 'main'], + treeShaking: true, + }); + + // POST-PROCESS: Replace dynamic requires with static imports + console.log('Post-processing to fix node: module imports...'); + + for (const jsFile of jsFiles) { + const outPath = join(outDir, relative(srcDir, jsFile)); + + if (fs.existsSync(outPath)) { + let content = fs.readFileSync(outPath, 'utf-8'); + + // Find all node: modules being dynamically required + const nodeModules = new Set(); + const requireRegex = /__require\d*\("(node:[^"]+)"\)/g; + let match; + while ((match = requireRegex.exec(content)) !== null) { + nodeModules.add(match[1]); + } + + if (nodeModules.size > 0) { + // Generate static imports at the top + let imports = ''; + const mapping = {}; + let i = 0; + for (const mod of nodeModules) { + const varName = `__node_${mod.replace('node:', '').replace(/[^a-z0-9]/gi, '_')}_${i++}`; + imports += `import * as ${varName} from '${mod}';\n`; + mapping[mod] = varName; + } + + // Add cache object + imports += '\nconst __node_cache = {\n'; + for (const [mod, varName] of Object.entries(mapping)) { + imports += ` '${mod}': ${varName},\n`; + } + imports += '};\n\n'; + + // Replace all __require calls with cache lookups + 
content = content.replace(/__require(\d*)\("(node:[^"]+)"\)/g, (match, num, mod) => { + return `__node_cache['${mod}']`; + }); + + // Prepend imports to the file + content = imports + content; + + fs.writeFileSync(outPath, content, 'utf-8'); + console.log(`✓ Fixed ${nodeModules.size} node: imports in ${relative(srcDir, jsFile)}`); + } + } + } + } + + // Copy non-JS files (templates, etc.) + for (const file of otherFiles) { + const relativePath = relative(srcDir, file); + const destPath = join(outDir, relativePath); + copyFile(file, destPath); + console.log(`Copied: ${relativePath}`); + } + + console.log('✓ Build completed successfully'); + } catch (error) { + console.error('Build failed:', error); + process.exit(1); + } +} + +customBuild(); \ No newline at end of file diff --git a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js deleted file mode 100644 index 747cc15a9..000000000 --- a/benchmarks/wrappers/cloudflare/nodejs/fs-polyfill.js +++ /dev/null @@ -1,289 +0,0 @@ -import * as nodeFs from 'node:fs'; -import { Writable } from 'node:stream'; - -// Global cache for files written/read during this request -// This allows sync operations to work within a single request context -if (!globalThis.FS_CACHE) { - globalThis.FS_CACHE = new Map(); -} - -/** - * Normalize a file path for R2 - */ -function normalizePath(path) { - let normalizedPath = path.replace(/^\.?\//, '').replace(/^tmp\//, ''); - - if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { - normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; - } - - return normalizedPath; -} - -/** - * Read a file from R2 bucket - * @param {string} path - File path in R2 bucket (e.g., 'templates/index.html') - * @param {string|object|function} encoding - Encoding or options or callback - * @param {function} callback - Callback function (err, data) - */ -export async function readFile(path, encoding, callback) { 
- // Handle overloaded arguments: readFile(path, callback) or readFile(path, encoding, callback) - let actualEncoding = 'utf8'; - let actualCallback = callback; - - if (typeof encoding === 'function') { - actualCallback = encoding; - actualEncoding = 'utf8'; - } else if (typeof encoding === 'string') { - actualEncoding = encoding; - } else if (typeof encoding === 'object' && encoding !== null && encoding.encoding) { - actualEncoding = encoding.encoding; - } - - try { - // Check if R2 bucket is available - if (!globalThis.R2_BUCKET) { - throw new Error('R2 bucket not available. Ensure R2 binding is configured in wrangler.toml'); - } - - const normalizedPath = normalizePath(path); - - // Get object from R2 - const object = await globalThis.R2_BUCKET.get(normalizedPath); - - if (!object) { - throw new Error(`ENOENT: no such file or directory, open '${path}' (R2 key: ${normalizedPath})`); - } - - // Read the content - let content; - if (actualEncoding === 'utf8' || actualEncoding === 'utf-8') { - content = await object.text(); - } else if (actualEncoding === 'buffer' || actualEncoding === null) { - const arrayBuffer = await object.arrayBuffer(); - content = Buffer.from(arrayBuffer); - } else { - // For other encodings, get text and let caller handle conversion - content = await object.text(); - } - - // Store in cache for potential synchronous access - if (globalThis.FS_CACHE) { - globalThis.FS_CACHE.set(normalizedPath, content); - } - - if (actualCallback) { - actualCallback(null, content); - } - return content; - } catch (err) { - if (actualCallback) { - actualCallback(err, null); - } else { - throw err; - } - } -} - -/** - * Synchronous version of readFile - * Reads from cache if available, otherwise throws error - */ -export function readFileSync(path, encoding) { - const normalizedPath = normalizePath(path); - - // Check cache first - if (globalThis.FS_CACHE && globalThis.FS_CACHE.has(normalizedPath)) { - const data = globalThis.FS_CACHE.get(normalizedPath); - - 
if (encoding === 'utf8' || encoding === 'utf-8') { - return typeof data === 'string' ? data : Buffer.from(data).toString('utf8'); - } else if (encoding === null || encoding === 'buffer') { - return Buffer.isBuffer(data) ? data : Buffer.from(data); - } - return data; - } - - // File not in cache - in Workers we can't do sync I/O - throw new Error(`ENOENT: no such file or directory, open '${path}'. File not in cache. In Cloudflare Workers, files must be written in the same request before being read synchronously.`); -} - -/** - * Check if a file exists in R2 - */ -export async function exists(path, callback) { - try { - if (!globalThis.R2_BUCKET) { - if (callback) callback(false); - return false; - } - - let normalizedPath = path.replace(/^\.?\//, ''); - - if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { - normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; - } - - const object = await globalThis.R2_BUCKET.head(normalizedPath); - - const result = object !== null; - if (callback) callback(result); - return result; - } catch (err) { - if (callback) callback(false); - return false; - } -} - -/** - * Synchronous version of exists - * Checks cache for file existence - */ -export function existsSync(path) { - if (!globalThis.R2_BUCKET) { - return false; - } - - const normalizedPath = normalizePath(path); - - // Check if file is in cache - if (globalThis.FS_CACHE && globalThis.FS_CACHE.has(normalizedPath)) { - return true; - } - - // File not in cache - return false; -} - -/** - * Get file stats from R2 - */ -export async function stat(path, callback) { - try { - if (!globalThis.R2_BUCKET) { - throw new Error('R2 bucket not available'); - } - - let normalizedPath = path.replace(/^\.?\//, ''); - - if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { - normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; - } - - const object = await 
globalThis.R2_BUCKET.head(normalizedPath); - - if (!object) { - throw new Error(`ENOENT: no such file or directory, stat '${path}'`); - } - - const stats = { - size: object.size, - isFile: () => true, - isDirectory: () => false, - mtime: object.uploaded, - }; - - if (callback) callback(null, stats); - return stats; - } catch (err) { - if (callback) callback(err, null); - else throw err; - } -} - -/** - * Create a write stream (memory-buffered, writes to R2 on close) - */ -export function createWriteStream(path, options) { - const chunks = []; - - const stream = new Writable({ - write(chunk, encoding, callback) { - chunks.push(chunk); - callback(); - }, - final(callback) { - // Write to R2 when stream is closed - (async () => { - try { - if (!globalThis.R2_BUCKET) { - throw new Error('R2 bucket not available'); - } - - const normalizedPath = normalizePath(path); - - const buffer = Buffer.concat(chunks); - - // Store in cache for synchronous access - if (globalThis.FS_CACHE) { - globalThis.FS_CACHE.set(normalizedPath, buffer); - } - - await globalThis.R2_BUCKET.put(normalizedPath, buffer); - callback(); - } catch (err) { - callback(err); - } - })(); - } - }); - - return stream; -} - -/** - * Write file to R2 - */ -export async function writeFile(path, data, options, callback) { - let actualCallback = callback; - let actualOptions = options; - - if (typeof options === 'function') { - actualCallback = options; - actualOptions = {}; - } - - try { - if (!globalThis.R2_BUCKET) { - throw new Error('R2 bucket not available'); - } - - let normalizedPath = path.replace(/^\.?\//, ''); - - if (globalThis.BENCHMARK_NAME && !normalizedPath.startsWith(globalThis.BENCHMARK_NAME + '/')) { - normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; - } - - await globalThis.R2_BUCKET.put(normalizedPath, data); - - if (actualCallback) actualCallback(null); - } catch (err) { - if (actualCallback) actualCallback(err); - else throw err; - } -} - -/** - * Synchronous write file to 
R2 - */ -export function writeFileSync(path, data, options) { - return new Promise((resolve, reject) => { - writeFile(path, data, options, (err) => { - if (err) reject(err); - else resolve(); - }); - }); -} - -// Export everything from node:fs (what's available), but override specific methods -export default { - ...nodeFs, - readFile, - readFileSync, - exists, - existsSync, - stat, - createWriteStream, - writeFile, - writeFileSync, -}; diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 0b4a827c1..8baa646b3 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -1,35 +1,3 @@ -// Simple CommonJS polyfill for Cloudflare Workers -// This allows us to load CommonJS modules that use require() and module.exports -const moduleCache = {}; - -function createRequire(currentModule) { - return function require(modulePath) { - if (moduleCache[modulePath]) { - return moduleCache[modulePath].exports; - } - - // Create module object - const module = { exports: {} }; - moduleCache[modulePath] = module; - - // This is a placeholder - actual module loading would happen here - // For our use case, we'll manually register modules below - throw new Error(`Module ${modulePath} not found in polyfill cache`); - }; -} - -// Polyfill for __dirname and __filename if not available -if (typeof globalThis.__dirname === 'undefined') { - globalThis.__dirname = '.'; -} - -if (typeof globalThis.__filename === 'undefined') { - globalThis.__filename = './handler.js'; -} - -if (typeof globalThis.require === 'undefined') { - globalThis.require = createRequire(globalThis); -} export default { @@ -43,10 +11,6 @@ export default { globalThis.BENCHMARK_NAME = env.BENCHMARK_NAME; } - // Match behavior of the Python handler: parse body, parse URL params, - // set request-id and income timestamp, call the benchmark function, - // and return a JSON response with the same fields. 
- if (request.url.includes('favicon')) { return new Response('None'); } diff --git a/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js deleted file mode 100644 index 5b86d18c4..000000000 --- a/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Minimal request polyfill for Cloudflare Workers - * Provides a subset of the 'request' npm package API using fetch() - */ - -class RequestStream { - constructor(url, options = {}) { - this.url = url; - this.options = options; - this.pipeDestination = null; - } - - pipe(destination) { - this.pipeDestination = destination; - - // Start the fetch and pipe to destination - (async () => { - try { - const response = await fetch(this.url, this.options); - - if (!response.ok) { - this.pipeDestination.emit('error', new Error(`HTTP ${response.status}: ${response.statusText}`)); - return; - } - - // Read the response body and write to destination - const reader = response.body.getReader(); - - while (true) { - const { done, value } = await reader.read(); - - if (done) { - this.pipeDestination.end(); - break; - } - - // Write chunk to destination stream - if (!this.pipeDestination.write(value)) { - // Backpressure - wait for drain - await new Promise(resolve => { - this.pipeDestination.once('drain', resolve); - }); - } - } - } catch (error) { - if (this.pipeDestination) { - this.pipeDestination.emit('error', error); - } - } - })(); - - return this.pipeDestination; - } -} - -/** - * Main request function - creates a request stream - * @param {string|object} urlOrOptions - URL string or options object - * @param {object} options - Additional options if first param is URL - * @returns {RequestStream} - */ -function request(urlOrOptions, options) { - let url, opts; - - if (typeof urlOrOptions === 'string') { - url = urlOrOptions; - opts = options || {}; - } else { - url = urlOrOptions.url || urlOrOptions.uri; - opts = 
urlOrOptions; - } - - // Add default headers to avoid 403 errors from some servers - const defaultHeaders = { - 'User-Agent': 'Mozilla/5.0 (compatible; Cloudflare-Workers/1.0)', - 'Accept': '*/*', - }; - - const headers = { ...defaultHeaders, ...(opts.headers || {}) }; - - return new RequestStream(url, { - method: opts.method || 'GET', - headers: headers, - body: opts.body, - }); -} - -// Add common HTTP method shortcuts -request.get = (url, options) => request(url, { ...options, method: 'GET' }); -request.post = (url, options) => request(url, { ...options, method: 'POST' }); -request.put = (url, options) => request(url, { ...options, method: 'PUT' }); -request.delete = (url, options) => request(url, { ...options, method: 'DELETE' }); - -// Export as CommonJS module -module.exports = request; -module.exports.default = request; diff --git a/configs/systems.json b/configs/systems.json index 52eabb497..3eb79b648 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -462,8 +462,7 @@ "handler.js", "storage.js", "nosql.js", - "fs-polyfill.js", - "request-polyfill.js" + "build.js" ], "packages": { "uuid": "3.4.0" diff --git a/experiments.json b/experiments.json new file mode 100644 index 000000000..364c2e10e --- /dev/null +++ b/experiments.json @@ -0,0 +1,76 @@ +{ + "_invocations": { + "110-dynamic-html-nodejs-18": { + "0": { + "billing": { + "_billed_time": null, + "_gb_seconds": 0, + "_memory": null + }, + "output": { + "begin": 1763577748.037, + "compute_time": 0, + "container_id": "0", + "end": 1763577748.037, + "environ_container_id": "no_id", + "is_cold": false, + "is_cold_worker": false, + "request_id": "0", + "result": { + "output": "\n\n \n Randomly generated data.\n \n \n \n \n \n
\n

Welcome testname!

\n

Data generated at: 11/19/2025, 6:42:28 PM!

\n

Requested random numbers:

\n
    \n
  • 65
  • \n
  • 64
  • \n
  • 7
  • \n
  • 8
  • \n
  • 11
  • \n
  • 80
  • \n
  • 28
  • \n
  • 82
  • \n
  • 90
  • \n
  • 50
  • \n
\n
\n \n\n" + }, + "results_time": 0 + }, + "provider_times": { + "execution": 0, + "initialization": 0 + }, + "request_id": "0", + "stats": { + "cold_start": false, + "failure": false, + "memory_used": null + }, + "times": { + "benchmark": 0, + "client": 27866, + "client_begin": "2025-11-19 19:42:27.957743", + "client_end": "2025-11-19 19:42:27.985609", + "http_first_byte_return": 0.027801, + "http_startup": 0.015255, + "initialization": 0 + } + } + } + }, + "_metrics": {}, + "begin_time": 1763577747.723884, + "config": { + "deployment": { + "credentials": { + "account_id": "eaf7050d8d599d4ae7d925a6f0fd5ea4" + }, + "name": "cloudflare", + "region": "global", + "resources": { + "benchmarks": "sebs-benchmarks-cb6e76ec", + "resources_id": "cb6e76ec" + } + }, + "experiments": { + "architecture": "x64", + "container_deployment": false, + "download_results": false, + "experiments": {}, + "flags": {}, + "runtime": { + "language": "nodejs", + "version": "18" + }, + "update_code": true, + "update_storage": false + } + }, + "end_time": 1763577747.986338, + "result_bucket": null +} \ No newline at end of file diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 000000000..e1fe15f56 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "serverless-benchmarks-cloudflare", + "lockfileVersion": 3, + "requires": true, + "packages": {} +} diff --git a/package.json b/package.json new file mode 100644 index 000000000..0967ef424 --- /dev/null +++ b/package.json @@ -0,0 +1 @@ +{} diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index d299be793..0632414a2 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -187,6 +187,7 @@ def _ensure_wrangler_installed(self): except subprocess.TimeoutExpired: raise RuntimeError("Wrangler version check timed out") + def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None) -> str: 
""" Generate a wrangler.toml configuration file for the worker. @@ -201,26 +202,36 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: Returns: Path to the generated wrangler.toml file """ - main_file = "handler.js" if language == "nodejs" else "handler.py" - + main_file = "dist/handler.js" if language == "nodejs" else "handler.py" + + + # Build wrangler.toml content toml_content = f"""name = "{worker_name}" main = "{main_file}" -compatibility_date = "2024-11-01" +compatibility_date = "2025-11-18" account_id = "{account_id}" - """ - - - - # Add compatibility flags based on language + if language == "nodejs": - toml_content += """# Custom polyfills for fs and path that read from R2 bucket + toml_content += """# Use nodejs_compat for Node.js built-in support compatibility_flags = ["nodejs_compat"] -[alias] -"fs" = "./fs-polyfill" -"request" = "./request-polyfill" +no_bundle = true + +[build] +command = "node build.js" + +[[rules]] +type = "ESModule" +globs = ["**/*.js"] +fallthrough = true + +[[rules]] +type = "Text" +globs = ["**/*.html"] +fallthrough = true + """ elif language == "python": toml_content += """# Enable Python Workers runtime @@ -443,8 +454,9 @@ def package_code( if os.path.exists(package_file) and not os.path.exists(node_modules): self.logging.info(f"Installing Node.js dependencies in {directory}") try: + # Install production dependencies result = subprocess.run( - ["npm", "install", "--production"], + ["npm", "install"], cwd=directory, capture_output=True, text=True, @@ -454,6 +466,20 @@ def package_code( self.logging.info("npm install completed successfully") if result.stdout: self.logging.debug(f"npm output: {result.stdout}") + + # Install esbuild as a dev dependency (needed by build.js) + self.logging.info("Installing esbuild for custom build script...") + result = subprocess.run( + ["npm", "install", "--save-dev", "esbuild"], + cwd=directory, + capture_output=True, + text=True, + check=True, + timeout=60 + ) + 
self.logging.info("esbuild installed successfully") + + except subprocess.TimeoutExpired: self.logging.error("npm install timed out") raise RuntimeError("Failed to install Node.js dependencies: timeout") @@ -466,6 +492,23 @@ def package_code( ) elif os.path.exists(node_modules): self.logging.info(f"Node.js dependencies already installed in {directory}") + + # Ensure esbuild is available even for cached installations + esbuild_path = os.path.join(node_modules, "esbuild") + if not os.path.exists(esbuild_path): + self.logging.info("Installing esbuild for custom build script...") + try: + subprocess.run( + ["npm", "install", "--save-dev", "esbuild"], + cwd=directory, + capture_output=True, + text=True, + check=True, + timeout=60 + ) + self.logging.info("esbuild installed successfully") + except Exception as e: + self.logging.warning(f"Failed to install esbuild: {e}") elif language_name == "python": requirements_file = os.path.join(directory, "requirements.txt") @@ -649,6 +692,19 @@ def _create_or_update_worker( Returns: Worker deployment result """ + # # Convert CommonJS function.js to ESM if it exists + # if language == "nodejs": + # function_js = os.path.join(package_dir, "function.js") + # if os.path.exists(function_js): + # self.logging.info(f"Converting function.js from CommonJS to ESM...") + # try: + # esm_content = self._convert_commonjs_to_esm(function_js) + # with open(function_js, 'w') as f: + # f.write(esm_content) + # self.logging.info("Successfully converted function.js to ESM") + # except Exception as e: + # self.logging.error(f"Failed to convert function.js to ESM: {e}") + # raise # Generate wrangler.toml for this worker self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name) diff --git a/test-fs.js b/test-fs.js new file mode 100644 index 000000000..1da5f453b --- /dev/null +++ b/test-fs.js @@ -0,0 +1,7 @@ +import { readFile, readFileSync } from 'node:fs'; + +export default { + async fetch(request) { + return new 
Response(readFileSync('./test-fs.js', 'utf-8')); + } +}; \ No newline at end of file From 539002169a9327470f7a9eebb6a4ecbd61b102fe Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 19 Nov 2025 21:35:46 +0100 Subject: [PATCH 021/230] cleanup nodejs deployment cloudflare, no uploading files necessary anymore --- experiments.json | 24 +++---- sebs/cloudflare/cloudflare.py | 116 ---------------------------------- test-fs.js | 7 -- 3 files changed, 13 insertions(+), 134 deletions(-) delete mode 100644 test-fs.js diff --git a/experiments.json b/experiments.json index 364c2e10e..549ed7625 100644 --- a/experiments.json +++ b/experiments.json @@ -1,6 +1,6 @@ { "_invocations": { - "110-dynamic-html-nodejs-18": { + "130-crud-api-nodejs-18": { "0": { "billing": { "_billed_time": null, @@ -8,16 +8,18 @@ "_memory": null }, "output": { - "begin": 1763577748.037, + "begin": 1763584521.868, "compute_time": 0, "container_id": "0", - "end": 1763577748.037, + "end": 1763584521.868, "environ_container_id": "no_id", "is_cold": false, "is_cold_worker": false, "request_id": "0", "result": { - "output": "\n\n \n Randomly generated data.\n \n \n \n \n \n
\n

Welcome testname!

\n

Data generated at: 11/19/2025, 6:42:28 PM!

\n

Requested random numbers:

\n
    \n
  • 65
  • \n
  • 64
  • \n
  • 7
  • \n
  • 8
  • \n
  • 11
  • \n
  • 80
  • \n
  • 28
  • \n
  • 82
  • \n
  • 90
  • \n
  • 50
  • \n
\n
\n \n\n" + "output": [ + {} + ] }, "results_time": 0 }, @@ -33,18 +35,18 @@ }, "times": { "benchmark": 0, - "client": 27866, - "client_begin": "2025-11-19 19:42:27.957743", - "client_end": "2025-11-19 19:42:27.985609", - "http_first_byte_return": 0.027801, - "http_startup": 0.015255, + "client": 29638, + "client_begin": "2025-11-19 21:35:21.807697", + "client_end": "2025-11-19 21:35:21.837335", + "http_first_byte_return": 0.029473, + "http_startup": 0.00964, "initialization": 0 } } } }, "_metrics": {}, - "begin_time": 1763577747.723884, + "begin_time": 1763584521.660329, "config": { "deployment": { "credentials": { @@ -71,6 +73,6 @@ "update_storage": false } }, - "end_time": 1763577747.986338, + "end_time": 1763584521.838648, "result_bucket": null } \ No newline at end of file diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 0632414a2..aae2406a9 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -293,112 +293,6 @@ def _get_auth_headers(self) -> Dict[str, str]: else: raise RuntimeError("Invalid Cloudflare credentials configuration") - def _convert_templates_to_modules(self, directory: str): - """ - Convert template files to JavaScript modules for bundling. - - Searches for template directories and converts HTML/text files - to JavaScript modules that can be imported. 
- - Args: - directory: Package directory to search for templates - """ - templates_dir = os.path.join(directory, "templates") - if not os.path.exists(templates_dir): - return - - self.logging.info(f"Converting template files in {templates_dir} to JavaScript modules") - - for root, dirs, files in os.walk(templates_dir): - for file in files: - if file.endswith(('.html', '.txt', '.xml', '.csv')): - file_path = os.path.join(root, file) - rel_path = os.path.relpath(file_path, directory) - - # Read the template content - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() - - # Escape for JavaScript string - content_escaped = (content - .replace('\\', '\\\\') - .replace('`', '\\`') - .replace('$', '\\$')) - - # Create a .js module file next to the template - module_path = file_path + '.js' - with open(module_path, 'w', encoding='utf-8') as f: - f.write(f'export default `{content_escaped}`;\n') - - self.logging.debug(f"Created template module: {module_path}") - - def _upload_benchmark_files_to_r2(self, directory: str, benchmark_name: str) -> int: - """ - Upload benchmark data files to R2 bucket for fs-polyfill access. - - This allows the fs-polyfill to read files from R2 instead of trying - to bundle them with the worker code. 
- - Args: - directory: Package directory containing files to upload - benchmark_name: Name of the benchmark (used as prefix in R2) - - Returns: - Number of files uploaded - """ - try: - storage = self.system_resources.get_storage() - bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) - - if not bucket_name: - self.logging.warning("R2 bucket not configured, skipping file upload") - return 0 - - uploaded_count = 0 - - # Upload template files - templates_dir = os.path.join(directory, "templates") - if os.path.exists(templates_dir): - for root, dirs, files in os.walk(templates_dir): - for file in files: - # Skip the .js module files we created - if file.endswith('.js'): - continue - - file_path = os.path.join(root, file) - # Create R2 key: benchmark_name/templates/filename - rel_path = os.path.relpath(file_path, directory) - r2_key = f"{benchmark_name}/{rel_path}" - - try: - with open(file_path, 'rb') as f: - file_content = f.read() - - self.logging.info(f"Uploading {rel_path} to R2 as {r2_key}...") - storage.upload_bytes( - bucket_name, - r2_key, - file_content - ) - uploaded_count += 1 - self.logging.info(f"✓ Uploaded {rel_path} ({len(file_content)} bytes)") - except Exception as e: - self.logging.error(f"✗ Failed to upload {rel_path} to R2: {e}") - - if uploaded_count > 0: - self.logging.info( - f"Uploaded {uploaded_count} benchmark files to R2 bucket '{bucket_name}'" - ) - - return uploaded_count - - except Exception as e: - self.logging.warning( - f"Could not upload benchmark files to R2: {e}. " - f"fs-polyfill will not be able to read files from R2." 
- ) - return 0 - def package_code( self, directory: str, @@ -434,16 +328,6 @@ def package_code( # Ensure Wrangler is installed self._ensure_wrangler_installed() - # Upload benchmark files to R2 for fs-polyfill access - if language_name == "nodejs": - uploaded = self._upload_benchmark_files_to_r2(directory, benchmark) - if uploaded > 0: - self.logging.info(f"Successfully uploaded {uploaded} files to R2") - else: - self.logging.warning( - "No files were uploaded to R2. Benchmarks requiring file access may fail. " - "Ensure R2 API credentials are configured." - ) # Install dependencies if language_name == "nodejs": diff --git a/test-fs.js b/test-fs.js deleted file mode 100644 index 1da5f453b..000000000 --- a/test-fs.js +++ /dev/null @@ -1,7 +0,0 @@ -import { readFile, readFileSync } from 'node:fs'; - -export default { - async fetch(request) { - return new Response(readFileSync('./test-fs.js', 'utf-8')); - } -}; \ No newline at end of file From 24497a2dc5a66e5e763cd397f312a08ce9d18941 Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Sun, 23 Nov 2025 22:45:12 +0100 Subject: [PATCH 022/230] add folder structure to python code package --- sebs/cloudflare/cloudflare.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index aae2406a9..cd940386f 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -395,6 +395,14 @@ def package_code( self.logging.warning(f"Failed to install esbuild: {e}") elif language_name == "python": + funcdir = os.path.join(directory, "function") + if not os.path.exists(funcdir): + os.makedirs(funcdir) + + for thing in os.listdir(directory): + if thing.endswith(".py") and not thing.endswith("handler.py"): + shutil.move(os.path.join(directory, thing),os.path.join(directory, "function", thing)) + requirements_file = os.path.join(directory, "requirements.txt") if os.path.exists(requirements_file): self.logging.info(f"Installing Python dependencies in 
{directory}") From 88127085a4a58a96d837163e9e6b2074d1c840e5 Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Fri, 28 Nov 2025 15:34:06 +0100 Subject: [PATCH 023/230] nosql wrapper - duarble object - may work --- .../wrappers/cloudflare/python/handler.py | 101 ++++++++-------- .../wrappers/cloudflare/python/nosql.py | 111 +++++++++++++++++- 2 files changed, 162 insertions(+), 50 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 60497331c..39ceffbdd 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -2,7 +2,7 @@ import asyncio import importlib.util import traceback -from workers import WorkerEntrypoint, Response +from workers import WorkerEntrypoint, Response, DurableObject ## sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) @@ -18,52 +18,10 @@ """ - -def import_from_path(module_name, file_path): - spec = importlib.util.spec_from_file_location(module_name, file_path) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - return module - - -working_dir = os.path.dirname(__file__) - -class MakeAsync(ast.NodeTransformer): - def visit_FunctionDef(self, node): - if node.name != "handler": - return node - return ast.AsyncFunctionDef( - name=node.name, - args=node.args, - body=node.body, - decorator_list=node.decorator_list, - returns=node.returns, - type_params=node.type_params) - -class AddAwait(ast.NodeTransformer): - to_find = ["upload_stream", "download_stream", "upload", "download", "download_directory"] - - def visit_Call(self, node): - if isinstance(node.func, ast.Attribute) and node.func.attr in self.to_find: - #print(ast.dump(node.func, indent=2)) - return ast.Await(value=node) - - return node - -def make_benchmark_func(): - with open(working_dir +"/function/function.py") as f: - module = 
ast.parse(f.read()) - module = ast.fix_missing_locations(MakeAsync().visit(module)) - module = ast.fix_missing_locations(AddAwait().visit(module)) - new_source = ast.unparse(module) - ##print("new_source:") - ##print(new_source) - ##print() - with open("/tmp/function.py", "w") as wf: - wf.write(new_source) +class KVApiObject(DurableObject): + def __getattr__(self, name): + return getattr(self.ctx.storage, name) - class Default(WorkerEntrypoint): async def fetch(self, request, env): try: @@ -97,8 +55,6 @@ async def fetch2(self, request, env): - - ## we might need more data in self.env to know this ID req_id = 0 ## note: time fixed in worker @@ -152,3 +108,52 @@ async def fetch2(self, request, env): 'environ_container_id': "no_id", 'request_id': "0" })) + + +### ---------- old ------- + +def import_from_path(module_name, file_path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +working_dir = os.path.dirname(__file__) + +class MakeAsync(ast.NodeTransformer): + def visit_FunctionDef(self, node): + if node.name != "handler": + return node + return ast.AsyncFunctionDef( + name=node.name, + args=node.args, + body=node.body, + decorator_list=node.decorator_list, + returns=node.returns, + type_params=node.type_params) + +class AddAwait(ast.NodeTransformer): + to_find = ["upload_stream", "download_stream", "upload", "download", "download_directory"] + + def visit_Call(self, node): + if isinstance(node.func, ast.Attribute) and node.func.attr in self.to_find: + #print(ast.dump(node.func, indent=2)) + return ast.Await(value=node) + + return node + +def make_benchmark_func(): + with open(working_dir +"/function/function.py") as f: + module = ast.parse(f.read()) + module = ast.fix_missing_locations(MakeAsync().visit(module)) + module = ast.fix_missing_locations(AddAwait().visit(module)) + new_source = 
ast.unparse(module) + ##print("new_source:") + ##print(new_source) + ##print() + with open("/tmp/function.py", "w") as wf: + wf.write(new_source) + + diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index bb1f94633..9de22fe69 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -1,9 +1,111 @@ from typing import List, Optional, Tuple import json +import pickle from pyodide.ffi import to_js, run_sync -from workers import WorkerEntrypoint +from workers import WorkerEntrypoint, DurableObject -class nosql: + +class nosql_do: + instance: Optional["nosql"] = None + DO_BINDING_NAME = "DURABLE_STORE" + + @staticmethod + def init_instance(entry: WorkerEntrypoint): + nosql.instance = nosql() + nosql.instance.binding = getattr(entry.env, nosql_do.DO_BINDING_NAME) + + + def get_table(self, table_name): + kvapiobj = self.binding.getByName(table_name) + return kvapiobj + + def key_maker(self, key1, key2): + return f"({key1[0]},{str(key1[1])})+({key2[0]},{key2[1]})" + + def key_maker_partial(self, key1, key2): + return f"({key1[0]},{str(key1[1])})+({key2[0]}" + +## these data conversion funcs should not be necessary. i couldn't get pyodide to clone the data otherwise + def data_pre(self, data): + return pickle.dumps(data, 0).decode("ascii") + + def data_post(self, data): + return pickle.loads(bytes(data, "ascii")) + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + send_data = self.data_pre(data) + k=self.key_maker(primary_key, secondary_key) + put_res = run_sync(self.get_table(table_name).put(k, send_data)) + return + + ## does this really need different behaviour from insert? 
+ def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + self.insert(table_name, primary_key, secondary_key, data) + return + + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> Optional[dict]: + k=self.key_maker(primary_key, secondary_key) + get_res = run_sync(self.get_table(table_name).get(k)) + ## print(get_res) + return self.data_post(get_res) + + """ + This query must involve partition key - it does not scan across partitions. + """ + + def query( + self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str + ) -> List[dict]: + + prefix_key = self.key_maker_partial(primary_key, (secondary_key_name,)) + list_res = run_sync(self.get_table(table_name).list()) + + keys = [] + for key in list_res: + if key.startswith(prefix_key): + print(key) + keys.append(key) + ##print("keys", keys) + assert len(keys) <= 100 + + + # todo: please use bulk sometime (it didn't work when i tried it) + res = [] + for key in keys: + + get_res = run_sync(self.get_table(table_name).get(key)) + ## print(get_res) + res.append(self.data_post(get_res)) + return res + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + run_sync(self.get_table(table_name).delete(self.key_maker(primary_key, secondary_key))) + return + + @staticmethod + def get_instance(): + if nosql_do.instance is None: + nosql_do.instance = nosql_do() + return nosql_do.instance + +### ------------------------------ + +class nosql_kv: instance: Optional["nosql"] = None @@ -96,3 +198,8 @@ def get_instance(): if nosql.instance is None: nosql.instance = nosql() return nosql.instance + + + + +nosql = nosql_do From 5b3d78425a5cc8ef25b29284611b2e4a2c974dac Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Fri, 28 Nov 2025 16:10:44 +0100 Subject: [PATCH 024/230] fix python. 110 runs for me. 
--- .../wrappers/cloudflare/python/handler.py | 10 +- sebs/cloudflare/cloudflare.py | 209 +++++++++--------- 2 files changed, 111 insertions(+), 108 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 39ceffbdd..332c2b67b 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -69,12 +69,12 @@ async def fetch2(self, request, env): storage.storage.init_instance(self) - from function import nosql - - nosql.nosql.init_instance(self) - - + if 'NOSQL_STORAGE_DATABASE' in os.environ: + from function import nosql + + nosql.nosql.get_instance(self) + print("event:", event) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index cd940386f..f6c60956b 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -23,11 +23,11 @@ class Cloudflare(System): """ Cloudflare Workers serverless platform implementation. - + Cloudflare Workers run on Cloudflare's edge network, providing low-latency serverless execution globally. """ - + _config: CloudflareConfig @staticmethod @@ -71,7 +71,7 @@ def __init__( def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): """ Initialize the Cloudflare Workers platform. - + Args: config: Additional configuration parameters resource_prefix: Prefix for resource naming @@ -79,14 +79,14 @@ def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] # Verify credentials are valid self._verify_credentials() self.initialize_resources(select_prefix=resource_prefix) - + def initialize_resources(self, select_prefix: Optional[str] = None): """ Initialize Cloudflare resources. - + Overrides the base class method to handle R2 storage gracefully. Cloudflare Workers can operate without R2 storage for many benchmarks. 
- + Args: select_prefix: Optional prefix for resource naming """ @@ -104,10 +104,10 @@ def initialize_resources(self, select_prefix: Optional[str] = None): res_id = f"{select_prefix}-{str(uuid.uuid1())[0:8]}" else: res_id = str(uuid.uuid1())[0:8] - + self.config.resources.resources_id = res_id self.logging.info(f"Generating unique resource name {res_id}") - + # Try to create R2 bucket, but don't fail if R2 is not enabled try: self.system_resources.get_storage().get_bucket(Resources.StorageBucketType.BENCHMARKS) @@ -127,30 +127,30 @@ def _verify_credentials(self): "Cloudflare API credentials are not set. Please set CLOUDFLARE_API_TOKEN " "and CLOUDFLARE_ACCOUNT_ID environment variables." ) - + if not self.config.credentials.account_id: raise RuntimeError( "Cloudflare Account ID is not set. Please set CLOUDFLARE_ACCOUNT_ID " "environment variable." ) - + headers = self._get_auth_headers() - + # Log credential type being used (without exposing the actual token) if self.config.credentials.api_token: token_preview = self.config.credentials.api_token[:8] + "..." if len(self.config.credentials.api_token) > 8 else "***" self.logging.info(f"Using API Token authentication (starts with: {token_preview})") else: self.logging.info(f"Using Email + API Key authentication (email: {self.config.credentials.email})") - + response = requests.get(f"{self._api_base_url}/user/tokens/verify", headers=headers) - + if response.status_code != 200: raise RuntimeError( f"Failed to verify Cloudflare credentials: {response.status_code} - {response.text}\n" f"Please check that your CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID are correct." 
) - + self.logging.info("Cloudflare credentials verified successfully") def _ensure_wrangler_installed(self): @@ -187,18 +187,18 @@ def _ensure_wrangler_installed(self): except subprocess.TimeoutExpired: raise RuntimeError("Wrangler version check timed out") - + def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None) -> str: """ Generate a wrangler.toml configuration file for the worker. - + Args: worker_name: Name of the worker package_dir: Directory containing the worker code language: Programming language (nodejs or python) account_id: Cloudflare account ID benchmark_name: Optional benchmark name for R2 file path prefix - + Returns: Path to the generated wrangler.toml file """ @@ -267,13 +267,13 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: f"R2 bucket binding not configured: {e}. " f"Benchmarks requiring file access will not work properly." ) - - + + # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") with open(toml_path, 'w') as f: f.write(toml_content) - + self.logging.info(f"Generated wrangler.toml at {toml_path}") return toml_path @@ -305,9 +305,9 @@ def package_code( ) -> Tuple[str, int, str]: """ Package code for Cloudflare Workers deployment using Wrangler. - + Uses Wrangler CLI to bundle dependencies and prepare for deployment. 
- + Args: directory: Path to the code directory language_name: Programming language name @@ -316,7 +316,7 @@ def package_code( benchmark: Benchmark name is_cached: Whether the code is cached container_deployment: Whether to deploy as container (not supported) - + Returns: Tuple of (package_path, package_size, container_uri) """ @@ -333,7 +333,7 @@ def package_code( if language_name == "nodejs": package_file = os.path.join(directory, "package.json") node_modules = os.path.join(directory, "node_modules") - + # Only install if package.json exists and node_modules doesn't if os.path.exists(package_file) and not os.path.exists(node_modules): self.logging.info(f"Installing Node.js dependencies in {directory}") @@ -350,7 +350,7 @@ def package_code( self.logging.info("npm install completed successfully") if result.stdout: self.logging.debug(f"npm output: {result.stdout}") - + # Install esbuild as a dev dependency (needed by build.js) self.logging.info("Installing esbuild for custom build script...") result = subprocess.run( @@ -363,7 +363,7 @@ def package_code( ) self.logging.info("esbuild installed successfully") - + except subprocess.TimeoutExpired: self.logging.error("npm install timed out") raise RuntimeError("Failed to install Node.js dependencies: timeout") @@ -376,7 +376,7 @@ def package_code( ) elif os.path.exists(node_modules): self.logging.info(f"Node.js dependencies already installed in {directory}") - + # Ensure esbuild is available even for cached installations esbuild_path = os.path.join(node_modules, "esbuild") if not os.path.exists(esbuild_path): @@ -395,14 +395,6 @@ def package_code( self.logging.warning(f"Failed to install esbuild: {e}") elif language_name == "python": - funcdir = os.path.join(directory, "function") - if not os.path.exists(funcdir): - os.makedirs(funcdir) - - for thing in os.listdir(directory): - if thing.endswith(".py") and not thing.endswith("handler.py"): - shutil.move(os.path.join(directory, thing),os.path.join(directory, "function", 
thing)) - requirements_file = os.path.join(directory, "requirements.txt") if os.path.exists(requirements_file): self.logging.info(f"Installing Python dependencies in {directory}") @@ -426,13 +418,24 @@ def package_code( raise RuntimeError( "pip not found. Please install Python and pip to deploy Python benchmarks." ) - + # move into function dir + funcdir = os.path.join(directory, "function") + if not os.path.exists(funcdir): + os.makedirs(funcdir) + + for thing in os.listdir(directory): + if not (thing.endswith("handler.py") or thing.endswith("function") or thing.endswith("python_modules")): + src = os.path.join(directory, thing) + dest = os.path.join(directory, "function", thing) + shutil.move(src, dest) + self.logging.info(f"move {src} to {dest}") + # Create package structure CONFIG_FILES = { "nodejs": ["handler.js", "package.json", "node_modules"], "python": ["handler.py", "requirements.txt", "python_modules"], } - + if language_name not in CONFIG_FILES: raise NotImplementedError( f"Language {language_name} is not yet supported for Cloudflare Workers" @@ -441,7 +444,7 @@ def package_code( # Verify the handler exists handler_file = "handler.js" if language_name == "nodejs" else "handler.py" package_path = os.path.join(directory, handler_file) - + if not os.path.exists(package_path): if not os.path.exists(directory): raise RuntimeError( @@ -452,14 +455,14 @@ def package_code( f"Handler file {handler_file} not found in {directory}. " f"Available files: {', '.join(os.listdir(directory)) if os.path.exists(directory) else 'none'}" ) - + # Calculate total size of the package directory total_size = 0 for dirpath, dirnames, filenames in os.walk(directory): for filename in filenames: filepath = os.path.join(dirpath, filename) total_size += os.path.getsize(filepath) - + mbytes = total_size / 1024.0 / 1024.0 self.logging.info(f"Worker package size: {mbytes:.2f} MB") @@ -474,15 +477,15 @@ def create_function( ) -> CloudflareWorker: """ Create a new Cloudflare Worker. 
- + If a worker with the same name already exists, it will be updated. - + Args: code_package: Benchmark containing the function code func_name: Name of the worker container_deployment: Whether to deploy as container (not supported) container_uri: URI of container image (not used) - + Returns: CloudflareWorker instance """ @@ -496,16 +499,16 @@ def create_function( language = code_package.language_name language_runtime = code_package.language_version function_cfg = FunctionConfig.from_benchmark(code_package) - + func_name = self.format_function_name(func_name) account_id = self.config.credentials.account_id - + if not account_id: raise RuntimeError("Cloudflare account ID is required to create workers") # Check if worker already exists existing_worker = self._get_worker(func_name, account_id) - + if existing_worker: self.logging.info(f"Worker {func_name} already exists, updating it") worker = CloudflareWorker( @@ -521,10 +524,10 @@ def create_function( worker.updated_code = True else: self.logging.info(f"Creating new worker {func_name}") - + # Create the worker with all package files self._create_or_update_worker(func_name, package, account_id, language, benchmark) - + worker = CloudflareWorker( func_name, code_package.benchmark, @@ -537,7 +540,7 @@ def create_function( # Add LibraryTrigger and HTTPTrigger from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger - + library_trigger = LibraryTrigger(func_name, self) library_trigger.logging_handlers = self.logging_handlers worker.add_trigger(library_trigger) @@ -555,9 +558,9 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: """Get information about an existing worker.""" headers = self._get_auth_headers() url = f"{self._api_base_url}/accounts/{account_id}/workers/scripts/{worker_name}" - + response = requests.get(url, headers=headers) - + if response.status_code == 200: try: return response.json().get("result") @@ -573,14 +576,14 @@ def _create_or_update_worker( self, worker_name: 
str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None ) -> dict: """Create or update a Cloudflare Worker using Wrangler CLI. - + Args: worker_name: Name of the worker package_dir: Directory containing handler and all benchmark files account_id: Cloudflare account ID language: Programming language (nodejs or python) benchmark_name: Optional benchmark name for R2 file path prefix - + Returns: Worker deployment result """ @@ -599,7 +602,7 @@ def _create_or_update_worker( # raise # Generate wrangler.toml for this worker self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name) - + # Set up environment for Wrangler env = os.environ.copy() if self.config.credentials.api_token: @@ -607,12 +610,12 @@ def _create_or_update_worker( elif self.config.credentials.email and self.config.credentials.api_key: env['CLOUDFLARE_EMAIL'] = self.config.credentials.email env['CLOUDFLARE_API_KEY'] = self.config.credentials.api_key - + env['CLOUDFLARE_ACCOUNT_ID'] = account_id - + # Deploy using Wrangler self.logging.info(f"Deploying worker {worker_name} using Wrangler...") - + try: result = subprocess.run( ["wrangler", "deploy"], @@ -623,17 +626,17 @@ def _create_or_update_worker( check=True, timeout=180 # 3 minutes for deployment ) - + self.logging.info(f"Worker {worker_name} deployed successfully") if result.stdout: self.logging.debug(f"Wrangler deploy output: {result.stdout}") - + # Parse the output to get worker URL # Wrangler typically outputs: "Published ()" # and "https://..workers.dev" - + return {"success": True, "output": result.stdout} - + except subprocess.TimeoutExpired: raise RuntimeError(f"Wrangler deployment timed out for worker {worker_name}") except subprocess.CalledProcessError as e: @@ -711,18 +714,18 @@ def _build_workers_dev_url(self, worker_name: str, account_id: Optional[str]) -> def cached_function(self, function: Function): """ Handle a function retrieved from cache. 
- + Refreshes triggers and logging handlers. - + Args: function: The cached function """ from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger - + for trigger in function.triggers(Trigger.TriggerType.LIBRARY): trigger.logging_handlers = self.logging_handlers cast(LibraryTrigger, trigger).deployment_client = self - + for trigger in function.triggers(Trigger.TriggerType.HTTP): trigger.logging_handlers = self.logging_handlers @@ -735,7 +738,7 @@ def update_function( ): """ Update an existing Cloudflare Worker. - + Args: function: Existing function instance to update code_package: New benchmark containing the function code @@ -751,15 +754,15 @@ def update_function( package = code_package.code_location language = code_package.language_name benchmark = code_package.benchmark - + # Update the worker with all package files account_id = worker.account_id or self.config.credentials.account_id if not account_id: raise RuntimeError("Account ID is required to update worker") - + self._create_or_update_worker(worker.name, package, account_id, language, benchmark) self.logging.info(f"Updated worker {worker.name}") - + # Update configuration if needed self.update_function_configuration(worker, code_package) @@ -768,10 +771,10 @@ def update_function_configuration( ): """ Update the configuration of a Cloudflare Worker. - + Note: Cloudflare Workers have limited configuration options compared to traditional FaaS platforms. Memory and timeout are managed by Cloudflare. 
- + Args: cached_function: The function to update benchmark: The benchmark with new configuration @@ -780,9 +783,9 @@ def update_function_configuration( # - CPU time: 50ms (free), 50ms-30s (paid) # - Memory: 128MB # Most configuration is handled via wrangler.toml or API settings - + worker = cast(CloudflareWorker, cached_function) - + # For environment variables or KV namespaces, we would use the API here # For now, we'll just log that configuration update was requested self.logging.info( @@ -793,11 +796,11 @@ def update_function_configuration( def default_function_name(self, code_package: Benchmark, resources=None) -> str: """ Generate a default function name for Cloudflare Workers. - + Args: code_package: The benchmark package resources: Optional resources (not used) - + Returns: Default function name """ @@ -811,15 +814,15 @@ def default_function_name(self, code_package: Benchmark, resources=None) -> str: def format_function_name(name: str) -> str: """ Format a function name to comply with Cloudflare Worker naming rules. - + Worker names must: - Be lowercase - Contain only alphanumeric characters and hyphens - Not start or end with a hyphen - + Args: name: The original name - + Returns: Formatted name """ @@ -834,11 +837,11 @@ def format_function_name(name: str) -> str: def enforce_cold_start(self, functions: List[Function], code_package: Benchmark): """ Enforce cold start for Cloudflare Workers. - + Note: Cloudflare Workers don't have a traditional cold start mechanism like AWS Lambda. Workers are instantiated on-demand at edge locations. We can't force a cold start, but we can update the worker to invalidate caches. - + Args: functions: List of functions to enforce cold start on code_package: The benchmark package @@ -858,12 +861,12 @@ def download_metrics( ): """ Extract per-invocation metrics from ExecutionResult objects. 
- + The metrics are extracted from the 'measurement' field in the benchmark response, which is populated by the Cloudflare Worker handler during execution. This approach avoids dependency on Analytics Engine and provides immediate, accurate metrics for each invocation. - + Args: function_name: Name of the worker start_time: Start time (Unix timestamp in seconds) - not used @@ -874,12 +877,12 @@ def download_metrics( if not requests: self.logging.warning("No requests to extract metrics from") return - + self.logging.info( f"Extracting metrics from {len(requests)} invocations " f"of worker {function_name}" ) - + # Aggregate statistics from all requests total_invocations = len(requests) cold_starts = 0 @@ -887,38 +890,38 @@ def download_metrics( cpu_times = [] wall_times = [] memory_values = [] - + for request_id, result in requests.items(): # Count cold/warm starts if result.stats.cold_start: cold_starts += 1 else: warm_starts += 1 - + # Collect CPU times if result.provider_times.execution > 0: cpu_times.append(result.provider_times.execution) - + # Collect wall times (benchmark times) if result.times.benchmark > 0: wall_times.append(result.times.benchmark) - + # Collect memory usage if result.stats.memory_used > 0: memory_values.append(result.stats.memory_used) - + # Set billing info for Cloudflare Workers - # Cloudflare billing: $0.50 per million requests + + # Cloudflare billing: $0.50 per million requests + # $12.50 per million GB-seconds of CPU time if result.provider_times.execution > 0: result.billing.memory = 128 # Cloudflare Workers: fixed 128MB result.billing.billed_time = result.provider_times.execution # μs - + # GB-seconds calculation: (128MB / 1024MB/GB) * (cpu_time_us / 1000000 us/s) cpu_time_seconds = result.provider_times.execution / 1_000_000.0 gb_seconds = (128.0 / 1024.0) * cpu_time_seconds result.billing.gb_seconds = int(gb_seconds * 1_000_000) # micro GB-seconds - + # Calculate statistics metrics['cloudflare'] = { 'total_invocations': 
total_invocations, @@ -927,34 +930,34 @@ def download_metrics( 'data_source': 'response_measurements', 'note': 'Per-invocation metrics extracted from benchmark response' } - + if cpu_times: metrics['cloudflare']['avg_cpu_time_us'] = sum(cpu_times) // len(cpu_times) metrics['cloudflare']['min_cpu_time_us'] = min(cpu_times) metrics['cloudflare']['max_cpu_time_us'] = max(cpu_times) metrics['cloudflare']['cpu_time_measurements'] = len(cpu_times) - + if wall_times: metrics['cloudflare']['avg_wall_time_us'] = sum(wall_times) // len(wall_times) metrics['cloudflare']['min_wall_time_us'] = min(wall_times) metrics['cloudflare']['max_wall_time_us'] = max(wall_times) metrics['cloudflare']['wall_time_measurements'] = len(wall_times) - + if memory_values: metrics['cloudflare']['avg_memory_mb'] = sum(memory_values) / len(memory_values) metrics['cloudflare']['min_memory_mb'] = min(memory_values) metrics['cloudflare']['max_memory_mb'] = max(memory_values) metrics['cloudflare']['memory_measurements'] = len(memory_values) - + self.logging.info( f"Extracted metrics from {total_invocations} invocations: " f"{cold_starts} cold starts, {warm_starts} warm starts" ) - + if cpu_times: avg_cpu_ms = sum(cpu_times) / len(cpu_times) / 1000.0 self.logging.info(f"Average CPU time: {avg_cpu_ms:.2f} ms") - + if wall_times: avg_wall_ms = sum(wall_times) / len(wall_times) / 1000.0 self.logging.info(f"Average wall time: {avg_wall_ms:.2f} ms") @@ -964,18 +967,18 @@ def create_trigger( ) -> Trigger: """ Create a trigger for a Cloudflare Worker. 
- + Args: function: The function to create a trigger for trigger_type: Type of trigger to create - + Returns: The created trigger """ from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger - + worker = cast(CloudflareWorker, function) - + if trigger_type == Trigger.TriggerType.LIBRARY: trigger = LibraryTrigger(worker.name, self) trigger.logging_handlers = self.logging_handlers @@ -994,7 +997,7 @@ def create_trigger( def shutdown(self) -> None: """ Shutdown the Cloudflare system. - + Saves configuration to cache. """ try: From cd183b89636c138c74b57f923038af991030e20c Mon Sep 17 00:00:00 2001 From: "ldzgch (MacOS)" Date: Fri, 28 Nov 2025 17:18:47 +0100 Subject: [PATCH 025/230] make it read the requirements.txt when it has a number --- sebs/cloudflare/cloudflare.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index f6c60956b..13ab9b79c 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -396,6 +396,13 @@ def package_code( elif language_name == "python": requirements_file = os.path.join(directory, "requirements.txt") + + if os.path.exists(f"{requirements_file}.{language_version}"): + src = f"{requirements_file}.{language_version}" + dest = requirements_file + shutil.move(src, dest) + self.logging.info(f"move {src} to {dest}") + if os.path.exists(requirements_file): self.logging.info(f"Installing Python dependencies in {directory}") try: From 9379f39348361b83ec591067a0dbd10283a11ed8 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 30 Nov 2025 16:01:41 +0100 Subject: [PATCH 026/230] durable objects compatibility for nodejs --- .../wrappers/cloudflare/nodejs/build.js | 54 +++++++++- .../wrappers/cloudflare/nodejs/handler.js | 60 ++++++++++- .../wrappers/cloudflare/nodejs/nosql.js | 100 ++++++++---------- sebs/cloudflare/cloudflare.py | 37 +++++-- sebs/cloudflare/durable_objects.py | 62 ++++++----- 5 files changed, 215 insertions(+), 98 deletions(-) diff --git 
a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index a9d7ebcb0..e6bb65dd1 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -39,7 +39,7 @@ const nodeBuiltinsPlugin = { name: 'node-builtins-external', setup(build) { // Keep node: prefixed modules external - build.onResolve({ filter: /^node:/ }, (args) => { + build.onResolve({ filter: /^(node:|cloudflare:)/ }, (args) => { return { path: args.path, external: true }; }); @@ -50,6 +50,56 @@ const nodeBuiltinsPlugin = { } }; +const asyncNosqlPlugin = { + name: 'async-nosql-transformer', + setup(build) { + // Transform function.js to make it async-compatible + build.onLoad({ filter: /function\.js$/ }, async (args) => { + let contents = await fs.promises.readFile(args.path, 'utf8'); + + console.log('🔧 Transforming function.js for async nosql...'); + + // Step 1: Add await before nosqlClient method calls + contents = contents.replace(/(\s*)((?:const|let|var)\s+\w+\s*=\s*)?nosqlClient\.(insert|get|update|query|delete)\s*\(/g, + '$1$2await nosqlClient.$3('); + + // Step 2: Make all function declarations async + contents = contents.replace(/\bfunction\s+(\w+)\s*\(/g, 'async function $1('); + + // Step 3: Make exported handler async if not already + contents = contents.replace(/exports\.handler\s*=\s*function\s*\(/g, 'exports.handler = async function('); + + // Step 4: Add await before specific function calls + // Split into lines to avoid matching function declarations + const lines = contents.split('\n'); + const transformedLines = lines.map(line => { + // Skip lines that contain function declarations + if (line.match(/\b(async\s+)?function\s+\w+\s*\(/)) { + return line; + } + + // Transform function calls in this line + const functionNames = ['addProduct', 'getProducts', 'queryProducts', 'updateProducts', 'deleteProducts']; + for (const funcName of functionNames) { + // Match calls: spaces + optional 
assignment + functionName( + const callRegex = new RegExp(`(\\s*)((?:const|let|var)\\s+\\w+\\s*=\\s*)?(${funcName})\\s*\\(`, 'g'); + line = line.replace(callRegex, '$1$2await $3('); + } + + return line; + }); + contents = transformedLines.join('\n'); + + console.log('✓ Transformed function.js for async nosql'); + + return { + contents, + loader: 'js', + }; + }); + } +}; + async function customBuild() { const srcDir = './'; const outDir = './dist'; @@ -83,7 +133,7 @@ async function customBuild() { target: 'es2020', sourcemap: true, allowOverwrite: true, - plugins: [nodeBuiltinsPlugin], + plugins: [nodeBuiltinsPlugin, asyncNosqlPlugin], define: { 'process.env.NODE_ENV': '"production"', 'global': 'globalThis', diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 8baa646b3..c72b7d5ec 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -1,4 +1,29 @@ +import { DurableObject } from "cloudflare:workers"; +// Durable Object class for KV API compatibility +export class KVApiObject extends DurableObject { + constructor(state, env) { + super(state, env); + this.storage = state.storage; + } + + // Proxy methods to make the storage API accessible from the stub + async put(key, value) { + return await this.storage.put(key, value); + } + + async get(key) { + return await this.storage.get(key); + } + + async delete(key) { + return await this.storage.delete(key); + } + + async list(options) { + return await this.storage.list(options); + } +} export default { async fetch(request, env) { @@ -82,14 +107,43 @@ export default { // don't fail the request if storage init isn't available } + // Initialize nosql if environment variable is set + if (env.NOSQL_STORAGE_DATABASE) { + try { + const nosqlModule = await import('./nosql.js'); + if (nosqlModule && nosqlModule.nosql && typeof nosqlModule.nosql.init_instance === 'function') { + 
nosqlModule.nosql.init_instance({ env, request }); + } + } catch (e) { + // nosql module might not exist for all benchmarks + console.log('Could not initialize nosql:', e.message); + } + } + // Execute the benchmark handler let ret; try { + // Wrap the handler execution to handle sync-style async code + // The benchmark code calls async nosql methods but doesn't await them + // We need to serialize the execution if (funcModule && typeof funcModule.handler === 'function') { - // handler may be sync or return a promise - ret = await Promise.resolve(funcModule.handler(event)); + // Create a promise-aware execution context + const handler = funcModule.handler; + + // Execute handler - it will return { result: [Promise, Promise, ...] } + ret = await Promise.resolve(handler(event)); + + // Deeply resolve all promises in the result + if (ret && ret.result && Array.isArray(ret.result)) { + ret.result = await Promise.all(ret.result.map(async item => await Promise.resolve(item))); + } } else if (funcModule && funcModule.default && typeof funcModule.default.handler === 'function') { - ret = await Promise.resolve(funcModule.default.handler(event)); + const handler = funcModule.default.handler; + ret = await Promise.resolve(handler(event)); + + if (ret && ret.result && Array.isArray(ret.result)) { + ret.result = await Promise.all(ret.result.map(async item => await Promise.resolve(item))); + } } else { throw new Error('benchmark handler function not found'); } diff --git a/benchmarks/wrappers/cloudflare/nodejs/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/nosql.js index 2841d3942..67b73a1fd 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/nosql.js +++ b/benchmarks/wrappers/cloudflare/nodejs/nosql.js @@ -1,106 +1,90 @@ // NoSQL wrapper for Cloudflare Workers -// Supports Cloudflare KV or Durable Objects when available +// Uses Durable Objects for storage +// Returns Promises that the handler will resolve class nosql { constructor() { - this.handle = null; // KV or Durable 
Object binding - this._tables = {}; + this.env = null; } static init_instance(entry) { - nosql.instance = new nosql(); + // Reuse existing instance if it exists, otherwise create new one + if (!nosql.instance) { + nosql.instance = new nosql(); + } + if (entry && entry.env) { nosql.instance.env = entry.env; } } _get_table(tableName) { - if (!(tableName in this._tables)) { - const envName = `NOSQL_STORAGE_TABLE_${tableName}`; - - if (this.env && this.env[envName]) { - this._tables[tableName] = this.env[envName]; - } else if (this.env && this.env[tableName]) { - // Try direct table name - this._tables[tableName] = this.env[tableName]; - } else { - throw new Error( - `Couldn't find an environment variable ${envName} for table ${tableName}` - ); - } + // Don't cache stubs - they are request-scoped and cannot be reused + // Always create a fresh stub for each request + if (!this.env) { + throw new Error(`nosql env not initialized for table ${tableName}`); } - - return this._tables[tableName]; + + if (!this.env.DURABLE_STORE) { + // Debug: log what we have + const envKeys = Object.keys(this.env || {}); + const durableStoreType = typeof this.env.DURABLE_STORE; + throw new Error( + `DURABLE_STORE binding not found. 
env keys: [${envKeys.join(', ')}], DURABLE_STORE type: ${durableStoreType}` + ); + } + + // Get a Durable Object ID based on the table name and create a fresh stub + const id = this.env.DURABLE_STORE.idFromName(tableName); + return this.env.DURABLE_STORE.get(id); } + // Async methods - build.js will patch function.js to await these async insert(tableName, primaryKey, secondaryKey, data) { const keyData = { ...data }; keyData[primaryKey[0]] = primaryKey[1]; keyData[secondaryKey[0]] = secondaryKey[1]; - const table = this._get_table(tableName); + const durableObjStub = this._get_table(tableName); const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; - // For KV binding - if (table && typeof table.put === 'function') { - await table.put(compositeKey, JSON.stringify(keyData)); - } else { - throw new Error('NoSQL table binding not properly configured'); - } + await durableObjStub.put(compositeKey, keyData); } async get(tableName, primaryKey, secondaryKey) { - const table = this._get_table(tableName); + const durableObjStub = this._get_table(tableName); const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; - if (table && typeof table.get === 'function') { - const result = await table.get(compositeKey); - if (result) { - return JSON.parse(result); - } - return null; - } - - throw new Error('NoSQL table binding not properly configured'); + const result = await durableObjStub.get(compositeKey); + return result || null; } async update(tableName, primaryKey, secondaryKey, updates) { - // For simple KV, update is same as put with merged data const existing = await this.get(tableName, primaryKey, secondaryKey) || {}; const merged = { ...existing, ...updates }; await this.insert(tableName, primaryKey, secondaryKey, merged); } async query(tableName, primaryKey, secondaryKeyName) { - const table = this._get_table(tableName); + const durableObjStub = this._get_table(tableName); const prefix = `${primaryKey[1]}#`; - if (table && typeof table.list === 'function') { - 
const list = await table.list({ prefix }); - const results = []; - - for (const key of list.keys) { - const value = await table.get(key.name); - if (value) { - results.push(JSON.parse(value)); - } - } - - return results; + // List all keys with the prefix + const allEntries = await durableObjStub.list({ prefix }); + const results = []; + + for (const [key, value] of allEntries) { + results.push(value); } - throw new Error('NoSQL table binding not properly configured'); + return results; } async delete(tableName, primaryKey, secondaryKey) { - const table = this._get_table(tableName); + const durableObjStub = this._get_table(tableName); const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; - if (table && typeof table.delete === 'function') { - await table.delete(compositeKey); - } else { - throw new Error('NoSQL table binding not properly configured'); - } + await durableObjStub.delete(compositeKey); } static get_instance() { @@ -111,4 +95,4 @@ class nosql { } } -module.exports.nosql = nosql; +export { nosql }; diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 13ab9b79c..af183865d 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -188,7 +188,7 @@ def _ensure_wrangler_installed(self): raise RuntimeError("Wrangler version check timed out") - def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None) -> str: + def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None) -> str: """ Generate a wrangler.toml configuration file for the worker. 
@@ -198,6 +198,7 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: language: Programming language (nodejs or python) account_id: Cloudflare account ID benchmark_name: Optional benchmark name for R2 file path prefix + code_package: Optional benchmark package for nosql configuration Returns: Path to the generated wrangler.toml file @@ -238,13 +239,30 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: compatibility_flags = ["python_workers"] """ + toml_content += """ +[[durable_objects.bindings]] +name = "DURABLE_STORE" +class_name = "KVApiObject" - # Add environment variable for benchmark name (used by fs-polyfill for R2 paths) +[[migrations]] +tag = "v1" +new_classes = ["KVApiObject"] +""" + + + # Add environment variables + vars_content = "" if benchmark_name: - toml_content += f"""# Benchmark name used for R2 file path prefix + vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' + + # Add nosql configuration if benchmark uses it + if code_package and code_package.uses_nosql: + vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' + + if vars_content: + toml_content += f"""# Environment variables [vars] -BENCHMARK_NAME = "{benchmark_name}" - +{vars_content} """ # Add R2 bucket binding for benchmarking files (required for fs/path polyfills) @@ -533,7 +551,7 @@ def create_function( self.logging.info(f"Creating new worker {func_name}") # Create the worker with all package files - self._create_or_update_worker(func_name, package, account_id, language, benchmark) + self._create_or_update_worker(func_name, package, account_id, language, benchmark, code_package) worker = CloudflareWorker( func_name, @@ -580,7 +598,7 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: return None def _create_or_update_worker( - self, worker_name: str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None + self, worker_name: str, package_dir: str, 
account_id: str, language: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None ) -> dict: """Create or update a Cloudflare Worker using Wrangler CLI. @@ -590,6 +608,7 @@ def _create_or_update_worker( account_id: Cloudflare account ID language: Programming language (nodejs or python) benchmark_name: Optional benchmark name for R2 file path prefix + code_package: Optional benchmark package for nosql configuration Returns: Worker deployment result @@ -608,7 +627,7 @@ def _create_or_update_worker( # self.logging.error(f"Failed to convert function.js to ESM: {e}") # raise # Generate wrangler.toml for this worker - self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name) + self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package) # Set up environment for Wrangler env = os.environ.copy() @@ -767,7 +786,7 @@ def update_function( if not account_id: raise RuntimeError("Account ID is required to update worker") - self._create_or_update_worker(worker.name, package, account_id, language, benchmark) + self._create_or_update_worker(worker.name, package, account_id, language, benchmark, code_package) self.logging.info(f"Updated worker {worker.name}") # Update configuration if needed diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py index 03d36c179..8997ebf92 100644 --- a/sebs/cloudflare/durable_objects.py +++ b/sebs/cloudflare/durable_objects.py @@ -1,5 +1,4 @@ import json -import requests from collections import defaultdict from typing import Dict, Optional, Tuple @@ -13,10 +12,11 @@ class DurableObjects(NoSQLStorage): """ Cloudflare Durable Objects implementation for NoSQL storage. - Note: Durable Objects are not a traditional NoSQL database like DynamoDB or CosmosDB. - They are stateful Workers with persistent storage. 
This implementation provides - a minimal interface to satisfy SeBS requirements, but full table operations - are not supported. + Note: Durable Objects are not managed via API like DynamoDB or CosmosDB. + Instead, they are defined in the Worker code and wrangler.toml, and accessed + via bindings in the Worker environment. This implementation provides a minimal + interface to satisfy SeBS requirements by tracking table names without actual + API-based table creation. """ @staticmethod @@ -35,7 +35,7 @@ def __init__( credentials: CloudflareCredentials, ): super().__init__(region, cache_client, resources) - self._credentials = credentials + # Tables are just logical names - Durable Objects are accessed via Worker bindings self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) def _get_auth_headers(self) -> dict[str, str]: @@ -116,28 +116,28 @@ def create_table( self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None ) -> str: """ - Create a table (Durable Object namespace). + Register a table name for a benchmark. Note: Durable Objects don't have traditional table creation via API. - They are defined in the Worker code and wrangler.toml. - This method just tracks the table name. + They are defined in the Worker code and wrangler.toml, and accessed via + bindings. This method just tracks the logical table name for the wrapper + to use when accessing the Durable Object binding. 
:param benchmark: benchmark name :param name: table name :param primary_key: primary key field name :param secondary_key: optional secondary key field name - :return: table name + :return: table name (same as input name - used directly as binding name) """ - resource_id = self._cloud_resources.resources_id - table_name = f"sebs-benchmarks-{resource_id}-{benchmark}-{name}" - - self._tables[benchmark][name] = table_name + # For Cloudflare, table names are used directly as the binding names + # in the wrapper code, so we just use the simple name + self._tables[benchmark][name] = name self.logging.info( - f"Registered Durable Objects table {table_name} for benchmark {benchmark}" + f"Registered Durable Object table '{name}' for benchmark {benchmark}" ) - return table_name + return name def write_to_table( self, @@ -150,8 +150,10 @@ def write_to_table( """ Write data to a table (Durable Object). - Note: This would require HTTP requests to the Durable Object endpoints. - For now, this is not fully implemented. + Note: Cloudflare Durable Objects can only be written to from within the Worker, + not via external API calls. Data seeding for benchmarks is not supported. + Benchmarks that require pre-populated data (like test/small sizes of crud-api) + will return empty results. Use 'large' size which creates its own data. :param benchmark: benchmark name :param table: table name @@ -164,23 +166,25 @@ def write_to_table( if not table_name: raise ValueError(f"Table {table} not found for benchmark {benchmark}") - self.logging.warning( - f"write_to_table not fully implemented for Durable Objects table {table_name}" - ) + # Silently skip data seeding for Cloudflare Durable Objects + # This is a platform limitation + pass def clear_table(self, name: str) -> str: """ Clear all data from a table. + Note: Durable Object data is managed within the Worker. 
+ :param name: table name :return: table name """ - self.logging.warning(f"clear_table not fully implemented for Durable Objects table {name}") + self.logging.info(f"Durable Objects data is managed within the Worker") return name def remove_table(self, name: str) -> str: """ - Remove a table. + Remove a table from tracking. :param name: table name :return: table name @@ -202,8 +206,14 @@ def envs(self) -> dict: """ Get environment variables for accessing Durable Objects. + Durable Objects are accessed via bindings in the Worker environment, + which are configured in wrangler.toml. We set a marker environment + variable so the wrapper knows Durable Objects are available. + :return: dictionary of environment variables """ - # Durable Objects are accessed via bindings in the Worker - # No additional environment variables needed - return {} + # Set a marker that Durable Objects are enabled + # The actual bindings (DURABLE_STORE, etc.) are configured in wrangler.toml + return { + "NOSQL_STORAGE_DATABASE": "durable_objects" + } From 5f9ad9c5157607f0b3932000cc0dc3d205618ef6 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 30 Nov 2025 16:08:07 +0100 Subject: [PATCH 027/230] asyncified the function calls... 
--- .../wrappers/cloudflare/nodejs/build.js | 50 +++++++++++++------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index e6bb65dd1..fc946e845 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -57,34 +57,52 @@ const asyncNosqlPlugin = { build.onLoad({ filter: /function\.js$/ }, async (args) => { let contents = await fs.promises.readFile(args.path, 'utf8'); + // Only transform if file uses nosql + if (!contents.includes('nosqlClient')) { + return { contents, loader: 'js' }; + } + console.log('🔧 Transforming function.js for async nosql...'); // Step 1: Add await before nosqlClient method calls contents = contents.replace(/(\s*)((?:const|let|var)\s+\w+\s*=\s*)?nosqlClient\.(insert|get|update|query|delete)\s*\(/g, '$1$2await nosqlClient.$3('); - // Step 2: Make all function declarations async - contents = contents.replace(/\bfunction\s+(\w+)\s*\(/g, 'async function $1('); + // Step 2: Make all function declarations async (but not function expressions) + contents = contents.replace(/^(\s*)function\s+(\w+)\s*\(/gm, '$1async function $2('); - // Step 3: Make exported handler async if not already - contents = contents.replace(/exports\.handler\s*=\s*function\s*\(/g, 'exports.handler = async function('); - - // Step 4: Add await before specific function calls - // Split into lines to avoid matching function declarations + // Step 3: Add await before user-defined function calls + // Process line by line to handle specific patterns const lines = contents.split('\n'); const transformedLines = lines.map(line => { - // Skip lines that contain function declarations - if (line.match(/\b(async\s+)?function\s+\w+\s*\(/)) { + // Skip lines with function declarations or function expressions + if (line.match(/\bfunction\s+\w+\s*\(/) || line.match(/=\s*(async\s+)?function\s*\(/)) { return line; } - // Transform 
function calls in this line - const functionNames = ['addProduct', 'getProducts', 'queryProducts', 'updateProducts', 'deleteProducts']; - for (const funcName of functionNames) { - // Match calls: spaces + optional assignment + functionName( - const callRegex = new RegExp(`(\\s*)((?:const|let|var)\\s+\\w+\\s*=\\s*)?(${funcName})\\s*\\(`, 'g'); - line = line.replace(callRegex, '$1$2await $3('); - } + // Add await before function calls that look like user-defined functions + // Match: identifier followed by ( where identifier starts line or follows whitespace/operators + // but NOT if preceded by = (assignment), . (method call), or keywords + line = line.replace(/(^|\s+|;|,|\()((?:const|let|var)\s+\w+\s*=\s*)?(\w+)\s*\(/g, (match, prefix, assignment, funcName) => { + // Skip control flow keywords + const controlFlow = ['if', 'for', 'while', 'switch', 'catch', 'return']; + if (controlFlow.includes(funcName)) { + return match; + } + + // Skip built-in JavaScript functions and methods + const builtins = ['console', 'require', 'push', 'join', 'split', + 'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', + 'some', 'every', 'includes', 'parseInt', 'parseFloat', + 'isNaN', 'Array', 'Object', 'String', 'Number', 'Boolean', + 'Math', 'JSON', 'Date', 'RegExp', 'Error', 'Promise']; + if (builtins.includes(funcName)) { + return match; + } + + // Add await for everything else + return `${prefix}${assignment || ''}await ${funcName}(`; + }); return line; }); From 92db5ae52e15ec33d5e10fc59ced1cde1e2baef7 Mon Sep 17 00:00:00 2001 From: ldzgch Date: Mon, 1 Dec 2025 23:21:14 +0100 Subject: [PATCH 028/230] fix python vendored modules --- .../python/function_cloudflare.py | 56 +++++++ .../wrappers/cloudflare/python/handler.py | 20 +-- sebs/cloudflare/cloudflare.py | 145 ++++++++++++++---- 3 files changed, 179 insertions(+), 42 deletions(-) create mode 100644 benchmarks/100.webapps/120.uploader/python/function_cloudflare.py diff --git 
a/benchmarks/100.webapps/120.uploader/python/function_cloudflare.py b/benchmarks/100.webapps/120.uploader/python/function_cloudflare.py new file mode 100644 index 000000000..98372cf0f --- /dev/null +++ b/benchmarks/100.webapps/120.uploader/python/function_cloudflare.py @@ -0,0 +1,56 @@ + +import datetime +import os + +from pyodide.ffi import run_sync +from pyodide.http import pyfetch + +from . import storage +client = storage.storage.get_instance() + +SEBS_USER_AGENT = "SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2" + +async def do_request(url, download_path): + headers = {'User-Agent': SEBS_USER_AGENT} + + res = await pyfetch(url, headers=headers) + bs = await res.bytes() + + with open(download_path, 'wb') as f: + f.write(bs) + +def handler(event): + + bucket = event.get('bucket').get('bucket') + output_prefix = event.get('bucket').get('output') + url = event.get('object').get('url') + name = os.path.basename(url) + download_path = '/tmp/{}'.format(name) + + process_begin = datetime.datetime.now() + + run_sync(do_request(url, download_path)) + + size = os.path.getsize(download_path) + process_end = datetime.datetime.now() + + upload_begin = datetime.datetime.now() + key_name = client.upload(bucket, os.path.join(output_prefix, name), download_path) + upload_end = datetime.datetime.now() + + process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) + upload_time = (upload_end - upload_begin) / datetime.timedelta(microseconds=1) + return { + 'result': { + 'bucket': bucket, + 'url': url, + 'key': key_name + }, + 'measurement': { + 'download_time': 0, + 'download_size': 0, + 'upload_time': upload_time, + 'upload_size': size, + 'compute_time': process_time + } + } diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 332c2b67b..3f8ba6ca8 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ 
b/benchmarks/wrappers/cloudflare/python/handler.py @@ -21,7 +21,7 @@ class KVApiObject(DurableObject): def __getattr__(self, name): return getattr(self.ctx.storage, name) - + class Default(WorkerEntrypoint): async def fetch(self, request, env): try: @@ -70,11 +70,11 @@ async def fetch2(self, request, env): storage.storage.init_instance(self) - if 'NOSQL_STORAGE_DATABASE' in os.environ: + if self.env.NOSQL_STORAGE_DATABASE: from function import nosql - - nosql.nosql.get_instance(self) - + + nosql.nosql.init_instance(self) + print("event:", event) @@ -82,7 +82,7 @@ async def fetch2(self, request, env): ## function = import_from_path("function.function", "/tmp/function.py") from function import function - + ret = function.handler(event) log_data = { @@ -136,14 +136,14 @@ def visit_FunctionDef(self, node): class AddAwait(ast.NodeTransformer): to_find = ["upload_stream", "download_stream", "upload", "download", "download_directory"] - + def visit_Call(self, node): if isinstance(node.func, ast.Attribute) and node.func.attr in self.to_find: #print(ast.dump(node.func, indent=2)) return ast.Await(value=node) - + return node - + def make_benchmark_func(): with open(working_dir +"/function/function.py") as f: module = ast.parse(f.read()) @@ -155,5 +155,5 @@ def make_benchmark_func(): ##print() with open("/tmp/function.py", "w") as wf: wf.write(new_source) - + diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index af183865d..946e5203c 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -187,6 +187,40 @@ def _ensure_wrangler_installed(self): except subprocess.TimeoutExpired: raise RuntimeError("Wrangler version check timed out") + def _ensure_pywrangler_installed(self): + """Necessary to download python dependencies""" + try: + result = subprocess.run( + ["pywrangler", "--version"], + capture_output=True, + text=True, + check=True, + timeout=10 + ) + version = result.stdout.strip() + self.logging.info(f"pywrangler is 
installed: {version}") + except (subprocess.CalledProcessError, FileNotFoundError): + self.logging.info("pywrangler not found, installing globally via uv tool install...") + try: + result = subprocess.run( + ["uv", "tool", "install", "workers-py"], + capture_output=True, + text=True, + check=True, + timeout=120 + ) + self.logging.info("pywrangler installed successfully") + if result.stdout: + self.logging.debug(f"uv tool install workers-py output: {result.stdout}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to install pywrangler: {e.stderr}") + except FileNotFoundError: + raise RuntimeError( + "uv not found. Please install uv." + ) + except subprocess.TimeoutExpired: + raise RuntimeError("pywrangler version check timed out") + def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None) -> str: """ @@ -254,11 +288,11 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: vars_content = "" if benchmark_name: vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' - + # Add nosql configuration if benchmark uses it if code_package and code_package.uses_nosql: vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' - + if vars_content: toml_content += f"""# Environment variables [vars] @@ -274,12 +308,12 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: toml_content += f"""# R2 bucket binding for benchmarking files # This bucket is used by fs and path polyfills to read benchmark data [[r2_buckets]] -binding = "R2" +binding = "{bucket_name}" bucket_name = "{bucket_name}" """ r2_bucket_configured = True - self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") + self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as '{bucket_name}'") except Exception as e: self.logging.warning( f"R2 bucket binding not 
configured: {e}. " @@ -343,12 +377,12 @@ def package_code( "Container deployment is not supported for Cloudflare Workers" ) - # Ensure Wrangler is installed - self._ensure_wrangler_installed() - # Install dependencies if language_name == "nodejs": + # Ensure Wrangler is installed + self._ensure_wrangler_installed() + package_file = os.path.join(directory, "package.json") node_modules = os.path.join(directory, "node_modules") @@ -413,43 +447,90 @@ def package_code( self.logging.warning(f"Failed to install esbuild: {e}") elif language_name == "python": - requirements_file = os.path.join(directory, "requirements.txt") + # Ensure Wrangler is installed + self._ensure_pywrangler_installed() + requirements_file = os.path.join(directory, "requirements.txt") if os.path.exists(f"{requirements_file}.{language_version}"): src = f"{requirements_file}.{language_version}" dest = requirements_file shutil.move(src, dest) self.logging.info(f"move {src} to {dest}") + + + # move function_cloudflare.py into function.py + function_cloudflare_file = os.path.join(directory, "function_cloudflare.py") + if os.path.exists(function_cloudflare_file): + src = function_cloudflare_file + dest = os.path.join(directory, "function.py") + shutil.move(src, dest) + self.logging.info(f"move {src} to {dest}") + if os.path.exists(requirements_file): - self.logging.info(f"Installing Python dependencies in {directory}") - try: - # Install to a local directory that can be bundled - target_dir = os.path.join(directory, "python_modules") - result = subprocess.run( - ["pip", "install", "-r", "requirements.txt", "-t", target_dir], - cwd=directory, - capture_output=True, - text=True, - check=True - ) - self.logging.info("pip install completed successfully") - if result.stdout: - self.logging.debug(f"pip output: {result.stdout}") - except subprocess.CalledProcessError as e: - self.logging.error(f"pip install failed: {e.stderr}") - raise RuntimeError(f"Failed to install Python dependencies: {e.stderr}") - except 
FileNotFoundError: - raise RuntimeError( - "pip not found. Please install Python and pip to deploy Python benchmarks." - ) + with open(requirements_file, 'r') as reqf: + reqtext = reqf.read() + supported_pkg = \ +['affine', 'aiohappyeyeballs', 'aiohttp', 'aiosignal', 'altair', 'annotated-types',\ +'anyio', 'apsw', 'argon2-cffi', 'argon2-cffi-bindings', 'asciitree', 'astropy', 'astropy_iers_data',\ +'asttokens', 'async-timeout', 'atomicwrites', 'attrs', 'audioop-lts', 'autograd', 'awkward-cpp', 'b2d',\ +'bcrypt', 'beautifulsoup4', 'bilby.cython', 'biopython', 'bitarray', 'bitstring', 'bleach', 'blosc2', 'bokeh',\ +'boost-histogram', 'brotli', 'cachetools', 'casadi', 'cbor-diag', 'certifi', 'cffi', 'cffi_example', 'cftime',\ +'charset-normalizer', 'clarabel', 'click', 'cligj', 'clingo', 'cloudpickle', 'cmyt', 'cobs', 'colorspacious',\ +'contourpy', 'coolprop', 'coverage', 'cramjam', 'crc32c', 'cryptography', 'css-inline', 'cssselect', 'cvxpy-base', 'cycler',\ +'cysignals', 'cytoolz', 'decorator', 'demes', 'deprecation', 'diskcache', 'distlib', 'distro', 'docutils', 'donfig',\ +'ewah_bool_utils', 'exceptiongroup', 'executing', 'fastapi', 'fastcan', 'fastparquet', 'fiona', 'fonttools', 'freesasa',\ +'frozenlist', 'fsspec', 'future', 'galpy', 'gmpy2', 'gsw', 'h11', 'h3', 'h5py', 'highspy', 'html5lib', 'httpcore',\ +'httpx', 'idna', 'igraph', 'imageio', 'imgui-bundle', 'iminuit', 'iniconfig', 'inspice', 'ipython', 'jedi', 'Jinja2',\ +'jiter', 'joblib', 'jsonpatch', 'jsonpointer', 'jsonschema', 'jsonschema_specifications', 'kiwisolver',\ +'lakers-python', 'lazy_loader', 'lazy-object-proxy', 'libcst', 'lightgbm', 'logbook', 'lxml', 'lz4', 'MarkupSafe',\ +'matplotlib', 'matplotlib-inline', 'memory-allocator', 'micropip', 'mmh3', 'more-itertools', 'mpmath',\ +'msgpack', 'msgspec', 'msprime', 'multidict', 'munch', 'mypy', 'narwhals', 'ndindex', 'netcdf4', 'networkx',\ +'newick', 'nh3', 'nlopt', 'nltk', 'numcodecs', 'numpy', 'openai', 'opencv-python', 'optlang', 'orjson',\ 
+'packaging', 'pandas', 'parso', 'patsy', 'pcodec', 'peewee', 'pi-heif', 'Pillow', 'pillow-heif', 'pkgconfig',\ +'platformdirs', 'pluggy', 'ply', 'pplpy', 'primecountpy', 'prompt_toolkit', 'propcache', 'protobuf', 'pure-eval',\ +'py', 'pyclipper', 'pycparser', 'pycryptodome', 'pydantic', 'pydantic_core', 'pyerfa', 'pygame-ce', 'Pygments',\ +'pyheif', 'pyiceberg', 'pyinstrument', 'pylimer-tools', 'PyMuPDF', 'pynacl', 'pyodide-http', 'pyodide-unix-timezones',\ +'pyparsing', 'pyrsistent', 'pysam', 'pyshp', 'pytaglib', 'pytest', 'pytest-asyncio', 'pytest-benchmark', 'pytest_httpx',\ +'python-calamine', 'python-dateutil', 'python-flint', 'python-magic', 'python-sat', 'python-solvespace', 'pytz', 'pywavelets',\ +'pyxel', 'pyxirr', 'pyyaml', 'rasterio', 'rateslib', 'rebound', 'reboundx', 'referencing', 'regex', 'requests',\ +'retrying', 'rich', 'river', 'RobotRaconteur', 'rpds-py', 'ruamel.yaml', 'rustworkx', 'scikit-image', 'scikit-learn',\ +'scipy', 'screed', 'setuptools', 'shapely', 'simplejson', 'sisl', 'six', 'smart-open', 'sniffio', 'sortedcontainers',\ +'soundfile', 'soupsieve', 'sourmash', 'soxr', 'sparseqr', 'sqlalchemy', 'stack-data', 'starlette', 'statsmodels', 'strictyaml',\ +'svgwrite', 'swiglpk', 'sympy', 'tblib', 'termcolor', 'texttable', 'texture2ddecoder', 'threadpoolctl', 'tiktoken', 'tomli',\ +'tomli-w', 'toolz', 'tqdm', 'traitlets', 'traits', 'tree-sitter', 'tree-sitter-go', 'tree-sitter-java', 'tree-sitter-python',\ +'tskit', 'typing-extensions', 'tzdata', 'ujson', 'uncertainties', 'unyt', 'urllib3', 'vega-datasets', 'vrplib', 'wcwidth',\ +'webencodings', 'wordcloud', 'wrapt', 'xarray', 'xgboost', 'xlrd', 'xxhash', 'xyzservices', 'yarl', 'yt', 'zengl', 'zfpy', 'zstandard'] + needed_pkg = [] + for pkg in supported_pkg: + if pkg.lower() in reqtext.lower(): + needed_pkg.append(pkg) + + project_file = os.path.join(directory, "pyproject.toml") + depstr = str(needed_pkg).replace("\'", "\"") + with open(project_file, 'w') as pf: + pf.write(f""" +[project] 
+name = "{benchmark.replace(".", "-")}-python-{language_version.replace(".", "")}" +version = "0.1.0" +description = "dummy description" +requires-python = ">={language_version}" +dependencies = {depstr} + +[dependency-groups] +dev = [ + "workers-py", + "workers-runtime-sdk" +] + """) # move into function dir funcdir = os.path.join(directory, "function") if not os.path.exists(funcdir): os.makedirs(funcdir) + dont_move = ["handler.py", "function", "python_modules", "pyproject.toml"] for thing in os.listdir(directory): - if not (thing.endswith("handler.py") or thing.endswith("function") or thing.endswith("python_modules")): + if thing not in dont_move: src = os.path.join(directory, thing) dest = os.path.join(directory, "function", thing) shutil.move(src, dest) @@ -489,7 +570,7 @@ def package_code( total_size += os.path.getsize(filepath) mbytes = total_size / 1024.0 / 1024.0 - self.logging.info(f"Worker package size: {mbytes:.2f} MB") + self.logging.info(f"Worker package size: {mbytes:.2f} MB (Python: missing vendored modules)") return (directory, total_size, "") @@ -644,7 +725,7 @@ def _create_or_update_worker( try: result = subprocess.run( - ["wrangler", "deploy"], + ["wrangler" if language == "nodejs" else "pywrangler", "deploy"], cwd=package_dir, env=env, capture_output=True, From 51892b092a3857be4f213bc2a09083bb7ee153b5 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 30 Nov 2025 18:19:32 +0100 Subject: [PATCH 029/230] added request polyfill for benchmark 120 --- .../wrappers/cloudflare/nodejs/build.js | 11 ++ .../cloudflare/nodejs/request-polyfill.js | 100 ++++++++++++++++++ configs/systems.json | 3 +- 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index fc946e845..7caf096dd 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ 
b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -38,6 +38,8 @@ function copyFile(src, dest) { const nodeBuiltinsPlugin = { name: 'node-builtins-external', setup(build) { + const { resolve } = require('path'); + // Keep node: prefixed modules external build.onResolve({ filter: /^(node:|cloudflare:)/ }, (args) => { return { path: args.path, external: true }; @@ -47,6 +49,15 @@ const nodeBuiltinsPlugin = { build.onResolve({ filter: /^(fs|querystring|path|crypto|stream|buffer|util|events|http|https|net|tls|zlib|os|child_process|tty|assert|url)$/ }, (args) => { return { path: 'node:' + args.path, external: true }; }); + + // Polyfill 'request' module with fetch-based implementation + build.onResolve({ filter: /^request$/ }, (args) => { + // Get the directory where build.js is located (wrapper directory) + const wrapperDir = __dirname; + return { + path: resolve(wrapperDir, 'request-polyfill.js') + }; + }); } }; diff --git a/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js b/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js new file mode 100644 index 000000000..f44bfa232 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/request-polyfill.js @@ -0,0 +1,100 @@ +/** + * Polyfill for the 'request' module using Cloudflare Workers fetch API + * Implements the minimal interface needed for benchmark compatibility + */ + +const { Writable } = require('node:stream'); +const fs = require('node:fs'); + +function request(url, options, callback) { + // Handle different call signatures + if (typeof options === 'function') { + callback = options; + options = {}; + } + + // Add default headers to mimic a browser request + const fetchOptions = { + ...options, + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Accept': '*/*', + ...((options && options.headers) || {}) + } + }; + + // Create a simple object that has a pipe method + const requestObj = { + 
pipe(destination) { + // Perform the fetch and write to destination + fetch(url, fetchOptions) + .then(async (response) => { + if (!response.ok) { + const error = new Error(`HTTP ${response.status}: ${response.statusText}`); + error.statusCode = response.status; + destination.emit('error', error); + if (callback) callback(error, response, null); + return destination; + } + + // Get the response as arrayBuffer and write it all at once + const buffer = await response.arrayBuffer(); + + // Write the buffer to the destination + if (destination.write) { + destination.write(Buffer.from(buffer)); + destination.end(); + } + + if (callback) callback(null, response, Buffer.from(buffer)); + }) + .catch((error) => { + destination.emit('error', error); + if (callback) callback(error, null, null); + }); + + return destination; + }, + + abort() { + // No-op for compatibility + } + }; + + return requestObj; +} + +// Add common request methods +request.get = (url, options, callback) => { + if (typeof options === 'function') { + callback = options; + options = {}; + } + return request(url, { ...options, method: 'GET' }, callback); +}; + +request.post = (url, options, callback) => { + if (typeof options === 'function') { + callback = options; + options = {}; + } + return request(url, { ...options, method: 'POST' }, callback); +}; + +request.put = (url, options, callback) => { + if (typeof options === 'function') { + callback = options; + options = {}; + } + return request(url, { ...options, method: 'PUT' }, callback); +}; + +request.delete = (url, options, callback) => { + if (typeof options === 'function') { + callback = options; + options = {}; + } + return request(url, { ...options, method: 'DELETE' }, callback); +}; + +module.exports = request; diff --git a/configs/systems.json b/configs/systems.json index 3eb79b648..4ac3131f9 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -462,7 +462,8 @@ "handler.js", "storage.js", "nosql.js", - "build.js" + "build.js", + 
"request-polyfill.js" ], "packages": { "uuid": "3.4.0" From 3235d3f4fd29a8a254d51af1bc34c8b0ca86a905 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 4 Dec 2025 12:00:49 +0100 Subject: [PATCH 030/230] fixed r2 usage for 120, 311 --- .../311.compression/nodejs/function.js | 147 ++++++++++++++++++ .../311.compression/nodejs/package.json | 9 ++ .../wrappers/cloudflare/nodejs/handler.js | 14 +- .../wrappers/cloudflare/nodejs/storage.js | 60 ++++--- sebs/cloudflare/cloudflare.py | 4 +- 5 files changed, 204 insertions(+), 30 deletions(-) create mode 100644 benchmarks/300.utilities/311.compression/nodejs/function.js create mode 100644 benchmarks/300.utilities/311.compression/nodejs/package.json diff --git a/benchmarks/300.utilities/311.compression/nodejs/function.js b/benchmarks/300.utilities/311.compression/nodejs/function.js new file mode 100644 index 000000000..5f7cc04d4 --- /dev/null +++ b/benchmarks/300.utilities/311.compression/nodejs/function.js @@ -0,0 +1,147 @@ +const fs = require('fs'); +const path = require('path'); +const zlib = require('zlib'); +const { v4: uuidv4 } = require('uuid'); +const storage = require('./storage'); + +let storage_handler = new storage.storage(); + +/** + * Calculate total size of a directory recursively + * @param {string} directory - Path to directory + * @returns {number} Total size in bytes + */ +function parseDirectory(directory) { + let size = 0; + + function walkDir(dir) { + const files = fs.readdirSync(dir); + for (const file of files) { + const filepath = path.join(dir, file); + const stat = fs.statSync(filepath); + if (stat.isDirectory()) { + walkDir(filepath); + } else { + size += stat.size; + } + } + } + + walkDir(directory); + return size; +} + +/** + * Create a simple tar.gz archive from a directory using native zlib + * This creates a gzip-compressed tar archive without external dependencies + * @param {string} sourceDir - Directory to compress + * @param {string} outputPath - Path for the output archive file + * 
@returns {Promise} + */ +async function createTarGzArchive(sourceDir, outputPath) { + // Create a simple tar-like format (concatenated files with headers) + const files = []; + + function collectFiles(dir, baseDir = '') { + const entries = fs.readdirSync(dir); + for (const entry of entries) { + const fullPath = path.join(dir, entry); + const relativePath = path.join(baseDir, entry); + const stat = fs.statSync(fullPath); + + if (stat.isDirectory()) { + collectFiles(fullPath, relativePath); + } else { + files.push({ + path: relativePath, + fullPath: fullPath, + size: stat.size + }); + } + } + } + + collectFiles(sourceDir); + + // Create a concatenated buffer of all files with simple headers + const chunks = []; + for (const file of files) { + const content = fs.readFileSync(file.fullPath); + // Simple header: filename length (4 bytes) + filename + content length (4 bytes) + content + const pathBuffer = Buffer.from(file.path); + const pathLengthBuffer = Buffer.allocUnsafe(4); + pathLengthBuffer.writeUInt32BE(pathBuffer.length, 0); + const contentLengthBuffer = Buffer.allocUnsafe(4); + contentLengthBuffer.writeUInt32BE(content.length, 0); + + chunks.push(pathLengthBuffer); + chunks.push(pathBuffer); + chunks.push(contentLengthBuffer); + chunks.push(content); + } + + const combined = Buffer.concat(chunks); + + // Compress using gzip + const compressed = zlib.gzipSync(combined, { level: 9 }); + fs.writeFileSync(outputPath, compressed); +} + +exports.handler = async function(event) { + const bucket = event.bucket.bucket; + const input_prefix = event.bucket.input; + const output_prefix = event.bucket.output; + const key = event.object.key; + + // Create unique download path + const download_path = path.join('/tmp', `${key}-${uuidv4()}`); + fs.mkdirSync(download_path, { recursive: true }); + + // Download directory from storage + const s3_download_begin = Date.now(); + await storage_handler.download_directory(bucket, path.join(input_prefix, key), download_path); + const 
s3_download_stop = Date.now(); + + // Calculate size of downloaded files + const size = parseDirectory(download_path); + + // Compress directory + const compress_begin = Date.now(); + const archive_name = `${key}.tar.gz`; + const archive_path = path.join(download_path, archive_name); + await createTarGzArchive(download_path, archive_path); + const compress_end = Date.now(); + + // Get archive size + const archive_size = fs.statSync(archive_path).size; + + // Upload compressed archive + const s3_upload_begin = Date.now(); + const [key_name, uploadPromise] = storage_handler.upload( + bucket, + path.join(output_prefix, archive_name), + archive_path + ); + await uploadPromise; + const s3_upload_stop = Date.now(); + + // Calculate times in microseconds + const download_time = (s3_download_stop - s3_download_begin) * 1000; + const upload_time = (s3_upload_stop - s3_upload_begin) * 1000; + const process_time = (compress_end - compress_begin) * 1000; + + return { + result: { + bucket: bucket, + key: key_name + }, + measurement: { + download_time: download_time, + download_size: size, + upload_time: upload_time, + upload_size: archive_size, + compute_time: process_time + } + }; +}; + diff --git a/benchmarks/300.utilities/311.compression/nodejs/package.json b/benchmarks/300.utilities/311.compression/nodejs/package.json new file mode 100644 index 000000000..56827265a --- /dev/null +++ b/benchmarks/300.utilities/311.compression/nodejs/package.json @@ -0,0 +1,9 @@ +{ + "name": "compression-benchmark", + "version": "1.0.0", + "description": "Compression benchmark for serverless platforms", + "main": "function.js", + "dependencies": { + "uuid": "^10.0.0" + } +} diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index c72b7d5ec..507d68153 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -96,12 +96,20 @@ export default { throw new Error('Failed to 
import benchmark function module: ' + e2.message); } - // If the function module exposes a storage initializer, call it + // Initialize storage - try function module first, then fall back to wrapper storage try { if (funcModule && funcModule.storage && typeof funcModule.storage.init_instance === 'function') { + funcModule.storage.init_instance({ env, request }); + } else { + // Function doesn't export storage, so initialize wrapper storage directly try { - funcModule.storage.init_instance({ env, request }); - } catch (ignore) {} + const storageModule = await import('./storage.js'); + if (storageModule && storageModule.storage && typeof storageModule.storage.init_instance === 'function') { + storageModule.storage.init_instance({ env, request }); + } + } catch (storageErr) { + // Ignore errors from storage initialization + } } } catch (e) { // don't fail the request if storage init isn't available diff --git a/benchmarks/wrappers/cloudflare/nodejs/storage.js b/benchmarks/wrappers/cloudflare/nodejs/storage.js index 72e71e288..a49cc3347 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/storage.js @@ -31,15 +31,32 @@ class storage { // so callers should use upload_stream or pass raw data. For Node.js we read // the file from disk and put it into R2 if available, otherwise throw. 
upload(__bucket, key, filepath) { + // Use singleton instance if available, otherwise use this instance + const instance = storage.instance || this; + // If file was previously written during this invocation, use /tmp absolute let realPath = filepath; - if (this.written_files.has(filepath)) { + if (instance.written_files.has(filepath)) { realPath = path.join('/tmp', path.resolve(filepath)); } - // In Workers environment with R2, check if file exists in R2 + const unique_key = storage.unique_name(key); + + // Try filesystem first (for Workers with nodejs_compat that have /tmp) + if (fs && fs.existsSync(realPath)) { + const data = fs.readFileSync(realPath); + + if (instance.handle) { + const uploadPromise = instance.handle.put(unique_key, data); + return [unique_key, uploadPromise]; + } else { + return [unique_key, Promise.resolve()]; + } + } + + // Fallback: In Workers environment with R2, check if file exists in R2 // (it may have been written by fs-polyfill's createWriteStream) - if (this.handle) { + if (instance.handle) { // Normalize the path to match what fs-polyfill would use let normalizedPath = realPath.replace(/^\.?\//, '').replace(/^tmp\//, ''); @@ -49,13 +66,11 @@ class storage { normalizedPath = globalThis.BENCHMARK_NAME + '/' + normalizedPath; } - const unique_key = storage.unique_name(key); - // Read from R2 and re-upload with unique key - const uploadPromise = this.handle.get(normalizedPath).then(async (obj) => { + const uploadPromise = instance.handle.get(normalizedPath).then(async (obj) => { if (obj) { const data = await obj.arrayBuffer(); - return this.handle.put(unique_key, data); + return instance.handle.put(unique_key, data); } else { throw new Error(`File not found in R2: ${normalizedPath} (original path: ${filepath})`); } @@ -64,23 +79,13 @@ class storage { return [unique_key, uploadPromise]; } - // Fallback: Read file content from local filesystem (Node.js environment) - if (fs && fs.existsSync(realPath)) { - const data = 
fs.readFileSync(realPath); - const unique_key = storage.unique_name(key); - - // Return [uniqueName, promise] to match Azure storage API - const uploadPromise = Promise.resolve(); - - return [unique_key, uploadPromise]; - } - // If running in Workers (no fs) and caller provided Buffer/Stream, they // should call upload_stream directly. Otherwise, throw. throw new Error('upload(): file not found on disk and no R2 handle provided'); } async download(__bucket, key, filepath) { + const instance = storage.instance || this; const data = await this.download_stream(__bucket, key); let real_fp = filepath; @@ -88,7 +93,7 @@ class storage { real_fp = path.join('/tmp', path.resolve(filepath)); } - this.written_files.add(filepath); + instance.written_files.add(filepath); // Write data to file if we have fs if (fs) { @@ -106,11 +111,13 @@ class storage { } async download_directory(__bucket, prefix, out_path) { - if (!this.handle) { + const instance = storage.instance || this; + + if (!instance.handle) { throw new Error('download_directory requires R2 binding (env.R2)'); } - const list_res = await this.handle.list({ prefix }); + const list_res = await instance.handle.list({ prefix }); const objects = list_res.objects || []; for (const obj of objects) { const file_name = obj.key; @@ -121,10 +128,11 @@ class storage { } async upload_stream(__bucket, key, data) { + const instance = storage.instance || this; const unique_key = storage.unique_name(key); - if (this.handle) { + if (instance.handle) { // R2 put accepts ArrayBuffer, ReadableStream, or string - await this.handle.put(unique_key, data); + await instance.handle.put(unique_key, data); return unique_key; } @@ -141,8 +149,10 @@ class storage { } async download_stream(__bucket, key) { - if (this.handle) { - const obj = await this.handle.get(key); + const instance = storage.instance || this; + + if (instance.handle) { + const obj = await instance.handle.get(key); if (!obj) return null; // R2 object provides arrayBuffer()/text() 
helpers in Workers if (typeof obj.arrayBuffer === 'function') { diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 946e5203c..0d24b122a 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -308,12 +308,12 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: toml_content += f"""# R2 bucket binding for benchmarking files # This bucket is used by fs and path polyfills to read benchmark data [[r2_buckets]] -binding = "{bucket_name}" +binding = "R2" bucket_name = "{bucket_name}" """ r2_bucket_configured = True - self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as '{bucket_name}'") + self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") except Exception as e: self.logging.warning( f"R2 bucket binding not configured: {e}. " From 416b67bb3d987bea4904df4c1532df432ec10777 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 7 Dec 2025 23:10:00 +0100 Subject: [PATCH 031/230] support for cloudflare containers (python and nodejs), container worker orchestrator, worker orchestrator proxy for r2 and durable objects to be used in container --- .../cloudflare/nodejs/container/handler.js | 195 ++++++ .../cloudflare/nodejs/container/nosql.js | 118 ++++ .../cloudflare/nodejs/container/package.json | 10 + .../cloudflare/nodejs/container/storage.js | 294 +++++++++ .../cloudflare/nodejs/container/worker.js | 362 ++++++++++++ .../cloudflare/python/container/handler.py | 196 ++++++ .../cloudflare/python/container/nosql.py | 117 ++++ .../cloudflare/python/container/storage.py | 203 +++++++ .../wrappers/cloudflare/python/handler.py | 2 +- .../wrappers/cloudflare/python/nosql.py | 9 +- .../wrappers/cloudflare/python/storage.py | 33 +- configs/cloudflare-test.json | 4 +- configs/systems.json | 36 +- dockerfiles/cloudflare/nodejs/Dockerfile | 21 + dockerfiles/cloudflare/python/Dockerfile | 31 + sebs/cli.py | 5 +- sebs/cloudflare/cloudflare.py | 557 
++++++++++++++++-- sebs/cloudflare/r2.py | 67 ++- sebs/experiments/config.py | 2 +- 19 files changed, 2186 insertions(+), 76 deletions(-) create mode 100644 benchmarks/wrappers/cloudflare/nodejs/container/handler.js create mode 100644 benchmarks/wrappers/cloudflare/nodejs/container/nosql.js create mode 100644 benchmarks/wrappers/cloudflare/nodejs/container/package.json create mode 100644 benchmarks/wrappers/cloudflare/nodejs/container/storage.js create mode 100644 benchmarks/wrappers/cloudflare/nodejs/container/worker.js create mode 100644 benchmarks/wrappers/cloudflare/python/container/handler.py create mode 100644 benchmarks/wrappers/cloudflare/python/container/nosql.py create mode 100644 benchmarks/wrappers/cloudflare/python/container/storage.py create mode 100644 dockerfiles/cloudflare/nodejs/Dockerfile create mode 100644 dockerfiles/cloudflare/python/Dockerfile diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js new file mode 100644 index 000000000..6f99c6728 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -0,0 +1,195 @@ +// Container handler for Cloudflare Workers - Node.js +// This handler is used when deploying as a container worker + +const http = require('http'); + +// Monkey-patch the 'request' library to always include a User-Agent header +// This is needed because Wikimedia (and other sites) require a User-Agent +try { + const Module = require('module'); + const originalRequire = Module.prototype.require; + + Module.prototype.require = function(id) { + const module = originalRequire.apply(this, arguments); + + if (id === 'request') { + // Wrap the request function to inject default headers + const originalRequest = module; + const wrappedRequest = function(options, callback) { + if (typeof options === 'string') { + options = { uri: options }; + } + if (!options.headers) { + options.headers = {}; + } + if (!options.headers['User-Agent'] && 
!options.headers['user-agent']) { + options.headers['User-Agent'] = 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2'; + } + return originalRequest(options, callback); + }; + // Copy all properties from original request + Object.keys(originalRequest).forEach(key => { + wrappedRequest[key] = originalRequest[key]; + }); + return wrappedRequest; + } + + return module; + }; +} catch (e) { + console.error('Failed to patch request module:', e); +} + +// Import the benchmark function +const { handler: benchmarkHandler } = require('./function'); + +// Import storage and nosql if they exist +let storage, nosql; +try { + storage = require('./storage'); +} catch (e) { + console.log('Storage module not available'); +} +try { + nosql = require('./nosql'); +} catch (e) { + console.log('NoSQL module not available'); +} + +const PORT = process.env.PORT || 8080; + +const server = http.createServer(async (req, res) => { + // Handle favicon requests + if (req.url.includes('favicon')) { + res.writeHead(200); + res.end('None'); + return; + } + + try { + // Extract Worker URL from header for R2 and NoSQL proxy + const workerUrl = req.headers['x-worker-url']; + if (workerUrl) { + if (storage && storage.storage && storage.storage.set_worker_url) { + storage.storage.set_worker_url(workerUrl); + } + if (nosql && nosql.nosql && nosql.nosql.set_worker_url) { + nosql.nosql.set_worker_url(workerUrl); + } + console.log(`Set worker URL for R2/NoSQL proxy: ${workerUrl}`); + } + + // Start timing measurements + const begin = Date.now() / 1000; + const start = performance.now(); + + // Read request body + let body = ''; + for await (const chunk of req) { + body += chunk; + } + + // Parse event from JSON body or URL params + let event = {}; + if (body && body.length > 0) { + try { + event = JSON.parse(body); + } catch (e) { + console.error('Failed to parse JSON body:', e); + } + } + + // Parse URL parameters + const url = new URL(req.url, 
`http://${req.headers.host}`); + for (const [key, value] of url.searchParams) { + if (!event[key]) { + const intValue = parseInt(value); + event[key] = isNaN(intValue) ? value : intValue; + } + } + + // Add request metadata + const reqId = 0; + const incomeTimestamp = Math.floor(Date.now() / 1000); + event['request-id'] = reqId; + event['income-timestamp'] = incomeTimestamp; + + console.error('!!! Event:', JSON.stringify(event)); + + // For debugging: check /tmp directory before and after benchmark + const fs = require('fs'); + console.error('!!! Files in /tmp before benchmark:', fs.readdirSync('/tmp')); + + // Call the benchmark function + console.error('!!! Calling benchmark handler...'); + const ret = await benchmarkHandler(event); + console.error('!!! Benchmark result:', JSON.stringify(ret)); + + // Check what was downloaded + console.error('!!! Files in /tmp after benchmark:', fs.readdirSync('/tmp')); + const tmpFiles = fs.readdirSync('/tmp'); + for (const file of tmpFiles) { + const filePath = `/tmp/${file}`; + const stats = fs.statSync(filePath); + console.error(`!!! ${file}: ${stats.size} bytes`); + if (stats.size < 500) { + const content = fs.readFileSync(filePath, 'utf8'); + console.error(`!!! First 300 chars: ${content.substring(0, 300)}`); + } + } + + // Calculate elapsed time + const end = Date.now() / 1000; + const elapsed = performance.now() - start; + const micro = elapsed * 1000; // Convert milliseconds to microseconds + + // Build log_data similar to native handler + const log_data = { output: ret && ret.result !== undefined ? 
ret.result : ret }; + if (ret && ret.measurement !== undefined) { + log_data.measurement = ret.measurement; + } + if (event.logs !== undefined) { + log_data.time = 0; + } + + console.log('Sending response with log_data:', log_data); + + // Send response matching Python handler format exactly + if (event.html) { + res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); + res.end(String(ret && ret.result !== undefined ? ret.result : ret)); + } else { + const responseBody = JSON.stringify({ + begin: "0", + end: "0", + results_time: "0", + result: log_data, + is_cold: false, + is_cold_worker: false, + container_id: '0', + environ_container_id: 'no_id', + request_id: '0', + }); + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(responseBody); + } + + } catch (error) { + console.error('Error processing request:', error); + console.error('Stack trace:', error.stack); + + const errorPayload = JSON.stringify({ + error: error.message, + stack: error.stack + }); + + res.writeHead(500, { 'Content-Type': 'application/json' }); + res.end(errorPayload); + } +}); + +// Ensure server is listening before handling requests +server.listen(PORT, '0.0.0.0', () => { + console.log(`Container server listening on 0.0.0.0:${PORT}`); + console.log('Server ready to accept connections'); +}); diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js new file mode 100644 index 000000000..3469bf6b9 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js @@ -0,0 +1,118 @@ +/** + * NoSQL module for Cloudflare Node.js Containers + * Uses HTTP proxy to access Durable Objects through the Worker's binding + */ + +class nosql { + constructor() { + // Container accesses Durable Objects through worker.js proxy + } + + static worker_url = null; // Set by handler from X-Worker-URL header + + static init_instance(entry) { + if (!nosql.instance) { + nosql.instance = new nosql(); + } + 
return nosql.instance; + } + + static set_worker_url(url) { + nosql.worker_url = url; + } + + async _make_request(operation, params) { + if (!nosql.worker_url) { + throw new Error('Worker URL not set - cannot access NoSQL'); + } + + const url = `${nosql.worker_url}/nosql/${operation}`; + const data = JSON.stringify(params); + + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: data, + }); + + if (!response.ok) { + let errorMsg; + try { + const errorData = await response.json(); + errorMsg = errorData.error || await response.text(); + } catch { + errorMsg = await response.text(); + } + throw new Error(`NoSQL operation failed: ${errorMsg}`); + } + + return await response.json(); + } catch (error) { + throw new Error(`NoSQL operation failed: ${error.message}`); + } + } + + async insert(tableName, primaryKey, secondaryKey, data) { + const params = { + table_name: tableName, + primary_key: primaryKey, + secondary_key: secondaryKey, + data: data, + }; + return this._make_request('insert', params); + } + + async get(tableName, primaryKey, secondaryKey) { + const params = { + table_name: tableName, + primary_key: primaryKey, + secondary_key: secondaryKey, + }; + const result = await this._make_request('get', params); + return result.data || null; + } + + async update(tableName, primaryKey, secondaryKey, updates) { + const params = { + table_name: tableName, + primary_key: primaryKey, + secondary_key: secondaryKey, + data: updates, + }; + return this._make_request('update', params); + } + + async query(tableName, primaryKey, secondaryKeyName) { + const params = { + table_name: tableName, + primary_key: primaryKey, + secondary_key_name: secondaryKeyName, + }; + const result = await this._make_request('query', params); + console.error(`[nosql.query] result:`, JSON.stringify(result)); + console.error(`[nosql.query] result.items:`, result.items); + console.error(`[nosql.query] Array.isArray(result.items):`, 
Array.isArray(result.items)); + const items = result.items || []; + console.error(`[nosql.query] returning items:`, items); + return items; + } + + async delete(tableName, primaryKey, secondaryKey) { + const params = { + table_name: tableName, + primary_key: primaryKey, + secondary_key: secondaryKey, + }; + return this._make_request('delete', params); + } + + static get_instance() { + if (!nosql.instance) { + nosql.instance = new nosql(); + } + return nosql.instance; + } +} + +module.exports.nosql = nosql; diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/package.json b/benchmarks/wrappers/cloudflare/nodejs/container/package.json new file mode 100644 index 000000000..729c56fdc --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/container/package.json @@ -0,0 +1,10 @@ +{ + "name": "cloudflare-container-worker", + "version": "1.0.0", + "description": "Cloudflare Container Worker wrapper", + "main": "worker.js", + "type": "module", + "dependencies": { + "@cloudflare/containers": "^1.0.0" + } +} diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js new file mode 100644 index 000000000..d893245ef --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js @@ -0,0 +1,294 @@ +const fs = require('fs'); +const path = require('path'); +const uuid = require('uuid'); + +/** + * Storage module for Cloudflare Node.js Containers + * Uses HTTP proxy to access R2 storage through the Worker's R2 binding + */ + +class storage { + constructor() { + this.r2_enabled = true; + } + + static worker_url = null; // Set by handler from X-Worker-URL header + + + static worker_url = null; // Set by handler from X-Worker-URL header + + static unique_name(name) { + const parsed = path.parse(name); + const uuid_name = uuid.v4().split('-')[0]; + return path.join(parsed.dir, `${parsed.name}.${uuid_name}${parsed.ext}`); + } + + static init_instance(entry) { + if (!storage.instance) { + 
storage.instance = new storage(); + } + return storage.instance; + } + + static set_worker_url(url) { + storage.worker_url = url; + } + + static get_instance() { + if (!storage.instance) { + storage.init_instance(); + } + return storage.instance; + } + + async upload_stream(bucket, key, data) { + if (!this.r2_enabled) { + console.log('Warning: R2 not configured, skipping upload'); + return key; + } + + if (!storage.worker_url) { + throw new Error('Worker URL not set - cannot access R2'); + } + + const unique_key = storage.unique_name(key); + + // Convert data to Buffer if needed + let buffer; + if (Buffer.isBuffer(data)) { + buffer = data; + } else if (typeof data === 'string') { + buffer = Buffer.from(data, 'utf-8'); + } else if (data instanceof ArrayBuffer) { + buffer = Buffer.from(data); + } else { + buffer = Buffer.from(String(data), 'utf-8'); + } + + // Upload via worker proxy + const params = new URLSearchParams({ bucket, key: unique_key }); + const url = `${storage.worker_url}/r2/upload?${params}`; + + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/octet-stream' }, + body: buffer, + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${await response.text()}`); + } + + const result = await response.json(); + return result.key; + } catch (error) { + console.error('R2 upload error:', error); + throw new Error(`Failed to upload to R2: ${error.message}`); + } + } + + async download_stream(bucket, key) { + if (!this.r2_enabled) { + throw new Error('R2 not configured'); + } + + if (!storage.worker_url) { + throw new Error('Worker URL not set - cannot access R2'); + } + + // Download via worker proxy + const params = new URLSearchParams({ bucket, key }); + const url = `${storage.worker_url}/r2/download?${params}`; + + try { + const response = await fetch(url); + + if (response.status === 404) { + throw new Error(`Object not found: ${key}`); + } + + if (!response.ok) { + throw new 
Error(`HTTP ${response.status}: ${await response.text()}`); + } + + const arrayBuffer = await response.arrayBuffer(); + return Buffer.from(arrayBuffer); + } catch (error) { + console.error('R2 download error:', error); + throw new Error(`Failed to download from R2: ${error.message}`); + } + } + + upload(bucket, key, filepath) { + // Generate unique key synchronously so it can be returned immediately + const unique_key = storage.unique_name(key); + console.error(`!!! [storage.upload] bucket=${bucket}, key=${key}, unique_key=${unique_key}, filepath=${filepath}`); + + // Read file from disk and upload + if (fs.existsSync(filepath)) { + const stats = fs.statSync(filepath); + console.error(`!!! [storage.upload] File exists, size on disk: ${stats.size} bytes`); + const data = fs.readFileSync(filepath); + console.error(`!!! [storage.upload] Read ${data.length} bytes from ${filepath}`); + console.error(`!!! [storage.upload] Data type: ${typeof data}, isBuffer: ${Buffer.isBuffer(data)}, isString: ${typeof data === 'string'}`); + console.error(`!!! [storage.upload] First 200 chars of data: ${data.toString().substring(0, 200)}`); + // Call internal version that doesn't generate another unique key + const uploadPromise = this._upload_stream_with_key(bucket, unique_key, data); + console.error(`!!! [storage.upload] Returning unique_key=${unique_key} and upload promise`); + return [unique_key, uploadPromise]; + } + + console.error(`!!! 
[storage.upload] File not found: ${filepath}`); + throw new Error(`upload(): file not found: ${filepath}`); + } + + async _upload_stream_with_key(bucket, key, data) { + // Internal method that uploads with exact key (no unique naming) + console.log(`[storage._upload_stream_with_key] Starting upload: bucket=${bucket}, key=${key}, data_size=${data.length}`); + + if (!this.r2_enabled) { + console.log('Warning: R2 not configured, skipping upload'); + return key; + } + + if (!storage.worker_url) { + console.error('[storage._upload_stream_with_key] Worker URL not set!'); + throw new Error('Worker URL not set - cannot access R2'); + } + + console.log(`[storage._upload_stream_with_key] Worker URL: ${storage.worker_url}`); + + // Convert data to Buffer if needed + let buffer; + if (Buffer.isBuffer(data)) { + buffer = data; + } else if (typeof data === 'string') { + buffer = Buffer.from(data, 'utf-8'); + } else if (data instanceof ArrayBuffer) { + buffer = Buffer.from(data); + } else { + buffer = Buffer.from(String(data), 'utf-8'); + } + + // Upload via worker proxy + const params = new URLSearchParams({ bucket, key }); + const url = `${storage.worker_url}/r2/upload?${params}`; + console.log(`[storage._upload_stream_with_key] Uploading to URL: ${url}, buffer size: ${buffer.length}`); + + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/octet-stream' }, + body: buffer, + }); + + console.log(`[storage._upload_stream_with_key] Response status: ${response.status}`); + + if (!response.ok) { + const errorText = await response.text(); + console.error(`[storage._upload_stream_with_key] Upload failed: ${response.status} - ${errorText}`); + throw new Error(`HTTP ${response.status}: ${errorText}`); + } + + const result = await response.json(); + console.log(`[storage._upload_stream_with_key] Upload successful, returned key: ${result.key}`); + return result.key; + } catch (error) { + console.error('R2 upload error:', error); + throw 
new Error(`Failed to upload to R2: ${error.message}`); + } + } + + async download(bucket, key, filepath) { + const data = await this.download_stream(bucket, key); + + let real_fp = filepath; + if (!filepath.startsWith('/tmp')) { + real_fp = path.join('/tmp', path.resolve(filepath)); + } + + // Write data to file + fs.mkdirSync(path.dirname(real_fp), { recursive: true }); + fs.writeFileSync(real_fp, data); + } + + async download_directory(bucket, prefix, out_path) { + // List all objects with the prefix and download each one + if (!this.r2_enabled) { + console.log('Warning: R2 not configured, skipping download_directory'); + return; + } + + if (!storage.worker_url) { + throw new Error('Worker URL not set - cannot access R2'); + } + + // List objects via worker proxy + const listParams = new URLSearchParams({ bucket, prefix }); + const listUrl = `${storage.worker_url}/r2/list?${listParams}`; + + try { + const response = await fetch(listUrl, { + method: 'GET', + headers: { 'Content-Type': 'application/json' }, + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`HTTP ${response.status}: ${errorText}`); + } + + const result = await response.json(); + const objects = result.objects || []; + + for (const obj of objects) { + const file_name = obj.key; + const path_to_file = path.dirname(file_name); + fs.mkdirSync(path.join(out_path, path_to_file), { recursive: true }); + await this.download(bucket, file_name, path.join(out_path, file_name)); + } + } catch (error) { + console.error('R2 download_directory error:', error); + throw new Error(`Failed to download directory from R2: ${error.message}`); + } + } + + uploadStream(bucket, key) { + // Return [stream, promise, unique_key] to match native wrapper API + const unique_key = storage.unique_name(key); + + const stream = require('stream'); + const passThrough = new stream.PassThrough(); + const chunks = []; + + passThrough.on('data', (chunk) => chunks.push(chunk)); + + const upload = 
new Promise((resolve, reject) => { + passThrough.on('end', async () => { + try { + const buffer = Buffer.concat(chunks); + await this._upload_stream_with_key(bucket, unique_key, buffer); + resolve(); + } catch (err) { + reject(err); + } + }); + passThrough.on('error', reject); + }); + + return [passThrough, upload, unique_key]; + } + + async downloadStream(bucket, key) { + // Return a Promise that resolves to a readable stream + const data = await this.download_stream(bucket, key); + const stream = require('stream'); + const readable = new stream.Readable(); + readable.push(data); + readable.push(null); // Signal end of stream + return readable; + } +} + +module.exports.storage = storage; diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js new file mode 100644 index 000000000..78140794f --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -0,0 +1,362 @@ +import { Container, getContainer } from "@cloudflare/containers"; +import { DurableObject } from "cloudflare:workers"; + +// Container wrapper class +export class ContainerWorker extends Container { + defaultPort = 8080; + sleepAfter = "10m"; +} + +// Durable Object for NoSQL storage (simple proxy to ctx.storage) +export class KVApiObject extends DurableObject { + constructor(ctx, env) { + super(ctx, env); + } + + async insert(key, value) { + await this.ctx.storage.put(key.join(':'), value); + return { success: true }; + } + + async update(key, value) { + await this.ctx.storage.put(key.join(':'), value); + return { success: true }; + } + + async get(key) { + const value = await this.ctx.storage.get(key.join(':')); + return { data: value || null }; + } + + async query(keyPrefix) { + const list = await this.ctx.storage.list(); + const items = []; + for (const [k, v] of list) { + items.push(v); + } + return { items }; + } + + async delete(key) { + await this.ctx.storage.delete(key.join(':')); + return { success: 
true }; + } +} + +export default { + async fetch(request, env) { + const url = new URL(request.url); + + // Health check endpoint + if (url.pathname === '/health' || url.pathname === '/_health') { + try { + const containerId = 'default'; + const id = env.CONTAINER_WORKER.idFromName(containerId); + const stub = env.CONTAINER_WORKER.get(id); + + // Make a simple GET request to the root path to verify container is responsive + const healthRequest = new Request('http://localhost/', { + method: 'GET', + headers: { + 'X-Health-Check': 'true' + } + }); + + const response = await stub.fetch(healthRequest); + + // Container is ready if it responds (even with an error from the benchmark handler) + // A 500 from the handler means the container is running, just not a valid benchmark request + if (response.status >= 200 && response.status < 600) { + return new Response('OK', { status: 200 }); + } else { + return new Response(JSON.stringify({ + error: 'Container not responding', + status: response.status + }), { + status: 503, + headers: { 'Content-Type': 'application/json' } + }); + } + + } catch (error) { + return new Response(JSON.stringify({ + error: 'Container failed to start', + details: error.message, + stack: error.stack + }), { + status: 503, + headers: { 'Content-Type': 'application/json' } + }); + } + } + + try { + // Handle NoSQL proxy requests - intercept BEFORE forwarding to container + if (url.pathname.startsWith('/nosql/')) { + return await handleNoSQLRequest(request, env); + } + + // Handle R2 proxy requests - intercept BEFORE forwarding to container + if (url.pathname.startsWith('/r2/')) { + return await handleR2Request(request, env); + } + + // Get or create container instance + const containerId = request.headers.get('x-container-id') || 'default'; + const id = env.CONTAINER_WORKER.idFromName(containerId); + const stub = env.CONTAINER_WORKER.get(id); + + // Clone request and add Worker URL as header so container knows where to proxy R2 requests + const 
modifiedRequest = new Request(request); + modifiedRequest.headers.set('X-Worker-URL', url.origin); + + // Forward the request to the container + return await stub.fetch(modifiedRequest); + + } catch (error) { + console.error('Worker error:', error); + + const errorMessage = error.message || String(error); + + // Handle container not ready errors with 503 + if (errorMessage.includes('Container failed to start') || + errorMessage.includes('no container instance') || + errorMessage.includes('Durable Object') || + errorMessage.includes('provisioning')) { + + return new Response(JSON.stringify({ + error: 'Container failed to start', + details: 'there is no container instance that can be provided to this durable object', + message: errorMessage + }), { + status: 503, + headers: { 'Content-Type': 'application/json' } + }); + } + + // Other errors get 500 + return new Response(JSON.stringify({ + error: 'Internal server error', + details: errorMessage, + stack: error.stack + }), { + status: 500, + headers: { 'Content-Type': 'application/json' } + }); + } + } +}; + +/** + * Handle NoSQL (Durable Object) requests proxied from the container + * Routes: + * - POST /nosql/insert - insert item + * - POST /nosql/update - update item + * - POST /nosql/get - get item + * - POST /nosql/query - query items + * - POST /nosql/delete - delete item + */ +async function handleNoSQLRequest(request, env) { + try { + const url = new URL(request.url); + const operation = url.pathname.split('/').pop(); + + // Parse request body + const params = await request.json(); + const { table_name, primary_key, secondary_key, secondary_key_name, data } = params; + + // Get Durable Object stub - table_name should match the DO class name + if (!env[table_name]) { + return new Response(JSON.stringify({ + error: `Durable Object binding '${table_name}' not found` + }), { + status: 500, + headers: { 'Content-Type': 'application/json' } + }); + } + + // Create DO ID from primary key + const doId = 
env[table_name].idFromName(primary_key.join(':')); + const doStub = env[table_name].get(doId); + + // Forward operation to Durable Object + let result; + switch (operation) { + case 'insert': + result = await doStub.insert(secondary_key, data); + break; + case 'update': + result = await doStub.update(secondary_key, data); + break; + case 'get': + result = await doStub.get(secondary_key); + break; + case 'query': + result = await doStub.query(secondary_key_name); + break; + case 'delete': + result = await doStub.delete(secondary_key); + break; + default: + return new Response(JSON.stringify({ + error: 'Unknown NoSQL operation' + }), { + status: 404, + headers: { 'Content-Type': 'application/json' } + }); + } + + return new Response(JSON.stringify(result || {}), { + headers: { 'Content-Type': 'application/json' } + }); + + } catch (error) { + console.error('NoSQL proxy error:', error); + return new Response(JSON.stringify({ + error: error.message, + stack: error.stack + }), { + status: 500, + headers: { 'Content-Type': 'application/json' } + }); + } +} + +/** + * Handle R2 storage requests proxied from the container + * Routes: + * - GET /r2/download?bucket=X&key=Y - download object + * - POST /r2/upload?bucket=X&key=Y - upload object (body contains data) + */ +async function handleR2Request(request, env) { + try { + const url = new URL(request.url); + const bucket = url.searchParams.get('bucket'); + const key = url.searchParams.get('key'); + + // Check if R2 binding exists + if (!env.R2) { + return new Response(JSON.stringify({ + error: 'R2 binding not configured' + }), { + status: 500, + headers: { 'Content-Type': 'application/json' } + }); + } + + if (url.pathname === '/r2/list') { + // List objects in R2 with a prefix (only needs bucket) + if (!bucket) { + return new Response(JSON.stringify({ + error: 'Missing bucket parameter' + }), { + status: 400, + headers: { 'Content-Type': 'application/json' } + }); + } + + try { + const prefix = 
url.searchParams.get('prefix') || ''; + const list_res = await env.R2.list({ prefix }); + + return new Response(JSON.stringify({ + objects: list_res.objects || [] + }), { + headers: { 'Content-Type': 'application/json' } + }); + } catch (error) { + console.error('[worker.js /r2/list] Error:', error); + return new Response(JSON.stringify({ + error: error.message + }), { + status: 500, + headers: { 'Content-Type': 'application/json' } + }); + } + } + + // All other R2 operations require both bucket and key + if (!bucket || !key) { + return new Response(JSON.stringify({ + error: 'Missing bucket or key parameter' + }), { + status: 400, + headers: { 'Content-Type': 'application/json' } + }); + } + + if (url.pathname === '/r2/download') { + // Download from R2 + const object = await env.R2.get(key); + + if (!object) { + return new Response(JSON.stringify({ + error: 'Object not found' + }), { + status: 404, + headers: { 'Content-Type': 'application/json' } + }); + } + + // Return the object data + return new Response(object.body, { + headers: { + 'Content-Type': object.httpMetadata?.contentType || 'application/octet-stream', + 'Content-Length': object.size.toString() + } + }); + + } else if (url.pathname === '/r2/upload') { + // Upload to R2 + console.log(`[worker.js /r2/upload] bucket=${bucket}, key=${key}`); + console.log(`[worker.js /r2/upload] env.R2 exists:`, !!env.R2); + const data = await request.arrayBuffer(); + console.log(`[worker.js /r2/upload] Received ${data.byteLength} bytes`); + + // Use the key as-is (container already generates unique keys if needed) + try { + const putResult = await env.R2.put(key, data); + console.log(`[worker.js /r2/upload] R2.put() returned:`, putResult); + console.log(`[worker.js /r2/upload] Successfully uploaded to R2 with key=${key}`); + } catch (error) { + console.error(`[worker.js /r2/upload] R2.put() error:`, error); + throw error; + } + + return new Response(JSON.stringify({ + key: key + }), { + headers: { 'Content-Type': 
'application/json' } + }); + + } else { + return new Response(JSON.stringify({ + error: 'Unknown R2 operation' + }), { + status: 404, + headers: { 'Content-Type': 'application/json' } + }); + } + + } catch (error) { + console.error('R2 proxy error:', error); + return new Response(JSON.stringify({ + error: error.message, + stack: error.stack + }), { + status: 500, + headers: { 'Content-Type': 'application/json' } + }); + } +} + +/** + * Generate unique key for uploaded files + */ +function generateUniqueKey(key) { + const parts = key.split('.'); + const ext = parts.length > 1 ? '.' + parts.pop() : ''; + const name = parts.join('.'); + const uuid = crypto.randomUUID().split('-')[0]; + return `${name}.${uuid}${ext}`; +} diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py new file mode 100644 index 000000000..4eb21bd8c --- /dev/null +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +""" +Container handler for Cloudflare Workers - Python +This handler is used when deploying as a container worker +""" + +import json +import sys +import os +import traceback +from http.server import HTTPServer, BaseHTTPRequestHandler +from urllib.parse import urlparse, parse_qs +import datetime + +# Monkey-patch requests library to add User-Agent header +# This is needed because many HTTP servers (like Wikimedia) reject requests without User-Agent +try: + import requests + original_request = requests.request + + def patched_request(method, url, **kwargs): + if 'headers' not in kwargs: + kwargs['headers'] = {} + if 'User-Agent' not in kwargs['headers']: + kwargs['headers']['User-Agent'] = 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2' + return original_request(method, url, **kwargs) + + requests.request = patched_request + print("Monkey-patched requests library to add User-Agent header") +except ImportError: + 
print("requests library not available, skipping User-Agent monkey-patch") + +# Also patch urllib for libraries that use it directly +import urllib.request +original_urlopen = urllib.request.urlopen + +def patched_urlopen(url, data=None, timeout=None, **kwargs): + if isinstance(url, str): + req = urllib.request.Request(url, data=data) + req.add_header('User-Agent', 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2') + return original_urlopen(req, timeout=timeout, **kwargs) + elif isinstance(url, urllib.request.Request): + if not url.has_header('User-Agent'): + url.add_header('User-Agent', 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2') + return original_urlopen(url, data=data, timeout=timeout, **kwargs) + else: + return original_urlopen(url, data=data, timeout=timeout, **kwargs) + +urllib.request.urlopen = patched_urlopen +print("Monkey-patched urllib.request.urlopen to add User-Agent header") + +# Import the benchmark handler function +from function import handler as benchmark_handler + +# Import storage and nosql if available +try: + import storage +except ImportError: + storage = None + print("Storage module not available") + +try: + import nosql +except ImportError: + nosql = None + print("NoSQL module not available") + +PORT = int(os.environ.get('PORT', 8080)) + + +class ContainerHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.handle_request() + + def do_POST(self): + self.handle_request() + + def handle_request(self): + # Handle favicon requests + if 'favicon' in self.path: + self.send_response(200) + self.end_headers() + self.wfile.write(b'None') + return + + try: + # Extract Worker URL from header for R2 and NoSQL proxy + worker_url = self.headers.get('X-Worker-URL') + if worker_url: + if storage: + storage.storage.set_worker_url(worker_url) + if nosql: + nosql.nosql.set_worker_url(worker_url) + print(f"Set worker URL for R2/NoSQL proxy: {worker_url}") + + # Read request body + 
content_length = int(self.headers.get('Content-Length', 0)) + body = self.rfile.read(content_length).decode('utf-8') if content_length > 0 else '' + + # Parse event from JSON body or URL params + event = {} + if body: + try: + event = json.loads(body) + except json.JSONDecodeError as e: + print(f'Failed to parse JSON body: {e}') + + # Parse URL parameters + parsed_url = urlparse(self.path) + params = parse_qs(parsed_url.query) + for key, values in params.items(): + if key not in event and values: + value = values[0] + try: + event[key] = int(value) + except ValueError: + event[key] = value + + # Add request metadata + import random + req_id = random.randint(0, 1000000) + income_timestamp = datetime.datetime.now().timestamp() + event['request-id'] = req_id + event['income-timestamp'] = income_timestamp + + print(f"!!! Event received: {json.dumps(event, default=str)}") + print(f"!!! Event keys: {list(event.keys())}") + print(f"!!! Event has 'bucket' key: {'bucket' in event}") + if 'bucket' in event: + print(f"!!! 
bucket value: {event['bucket']}") + + # Measure execution time + begin = datetime.datetime.now().timestamp() + + # Call the benchmark function + result = benchmark_handler(event) + + # Calculate timing + end = datetime.datetime.now().timestamp() + compute_time = end - begin + + # Prepare response matching native handler format exactly + log_data = { + 'output': result['result'] + } + if 'measurement' in result: + log_data['measurement'] = result['measurement'] + + response_data = { + 'begin': "0", + 'end': "0", + 'results_time': "0", + 'result': log_data, + 'is_cold': False, + 'is_cold_worker': False, + 'container_id': "0", + 'environ_container_id': "no_id", + 'request_id': "0" + } + + # Send response + if event.get('html'): + # For HTML requests, return just the result + self.send_response(200) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.end_headers() + html_result = result.get('result', result) + self.wfile.write(str(html_result).encode('utf-8')) + else: + # For API requests, return structured response + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(response_data).encode('utf-8')) + + except Exception as error: + print(f'Error processing request: {error}') + traceback.print_exc() + self.send_response(500) + self.send_header('Content-Type', 'application/json') + self.end_headers() + error_response = { + 'error': str(error), + 'traceback': traceback.format_exc() + } + self.wfile.write(json.dumps(error_response).encode('utf-8')) + + def log_message(self, format, *args): + # Override to use print instead of stderr + print(f"{self.address_string()} - {format % args}") + + +if __name__ == '__main__': + server = HTTPServer(('0.0.0.0', PORT), ContainerHandler) + print(f'Container server listening on port {PORT}') + server.serve_forever() diff --git a/benchmarks/wrappers/cloudflare/python/container/nosql.py 
b/benchmarks/wrappers/cloudflare/python/container/nosql.py new file mode 100644 index 000000000..936a49901 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/python/container/nosql.py @@ -0,0 +1,117 @@ +""" +NoSQL module for Cloudflare Python Containers +Uses HTTP proxy to access Durable Objects through the Worker's binding +""" +import json +import urllib.request +import urllib.parse +from typing import List, Optional, Tuple + + +class nosql: + """NoSQL client for containers using HTTP proxy to Worker's Durable Object""" + + instance: Optional["nosql"] = None + worker_url = None # Set by handler from X-Worker-URL header + + @staticmethod + def init_instance(*args, **kwargs): + """Initialize singleton instance""" + if nosql.instance is None: + nosql.instance = nosql() + return nosql.instance + + @staticmethod + def set_worker_url(url): + """Set worker URL for NoSQL proxy (called by handler)""" + nosql.worker_url = url + + def _make_request(self, operation: str, params: dict) -> dict: + """Make HTTP request to worker nosql proxy""" + if not nosql.worker_url: + raise RuntimeError("Worker URL not set - cannot access NoSQL") + + url = f"{nosql.worker_url}/nosql/{operation}" + data = json.dumps(params).encode('utf-8') + + req = urllib.request.Request(url, data=data, method='POST') + req.add_header('Content-Type', 'application/json') + + try: + with urllib.request.urlopen(req) as response: + return json.loads(response.read().decode('utf-8')) + except urllib.error.HTTPError as e: + error_body = e.read().decode('utf-8') + try: + error_data = json.loads(error_body) + raise RuntimeError(f"NoSQL operation failed: {error_data.get('error', error_body)}") + except json.JSONDecodeError: + raise RuntimeError(f"NoSQL operation failed: {error_body}") + except Exception as e: + raise RuntimeError(f"NoSQL operation failed: {e}") + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + params = { + 'table_name': 
table_name, + 'primary_key': list(primary_key), + 'secondary_key': list(secondary_key), + 'data': data + } + return self._make_request('insert', params) + + def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + params = { + 'table_name': table_name, + 'primary_key': list(primary_key), + 'secondary_key': list(secondary_key), + 'data': data + } + return self._make_request('update', params) + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> Optional[dict]: + params = { + 'table_name': table_name, + 'primary_key': list(primary_key), + 'secondary_key': list(secondary_key) + } + result = self._make_request('get', params) + return result.get('data') + + def query( + self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str + ) -> List[dict]: + params = { + 'table_name': table_name, + 'primary_key': list(primary_key), + 'secondary_key_name': secondary_key_name + } + result = self._make_request('query', params) + return result.get('items', []) + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + params = { + 'table_name': table_name, + 'primary_key': list(primary_key), + 'secondary_key': list(secondary_key) + } + return self._make_request('delete', params) + + @staticmethod + def get_instance(): + if nosql.instance is None: + nosql.instance = nosql() + return nosql.instance diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py new file mode 100644 index 000000000..3182a66c3 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -0,0 +1,203 @@ +""" +Storage module for Cloudflare Python Containers +Uses HTTP proxy to access R2 storage through the Worker's R2 binding +""" +import io +import os +import json +import urllib.request +import urllib.parse + +class storage: + """R2 
storage client for containers using HTTP proxy to Worker""" + instance = None + worker_url = None # Set by handler from X-Worker-URL header + + def __init__(self): + # Container accesses R2 through worker.js proxy + # Worker URL is injected via X-Worker-URL header in each request + self.r2_enabled = True + + @staticmethod + def init_instance(entry=None): + """Initialize singleton instance""" + if storage.instance is None: + storage.instance = storage() + return storage.instance + + @staticmethod + def get_instance(): + """Get singleton instance""" + if storage.instance is None: + storage.init_instance() + return storage.instance + + @staticmethod + def set_worker_url(url): + """Set worker URL for R2 proxy (called by handler)""" + storage.worker_url = url + + @staticmethod + def unique_name(name): + """Generate unique name for file""" + import uuid + name_part, extension = os.path.splitext(name) + return f'{name_part}.{str(uuid.uuid4()).split("-")[0]}{extension}' + + def upload_stream(self, bucket: str, key: str, data): + """Upload data to R2 via worker proxy""" + if not self.r2_enabled: + print("Warning: R2 not configured, skipping upload") + return key + + if not storage.worker_url: + raise RuntimeError("Worker URL not set - cannot access R2") + + # Handle BytesIO objects + if isinstance(data, io.BytesIO): + data = data.getvalue() + + # Convert to bytes if needed + if isinstance(data, str): + data = data.encode('utf-8') + + # Upload via worker proxy + params = urllib.parse.urlencode({'bucket': bucket, 'key': key}) + url = f"{storage.worker_url}/r2/upload?{params}" + + req = urllib.request.Request(url, data=data, method='POST') + req.add_header('Content-Type', 'application/octet-stream') + + try: + with urllib.request.urlopen(req) as response: + result = json.loads(response.read().decode('utf-8')) + return result['key'] + except Exception as e: + print(f"R2 upload error: {e}") + raise RuntimeError(f"Failed to upload to R2: {e}") + + def download_stream(self, 
bucket: str, key: str) -> bytes: + """Download data from R2 via worker proxy""" + if not self.r2_enabled: + raise RuntimeError("R2 not configured") + + if not storage.worker_url: + raise RuntimeError("Worker URL not set - cannot access R2") + + # Download via worker proxy + params = urllib.parse.urlencode({'bucket': bucket, 'key': key}) + url = f"{storage.worker_url}/r2/download?{params}" + + try: + with urllib.request.urlopen(url) as response: + return response.read() + except urllib.error.HTTPError as e: + if e.code == 404: + raise RuntimeError(f"Object not found: {key}") + else: + raise RuntimeError(f"Failed to download from R2: {e}") + except Exception as e: + print(f"R2 download error: {e}") + raise RuntimeError(f"Failed to download from R2: {e}") + + def upload(self, bucket, key, filepath): + """Upload file from disk with unique key generation""" + # Generate unique key to avoid conflicts + unique_key = self.unique_name(key) + print(f"!!! [storage.upload] bucket={bucket}, key={key}, unique_key={unique_key}, filepath={filepath}") + + with open(filepath, 'rb') as f: + data = f.read() + print(f"!!! 
[storage.upload] Read {len(data)} bytes from {filepath}") + # Upload with the unique key + self._upload_with_key(bucket, unique_key, data) + return unique_key + + def _upload_with_key(self, bucket: str, key: str, data): + """Upload data to R2 via worker proxy with exact key (internal method)""" + if not self.r2_enabled: + print("Warning: R2 not configured, skipping upload") + return + + if not storage.worker_url: + raise RuntimeError("Worker URL not set - cannot access R2") + + # Handle BytesIO objects + if isinstance(data, io.BytesIO): + data = data.getvalue() + + # Convert to bytes if needed + if isinstance(data, str): + data = data.encode('utf-8') + + # Upload via worker proxy with exact key + params = urllib.parse.urlencode({'bucket': bucket, 'key': key}) + url = f"{storage.worker_url}/r2/upload?{params}" + + req = urllib.request.Request(url, data=data, method='POST') + req.add_header('Content-Type', 'application/octet-stream') + + try: + with urllib.request.urlopen(req) as response: + result = json.loads(response.read().decode('utf-8')) + print(f"!!! [storage._upload_with_key] Upload successful, key={result['key']}") + except Exception as e: + print(f"R2 upload error: {e}") + raise RuntimeError(f"Failed to upload to R2: {e}") + + def download(self, bucket, key, filepath): + """Download file to disk""" + data = self.download_stream(bucket, key) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + with open(filepath, 'wb') as f: + f.write(data) + + def download_directory(self, bucket, prefix, local_path): + """ + Download all files with a given prefix to a local directory. + Lists objects via /r2/list endpoint and downloads each one. 
+ """ + if not storage.worker_url: + raise RuntimeError("Worker URL not set - cannot access R2") + + # Create local directory + os.makedirs(local_path, exist_ok=True) + + # List objects with prefix via worker proxy + params = urllib.parse.urlencode({'bucket': bucket, 'prefix': prefix}) + list_url = f"{storage.worker_url}/r2/list?{params}" + + try: + req = urllib.request.Request(list_url) + req.add_header('User-Agent', 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2') + + with urllib.request.urlopen(req) as response: + result = json.loads(response.read().decode('utf-8')) + objects = result.get('objects', []) + + print(f"Found {len(objects)} objects with prefix '{prefix}'") + + # Download each object + for obj in objects: + obj_key = obj['key'] + # Create local file path by removing the prefix + relative_path = obj_key + if prefix and obj_key.startswith(prefix): + relative_path = obj_key[len(prefix):].lstrip('/') + + local_file_path = os.path.join(local_path, relative_path) + + # Create directory structure if needed + local_dir = os.path.dirname(local_file_path) + if local_dir: + os.makedirs(local_dir, exist_ok=True) + + # Download the file + print(f"Downloading {obj_key} to {local_file_path}") + self.download(bucket, obj_key, local_file_path) + + return local_path + + except Exception as e: + print(f"Error listing/downloading directory: {e}") + raise RuntimeError(f"Failed to download directory: {e}") diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 3f8ba6ca8..a0b63dcf9 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -70,7 +70,7 @@ async def fetch2(self, request, env): storage.storage.init_instance(self) - if self.env.NOSQL_STORAGE_DATABASE: + if hasattr(self.env, 'NOSQL_STORAGE_DATABASE'): from function import nosql nosql.nosql.init_instance(self) diff --git 
a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index 9de22fe69..27fc94ce0 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -30,7 +30,14 @@ def data_pre(self, data): return pickle.dumps(data, 0).decode("ascii") def data_post(self, data): - return pickle.loads(bytes(data, "ascii")) + # Handle None (key not found in storage) + if data is None: + return None + # Handle both string and bytes data from Durable Object storage + if isinstance(data, str): + return pickle.loads(bytes(data, "ascii")) + else: + return pickle.loads(data) def insert( self, diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index 3f6ebc31d..2ac2e6187 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -2,8 +2,10 @@ import os import uuid import asyncio -from pyodide.ffi import to_js, jsnull, run_sync +import base64 +from pyodide.ffi import to_js, jsnull, run_sync, JsProxy from pyodide.webloop import WebLoop +import js from workers import WorkerEntrypoint @@ -31,7 +33,9 @@ def unique_name(name): random=str(uuid.uuid4()).split('-')[0] ) def get_bucket(self, bucket): - return getattr(self.entry_env, bucket) + # R2 buckets are always bound as 'R2' in wrangler.toml + # The bucket parameter is the actual bucket name but we access via the binding + return self.entry_env.R2 @staticmethod def init_instance(entry: WorkerEntrypoint): @@ -60,11 +64,11 @@ def download(self, bucket, key, filepath): def download_directory(self, bucket, prefix, out_path): bobj = self.get_bucket(bucket) - list_res = run_sync(bobj.list(prefix = prefix)) ## gives only first 1000? 
+ list_res = run_sync(bobj.list(to_js({"prefix": prefix}))) for obj in list_res.objects: - file_nameß = obj.key + file_name = obj.key path_to_file = os.path.dirname(file_name) - os.makedirs(os.path.join(path, path_to_file), exist_ok=True) + os.makedirs(os.path.join(out_path, path_to_file), exist_ok=True) self.download(bucket, file_name, os.path.join(out_path, file_name)) return @@ -73,10 +77,22 @@ def upload_stream(self, bucket, key, data): async def aupload_stream(self, bucket, key, data): unique_key = storage.unique_name(key) - data_js = to_js(data) + # Handle BytesIO objects - extract bytes + if hasattr(data, 'getvalue'): + data = data.getvalue() + # Convert bytes to Blob using base64 encoding as intermediate step + if isinstance(data, bytes): + # Encode as base64 + b64_str = base64.b64encode(data).decode('ascii') + # Create a Response from base64, then get the blob + # This creates a proper JavaScript Blob that R2 will accept + response = await js.fetch(f"data:application/octet-stream;base64,{b64_str}") + blob = await response.blob() + data_js = blob + else: + data_js = str(data) bobj = self.get_bucket(bucket) put_res = await bobj.put(unique_key, data_js) - ##print(put_res) return unique_key def download_stream(self, bucket, key): @@ -88,8 +104,9 @@ async def adownload_stream(self, bucket, key): if get_res == jsnull: print("key not stored in bucket") return b'' + # Always read as raw binary data (Blob/ArrayBuffer) data = await get_res.bytes() - return data + return bytes(data) def get_instance(): if storage.instance is None: diff --git a/configs/cloudflare-test.json b/configs/cloudflare-test.json index af98daff4..2b3b85827 100644 --- a/configs/cloudflare-test.json +++ b/configs/cloudflare-test.json @@ -13,7 +13,7 @@ }, "deployment": { "name": "cloudflare", - "cloudflare": { - } + "cloudflare": {}, + "container": false } } diff --git a/configs/systems.json b/configs/systems.json index 4ac3131f9..f65174f0c 100644 --- a/configs/systems.json +++ 
b/configs/systems.json @@ -438,6 +438,15 @@ "3.12": "ubuntu:22.04" } }, + "container_images": { + "x64": { + "3.8": "python:3.8-slim", + "3.9": "python:3.9-slim", + "3.10": "python:3.10-slim", + "3.11": "python:3.11-slim", + "3.12": "python:3.12-slim" + } + }, "images": [], "deployment": { "files": [ @@ -447,6 +456,15 @@ ], "packages": [], "module_packages": {} + }, + "container_deployment": { + "files": [ + "handler.py", + "storage.py", + "nosql.py" + ], + "packages": [], + "module_packages": {} } }, "nodejs": { @@ -456,6 +474,12 @@ "20": "ubuntu:22.04" } }, + "container_images": { + "x64": { + "18": "node:18-slim", + "20": "node:20-slim" + } + }, "images": [], "deployment": { "files": [ @@ -468,10 +492,20 @@ "packages": { "uuid": "3.4.0" } + }, + "container_deployment": { + "files": [ + "handler.js", + "storage.js", + "nosql.js" + ], + "packages": { + "uuid": "3.4.0" + } } } }, "architecture": ["x64"], - "deployments": ["package"] + "deployments": ["package", "container"] } } diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile b/dockerfiles/cloudflare/nodejs/Dockerfile new file mode 100644 index 000000000..c64351581 --- /dev/null +++ b/dockerfiles/cloudflare/nodejs/Dockerfile @@ -0,0 +1,21 @@ +FROM node:18-slim + +WORKDIR /app + +# Copy package files first for better caching +COPY package*.json ./ + +# Install dependencies +RUN npm install --production + +# Copy all application files +COPY . . 
+ +# Expose port 8080 for container communication +EXPOSE 8080 + +# Set environment variable for port +ENV PORT=8080 + +# Start the HTTP server +CMD ["node", "handler.js"] diff --git a/dockerfiles/cloudflare/python/Dockerfile b/dockerfiles/cloudflare/python/Dockerfile new file mode 100644 index 000000000..101a1e9f1 --- /dev/null +++ b/dockerfiles/cloudflare/python/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.11-slim + +# Install system dependencies (ffmpeg for video processing benchmarks) +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy all application files first +COPY . . + +# Create ffmpeg directory and symlink for video-processing benchmark compatibility +RUN mkdir -p /app/ffmpeg && ln -s /usr/bin/ffmpeg /app/ffmpeg/ffmpeg + +# Install dependencies +# Core dependencies for wrapper modules: +# - storage.py uses urllib (stdlib) to proxy R2 requests through worker.js +# - nosql.py, worker.py, handler.py use stdlib only +# Then install benchmark-specific requirements from requirements.txt +RUN pip install --no-cache-dir --upgrade pip && \ + if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; fi + +# Expose port 8080 for container communication +EXPOSE 8080 + +# Set environment variable for port +ENV PORT=8080 + +# Start the HTTP server +CMD ["python", "handler.py"] diff --git a/sebs/cli.py b/sebs/cli.py index f65c5eb6e..6de1d4dca 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -185,7 +185,10 @@ def parse_common_params( update_nested_dict(config_obj, ["experiments", "update_code"], update_code) update_nested_dict(config_obj, ["experiments", "update_storage"], update_storage) update_nested_dict(config_obj, ["experiments", "architecture"], architecture) - update_nested_dict(config_obj, ["experiments", "container_deployment"], container_deployment) + # Only override container_deployment if explicitly set via CLI + # If not in config, use CLI default (False) 
+ if container_deployment or "container_deployment" not in config_obj.get("experiments", {}): + update_nested_dict(config_obj, ["experiments", "container_deployment"], container_deployment) # set the path the configuration was loaded from update_nested_dict(config_obj, ["deployment", "local", "path"], config) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 0d24b122a..e199957ac 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -3,6 +3,8 @@ import json import uuid import subprocess +import time +from datetime import datetime from typing import cast, Dict, List, Optional, Tuple, Type import docker @@ -222,7 +224,7 @@ def _ensure_pywrangler_installed(self): raise RuntimeError("pywrangler version check timed out") - def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None) -> str: + def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, container_uri: str = "") -> str: """ Generate a wrangler.toml configuration file for the worker. 
@@ -233,24 +235,92 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: account_id: Cloudflare account ID benchmark_name: Optional benchmark name for R2 file path prefix code_package: Optional benchmark package for nosql configuration + container_deployment: Whether this is a container deployment + container_uri: Container image URI/tag Returns: Path to the generated wrangler.toml file """ - main_file = "dist/handler.js" if language == "nodejs" else "handler.py" + # Container deployment configuration + if container_deployment: + # Containers ALWAYS use Node.js worker.js for orchestration (@cloudflare/containers is Node.js only) + # The container itself can run any language (Python, Node.js, etc.) + # R2 and NoSQL access is proxied through worker.js which has the bindings + + # Determine if this benchmark needs larger disk space + # 411.image-recognition needs more disk for PyTorch models + # 311.compression needs more disk for file compression operations + # 504.dna-visualisation needs more disk for DNA sequence processing + # Python containers need even more space due to zip file creation doubling disk usage + instance_type = "" + if benchmark_name and ("411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name): + # Use "standard" (largest) for Python, "standard-4" for Node.js + if language == "python": + instance_type = '\ninstance_type = "standard" # Largest available - needed for Python zip operations\n' + else: + instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' + + toml_content = f"""name = "{worker_name}" +main = "worker.js" +compatibility_date = "2025-11-18" +account_id = "{account_id}" +compatibility_flags = ["nodejs_compat"] + +[observability] +enabled = true + +[[containers]] +max_instances = 10 +class_name = "ContainerWorker" +image = "./Dockerfile"{instance_type} +# Durable Object binding for Container class (required by 
@cloudflare/containers) +[[durable_objects.bindings]] +name = "CONTAINER_WORKER" +class_name = "ContainerWorker" +""" + # Add nosql table bindings if benchmark uses them + if code_package and code_package.uses_nosql: + # Get registered nosql tables for this benchmark + nosql_storage = self.system_resources.get_nosql_storage() + if nosql_storage.retrieve_cache(benchmark_name): + nosql_tables = nosql_storage._tables.get(benchmark_name, {}) + for table_name in nosql_tables.keys(): + toml_content += f"""[[durable_objects.bindings]] +name = "{table_name}" +class_name = "KVApiObject" - # Build wrangler.toml content - toml_content = f"""name = "{worker_name}" +""" + self.logging.info(f"Added Durable Object binding for nosql table '{table_name}'") + + # Add migrations for both ContainerWorker and KVApiObject + # Both need new_sqlite_classes (Container requires SQLite DO backend) + toml_content += """[[migrations]] +tag = "v1" +new_sqlite_classes = ["ContainerWorker", "KVApiObject"] + +""" + else: + # Container without nosql - only ContainerWorker migration + toml_content += """[[migrations]] +tag = "v1" +new_sqlite_classes = ["ContainerWorker"] + +""" + else: + # Native worker configuration + main_file = "dist/handler.js" if language == "nodejs" else "handler.py" + + # Build wrangler.toml content + toml_content = f"""name = "{worker_name}" main = "{main_file}" compatibility_date = "2025-11-18" account_id = "{account_id}" """ - - if language == "nodejs": - toml_content += """# Use nodejs_compat for Node.js built-in support + if language == "nodejs": + toml_content += """# Use nodejs_compat for Node.js built-in support compatibility_flags = ["nodejs_compat"] no_bundle = true @@ -268,23 +338,23 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: fallthrough = true """ - elif language == "python": - toml_content += """# Enable Python Workers runtime + elif language == "python": + toml_content += """# Enable Python Workers runtime 
compatibility_flags = ["python_workers"] """ - toml_content += """ + toml_content += """ [[durable_objects.bindings]] name = "DURABLE_STORE" class_name = "KVApiObject" [[migrations]] -tag = "v1" +tag = "v3" new_classes = ["KVApiObject"] """ - # Add environment variables + # Add environment variables (for both native and container deployments) vars_content = "" if benchmark_name: vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' @@ -299,7 +369,7 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: {vars_content} """ - # Add R2 bucket binding for benchmarking files (required for fs/path polyfills) + # Add R2 bucket binding for benchmarking files (for both native and container deployments) r2_bucket_configured = False try: storage = self.system_resources.get_storage() @@ -367,17 +437,34 @@ def package_code( architecture: Target architecture (not used for Workers) benchmark: Benchmark name is_cached: Whether the code is cached - container_deployment: Whether to deploy as container (not supported) + container_deployment: Whether to deploy as container Returns: Tuple of (package_path, package_size, container_uri) """ + # Container deployment flow - build Docker image if container_deployment: - raise NotImplementedError( - "Container deployment is not supported for Cloudflare Workers" + self.logging.info(f"Building container image for {benchmark}") + return self._package_code_container( + directory, language_name, language_version, benchmark ) + + # Native worker deployment flow (existing logic) + return self._package_code_native( + directory, language_name, language_version, benchmark, is_cached + ) + def _package_code_native( + self, + directory: str, + language_name: str, + language_version: str, + benchmark: str, + is_cached: bool, + ) -> Tuple[str, int, str]: + """Package code for native Cloudflare Workers deployment.""" + # Install dependencies if language_name == "nodejs": # Ensure Wrangler is installed @@ -574,6 +661,291 @@ def 
package_code( return (directory, total_size, "") + def _package_code_container( + self, + directory: str, + language_name: str, + language_version: str, + benchmark: str, + ) -> Tuple[str, int, str]: + """ + Package code for Cloudflare container worker deployment. + + Builds a Docker image and returns the image tag for deployment. + """ + self.logging.info(f"Packaging container for {language_name} {language_version}") + + # Get wrapper directory for container files + wrapper_base = os.path.join( + os.path.dirname(__file__), "..", "..", "benchmarks", "wrappers", "cloudflare" + ) + wrapper_container_dir = os.path.join(wrapper_base, language_name, "container") + + if not os.path.exists(wrapper_container_dir): + raise RuntimeError( + f"Container wrapper directory not found: {wrapper_container_dir}" + ) + + # Copy container wrapper files to the package directory + # Copy Dockerfile from dockerfiles/cloudflare/{language}/ + dockerfile_src = os.path.join( + os.path.dirname(__file__), + "..", + "..", + "dockerfiles", + "cloudflare", + language_name, + "Dockerfile" + ) + dockerfile_dest = os.path.join(directory, "Dockerfile") + if os.path.exists(dockerfile_src): + shutil.copy2(dockerfile_src, dockerfile_dest) + self.logging.info(f"Copied Dockerfile from {dockerfile_src}") + else: + raise RuntimeError(f"Dockerfile not found at {dockerfile_src}") + + # Copy handler and utility files from wrapper/container + # Note: ALL containers use worker.js for orchestration (@cloudflare/containers is Node.js only) + # The handler inside the container can be Python or Node.js + container_files = ["handler.py" if language_name == "python" else "handler.js"] + + # For worker.js orchestration file, always use the nodejs version + nodejs_wrapper_dir = os.path.join(wrapper_base, "nodejs", "container") + worker_js_src = os.path.join(nodejs_wrapper_dir, "worker.js") + worker_js_dest = os.path.join(directory, "worker.js") + if os.path.exists(worker_js_src): + shutil.copy2(worker_js_src, 
worker_js_dest) + self.logging.info(f"Copied worker.js orchestration file from nodejs/container") + + # Copy storage and nosql utilities from language-specific wrapper + if language_name == "nodejs": + container_files.extend(["storage.js", "nosql.js"]) + else: + container_files.extend(["storage.py", "nosql.py"]) + + for file in container_files: + src = os.path.join(wrapper_container_dir, file) + dest = os.path.join(directory, file) + if os.path.exists(src): + shutil.copy2(src, dest) + self.logging.info(f"Copied container file: {file}") + + # For Python containers, fix relative imports in benchmark code + # Containers use flat structure, so "from . import storage" must become "import storage" + if language_name == "python": + for item in os.listdir(directory): + if item.endswith('.py') and item not in ['handler.py', 'storage.py', 'nosql.py', 'worker.py']: + filepath = os.path.join(directory, item) + with open(filepath, 'r') as f: + content = f.read() + + # Replace relative imports with absolute imports + modified = False + if 'from . import storage' in content: + content = content.replace('from . import storage', 'import storage') + modified = True + if 'from . import nosql' in content: + content = content.replace('from . 
import nosql', 'import nosql') + modified = True + + if modified: + with open(filepath, 'w') as f: + f.write(content) + self.logging.info(f"Fixed relative imports in {item}") + + # For Node.js containers, transform benchmark code to be async-compatible + # The container wrapper uses async HTTP calls, but benchmarks expect sync + elif language_name == "nodejs": + import re + for item in os.listdir(directory): + if item.endswith('.js') and item not in ['handler.js', 'storage.js', 'nosql.js', 'worker.js', 'build.js', 'request-polyfill.js']: + filepath = os.path.join(directory, item) + with open(filepath, 'r') as f: + content = f.read() + + # Only transform if file uses nosqlClient + if 'nosqlClient' not in content: + continue + + self.logging.info(f"Transforming {item} for async nosql...") + + # Step 1: Add await before nosqlClient method calls + content = re.sub( + r'(\s*)((?:const|let|var)\s+\w+\s*=\s*)?nosqlClient\.(insert|get|update|query|delete)\s*\(', + r'\1\2await nosqlClient.\3(', + content + ) + + # Step 2: Make all function declarations async + content = re.sub(r'^(\s*)function\s+(\w+)\s*\(', r'\1async function \2(', content, flags=re.MULTILINE) + + # Step 3: Add await before user-defined function calls + lines = content.split('\n') + transformed_lines = [] + control_flow = {'if', 'for', 'while', 'switch', 'catch', 'return'} + builtins = {'console', 'require', 'push', 'join', 'split', 'map', 'filter', + 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every', + 'includes', 'parseInt', 'parseFloat', 'isNaN', 'Array', + 'Object', 'String', 'Number', 'Boolean', 'Math', 'JSON', + 'Date', 'RegExp', 'Error', 'Promise'} + + for line in lines: + # Skip function declarations + if re.search(r'\bfunction\s+\w+\s*\(', line) or re.search(r'=\s*(async\s+)?function\s*\(', line): + transformed_lines.append(line) + continue + + # Add await before likely user-defined function calls + def replacer(match): + prefix = match.group(1) + assignment = match.group(2) or '' + 
func_name = match.group(3) + + if func_name in control_flow or func_name in builtins: + return match.group(0) + + return f"{prefix}{assignment}await {func_name}(" + + line = re.sub( + r'(^|\s+|;|,|\()((?:const|let|var)\s+\w+\s*=\s*)?(\w+)\s*\(', + replacer, + line + ) + transformed_lines.append(line) + + content = '\n'.join(transformed_lines) + + with open(filepath, 'w') as f: + f.write(content) + self.logging.info(f"Transformed {item} for async nosql") + + # Install dependencies for container orchestration + # ALL containers need @cloudflare/containers for worker.js orchestration + worker_package_json = { + "name": f"{benchmark}-worker", + "version": "1.0.0", + "dependencies": { + "@cloudflare/containers": "*" + } + } + + if language_name == "nodejs": + # Read the benchmark's package.json if it exists and merge dependencies + benchmark_package_file = os.path.join(directory, "package.json") + if os.path.exists(benchmark_package_file): + with open(benchmark_package_file, 'r') as f: + benchmark_package = json.load(f) + # Merge benchmark dependencies with worker dependencies + if "dependencies" in benchmark_package: + worker_package_json["dependencies"].update(benchmark_package["dependencies"]) + + # Write the combined package.json + with open(benchmark_package_file, 'w') as f: + json.dump(worker_package_json, f, indent=2) + else: # Python containers also need package.json for worker.js orchestration + # Create package.json just for @cloudflare/containers (Python code in container) + package_json_path = os.path.join(directory, "package.json") + with open(package_json_path, 'w') as f: + json.dump(worker_package_json, f, indent=2) + self.logging.info("Created package.json for Python container worker.js orchestration") + + # Install Node.js dependencies (needed for all containers for worker.js) + self.logging.info(f"Installing @cloudflare/containers for worker.js orchestration in {directory}") + try: + result = subprocess.run( + ["npm", "install", "--production"], + 
cwd=directory, + capture_output=True, + text=True, + check=True, + timeout=120 + ) + self.logging.info("npm install completed successfully") + except Exception as e: + self.logging.error(f"npm install failed: {e}") + raise RuntimeError(f"Failed to install Node.js dependencies: {e}") + + # For Python containers, also handle Python requirements + if language_name == "python": + # Python requirements will be installed in the Dockerfile + # Rename version-specific requirements.txt to requirements.txt + requirements_file = os.path.join(directory, "requirements.txt") + versioned_requirements = os.path.join(directory, f"requirements.txt.{language_version}") + + if os.path.exists(versioned_requirements): + shutil.copy2(versioned_requirements, requirements_file) + self.logging.info(f"Copied requirements.txt.{language_version} to requirements.txt") + elif not os.path.exists(requirements_file): + # Create empty requirements.txt if none exists + with open(requirements_file, 'w') as f: + f.write("") + self.logging.info("Created empty requirements.txt") + + # Build Docker image locally for cache compatibility + # wrangler will re-build/push during deployment from the Dockerfile + image_tag = self._build_container_image_local(directory, benchmark, language_name, language_version) + + # Calculate package size (approximate, as it's a source directory) + total_size = 0 + for dirpath, dirnames, filenames in os.walk(directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + total_size += os.path.getsize(filepath) + + self.logging.info(f"Container package prepared with local image: {image_tag}") + + # Return local image tag (wrangler will rebuild from Dockerfile during deploy) + return (directory, total_size, image_tag) + + def _build_container_image_local( + self, + directory: str, + benchmark: str, + language_name: str, + language_version: str, + ) -> str: + """ + Build a Docker image locally for cache purposes. 
+ wrangler will rebuild from Dockerfile during deployment. + + Returns the local image tag. + """ + # Generate image tag + image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" + image_tag = f"{image_name}:latest" + + self.logging.info(f"Building local container image: {image_tag}") + + try: + # Build the Docker image locally (no push) + # Use --no-cache to ensure handler changes are picked up + result = subprocess.run( + ["docker", "build", "--no-cache", "-t", image_tag, "."], + cwd=directory, + capture_output=True, + text=True, + check=True, + timeout=300 # 5 minutes for build + ) + + self.logging.info(f"Local container image built: {image_tag}") + if result.stdout: + self.logging.debug(f"Docker build output: {result.stdout}") + + return image_tag + + except subprocess.CalledProcessError as e: + error_msg = f"Docker build failed for {image_tag}" + if e.stderr: + error_msg += f": {e.stderr}" + self.logging.error(error_msg) + raise RuntimeError(error_msg) + except subprocess.TimeoutExpired: + raise RuntimeError(f"Docker build timed out for {image_tag}") + + + return (directory, total_size, "") + def create_function( self, code_package: Benchmark, @@ -589,24 +961,19 @@ def create_function( Args: code_package: Benchmark containing the function code func_name: Name of the worker - container_deployment: Whether to deploy as container (not supported) - container_uri: URI of container image (not used) + container_deployment: Whether to deploy as container + container_uri: URI of container image Returns: CloudflareWorker instance """ - if container_deployment: - raise NotImplementedError( - "Container deployment is not supported for Cloudflare Workers" - ) - package = code_package.code_location benchmark = code_package.benchmark language = code_package.language_name language_runtime = code_package.language_version function_cfg = FunctionConfig.from_benchmark(code_package) - func_name = self.format_function_name(func_name) + 
func_name = self.format_function_name(func_name, container_deployment) account_id = self.config.credentials.account_id if not account_id: @@ -632,7 +999,7 @@ def create_function( self.logging.info(f"Creating new worker {func_name}") # Create the worker with all package files - self._create_or_update_worker(func_name, package, account_id, language, benchmark, code_package) + self._create_or_update_worker(func_name, package, account_id, language, benchmark, code_package, container_deployment, container_uri) worker = CloudflareWorker( func_name, @@ -679,7 +1046,7 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: return None def _create_or_update_worker( - self, worker_name: str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None + self, worker_name: str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, container_uri: str = "" ) -> dict: """Create or update a Cloudflare Worker using Wrangler CLI. 
@@ -690,28 +1057,24 @@ def _create_or_update_worker( language: Programming language (nodejs or python) benchmark_name: Optional benchmark name for R2 file path prefix code_package: Optional benchmark package for nosql configuration + container_deployment: Whether this is a container deployment + container_uri: Container image URI/tag Returns: Worker deployment result """ - # # Convert CommonJS function.js to ESM if it exists - # if language == "nodejs": - # function_js = os.path.join(package_dir, "function.js") - # if os.path.exists(function_js): - # self.logging.info(f"Converting function.js from CommonJS to ESM...") - # try: - # esm_content = self._convert_commonjs_to_esm(function_js) - # with open(function_js, 'w') as f: - # f.write(esm_content) - # self.logging.info("Successfully converted function.js to ESM") - # except Exception as e: - # self.logging.error(f"Failed to convert function.js to ESM: {e}") - # raise # Generate wrangler.toml for this worker - self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package) + self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) # Set up environment for Wrangler env = os.environ.copy() + + # Add uv tools bin directory to PATH for pywrangler access + home_dir = os.path.expanduser("~") + uv_bin_dir = os.path.join(home_dir, ".local", "share", "uv", "tools", "workers-py", "bin") + if os.path.exists(uv_bin_dir): + env['PATH'] = f"{uv_bin_dir}:{env.get('PATH', '')}" + if self.config.credentials.api_token: env['CLOUDFLARE_API_TOKEN'] = self.config.credentials.api_token elif self.config.credentials.email and self.config.credentials.api_key: @@ -723,21 +1086,49 @@ def _create_or_update_worker( # Deploy using Wrangler self.logging.info(f"Deploying worker {worker_name} using Wrangler...") + # For container deployments, always use wrangler (not pywrangler) + # For native deployments, use wrangler 
for nodejs, pywrangler for python + if container_deployment: + wrangler_cmd = "wrangler" + else: + wrangler_cmd = "wrangler" if language == "nodejs" else "pywrangler" + try: + # Increase timeout for large container images (e.g., 411.image-recognition with PyTorch) + # Container deployment requires pushing large images to Cloudflare + deploy_timeout = 1200 if container_deployment else 180 # 20 minutes for containers, 3 for native + result = subprocess.run( - ["wrangler" if language == "nodejs" else "pywrangler", "deploy"], + [wrangler_cmd, "deploy"], cwd=package_dir, env=env, capture_output=True, text=True, check=True, - timeout=180 # 3 minutes for deployment + timeout=deploy_timeout ) self.logging.info(f"Worker {worker_name} deployed successfully") if result.stdout: self.logging.debug(f"Wrangler deploy output: {result.stdout}") + # For container deployments, wait for Durable Object infrastructure to initialize + # The container binding needs time to propagate before first invocation + if container_deployment: + self.logging.info("Waiting for container Durable Object to initialize...") + self._wait_for_durable_object_ready(worker_name, package_dir, env) + + # for benchmarks 220, 311, 411 we need to wait longer after deployment + # if benchmark_name in ["220.video-processing", "311.compression", "411.image-recognition", "504.dna-visualisation"]: + # self.logging.info("Waiting 120 seconds for benchmark initialization...") + # time.sleep(400) + + # For container deployments, wait for Durable Object infrastructure to initialize + # The container binding needs time to propagate before first invocation + if container_deployment: + self.logging.info("Waiting 60 seconds for container Durable Object to initialize...") + time.sleep(60) + # Parse the output to get worker URL # Wrangler typically outputs: "Published ()" # and "https://..workers.dev" @@ -753,6 +1144,69 @@ def _create_or_update_worker( self.logging.error(error_msg) raise RuntimeError(error_msg) + def 
_wait_for_durable_object_ready(self, worker_name: str, package_dir: str, env: dict): + """Wait for container Durable Object to be fully provisioned and ready.""" + max_wait_seconds = 400 + wait_interval = 10 + start_time = time.time() + + account_id = env.get('CLOUDFLARE_ACCOUNT_ID') + worker_url = self._build_workers_dev_url(worker_name, account_id) + + self.logging.info("Checking container Durable Object readiness via health endpoint...") + + consecutive_failures = 0 + max_consecutive_failures = 5 + + while time.time() - start_time < max_wait_seconds: + try: + # Use health check endpoint + response = requests.get( + f"{worker_url}/health", + timeout=60 + ) + + # 200 = ready + if response.status_code == 200: + self.logging.info("Container Durable Object is ready!") + return True + + # 503 = not ready yet (expected, keep waiting) + elif response.status_code == 503: + elapsed = int(time.time() - start_time) + try: + error_data = response.json() + error_msg = error_data.get('error', 'Container provisioning') + self.logging.info(f"{error_msg}... ({elapsed}s elapsed)") + except: + self.logging.info(f"Container provisioning... ({elapsed}s elapsed)") + consecutive_failures = 0 # This is expected + + # 500 or other = something's wrong + else: + consecutive_failures += 1 + self.logging.warning(f"Unexpected status {response.status_code}: {response.text[:200]}") + + # If we get too many unexpected errors, something might be broken + if consecutive_failures >= max_consecutive_failures: + self.logging.error(f"Got {consecutive_failures} consecutive errors, container may be broken") + return False + + except requests.exceptions.Timeout: + elapsed = int(time.time() - start_time) + self.logging.info(f"Health check timeout (container may be starting)... 
({elapsed}s elapsed)") + except requests.exceptions.RequestException as e: + elapsed = int(time.time() - start_time) + self.logging.debug(f"Connection error ({elapsed}s): {str(e)[:100]}") + + time.sleep(wait_interval) + + self.logging.warning( + f"Container Durable Object may not be fully ready after {max_wait_seconds}s. " + "First invocation may still experience initialization delay." + ) + return False + def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: """Fetch the workers.dev subdomain for the given account. @@ -849,14 +1303,9 @@ def update_function( Args: function: Existing function instance to update code_package: New benchmark containing the function code - container_deployment: Whether to deploy as container (not supported) - container_uri: URI of container image (not used) + container_deployment: Whether to deploy as container + container_uri: URI of container image """ - if container_deployment: - raise NotImplementedError( - "Container deployment is not supported for Cloudflare Workers" - ) - worker = cast(CloudflareWorker, function) package = code_package.code_location language = code_package.language_name @@ -867,7 +1316,7 @@ def update_function( if not account_id: raise RuntimeError("Account ID is required to update worker") - self._create_or_update_worker(worker.name, package, account_id, language, benchmark, code_package) + self._create_or_update_worker(worker.name, package, account_id, language, benchmark, code_package, container_deployment, container_uri) self.logging.info(f"Updated worker {worker.name}") # Update configuration if needed @@ -918,7 +1367,7 @@ def default_function_name(self, code_package: Benchmark, resources=None) -> str: ).lower() @staticmethod - def format_function_name(name: str) -> str: + def format_function_name(name: str, container_deployment: bool = False) -> str: """ Format a function name to comply with Cloudflare Worker naming rules. 
@@ -926,9 +1375,11 @@ def format_function_name(name: str) -> str: - Be lowercase - Contain only alphanumeric characters and hyphens - Not start or end with a hyphen + - Not start with a digit Args: name: The original name + container_deployment: Whether this is a container worker (adds 'w-' prefix if name starts with digit) Returns: Formatted name @@ -939,6 +1390,10 @@ def format_function_name(name: str) -> str: formatted = ''.join(c for c in formatted if c.isalnum() or c == '-') # Remove leading/trailing hyphens formatted = formatted.strip('-') + # Ensure container worker names don't start with a digit (Cloudflare requirement) + # Only add prefix for container workers to differentiate from native workers + if container_deployment and formatted and formatted[0].isdigit(): + formatted = 'container-' + formatted return formatted def enforce_cold_start(self, functions: List[Function], code_package: Benchmark): diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 73e047fb7..660588e1f 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -1,4 +1,5 @@ import json +import os import requests from sebs.cloudflare.config import CloudflareCredentials @@ -233,18 +234,48 @@ def upload_bytes(self, bucket_name: str, key: str, data: bytes): def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: """ - Retrieves list of files in a bucket. + Retrieves list of files in a bucket using S3-compatible API. 
:param bucket_name: :param prefix: optional prefix filter :return: list of files in a given bucket """ - account_id = self._credentials.account_id + # Use S3-compatible API with R2 credentials + if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: + self.logging.warning(f"R2 S3 credentials not configured, cannot list bucket {bucket_name}") + return [] - # R2 uses S3-compatible API for listing objects - # For now, return empty list as listing objects requires S3 credentials - self.logging.warning(f"list_bucket not fully implemented for R2 bucket {bucket_name}") - return [] + try: + import boto3 + from botocore.config import Config + + account_id = self._credentials.account_id + r2_endpoint = f"https://{account_id}.r2.cloudflarestorage.com" + + s3_client = boto3.client( + 's3', + endpoint_url=r2_endpoint, + aws_access_key_id=self._credentials.r2_access_key_id, + aws_secret_access_key=self._credentials.r2_secret_access_key, + config=Config(signature_version='s3v4'), + region_name='auto' + ) + + # List objects with optional prefix + paginator = s3_client.get_paginator('list_objects_v2') + page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) + + files = [] + for page in page_iterator: + if 'Contents' in page: + for obj in page['Contents']: + files.append(obj['Key']) + + return files + + except Exception as e: + self.logging.warning(f"Failed to list R2 bucket {bucket_name}: {str(e)}") + return [] def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: """ @@ -346,9 +377,25 @@ def uploader_func(self, bucket_idx: int, file: str, filepath: str) -> None: """ Upload a file to a bucket (used for parallel uploads). 
- :param bucket_idx: index of the bucket to upload to - :param file: destination file name + :param bucket_idx: index of the bucket/prefix to upload to + :param file: destination file name/key :param filepath: source file path """ - self.logging.warning(f"uploader_func not fully implemented for R2") - pass + # Skip upload when using cached buckets and not updating storage + if self.cached and not self.replace_existing: + return + + # Build the key with the input prefix + key = os.path.join(self.input_prefixes[bucket_idx], file) + + bucket_name = self.get_bucket(Resources.StorageBucketType.BENCHMARKS) + + # Check if file already exists (if not replacing existing files) + if not self.replace_existing: + for f in self.input_prefixes_files[bucket_idx]: + if key == f: + self.logging.info(f"Skipping upload of {filepath} to {bucket_name} (already exists)") + return + + # Upload the file + self.upload(bucket_name, filepath, key) diff --git a/sebs/experiments/config.py b/sebs/experiments/config.py index edde88de4..30483948a 100644 --- a/sebs/experiments/config.py +++ b/sebs/experiments/config.py @@ -173,7 +173,7 @@ def deserialize(config: dict) -> "Config": cfg._update_code = config["update_code"] cfg._update_storage = config["update_storage"] cfg._download_results = config["download_results"] - cfg._container_deployment = config["container_deployment"] + cfg._container_deployment = config.get("container_deployment", False) cfg._runtime = Runtime.deserialize(config["runtime"]) cfg._flags = config["flags"] if "flags" in config else {} cfg._architecture = config["architecture"] From 5284880fa0b6ddc59818a56f714d2fe1c244aef3 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Dec 2025 11:23:18 +0100 Subject: [PATCH 032/230] bigger container for python containers --- sebs/cloudflare/cloudflare.py | 42 +++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 
e199957ac..a710b436c 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -255,10 +255,10 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: instance_type = "" if benchmark_name and ("411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name): # Use "standard" (largest) for Python, "standard-4" for Node.js - if language == "python": - instance_type = '\ninstance_type = "standard" # Largest available - needed for Python zip operations\n' - else: - instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' + # if language == "python": + # instance_type = '\ninstance_type = "standard-4" # Largest available - needed for Python zip operations\n' + # else: + instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' toml_content = f"""name = "{worker_name}" main = "worker.js" @@ -875,6 +875,40 @@ def replacer(match): if os.path.exists(versioned_requirements): shutil.copy2(versioned_requirements, requirements_file) self.logging.info(f"Copied requirements.txt.{language_version} to requirements.txt") + + # Fix torch wheel URLs for container compatibility + # Replace direct wheel URLs with proper torch installation + with open(requirements_file, 'r') as f: + content = f.read() + + # Replace torch wheel URLs with proper installation commands + import re + modified = False + if 'download.pytorch.org/whl' in content: + # Remove direct wheel URLs and replace with proper torch installation + lines = content.split('\n') + new_lines = [] + for line in lines: + if 'download.pytorch.org/whl/cpu/torch-' in line: + # Extract version from URL (e.g., torch-2.0.0+cpu) + match = re.search(r'torch-([0-9.]+)(?:%2B|\+)cpu', line) + if match: + version = match.group(1) + # Use index-url method instead of direct wheel + new_lines.append(f'torch=={version}') + modified = True + else: + new_lines.append(line) + else: + 
new_lines.append(line) + + if modified: + # Add extra-index-url at the top for CPU-only torch + content = '--extra-index-url https://download.pytorch.org/whl/cpu\n' + '\n'.join(new_lines) + with open(requirements_file, 'w') as f: + f.write(content) + self.logging.info("Modified requirements.txt to use torch index-url instead of direct wheels") + elif not os.path.exists(requirements_file): # Create empty requirements.txt if none exists with open(requirements_file, 'w') as f: From b6de39b14565e51a6e63dd70a78e99fdd04e6173 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Dec 2025 13:41:25 +0100 Subject: [PATCH 033/230] sleep delay longer --- benchmarks/wrappers/cloudflare/nodejs/container/worker.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index 78140794f..8dee914a0 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -4,7 +4,7 @@ import { DurableObject } from "cloudflare:workers"; // Container wrapper class export class ContainerWorker extends Container { defaultPort = 8080; - sleepAfter = "10m"; + sleepAfter = "30m"; } // Durable Object for NoSQL storage (simple proxy to ctx.storage) From 812f5925a89c7d56d0f7c1ac2456c6fab42685a3 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Dec 2025 14:02:10 +0100 Subject: [PATCH 034/230] request_id has to be string --- benchmarks/wrappers/cloudflare/python/container/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index 4eb21bd8c..16537f5de 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -118,7 +118,7 @@ def handle_request(self): # Add request metadata import random - 
req_id = random.randint(0, 1000000) + req_id = str(random.randint(0, 1000000)) income_timestamp = datetime.datetime.now().timestamp() event['request-id'] = req_id event['income-timestamp'] = income_timestamp From 9229f9fa9eb72f31e35cae0a58854f555b8350ba Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Dec 2025 14:31:56 +0100 Subject: [PATCH 035/230] update container fixed --- sebs/cloudflare/cloudflare.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index a710b436c..602480309 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1350,10 +1350,18 @@ def update_function( if not account_id: raise RuntimeError("Account ID is required to update worker") - self._create_or_update_worker(worker.name, package, account_id, language, benchmark, code_package, container_deployment, container_uri) - self.logging.info(f"Updated worker {worker.name}") + # For container deployments, skip redeployment if code hasn't changed + # Containers don't support runtime memory configuration changes + # Detect container deployment by checking if worker name starts with "container-" + is_container = worker.name.startswith("container-") + + if is_container: + self.logging.info(f"Skipping redeployment for container worker {worker.name} - containers don't support runtime memory updates") + else: + self._create_or_update_worker(worker.name, package, account_id, language, benchmark, code_package, container_deployment, container_uri) + self.logging.info(f"Updated worker {worker.name}") - # Update configuration if needed + # Update configuration if needed (no-op for containers since they don't support runtime memory changes) self.update_function_configuration(worker, code_package) def update_function_configuration( From 5899d87ee40144f058f7b2d17ffd614ad712f9b8 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Sat, 13 Dec 2025 13:35:51 +0100 Subject: [PATCH 
036/230] fixed benchmark wrapper request ids for experiment results as well as begin/end --- .../cloudflare/nodejs/container/handler.js | 18 ++++++++++----- .../wrappers/cloudflare/nodejs/handler.js | 13 ++++++++--- .../cloudflare/python/container/handler.py | 19 +++++++++++----- .../wrappers/cloudflare/python/handler.py | 22 ++++++++++++++----- sebs/cloudflare/cloudflare.py | 2 +- 5 files changed, 53 insertions(+), 21 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index 6f99c6728..967db73c0 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -67,6 +67,10 @@ const server = http.createServer(async (req, res) => { } try { + // Get unique request ID from Cloudflare (CF-Ray header) + const crypto = require('crypto'); + const reqId = req.headers['cf-ray'] || crypto.randomUUID(); + // Extract Worker URL from header for R2 and NoSQL proxy const workerUrl = req.headers['x-worker-url']; if (workerUrl) { @@ -109,7 +113,6 @@ const server = http.createServer(async (req, res) => { } // Add request metadata - const reqId = 0; const incomeTimestamp = Math.floor(Date.now() / 1000); event['request-id'] = reqId; event['income-timestamp'] = incomeTimestamp; @@ -154,21 +157,26 @@ const server = http.createServer(async (req, res) => { console.log('Sending response with log_data:', log_data); + // Get memory usage in MB + const memUsage = process.memoryUsage(); + const memory_mb = memUsage.heapUsed / 1024 / 1024; + // Send response matching Python handler format exactly if (event.html) { res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); res.end(String(ret && ret.result !== undefined ? 
ret.result : ret)); } else { const responseBody = JSON.stringify({ - begin: "0", - end: "0", - results_time: "0", + begin: begin, + end: end, + results_time: 0, result: log_data, is_cold: false, is_cold_worker: false, container_id: '0', environ_container_id: 'no_id', - request_id: '0', + request_id: reqId, + memory_used: memory_mb, }); res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(responseBody); diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 507d68153..4303be81e 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -40,6 +40,9 @@ export default { return new Response('None'); } + // Get unique request ID from Cloudflare (CF-Ray header) + const req_id = request.headers.get('CF-Ray') || crypto.randomUUID(); + // Start timing measurements const begin = Date.now() / 1000; const start = performance.now(); @@ -80,8 +83,7 @@ export default { } } - // Set request id and timestamps (Python used 0 for request id) - const req_id = 0; + // Set timestamps const income_timestamp = Math.floor(Date.now() / 1000); event['request-id'] = req_id; event['income-timestamp'] = income_timestamp; @@ -201,6 +203,10 @@ export default { }); } + // Get memory usage in MB + const memUsage = process.memoryUsage(); + const memory_mb = memUsage.heapUsed / 1024 / 1024; + const responseBody = JSON.stringify({ begin: begin, end: end, @@ -211,7 +217,8 @@ export default { is_cold_worker: false, container_id: '0', environ_container_id: 'no_id', - request_id: '0', + request_id: req_id, + memory_used: memory_mb, }); return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index 16537f5de..dd52cb0ed 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ 
b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -8,6 +8,7 @@ import sys import os import traceback +import resource from http.server import HTTPServer, BaseHTTPRequestHandler from urllib.parse import urlparse, parse_qs import datetime @@ -84,6 +85,10 @@ def handle_request(self): return try: + # Get unique request ID from Cloudflare (CF-Ray header) + import uuid + req_id = self.headers.get('CF-Ray', str(uuid.uuid4())) + # Extract Worker URL from header for R2 and NoSQL proxy worker_url = self.headers.get('X-Worker-URL') if worker_url: @@ -117,8 +122,6 @@ def handle_request(self): event[key] = value # Add request metadata - import random - req_id = str(random.randint(0, 1000000)) income_timestamp = datetime.datetime.now().timestamp() event['request-id'] = req_id event['income-timestamp'] = income_timestamp @@ -146,16 +149,20 @@ def handle_request(self): if 'measurement' in result: log_data['measurement'] = result['measurement'] + # Get memory usage in MB + memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 + response_data = { - 'begin': "0", - 'end': "0", - 'results_time': "0", + 'begin': begin, + 'end': end, + 'results_time': 0, 'result': log_data, 'is_cold': False, 'is_cold_worker': False, 'container_id': "0", 'environ_container_id': "no_id", - 'request_id': "0" + 'request_id': req_id, + 'memory_used': memory_mb } # Send response diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index a0b63dcf9..37cbf58ba 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -2,6 +2,7 @@ import asyncio import importlib.util import traceback +import resource from workers import WorkerEntrypoint, Response, DurableObject ## sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) @@ -34,6 +35,9 @@ async def fetch(self, request, env): async def fetch2(self, request, env): if "favicon" 
in request.url: return Response("None") + # Get unique request ID from Cloudflare (CF-Ray header) + req_id = request.headers.get('CF-Ray', str(uuid.uuid4())) + req_text = await request.text() event = json.loads(req_text) if len(req_text) > 0 else {} @@ -55,8 +59,6 @@ async def fetch2(self, request, env): - ## we might need more data in self.env to know this ID - req_id = 0 ## note: time fixed in worker income_timestamp = datetime.datetime.now().timestamp() @@ -97,16 +99,24 @@ async def fetch2(self, request, env): headers = {"Content-Type" : "text/html; charset=utf-8"} return Response(str(ret["result"]), headers = headers) else: + # Get memory usage in MB + memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 + + # Calculate timestamps + end_timestamp = datetime.datetime.now().timestamp() + begin_timestamp = income_timestamp + return Response(json.dumps({ - 'begin': "0", - 'end': "0", - 'results_time': "0", + 'begin': begin_timestamp, + 'end': end_timestamp, + 'results_time': 0, 'result': log_data, 'is_cold': False, 'is_cold_worker': False, 'container_id': "0", 'environ_container_id': "no_id", - 'request_id': "0" + 'request_id': req_id, + 'memory_used': memory_mb })) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 602480309..7977abed1 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1511,7 +1511,7 @@ def download_metrics( wall_times.append(result.times.benchmark) # Collect memory usage - if result.stats.memory_used > 0: + if result.stats.memory_used is not None and result.stats.memory_used > 0: memory_values.append(result.stats.memory_used) # Set billing info for Cloudflare Workers From 6e0cd2bd1b8bca6950c1b8ec3f1fe5213337ee08 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Sat, 13 Dec 2025 14:25:12 +0100 Subject: [PATCH 037/230] extract memory correctly --- .../cloudflare/nodejs/container/handler.js | 15 +++++++-------- 
benchmarks/wrappers/cloudflare/nodejs/handler.js | 13 ++++++++----- .../cloudflare/python/container/handler.py | 8 +++++--- benchmarks/wrappers/cloudflare/python/handler.py | 13 ++++++++----- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index 967db73c0..e18212342 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -150,16 +150,16 @@ const server = http.createServer(async (req, res) => { const log_data = { output: ret && ret.result !== undefined ? ret.result : ret }; if (ret && ret.measurement !== undefined) { log_data.measurement = ret.measurement; + } else { + log_data.measurement = {}; } - if (event.logs !== undefined) { - log_data.time = 0; - } - - console.log('Sending response with log_data:', log_data); - - // Get memory usage in MB + + // Add memory usage to measurement const memUsage = process.memoryUsage(); const memory_mb = memUsage.heapUsed / 1024 / 1024; + log_data.measurement.memory_used_mb = memory_mb; + + console.log('Sending response with log_data:', log_data); // Send response matching Python handler format exactly if (event.html) { @@ -176,7 +176,6 @@ const server = http.createServer(async (req, res) => { container_id: '0', environ_container_id: 'no_id', request_id: reqId, - memory_used: memory_mb, }); res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(responseBody); diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 4303be81e..379fa23ad 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -192,7 +192,15 @@ export default { const log_data = { output: ret && ret.result !== undefined ? 
ret.result : ret }; if (ret && ret.measurement !== undefined) { log_data.measurement = ret.measurement; + } else { + log_data.measurement = {}; } + + // Add memory usage to measurement + const memUsage = process.memoryUsage(); + const memory_mb = memUsage.heapUsed / 1024 / 1024; + log_data.measurement.memory_used_mb = memory_mb; + if (event.logs !== undefined) { log_data.time = 0; } @@ -203,10 +211,6 @@ export default { }); } - // Get memory usage in MB - const memUsage = process.memoryUsage(); - const memory_mb = memUsage.heapUsed / 1024 / 1024; - const responseBody = JSON.stringify({ begin: begin, end: end, @@ -218,7 +222,6 @@ export default { container_id: '0', environ_container_id: 'no_id', request_id: req_id, - memory_used: memory_mb, }); return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index dd52cb0ed..5f3cb2a43 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -148,9 +148,12 @@ def handle_request(self): } if 'measurement' in result: log_data['measurement'] = result['measurement'] + else: + log_data['measurement'] = {} - # Get memory usage in MB + # Add memory usage to measurement memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 + log_data['measurement']['memory_used_mb'] = memory_mb response_data = { 'begin': begin, @@ -161,8 +164,7 @@ def handle_request(self): 'is_cold_worker': False, 'container_id': "0", 'environ_container_id': "no_id", - 'request_id': req_id, - 'memory_used': memory_mb + 'request_id': req_id } # Send response diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 37cbf58ba..8b424df46 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ 
b/benchmarks/wrappers/cloudflare/python/handler.py @@ -92,6 +92,13 @@ async def fetch2(self, request, env): } if 'measurement' in ret: log_data['measurement'] = ret['measurement'] + else: + log_data['measurement'] = {} + + # Add memory usage to measurement + memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 + log_data['measurement']['memory_used_mb'] = memory_mb + if 'logs' in event: log_data['time'] = 0 @@ -99,9 +106,6 @@ async def fetch2(self, request, env): headers = {"Content-Type" : "text/html; charset=utf-8"} return Response(str(ret["result"]), headers = headers) else: - # Get memory usage in MB - memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 - # Calculate timestamps end_timestamp = datetime.datetime.now().timestamp() begin_timestamp = income_timestamp @@ -115,8 +119,7 @@ async def fetch2(self, request, env): 'is_cold_worker': False, 'container_id': "0", 'environ_container_id': "no_id", - 'request_id': req_id, - 'memory_used': memory_mb + 'request_id': req_id })) From 3cd741fca97d3d39acacc9253151d85849cad176 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Sun, 14 Dec 2025 11:47:03 +0100 Subject: [PATCH 038/230] Pyodide does not support resource module for memory measurement --- .../wrappers/cloudflare/python/handler.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 8b424df46..91be3acf9 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -2,7 +2,12 @@ import asyncio import importlib.util import traceback -import resource +try: + import resource + HAS_RESOURCE = True +except ImportError: + # Pyodide (Python native workers) doesn't support resource module + HAS_RESOURCE = False from workers import WorkerEntrypoint, Response, DurableObject ## 
sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) @@ -95,9 +100,13 @@ async def fetch2(self, request, env): else: log_data['measurement'] = {} - # Add memory usage to measurement - memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 - log_data['measurement']['memory_used_mb'] = memory_mb + # Add memory usage to measurement (if resource module is available) + if HAS_RESOURCE: + memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 + log_data['measurement']['memory_used_mb'] = memory_mb + else: + # Pyodide doesn't support resource module + log_data['measurement']['memory_used_mb'] = 0.0 if 'logs' in event: log_data['time'] = 0 From 2615a3659ac7d97bc67d4f95420ceb9a0102c858 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 15 Dec 2025 16:20:59 +0100 Subject: [PATCH 039/230] timing fix for cloudflare handler --- .../wrappers/cloudflare/nodejs/handler.js | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index 379fa23ad..df0cee97b 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -44,8 +44,9 @@ export default { const req_id = request.headers.get('CF-Ray') || crypto.randomUUID(); // Start timing measurements - const begin = Date.now() / 1000; const start = performance.now(); + const begin = Date.now() / 1000; + // Parse JSON body first (similar to Azure handler which uses req.body) const req_text = await request.text(); @@ -158,6 +159,16 @@ export default { throw new Error('benchmark handler function not found'); } } catch (err) { + // Trigger a fetch request to update the timer before measuring + // Time measurements only update after a fetch request or R2 operation + try { + // Fetch the worker's own URL with favicon to minimize overhead + const finalUrl = new 
URL(request.url); + finalUrl.pathname = '/favicon'; + await fetch(finalUrl.toString(), { method: 'HEAD' }); + } catch (e) { + // Ignore fetch errors + } // Calculate timing even for errors const end = Date.now() / 1000; const elapsed = performance.now() - start; @@ -183,7 +194,18 @@ export default { return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); } - // Calculate elapsed time + // Trigger a fetch request to update the timer before measuring + // Time measurements only update after a fetch request or R2 operation + try { + // Fetch the worker's own URL with favicon to minimize overhead + const finalUrl = new URL(request.url); + finalUrl.pathname = '/favicon'; + await fetch(finalUrl.toString(), { method: 'HEAD' }); + } catch (e) { + // Ignore fetch errors + } + + // Now read the updated timer const end = Date.now() / 1000; const elapsed = performance.now() - start; const micro = elapsed * 1000; // Convert milliseconds to microseconds From e69243adab3e86d38fce017fa11c602fc7b1ecc1 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 15 Dec 2025 19:00:14 +0100 Subject: [PATCH 040/230] fixed python timing issue --- .../wrappers/cloudflare/python/handler.py | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 91be3acf9..19eff8baf 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -2,6 +2,7 @@ import asyncio import importlib.util import traceback +import time try: import resource HAS_RESOURCE = True @@ -9,6 +10,7 @@ # Pyodide (Python native workers) doesn't support resource module HAS_RESOURCE = False from workers import WorkerEntrypoint, Response, DurableObject +from js import fetch as js_fetch, URL ## sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) @@ 
-43,6 +45,10 @@ async def fetch2(self, request, env): # Get unique request ID from Cloudflare (CF-Ray header) req_id = request.headers.get('CF-Ray', str(uuid.uuid4())) + # Start timing measurements + start = time.perf_counter() + begin = datetime.datetime.now().timestamp() + req_text = await request.text() event = json.loads(req_text) if len(req_text) > 0 else {} @@ -115,13 +121,26 @@ async def fetch2(self, request, env): headers = {"Content-Type" : "text/html; charset=utf-8"} return Response(str(ret["result"]), headers = headers) else: + # Trigger a fetch request to update the timer before measuring + # Time measurements only update after a fetch request or R2 operation + try: + # Fetch the worker's own URL with favicon to minimize overhead + final_url = URL.new(request.url) + final_url.pathname = '/favicon' + await js_fetch(str(final_url), method='HEAD') + except: + # Ignore fetch errors + pass + # Calculate timestamps - end_timestamp = datetime.datetime.now().timestamp() - begin_timestamp = income_timestamp + end = datetime.datetime.now().timestamp() + elapsed = time.perf_counter() - start + micro = elapsed * 1_000_000 # Convert seconds to microseconds return Response(json.dumps({ - 'begin': begin_timestamp, - 'end': end_timestamp, + 'begin': begin, + 'end': end, + 'compute_time': micro, 'results_time': 0, 'result': log_data, 'is_cold': False, From f39aad05453e95d13ab5a0f7321b8c24f1f846d8 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 5 Jan 2026 11:30:21 +0100 Subject: [PATCH 041/230] removed faulty nodejs implementations of 000 bmks --- .../020.network-benchmark/config.json | 2 +- .../020.network-benchmark/nodejs/function.js | 94 -------------- .../020.network-benchmark/nodejs/package.json | 9 -- .../030.clock-synchronization/config.json | 2 +- .../nodejs/function.js | 115 ------------------ .../nodejs/package.json | 9 -- .../040.server-reply/config.json | 2 +- .../040.server-reply/nodejs/function.js | 31 ----- .../040.server-reply/nodejs/package.json | 9 -- 
9 files changed, 3 insertions(+), 270 deletions(-) delete mode 100644 benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js delete mode 100644 benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json delete mode 100644 benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js delete mode 100644 benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json delete mode 100644 benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js delete mode 100644 benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/config.json b/benchmarks/000.microbenchmarks/020.network-benchmark/config.json index 455933282..c3c2c73b1 100644 --- a/benchmarks/000.microbenchmarks/020.network-benchmark/config.json +++ b/benchmarks/000.microbenchmarks/020.network-benchmark/config.json @@ -1,6 +1,6 @@ { "timeout": 30, "memory": 128, - "languages": ["python", "nodejs"], + "languages": ["python"], "modules": [] } diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js b/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js deleted file mode 100644 index 431ccbe39..000000000 --- a/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/function.js +++ /dev/null @@ -1,94 +0,0 @@ -const dgram = require('dgram'); -const fs = require('fs'); -const path = require('path'); -const storage = require('./storage'); - -const storage_handler = new storage.storage(); - -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -exports.handler = async function(event) { - const requestId = event['request-id']; - const address = event['server-address']; - const port = event['server-port']; - const repetitions = event['repetitions']; - const outputBucket = event.bucket.bucket; - const outputPrefix = event.bucket.output; - - const times = []; - let i = 0; - const client = 
dgram.createSocket('udp4'); - client.bind(); - - const message = Buffer.from(String(requestId)); - let consecutiveFailures = 0; - let key = null; - - while (i < repetitions + 1) { - try { - const sendBegin = Date.now() / 1000; - - await new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - reject(new Error('Socket timeout')); - }, 3000); - - client.send(message, port, address, (err) => { - if (err) { - clearTimeout(timeout); - reject(err); - } - }); - - client.once('message', (msg, rinfo) => { - clearTimeout(timeout); - const recvEnd = Date.now() / 1000; - resolve(recvEnd); - }); - }).then((recvEnd) => { - if (i > 0) { - times.push([i, sendBegin, recvEnd]); - } - i++; - consecutiveFailures = 0; - }); - } catch (err) { - i++; - consecutiveFailures++; - if (consecutiveFailures === 5) { - console.log("Can't setup the connection"); - break; - } - continue; - } - } - - client.close(); - - if (consecutiveFailures !== 5) { - // Write CSV file using stream - const csvPath = '/tmp/data.csv'; - let csvContent = 'id,client_send,client_rcv\n'; - times.forEach(row => { - csvContent += row.join(',') + '\n'; - }); - - // Use createWriteStream and wait for it to finish - await new Promise((resolve, reject) => { - const writeStream = fs.createWriteStream(csvPath); - writeStream.write(csvContent); - writeStream.end(); - writeStream.on('finish', resolve); - writeStream.on('error', reject); - }); - - const filename = `results-${requestId}.csv`; - let uploadPromise; - [key, uploadPromise] = storage_handler.upload(outputBucket, path.join(outputPrefix, filename), csvPath); - await uploadPromise; - } - - return { result: key }; -}; diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json b/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json deleted file mode 100644 index 57264db28..000000000 --- a/benchmarks/000.microbenchmarks/020.network-benchmark/nodejs/package.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": 
"network-benchmark", - "version": "1.0.0", - "description": "Network benchmark function", - "author": "", - "license": "", - "dependencies": { - } -} diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json b/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json index 455933282..c3c2c73b1 100644 --- a/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json +++ b/benchmarks/000.microbenchmarks/030.clock-synchronization/config.json @@ -1,6 +1,6 @@ { "timeout": 30, "memory": 128, - "languages": ["python", "nodejs"], + "languages": ["python"], "modules": [] } diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js b/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js deleted file mode 100644 index 8adb53f66..000000000 --- a/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/function.js +++ /dev/null @@ -1,115 +0,0 @@ -const dgram = require('dgram'); -const fs = require('fs'); -const path = require('path'); -const storage = require('./storage'); - -const storage_handler = new storage.storage(); - -exports.handler = async function(event) { - const requestId = event['request-id']; - const address = event['server-address']; - const port = event['server-port']; - const repetitions = event['repetitions']; - const outputBucket = event.bucket.bucket; - const outputPrefix = event.bucket.output; - - const times = []; - console.log(`Starting communication with ${address}:${port}`); - - let i = 0; - const client = dgram.createSocket('udp4'); - client.bind(); - - let message = Buffer.from(String(requestId)); - let consecutiveFailures = 0; - let measurementsNotSmaller = 0; - let curMin = 0; - let key = null; - - while (i < 1000) { - try { - const sendBegin = Date.now() / 1000; - - const recvEnd = await new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - reject(new Error('Socket timeout')); - }, 4000); - - client.send(message, port, address, 
(err) => { - if (err) { - clearTimeout(timeout); - reject(err); - } - }); - - client.once('message', (msg, rinfo) => { - clearTimeout(timeout); - const recvEnd = Date.now() / 1000; - resolve(recvEnd); - }); - }); - - if (i > 0) { - times.push([i, sendBegin, recvEnd]); - } - - const curTime = recvEnd - sendBegin; - console.log(`Time ${curTime} Min Time ${curMin} NotSmaller ${measurementsNotSmaller}`); - - if (curTime > curMin && curMin > 0) { - measurementsNotSmaller++; - if (measurementsNotSmaller === repetitions) { - message = Buffer.from('stop'); - client.send(message, port, address); - break; - } - } else { - curMin = curTime; - measurementsNotSmaller = 0; - } - - i++; - consecutiveFailures = 0; - } catch (err) { - i++; - consecutiveFailures++; - if (consecutiveFailures === 7) { - console.log("Can't setup the connection"); - break; - } - continue; - } - } - - client.close(); - - if (consecutiveFailures !== 5) { - // Write CSV file using stream - const csvPath = '/tmp/data.csv'; - let csvContent = 'id,client_send,client_rcv\n'; - times.forEach(row => { - csvContent += row.join(',') + '\n'; - }); - - // Use createWriteStream and wait for it to finish - await new Promise((resolve, reject) => { - const writeStream = fs.createWriteStream(csvPath); - writeStream.write(csvContent); - writeStream.end(); - writeStream.on('finish', resolve); - writeStream.on('error', reject); - }); - - const filename = `results-${requestId}.csv`; - let uploadPromise; - [key, uploadPromise] = storage_handler.upload(outputBucket, path.join(outputPrefix, filename), csvPath); - await uploadPromise; - } - - return { - result: { - 'bucket-key': key, - 'timestamp': event['income-timestamp'] - } - }; -}; diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json b/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json deleted file mode 100644 index 20dbe9c5f..000000000 --- 
a/benchmarks/000.microbenchmarks/030.clock-synchronization/nodejs/package.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "clock-synchronization", - "version": "1.0.0", - "description": "Clock synchronization benchmark", - "author": "", - "license": "", - "dependencies": { - } -} diff --git a/benchmarks/000.microbenchmarks/040.server-reply/config.json b/benchmarks/000.microbenchmarks/040.server-reply/config.json index 93ce2f561..8ff6eec59 100644 --- a/benchmarks/000.microbenchmarks/040.server-reply/config.json +++ b/benchmarks/000.microbenchmarks/040.server-reply/config.json @@ -1,6 +1,6 @@ { "timeout": 120, "memory": 128, - "languages": ["python", "nodejs"], + "languages": ["python"], "modules": [] } diff --git a/benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js b/benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js deleted file mode 100644 index 45a0ea8f8..000000000 --- a/benchmarks/000.microbenchmarks/040.server-reply/nodejs/function.js +++ /dev/null @@ -1,31 +0,0 @@ -const net = require('net'); - -exports.handler = async function(event) { - const address = event['ip-address']; - const port = event['port']; - - return new Promise((resolve, reject) => { - const client = new net.Socket(); - - client.setTimeout(20000); - - client.connect(port, address, () => { - console.log('Connected to server'); - }); - - client.on('data', (data) => { - const msg = data.toString(); - client.destroy(); - resolve({ result: msg }); - }); - - client.on('timeout', () => { - client.destroy(); - reject(new Error('Connection timeout')); - }); - - client.on('error', (err) => { - reject(err); - }); - }); -}; diff --git a/benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json b/benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json deleted file mode 100644 index ad419b23f..000000000 --- a/benchmarks/000.microbenchmarks/040.server-reply/nodejs/package.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "server-reply", - "version": "1.0.0", 
- "description": "Server reply benchmark", - "author": "", - "license": "", - "dependencies": { - } -} From e76f8461e27c71744b9bfc2659478ec7cbd2ddc1 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 5 Jan 2026 11:37:07 +0100 Subject: [PATCH 042/230] removed unnecessary logging --- .../wrappers/cloudflare/nodejs/container/handler.js | 8 -------- .../wrappers/cloudflare/nodejs/container/storage.js | 7 ------- .../wrappers/cloudflare/python/container/handler.py | 6 ------ .../wrappers/cloudflare/python/container/storage.py | 4 +--- 4 files changed, 1 insertion(+), 24 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index e18212342..9b8b25e19 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -117,27 +117,19 @@ const server = http.createServer(async (req, res) => { event['request-id'] = reqId; event['income-timestamp'] = incomeTimestamp; - console.error('!!! Event:', JSON.stringify(event)); - // For debugging: check /tmp directory before and after benchmark const fs = require('fs'); - console.error('!!! Files in /tmp before benchmark:', fs.readdirSync('/tmp')); // Call the benchmark function - console.error('!!! Calling benchmark handler...'); const ret = await benchmarkHandler(event); - console.error('!!! Benchmark result:', JSON.stringify(ret)); // Check what was downloaded - console.error('!!! Files in /tmp after benchmark:', fs.readdirSync('/tmp')); const tmpFiles = fs.readdirSync('/tmp'); for (const file of tmpFiles) { const filePath = `/tmp/${file}`; const stats = fs.statSync(filePath); - console.error(`!!! ${file}: ${stats.size} bytes`); if (stats.size < 500) { const content = fs.readFileSync(filePath, 'utf8'); - console.error(`!!! 
First 300 chars: ${content.substring(0, 300)}`); } } diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js index d893245ef..f05d2fb14 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js @@ -123,19 +123,12 @@ class storage { upload(bucket, key, filepath) { // Generate unique key synchronously so it can be returned immediately const unique_key = storage.unique_name(key); - console.error(`!!! [storage.upload] bucket=${bucket}, key=${key}, unique_key=${unique_key}, filepath=${filepath}`); // Read file from disk and upload if (fs.existsSync(filepath)) { - const stats = fs.statSync(filepath); - console.error(`!!! [storage.upload] File exists, size on disk: ${stats.size} bytes`); const data = fs.readFileSync(filepath); - console.error(`!!! [storage.upload] Read ${data.length} bytes from ${filepath}`); - console.error(`!!! [storage.upload] Data type: ${typeof data}, isBuffer: ${Buffer.isBuffer(data)}, isString: ${typeof data === 'string'}`); - console.error(`!!! [storage.upload] First 200 chars of data: ${data.toString().substring(0, 200)}`); // Call internal version that doesn't generate another unique key const uploadPromise = this._upload_stream_with_key(bucket, unique_key, data); - console.error(`!!! [storage.upload] Returning unique_key=${unique_key} and upload promise`); return [unique_key, uploadPromise]; } diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index 5f3cb2a43..810c26ee3 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -126,12 +126,6 @@ def handle_request(self): event['request-id'] = req_id event['income-timestamp'] = income_timestamp - print(f"!!! Event received: {json.dumps(event, default=str)}") - print(f"!!! 
Event keys: {list(event.keys())}") - print(f"!!! Event has 'bucket' key: {'bucket' in event}") - if 'bucket' in event: - print(f"!!! bucket value: {event['bucket']}") - # Measure execution time begin = datetime.datetime.now().timestamp() diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index 3182a66c3..53ab90d54 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -104,11 +104,9 @@ def upload(self, bucket, key, filepath): """Upload file from disk with unique key generation""" # Generate unique key to avoid conflicts unique_key = self.unique_name(key) - print(f"!!! [storage.upload] bucket={bucket}, key={key}, unique_key={unique_key}, filepath={filepath}") with open(filepath, 'rb') as f: data = f.read() - print(f"!!! [storage.upload] Read {len(data)} bytes from {filepath}") # Upload with the unique key self._upload_with_key(bucket, unique_key, data) return unique_key @@ -140,7 +138,7 @@ def _upload_with_key(self, bucket: str, key: str, data): try: with urllib.request.urlopen(req) as response: result = json.loads(response.read().decode('utf-8')) - print(f"!!! 
[storage._upload_with_key] Upload successful, key={result['key']}") + print(f"[storage._upload_with_key] Upload successful, key={result['key']}") except Exception as e: print(f"R2 upload error: {e}") raise RuntimeError(f"Failed to upload to R2: {e}") From dc2f6ed880751b6571ec29fa4266016508add4d1 Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 12:56:37 +0100 Subject: [PATCH 043/230] removed experiments.json and package*.json --- experiments.json | 78 ----------------------------------------------- package-lock.json | 6 ---- package.json | 1 - 3 files changed, 85 deletions(-) delete mode 100644 experiments.json delete mode 100644 package-lock.json delete mode 100644 package.json diff --git a/experiments.json b/experiments.json deleted file mode 100644 index 549ed7625..000000000 --- a/experiments.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "_invocations": { - "130-crud-api-nodejs-18": { - "0": { - "billing": { - "_billed_time": null, - "_gb_seconds": 0, - "_memory": null - }, - "output": { - "begin": 1763584521.868, - "compute_time": 0, - "container_id": "0", - "end": 1763584521.868, - "environ_container_id": "no_id", - "is_cold": false, - "is_cold_worker": false, - "request_id": "0", - "result": { - "output": [ - {} - ] - }, - "results_time": 0 - }, - "provider_times": { - "execution": 0, - "initialization": 0 - }, - "request_id": "0", - "stats": { - "cold_start": false, - "failure": false, - "memory_used": null - }, - "times": { - "benchmark": 0, - "client": 29638, - "client_begin": "2025-11-19 21:35:21.807697", - "client_end": "2025-11-19 21:35:21.837335", - "http_first_byte_return": 0.029473, - "http_startup": 0.00964, - "initialization": 0 - } - } - } - }, - "_metrics": {}, - "begin_time": 1763584521.660329, - "config": { - "deployment": { - "credentials": { - "account_id": "eaf7050d8d599d4ae7d925a6f0fd5ea4" - }, - "name": "cloudflare", - "region": "global", - "resources": { - "benchmarks": "sebs-benchmarks-cb6e76ec", - "resources_id": "cb6e76ec" - } - 
}, - "experiments": { - "architecture": "x64", - "container_deployment": false, - "download_results": false, - "experiments": {}, - "flags": {}, - "runtime": { - "language": "nodejs", - "version": "18" - }, - "update_code": true, - "update_storage": false - } - }, - "end_time": 1763584521.838648, - "result_bucket": null -} \ No newline at end of file diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index e1fe15f56..000000000 --- a/package-lock.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "name": "serverless-benchmarks-cloudflare", - "lockfileVersion": 3, - "requires": true, - "packages": {} -} diff --git a/package.json b/package.json deleted file mode 100644 index 0967ef424..000000000 --- a/package.json +++ /dev/null @@ -1 +0,0 @@ -{} From 437cc974d0b1ed85384a463398aa254cc47d8c8b Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 13:06:45 +0100 Subject: [PATCH 044/230] updated cloudflare readme to reflect final changes --- sebs/cloudflare/README.md | 403 ++++++++++++++++++++------------------ 1 file changed, 215 insertions(+), 188 deletions(-) diff --git a/sebs/cloudflare/README.md b/sebs/cloudflare/README.md index f40793f87..ff9f47f49 100644 --- a/sebs/cloudflare/README.md +++ b/sebs/cloudflare/README.md @@ -1,6 +1,15 @@ # Cloudflare Workers Implementation for SeBS -This directory contains the implementation of Cloudflare Workers support for the SeBS (Serverless Benchmarking Suite). +This directory contains the **complete implementation** of Cloudflare Workers support for the SeBS (Serverless Benchmarking Suite). 
+ +## Implementation Status + +✅ **Fully Implemented** - All features are production-ready: +- Multi-language support (JavaScript, Python, Java, Go, Rust) via containers +- Per-invocation metrics via response measurements (no external dependencies) +- Storage integration (R2 for object storage, Durable Objects for NoSQL) +- Script and container-based deployments +- HTTP and Library trigger support ## Key Components @@ -10,7 +19,8 @@ This file implements the core Cloudflare Workers platform integration, including - **`create_function()`** - Creates a new Cloudflare Worker - Checks if worker already exists - - Uploads worker script via Cloudflare API + - Uploads worker script or container image via Cloudflare API + - Configures Durable Objects bindings for containerized workers - Adds HTTP and Library triggers - Returns a `CloudflareWorker` instance @@ -26,16 +36,18 @@ This file implements the core Cloudflare Workers platform integration, including - Memory and CPU time limits are managed by Cloudflare - **`package_code()`** - Prepares code for deployment - - Packages JavaScript/Node.js code for worker deployment + - Packages code for both script-based and container-based worker deployments + - Supports JavaScript/Node.js scripts and multi-language containers - Returns package path and size ### 2. `function.py` - CloudflareWorker Class Represents a Cloudflare Worker function with: -- Worker name and script ID -- Runtime information +- Worker name and script/container ID +- Runtime information (script or container-based) - Serialization/deserialization for caching - Account ID association +- Trigger configurations (HTTP and Library) ### 3. 
`config.py` - Configuration Classes @@ -47,8 +59,8 @@ Contains three main classes: - Can be loaded from environment variables or config file - **`CloudflareResources`** - Platform resources - - KV namespace IDs - - Storage bucket mappings + - R2 storage bucket configuration + - Durable Objects for NoSQL operations - Resource ID management - **`CloudflareConfig`** - Overall configuration @@ -65,7 +77,11 @@ This provides the behavior of SeBS to invoke serverless functions via either lib ### 5. `resources.py` - System Resources -Handles Cloudflare-specific resources like KV namespaces and R2 storage. This defines the behavior of SeBS to upload benchmarking resources and cleanup before/after the benchmark. It is different from the benchmark wrapper, which provides the functions for the benchmark itself to perform storage operations. +Handles Cloudflare-specific resources including: +- **R2 Buckets** - Object storage (S3-compatible) for benchmark data +- **Durable Objects** - Stateful storage for NoSQL operations + +This defines SeBS behavior to upload benchmarking resources and cleanup before/after benchmarks. It is different from the benchmark wrapper, which provides the functions for benchmarks to perform storage operations during execution. 
## Usage ### Environment Variables @@ -101,198 +117,205 @@ Alternatively, create a configuration file: } ``` -### Current Limitations +### Implemented Features + +- **Container Deployment**: ✅ Fully implemented + - Container-based workers using @cloudflare/containers + - Multi-language support via containerization + - Script and container-based deployment supported +- **Per-Invocation Metrics**: ✅ Implemented via response measurements + - Per-request performance data collected in worker response + - CPU time and wall time tracking + - Metrics extracted immediately from ExecutionResult objects +- **Language Support**: ✅ Multi-language support + - JavaScript/Node.js via script deployment + - Python, Java, Go, Rust, and more via container deployment +- **Storage Resources**: ✅ Fully integrated + - Cloudflare R2 for main storage (S3-compatible object storage) + - Cloudflare Durable Objects for NoSQL storage + - Integrated with benchmark wrappers + +### Platform Limitations -- **Container Deployment**: Not currently implemented - - *Note*: Cloudflare recently added container support (October 2024) - - Current implementation only supports script-based deployment - - Container support would require: - - Creating `CloudflareContainer` class (similar to AWS ECR) - - Container registry integration - - Dockerfile templates for each language - - Updates to `package_code()` and `create_function()` methods - **Cold Start Enforcement**: Not available (Workers are instantiated on-demand at edge locations) -- **Per-Invocation Metrics**: Limited (Cloudflare provides aggregated analytics) -- **Language Support**: Currently JavaScript/Node.js (Python support via Pyodide is experimental) - - Container support would enable any containerized language -- **Memory/Timeout Configuration**: Fixed by Cloudflare (128MB memory, 50ms CPU time on free tier) - -### Future Enhancements - -#### High Priority -- [ ] **Container Deployment Support** - - Cloudflare now supports container-based Workers 
(as of October 2024) - - Would enable multi-language support (Python, Java, Go, Rust, etc.) - - Requires implementing `CloudflareContainer` class - - Need Cloudflare container registry integration - - See [implementation notes](#container-support-architecture) below -- [ ] **Add Storage Resources** - - SeBS needs two levels of storage resources, main storage and nosql storage. - - For main storage Cloudflare R2 comes to mind. - - For nosql storage either D1 or Durable Objects come to mind. They need to be used by the benchmark wrapper aswell. I think it needs to be consistent... - -## Metrics Collection with Analytics Engine +- **Memory/Timeout Configuration**: Managed by Cloudflare (128MB memory, 50ms CPU time on free tier) + +### Completed Enhancements + +#### High Priority ✅ +- [x] **Container Deployment Support** + - Multi-language support (Python, Java, Go, Rust, etc.) via @cloudflare/containers + - Wrangler CLI integration for deployment + - Durable Objects binding for container orchestration + - See [implementation details](#container-support-architecture) below +- [x] **Storage Resources** + - Main storage: Cloudflare R2 (S3-compatible) integration complete + - NoSQL storage: Cloudflare Durable Objects support implemented + - Benchmark wrappers updated for storage operations +- [x] **Metrics Collection** + - Response-based per-invocation metrics + - Immediate availability (no external service dependency) + - CPU time, wall time, and billing calculations + +#### Standard Priority ✅ +- [x] Wrangler CLI integration for deployment and bundling +- [x] Support for Cloudflare R2 (object storage) +- [x] Support for Durable Objects (NoSQL/stateful storage) +- [x] Container-based multi-language workers + +## Metrics Collection ### Overview -Cloudflare Workers metrics are collected using **Analytics Engine**, which provides **per-invocation performance data** similar to AWS CloudWatch Logs or Azure Application Insights. 
Unlike the GraphQL Analytics API (which only provides aggregated metrics), Analytics Engine allows workers to write custom data points during execution that can be queried later. +Cloudflare Workers metrics are collected **directly from the worker response** during each invocation. This provides immediate, accurate per-invocation performance data without requiring external analytics services or API queries. -### Why Analytics Engine? +### Why Response-Based Metrics? -| Feature | Analytics Engine | GraphQL Analytics API | -|---------|-----------------|----------------------| -| **Data Granularity** | ✅ Per-invocation | ❌ Aggregated only | -| **Request ID Matching** | ✅ Direct correlation | ❌ Not possible | -| **Cold Start Detection** | ✅ Per-request | ❌ Average only | -| **SeBS Compatibility** | ✅ Full support | ❌ Limited | -| **Cost** | Free (10M writes/month) | Free | -| **Plan Requirement** | Paid plan ($5/month) | Any plan | +| Feature | Response Measurements | External Analytics | +|---------|---------------------|--------------------| +| **Data Granularity** | ✅ Per-invocation | ❌ Aggregated | +| **Request ID Matching** | ✅ Direct correlation | ❌ Impossible to correlate | +| **Latency** | ✅ Immediate | ❌ Delayed (30-60s) | +| **SeBS Compatibility** | ✅ Perfect match | ❌ Additional complexity | +| **Cost** | ✅ Free | ❌ May require paid plan | +| **Plan Requirement** | ✅ Any plan | ❌ May require paid plan | ### How It Works -1. **Worker Execution**: During each invocation, the worker writes a data point to Analytics Engine with: - - Request ID (for correlation with SeBS) - - CPU time and wall time - - Cold/warm start indicator - - Success/error status - -2. **Metrics Query**: After benchmark execution, SeBS queries Analytics Engine using SQL: - - Retrieves all data points for the time period - - Matches request IDs to `ExecutionResult` objects - - Populates provider metrics (CPU time, cold starts, etc.) - -3. 
**Data Enrichment**: Each `ExecutionResult` is enriched with: - - `provider_times.execution` - CPU time in microseconds - - `stats.cold_start` - True/False for cold start - - `billing.billed_time` - Billable CPU time - - `billing.gb_seconds` - GB-seconds for cost calculation - -### Implementation Requirements - -#### 1. Analytics Engine Binding +1. **Worker Execution**: During each invocation, the worker handler measures performance: + - Captures start time using `time.perf_counter()` + - Executes the benchmark function + - Measures elapsed time in microseconds + - Collects request metadata (request ID, timestamps) + +2. **Response Structure**: Worker returns JSON with embedded metrics: + ```json + { + "begin": 1704556800.123, + "end": 1704556800.456, + "compute_time": 333000, + "request_id": "cf-ray-abc123", + "result": {...}, + "is_cold": false + } + ``` + +3. **Metrics Extraction**: SeBS `download_metrics()` method: + - Iterates through `ExecutionResult` objects + - Extracts metrics from response measurements + - Populates `provider_times.execution` (CPU time in μs) + - Sets `stats.cold_start` based on response data + - Calculates `billing.billed_time` and `billing.gb_seconds` + +### Handler Integration + +Benchmark wrappers automatically include metrics in their responses. 
The Python handler (in `benchmarks/wrappers/cloudflare/python/handler.py`) demonstrates the pattern: ```python -# In cloudflare.py - automatically configured -self._bind_analytics_engine(worker_name, account_id) +# Start timing +start = time.perf_counter() +begin = datetime.datetime.now().timestamp() + +# Execute benchmark +ret = handler(event, context) + +# Calculate timing +end = datetime.datetime.now().timestamp() +elapsed = time.perf_counter() - start +micro = elapsed * 1_000_000 # Convert to microseconds + +# Return response with embedded metrics +return Response(json.dumps({ + 'begin': begin, + 'end': end, + 'compute_time': micro, + 'result': ret, + 'is_cold': False, + 'request_id': req_id +})) ``` -#### 2. Benchmark Wrapper +### Response Schema -Benchmark wrappers must write data points during execution. The wrapper code looks like: +Worker responses include these fields for metrics collection: -```javascript -export default { - async fetch(request, env, ctx) { - const requestId = request.headers.get('x-request-id') || crypto.randomUUID(); - const startTime = Date.now(); - const startCpu = performance.now(); - - try { - // Execute benchmark - const result = await benchmarkHandler(request, env, ctx); - - // Write metrics to Analytics Engine - if (env.ANALYTICS) { - env.ANALYTICS.writeDataPoint({ - indexes: [requestId, result.is_cold ? 
'cold' : 'warm'], - doubles: [Date.now() - startTime, performance.now() - startCpu, 0, 0], - blobs: [request.url, 'success', '', ''] - }); - } - - return new Response(JSON.stringify({...result, request_id: requestId})); - } catch (error) { - // Write error metrics - if (env.ANALYTICS) { - env.ANALYTICS.writeDataPoint({ - indexes: [requestId, 'error'], - doubles: [Date.now() - startTime, performance.now() - startCpu, 0, 0], - blobs: [request.url, 'error', error.message, ''] - }); - } - throw error; - } - } -}; -``` +| Field | Type | Purpose | Example | +|-------|------|---------|---------| +| `begin` | Float | Start timestamp | `1704556800.123` | +| `end` | Float | End timestamp | `1704556800.456` | +| `compute_time` | Float | CPU time (μs) | `333000.0` | +| `request_id` | String | Request identifier | `"cf-ray-abc123"` | +| `is_cold` | Boolean | Cold start flag | `false` | +| `result` | Object | Benchmark output | `{...}` | -#### 3. Data Schema +### Metrics Extraction Process -Analytics Engine data points use this schema: +When `download_metrics()` is called in `cloudflare.py`, SeBS: -| Field | Type | Purpose | Example | -|-------|------|---------|---------| -| `index1` | String | Request ID | `"req-abc-123"` | -| `index2` | String | Cold/Warm | `"cold"` or `"warm"` | -| `double1` | Float | Wall time (ms) | `45.2` | -| `double2` | Float | CPU time (ms) | `12.8` | -| `blob1` | String | Request URL | `"https://worker.dev"` | -| `blob2` | String | Status | `"success"` or `"error"` | -| `blob3` | String | Error message | `""` or error text | - -### Query Process - -When `download_metrics()` is called, SeBS: - -1. **Builds SQL Query**: Creates a ClickHouse SQL query for the time range -2. **Executes Query**: POSTs to Analytics Engine SQL API -3. **Parses Results**: Parses newline-delimited JSON response -4. **Matches Request IDs**: Correlates data points with tracked invocations -5. 
**Populates Metrics**: Enriches `ExecutionResult` objects with provider data - -Example SQL query: - -```sql -SELECT - index1 as request_id, - index2 as cold_warm, - double1 as wall_time_ms, - double2 as cpu_time_ms, - blob2 as status, - timestamp -FROM ANALYTICS_DATASET -WHERE timestamp >= toDateTime('2025-10-27 10:00:00') - AND timestamp <= toDateTime('2025-10-27 11:00:00') - AND blob1 LIKE '%worker-name%' -ORDER BY timestamp ASC -``` +1. **Iterates ExecutionResults**: Loops through all tracked invocations +2. **Extracts Response Data**: Reads metrics from the response JSON already captured +3. **Populates Provider Times**: Sets `provider_times.execution` from `compute_time` +4. **Calculates Billing**: Computes GB-seconds using Cloudflare's fixed 128MB memory +5. **Aggregates Statistics**: Creates summary metrics (avg/min/max CPU time, cold starts) -### Limitation +Example from `cloudflare.py`: -1. **Delay**: Typically 30-60 seconds for data to appear in Analytics Engine -2. **Wrapper Updates**: All benchmark wrappers must be updated to write data points +```python +for request_id, result in requests.items(): + # Count cold/warm starts + if result.stats.cold_start: + cold_starts += 1 + + # Extract CPU time from response measurement + if result.provider_times.execution > 0: + cpu_times.append(result.provider_times.execution) + + # Calculate billing + cpu_time_seconds = result.provider_times.execution / 1_000_000.0 + gb_seconds = (128.0 / 1024.0) * cpu_time_seconds + result.billing.gb_seconds = int(gb_seconds * 1_000_000) +``` -### Troubleshooting +### Implementation Notes -**Missing Metrics**: -- Check that worker has Analytics Engine binding configured -- Verify wrapper is writing data points (check `env.ANALYTICS`) -- Wait 60+ seconds after invocation for ingestion -- Check SQL query matches worker URL pattern +1. **Immediate Availability**: Metrics are available immediately in the response (no delay) +2. 
**Wrapper Consistency**: All benchmark wrappers follow the same response schema +3. **Billing Calculations**: Based on Cloudflare's fixed 128MB memory allocation and CPU time +4. **Cold Start Detection**: Currently always reports `false` (Cloudflare doesn't expose cold start info) -**Unmatched Request IDs**: -- Ensure wrapper returns `request_id` in response -- Verify SeBS is tracking request IDs correctly -- Check timestamp range covers all invocations +### Troubleshooting -**Query Failures**: -- Verify account has Analytics Engine enabled (Paid plan) -- Check API token has analytics read permissions -- Validate SQL syntax (ClickHouse format) +**Missing Metrics in Results**: +- Verify worker handler returns complete JSON response with all required fields +- Check that `compute_time`, `begin`, `end` fields are present in response +- Ensure wrapper code hasn't been modified to remove metric collection +- Confirm response JSON is properly formatted + +**Incorrect Timing Values**: +- Verify `time.perf_counter()` is being used for microsecond precision +- Check that timing starts before benchmark execution and ends after +- Ensure no external fetch requests are inflating the measured time +- Confirm microsecond conversion (multiply seconds by 1,000,000) + +**Container Deployment Issues**: +- Ensure Docker is installed and running locally +- Verify wrangler CLI is installed (`npm install -g wrangler`) +- Check that @cloudflare/containers package is in dependencies +- Confirm Durable Objects bindings are correctly configured in wrangler.toml +- Ensure container image size is under Cloudflare's limits + +**Worker Deployment Failures**: +- Verify Cloudflare credentials are correctly configured +- Check account has Workers enabled (may require paid plan for some features) +- Ensure worker name doesn't conflict with existing workers +- Review wrangler logs for specific error messages ### References -- [Analytics Engine 
Documentation](https://developers.cloudflare.com/analytics/analytics-engine/) -- [Analytics Engine SQL API](https://developers.cloudflare.com/analytics/analytics-engine/sql-api/) +- [Cloudflare Workers Runtime APIs](https://developers.cloudflare.com/workers/runtime-apis/) - [Workers Bindings](https://developers.cloudflare.com/workers/configuration/bindings/) -- See `ANALYTICS_ENGINE_IMPLEMENTATION.md` for complete implementation details - -#### Standard Priority -- [ ] Support for Cloudflare Workers KV (key-value storage) -- [ ] Support for Cloudflare R2 (object storage) -- [ ] Support for Durable Objects -- [ ] Wrangler CLI integration for better bundling -- [ ] WebAssembly/Rust worker support +- [Durable Objects Documentation](https://developers.cloudflare.com/durable-objects/) +- [R2 Storage Documentation](https://developers.cloudflare.com/r2/) --- @@ -300,28 +323,32 @@ ORDER BY timestamp ASC ### Overview -Cloudflare recently introduced container support for Workers, enabling deployment of containerized applications. Adding this to SeBS would require the following components: +Cloudflare container support for Workers is integrated into SeBS using the `@cloudflare/containers` package, enabling deployment of containerized applications across multiple programming languages. -### Required Components +### Implementation Details -1. **Container Client** (`container.py`) - - Extends `sebs.faas.container.DockerContainer` - - Manages container image builds and registry operations - - Similar to `sebs/aws/container.py` for ECR +1. **Container Orchestration** + - Uses `@cloudflare/containers` npm package + - Requires Node.js worker.js wrapper for orchestration + - Container runs inside Durable Object for isolation + - Integrated with wrangler CLI for deployment -2. **Registry Integration** - - Cloudflare Container Registry authentication - - Image push/pull operations - - Support for external registries (Docker Hub, etc.) +2. 
**Deployment Process** + - `package_code()` generates wrangler.toml with container configuration + - Creates `[[migrations]]` entries for Durable Objects + - Binds container to `CONTAINER_WORKER` Durable Object class + - Uses `wrangler deploy` to upload both worker and container -3. **Dockerfile Templates** - - Create `/dockerfiles/cloudflare/{language}/Dockerfile.function` - - Support for Node.js, Python, and other languages +3. **Supported Languages** + - Python via Docker containers + - Node.js (both script and container) + - Go, Rust, Java (via container deployment) + - Any language that can run in a Linux container -4. **Updated Methods** - - `package_code()`: Add container build path alongside script packaging - - `create_function()`: Handle both script and container deployments - - `update_function()`: Support updating container-based workers +4. **Key Methods** + - `_generate_wrangler_toml()`: Creates config with container bindings + - `create_function()`: Deploys workers using wrangler CLI + - `update_function()`: Updates existing containerized workers ### Benefits From 1eb375c277b38aa5b8e7965736657a0ce25efa22 Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 13:21:31 +0100 Subject: [PATCH 045/230] has platform check according to convention, durable object items removal suggestion, function deserialize suggestion --- sebs/cloudflare/durable_objects.py | 17 +++++++++++++---- sebs/cloudflare/function.py | 6 +++++- sebs/sebs.py | 7 ++----- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py index 8997ebf92..caa203edd 100644 --- a/sebs/cloudflare/durable_objects.py +++ b/sebs/cloudflare/durable_objects.py @@ -189,16 +189,25 @@ def remove_table(self, name: str) -> str: :param name: table name :return: table name """ - # Remove from internal tracking - for benchmark, tables in self._tables.items(): + # Remove from internal tracking - two-step approach to avoid mutation 
during iteration + benchmark_to_modify = None + table_key_to_delete = None + + # Step 1: Find the benchmark and table_key without deleting + for benchmark, tables in list(self._tables.items()): if name in tables.values(): # Find the table key - for table_key, table_name in tables.items(): + for table_key, table_name in list(tables.items()): if table_name == name: - del self._tables[benchmark][table_key] + benchmark_to_modify = benchmark + table_key_to_delete = table_key break break + # Step 2: Perform deletion after iteration + if benchmark_to_modify is not None and table_key_to_delete is not None: + del self._tables[benchmark_to_modify][table_key_to_delete] + self.logging.info(f"Removed Durable Objects table {name} from tracking") return name diff --git a/sebs/cloudflare/function.py b/sebs/cloudflare/function.py index 1bdf0ea08..c3773818f 100644 --- a/sebs/cloudflare/function.py +++ b/sebs/cloudflare/function.py @@ -54,9 +54,13 @@ def deserialize(cached_config: dict) -> "CloudflareWorker": ) for trigger in cached_config["triggers"]: + mapping = { + LibraryTrigger.typename(): LibraryTrigger, + HTTPTrigger.typename(): HTTPTrigger + } trigger_type = cast( Trigger, - {"Library": LibraryTrigger, "HTTP": HTTPTrigger}.get(trigger["type"]), + mapping.get(trigger["type"]), ) assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) ret.add_trigger(trigger_type.deserialize(trigger)) diff --git a/sebs/sebs.py b/sebs/sebs.py index d90512159..febfeb24a 100644 --- a/sebs/sebs.py +++ b/sebs/sebs.py @@ -214,13 +214,10 @@ def get_deployment( from sebs.openwhisk import OpenWhisk implementations["openwhisk"] = OpenWhisk - - # Cloudflare is available by default (like local) - try: + if has_platform("cloudflare"): from sebs.cloudflare import Cloudflare + implementations["cloudflare"] = Cloudflare - except ImportError: - pass # Validate deployment platform if name not in implementations: From 6c0768ed72dfa81b42760ac3d3e0e736034e812a Mon Sep 17 00:00:00 2001 From: laurin 
Date: Tue, 6 Jan 2026 13:24:06 +0100 Subject: [PATCH 046/230] updated readme to document the correct return object structure by the bmk wrappers --- sebs/cloudflare/README.md | 151 ++++++++++++++++++++++++++++++++------ 1 file changed, 127 insertions(+), 24 deletions(-) diff --git a/sebs/cloudflare/README.md b/sebs/cloudflare/README.md index ff9f47f49..37cbf3056 100644 --- a/sebs/cloudflare/README.md +++ b/sebs/cloudflare/README.md @@ -216,60 +216,163 @@ Benchmark wrappers automatically include metrics in their responses. The Python start = time.perf_counter() begin = datetime.datetime.now().timestamp() -# Execute benchmark -ret = handler(event, context) +# Execute benchmark function +from function import function +ret = function.handler(event) + +# Build response with nested measurement data +log_data = { + 'output': ret['result'] +} +if 'measurement' in ret: + log_data['measurement'] = ret['measurement'] +else: + log_data['measurement'] = {} + +# Add memory usage to measurement +if HAS_RESOURCE: + memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 + log_data['measurement']['memory_used_mb'] = memory_mb # Calculate timing end = datetime.datetime.now().timestamp() elapsed = time.perf_counter() - start micro = elapsed * 1_000_000 # Convert to microseconds -# Return response with embedded metrics +# Return response with top-level wrapper fields and nested measurement return Response(json.dumps({ 'begin': begin, 'end': end, - 'compute_time': micro, - 'result': ret, - 'is_cold': False, + 'compute_time': micro, # Not used by SeBS + 'results_time': 0, # Not used by SeBS + 'result': log_data, # Contains nested measurement + 'is_cold': False, # Not used by SeBS (uses measurement.is_cold) + 'is_cold_worker': False, # Not used by SeBS + 'container_id': "0", # Not used by SeBS + 'environ_container_id': "no_id", # Not used by SeBS 'request_id': req_id })) ``` ### Response Schema -Worker responses include these fields for metrics collection: +Worker 
responses include these fields: + +#### Top-Level Fields (Wrapper Metadata) + +| Field | Type | Used by SeBS? | Purpose | +|-------|------|---------------|----------| +| `begin` | Float | ❌ No | Start timestamp (legacy) | +| `end` | Float | ❌ No | End timestamp (legacy) | +| `compute_time` | Float | ❌ No | Wrapper overhead time (not benchmark time) | +| `results_time` | Float | ❌ No | Reserved for future use | +| `is_cold` | Boolean | ❌ No | Legacy field (use `measurement.is_cold`) | +| `is_cold_worker` | Boolean | ❌ No | Not used | +| `container_id` | String | ❌ No | Container identifier (informational) | +| `environ_container_id` | String | ❌ No | Environment container ID (informational) | +| `request_id` | String | ✅ Yes | Request identifier for tracking | +| `result` | Object | ✅ Yes | Contains `output` and `measurement` | + +#### Nested Measurement Fields (result.measurement) -| Field | Type | Purpose | Example | -|-------|------|---------|---------| -| `begin` | Float | Start timestamp | `1704556800.123` | -| `end` | Float | End timestamp | `1704556800.456` | -| `compute_time` | Float | CPU time (μs) | `333000.0` | -| `request_id` | String | Request identifier | `"cf-ray-abc123"` | -| `is_cold` | Boolean | Cold start flag | `false` | -| `result` | Object | Benchmark output | `{...}` | +These are the **actual fields consumed by SeBS** from `result['result']['measurement']`: + +| Field | Type | Used by SeBS? 
| Purpose | Populated By | +|-------|------|---------------|---------|-------------| +| `cpu_time_us` | Integer | ✅ Yes | CPU time in microseconds | Benchmark function | +| `cpu_time_ms` | Float | ✅ Yes | CPU time in milliseconds (fallback) | Benchmark function | +| `wall_time_us` | Integer | ✅ Yes | Wall time in microseconds | Benchmark function | +| `wall_time_ms` | Float | ✅ Yes | Wall time in milliseconds (fallback) | Benchmark function | +| `is_cold` | Boolean | ✅ Yes | True cold start indicator | Benchmark function | +| `memory_used_mb` | Float | ✅ Yes | Memory usage in megabytes | Wrapper (via resource.getrusage) | + +**Example Response Structure:** + +```json +{ + "begin": 1704556800.123, + "end": 1704556800.456, + "compute_time": 333000, + "results_time": 0, + "result": { + "output": { /* benchmark output */ }, + "measurement": { + "cpu_time_us": 150000, + "wall_time_us": 155000, + "is_cold": false, + "memory_used_mb": 45.2 + } + }, + "is_cold": false, + "is_cold_worker": false, + "container_id": "0", + "environ_container_id": "no_id", + "request_id": "cf-ray-abc123" +} +``` ### Metrics Extraction Process -When `download_metrics()` is called in `cloudflare.py`, SeBS: +Metrics extraction happens in two stages: -1. **Iterates ExecutionResults**: Loops through all tracked invocations -2. **Extracts Response Data**: Reads metrics from the response JSON already captured -3. **Populates Provider Times**: Sets `provider_times.execution` from `compute_time` -4. **Calculates Billing**: Computes GB-seconds using Cloudflare's fixed 128MB memory -5. 
**Aggregates Statistics**: Creates summary metrics (avg/min/max CPU time, cold starts) +#### Stage 1: HTTPTrigger.sync_invoke (Per-Invocation) -Example from `cloudflare.py`: +In `sebs/cloudflare/triggers.py`, the `HTTPTrigger.sync_invoke()` method extracts metrics from **nested measurement data** immediately after each invocation: + +```python +def sync_invoke(self, payload: dict) -> ExecutionResult: + result = self._http_invoke(payload, self.url) + + # Extract measurement data from result.output['result']['measurement'] + if result.output and 'result' in result.output: + result_data = result.output['result'] + if isinstance(result_data, dict) and 'measurement' in result_data: + measurement = result_data['measurement'] + + if isinstance(measurement, dict): + # CPU time in microseconds (with ms fallback) + if 'cpu_time_us' in measurement: + result.provider_times.execution = measurement['cpu_time_us'] + elif 'cpu_time_ms' in measurement: + result.provider_times.execution = int(measurement['cpu_time_ms'] * 1000) + + # Wall time in microseconds (with ms fallback) + if 'wall_time_us' in measurement: + result.times.benchmark = measurement['wall_time_us'] + elif 'wall_time_ms' in measurement: + result.times.benchmark = int(measurement['wall_time_ms'] * 1000) + + # Cold start flag + if 'is_cold' in measurement: + result.stats.cold_start = measurement['is_cold'] + + # Memory usage + if 'memory_used_mb' in measurement: + result.stats.memory_used = measurement['memory_used_mb'] + + return result +``` + +**Note:** The top-level `compute_time` field is **ignored** by SeBS. Only the nested `measurement` object is used. 
+ +#### Stage 2: download_metrics (Aggregation) + +When `download_metrics()` is called in `cloudflare.py`, SeBS aggregates the already-extracted metrics: ```python for request_id, result in requests.items(): - # Count cold/warm starts + # Count cold/warm starts (from measurement.is_cold) if result.stats.cold_start: cold_starts += 1 - # Extract CPU time from response measurement + # Collect CPU times (from measurement.cpu_time_us/ms) if result.provider_times.execution > 0: cpu_times.append(result.provider_times.execution) + # Collect memory usage (from measurement.memory_used_mb) + if result.stats.memory_used is not None and result.stats.memory_used > 0: + memory_values.append(result.stats.memory_used) + # Calculate billing cpu_time_seconds = result.provider_times.execution / 1_000_000.0 gb_seconds = (128.0 / 1024.0) * cpu_time_seconds From 0dfcfa8be9e00581594323bdc18237b124cd075c Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 13:25:30 +0100 Subject: [PATCH 047/230] documented cold start tracking limitation --- sebs/cloudflare/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sebs/cloudflare/README.md b/sebs/cloudflare/README.md index 37cbf3056..e41a3f075 100644 --- a/sebs/cloudflare/README.md +++ b/sebs/cloudflare/README.md @@ -138,6 +138,12 @@ Alternatively, create a configuration file: ### Platform Limitations - **Cold Start Enforcement**: Not available (Workers are instantiated on-demand at edge locations) +- **Cold Start Detection**: ⚠️ **Not Supported** - Cloudflare does not expose cold start information + - All invocations report `is_cold: false` (see hardcoded value in handler at line 146 of `benchmarks/wrappers/cloudflare/python/handler.py`) + - The `measurement.is_cold` field will always be `false` regardless of actual worker state + - **Impact on benchmarks**: Cold start metrics are incomparable to AWS Lambda, Azure Functions, or GCP Cloud Functions + - **Warning**: This limitation may skew benchmark comparisons when analyzing 
cold start performance across platforms + - Workers are instantiated on-demand at edge locations with minimal latency, but this state is not observable - **Memory/Timeout Configuration**: Managed by Cloudflare (128MB memory, 50ms CPU time on free tier) ### Completed Enhancements From b2465f9fea50e0b2d05f4c975b5525afb6ac916f Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 13:26:24 +0100 Subject: [PATCH 048/230] removed unreachable return statement in cloudflare.py --- sebs/cloudflare/cloudflare.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 7977abed1..06829f52d 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -977,9 +977,6 @@ def _build_container_image_local( except subprocess.TimeoutExpired: raise RuntimeError(f"Docker build timed out for {image_tag}") - - return (directory, total_size, "") - def create_function( self, code_package: Benchmark, From 0eb4d0b7a207a43e272782889e5dd9fbf4c4b9f6 Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 13:27:27 +0100 Subject: [PATCH 049/230] small fix to use public property --- sebs/cloudflare/durable_objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py index caa203edd..8a7d8d9a3 100644 --- a/sebs/cloudflare/durable_objects.py +++ b/sebs/cloudflare/durable_objects.py @@ -103,7 +103,7 @@ def update_cache(self, benchmark: str): :param benchmark: benchmark name """ - self._cache_client.update_nosql( + self.cache_client.update_nosql( self.deployment_name(), benchmark, { From db84f2d4b4affac0f239e66bc7b1a03ef8e58aa1 Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 6 Jan 2026 13:28:28 +0100 Subject: [PATCH 050/230] small fix for public field in durable objects --- sebs/cloudflare/durable_objects.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sebs/cloudflare/durable_objects.py 
b/sebs/cloudflare/durable_objects.py index 8a7d8d9a3..4bb99c11e 100644 --- a/sebs/cloudflare/durable_objects.py +++ b/sebs/cloudflare/durable_objects.py @@ -35,6 +35,7 @@ def __init__( credentials: CloudflareCredentials, ): super().__init__(region, cache_client, resources) + self._credentials = credentials # Tables are just logical names - Durable Objects are accessed via Worker bindings self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) From 7e2d8ac36db534db7d5974583646ae2cf3f2d84f Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 7 Jan 2026 09:28:25 +0100 Subject: [PATCH 051/230] converted nosql client calls to async and removed the corresponding post processing in the build script --- .../130.crud-api/nodejs/function.js | 18 ++--- .../wrappers/cloudflare/nodejs/build.js | 69 +------------------ 2 files changed, 10 insertions(+), 77 deletions(-) diff --git a/benchmarks/100.webapps/130.crud-api/nodejs/function.js b/benchmarks/100.webapps/130.crud-api/nodejs/function.js index 807b8c5f9..e1504598a 100644 --- a/benchmarks/100.webapps/130.crud-api/nodejs/function.js +++ b/benchmarks/100.webapps/130.crud-api/nodejs/function.js @@ -3,8 +3,8 @@ const nosql = require('./nosql'); const nosqlClient = nosql.nosql.get_instance(); const nosqlTableName = "shopping_cart"; -function addProduct(cartId, productId, productName, price, quantity) { - nosqlClient.insert( +async function addProduct(cartId, productId, productName, price, quantity) { + await nosqlClient.insert( nosqlTableName, ["cart_id", cartId], ["product_id", productId], @@ -12,16 +12,16 @@ function addProduct(cartId, productId, productName, price, quantity) { ); } -function getProducts(cartId, productId) { - return nosqlClient.get( +async function getProducts(cartId, productId) { + return await nosqlClient.get( nosqlTableName, ["cart_id", cartId], ["product_id", productId] ); } -function queryProducts(cartId) { - const res = nosqlClient.query( +async function queryProducts(cartId) { + const res = await 
nosqlClient.query( nosqlTableName, ["cart_id", cartId], "product_id" @@ -55,7 +55,7 @@ exports.handler = async function(event) { let res; if (route === "PUT /cart") { - addProduct( + await addProduct( body.cart, body.product_id, body.name, @@ -64,9 +64,9 @@ exports.handler = async function(event) { ); res = {}; } else if (route === "GET /cart/{id}") { - res = getProducts(body.cart, request.path.id); + res = await getProducts(body.cart, request.path.id); } else if (route === "GET /cart") { - res = queryProducts(body.cart); + res = await queryProducts(body.cart); } else { throw new Error(`Unknown request route: ${route}`); } diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index 7caf096dd..834ec5c16 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -61,73 +61,6 @@ const nodeBuiltinsPlugin = { } }; -const asyncNosqlPlugin = { - name: 'async-nosql-transformer', - setup(build) { - // Transform function.js to make it async-compatible - build.onLoad({ filter: /function\.js$/ }, async (args) => { - let contents = await fs.promises.readFile(args.path, 'utf8'); - - // Only transform if file uses nosql - if (!contents.includes('nosqlClient')) { - return { contents, loader: 'js' }; - } - - console.log('🔧 Transforming function.js for async nosql...'); - - // Step 1: Add await before nosqlClient method calls - contents = contents.replace(/(\s*)((?:const|let|var)\s+\w+\s*=\s*)?nosqlClient\.(insert|get|update|query|delete)\s*\(/g, - '$1$2await nosqlClient.$3('); - - // Step 2: Make all function declarations async (but not function expressions) - contents = contents.replace(/^(\s*)function\s+(\w+)\s*\(/gm, '$1async function $2('); - - // Step 3: Add await before user-defined function calls - // Process line by line to handle specific patterns - const lines = contents.split('\n'); - const transformedLines = lines.map(line => { - // Skip lines with function 
declarations or function expressions - if (line.match(/\bfunction\s+\w+\s*\(/) || line.match(/=\s*(async\s+)?function\s*\(/)) { - return line; - } - - // Add await before function calls that look like user-defined functions - // Match: identifier followed by ( where identifier starts line or follows whitespace/operators - // but NOT if preceded by = (assignment), . (method call), or keywords - line = line.replace(/(^|\s+|;|,|\()((?:const|let|var)\s+\w+\s*=\s*)?(\w+)\s*\(/g, (match, prefix, assignment, funcName) => { - // Skip control flow keywords - const controlFlow = ['if', 'for', 'while', 'switch', 'catch', 'return']; - if (controlFlow.includes(funcName)) { - return match; - } - - // Skip built-in JavaScript functions and methods - const builtins = ['console', 'require', 'push', 'join', 'split', - 'map', 'filter', 'reduce', 'forEach', 'find', 'findIndex', - 'some', 'every', 'includes', 'parseInt', 'parseFloat', - 'isNaN', 'Array', 'Object', 'String', 'Number', 'Boolean', - 'Math', 'JSON', 'Date', 'RegExp', 'Error', 'Promise']; - if (builtins.includes(funcName)) { - return match; - } - - // Add await for everything else - return `${prefix}${assignment || ''}await ${funcName}(`; - }); - - return line; - }); - contents = transformedLines.join('\n'); - - console.log('✓ Transformed function.js for async nosql'); - - return { - contents, - loader: 'js', - }; - }); - } -}; async function customBuild() { const srcDir = './'; @@ -162,7 +95,7 @@ async function customBuild() { target: 'es2020', sourcemap: true, allowOverwrite: true, - plugins: [nodeBuiltinsPlugin, asyncNosqlPlugin], + plugins: [nodeBuiltinsPlugin], define: { 'process.env.NODE_ENV': '"production"', 'global': 'globalThis', From 35a556ddd35da3e81a630fb89b1cd8fbe5dd126f Mon Sep 17 00:00:00 2001 From: Livio D'Agostini <33377465+ldzgch@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:23:40 +0100 Subject: [PATCH 052/230] Fix instance variable naming in nosql_do class --- 
benchmarks/wrappers/cloudflare/python/nosql.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index 27fc94ce0..dac4e1470 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -6,13 +6,13 @@ class nosql_do: - instance: Optional["nosql"] = None + instance: Optional["nosql_do"] = None DO_BINDING_NAME = "DURABLE_STORE" @staticmethod def init_instance(entry: WorkerEntrypoint): - nosql.instance = nosql() - nosql.instance.binding = getattr(entry.env, nosql_do.DO_BINDING_NAME) + nosql_do.instance = nosql_do() + nosql_do.instance.binding = getattr(entry.env, nosql_do.DO_BINDING_NAME) def get_table(self, table_name): From 92c5dea7c7db1c956bfeb261a36c0426a45571dd Mon Sep 17 00:00:00 2001 From: Livio D'Agostini <33377465+ldzgch@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:30:20 +0100 Subject: [PATCH 053/230] Rename class instance reference from nosql to nosql_kv --- benchmarks/wrappers/cloudflare/python/nosql.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index dac4e1470..ab383f6b0 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -114,12 +114,12 @@ def get_instance(): class nosql_kv: - instance: Optional["nosql"] = None + instance: Optional["nosql_kv"] = None @staticmethod def init_instance(entry: WorkerEntrypoint): - nosql.instance = nosql() - nosql.instance.env = entry.env + nosql_kv.instance = nosql_kv() + nosql_kv.instance.env = entry.env def key_maker(self, key1, key2): return f"({key1[0]},{str(key1[1])})+({key2[0]},{key2[1]})" @@ -202,8 +202,8 @@ def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: T @staticmethod def get_instance(): - if nosql.instance is None: - 
nosql.instance = nosql() + if nosql_kv.instance is None: + nosql_kv.instance = nosql_kv() return nosql.instance From bcd5ecbef8919f05370b3da35269af06d87e6952 Mon Sep 17 00:00:00 2001 From: Livio D'Agostini <33377465+ldzgch@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:37:46 +0100 Subject: [PATCH 054/230] Apply suggestions from code review - storage.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- benchmarks/wrappers/cloudflare/python/storage.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index 2ac2e6187..e7968eb5a 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -57,7 +57,7 @@ def download(self, bucket, key, filepath): if not filepath.startswith("/tmp"): real_fp = "/tmp" + os.path.abspath(filepath) - self.written_files.append(filepath) + self.written_files.add(filepath) with open(real_fp, "wb") as f: f.write(data) return @@ -108,7 +108,9 @@ async def adownload_stream(self, bucket, key): data = await get_res.bytes() return bytes(data) + @staticmethod def get_instance(): if storage.instance is None: - raise "must init storage singleton first" + raise RuntimeError("must init storage singleton first") + return storage.instance return storage.instance From 96ac2c16dad9dd470a324c748033730e7c09d48f Mon Sep 17 00:00:00 2001 From: Livio D'Agostini <33377465+ldzgch@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:42:00 +0100 Subject: [PATCH 055/230] Apply suggestions from code review Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- benchmarks/wrappers/cloudflare/python/nosql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index ab383f6b0..105590ad5 100644 --- 
a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -204,7 +204,7 @@ def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: T def get_instance(): if nosql_kv.instance is None: nosql_kv.instance = nosql_kv() - return nosql.instance + return nosql_kv.instance From b02815187a72e4d9f71a1be4349c153dd47457fe Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 11:20:51 +0100 Subject: [PATCH 056/230] config placeholder for api tokens, r2 etc --- configs/cloudflare-test.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/configs/cloudflare-test.json b/configs/cloudflare-test.json index 2b3b85827..275aa021f 100644 --- a/configs/cloudflare-test.json +++ b/configs/cloudflare-test.json @@ -13,7 +13,14 @@ }, "deployment": { "name": "cloudflare", - "cloudflare": {}, + "cloudflare": { + "credentials": { + "api_token": "", + "account_id": "", + "r2_access_key_id": "", + "r2_secret_access_key": "" + } + }, "container": false } } From 03e274e68ea2930e7b0182c9bbef8d3ed1dd4bf6 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 11:34:45 +0100 Subject: [PATCH 057/230] variable base image in docker file... 
have to replace the right image varibale at compile time, as wrangler does not provide docker build args --- dockerfiles/cloudflare/nodejs/Dockerfile | 3 ++- dockerfiles/cloudflare/python/Dockerfile | 3 ++- sebs/cloudflare/cloudflare.py | 31 ++++++++++++++++++++++-- sebs/config.py | 8 ++++++ 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile b/dockerfiles/cloudflare/nodejs/Dockerfile index c64351581..7e79708df 100644 --- a/dockerfiles/cloudflare/nodejs/Dockerfile +++ b/dockerfiles/cloudflare/nodejs/Dockerfile @@ -1,4 +1,5 @@ -FROM node:18-slim +ARG BASE_IMAGE=node:18-slim +FROM ${BASE_IMAGE} WORKDIR /app diff --git a/dockerfiles/cloudflare/python/Dockerfile b/dockerfiles/cloudflare/python/Dockerfile index 101a1e9f1..f3e0bcadd 100644 --- a/dockerfiles/cloudflare/python/Dockerfile +++ b/dockerfiles/cloudflare/python/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.11-slim +ARG BASE_IMAGE=python:3.11-slim +FROM ${BASE_IMAGE} # Install system dependencies (ffmpeg for video processing benchmarks) RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 06829f52d..f0183d17f 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -446,7 +446,7 @@ def package_code( if container_deployment: self.logging.info(f"Building container image for {benchmark}") return self._package_code_container( - directory, language_name, language_version, benchmark + directory, language_name, language_version, architecture, benchmark ) # Native worker deployment flow (existing logic) @@ -666,6 +666,7 @@ def _package_code_container( directory: str, language_name: str, language_version: str, + architecture: str, benchmark: str, ) -> Tuple[str, int, str]: """ @@ -699,7 +700,32 @@ def _package_code_container( ) dockerfile_dest = os.path.join(directory, "Dockerfile") if os.path.exists(dockerfile_src): - shutil.copy2(dockerfile_src, 
dockerfile_dest) + # Read Dockerfile and update BASE_IMAGE based on language version + with open(dockerfile_src, 'r') as f: + dockerfile_content = f.read() + + # Get base image from systems.json for container deployments + container_images = self.system_config.benchmark_container_images( + "cloudflare", language_name, architecture + ) + base_image = container_images.get(language_version) + if not base_image: + raise RuntimeError( + f"No container base image found in systems.json for {language_name} {language_version} on {architecture}" + ) + + # Replace BASE_IMAGE default value in ARG line + import re + dockerfile_content = re.sub( + r'ARG BASE_IMAGE=.*', + f'ARG BASE_IMAGE={base_image}', + dockerfile_content + ) + + # Write modified Dockerfile + with open(dockerfile_dest, 'w') as f: + f.write(dockerfile_content) + self.logging.info(f"Copied Dockerfile from {dockerfile_src}") else: raise RuntimeError(f"Dockerfile not found at {dockerfile_src}") @@ -953,6 +979,7 @@ def _build_container_image_local( try: # Build the Docker image locally (no push) # Use --no-cache to ensure handler changes are picked up + # Note: BASE_IMAGE is already set in the Dockerfile, no need to pass as build arg result = subprocess.run( ["docker", "build", "--no-cache", "-t", image_tag, "."], cwd=directory, diff --git a/sebs/config.py b/sebs/config.py index c5d3fe9bd..cca04997e 100644 --- a/sebs/config.py +++ b/sebs/config.py @@ -205,6 +205,14 @@ def benchmark_base_images( architecture ] + def benchmark_container_images( + self, deployment_name: str, language_name: str, architecture: str + ) -> Dict[str, str]: + """Get container base images for container deployments.""" + return self._system_config[deployment_name]["languages"][language_name].get( + "container_images", {} + ).get(architecture, {}) + def version(self) -> str: """Get the SeBS framework version. 
From a11236a5ce4bb8e7b17fb5a8d4dbc51381315ad5 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 12:15:50 +0100 Subject: [PATCH 058/230] copy and execute init.sh file from benchmark directory, and execute in dockerfile to download and link static binary dependencies --- dockerfiles/cloudflare/nodejs/Dockerfile | 14 ++++++++++++++ dockerfiles/cloudflare/python/Dockerfile | 14 ++++++++++---- sebs/cloudflare/cloudflare.py | 16 ++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile b/dockerfiles/cloudflare/nodejs/Dockerfile index 7e79708df..1bf6a89cb 100644 --- a/dockerfiles/cloudflare/nodejs/Dockerfile +++ b/dockerfiles/cloudflare/nodejs/Dockerfile @@ -1,6 +1,12 @@ ARG BASE_IMAGE=node:18-slim FROM ${BASE_IMAGE} +# Install system dependencies needed for benchmarks +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + xz-utils \ + && rm -rf /var/lib/apt/lists/* + WORKDIR /app # Copy package files first for better caching @@ -12,6 +18,14 @@ RUN npm install --production # Copy all application files COPY . . 
+# Run benchmark init script if it exists (e.g., for ffmpeg in video-processing) +# This downloads static binaries needed by the benchmark +# Note: ignore errors from init.sh (e.g., when resources already exist) +RUN if [ -f "init.sh" ]; then \ + chmod +x init.sh && \ + ./init.sh /app verbose x64 || true; \ + fi + # Expose port 8080 for container communication EXPOSE 8080 diff --git a/dockerfiles/cloudflare/python/Dockerfile b/dockerfiles/cloudflare/python/Dockerfile index f3e0bcadd..e9ecc0e86 100644 --- a/dockerfiles/cloudflare/python/Dockerfile +++ b/dockerfiles/cloudflare/python/Dockerfile @@ -1,9 +1,10 @@ ARG BASE_IMAGE=python:3.11-slim FROM ${BASE_IMAGE} -# Install system dependencies (ffmpeg for video processing benchmarks) +# Install system dependencies needed for benchmarks RUN apt-get update && apt-get install -y --no-install-recommends \ - ffmpeg \ + wget \ + xz-utils \ && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -11,8 +12,13 @@ WORKDIR /app # Copy all application files first COPY . . 
-# Create ffmpeg directory and symlink for video-processing benchmark compatibility -RUN mkdir -p /app/ffmpeg && ln -s /usr/bin/ffmpeg /app/ffmpeg/ffmpeg +# Run benchmark init script if it exists (e.g., for ffmpeg in video-processing) +# This downloads static binaries needed by the benchmark +# Note: ignore errors from init.sh (e.g., when resources already exist) +RUN if [ -f "init.sh" ]; then \ + chmod +x init.sh && \ + ./init.sh /app verbose x64 || true; \ + fi # Install dependencies # Core dependencies for wrapper modules: diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index f0183d17f..ba76429c1 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -756,6 +756,22 @@ def _package_code_container( shutil.copy2(src, dest) self.logging.info(f"Copied container file: {file}") + # Check if benchmark has init.sh and copy it (needed for some benchmarks like video-processing) + # Look in both the benchmark root and the language-specific directory + from sebs.utils import find_benchmark + benchmark_path = find_benchmark(benchmark, "benchmarks") + if benchmark_path: + paths = [ + benchmark_path, + os.path.join(benchmark_path, language_name), + ] + for path in paths: + init_sh = os.path.join(path, "init.sh") + if os.path.exists(init_sh): + shutil.copy2(init_sh, os.path.join(directory, "init.sh")) + self.logging.info(f"Copied init.sh from {path} for container build") + break + # For Python containers, fix relative imports in benchmark code # Containers use flat structure, so "from . 
import storage" must become "import storage" if language_name == "python": From 35755d671356d6ae15f8c2b3a019aa1431187180 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 12:29:08 +0100 Subject: [PATCH 059/230] add to existing markdown for cloudflare specific documentation instead of readme --- docs/platforms.md | 77 ++++++- docs/storage.md | 31 +++ sebs/cloudflare/README.md | 474 -------------------------------------- 3 files changed, 107 insertions(+), 475 deletions(-) delete mode 100644 sebs/cloudflare/README.md diff --git a/docs/platforms.md b/docs/platforms.md index 3e3fc15f2..45fc41e36 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -1,6 +1,6 @@ # Platform Configuration -SeBS supports three commercial serverless platforms: AWS Lambda, Azure Functions, and Google Cloud Functions. +SeBS supports four commercial serverless platforms: AWS Lambda, Azure Functions, Google Cloud Functions, and Cloudflare Workers. Furthermore, we support the open source FaaS system OpenWhisk. The file `config/example.json` contains all parameters that users can change @@ -17,6 +17,7 @@ Supported platforms: * [Amazon Web Services (AWS) Lambda](#aws-lambda) * [Microsoft Azure Functions](#azure-functions) * [Google Cloud (GCP) Functions](#google-cloud-functions) +* [Cloudflare Workers](#cloudflare-workers) * [OpenWhisk](#openwhisk) ## Storage Configuration @@ -176,6 +177,80 @@ or in the JSON input configuration: } ``` +## Cloudflare Workers + +Cloudflare offers a free tier for Workers with generous limits for development and testing. To use Cloudflare Workers with SeBS, you need to create a Cloudflare account and obtain API credentials. + +### Credentials + +You can authenticate with Cloudflare using an API token (recommended) or email + API key. Additionally, you need your account ID which can be found in the Cloudflare dashboard. 
+ +You can pass credentials using environment variables: + +```bash +# Option 1: Using API Token (recommended) +export CLOUDFLARE_API_TOKEN="your-api-token" +export CLOUDFLARE_ACCOUNT_ID="your-account-id" + +# Option 2: Using Email + API Key +export CLOUDFLARE_EMAIL="your-email@example.com" +export CLOUDFLARE_API_KEY="your-global-api-key" +export CLOUDFLARE_ACCOUNT_ID="your-account-id" +``` + +or in the JSON configuration file: + +```json +"deployment": { + "name": "cloudflare", + "cloudflare": { + "credentials": { + "api_token": "your-api-token", + "account_id": "your-account-id" + }, + "resources": { + "resources_id": "unique-resource-id" + } + } +} +``` + +**Note**: The `resources_id` is used to uniquely identify and track resources created by SeBS for a specific deployment. + +### Language Support + +Cloudflare Workers support multiple languages through different deployment methods: + +- **JavaScript/Node.js**: Supported via script-based deployment or container-based deployment using Wrangler CLI +- **Python**: Supported via script-based deployment or container-based deployment using Wrangler CLI + +Container-based deployments use Cloudflare's container runtime and require the Wrangler CLI to be installed (`npm install -g wrangler`). + +### Trigger Support + +- **HTTP Trigger**: ✅ Fully supported - Workers are automatically accessible at `https://{name}.{account}.workers.dev` +- **Library Trigger**: ❌ Not currently supported + +### Platform Limitations + +- **Cold Start Detection**: Cloudflare does not expose cold start information. All invocations report `is_cold: false` in the metrics. This limitation means cold start metrics are not available for Cloudflare Workers benchmarks. 
+- **Memory/Timeout Configuration (Workers)**: Managed by Cloudflare (128MB memory, 30s CPU time on free tier) +- **Memory/Timeout Configuration (Containers)**: Managed by Cloudflare, available in different tiers: + + | Instance Type | vCPU | Memory | Disk | + |---------------|------|--------|------| + | lite | 1/16 | 256 MiB | 2 GB | + | basic | 1/4 | 1 GiB | 4 GB | + | standard-1 | 1/2 | 4 GiB | 8 GB | + | standard-2 | 1 | 6 GiB | 12 GB | + | standard-3 | 2 | 8 GiB | 16 GB | + | standard-4 | 4 | 12 GiB | 20 GB | +- **Metrics Collection**: Uses response-based per-invocation metrics. Cloudflare does expose an Analytics engine, but it provides only aggregated metrics rather than per-request metrics, which makes it unsuitable for our benchmarking purposes. + +### Storage Configuration + +Cloudflare Workers integrate with Cloudflare R2 for object storage and Durable Objects for NoSQL storage. For detailed storage configuration, see the [storage documentation](storage.md#cloudflare-storage). + ## OpenWhisk SeBS expects users to deploy and configure an OpenWhisk instance. diff --git a/docs/storage.md b/docs/storage.md index 35bde19f8..bf33ca071 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -129,6 +129,37 @@ healthy: 192.168.0.20:9012 ``` ``` +## Cloudflare Storage + +Cloudflare Workers integrate with cloud-native storage services provided by Cloudflare: + +### R2 Object Storage + +Cloudflare R2 provides S3-compatible object storage for benchmarks that require persistent file storage. SeBS automatically configures R2 buckets for benchmark input and output data. + +**Key Features:** +- S3-compatible API +- No egress fees +- Global edge storage +- Integrated with Workers through bindings + +**Configuration:** +R2 configuration is handled automatically by SeBS when deploying to Cloudflare Workers. The storage resources are defined in your deployment configuration and SeBS manages bucket creation and access.
+ +### Durable Objects for NoSQL + +Cloudflare Durable Objects provide stateful storage for NoSQL operations required by benchmarks like the CRUD API (130.crud-api). + +**Key Features:** +- Strongly consistent storage +- Low-latency access from Workers +- Built-in coordination primitives +- Global replication + +**Usage:** +SeBS configures Durable Objects bindings automatically when deploying container-based Workers that require NoSQL storage. The benchmark wrappers handle the interaction with Durable Objects through the standard SeBS storage interface. + + ## Lifecycle Management By default, storage containers are retained after experiments complete. This allows you to run multiple experiments without redeploying and repopulating storage. diff --git a/sebs/cloudflare/README.md b/sebs/cloudflare/README.md deleted file mode 100644 index e41a3f075..000000000 --- a/sebs/cloudflare/README.md +++ /dev/null @@ -1,474 +0,0 @@ -# Cloudflare Workers Implementation for SeBS - -This directory contains the **complete implementation** of Cloudflare Workers support for the SeBS (Serverless Benchmarking Suite). - -## Implementation Status - -✅ **Fully Implemented** - All features are production-ready: -- Multi-language support (JavaScript, Python, Java, Go, Rust) via containers -- Per-invocation metrics via response measurements (no external dependencies) -- Storage integration (R2 for object storage, Durable Objects for NoSQL) -- Script and container-based deployments -- HTTP and Library trigger support - -## Key Components - -### 1. 
`cloudflare.py` - Main System Implementation - -This file implements the core Cloudflare Workers platform integration, including: - -- **`create_function()`** - Creates a new Cloudflare Worker - - Checks if worker already exists - - Uploads worker script or container image via Cloudflare API - - Configures Durable Objects bindings for containerized workers - - Adds HTTP and Library triggers - - Returns a `CloudflareWorker` instance - -- **`cached_function()`** - Handles cached functions - - Refreshes triggers and logging handlers for functions retrieved from cache - -- **`update_function()`** - Updates an existing worker - - Uploads new script content - - Updates worker configuration - -- **`update_function_configuration()`** - Updates worker configuration - - Note: Cloudflare Workers have limited runtime configuration compared to AWS Lambda or Azure Functions - - Memory and CPU time limits are managed by Cloudflare - -- **`package_code()`** - Prepares code for deployment - - Packages code for both script-based and container-based worker deployments - - Supports JavaScript/Node.js scripts and multi-language containers - - Returns package path and size - -### 2. `function.py` - CloudflareWorker Class - -Represents a Cloudflare Worker function with: -- Worker name and script/container ID -- Runtime information (script or container-based) -- Serialization/deserialization for caching -- Account ID association -- Trigger configurations (HTTP and Library) - -### 3. 
`config.py` - Configuration Classes - -Contains three main classes: - -- **`CloudflareCredentials`** - Authentication credentials - - Supports API token or email + API key - - Requires account ID - - Can be loaded from environment variables or config file - -- **`CloudflareResources`** - Platform resources - - R2 storage bucket configuration - - Durable Objects for NoSQL operations - - Resource ID management - -- **`CloudflareConfig`** - Overall configuration - - Combines credentials and resources - - Handles serialization to/from cache - -### 4. `triggers.py` - Trigger Implementations - -- **`LibraryTrigger`** - Programmatic invocation via Cloudflare API -- **`HTTPTrigger`** - HTTP invocation via worker URLs - - Workers are automatically accessible at `https://{name}.{account}.workers.dev` - -This provides the behavior of SeBS to invoke serverless functions via either library or http triggers. - -### 5. `resources.py` - System Resources - -Handles Cloudflare-specific resources including: -- **R2 Buckets** - Object storage (S3-compatible) for benchmark data -- **Durable Objects** - Stateful storage for NoSQL operations - -This defines SeBS behavior to upload benchmarking resources and cleanup before/after benchmarks. It is different from the benchmark wrapper, which provides the functions for benchmarks to perform storage operations during execution. 
- -## Usage -### Environment Variables - -Set the following environment variables: - -```bash -# Option 1: Using API Token (recommended) -export CLOUDFLARE_API_TOKEN="your-api-token" -export CLOUDFLARE_ACCOUNT_ID="your-account-id" - -# Option 2: Using Email + API Key -export CLOUDFLARE_EMAIL="your-email@example.com" -export CLOUDFLARE_API_KEY="your-global-api-key" -export CLOUDFLARE_ACCOUNT_ID="your-account-id" -``` - -### Configuration File - -Alternatively, create a configuration file: - -```json -{ - "cloudflare": { - "credentials": { - "api_token": "your-api-token", - "account_id": "your-account-id" - }, - "resources": { - "resources_id": "unique-resource-id" - } - } -} -``` - -### Implemented Features - -- **Container Deployment**: ✅ Fully implemented - - Container-based workers using @cloudflare/containers - - Multi-language support via containerization - - Script and container-based deployment supported -- **Per-Invocation Metrics**: ✅ Implemented via response measurements - - Per-request performance data collected in worker response - - CPU time and wall time tracking - - Metrics extracted immediately from ExecutionResult objects -- **Language Support**: ✅ Multi-language support - - JavaScript/Node.js via script deployment - - Python, Java, Go, Rust, and more via container deployment -- **Storage Resources**: ✅ Fully integrated - - Cloudflare R2 for main storage (S3-compatible object storage) - - Cloudflare Durable Objects for NoSQL storage - - Integrated with benchmark wrappers - -### Platform Limitations - -- **Cold Start Enforcement**: Not available (Workers are instantiated on-demand at edge locations) -- **Cold Start Detection**: ⚠️ **Not Supported** - Cloudflare does not expose cold start information - - All invocations report `is_cold: false` (see hardcoded value in handler at line 146 of `benchmarks/wrappers/cloudflare/python/handler.py`) - - The `measurement.is_cold` field will always be `false` regardless of actual worker state - - **Impact on 
benchmarks**: Cold start metrics are incomparable to AWS Lambda, Azure Functions, or GCP Cloud Functions - - **Warning**: This limitation may skew benchmark comparisons when analyzing cold start performance across platforms - - Workers are instantiated on-demand at edge locations with minimal latency, but this state is not observable -- **Memory/Timeout Configuration**: Managed by Cloudflare (128MB memory, 50ms CPU time on free tier) - -### Completed Enhancements - -#### High Priority ✅ -- [x] **Container Deployment Support** - - Multi-language support (Python, Java, Go, Rust, etc.) via @cloudflare/containers - - Wrangler CLI integration for deployment - - Durable Objects binding for container orchestration - - See [implementation details](#container-support-architecture) below -- [x] **Storage Resources** - - Main storage: Cloudflare R2 (S3-compatible) integration complete - - NoSQL storage: Cloudflare Durable Objects support implemented - - Benchmark wrappers updated for storage operations -- [x] **Metrics Collection** - - Response-based per-invocation metrics - - Immediate availability (no external service dependency) - - CPU time, wall time, and billing calculations - -#### Standard Priority ✅ -- [x] Wrangler CLI integration for deployment and bundling -- [x] Support for Cloudflare R2 (object storage) -- [x] Support for Durable Objects (NoSQL/stateful storage) -- [x] Container-based multi-language workers - -## Metrics Collection - -### Overview - -Cloudflare Workers metrics are collected **directly from the worker response** during each invocation. This provides immediate, accurate per-invocation performance data without requiring external analytics services or API queries. - -### Why Response-Based Metrics? 
- -| Feature | Response Measurements | External Analytics | -|---------|---------------------|--------------------| -| **Data Granularity** | ✅ Per-invocation | ❌ Aggregated | -| **Request ID Matching** | ✅ Direct correlation | ❌ Impossible to correlate | -| **Latency** | ✅ Immediate | ❌ Delayed (30-60s) | -| **SeBS Compatibility** | ✅ Perfect match | ❌ Additional complexity | -| **Cost** | ✅ Free | ❌ May require paid plan | -| **Plan Requirement** | ✅ Any plan | ❌ May require paid plan | - -### How It Works - -1. **Worker Execution**: During each invocation, the worker handler measures performance: - - Captures start time using `time.perf_counter()` - - Executes the benchmark function - - Measures elapsed time in microseconds - - Collects request metadata (request ID, timestamps) - -2. **Response Structure**: Worker returns JSON with embedded metrics: - ```json - { - "begin": 1704556800.123, - "end": 1704556800.456, - "compute_time": 333000, - "request_id": "cf-ray-abc123", - "result": {...}, - "is_cold": false - } - ``` - -3. **Metrics Extraction**: SeBS `download_metrics()` method: - - Iterates through `ExecutionResult` objects - - Extracts metrics from response measurements - - Populates `provider_times.execution` (CPU time in μs) - - Sets `stats.cold_start` based on response data - - Calculates `billing.billed_time` and `billing.gb_seconds` - -### Handler Integration - -Benchmark wrappers automatically include metrics in their responses. 
The Python handler (in `benchmarks/wrappers/cloudflare/python/handler.py`) demonstrates the pattern: - -```python -# Start timing -start = time.perf_counter() -begin = datetime.datetime.now().timestamp() - -# Execute benchmark function -from function import function -ret = function.handler(event) - -# Build response with nested measurement data -log_data = { - 'output': ret['result'] -} -if 'measurement' in ret: - log_data['measurement'] = ret['measurement'] -else: - log_data['measurement'] = {} - -# Add memory usage to measurement -if HAS_RESOURCE: - memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 - log_data['measurement']['memory_used_mb'] = memory_mb - -# Calculate timing -end = datetime.datetime.now().timestamp() -elapsed = time.perf_counter() - start -micro = elapsed * 1_000_000 # Convert to microseconds - -# Return response with top-level wrapper fields and nested measurement -return Response(json.dumps({ - 'begin': begin, - 'end': end, - 'compute_time': micro, # Not used by SeBS - 'results_time': 0, # Not used by SeBS - 'result': log_data, # Contains nested measurement - 'is_cold': False, # Not used by SeBS (uses measurement.is_cold) - 'is_cold_worker': False, # Not used by SeBS - 'container_id': "0", # Not used by SeBS - 'environ_container_id': "no_id", # Not used by SeBS - 'request_id': req_id -})) -``` - -### Response Schema - -Worker responses include these fields: - -#### Top-Level Fields (Wrapper Metadata) - -| Field | Type | Used by SeBS? 
| Purpose | -|-------|------|---------------|----------| -| `begin` | Float | ❌ No | Start timestamp (legacy) | -| `end` | Float | ❌ No | End timestamp (legacy) | -| `compute_time` | Float | ❌ No | Elapsed time measured by the wrapper around the benchmark call, in microseconds (benchmark plus wrapper overhead) | -| `results_time` | Float | ❌ No | Reserved for future use | -| `is_cold` | Boolean | ❌ No | Legacy field (use `measurement.is_cold`) | -| `is_cold_worker` | Boolean | ❌ No | Not used | -| `container_id` | String | ❌ No | Container identifier (informational) | -| `environ_container_id` | String | ❌ No | Environment container ID (informational) | -| `request_id` | String | ✅ Yes | Request identifier for tracking | -| `result` | Object | ✅ Yes | Contains `output` and `measurement` | - -#### Nested Measurement Fields (result.measurement) - -These are the **actual fields consumed by SeBS** from `result['result']['measurement']`: - -| Field | Type | Used by SeBS? | Purpose | Populated By | -|-------|------|---------------|---------|-------------| -| `cpu_time_us` | Integer | ✅ Yes | CPU time in microseconds | Benchmark function | -| `cpu_time_ms` | Float | ✅ Yes | CPU time in milliseconds (fallback) | Benchmark function | -| `wall_time_us` | Integer | ✅ Yes | Wall time in microseconds | Benchmark function | -| `wall_time_ms` | Float | ✅ Yes | Wall time in milliseconds (fallback) | Benchmark function | -| `is_cold` | Boolean | ✅ Yes | Cold start indicator (currently always `false` on Cloudflare) | Benchmark function | -| `memory_used_mb` | Float | ✅ Yes | Memory usage in megabytes | Wrapper (via resource.getrusage) | - -**Example Response Structure:** - -```json -{ - "begin": 1704556800.123, - "end": 1704556800.456, - "compute_time": 333000, - "results_time": 0, - "result": { - "output": { /* benchmark output */ }, - "measurement": { - "cpu_time_us": 150000, - "wall_time_us": 155000, - "is_cold": false, - "memory_used_mb": 45.2 - } - }, - "is_cold": false, - "is_cold_worker": false, - "container_id": "0", - "environ_container_id": "no_id", - "request_id": 
"cf-ray-abc123" -} -``` - -### Metrics Extraction Process - -Metrics extraction happens in two stages: - -#### Stage 1: HTTPTrigger.sync_invoke (Per-Invocation) - -In `sebs/cloudflare/triggers.py`, the `HTTPTrigger.sync_invoke()` method extracts metrics from **nested measurement data** immediately after each invocation: - -```python -def sync_invoke(self, payload: dict) -> ExecutionResult: - result = self._http_invoke(payload, self.url) - - # Extract measurement data from result.output['result']['measurement'] - if result.output and 'result' in result.output: - result_data = result.output['result'] - if isinstance(result_data, dict) and 'measurement' in result_data: - measurement = result_data['measurement'] - - if isinstance(measurement, dict): - # CPU time in microseconds (with ms fallback) - if 'cpu_time_us' in measurement: - result.provider_times.execution = measurement['cpu_time_us'] - elif 'cpu_time_ms' in measurement: - result.provider_times.execution = int(measurement['cpu_time_ms'] * 1000) - - # Wall time in microseconds (with ms fallback) - if 'wall_time_us' in measurement: - result.times.benchmark = measurement['wall_time_us'] - elif 'wall_time_ms' in measurement: - result.times.benchmark = int(measurement['wall_time_ms'] * 1000) - - # Cold start flag - if 'is_cold' in measurement: - result.stats.cold_start = measurement['is_cold'] - - # Memory usage - if 'memory_used_mb' in measurement: - result.stats.memory_used = measurement['memory_used_mb'] - - return result -``` - -**Note:** The top-level `compute_time` field is **ignored** by SeBS. Only the nested `measurement` object is used. 
- -#### Stage 2: download_metrics (Aggregation) - -When `download_metrics()` is called in `cloudflare.py`, SeBS aggregates the already-extracted metrics: - -```python -for request_id, result in requests.items(): - # Count cold/warm starts (from measurement.is_cold) - if result.stats.cold_start: - cold_starts += 1 - - # Collect CPU times (from measurement.cpu_time_us/ms) - if result.provider_times.execution > 0: - cpu_times.append(result.provider_times.execution) - - # Collect memory usage (from measurement.memory_used_mb) - if result.stats.memory_used is not None and result.stats.memory_used > 0: - memory_values.append(result.stats.memory_used) - - # Calculate billing - cpu_time_seconds = result.provider_times.execution / 1_000_000.0 - gb_seconds = (128.0 / 1024.0) * cpu_time_seconds - result.billing.gb_seconds = int(gb_seconds * 1_000_000) -``` - -### Implementation Notes - -1. **Immediate Availability**: Metrics are available immediately in the response (no delay) -2. **Wrapper Consistency**: All benchmark wrappers follow the same response schema -3. **Billing Calculations**: Based on Cloudflare's fixed 128MB memory allocation and CPU time -4. 
**Cold Start Detection**: Currently always reports `false` (Cloudflare doesn't expose cold start info) - -### Troubleshooting - -**Missing Metrics in Results**: -- Verify worker handler returns complete JSON response with all required fields -- Check that the nested `result.measurement` object (`cpu_time_us`/`wall_time_us`, or their `_ms` fallbacks) is present in the response; the top-level `compute_time`, `begin`, and `end` fields are not read by SeBS -- Ensure wrapper code hasn't been modified to remove metric collection -- Confirm response JSON is properly formatted - -**Incorrect Timing Values**: -- Verify `time.perf_counter()` is being used for microsecond precision -- Check that timing starts before benchmark execution and ends after -- Ensure no external fetch requests are inflating the measured time -- Confirm microsecond conversion (multiply seconds by 1,000,000) - -**Container Deployment Issues**: -- Ensure Docker is installed and running locally -- Verify wrangler CLI is installed (`npm install -g wrangler`) -- Check that @cloudflare/containers package is in dependencies -- Confirm Durable Objects bindings are correctly configured in wrangler.toml -- Ensure container image size is under Cloudflare's limits - -**Worker Deployment Failures**: -- Verify Cloudflare credentials are correctly configured -- Check account has Workers enabled (may require paid plan for some features) -- Ensure worker name doesn't conflict with existing workers -- Review wrangler logs for specific error messages - -### References - -- [Cloudflare Workers Runtime APIs](https://developers.cloudflare.com/workers/runtime-apis/) -- [Workers Bindings](https://developers.cloudflare.com/workers/configuration/bindings/) -- [Durable Objects Documentation](https://developers.cloudflare.com/durable-objects/) -- [R2 Storage Documentation](https://developers.cloudflare.com/r2/) - ---- - -## Container Support Architecture - -### Overview - -Cloudflare container support for Workers is integrated into SeBS using the `@cloudflare/containers` package, enabling deployment of containerized applications across multiple programming 
languages. - -### Implementation Details - -1. **Container Orchestration** - - Uses `@cloudflare/containers` npm package - - Requires Node.js worker.js wrapper for orchestration - - Container runs inside Durable Object for isolation - - Integrated with wrangler CLI for deployment - -2. **Deployment Process** - - `package_code()` generates wrangler.toml with container configuration - - Creates `[[migrations]]` entries for Durable Objects - - Binds container to `CONTAINER_WORKER` Durable Object class - - Uses `wrangler deploy` to upload both worker and container - -3. **Supported Languages** - - Python via Docker containers - - Node.js (both script and container) - - Go, Rust, Java (via container deployment) - - Any language that can run in a Linux container - -4. **Key Methods** - - `_generate_wrangler_toml()`: Creates config with container bindings - - `create_function()`: Deploys workers using wrangler CLI - - `update_function()`: Updates existing containerized workers - -### Benefits - -- **Multi-language Support**: Deploy Python, Java, Go, Rust workers -- **Complex Dependencies**: Support system libraries and compiled extensions -- **Larger Code Packages**: Overcome script size limitations -- **Consistent Environments**: Same container locally and in production - - -## References - -- [Cloudflare Workers Documentation](https://developers.cloudflare.com/workers/) -- [Cloudflare API Documentation](https://api.cloudflare.com/) -- [Workers API Reference](https://developers.cloudflare.com/workers/runtime-apis/) From b427c5b3c770e89d19d8436845f935b3f877efaf Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 12:33:18 +0100 Subject: [PATCH 060/230] more detail about download_metrics() --- docs/platforms.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/platforms.md b/docs/platforms.md index 45fc41e36..2b32d02d5 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -245,7 +245,7 @@ Container-based deployments use Cloudflare's 
container runtime and require the W | standard-2 | 1 | 6 GiB | 12 GB | | standard-3 | 2 | 8 GiB | 16 GB | | standard-4 | 4 | 12 GiB | 20 GB | -- **Metrics Collection**: Uses response-based per-invocation metrics. Cloudflare does expose an Analytics engine, but it only provides aggregated metrics, no individual request metrics. Which is useless for our benchmarking purposes. +- **Metrics Collection**: Uses response-based per-invocation metrics. During each function invocation, the worker handler measures performance metrics (CPU time, wall time, memory usage) and embeds them directly in the JSON response. SeBS extracts these metrics immediately from each response. When `download_metrics()` is called for postprocessing, it only aggregates the metrics that were already collected during invocations—no additional data is fetched from external services. This approach provides immediate per-invocation granularity without delays. Note that while Cloudflare does expose an Analytics Engine, it only provides aggregated metrics without individual request-level data, making it unsuitable for detailed benchmarking purposes. 
### Storage Configuration From 734eadf6ae02763fc301d96715628f940eaca204 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 14:13:10 +0100 Subject: [PATCH 061/230] Docker build container for build orchestration locally --- dockerfiles/cloudflare/Dockerfile.manage | 35 +++ docs/platforms.md | 4 +- sebs/cloudflare/cli.py | 249 +++++++++++++++++++ sebs/cloudflare/cloudflare.py | 289 +++++++++-------------- 4 files changed, 397 insertions(+), 180 deletions(-) create mode 100644 dockerfiles/cloudflare/Dockerfile.manage create mode 100644 sebs/cloudflare/cli.py diff --git a/dockerfiles/cloudflare/Dockerfile.manage b/dockerfiles/cloudflare/Dockerfile.manage new file mode 100644 index 000000000..ac18ac336 --- /dev/null +++ b/dockerfiles/cloudflare/Dockerfile.manage @@ -0,0 +1,35 @@ +FROM node:20-slim + +# Disable telemetry +ENV WRANGLER_SEND_METRICS=false + +# Install system dependencies including Docker CLI +RUN apt-get clean && apt-get update \ + && apt-get install -y ca-certificates curl gnupg gosu python3 python3-pip python3-venv git \ + && install -m 0755 -d /etc/apt/keyrings \ + && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \ + && chmod a+r /etc/apt/keyrings/docker.asc \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian bookworm stable" > /etc/apt/sources.list.d/docker.list \ + && apt-get update \ + && apt-get install -y docker-ce-cli \ + && apt-get purge -y --auto-remove \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install wrangler globally +RUN npm install -g wrangler + +# Install uv (fast Python package installer) and pywrangler +# uv install script puts the binary in ~/.local/bin by default (not ~/.cargo/bin) +RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ + /root/.local/bin/uv tool install workers-py + +# Add paths to environment +ENV 
PATH="/root/.local/bin:/root/.local/share/uv/tools/workers-py/bin:${PATH}" + +# Create working directory +RUN mkdir -p /sebs/ +COPY dockerfiles/entrypoint.sh /sebs/entrypoint.sh +RUN chmod +x /sebs/entrypoint.sh + +ENTRYPOINT ["/sebs/entrypoint.sh"] diff --git a/docs/platforms.md b/docs/platforms.md index 2b32d02d5..2b7ac8948 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -224,7 +224,9 @@ Cloudflare Workers support multiple languages through different deployment metho - **JavaScript/Node.js**: Supported via script-based deployment or container-based deployment using Wrangler CLI - **Python**: Supported via script-based deployment or container-based deployment using Wrangler CLI -Container-based deployments use Cloudflare's container runtime and require the Wrangler CLI to be installed (`npm install -g wrangler`). +### CLI Container + +SeBS uses a containerized CLI approach for Cloudflare deployments, eliminating the need to install Node.js, npm, wrangler, pywrangler, or uv on your host system. The CLI container (`sebs/manage.cloudflare`) is automatically built on first use and contains all necessary tools. This ensures consistent behavior across platforms and simplifies setup—only Docker is required. ### Trigger Support diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py new file mode 100644 index 000000000..426db5cac --- /dev/null +++ b/sebs/cloudflare/cli.py @@ -0,0 +1,249 @@ +import io +import logging +import os +import tarfile + +import docker + +from sebs.config import SeBSConfig +from sebs.utils import LoggingBase + + +class CloudflareCLI(LoggingBase): + """ + Manages a Docker container with Cloudflare Wrangler and related tools pre-installed. + + This approach isolates Cloudflare CLI tools (wrangler, pywrangler) from the host system, + avoiding global npm/uv installations and ensuring consistent behavior across platforms. 
+ """ + + def __init__(self, system_config: SeBSConfig, docker_client: docker.client): + super().__init__() + + repo_name = system_config.docker_repository() + image_name = "manage.cloudflare" + full_image_name = repo_name + ":" + image_name + + # Try to get the image, pull if not found, build if pull fails + try: + docker_client.images.get(full_image_name) + logging.info(f"Using existing Docker image: {full_image_name}") + except docker.errors.ImageNotFound: + # Try to pull the image first + try: + logging.info(f"Pulling Docker image {full_image_name}...") + docker_client.images.pull(repo_name, image_name) + logging.info(f"Successfully pulled {full_image_name}") + except docker.errors.APIError as pull_error: + # If pull fails, try to build the image locally + logging.info(f"Pull failed: {pull_error}. Building image locally...") + + # Find the Dockerfile path + dockerfile_path = os.path.join( + os.path.dirname(__file__), + "..", + "..", + "dockerfiles", + "cloudflare", + "Dockerfile.manage" + ) + + if not os.path.exists(dockerfile_path): + raise RuntimeError( + f"Dockerfile not found at {dockerfile_path}. " + "Cannot build Cloudflare CLI container." 
+ ) + + # Build the image + build_path = os.path.join(os.path.dirname(__file__), "..", "..") + logging.info(f"Building {full_image_name} from {dockerfile_path}...") + + try: + image, build_logs = docker_client.images.build( + path=build_path, + dockerfile=dockerfile_path, + tag=full_image_name, + rm=True, + pull=True + ) + + # Log build output + for log in build_logs: + if 'stream' in log: + logging.debug(log['stream'].strip()) + + logging.info(f"Successfully built {full_image_name}") + except docker.errors.BuildError as build_error: + raise RuntimeError( + f"Failed to build Docker image {full_image_name}: {build_error}" + ) + + # Start the container in detached mode + self.docker_instance = docker_client.containers.run( + image=full_image_name, + command="/bin/bash", + environment={ + "CONTAINER_UID": str(os.getuid()), + "CONTAINER_GID": str(os.getgid()), + "CONTAINER_USER": "docker_user", + }, + volumes={ + # Mount Docker socket for wrangler container deployments + "/var/run/docker.sock": {"bind": "/var/run/docker.sock", "mode": "rw"} + }, + remove=True, + stdout=True, + stderr=True, + detach=True, + tty=True, + ) + + self.logging.info(f"Started Cloudflare CLI container: {self.docker_instance.id}.") + + # Wait for container to be ready + while True: + try: + dkg = self.docker_instance.logs(stream=True, follow=True) + next(dkg).decode("utf-8") + break + except StopIteration: + pass + + @staticmethod + def typename() -> str: + return "Cloudflare.CLI" + + def execute(self, cmd: str, env: dict = None): + """ + Execute the given command in Cloudflare CLI container. + Throws an exception on failure (commands are expected to execute successfully). + + Args: + cmd: Shell command to execute + env: Optional environment variables dict + + Returns: + Command output as bytes + """ + # Wrap command in sh -c to support shell features like cd, pipes, etc. 
+ shell_cmd = ["/bin/sh", "-c", cmd] + exit_code, out = self.docker_instance.exec_run( + shell_cmd, + user="root", # Run as root since entrypoint creates docker_user but we don't wait for it + environment=env + ) + if exit_code != 0: + raise RuntimeError( + "Command {} failed at Cloudflare CLI docker!\n Output {}".format( + cmd, out.decode("utf-8") + ) + ) + return out + + def upload_package(self, directory: str, dest: str): + """ + Upload a directory to the Docker container. + + This is an inefficient and memory-intensive implementation. + So far, we didn't have very large functions that require many gigabytes. + + The whole archive is built in memory since docker-py does not support a + straightforward copy, and we can't put_archive in chunks. + + Args: + directory: Local directory to upload + dest: Destination path in container + """ + handle = io.BytesIO() + with tarfile.open(fileobj=handle, mode="w:gz") as tar: + for f in os.listdir(directory): + tar.add(os.path.join(directory, f), arcname=f) + + # Move to the beginning of memory before writing + handle.seek(0) + self.execute("mkdir -p {}".format(dest)) + self.docker_instance.put_archive(path=dest, data=handle.read()) + + def check_wrangler_version(self) -> str: + """ + Check wrangler version. + + Returns: + Version string + """ + out = self.execute("wrangler --version") + return out.decode("utf-8").strip() + + def check_pywrangler_version(self) -> str: + """ + Check pywrangler version. + + Returns: + Version string + """ + out = self.execute("pywrangler --version") + return out.decode("utf-8").strip() + + def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: + """ + Deploy a worker using wrangler. 
+ + Args: + package_dir: Path to package directory in container + env: Environment variables for deployment + + Returns: + Deployment output + """ + cmd = "cd {} && wrangler deploy".format(package_dir) + out = self.execute(cmd, env=env) + return out.decode("utf-8") + + def pywrangler_deploy(self, package_dir: str, env: dict = None) -> str: + """ + Deploy a Python worker using pywrangler. + + Args: + package_dir: Path to package directory in container + env: Environment variables for deployment + + Returns: + Deployment output + """ + cmd = "cd {} && pywrangler deploy".format(package_dir) + out = self.execute(cmd, env=env) + return out.decode("utf-8") + + def npm_install(self, package_dir: str) -> str: + """ + Run npm install in a directory. + + Args: + package_dir: Path to package directory in container + + Returns: + npm output + """ + cmd = "cd {} && npm install".format(package_dir) + out = self.execute(cmd) + return out.decode("utf-8") + + def docker_build(self, package_dir: str, image_tag: str) -> str: + """ + Build a Docker image for container deployment. 
+ + Args: + package_dir: Path to package directory in container + image_tag: Tag for the Docker image + + Returns: + Docker build output + """ + cmd = "cd {} && docker build --no-cache -t {} .".format(package_dir, image_tag) + out = self.execute(cmd) + return out.decode("utf-8") + + def shutdown(self): + """Shutdown Docker instance.""" + self.logging.info("Stopping Cloudflare CLI Docker instance") + self.docker_instance.stop() diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index ba76429c1..a518bdd74 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -10,6 +10,7 @@ import docker import requests +from sebs.cloudflare.cli import CloudflareCLI from sebs.cloudflare.config import CloudflareConfig from sebs.cloudflare.function import CloudflareWorker from sebs.cloudflare.resources import CloudflareSystemResources @@ -65,10 +66,12 @@ def __init__( self.logging_handlers = logger_handlers self._config = config self._api_base_url = "https://api.cloudflare.com/client/v4" - # cached workers.dev subdomain for the account (e.g. 
'marcin-copik') + # cached workers.dev subdomain for the account # This is different from the account ID and is required to build # public worker URLs like ..workers.dev self._workers_dev_subdomain: Optional[str] = None + # Initialize CLI container for wrangler/pywrangler operations + self._cli: Optional[CloudflareCLI] = None def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): """ @@ -154,74 +157,15 @@ def _verify_credentials(self): ) self.logging.info("Cloudflare credentials verified successfully") - - def _ensure_wrangler_installed(self): - """Ensure Wrangler CLI is installed and available.""" - try: - result = subprocess.run( - ["wrangler", "--version"], - capture_output=True, - text=True, - check=True, - timeout=10 - ) - version = result.stdout.strip() - self.logging.info(f"Wrangler is installed: {version}") - except (subprocess.CalledProcessError, FileNotFoundError): - self.logging.info("Wrangler not found, installing globally via npm...") - try: - result = subprocess.run( - ["npm", "install", "-g", "wrangler"], - capture_output=True, - text=True, - check=True, - timeout=120 - ) - self.logging.info("Wrangler installed successfully") - if result.stdout: - self.logging.debug(f"npm install wrangler output: {result.stdout}") - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to install Wrangler: {e.stderr}") - except FileNotFoundError: - raise RuntimeError( - "npm not found. Please install Node.js and npm to use Wrangler for deployment." 
- ) - except subprocess.TimeoutExpired: - raise RuntimeError("Wrangler version check timed out") - - def _ensure_pywrangler_installed(self): - """Necessary to download python dependencies""" - try: - result = subprocess.run( - ["pywrangler", "--version"], - capture_output=True, - text=True, - check=True, - timeout=10 - ) - version = result.stdout.strip() - self.logging.info(f"pywrangler is installed: {version}") - except (subprocess.CalledProcessError, FileNotFoundError): - self.logging.info("pywrangler not found, installing globally via uv tool install...") - try: - result = subprocess.run( - ["uv", "tool", "install", "workers-py"], - capture_output=True, - text=True, - check=True, - timeout=120 - ) - self.logging.info("pywrangler installed successfully") - if result.stdout: - self.logging.debug(f"uv tool install workers-py output: {result.stdout}") - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to install pywrangler: {e.stderr}") - except FileNotFoundError: - raise RuntimeError( - "uv not found. Please install uv." 
- ) - except subprocess.TimeoutExpired: - raise RuntimeError("pywrangler version check timed out") + + def _get_cli(self) -> CloudflareCLI: + """Get or initialize the Cloudflare CLI container.""" + if self._cli is None: + self._cli = CloudflareCLI(self.system_config, self.docker_client) + # Verify wrangler is available + version = self._cli.check_wrangler_version() + self.logging.info(f"Cloudflare CLI container ready: {version}") + return self._cli def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, container_uri: str = "") -> str: @@ -241,23 +185,10 @@ def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: Returns: Path to the generated wrangler.toml file """ - # Container deployment configuration if container_deployment: - # Containers ALWAYS use Node.js worker.js for orchestration (@cloudflare/containers is Node.js only) - # The container itself can run any language (Python, Node.js, etc.) 
- # R2 and NoSQL access is proxied through worker.js which has the bindings - - # Determine if this benchmark needs larger disk space - # 411.image-recognition needs more disk for PyTorch models - # 311.compression needs more disk for file compression operations - # 504.dna-visualisation needs more disk for DNA sequence processing - # Python containers need even more space due to zip file creation doubling disk usage instance_type = "" if benchmark_name and ("411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name): - # Use "standard" (largest) for Python, "standard-4" for Node.js - # if language == "python": - # instance_type = '\ninstance_type = "standard-4" # Largest available - needed for Python zip operations\n' - # else: + self.logging.warning("Using standard-4 instance type for high resource benchmark") instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' toml_content = f"""name = "{worker_name}" @@ -467,52 +398,47 @@ def _package_code_native( # Install dependencies if language_name == "nodejs": - # Ensure Wrangler is installed - self._ensure_wrangler_installed() - package_file = os.path.join(directory, "package.json") node_modules = os.path.join(directory, "node_modules") # Only install if package.json exists and node_modules doesn't if os.path.exists(package_file) and not os.path.exists(node_modules): self.logging.info(f"Installing Node.js dependencies in {directory}") + # Use CLI container for npm install - no Node.js/npm needed on host + cli = self._get_cli() + container_path = f"/tmp/npm_install/{os.path.basename(directory)}" + try: + # Upload package directory to container + cli.upload_package(directory, container_path) + # Install production dependencies - result = subprocess.run( - ["npm", "install"], - cwd=directory, - capture_output=True, - text=True, - check=True, - timeout=120 - ) + self.logging.info("Installing npm dependencies in container...") + output 
= cli.npm_install(container_path) self.logging.info("npm install completed successfully") - if result.stdout: - self.logging.debug(f"npm output: {result.stdout}") + self.logging.debug(f"npm output: {output}") # Install esbuild as a dev dependency (needed by build.js) self.logging.info("Installing esbuild for custom build script...") - result = subprocess.run( - ["npm", "install", "--save-dev", "esbuild"], - cwd=directory, - capture_output=True, - text=True, - check=True, - timeout=60 - ) + cli.execute(f"cd {container_path} && npm install --save-dev esbuild") self.logging.info("esbuild installed successfully") + + # Download node_modules back to host + import tarfile + import io + bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + + self.logging.info(f"Downloaded node_modules to {directory}") - - except subprocess.TimeoutExpired: - self.logging.error("npm install timed out") - raise RuntimeError("Failed to install Node.js dependencies: timeout") - except subprocess.CalledProcessError as e: - self.logging.error(f"npm install failed: {e.stderr}") - raise RuntimeError(f"Failed to install Node.js dependencies: {e.stderr}") - except FileNotFoundError: - raise RuntimeError( - "npm not found. Please install Node.js and npm to deploy Node.js benchmarks." 
- ) + except Exception as e: + self.logging.error(f"npm install in container failed: {e}") + raise RuntimeError(f"Failed to install Node.js dependencies: {e}") elif os.path.exists(node_modules): self.logging.info(f"Node.js dependencies already installed in {directory}") @@ -520,22 +446,29 @@ def _package_code_native( esbuild_path = os.path.join(node_modules, "esbuild") if not os.path.exists(esbuild_path): self.logging.info("Installing esbuild for custom build script...") + cli = self._get_cli() + container_path = f"/tmp/npm_install/{os.path.basename(directory)}" + try: - subprocess.run( - ["npm", "install", "--save-dev", "esbuild"], - cwd=directory, - capture_output=True, - text=True, - check=True, - timeout=60 - ) + cli.upload_package(directory, container_path) + cli.execute(f"cd {container_path} && npm install --save-dev esbuild") + + # Download node_modules back + import tarfile + import io + bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + self.logging.info("esbuild installed successfully") except Exception as e: self.logging.warning(f"Failed to install esbuild: {e}") elif language_name == "python": - # Ensure Wrangler is installed - self._ensure_pywrangler_installed() requirements_file = os.path.join(directory, "requirements.txt") if os.path.exists(f"{requirements_file}.{language_version}"): @@ -893,16 +826,31 @@ def replacer(match): # Install Node.js dependencies (needed for all containers for worker.js) self.logging.info(f"Installing @cloudflare/containers for worker.js orchestration in {directory}") + # Use CLI container for npm install - no Node.js/npm needed on host + cli = self._get_cli() + container_path = f"/tmp/container_npm/{os.path.basename(directory)}" + try: - result = subprocess.run( - ["npm", "install", "--production"], - cwd=directory, - 
capture_output=True, - text=True, - check=True, - timeout=120 - ) + # Upload package directory to container + cli.upload_package(directory, container_path) + + # Install production dependencies + output = cli.execute(f"cd {container_path} && npm install --production") self.logging.info("npm install completed successfully") + self.logging.debug(f"npm output: {output.decode('utf-8')}") + + # Download node_modules back to host + import tarfile + import io + bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + + self.logging.info(f"Downloaded node_modules to {directory}") except Exception as e: self.logging.error(f"npm install failed: {e}") raise RuntimeError(f"Failed to install Node.js dependencies: {e}") @@ -1122,7 +1070,7 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: def _create_or_update_worker( self, worker_name: str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, container_uri: str = "" ) -> dict: - """Create or update a Cloudflare Worker using Wrangler CLI. + """Create or update a Cloudflare Worker using Wrangler CLI in container. 
Args: worker_name: Name of the worker @@ -1140,14 +1088,8 @@ def _create_or_update_worker( # Generate wrangler.toml for this worker self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) - # Set up environment for Wrangler - env = os.environ.copy() - - # Add uv tools bin directory to PATH for pywrangler access - home_dir = os.path.expanduser("~") - uv_bin_dir = os.path.join(home_dir, ".local", "share", "uv", "tools", "workers-py", "bin") - if os.path.exists(uv_bin_dir): - env['PATH'] = f"{uv_bin_dir}:{env.get('PATH', '')}" + # Set up environment for Wrangler CLI in container + env = {} if self.config.credentials.api_token: env['CLOUDFLARE_API_TOKEN'] = self.config.credentials.api_token @@ -1157,34 +1099,27 @@ def _create_or_update_worker( env['CLOUDFLARE_ACCOUNT_ID'] = account_id - # Deploy using Wrangler - self.logging.info(f"Deploying worker {worker_name} using Wrangler...") + # Get CLI container instance + cli = self._get_cli() - # For container deployments, always use wrangler (not pywrangler) - # For native deployments, use wrangler for nodejs, pywrangler for python - if container_deployment: - wrangler_cmd = "wrangler" - else: - wrangler_cmd = "wrangler" if language == "nodejs" else "pywrangler" + # Upload package directory to container + container_package_path = f"/tmp/workers/{worker_name}" + self.logging.info(f"Uploading package to container: {container_package_path}") + cli.upload_package(package_dir, container_package_path) + + # Deploy using Wrangler in container + self.logging.info(f"Deploying worker {worker_name} using Wrangler in container...") try: - # Increase timeout for large container images (e.g., 411.image-recognition with PyTorch) - # Container deployment requires pushing large images to Cloudflare - deploy_timeout = 1200 if container_deployment else 180 # 20 minutes for containers, 3 for native - - result = subprocess.run( - [wrangler_cmd, "deploy"], - 
cwd=package_dir, - env=env, - capture_output=True, - text=True, - check=True, - timeout=deploy_timeout - ) + # For container deployments, always use wrangler (not pywrangler) + # For native deployments, use wrangler for nodejs, pywrangler for python + if container_deployment or language == "nodejs": + output = cli.wrangler_deploy(container_package_path, env=env) + else: # python native + output = cli.pywrangler_deploy(container_package_path, env=env) self.logging.info(f"Worker {worker_name} deployed successfully") - if result.stdout: - self.logging.debug(f"Wrangler deploy output: {result.stdout}") + self.logging.debug(f"Wrangler deploy output: {output}") # For container deployments, wait for Durable Object infrastructure to initialize # The container binding needs time to propagate before first invocation @@ -1192,11 +1127,6 @@ def _create_or_update_worker( self.logging.info("Waiting for container Durable Object to initialize...") self._wait_for_durable_object_ready(worker_name, package_dir, env) - # for benchmarks 220, 311, 411 we need to wait longer after deployment - # if benchmark_name in ["220.video-processing", "311.compression", "411.image-recognition", "504.dna-visualisation"]: - # self.logging.info("Waiting 120 seconds for benchmark initialization...") - # time.sleep(400) - # For container deployments, wait for Durable Object infrastructure to initialize # The container binding needs time to propagate before first invocation if container_deployment: @@ -1207,14 +1137,10 @@ def _create_or_update_worker( # Wrangler typically outputs: "Published ()" # and "https://..workers.dev" - return {"success": True, "output": result.stdout} + return {"success": True, "output": output} - except subprocess.TimeoutExpired: - raise RuntimeError(f"Wrangler deployment timed out for worker {worker_name}") - except subprocess.CalledProcessError as e: - error_msg = f"Wrangler deployment failed for worker {worker_name}" - if e.stderr: - error_msg += f": {e.stderr}" + except 
RuntimeError as e: + error_msg = f"Wrangler deployment failed for worker {worker_name}: {str(e)}" self.logging.error(error_msg) raise RuntimeError(error_msg) @@ -1288,7 +1214,7 @@ def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: subdomain (the readable name used in *.workers.dev), e.g. GET /accounts/{account_id}/workers/subdomain - Returns the subdomain string (e.g. 'marcin-copik') or None on failure. + Returns the subdomain string or None on failure. """ if self._workers_dev_subdomain: return self._workers_dev_subdomain @@ -1642,10 +1568,15 @@ def shutdown(self) -> None: """ Shutdown the Cloudflare system. - Saves configuration to cache. + Saves configuration to cache and shuts down CLI container. """ try: self.cache_client.lock() self.config.update_cache(self.cache_client) finally: self.cache_client.unlock() + + # Shutdown CLI container if it was initialized + if self._cli is not None: + self._cli.shutdown() + self._cli = None From 5ffcb06bbf78fb26ca333cd67c4cbe0330c700f0 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 14:26:49 +0100 Subject: [PATCH 062/230] using docker client to build local image --- sebs/cloudflare/cloudflare.py | 38 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index a518bdd74..df65cfa35 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -941,32 +941,34 @@ def _build_container_image_local( self.logging.info(f"Building local container image: {image_tag}") try: - # Build the Docker image locally (no push) - # Use --no-cache to ensure handler changes are picked up - # Note: BASE_IMAGE is already set in the Dockerfile, no need to pass as build arg - result = subprocess.run( - ["docker", "build", "--no-cache", "-t", image_tag, "."], - cwd=directory, - capture_output=True, - text=True, - check=True, - timeout=300 # 5 minutes for build + # Build the Docker image using 
docker-py + # nocache=True ensures handler changes are picked up + _, build_logs = self.docker_client.images.build( + path=directory, + tag=image_tag, + nocache=True, + rm=True ) + # Log build output + for log in build_logs: + if 'stream' in log: + self.logging.debug(log['stream'].strip()) + elif 'error' in log: + self.logging.error(log['error'].strip()) + self.logging.info(f"Local container image built: {image_tag}") - if result.stdout: - self.logging.debug(f"Docker build output: {result.stdout}") return image_tag - except subprocess.CalledProcessError as e: - error_msg = f"Docker build failed for {image_tag}" - if e.stderr: - error_msg += f": {e.stderr}" + except docker.errors.BuildError as e: + error_msg = f"Docker build failed for {image_tag}: {e}" + self.logging.error(error_msg) + raise RuntimeError(error_msg) + except Exception as e: + error_msg = f"Unexpected error building Docker image {image_tag}: {e}" self.logging.error(error_msg) raise RuntimeError(error_msg) - except subprocess.TimeoutExpired: - raise RuntimeError(f"Docker build timed out for {image_tag}") def create_function( self, From 4da0c31e990beb02cf8059dcb5e1eb020869f101 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 14:27:37 +0100 Subject: [PATCH 063/230] do not create library trigger --- sebs/cloudflare/cloudflare.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index df65cfa35..4cbbe768e 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1035,12 +1035,8 @@ def create_function( account_id, ) - # Add LibraryTrigger and HTTPTrigger - from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger - - library_trigger = LibraryTrigger(func_name, self) - library_trigger.logging_handlers = self.logging_handlers - worker.add_trigger(library_trigger) + # Add HTTPTrigger + from sebs.cloudflare.triggers import HTTPTrigger # Build worker URL using the account's 
workers.dev subdomain when possible. # Falls back to account_id-based host or plain workers.dev with warnings. From 48747940a6f283941f561191072dd24e9856dd59 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 14:34:03 +0100 Subject: [PATCH 064/230] removed some deprecated logging, throw exception if cold start is used --- sebs/cloudflare/cloudflare.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 4cbbe768e..dba044f92 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1119,22 +1119,16 @@ def _create_or_update_worker( self.logging.info(f"Worker {worker_name} deployed successfully") self.logging.debug(f"Wrangler deploy output: {output}") - # For container deployments, wait for Durable Object infrastructure to initialize # The container binding needs time to propagate before first invocation if container_deployment: self.logging.info("Waiting for container Durable Object to initialize...") self._wait_for_durable_object_ready(worker_name, package_dir, env) - # For container deployments, wait for Durable Object infrastructure to initialize # The container binding needs time to propagate before first invocation if container_deployment: - self.logging.info("Waiting 60 seconds for container Durable Object to initialize...") + self.logging.info("Waiting 60 seconds for container to be fully provisioned (can sometimes take a bit longer)...") time.sleep(60) - # Parse the output to get worker URL - # Wrangler typically outputs: "Published ()" - # and "https://..workers.dev" - return {"success": True, "output": output} except RuntimeError as e: @@ -1414,10 +1408,11 @@ def enforce_cold_start(self, functions: List[Function], code_package: Benchmark) functions: List of functions to enforce cold start on code_package: The benchmark package """ - self.logging.warning( + raise NotImplementedError( "Cloudflare Workers do not support forced cold 
starts. " "Workers are automatically instantiated on-demand at edge locations." ) + def download_metrics( self, From 6b8e695ba2cd639f19508e81d95e5773ee1f4864 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 16:13:22 +0100 Subject: [PATCH 065/230] split the cloudflare.py into containers.py + workers.py... each handling their own package & deployment process orchestrated by main entry point cloudflare.py --- sebs/cloudflare/cloudflare.py | 916 +++------------------------------- sebs/cloudflare/containers.py | 547 ++++++++++++++++++++ sebs/cloudflare/workers.py | 376 ++++++++++++++ 3 files changed, 1001 insertions(+), 838 deletions(-) create mode 100644 sebs/cloudflare/containers.py create mode 100644 sebs/cloudflare/workers.py diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index dba044f92..e61b4d4aa 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1,8 +1,5 @@ import os -import shutil -import json import uuid -import subprocess import time from datetime import datetime from typing import cast, Dict, List, Optional, Tuple, Type @@ -10,10 +7,11 @@ import docker import requests -from sebs.cloudflare.cli import CloudflareCLI from sebs.cloudflare.config import CloudflareConfig from sebs.cloudflare.function import CloudflareWorker from sebs.cloudflare.resources import CloudflareSystemResources +from sebs.cloudflare.workers import CloudflareWorkersDeployment +from sebs.cloudflare.containers import CloudflareContainersDeployment from sebs.benchmark import Benchmark from sebs.cache import Cache from sebs.config import SeBSConfig @@ -70,8 +68,14 @@ def __init__( # This is different from the account ID and is required to build # public worker URLs like ..workers.dev self._workers_dev_subdomain: Optional[str] = None - # Initialize CLI container for wrangler/pywrangler operations - self._cli: Optional[CloudflareCLI] = None + + # Initialize deployment handlers + self._workers_deployment = 
CloudflareWorkersDeployment( + self.logging, sebs_config, docker_client, self.system_resources + ) + self._containers_deployment = CloudflareContainersDeployment( + self.logging, sebs_config, docker_client, self.system_resources + ) def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): """ @@ -158,193 +162,20 @@ def _verify_credentials(self): self.logging.info("Cloudflare credentials verified successfully") - def _get_cli(self) -> CloudflareCLI: - """Get or initialize the Cloudflare CLI container.""" - if self._cli is None: - self._cli = CloudflareCLI(self.system_config, self.docker_client) - # Verify wrangler is available - version = self._cli.check_wrangler_version() - self.logging.info(f"Cloudflare CLI container ready: {version}") - return self._cli - - - def _generate_wrangler_toml(self, worker_name: str, package_dir: str, language: str, account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, container_uri: str = "") -> str: - """ - Generate a wrangler.toml configuration file for the worker. - + def _get_deployment_handler(self, container_deployment: bool): + """Get the appropriate deployment handler based on deployment type. 
+ Args: - worker_name: Name of the worker - package_dir: Directory containing the worker code - language: Programming language (nodejs or python) - account_id: Cloudflare account ID - benchmark_name: Optional benchmark name for R2 file path prefix - code_package: Optional benchmark package for nosql configuration container_deployment: Whether this is a container deployment - container_uri: Container image URI/tag - + Returns: - Path to the generated wrangler.toml file + CloudflareWorkersDeployment or CloudflareContainersDeployment """ if container_deployment: - instance_type = "" - if benchmark_name and ("411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name): - self.logging.warning("Using standard-4 instance type for high resource benchmark") - instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' - - toml_content = f"""name = "{worker_name}" -main = "worker.js" -compatibility_date = "2025-11-18" -account_id = "{account_id}" -compatibility_flags = ["nodejs_compat"] - -[observability] -enabled = true - -[[containers]] -max_instances = 10 -class_name = "ContainerWorker" -image = "./Dockerfile"{instance_type} - -# Durable Object binding for Container class (required by @cloudflare/containers) -[[durable_objects.bindings]] -name = "CONTAINER_WORKER" -class_name = "ContainerWorker" - -""" - # Add nosql table bindings if benchmark uses them - if code_package and code_package.uses_nosql: - # Get registered nosql tables for this benchmark - nosql_storage = self.system_resources.get_nosql_storage() - if nosql_storage.retrieve_cache(benchmark_name): - nosql_tables = nosql_storage._tables.get(benchmark_name, {}) - for table_name in nosql_tables.keys(): - toml_content += f"""[[durable_objects.bindings]] -name = "{table_name}" -class_name = "KVApiObject" - -""" - self.logging.info(f"Added Durable Object binding for nosql table '{table_name}'") - - # Add migrations for both 
ContainerWorker and KVApiObject - # Both need new_sqlite_classes (Container requires SQLite DO backend) - toml_content += """[[migrations]] -tag = "v1" -new_sqlite_classes = ["ContainerWorker", "KVApiObject"] - -""" - else: - # Container without nosql - only ContainerWorker migration - toml_content += """[[migrations]] -tag = "v1" -new_sqlite_classes = ["ContainerWorker"] - -""" + return self._containers_deployment else: - # Native worker configuration - main_file = "dist/handler.js" if language == "nodejs" else "handler.py" - - # Build wrangler.toml content - toml_content = f"""name = "{worker_name}" -main = "{main_file}" -compatibility_date = "2025-11-18" -account_id = "{account_id}" -""" - - if language == "nodejs": - toml_content += """# Use nodejs_compat for Node.js built-in support -compatibility_flags = ["nodejs_compat"] -no_bundle = true - -[build] -command = "node build.js" - -[[rules]] -type = "ESModule" -globs = ["**/*.js"] -fallthrough = true - -[[rules]] -type = "Text" -globs = ["**/*.html"] -fallthrough = true - -""" - elif language == "python": - toml_content += """# Enable Python Workers runtime -compatibility_flags = ["python_workers"] -""" - - toml_content += """ -[[durable_objects.bindings]] -name = "DURABLE_STORE" -class_name = "KVApiObject" - -[[migrations]] -tag = "v3" -new_classes = ["KVApiObject"] -""" - - - # Add environment variables (for both native and container deployments) - vars_content = "" - if benchmark_name: - vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' - - # Add nosql configuration if benchmark uses it - if code_package and code_package.uses_nosql: - vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' - - if vars_content: - toml_content += f"""# Environment variables -[vars] -{vars_content} -""" - - # Add R2 bucket binding for benchmarking files (for both native and container deployments) - r2_bucket_configured = False - try: - storage = self.system_resources.get_storage() - bucket_name = 
storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) - if bucket_name: - toml_content += f"""# R2 bucket binding for benchmarking files -# This bucket is used by fs and path polyfills to read benchmark data -[[r2_buckets]] -binding = "R2" -bucket_name = "{bucket_name}" - -""" - r2_bucket_configured = True - self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") - except Exception as e: - self.logging.warning( - f"R2 bucket binding not configured: {e}. " - f"Benchmarks requiring file access will not work properly." - ) - - - # Write wrangler.toml to package directory - toml_path = os.path.join(package_dir, "wrangler.toml") - with open(toml_path, 'w') as f: - f.write(toml_content) - - self.logging.info(f"Generated wrangler.toml at {toml_path}") - return toml_path + return self._workers_deployment - def _get_auth_headers(self) -> Dict[str, str]: - """Get authentication headers for Cloudflare API requests.""" - if self.config.credentials.api_token: - return { - "Authorization": f"Bearer {self.config.credentials.api_token}", - "Content-Type": "application/json", - } - elif self.config.credentials.email and self.config.credentials.api_key: - return { - "X-Auth-Email": self.config.credentials.email, - "X-Auth-Key": self.config.credentials.api_key, - "Content-Type": "application/json", - } - else: - raise RuntimeError("Invalid Cloudflare credentials configuration") def package_code( self, @@ -360,6 +191,8 @@ def package_code( Package code for Cloudflare Workers deployment using Wrangler. Uses Wrangler CLI to bundle dependencies and prepare for deployment. + Delegates to either CloudflareWorkersDeployment or CloudflareContainersDeployment + based on the deployment type. 
Args: directory: Path to the code directory @@ -373,602 +206,68 @@ def package_code( Returns: Tuple of (package_path, package_size, container_uri) """ + handler = self._get_deployment_handler(container_deployment) + # Container deployment flow - build Docker image if container_deployment: self.logging.info(f"Building container image for {benchmark}") - return self._package_code_container( + return handler.package_code( directory, language_name, language_version, architecture, benchmark ) - # Native worker deployment flow (existing logic) - return self._package_code_native( + # Native worker deployment flow + return handler.package_code( directory, language_name, language_version, benchmark, is_cached ) + def _get_auth_headers(self) -> Dict[str, str]: + """Get authentication headers for Cloudflare API requests.""" + if self.config.credentials.api_token: + return { + "Authorization": f"Bearer {self.config.credentials.api_token}", + "Content-Type": "application/json", + } + elif self.config.credentials.email and self.config.credentials.api_key: + return { + "X-Auth-Email": self.config.credentials.email, + "X-Auth-Key": self.config.credentials.api_key, + "Content-Type": "application/json", + } + else: + raise RuntimeError("Invalid Cloudflare credentials configuration") - def _package_code_native( + def _generate_wrangler_toml( self, - directory: str, - language_name: str, - language_version: str, - benchmark: str, - is_cached: bool, - ) -> Tuple[str, int, str]: - """Package code for native Cloudflare Workers deployment.""" - - # Install dependencies - if language_name == "nodejs": - package_file = os.path.join(directory, "package.json") - node_modules = os.path.join(directory, "node_modules") - - # Only install if package.json exists and node_modules doesn't - if os.path.exists(package_file) and not os.path.exists(node_modules): - self.logging.info(f"Installing Node.js dependencies in {directory}") - # Use CLI container for npm install - no Node.js/npm needed on host - 
cli = self._get_cli() - container_path = f"/tmp/npm_install/{os.path.basename(directory)}" - - try: - # Upload package directory to container - cli.upload_package(directory, container_path) - - # Install production dependencies - self.logging.info("Installing npm dependencies in container...") - output = cli.npm_install(container_path) - self.logging.info("npm install completed successfully") - self.logging.debug(f"npm output: {output}") - - # Install esbuild as a dev dependency (needed by build.js) - self.logging.info("Installing esbuild for custom build script...") - cli.execute(f"cd {container_path} && npm install --save-dev esbuild") - self.logging.info("esbuild installed successfully") - - # Download node_modules back to host - import tarfile - import io - bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") - file_obj = io.BytesIO() - for chunk in bits: - file_obj.write(chunk) - file_obj.seek(0) - with tarfile.open(fileobj=file_obj) as tar: - tar.extractall(directory) - - self.logging.info(f"Downloaded node_modules to {directory}") - - except Exception as e: - self.logging.error(f"npm install in container failed: {e}") - raise RuntimeError(f"Failed to install Node.js dependencies: {e}") - elif os.path.exists(node_modules): - self.logging.info(f"Node.js dependencies already installed in {directory}") - - # Ensure esbuild is available even for cached installations - esbuild_path = os.path.join(node_modules, "esbuild") - if not os.path.exists(esbuild_path): - self.logging.info("Installing esbuild for custom build script...") - cli = self._get_cli() - container_path = f"/tmp/npm_install/{os.path.basename(directory)}" - - try: - cli.upload_package(directory, container_path) - cli.execute(f"cd {container_path} && npm install --save-dev esbuild") - - # Download node_modules back - import tarfile - import io - bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") - file_obj = io.BytesIO() - for chunk in bits: - 
file_obj.write(chunk) - file_obj.seek(0) - with tarfile.open(fileobj=file_obj) as tar: - tar.extractall(directory) - - self.logging.info("esbuild installed successfully") - except Exception as e: - self.logging.warning(f"Failed to install esbuild: {e}") - - elif language_name == "python": - - requirements_file = os.path.join(directory, "requirements.txt") - if os.path.exists(f"{requirements_file}.{language_version}"): - src = f"{requirements_file}.{language_version}" - dest = requirements_file - shutil.move(src, dest) - self.logging.info(f"move {src} to {dest}") - - - - # move function_cloudflare.py into function.py - function_cloudflare_file = os.path.join(directory, "function_cloudflare.py") - if os.path.exists(function_cloudflare_file): - src = function_cloudflare_file - dest = os.path.join(directory, "function.py") - shutil.move(src, dest) - self.logging.info(f"move {src} to {dest}") - - if os.path.exists(requirements_file): - with open(requirements_file, 'r') as reqf: - reqtext = reqf.read() - supported_pkg = \ -['affine', 'aiohappyeyeballs', 'aiohttp', 'aiosignal', 'altair', 'annotated-types',\ -'anyio', 'apsw', 'argon2-cffi', 'argon2-cffi-bindings', 'asciitree', 'astropy', 'astropy_iers_data',\ -'asttokens', 'async-timeout', 'atomicwrites', 'attrs', 'audioop-lts', 'autograd', 'awkward-cpp', 'b2d',\ -'bcrypt', 'beautifulsoup4', 'bilby.cython', 'biopython', 'bitarray', 'bitstring', 'bleach', 'blosc2', 'bokeh',\ -'boost-histogram', 'brotli', 'cachetools', 'casadi', 'cbor-diag', 'certifi', 'cffi', 'cffi_example', 'cftime',\ -'charset-normalizer', 'clarabel', 'click', 'cligj', 'clingo', 'cloudpickle', 'cmyt', 'cobs', 'colorspacious',\ -'contourpy', 'coolprop', 'coverage', 'cramjam', 'crc32c', 'cryptography', 'css-inline', 'cssselect', 'cvxpy-base', 'cycler',\ -'cysignals', 'cytoolz', 'decorator', 'demes', 'deprecation', 'diskcache', 'distlib', 'distro', 'docutils', 'donfig',\ -'ewah_bool_utils', 'exceptiongroup', 'executing', 'fastapi', 'fastcan', 'fastparquet', 
'fiona', 'fonttools', 'freesasa',\ -'frozenlist', 'fsspec', 'future', 'galpy', 'gmpy2', 'gsw', 'h11', 'h3', 'h5py', 'highspy', 'html5lib', 'httpcore',\ -'httpx', 'idna', 'igraph', 'imageio', 'imgui-bundle', 'iminuit', 'iniconfig', 'inspice', 'ipython', 'jedi', 'Jinja2',\ -'jiter', 'joblib', 'jsonpatch', 'jsonpointer', 'jsonschema', 'jsonschema_specifications', 'kiwisolver',\ -'lakers-python', 'lazy_loader', 'lazy-object-proxy', 'libcst', 'lightgbm', 'logbook', 'lxml', 'lz4', 'MarkupSafe',\ -'matplotlib', 'matplotlib-inline', 'memory-allocator', 'micropip', 'mmh3', 'more-itertools', 'mpmath',\ -'msgpack', 'msgspec', 'msprime', 'multidict', 'munch', 'mypy', 'narwhals', 'ndindex', 'netcdf4', 'networkx',\ -'newick', 'nh3', 'nlopt', 'nltk', 'numcodecs', 'numpy', 'openai', 'opencv-python', 'optlang', 'orjson',\ -'packaging', 'pandas', 'parso', 'patsy', 'pcodec', 'peewee', 'pi-heif', 'Pillow', 'pillow-heif', 'pkgconfig',\ -'platformdirs', 'pluggy', 'ply', 'pplpy', 'primecountpy', 'prompt_toolkit', 'propcache', 'protobuf', 'pure-eval',\ -'py', 'pyclipper', 'pycparser', 'pycryptodome', 'pydantic', 'pydantic_core', 'pyerfa', 'pygame-ce', 'Pygments',\ -'pyheif', 'pyiceberg', 'pyinstrument', 'pylimer-tools', 'PyMuPDF', 'pynacl', 'pyodide-http', 'pyodide-unix-timezones',\ -'pyparsing', 'pyrsistent', 'pysam', 'pyshp', 'pytaglib', 'pytest', 'pytest-asyncio', 'pytest-benchmark', 'pytest_httpx',\ -'python-calamine', 'python-dateutil', 'python-flint', 'python-magic', 'python-sat', 'python-solvespace', 'pytz', 'pywavelets',\ -'pyxel', 'pyxirr', 'pyyaml', 'rasterio', 'rateslib', 'rebound', 'reboundx', 'referencing', 'regex', 'requests',\ -'retrying', 'rich', 'river', 'RobotRaconteur', 'rpds-py', 'ruamel.yaml', 'rustworkx', 'scikit-image', 'scikit-learn',\ -'scipy', 'screed', 'setuptools', 'shapely', 'simplejson', 'sisl', 'six', 'smart-open', 'sniffio', 'sortedcontainers',\ -'soundfile', 'soupsieve', 'sourmash', 'soxr', 'sparseqr', 'sqlalchemy', 'stack-data', 'starlette', 
'statsmodels', 'strictyaml',\ -'svgwrite', 'swiglpk', 'sympy', 'tblib', 'termcolor', 'texttable', 'texture2ddecoder', 'threadpoolctl', 'tiktoken', 'tomli',\ -'tomli-w', 'toolz', 'tqdm', 'traitlets', 'traits', 'tree-sitter', 'tree-sitter-go', 'tree-sitter-java', 'tree-sitter-python',\ -'tskit', 'typing-extensions', 'tzdata', 'ujson', 'uncertainties', 'unyt', 'urllib3', 'vega-datasets', 'vrplib', 'wcwidth',\ -'webencodings', 'wordcloud', 'wrapt', 'xarray', 'xgboost', 'xlrd', 'xxhash', 'xyzservices', 'yarl', 'yt', 'zengl', 'zfpy', 'zstandard'] - needed_pkg = [] - for pkg in supported_pkg: - if pkg.lower() in reqtext.lower(): - needed_pkg.append(pkg) - - project_file = os.path.join(directory, "pyproject.toml") - depstr = str(needed_pkg).replace("\'", "\"") - with open(project_file, 'w') as pf: - pf.write(f""" -[project] -name = "{benchmark.replace(".", "-")}-python-{language_version.replace(".", "")}" -version = "0.1.0" -description = "dummy description" -requires-python = ">={language_version}" -dependencies = {depstr} - -[dependency-groups] -dev = [ - "workers-py", - "workers-runtime-sdk" -] - """) - # move into function dir - funcdir = os.path.join(directory, "function") - if not os.path.exists(funcdir): - os.makedirs(funcdir) - - dont_move = ["handler.py", "function", "python_modules", "pyproject.toml"] - for thing in os.listdir(directory): - if thing not in dont_move: - src = os.path.join(directory, thing) - dest = os.path.join(directory, "function", thing) - shutil.move(src, dest) - self.logging.info(f"move {src} to {dest}") - - # Create package structure - CONFIG_FILES = { - "nodejs": ["handler.js", "package.json", "node_modules"], - "python": ["handler.py", "requirements.txt", "python_modules"], - } - - if language_name not in CONFIG_FILES: - raise NotImplementedError( - f"Language {language_name} is not yet supported for Cloudflare Workers" - ) - - # Verify the handler exists - handler_file = "handler.js" if language_name == "nodejs" else "handler.py" - 
package_path = os.path.join(directory, handler_file) - - if not os.path.exists(package_path): - if not os.path.exists(directory): - raise RuntimeError( - f"Package directory {directory} does not exist. " - "The benchmark build process may have failed to create the deployment package." - ) - raise RuntimeError( - f"Handler file {handler_file} not found in {directory}. " - f"Available files: {', '.join(os.listdir(directory)) if os.path.exists(directory) else 'none'}" - ) - - # Calculate total size of the package directory - total_size = 0 - for dirpath, dirnames, filenames in os.walk(directory): - for filename in filenames: - filepath = os.path.join(dirpath, filename) - total_size += os.path.getsize(filepath) - - mbytes = total_size / 1024.0 / 1024.0 - self.logging.info(f"Worker package size: {mbytes:.2f} MB (Python: missing vendored modules)") + worker_name: str, + package_dir: str, + language: str, + account_id: str, + benchmark_name: Optional[str] = None, + code_package: Optional[Benchmark] = None, + container_deployment: bool = False, + container_uri: str = "", + ) -> str: + """ + Generate wrangler.toml by delegating to the appropriate deployment handler. - return (directory, total_size, "") + Args: + worker_name: Name of the worker + package_dir: Directory containing the worker code + language: Programming language (nodejs or python) + account_id: Cloudflare account ID + benchmark_name: Optional benchmark name for R2 file path prefix + code_package: Optional benchmark package for nosql configuration + container_deployment: Whether this is a container deployment + container_uri: Container image URI/tag - def _package_code_container( - self, - directory: str, - language_name: str, - language_version: str, - architecture: str, - benchmark: str, - ) -> Tuple[str, int, str]: - """ - Package code for Cloudflare container worker deployment. - - Builds a Docker image and returns the image tag for deployment. 
+ Returns: + Path to the generated wrangler.toml file """ - self.logging.info(f"Packaging container for {language_name} {language_version}") - - # Get wrapper directory for container files - wrapper_base = os.path.join( - os.path.dirname(__file__), "..", "..", "benchmarks", "wrappers", "cloudflare" + handler = self._get_deployment_handler(container_deployment) + return handler.generate_wrangler_toml( + worker_name, package_dir, language, account_id, + benchmark_name, code_package, container_uri ) - wrapper_container_dir = os.path.join(wrapper_base, language_name, "container") - - if not os.path.exists(wrapper_container_dir): - raise RuntimeError( - f"Container wrapper directory not found: {wrapper_container_dir}" - ) - - # Copy container wrapper files to the package directory - # Copy Dockerfile from dockerfiles/cloudflare/{language}/ - dockerfile_src = os.path.join( - os.path.dirname(__file__), - "..", - "..", - "dockerfiles", - "cloudflare", - language_name, - "Dockerfile" - ) - dockerfile_dest = os.path.join(directory, "Dockerfile") - if os.path.exists(dockerfile_src): - # Read Dockerfile and update BASE_IMAGE based on language version - with open(dockerfile_src, 'r') as f: - dockerfile_content = f.read() - - # Get base image from systems.json for container deployments - container_images = self.system_config.benchmark_container_images( - "cloudflare", language_name, architecture - ) - base_image = container_images.get(language_version) - if not base_image: - raise RuntimeError( - f"No container base image found in systems.json for {language_name} {language_version} on {architecture}" - ) - - # Replace BASE_IMAGE default value in ARG line - import re - dockerfile_content = re.sub( - r'ARG BASE_IMAGE=.*', - f'ARG BASE_IMAGE={base_image}', - dockerfile_content - ) - - # Write modified Dockerfile - with open(dockerfile_dest, 'w') as f: - f.write(dockerfile_content) - - self.logging.info(f"Copied Dockerfile from {dockerfile_src}") - else: - raise 
RuntimeError(f"Dockerfile not found at {dockerfile_src}") - - # Copy handler and utility files from wrapper/container - # Note: ALL containers use worker.js for orchestration (@cloudflare/containers is Node.js only) - # The handler inside the container can be Python or Node.js - container_files = ["handler.py" if language_name == "python" else "handler.js"] - - # For worker.js orchestration file, always use the nodejs version - nodejs_wrapper_dir = os.path.join(wrapper_base, "nodejs", "container") - worker_js_src = os.path.join(nodejs_wrapper_dir, "worker.js") - worker_js_dest = os.path.join(directory, "worker.js") - if os.path.exists(worker_js_src): - shutil.copy2(worker_js_src, worker_js_dest) - self.logging.info(f"Copied worker.js orchestration file from nodejs/container") - - # Copy storage and nosql utilities from language-specific wrapper - if language_name == "nodejs": - container_files.extend(["storage.js", "nosql.js"]) - else: - container_files.extend(["storage.py", "nosql.py"]) - - for file in container_files: - src = os.path.join(wrapper_container_dir, file) - dest = os.path.join(directory, file) - if os.path.exists(src): - shutil.copy2(src, dest) - self.logging.info(f"Copied container file: {file}") - - # Check if benchmark has init.sh and copy it (needed for some benchmarks like video-processing) - # Look in both the benchmark root and the language-specific directory - from sebs.utils import find_benchmark - benchmark_path = find_benchmark(benchmark, "benchmarks") - if benchmark_path: - paths = [ - benchmark_path, - os.path.join(benchmark_path, language_name), - ] - for path in paths: - init_sh = os.path.join(path, "init.sh") - if os.path.exists(init_sh): - shutil.copy2(init_sh, os.path.join(directory, "init.sh")) - self.logging.info(f"Copied init.sh from {path} for container build") - break - - # For Python containers, fix relative imports in benchmark code - # Containers use flat structure, so "from . 
import storage" must become "import storage" - if language_name == "python": - for item in os.listdir(directory): - if item.endswith('.py') and item not in ['handler.py', 'storage.py', 'nosql.py', 'worker.py']: - filepath = os.path.join(directory, item) - with open(filepath, 'r') as f: - content = f.read() - - # Replace relative imports with absolute imports - modified = False - if 'from . import storage' in content: - content = content.replace('from . import storage', 'import storage') - modified = True - if 'from . import nosql' in content: - content = content.replace('from . import nosql', 'import nosql') - modified = True - - if modified: - with open(filepath, 'w') as f: - f.write(content) - self.logging.info(f"Fixed relative imports in {item}") - - # For Node.js containers, transform benchmark code to be async-compatible - # The container wrapper uses async HTTP calls, but benchmarks expect sync - elif language_name == "nodejs": - import re - for item in os.listdir(directory): - if item.endswith('.js') and item not in ['handler.js', 'storage.js', 'nosql.js', 'worker.js', 'build.js', 'request-polyfill.js']: - filepath = os.path.join(directory, item) - with open(filepath, 'r') as f: - content = f.read() - - # Only transform if file uses nosqlClient - if 'nosqlClient' not in content: - continue - - self.logging.info(f"Transforming {item} for async nosql...") - - # Step 1: Add await before nosqlClient method calls - content = re.sub( - r'(\s*)((?:const|let|var)\s+\w+\s*=\s*)?nosqlClient\.(insert|get|update|query|delete)\s*\(', - r'\1\2await nosqlClient.\3(', - content - ) - - # Step 2: Make all function declarations async - content = re.sub(r'^(\s*)function\s+(\w+)\s*\(', r'\1async function \2(', content, flags=re.MULTILINE) - - # Step 3: Add await before user-defined function calls - lines = content.split('\n') - transformed_lines = [] - control_flow = {'if', 'for', 'while', 'switch', 'catch', 'return'} - builtins = {'console', 'require', 'push', 'join', 'split', 
'map', 'filter', - 'reduce', 'forEach', 'find', 'findIndex', 'some', 'every', - 'includes', 'parseInt', 'parseFloat', 'isNaN', 'Array', - 'Object', 'String', 'Number', 'Boolean', 'Math', 'JSON', - 'Date', 'RegExp', 'Error', 'Promise'} - - for line in lines: - # Skip function declarations - if re.search(r'\bfunction\s+\w+\s*\(', line) or re.search(r'=\s*(async\s+)?function\s*\(', line): - transformed_lines.append(line) - continue - - # Add await before likely user-defined function calls - def replacer(match): - prefix = match.group(1) - assignment = match.group(2) or '' - func_name = match.group(3) - - if func_name in control_flow or func_name in builtins: - return match.group(0) - - return f"{prefix}{assignment}await {func_name}(" - - line = re.sub( - r'(^|\s+|;|,|\()((?:const|let|var)\s+\w+\s*=\s*)?(\w+)\s*\(', - replacer, - line - ) - transformed_lines.append(line) - - content = '\n'.join(transformed_lines) - - with open(filepath, 'w') as f: - f.write(content) - self.logging.info(f"Transformed {item} for async nosql") - - # Install dependencies for container orchestration - # ALL containers need @cloudflare/containers for worker.js orchestration - worker_package_json = { - "name": f"{benchmark}-worker", - "version": "1.0.0", - "dependencies": { - "@cloudflare/containers": "*" - } - } - - if language_name == "nodejs": - # Read the benchmark's package.json if it exists and merge dependencies - benchmark_package_file = os.path.join(directory, "package.json") - if os.path.exists(benchmark_package_file): - with open(benchmark_package_file, 'r') as f: - benchmark_package = json.load(f) - # Merge benchmark dependencies with worker dependencies - if "dependencies" in benchmark_package: - worker_package_json["dependencies"].update(benchmark_package["dependencies"]) - - # Write the combined package.json - with open(benchmark_package_file, 'w') as f: - json.dump(worker_package_json, f, indent=2) - else: # Python containers also need package.json for worker.js orchestration 
- # Create package.json just for @cloudflare/containers (Python code in container) - package_json_path = os.path.join(directory, "package.json") - with open(package_json_path, 'w') as f: - json.dump(worker_package_json, f, indent=2) - self.logging.info("Created package.json for Python container worker.js orchestration") - - # Install Node.js dependencies (needed for all containers for worker.js) - self.logging.info(f"Installing @cloudflare/containers for worker.js orchestration in {directory}") - # Use CLI container for npm install - no Node.js/npm needed on host - cli = self._get_cli() - container_path = f"/tmp/container_npm/{os.path.basename(directory)}" - - try: - # Upload package directory to container - cli.upload_package(directory, container_path) - - # Install production dependencies - output = cli.execute(f"cd {container_path} && npm install --production") - self.logging.info("npm install completed successfully") - self.logging.debug(f"npm output: {output.decode('utf-8')}") - - # Download node_modules back to host - import tarfile - import io - bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") - file_obj = io.BytesIO() - for chunk in bits: - file_obj.write(chunk) - file_obj.seek(0) - with tarfile.open(fileobj=file_obj) as tar: - tar.extractall(directory) - - self.logging.info(f"Downloaded node_modules to {directory}") - except Exception as e: - self.logging.error(f"npm install failed: {e}") - raise RuntimeError(f"Failed to install Node.js dependencies: {e}") - - # For Python containers, also handle Python requirements - if language_name == "python": - # Python requirements will be installed in the Dockerfile - # Rename version-specific requirements.txt to requirements.txt - requirements_file = os.path.join(directory, "requirements.txt") - versioned_requirements = os.path.join(directory, f"requirements.txt.{language_version}") - - if os.path.exists(versioned_requirements): - shutil.copy2(versioned_requirements, requirements_file) 
- self.logging.info(f"Copied requirements.txt.{language_version} to requirements.txt") - - # Fix torch wheel URLs for container compatibility - # Replace direct wheel URLs with proper torch installation - with open(requirements_file, 'r') as f: - content = f.read() - - # Replace torch wheel URLs with proper installation commands - import re - modified = False - if 'download.pytorch.org/whl' in content: - # Remove direct wheel URLs and replace with proper torch installation - lines = content.split('\n') - new_lines = [] - for line in lines: - if 'download.pytorch.org/whl/cpu/torch-' in line: - # Extract version from URL (e.g., torch-2.0.0+cpu) - match = re.search(r'torch-([0-9.]+)(?:%2B|\+)cpu', line) - if match: - version = match.group(1) - # Use index-url method instead of direct wheel - new_lines.append(f'torch=={version}') - modified = True - else: - new_lines.append(line) - else: - new_lines.append(line) - - if modified: - # Add extra-index-url at the top for CPU-only torch - content = '--extra-index-url https://download.pytorch.org/whl/cpu\n' + '\n'.join(new_lines) - with open(requirements_file, 'w') as f: - f.write(content) - self.logging.info("Modified requirements.txt to use torch index-url instead of direct wheels") - - elif not os.path.exists(requirements_file): - # Create empty requirements.txt if none exists - with open(requirements_file, 'w') as f: - f.write("") - self.logging.info("Created empty requirements.txt") - - # Build Docker image locally for cache compatibility - # wrangler will re-build/push during deployment from the Dockerfile - image_tag = self._build_container_image_local(directory, benchmark, language_name, language_version) - - # Calculate package size (approximate, as it's a source directory) - total_size = 0 - for dirpath, dirnames, filenames in os.walk(directory): - for filename in filenames: - filepath = os.path.join(dirpath, filename) - total_size += os.path.getsize(filepath) - - self.logging.info(f"Container package prepared with 
local image: {image_tag}") - - # Return local image tag (wrangler will rebuild from Dockerfile during deploy) - return (directory, total_size, image_tag) - - def _build_container_image_local( - self, - directory: str, - benchmark: str, - language_name: str, - language_version: str, - ) -> str: - """ - Build a Docker image locally for cache purposes. - wrangler will rebuild from Dockerfile during deployment. - - Returns the local image tag. - """ - # Generate image tag - image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" - image_tag = f"{image_name}:latest" - - self.logging.info(f"Building local container image: {image_tag}") - - try: - # Build the Docker image using docker-py - # nocache=True ensures handler changes are picked up - _, build_logs = self.docker_client.images.build( - path=directory, - tag=image_tag, - nocache=True, - rm=True - ) - - # Log build output - for log in build_logs: - if 'stream' in log: - self.logging.debug(log['stream'].strip()) - elif 'error' in log: - self.logging.error(log['error'].strip()) - - self.logging.info(f"Local container image built: {image_tag}") - - return image_tag - - except docker.errors.BuildError as e: - error_msg = f"Docker build failed for {image_tag}: {e}" - self.logging.error(error_msg) - raise RuntimeError(error_msg) - except Exception as e: - error_msg = f"Unexpected error building Docker image {image_tag}: {e}" - self.logging.error(error_msg) - raise RuntimeError(error_msg) def create_function( self, @@ -1097,8 +396,9 @@ def _create_or_update_worker( env['CLOUDFLARE_ACCOUNT_ID'] = account_id - # Get CLI container instance - cli = self._get_cli() + # Get CLI container instance from appropriate deployment handler + handler = self._get_deployment_handler(container_deployment) + cli = handler._get_cli() # Upload package directory to container container_package_path = f"/tmp/workers/{worker_name}" @@ -1122,7 +422,11 @@ def _create_or_update_worker( # The container binding 
needs time to propagate before first invocation if container_deployment: self.logging.info("Waiting for container Durable Object to initialize...") - self._wait_for_durable_object_ready(worker_name, package_dir, env) + account_id = env.get('CLOUDFLARE_ACCOUNT_ID') + worker_url = self._build_workers_dev_url(worker_name, account_id) + self._containers_deployment.wait_for_durable_object_ready( + worker_name, worker_url + ) # The container binding needs time to propagate before first invocation if container_deployment: @@ -1136,69 +440,6 @@ def _create_or_update_worker( self.logging.error(error_msg) raise RuntimeError(error_msg) - def _wait_for_durable_object_ready(self, worker_name: str, package_dir: str, env: dict): - """Wait for container Durable Object to be fully provisioned and ready.""" - max_wait_seconds = 400 - wait_interval = 10 - start_time = time.time() - - account_id = env.get('CLOUDFLARE_ACCOUNT_ID') - worker_url = self._build_workers_dev_url(worker_name, account_id) - - self.logging.info("Checking container Durable Object readiness via health endpoint...") - - consecutive_failures = 0 - max_consecutive_failures = 5 - - while time.time() - start_time < max_wait_seconds: - try: - # Use health check endpoint - response = requests.get( - f"{worker_url}/health", - timeout=60 - ) - - # 200 = ready - if response.status_code == 200: - self.logging.info("Container Durable Object is ready!") - return True - - # 503 = not ready yet (expected, keep waiting) - elif response.status_code == 503: - elapsed = int(time.time() - start_time) - try: - error_data = response.json() - error_msg = error_data.get('error', 'Container provisioning') - self.logging.info(f"{error_msg}... ({elapsed}s elapsed)") - except: - self.logging.info(f"Container provisioning... 
({elapsed}s elapsed)") - consecutive_failures = 0 # This is expected - - # 500 or other = something's wrong - else: - consecutive_failures += 1 - self.logging.warning(f"Unexpected status {response.status_code}: {response.text[:200]}") - - # If we get too many unexpected errors, something might be broken - if consecutive_failures >= max_consecutive_failures: - self.logging.error(f"Got {consecutive_failures} consecutive errors, container may be broken") - return False - - except requests.exceptions.Timeout: - elapsed = int(time.time() - start_time) - self.logging.info(f"Health check timeout (container may be starting)... ({elapsed}s elapsed)") - except requests.exceptions.RequestException as e: - elapsed = int(time.time() - start_time) - self.logging.debug(f"Connection error ({elapsed}s): {str(e)[:100]}") - - time.sleep(wait_interval) - - self.logging.warning( - f"Container Durable Object may not be fully ready after {max_wait_seconds}s. " - "First invocation may still experience initialization delay." - ) - return False - def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: """Fetch the workers.dev subdomain for the given account. @@ -1561,7 +802,7 @@ def shutdown(self) -> None: """ Shutdown the Cloudflare system. - Saves configuration to cache and shuts down CLI container. + Saves configuration to cache and shuts down deployment handler CLI containers. 
""" try: self.cache_client.lock() @@ -1569,7 +810,6 @@ def shutdown(self) -> None: finally: self.cache_client.unlock() - # Shutdown CLI container if it was initialized - if self._cli is not None: - self._cli.shutdown() - self._cli = None + # Shutdown deployment handler CLI containers + self._workers_deployment.shutdown() + self._containers_deployment.shutdown() diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py new file mode 100644 index 000000000..842679adc --- /dev/null +++ b/sebs/cloudflare/containers.py @@ -0,0 +1,547 @@ +""" +Cloudflare Container Workers deployment implementation. + +Handles packaging, Docker image building, and deployment of containerized +Cloudflare Workers using @cloudflare/containers. +""" + +import os +import shutil +import json +import io +import re +import time +import tarfile +from typing import Optional, Tuple + +import docker +import requests + +from sebs.benchmark import Benchmark +from sebs.cloudflare.cli import CloudflareCLI + + +class CloudflareContainersDeployment: + """Handles Cloudflare container worker deployment operations.""" + + def __init__(self, logging, system_config, docker_client, system_resources): + """ + Initialize CloudflareContainersDeployment. 
+ + Args: + logging: Logger instance + system_config: System configuration + docker_client: Docker client instance + system_resources: System resources manager + """ + self.logging = logging + self.system_config = system_config + self.docker_client = docker_client + self.system_resources = system_resources + self._cli: Optional[CloudflareCLI] = None + + def _get_cli(self) -> CloudflareCLI: + """Get or initialize the Cloudflare CLI container.""" + if self._cli is None: + self._cli = CloudflareCLI(self.system_config, self.docker_client) + # Verify wrangler is available + version = self._cli.check_wrangler_version() + self.logging.info(f"Cloudflare CLI container ready: {version}") + return self._cli + + def generate_wrangler_toml( + self, + worker_name: str, + package_dir: str, + language: str, + account_id: str, + benchmark_name: Optional[str] = None, + code_package: Optional[Benchmark] = None, + container_uri: str = "", + ) -> str: + """ + Generate a wrangler.toml configuration file for container workers. 
+ + Args: + worker_name: Name of the worker + package_dir: Directory containing the worker code + language: Programming language (nodejs or python) + account_id: Cloudflare account ID + benchmark_name: Optional benchmark name for R2 file path prefix + code_package: Optional benchmark package for nosql configuration + container_uri: Container image URI/tag + + Returns: + Path to the generated wrangler.toml file + """ + instance_type = "" + if benchmark_name and ("411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name): + self.logging.warning("Using standard-4 instance type for high resource benchmark") + instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' + + toml_content = f"""name = "{worker_name}" +main = "worker.js" +compatibility_date = "2025-11-18" +account_id = "{account_id}" +compatibility_flags = ["nodejs_compat"] + +[observability] +enabled = true + +[[containers]] +max_instances = 10 +class_name = "ContainerWorker" +image = "./Dockerfile"{instance_type} + +# Durable Object binding for Container class (required by @cloudflare/containers) +[[durable_objects.bindings]] +name = "CONTAINER_WORKER" +class_name = "ContainerWorker" + +""" + # Add nosql table bindings if benchmark uses them + if code_package and code_package.uses_nosql: + # Get registered nosql tables for this benchmark + nosql_storage = self.system_resources.get_nosql_storage() + if nosql_storage.retrieve_cache(benchmark_name): + nosql_tables = nosql_storage._tables.get(benchmark_name, {}) + for table_name in nosql_tables.keys(): + toml_content += f"""# Durable Object binding for NoSQL table: {table_name} +[[durable_objects.bindings]] +name = "{table_name.upper()}" +class_name = "KVApiObject" + +""" + + # Add migrations for both ContainerWorker and KVApiObject + # Both need new_sqlite_classes (Container requires SQLite DO backend) + toml_content += """[[migrations]] +tag = "v1" +new_sqlite_classes = 
["ContainerWorker", "KVApiObject"] + +""" + else: + # Container without nosql - only ContainerWorker migration + toml_content += """[[migrations]] +tag = "v1" +new_sqlite_classes = ["ContainerWorker"] + +""" + + # Add environment variables + vars_content = "" + if benchmark_name: + vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' + + # Add nosql configuration if benchmark uses it + if code_package and code_package.uses_nosql: + vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' + + if vars_content: + toml_content += f"""# Environment variables +[vars] +{vars_content} +""" + + # Add R2 bucket binding for benchmarking files + r2_bucket_configured = False + try: + from sebs.faas.config import Resources + storage = self.system_resources.get_storage() + bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + if bucket_name: + toml_content += f"""# R2 bucket binding for benchmarking files +# This bucket is used by fs and path polyfills to read benchmark data +[[r2_buckets]] +binding = "R2" +bucket_name = "{bucket_name}" + +""" + r2_bucket_configured = True + self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") + except Exception as e: + self.logging.warning( + f"R2 bucket binding not configured: {e}. " + f"Benchmarks requiring file access will not work properly." + ) + + # Write wrangler.toml to package directory + toml_path = os.path.join(package_dir, "wrangler.toml") + with open(toml_path, 'w') as f: + f.write(toml_content) + + self.logging.info(f"Generated wrangler.toml at {toml_path}") + return toml_path + + def package_code( + self, + directory: str, + language_name: str, + language_version: str, + architecture: str, + benchmark: str, + ) -> Tuple[str, int, str]: + """ + Package code for Cloudflare container worker deployment. + + Builds a Docker image and returns the image tag for deployment. 
+ + Args: + directory: Path to the code directory + language_name: Programming language name + language_version: Programming language version + architecture: Target architecture + benchmark: Benchmark name + + Returns: + Tuple of (package_path, package_size, container_uri) + """ + self.logging.info(f"Packaging container for {language_name} {language_version}") + + # Get wrapper directory for container files + wrapper_base = os.path.join( + os.path.dirname(__file__), "..", "..", "benchmarks", "wrappers", "cloudflare" + ) + wrapper_container_dir = os.path.join(wrapper_base, language_name, "container") + + if not os.path.exists(wrapper_container_dir): + raise RuntimeError( + f"Container wrapper directory not found: {wrapper_container_dir}" + ) + + # Copy container wrapper files to the package directory + # Copy Dockerfile from dockerfiles/cloudflare/{language}/ + dockerfile_src = os.path.join( + os.path.dirname(__file__), + "..", + "..", + "dockerfiles", + "cloudflare", + language_name, + "Dockerfile" + ) + dockerfile_dest = os.path.join(directory, "Dockerfile") + if os.path.exists(dockerfile_src): + # Read Dockerfile and update BASE_IMAGE based on language version + with open(dockerfile_src, 'r') as f: + dockerfile_content = f.read() + + # Get base image from systems.json for container deployments + container_images = self.system_config.benchmark_container_images( + "cloudflare", language_name, architecture + ) + base_image = container_images.get(language_version) + if not base_image: + raise RuntimeError( + f"No container base image found in systems.json for {language_name} {language_version} on {architecture}" + ) + + # Replace BASE_IMAGE default value in ARG line + dockerfile_content = re.sub( + r'ARG BASE_IMAGE=.*', + f'ARG BASE_IMAGE={base_image}', + dockerfile_content + ) + + # Write modified Dockerfile + with open(dockerfile_dest, 'w') as f: + f.write(dockerfile_content) + + self.logging.info(f"Copied Dockerfile from {dockerfile_src}") + else: + raise 
RuntimeError(f"Dockerfile not found at {dockerfile_src}") + + # Copy handler and utility files from wrapper/container + # Note: ALL containers use worker.js for orchestration (@cloudflare/containers is Node.js only) + # The handler inside the container can be Python or Node.js + container_files = ["handler.py" if language_name == "python" else "handler.js"] + + # For worker.js orchestration file, always use the nodejs version + nodejs_wrapper_dir = os.path.join(wrapper_base, "nodejs", "container") + worker_js_src = os.path.join(nodejs_wrapper_dir, "worker.js") + worker_js_dest = os.path.join(directory, "worker.js") + if os.path.exists(worker_js_src): + shutil.copy2(worker_js_src, worker_js_dest) + self.logging.info(f"Copied worker.js orchestration file from nodejs/container") + + # Copy storage and nosql utilities from language-specific wrapper + if language_name == "nodejs": + container_files.extend(["storage.js", "nosql.js"]) + else: + container_files.extend(["storage.py", "nosql.py"]) + + for file in container_files: + src = os.path.join(wrapper_container_dir, file) + dest = os.path.join(directory, file) + if os.path.exists(src): + shutil.copy2(src, dest) + self.logging.info(f"Copied container file: {file}") + + # Check if benchmark has init.sh and copy it (needed for some benchmarks like video-processing) + # Look in both the benchmark root and the language-specific directory + from sebs.utils import find_benchmark + benchmark_path = find_benchmark(benchmark, "benchmarks") + if benchmark_path: + paths = [ + benchmark_path, + os.path.join(benchmark_path, language_name), + ] + for path in paths: + init_sh = os.path.join(path, "init.sh") + if os.path.exists(init_sh): + shutil.copy2(init_sh, os.path.join(directory, "init.sh")) + self.logging.info(f"Copied init.sh from {path}") + break + + # For Python containers, fix relative imports in benchmark code + # Containers use flat structure, so "from . 
import storage" must become "import storage" + if language_name == "python": + for item in os.listdir(directory): + if item.endswith('.py') and item not in ['handler.py', 'storage.py', 'nosql.py', 'worker.py']: + file_path = os.path.join(directory, item) + with open(file_path, 'r') as f: + content = f.read() + # Fix relative imports + content = re.sub(r'from \. import ', 'import ', content) + with open(file_path, 'w') as f: + f.write(content) + + # For Node.js containers, transform benchmark code to be async-compatible + # The container wrapper uses async HTTP calls, but benchmarks expect sync + elif language_name == "nodejs": + for item in os.listdir(directory): + if item.endswith('.js') and item not in ['handler.js', 'storage.js', 'nosql.js', 'worker.js', 'build.js', 'request-polyfill.js']: + file_path = os.path.join(directory, item) + # Could add transformations here if needed + pass + + # Prepare package.json for container orchestration + # ALL containers need @cloudflare/containers for worker.js orchestration + worker_package_json = { + "name": f"{benchmark}-worker", + "version": "1.0.0", + "dependencies": { + "@cloudflare/containers": "*" + } + } + + if language_name == "nodejs": + # Read the benchmark's package.json if it exists and merge dependencies + benchmark_package_file = os.path.join(directory, "package.json") + if os.path.exists(benchmark_package_file): + with open(benchmark_package_file, 'r') as f: + benchmark_package = json.load(f) + # Merge dependencies + if "dependencies" in benchmark_package: + worker_package_json["dependencies"].update(benchmark_package["dependencies"]) + + # Write the combined package.json + with open(benchmark_package_file, 'w') as f: + json.dump(worker_package_json, f, indent=2) + else: # Python containers also need package.json for worker.js orchestration + # Create package.json just for @cloudflare/containers (Python code in container) + package_json_path = os.path.join(directory, "package.json") + with 
open(package_json_path, 'w') as f: + json.dump(worker_package_json, f, indent=2) + + # Install Node.js dependencies for wrangler deployment + # Note: These are needed for wrangler to bundle worker.js, not for the container + # The container also installs them during Docker build + self.logging.info(f"Installing Node.js dependencies for wrangler deployment in {directory}") + cli = self._get_cli() + container_path = f"/tmp/container_npm/{os.path.basename(directory)}" + + try: + # Upload package directory to CLI container + cli.upload_package(directory, container_path) + + # Install production dependencies + output = cli.execute(f"cd {container_path} && npm install --production") + self.logging.info("npm install completed successfully") + self.logging.debug(f"npm output: {output.decode('utf-8')}") + + # Download node_modules back to host for wrangler + bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + + self.logging.info(f"Downloaded node_modules to {directory} for wrangler deployment") + except Exception as e: + self.logging.error(f"npm install failed: {e}") + raise RuntimeError(f"Failed to install Node.js dependencies: {e}") + + # For Python containers, also handle Python requirements + if language_name == "python": + # Python requirements will be installed in the Dockerfile + # Rename version-specific requirements.txt to requirements.txt + requirements_file = os.path.join(directory, "requirements.txt") + versioned_requirements = os.path.join(directory, f"requirements.txt.{language_version}") + + if os.path.exists(versioned_requirements): + shutil.copy2(versioned_requirements, requirements_file) + self.logging.info(f"Copied requirements.txt.{language_version} to requirements.txt") + + # Fix torch wheel URLs for container compatibility + # Replace direct wheel URLs with 
proper torch installation + with open(requirements_file, 'r') as f: + content = f.read() + + # Replace torch wheel URLs with proper installation commands + modified = False + if 'download.pytorch.org/whl' in content: + # Replace direct wheel URL with pip-installable torch + content = re.sub( + r'https://download\.pytorch\.org/whl/[^\s]+\.whl', + 'torch', + content + ) + modified = True + + if modified: + with open(requirements_file, 'w') as f: + f.write(content) + self.logging.info("Fixed torch URLs in requirements.txt for container compatibility") + + elif not os.path.exists(requirements_file): + # Create empty requirements.txt if none exists + with open(requirements_file, 'w') as f: + f.write("") + self.logging.info("Created empty requirements.txt") + + # Build Docker image locally for cache compatibility + # wrangler will re-build/push during deployment from the Dockerfile + image_tag = self._build_container_image_local(directory, benchmark, language_name, language_version) + + # Calculate package size (approximate, as it's a source directory) + total_size = 0 + for dirpath, dirnames, filenames in os.walk(directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + total_size += os.path.getsize(filepath) + + self.logging.info(f"Container package prepared with local image: {image_tag}") + + # Return local image tag (wrangler will rebuild from Dockerfile during deploy) + return (directory, total_size, image_tag) + + def _build_container_image_local( + self, + directory: str, + benchmark: str, + language_name: str, + language_version: str, + ) -> str: + """ + Build a Docker image locally for cache purposes. + wrangler will rebuild from Dockerfile during deployment. + + Returns the local image tag. 
+ """ + # Generate image tag + image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" + image_tag = f"{image_name}:latest" + + self.logging.info(f"Building local container image: {image_tag}") + + try: + # Build the Docker image using docker-py + # nocache=True ensures handler changes are picked up + _, build_logs = self.docker_client.images.build( + path=directory, + tag=image_tag, + nocache=True, + rm=True + ) + + # Log build output + for log in build_logs: + if 'stream' in log: + self.logging.debug(log['stream'].strip()) + elif 'error' in log: + self.logging.error(log['error']) + + self.logging.info(f"Local container image built: {image_tag}") + + return image_tag + + except docker.errors.BuildError as e: + error_msg = f"Docker build failed for {image_tag}: {e}" + self.logging.error(error_msg) + raise RuntimeError(error_msg) + except Exception as e: + error_msg = f"Unexpected error building Docker image {image_tag}: {e}" + self.logging.error(error_msg) + raise RuntimeError(error_msg) + + def wait_for_durable_object_ready( + self, + worker_name: str, + worker_url: str, + max_wait_seconds: int = 400 + ) -> bool: + """ + Wait for container Durable Object to be fully provisioned and ready. 
+ + Args: + worker_name: Name of the worker + worker_url: URL of the worker + max_wait_seconds: Maximum time to wait in seconds + + Returns: + True if ready, False if timeout + """ + wait_interval = 10 + start_time = time.time() + + self.logging.info("Checking container Durable Object readiness via health endpoint...") + + consecutive_failures = 0 + max_consecutive_failures = 5 + + while time.time() - start_time < max_wait_seconds: + try: + # Use health check endpoint + response = requests.get( + f"{worker_url}/health", + timeout=60 + ) + + # 200 = ready + if response.status_code == 200: + self.logging.info("Container Durable Object is ready!") + return True + # 503 = not ready yet + elif response.status_code == 503: + elapsed = int(time.time() - start_time) + self.logging.info( + f"Container Durable Object not ready yet (503 Service Unavailable)... " + f"({elapsed}s elapsed, will retry)" + ) + # Other errors + else: + self.logging.warning(f"Unexpected status {response.status_code}: {response.text[:100]}") + + except requests.exceptions.Timeout: + elapsed = int(time.time() - start_time) + self.logging.info(f"Health check timeout (container may be starting)... ({elapsed}s elapsed)") + except requests.exceptions.RequestException as e: + elapsed = int(time.time() - start_time) + self.logging.debug(f"Connection error ({elapsed}s): {str(e)[:100]}") + + time.sleep(wait_interval) + + self.logging.warning( + f"Container Durable Object may not be fully ready after {max_wait_seconds}s. " + "First invocation may still experience initialization delay." + ) + return False + + def shutdown(self): + """Shutdown CLI container if initialized.""" + if self._cli is not None: + self._cli.shutdown() + self._cli = None diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py new file mode 100644 index 000000000..da4a5073d --- /dev/null +++ b/sebs/cloudflare/workers.py @@ -0,0 +1,376 @@ +""" +Cloudflare Workers native deployment implementation. 
+ +Handles packaging, deployment, and management of native Cloudflare Workers +(non-container deployments using JavaScript/Python runtime). +""" + +import os +import shutil +import json +import io +import tarfile +from typing import Optional, Tuple + +from sebs.benchmark import Benchmark +from sebs.cloudflare.cli import CloudflareCLI + + +class CloudflareWorkersDeployment: + """Handles native Cloudflare Workers deployment operations.""" + + def __init__(self, logging, system_config, docker_client, system_resources): + """ + Initialize CloudflareWorkersDeployment. + + Args: + logging: Logger instance + system_config: System configuration + docker_client: Docker client instance + system_resources: System resources manager + """ + self.logging = logging + self.system_config = system_config + self.docker_client = docker_client + self.system_resources = system_resources + self._cli: Optional[CloudflareCLI] = None + + def _get_cli(self) -> CloudflareCLI: + """Get or initialize the Cloudflare CLI container.""" + if self._cli is None: + self._cli = CloudflareCLI(self.system_config, self.docker_client) + # Verify wrangler is available + version = self._cli.check_wrangler_version() + self.logging.info(f"Cloudflare CLI container ready: {version}") + return self._cli + + def generate_wrangler_toml( + self, + worker_name: str, + package_dir: str, + language: str, + account_id: str, + benchmark_name: Optional[str] = None, + code_package: Optional[Benchmark] = None, + container_uri: str = "", + ) -> str: + """ + Generate a wrangler.toml configuration file for native workers. 
+ + Args: + worker_name: Name of the worker + package_dir: Directory containing the worker code + language: Programming language (nodejs or python) + account_id: Cloudflare account ID + benchmark_name: Optional benchmark name for R2 file path prefix + code_package: Optional benchmark package for nosql configuration + + Returns: + Path to the generated wrangler.toml file + """ + # Native worker configuration + main_file = "dist/handler.js" if language == "nodejs" else "handler.py" + + # Build wrangler.toml content + toml_content = f"""name = "{worker_name}" +main = "{main_file}" +compatibility_date = "2025-11-18" +account_id = "{account_id}" +""" + + if language == "nodejs": + toml_content += """# Use nodejs_compat for Node.js built-in support +compatibility_flags = ["nodejs_compat"] +no_bundle = true + +[build] +command = "node build.js" + +[[rules]] +type = "ESModule" +globs = ["**/*.js"] +fallthrough = true + +[[rules]] +type = "Text" +globs = ["**/*.html"] +fallthrough = true + +""" + elif language == "python": + toml_content += """# Enable Python Workers runtime +compatibility_flags = ["python_workers"] +""" + + toml_content += """ +[[durable_objects.bindings]] +name = "DURABLE_STORE" +class_name = "KVApiObject" + +[[migrations]] +tag = "v3" +new_classes = ["KVApiObject"] +""" + + # Add environment variables + vars_content = "" + if benchmark_name: + vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' + + # Add nosql configuration if benchmark uses it + if code_package and code_package.uses_nosql: + vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' + + if vars_content: + toml_content += f"""# Environment variables +[vars] +{vars_content} +""" + + # Add R2 bucket binding for benchmarking files + r2_bucket_configured = False + try: + from sebs.faas.config import Resources + storage = self.system_resources.get_storage() + bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + if bucket_name: + toml_content += f"""# R2 bucket 
binding for benchmarking files +# This bucket is used by fs and path polyfills to read benchmark data +[[r2_buckets]] +binding = "R2" +bucket_name = "{bucket_name}" + +""" + r2_bucket_configured = True + self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") + except Exception as e: + self.logging.warning( + f"R2 bucket binding not configured: {e}. " + f"Benchmarks requiring file access will not work properly." + ) + + # Write wrangler.toml to package directory + toml_path = os.path.join(package_dir, "wrangler.toml") + with open(toml_path, 'w') as f: + f.write(toml_content) + + self.logging.info(f"Generated wrangler.toml at {toml_path}") + return toml_path + + def package_code( + self, + directory: str, + language_name: str, + language_version: str, + benchmark: str, + is_cached: bool, + ) -> Tuple[str, int, str]: + """ + Package code for native Cloudflare Workers deployment. + + Args: + directory: Path to the code directory + language_name: Programming language name + language_version: Programming language version + benchmark: Benchmark name + is_cached: Whether the code is cached + + Returns: + Tuple of (package_path, package_size, container_uri) + """ + # Install dependencies + if language_name == "nodejs": + package_file = os.path.join(directory, "package.json") + node_modules = os.path.join(directory, "node_modules") + + # Only install if package.json exists and node_modules doesn't + if os.path.exists(package_file) and not os.path.exists(node_modules): + self.logging.info(f"Installing Node.js dependencies in {directory}") + # Use CLI container for npm install - no Node.js/npm needed on host + cli = self._get_cli() + container_path = f"/tmp/npm_install/{os.path.basename(directory)}" + + try: + # Upload package directory to container + cli.upload_package(directory, container_path) + + # Install production dependencies + self.logging.info("Installing npm dependencies in container...") + output = cli.npm_install(container_path) + 
self.logging.info("npm install completed successfully") + self.logging.debug(f"npm output: {output}") + + # Install esbuild as a dev dependency (needed by build.js) + self.logging.info("Installing esbuild for custom build script...") + cli.execute(f"cd {container_path} && npm install --save-dev esbuild") + self.logging.info("esbuild installed successfully") + + # Download node_modules back to host + bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + + self.logging.info(f"Downloaded node_modules to {directory}") + + except Exception as e: + self.logging.error(f"npm install in container failed: {e}") + raise RuntimeError(f"Failed to install Node.js dependencies: {e}") + elif os.path.exists(node_modules): + self.logging.info(f"Node.js dependencies already installed in {directory}") + + # Ensure esbuild is available even for cached installations + esbuild_path = os.path.join(node_modules, "esbuild") + if not os.path.exists(esbuild_path): + self.logging.info("Installing esbuild for custom build script...") + cli = self._get_cli() + container_path = f"/tmp/npm_install/{os.path.basename(directory)}" + + try: + cli.upload_package(directory, container_path) + cli.execute(f"cd {container_path} && npm install --save-dev esbuild") + + # Download node_modules back to host + bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + + self.logging.info("esbuild installed successfully") + except Exception as e: + self.logging.error(f"Failed to install esbuild: {e}") + raise RuntimeError(f"Failed to install esbuild: {e}") + + elif language_name == "python": + requirements_file = os.path.join(directory, 
"requirements.txt") + if os.path.exists(f"{requirements_file}.{language_version}"): + src = f"{requirements_file}.{language_version}" + dest = requirements_file + shutil.move(src, dest) + self.logging.info(f"move {src} to {dest}") + + # move function_cloudflare.py into function.py + function_cloudflare_file = os.path.join(directory, "function_cloudflare.py") + if os.path.exists(function_cloudflare_file): + src = function_cloudflare_file + dest = os.path.join(directory, "function.py") + shutil.move(src, dest) + self.logging.info(f"move {src} to {dest}") + + if os.path.exists(requirements_file): + with open(requirements_file, 'r') as reqf: + reqtext = reqf.read() + supported_pkg = \ +['affine', 'aiohappyeyeballs', 'aiohttp', 'aiosignal', 'altair', 'annotated-types',\ +'anyio', 'apsw', 'argon2-cffi', 'argon2-cffi-bindings', 'asciitree', 'astropy', 'astropy_iers_data',\ +'asttokens', 'async-timeout', 'atomicwrites', 'attrs', 'audioop-lts', 'autograd', 'awkward-cpp', 'b2d',\ +'bcrypt', 'beautifulsoup4', 'bilby.cython', 'biopython', 'bitarray', 'bitstring', 'bleach', 'blosc2', 'bokeh',\ +'boost-histogram', 'brotli', 'cachetools', 'casadi', 'cbor-diag', 'certifi', 'cffi', 'cffi_example', 'cftime',\ +'charset-normalizer', 'clarabel', 'click', 'cligj', 'clingo', 'cloudpickle', 'cmyt', 'cobs', 'colorspacious',\ +'contourpy', 'coolprop', 'coverage', 'cramjam', 'crc32c', 'cryptography', 'css-inline', 'cssselect', 'cvxpy-base', 'cycler',\ +'cysignals', 'cytoolz', 'decorator', 'demes', 'deprecation', 'diskcache', 'distlib', 'distro', 'docutils', 'donfig',\ +'ewah_bool_utils', 'exceptiongroup', 'executing', 'fastapi', 'fastcan', 'fastparquet', 'fiona', 'fonttools', 'freesasa',\ +'frozenlist', 'fsspec', 'future', 'galpy', 'gmpy2', 'gsw', 'h11', 'h3', 'h5py', 'highspy', 'html5lib', 'httpcore',\ +'httpx', 'idna', 'igraph', 'imageio', 'imgui-bundle', 'iminuit', 'iniconfig', 'inspice', 'ipython', 'jedi', 'Jinja2',\ +'jiter', 'joblib', 'jsonpatch', 'jsonpointer', 'jsonschema', 
'jsonschema_specifications', 'kiwisolver',\ +'lakers-python', 'lazy_loader', 'lazy-object-proxy', 'libcst', 'lightgbm', 'logbook', 'lxml', 'lz4', 'MarkupSafe',\ +'matplotlib', 'matplotlib-inline', 'memory-allocator', 'micropip', 'mmh3', 'more-itertools', 'mpmath',\ +'msgpack', 'msgspec', 'msprime', 'multidict', 'munch', 'mypy', 'narwhals', 'ndindex', 'netcdf4', 'networkx',\ +'newick', 'nh3', 'nlopt', 'nltk', 'numcodecs', 'numpy', 'openai', 'opencv-python', 'optlang', 'orjson',\ +'packaging', 'pandas', 'parso', 'patsy', 'pcodec', 'peewee', 'pi-heif', 'Pillow', 'pillow-heif', 'pkgconfig',\ +'platformdirs', 'pluggy', 'ply', 'pplpy', 'primecountpy', 'prompt_toolkit', 'propcache', 'protobuf', 'pure-eval',\ +'py', 'pyclipper', 'pycparser', 'pycryptodome', 'pydantic', 'pydantic_core', 'pyerfa', 'pygame-ce', 'Pygments',\ +'pyheif', 'pyiceberg', 'pyinstrument', 'pylimer-tools', 'PyMuPDF', 'pynacl', 'pyodide-http', 'pyodide-unix-timezones',\ +'pyparsing', 'pyrsistent', 'pysam', 'pyshp', 'pytaglib', 'pytest', 'pytest-asyncio', 'pytest-benchmark', 'pytest_httpx',\ +'python-calamine', 'python-dateutil', 'python-flint', 'python-magic', 'python-sat', 'python-solvespace', 'pytz', 'pywavelets',\ +'pyxel', 'pyxirr', 'pyyaml', 'rasterio', 'rateslib', 'rebound', 'reboundx', 'referencing', 'regex', 'requests',\ +'retrying', 'rich', 'river', 'RobotRaconteur', 'rpds-py', 'ruamel.yaml', 'rustworkx', 'scikit-image', 'scikit-learn',\ +'scipy', 'screed', 'setuptools', 'shapely', 'simplejson', 'sisl', 'six', 'smart-open', 'sniffio', 'sortedcontainers',\ +'soundfile', 'soupsieve', 'sourmash', 'soxr', 'sparseqr', 'sqlalchemy', 'stack-data', 'starlette', 'statsmodels', 'strictyaml',\ +'svgwrite', 'swiglpk', 'sympy', 'tblib', 'termcolor', 'texttable', 'texture2ddecoder', 'threadpoolctl', 'tiktoken', 'tomli',\ +'tomli-w', 'toolz', 'tqdm', 'traitlets', 'traits', 'tree-sitter', 'tree-sitter-go', 'tree-sitter-java', 'tree-sitter-python',\ +'tskit', 'typing-extensions', 'tzdata', 'ujson', 
'uncertainties', 'unyt', 'urllib3', 'vega-datasets', 'vrplib', 'wcwidth',\ +'webencodings', 'wordcloud', 'wrapt', 'xarray', 'xgboost', 'xlrd', 'xxhash', 'xyzservices', 'yarl', 'yt', 'zengl', 'zfpy', 'zstandard'] + needed_pkg = [] + for pkg in supported_pkg: + if pkg.lower() in reqtext.lower(): + needed_pkg.append(pkg) + + project_file = os.path.join(directory, "pyproject.toml") + depstr = str(needed_pkg).replace("\'", "\"") + with open(project_file, 'w') as pf: + pf.write(f""" +[project] +name = "{benchmark.replace(".", "-")}-python-{language_version.replace(".", "")}" +version = "0.1.0" +description = "dummy description" +requires-python = ">={language_version}" +dependencies = {depstr} + +[dependency-groups] +dev = [ + "workers-py", + "workers-runtime-sdk" +] + """) + # move into function dir + funcdir = os.path.join(directory, "function") + if not os.path.exists(funcdir): + os.makedirs(funcdir) + + dont_move = ["handler.py", "function", "python_modules", "pyproject.toml"] + for thing in os.listdir(directory): + if thing not in dont_move: + src = os.path.join(directory, thing) + dest = os.path.join(directory, "function", thing) + shutil.move(src, dest) + + # Create package structure + CONFIG_FILES = { + "nodejs": ["handler.js", "package.json", "node_modules"], + "python": ["handler.py", "requirements.txt", "python_modules"], + } + + if language_name not in CONFIG_FILES: + raise NotImplementedError( + f"Language {language_name} is not yet supported for Cloudflare Workers" + ) + + # Verify the handler exists + handler_file = "handler.js" if language_name == "nodejs" else "handler.py" + package_path = os.path.join(directory, handler_file) + + if not os.path.exists(package_path): + if not os.path.exists(directory): + raise RuntimeError( + f"Package directory {directory} does not exist. " + "The benchmark build process may have failed to create the deployment package." + ) + raise RuntimeError( + f"Handler file {handler_file} not found in {directory}. 
" + f"Available files: {', '.join(os.listdir(directory)) if os.path.exists(directory) else 'none'}" + ) + + # Calculate total size of the package directory + total_size = 0 + for dirpath, dirnames, filenames in os.walk(directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + total_size += os.path.getsize(filepath) + + mbytes = total_size / 1024.0 / 1024.0 + self.logging.info(f"Worker package size: {mbytes:.2f} MB (Python: missing vendored modules)") + + return (directory, total_size, "") + + def shutdown(self): + """Shutdown CLI container if initialized.""" + if self._cli is not None: + self._cli.shutdown() + self._cli = None From 865ca06862bc961a2787a4295826bdf161ecb47e Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 16:46:15 +0100 Subject: [PATCH 066/230] use toml writer, cleaner approach to write wrangler.toml --- sebs/cloudflare/containers.py | 133 +++++++++++++--------------- sebs/cloudflare/durable_objects.py | 2 +- sebs/cloudflare/workers.py | 134 ++++++++++++++--------------- templates/wrangler-container.toml | 25 ++++++ templates/wrangler-worker.toml | 16 ++++ 5 files changed, 166 insertions(+), 144 deletions(-) create mode 100644 templates/wrangler-container.toml create mode 100644 templates/wrangler-worker.toml diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 842679adc..df077ce68 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -12,6 +12,15 @@ import re import time import tarfile +try: + import tomllib # Python 3.11+ +except ImportError: + import tomli as tomllib # Fallback for older Python +try: + import tomli_w +except ImportError: + # Fallback to basic TOML writing if tomli_w not available + import toml as tomli_w from typing import Optional, Tuple import docker @@ -74,102 +83,80 @@ def generate_wrangler_toml( Returns: Path to the generated wrangler.toml file """ - instance_type = "" - if benchmark_name and ("411.image-recognition" in 
benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name): + # Load template + template_path = os.path.join( + os.path.dirname(__file__), + "../..", + "templates", + "wrangler-container.toml" + ) + with open(template_path, 'rb') as f: + config = tomllib.load(f) + + # Update basic configuration + config['name'] = worker_name + config['account_id'] = account_id + + # Update container configuration with instance type if needed + if benchmark_name and ("411.image-recognition" in benchmark_name or + "311.compression" in benchmark_name or + "504.dna-visualisation" in benchmark_name): self.logging.warning("Using standard-4 instance type for high resource benchmark") - instance_type = '\ninstance_type = "standard-4" # 20GB Disk, 12GB Memory\n' + config['containers'][0]['instance_type'] = "standard-4" - toml_content = f"""name = "{worker_name}" -main = "worker.js" -compatibility_date = "2025-11-18" -account_id = "{account_id}" -compatibility_flags = ["nodejs_compat"] - -[observability] -enabled = true - -[[containers]] -max_instances = 10 -class_name = "ContainerWorker" -image = "./Dockerfile"{instance_type} - -# Durable Object binding for Container class (required by @cloudflare/containers) -[[durable_objects.bindings]] -name = "CONTAINER_WORKER" -class_name = "ContainerWorker" - -""" # Add nosql table bindings if benchmark uses them if code_package and code_package.uses_nosql: # Get registered nosql tables for this benchmark nosql_storage = self.system_resources.get_nosql_storage() if nosql_storage.retrieve_cache(benchmark_name): nosql_tables = nosql_storage._tables.get(benchmark_name, {}) + + # Add durable object bindings for each nosql table for table_name in nosql_tables.keys(): - toml_content += f"""# Durable Object binding for NoSQL table: {table_name} -[[durable_objects.bindings]] -name = "{table_name.upper()}" -class_name = "KVApiObject" - -""" - - # Add migrations for both ContainerWorker and KVApiObject - # Both need 
new_sqlite_classes (Container requires SQLite DO backend) - toml_content += """[[migrations]] -tag = "v1" -new_sqlite_classes = ["ContainerWorker", "KVApiObject"] - -""" - else: - # Container without nosql - only ContainerWorker migration - toml_content += """[[migrations]] -tag = "v1" -new_sqlite_classes = ["ContainerWorker"] - -""" - + config['durable_objects']['bindings'].append({ + 'name': table_name.upper(), + 'class_name': 'KVApiObject' + }) + + # Update migrations to include KVApiObject + config['migrations'][0]['new_sqlite_classes'].append('KVApiObject') + # Add environment variables - vars_content = "" - if benchmark_name: - vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' - - # Add nosql configuration if benchmark uses it - if code_package and code_package.uses_nosql: - vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' - - if vars_content: - toml_content += f"""# Environment variables -[vars] -{vars_content} -""" - - # Add R2 bucket binding for benchmarking files - r2_bucket_configured = False + if benchmark_name or (code_package and code_package.uses_nosql): + config['vars'] = {} + if benchmark_name: + config['vars']['BENCHMARK_NAME'] = benchmark_name + if code_package and code_package.uses_nosql: + config['vars']['NOSQL_STORAGE_DATABASE'] = "durable_objects" + + # Add R2 bucket binding try: from sebs.faas.config import Resources storage = self.system_resources.get_storage() bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) if bucket_name: - toml_content += f"""# R2 bucket binding for benchmarking files -# This bucket is used by fs and path polyfills to read benchmark data -[[r2_buckets]] -binding = "R2" -bucket_name = "{bucket_name}" - -""" - r2_bucket_configured = True + config['r2_buckets'] = [{ + 'binding': 'R2', + 'bucket_name': bucket_name + }] self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") except Exception as e: self.logging.warning( f"R2 bucket binding not configured: 
{e}. " f"Benchmarks requiring file access will not work properly." ) - + # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") - with open(toml_path, 'w') as f: - f.write(toml_content) - + try: + # Try tomli_w (writes binary) + with open(toml_path, 'wb') as f: + tomli_w.dump(config, f) + except TypeError: + # Fallback to toml library (writes text) + with open(toml_path, 'w') as f: + f.write(tomli_w.dumps(config)) + self.logging.info(f"Generated wrangler.toml at {toml_path}") return toml_path diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py index 4bb99c11e..258886cf3 100644 --- a/sebs/cloudflare/durable_objects.py +++ b/sebs/cloudflare/durable_objects.py @@ -180,7 +180,7 @@ def clear_table(self, name: str) -> str: :param name: table name :return: table name """ - self.logging.info(f"Durable Objects data is managed within the Worker") + self.logging.warning(f"Durable Objects data is managed within the Worker") return name def remove_table(self, name: str) -> str: diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index da4a5073d..f78f0aad1 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -10,6 +10,15 @@ import json import io import tarfile +try: + import tomllib # Python 3.11+ +except ImportError: + import tomli as tomllib # Fallback for older Python +try: + import tomli_w +except ImportError: + # Fallback to basic TOML writing if tomli_w not available + import toml as tomli_w from typing import Optional, Tuple from sebs.benchmark import Benchmark @@ -68,92 +77,77 @@ def generate_wrangler_toml( Returns: Path to the generated wrangler.toml file """ - # Native worker configuration - main_file = "dist/handler.js" if language == "nodejs" else "handler.py" - - # Build wrangler.toml content - toml_content = f"""name = "{worker_name}" -main = "{main_file}" -compatibility_date = "2025-11-18" -account_id = "{account_id}" -""" - + # Load template + 
template_path = os.path.join( + os.path.dirname(__file__), + "../..", + "templates", + "wrangler-worker.toml" + ) + with open(template_path, 'rb') as f: + config = tomllib.load(f) + + # Update basic configuration + config['name'] = worker_name + config['main'] = "dist/handler.js" if language == "nodejs" else "handler.py" + config['account_id'] = account_id + + # Add language-specific configuration if language == "nodejs": - toml_content += """# Use nodejs_compat for Node.js built-in support -compatibility_flags = ["nodejs_compat"] -no_bundle = true - -[build] -command = "node build.js" - -[[rules]] -type = "ESModule" -globs = ["**/*.js"] -fallthrough = true - -[[rules]] -type = "Text" -globs = ["**/*.html"] -fallthrough = true - -""" + config['compatibility_flags'] = ["nodejs_compat"] + config['no_bundle'] = True + config['build'] = {'command': 'node build.js'} + config['rules'] = [ + { + 'type': 'ESModule', + 'globs': ['**/*.js'], + 'fallthrough': True + }, + { + 'type': 'Text', + 'globs': ['**/*.html'], + 'fallthrough': True + } + ] elif language == "python": - toml_content += """# Enable Python Workers runtime -compatibility_flags = ["python_workers"] -""" - - toml_content += """ -[[durable_objects.bindings]] -name = "DURABLE_STORE" -class_name = "KVApiObject" - -[[migrations]] -tag = "v3" -new_classes = ["KVApiObject"] -""" - + config['compatibility_flags'] = ["python_workers"] + # Add environment variables - vars_content = "" - if benchmark_name: - vars_content += f'BENCHMARK_NAME = "{benchmark_name}"\n' - - # Add nosql configuration if benchmark uses it - if code_package and code_package.uses_nosql: - vars_content += 'NOSQL_STORAGE_DATABASE = "durable_objects"\n' - - if vars_content: - toml_content += f"""# Environment variables -[vars] -{vars_content} -""" - - # Add R2 bucket binding for benchmarking files - r2_bucket_configured = False + if benchmark_name or (code_package and code_package.uses_nosql): + config['vars'] = {} + if benchmark_name: + 
config['vars']['BENCHMARK_NAME'] = benchmark_name + if code_package and code_package.uses_nosql: + config['vars']['NOSQL_STORAGE_DATABASE'] = "durable_objects" + + # Add R2 bucket binding try: from sebs.faas.config import Resources storage = self.system_resources.get_storage() bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) if bucket_name: - toml_content += f"""# R2 bucket binding for benchmarking files -# This bucket is used by fs and path polyfills to read benchmark data -[[r2_buckets]] -binding = "R2" -bucket_name = "{bucket_name}" - -""" - r2_bucket_configured = True + config['r2_buckets'] = [{ + 'binding': 'R2', + 'bucket_name': bucket_name + }] self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") except Exception as e: self.logging.warning( f"R2 bucket binding not configured: {e}. " f"Benchmarks requiring file access will not work properly." ) - + # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") - with open(toml_path, 'w') as f: - f.write(toml_content) - + try: + # Try tomli_w (writes binary) + with open(toml_path, 'wb') as f: + tomli_w.dump(config, f) + except TypeError: + # Fallback to toml library (writes text) + with open(toml_path, 'w') as f: + f.write(tomli_w.dumps(config)) + self.logging.info(f"Generated wrangler.toml at {toml_path}") return toml_path diff --git a/templates/wrangler-container.toml b/templates/wrangler-container.toml new file mode 100644 index 000000000..d8e08fe33 --- /dev/null +++ b/templates/wrangler-container.toml @@ -0,0 +1,25 @@ +# Template for Cloudflare Container Workers +# This file is read and modified by the deployment system + +name = "PLACEHOLDER_WORKER_NAME" +main = "worker.js" +compatibility_date = "2025-11-18" +account_id = "PLACEHOLDER_ACCOUNT_ID" +compatibility_flags = ["nodejs_compat"] + +[observability] +enabled = true + +[[containers]] +max_instances = 10 +class_name = "ContainerWorker" +image = "./Dockerfile" + +# 
Durable Object binding for Container class (required by @cloudflare/containers) +[[durable_objects.bindings]] +name = "CONTAINER_WORKER" +class_name = "ContainerWorker" + +[[migrations]] +tag = "v1" +new_sqlite_classes = ["ContainerWorker"] diff --git a/templates/wrangler-worker.toml b/templates/wrangler-worker.toml new file mode 100644 index 000000000..b11821281 --- /dev/null +++ b/templates/wrangler-worker.toml @@ -0,0 +1,16 @@ +# Template for native Cloudflare Workers +# This file is read and modified by the deployment system + +name = "PLACEHOLDER_WORKER_NAME" +main = "PLACEHOLDER_MAIN_FILE" +compatibility_date = "2025-11-18" +account_id = "PLACEHOLDER_ACCOUNT_ID" + +# Durable Object binding for NoSQL storage +[[durable_objects.bindings]] +name = "DURABLE_STORE" +class_name = "KVApiObject" + +[[migrations]] +tag = "v3" +new_classes = ["KVApiObject"] From 20eb8db129102f06a7b0f88dd918daff49c44519 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 16:55:12 +0100 Subject: [PATCH 067/230] accidental capitalization of the table name resulting in errors in 130 --- sebs/cloudflare/containers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index df077ce68..5b1fd9fd4 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -114,7 +114,7 @@ def generate_wrangler_toml( # Add durable object bindings for each nosql table for table_name in nosql_tables.keys(): config['durable_objects']['bindings'].append({ - 'name': table_name.upper(), + 'name': table_name, 'class_name': 'KVApiObject' }) From ebe07946df7d95bcab37e7b5d9b30ef7409d75a3 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 16:57:45 +0100 Subject: [PATCH 068/230] warning that locationHint is not supported --- docs/storage.md | 3 +++ sebs/cloudflare/r2.py | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/storage.md b/docs/storage.md index bf33ca071..1a4ee4573 
100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -146,6 +146,9 @@ Cloudflare R2 provides S3-compatible object storage for benchmarks that require **Configuration:** R2 configuration is handled automatically by SeBS when deploying to Cloudflare Workers. The storage resources are defined in your deployment configuration and SeBS manages bucket creation and access. +**Limitations:** +- Geographic location hints (locationHint) are not currently supported. R2 buckets are created with Cloudflare's automatic location selection, which places data near where it's most frequently accessed. + ### Durable Objects for NoSQL Cloudflare Durable Objects provide stateful storage for NoSQL operations required by benchmarks like the CRUD API (130.crud-api). diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 660588e1f..ebc2d684d 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -75,8 +75,14 @@ def _create_bucket( # R2 API only accepts "name" parameter - locationHint is optional and must be one of: # "apac", "eeur", "enam", "weur", "wnam" - # For now, just send the name without locationHint + # WARNING: locationHint is not currently supported by SeBS. Buckets are created + # with Cloudflare's automatic location selection. params = {"name": name} + + self.logging.warning( + f"Creating R2 bucket '{name}' without locationHint. " + "Geographic location is determined automatically by Cloudflare." + ) try: create_bucket_response = requests.post( From 9dd0a6e2290207a217ea443daa580c07f41df689 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 18 Jan 2026 17:12:13 +0100 Subject: [PATCH 069/230] s3 client for r2 storage code duplication removed. 
and also removed librarytrigger --- sebs/cloudflare/cloudflare.py | 14 +--- sebs/cloudflare/function.py | 11 +-- sebs/cloudflare/r2.py | 128 ++++++++++++++-------------------- sebs/cloudflare/triggers.py | 53 -------------- 4 files changed, 59 insertions(+), 147 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index e61b4d4aa..a9b9a7be4 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -514,11 +514,7 @@ def cached_function(self, function: Function): Args: function: The cached function """ - from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger - - for trigger in function.triggers(Trigger.TriggerType.LIBRARY): - trigger.logging_handlers = self.logging_handlers - cast(LibraryTrigger, trigger).deployment_client = self + from sebs.cloudflare.triggers import HTTPTrigger for trigger in function.triggers(Trigger.TriggerType.HTTP): trigger.logging_handlers = self.logging_handlers @@ -779,15 +775,11 @@ def create_trigger( Returns: The created trigger """ - from sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger + from sebs.cloudflare.triggers import HTTPTrigger worker = cast(CloudflareWorker, function) - if trigger_type == Trigger.TriggerType.LIBRARY: - trigger = LibraryTrigger(worker.name, self) - trigger.logging_handlers = self.logging_handlers - return trigger - elif trigger_type == Trigger.TriggerType.HTTP: + if trigger_type == Trigger.TriggerType.HTTP: account_id = worker.account_id or self.config.credentials.account_id worker_url = self._build_workers_dev_url(worker.name, account_id) trigger = HTTPTrigger(worker.name, worker_url) diff --git a/sebs/cloudflare/function.py b/sebs/cloudflare/function.py index c3773818f..cd422dc30 100644 --- a/sebs/cloudflare/function.py +++ b/sebs/cloudflare/function.py @@ -40,7 +40,7 @@ def serialize(self) -> dict: @staticmethod def deserialize(cached_config: dict) -> "CloudflareWorker": from sebs.faas.function import Trigger - from 
sebs.cloudflare.triggers import LibraryTrigger, HTTPTrigger + from sebs.cloudflare.triggers import HTTPTrigger cfg = FunctionConfig.deserialize(cached_config["config"]) ret = CloudflareWorker( @@ -54,14 +54,7 @@ def deserialize(cached_config: dict) -> "CloudflareWorker": ) for trigger in cached_config["triggers"]: - mapping = { - LibraryTrigger.typename(): LibraryTrigger, - HTTPTrigger.typename(): HTTPTrigger - } - trigger_type = cast( - Trigger, - mapping.get(trigger["type"]), - ) + trigger_type = HTTPTrigger if trigger["type"] == HTTPTrigger.typename() else None assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) ret.add_trigger(trigger_type.deserialize(trigger)) diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index ebc2d684d..45a1167c6 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -35,6 +35,7 @@ def __init__( ): super().__init__(region, cache_client, resources, replace_existing) self._credentials = credentials + self._s3_client = None def _get_auth_headers(self) -> dict[str, str]: """Get authentication headers for Cloudflare API requests.""" @@ -52,6 +53,46 @@ def _get_auth_headers(self) -> dict[str, str]: else: raise RuntimeError("Invalid Cloudflare credentials configuration") + def _get_s3_client(self): + """ + Get or initialize the S3-compatible client for R2 operations. + + :return: boto3 S3 client or None if credentials not available + """ + if self._s3_client is not None: + return self._s3_client + + # Check if we have S3-compatible credentials + if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: + self.logging.warning( + "R2 S3-compatible API credentials not configured. " + "Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY environment variables." 
+ ) + return None + + try: + import boto3 + from botocore.config import Config + + account_id = self._credentials.account_id + + self._s3_client = boto3.client( + 's3', + endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com', + aws_access_key_id=self._credentials.r2_access_key_id, + aws_secret_access_key=self._credentials.r2_secret_access_key, + config=Config(signature_version='s3v4'), + region_name='auto' + ) + + return self._s3_client + + except ImportError: + self.logging.warning( + "boto3 not available. Install with: pip install boto3" + ) + return None + def correct_name(self, name: str) -> str: return name @@ -142,33 +183,12 @@ def upload(self, bucket_name: str, filepath: str, key: str): :param filepath: local source filepath :param key: R2 destination key/path """ + s3_client = self._get_s3_client() + if s3_client is None: + self.logging.warning(f"Cannot upload {filepath} to R2 - S3 client not available") + return + try: - import boto3 - from botocore.config import Config - - account_id = self._credentials.account_id - - # R2 uses S3-compatible API, but requires special configuration - # The endpoint is: https://.r2.cloudflarestorage.com - # You need to create R2 API tokens in the Cloudflare dashboard - - # Check if we have S3-compatible credentials - if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: - self.logging.warning( - "R2 upload requires S3-compatible API credentials (r2_access_key_id, r2_secret_access_key). " - "File upload skipped. Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY." 
- ) - return - - s3_client = boto3.client( - 's3', - endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com', - aws_access_key_id=self._credentials.r2_access_key_id, - aws_secret_access_key=self._credentials.r2_secret_access_key, - config=Config(signature_version='s3v4'), - region_name='auto' - ) - with open(filepath, 'rb') as f: s3_client.put_object( Bucket=bucket_name, @@ -178,11 +198,6 @@ def upload(self, bucket_name: str, filepath: str, key: str): self.logging.debug(f"Uploaded {filepath} to R2 bucket {bucket_name} as {key}") - except ImportError: - self.logging.warning( - "boto3 not available. Install with: pip install boto3. " - "File upload to R2 skipped." - ) except Exception as e: self.logging.warning(f"Failed to upload {filepath} to R2: {e}") @@ -194,28 +209,12 @@ def upload_bytes(self, bucket_name: str, key: str, data: bytes): :param key: R2 destination key/path :param data: bytes to upload """ + s3_client = self._get_s3_client() + if s3_client is None: + self.logging.warning(f"Cannot upload bytes to R2 - S3 client not available") + return + try: - import boto3 - from botocore.config import Config - - account_id = self._credentials.account_id - - if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: - self.logging.warning( - "R2 upload requires S3-compatible API credentials (r2_access_key_id, r2_secret_access_key). " - "Upload skipped. Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY environment variables." 
- ) - return - - s3_client = boto3.client( - 's3', - endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com', - aws_access_key_id=self._credentials.r2_access_key_id, - aws_secret_access_key=self._credentials.r2_secret_access_key, - config=Config(signature_version='s3v4'), - region_name='auto' - ) - s3_client.put_object( Bucket=bucket_name, Key=key, @@ -224,10 +223,6 @@ def upload_bytes(self, bucket_name: str, key: str, data: bytes): self.logging.debug(f"Uploaded {len(data)} bytes to R2 bucket {bucket_name} as {key}") - except ImportError: - self.logging.warning( - "boto3 not available. Install with: pip install boto3" - ) except Exception as e: self.logging.warning(f"Failed to upload bytes to R2: {e}") @@ -246,27 +241,12 @@ def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: :param prefix: optional prefix filter :return: list of files in a given bucket """ - # Use S3-compatible API with R2 credentials - if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: - self.logging.warning(f"R2 S3 credentials not configured, cannot list bucket {bucket_name}") + s3_client = self._get_s3_client() + if s3_client is None: + self.logging.warning(f"Cannot list R2 bucket {bucket_name} - S3 client not available") return [] try: - import boto3 - from botocore.config import Config - - account_id = self._credentials.account_id - r2_endpoint = f"https://{account_id}.r2.cloudflarestorage.com" - - s3_client = boto3.client( - 's3', - endpoint_url=r2_endpoint, - aws_access_key_id=self._credentials.r2_access_key_id, - aws_secret_access_key=self._credentials.r2_secret_access_key, - config=Config(signature_version='s3v4'), - region_name='auto' - ) - # List objects with optional prefix paginator = s3_client.get_paginator('list_objects_v2') page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index f4b926379..cecd0338f 100644 --- 
a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -4,59 +4,6 @@ from sebs.faas.function import Trigger, ExecutionResult -class LibraryTrigger(Trigger): - """ - Library trigger for Cloudflare Workers. - Allows invoking workers programmatically via the Cloudflare API. - """ - - def __init__(self, worker_name: str, deployment_client=None): - super().__init__() - self.worker_name = worker_name - self.deployment_client = deployment_client - - @staticmethod - def typename() -> str: - return "Cloudflare.LibraryTrigger" - - @staticmethod - def trigger_type() -> Trigger.TriggerType: - return Trigger.TriggerType.LIBRARY - - def sync_invoke(self, payload: dict) -> ExecutionResult: - """ - Synchronously invoke a Cloudflare Worker. - - Args: - payload: The payload to send to the worker - - Returns: - ExecutionResult with performance metrics - """ - # This will be implemented when we have the deployment client - raise NotImplementedError("Cloudflare Worker invocation not yet implemented") - - def async_invoke(self, payload: dict) -> concurrent.futures.Future: - """ - Asynchronously invoke a Cloudflare Worker. - Not typically supported for Cloudflare Workers. - """ - raise NotImplementedError("Cloudflare Workers do not support async invocation") - - def serialize(self) -> dict: - """Serialize the LibraryTrigger.""" - return { - "type": self.typename(), - "worker_name": self.worker_name, - } - - @staticmethod - def deserialize(cached_config: dict) -> "LibraryTrigger": - """Deserialize a LibraryTrigger from cached config.""" - from sebs.cloudflare.triggers import LibraryTrigger - return LibraryTrigger(cached_config["worker_name"]) - - class HTTPTrigger(Trigger): """ HTTP trigger for Cloudflare Workers. 
From b4f08a1a3c08356c4d7d1a0b824f090796257390 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 26 Mar 2026 12:22:23 +0100 Subject: [PATCH 070/230] feat(cloudflare): Introduce KVStore for NoSQL storage and remove Durable Objects - Added KVStore class for Cloudflare KV-backed NoSQL storage, replacing Durable Objects. - Updated Cloudflare system resources to return KVStore instance. - Modified Cloudflare Workers deployment to support KV namespace bindings. - Adjusted wrangler.toml template to remove Durable Object bindings. - Enhanced Cloudflare class to utilize the new KVStore for data operations. - Updated function.py to include CLOUDFLARE variant in runtime options. - Refactored code to accommodate changes in NoSQL storage handling and deployment processes. --- .../100.webapps/110.dynamic-html/config.json | 6 +- .../100.webapps/120.uploader/config.json | 5 +- .../nodejs/cloudflare/function.js | 31 ++ .../function.py} | 0 .../300.utilities/311.compression/config.json | 5 +- .../cloudflare/nodejs/container/worker.js | 159 +++++--- .../wrappers/cloudflare/nodejs/nosql.js | 124 ++++-- .../wrappers/cloudflare/python/handler.py | 8 +- .../wrappers/cloudflare/python/nosql.py | 82 ++-- configs/systems.json | 2 + sebs/cloudflare/cloudflare.py | 139 +++++-- sebs/cloudflare/containers.py | 27 +- sebs/cloudflare/durable_objects.py | 229 ----------- sebs/cloudflare/kvstore.py | 359 ++++++++++++++++++ sebs/cloudflare/resources.py | 11 +- sebs/cloudflare/workers.py | 97 +++-- sebs/faas/function.py | 2 + sebs/utils.py | 2 +- templates/wrangler-worker.toml | 9 - 19 files changed, 842 insertions(+), 455 deletions(-) create mode 100644 benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js rename benchmarks/100.webapps/120.uploader/python/{function_cloudflare.py => cloudflare/function.py} (100%) delete mode 100644 sebs/cloudflare/durable_objects.py create mode 100644 sebs/cloudflare/kvstore.py diff --git a/benchmarks/100.webapps/110.dynamic-html/config.json 
b/benchmarks/100.webapps/110.dynamic-html/config.json index 7e317037b..2f8e4f6dc 100644 --- a/benchmarks/100.webapps/110.dynamic-html/config.json +++ b/benchmarks/100.webapps/110.dynamic-html/config.json @@ -1,6 +1,10 @@ { "timeout": 10, "memory": 128, - "languages": ["python", "nodejs", "java"], + "languages": [ + "python", + "nodejs", + "java" + ], "modules": [] } diff --git a/benchmarks/100.webapps/120.uploader/config.json b/benchmarks/100.webapps/120.uploader/config.json index cbc635670..b8bc9f0f9 100644 --- a/benchmarks/100.webapps/120.uploader/config.json +++ b/benchmarks/100.webapps/120.uploader/config.json @@ -1,6 +1,9 @@ { "timeout": 30, "memory": 128, - "languages": ["python", "nodejs"], + "languages": [ + { "language": "python", "variants": ["default", "cloudflare"] }, + { "language": "nodejs", "variants": ["default", "cloudflare"] } + ], "modules": ["storage"] } diff --git a/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js b/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js new file mode 100644 index 000000000..5efc8103f --- /dev/null +++ b/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js @@ -0,0 +1,31 @@ +// Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. 
+import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { storage } from './storage'; + +let storage_handler = new storage(); + +export const handler = async function(event) { + let bucket = event.bucket.bucket; + let output_prefix = event.bucket.output; + let url = event.object.url; + let upload_key = path.basename(url); + let download_path = path.join('/tmp', upload_key); + + const response = await fetch(url, { + headers: { + 'User-Agent': 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2' + } + }); + const buffer = await response.arrayBuffer(); + fs.writeFileSync(download_path, Buffer.from(buffer)); + + let [keyName, uploadPromise] = storage_handler.upload( + bucket, + path.join(output_prefix, upload_key), + download_path + ); + await uploadPromise; + + return {bucket: bucket, url: url, key: keyName}; +}; diff --git a/benchmarks/100.webapps/120.uploader/python/function_cloudflare.py b/benchmarks/100.webapps/120.uploader/python/cloudflare/function.py similarity index 100% rename from benchmarks/100.webapps/120.uploader/python/function_cloudflare.py rename to benchmarks/100.webapps/120.uploader/python/cloudflare/function.py diff --git a/benchmarks/300.utilities/311.compression/config.json b/benchmarks/300.utilities/311.compression/config.json index 8edb99e52..3f0f9238b 100644 --- a/benchmarks/300.utilities/311.compression/config.json +++ b/benchmarks/300.utilities/311.compression/config.json @@ -1,6 +1,9 @@ { "timeout": 60, "memory": 256, - "languages": ["python", "nodejs"], + "languages": [ + "python", + { "language": "nodejs", "variants": ["default", "cloudflare"] } + ], "modules": ["storage"] } diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index 8dee914a0..ba5eabcae 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -1,5 +1,4 @@ import { 
Container, getContainer } from "@cloudflare/containers"; -import { DurableObject } from "cloudflare:workers"; // Container wrapper class export class ContainerWorker extends Container { @@ -7,42 +6,6 @@ export class ContainerWorker extends Container { sleepAfter = "30m"; } -// Durable Object for NoSQL storage (simple proxy to ctx.storage) -export class KVApiObject extends DurableObject { - constructor(ctx, env) { - super(ctx, env); - } - - async insert(key, value) { - await this.ctx.storage.put(key.join(':'), value); - return { success: true }; - } - - async update(key, value) { - await this.ctx.storage.put(key.join(':'), value); - return { success: true }; - } - - async get(key) { - const value = await this.ctx.storage.get(key.join(':')); - return { data: value || null }; - } - - async query(keyPrefix) { - const list = await this.ctx.storage.list(); - const items = []; - for (const [k, v] of list) { - items.push(v); - } - return { items }; - } - - async delete(key) { - await this.ctx.storage.delete(key.join(':')); - return { success: true }; - } -} - export default { async fetch(request, env) { const url = new URL(request.url); @@ -148,7 +111,7 @@ export default { }; /** - * Handle NoSQL (Durable Object) requests proxied from the container + * Handle NoSQL (KV namespace) requests proxied from the container * Routes: * - POST /nosql/insert - insert item * - POST /nosql/update - update item @@ -165,38 +128,120 @@ async function handleNoSQLRequest(request, env) { const params = await request.json(); const { table_name, primary_key, secondary_key, secondary_key_name, data } = params; - // Get Durable Object stub - table_name should match the DO class name - if (!env[table_name]) { + const table = env[table_name]; + if (!table || typeof table.get !== 'function' || typeof table.put !== 'function') { return new Response(JSON.stringify({ - error: `Durable Object binding '${table_name}' not found` + error: `KV namespace binding '${table_name}' not found` }), { status: 500, 
headers: { 'Content-Type': 'application/json' } }); } - - // Create DO ID from primary key - const doId = env[table_name].idFromName(primary_key.join(':')); - const doStub = env[table_name].get(doId); - - // Forward operation to Durable Object + + const indexKey = `__sebs_idx__${primary_key[1]}`; + const readIndex = async () => { + const raw = await table.get(indexKey); + if (!raw) { + return []; + } + try { + const parsed = JSON.parse(raw); + return Array.isArray(parsed) ? parsed : []; + } catch { + return []; + } + }; + const writeIndex = async (values) => { + await table.put(indexKey, JSON.stringify(values)); + }; + + const prefix = `${primary_key[1]}#`; + let result; switch (operation) { - case 'insert': - result = await doStub.insert(secondary_key, data); + case 'insert': { + const compositeKey = `${primary_key[1]}#${secondary_key[1]}`; + const keyData = { ...data }; + keyData[primary_key[0]] = primary_key[1]; + keyData[secondary_key[0]] = secondary_key[1]; + await table.put(compositeKey, JSON.stringify(keyData)); + const index = await readIndex(); + if (!index.includes(secondary_key[1])) { + index.push(secondary_key[1]); + await writeIndex(index); + } + result = { success: true }; break; - case 'update': - result = await doStub.update(secondary_key, data); + } + case 'update': { + const compositeKey = `${primary_key[1]}#${secondary_key[1]}`; + const existingRaw = await table.get(compositeKey); + let existing = {}; + if (existingRaw) { + try { + existing = JSON.parse(existingRaw); + } catch { + existing = {}; + } + } + const merged = { ...existing, ...data }; + merged[primary_key[0]] = primary_key[1]; + merged[secondary_key[0]] = secondary_key[1]; + await table.put(compositeKey, JSON.stringify(merged)); + const index = await readIndex(); + if (!index.includes(secondary_key[1])) { + index.push(secondary_key[1]); + await writeIndex(index); + } + result = { success: true }; break; - case 'get': - result = await doStub.get(secondary_key); + } + case 'get': { + 
const compositeKey = `${primary_key[1]}#${secondary_key[1]}`; + const raw = await table.get(compositeKey); + if (raw === null) { + result = { data: null }; + } else { + try { + result = { data: JSON.parse(raw) }; + } catch { + result = { data: raw }; + } + } break; - case 'query': - result = await doStub.query(secondary_key_name); + } + case 'query': { + let secondaryKeys = await readIndex(); + if (secondaryKeys.length === 0) { + const list = await table.list({ prefix }); + secondaryKeys = (list.keys || []).map((k) => k.name.split('#').slice(1).join('#')); + } + const items = []; + for (const secondaryValue of secondaryKeys) { + const raw = await table.get(`${primary_key[1]}#${secondaryValue}`); + if (raw === null) { + continue; + } + try { + items.push(JSON.parse(raw)); + } catch { + items.push(raw); + } + } + result = { items }; break; - case 'delete': - result = await doStub.delete(secondary_key); + } + case 'delete': { + const compositeKey = `${primary_key[1]}#${secondary_key[1]}`; + await table.delete(compositeKey); + const index = await readIndex(); + const next = index.filter((v) => v !== secondary_key[1]); + if (next.length !== index.length) { + await writeIndex(next); + } + result = { success: true }; break; + } default: return new Response(JSON.stringify({ error: 'Unknown NoSQL operation' diff --git a/benchmarks/wrappers/cloudflare/nodejs/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/nosql.js index 67b73a1fd..b12dfa8b1 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/nosql.js +++ b/benchmarks/wrappers/cloudflare/nodejs/nosql.js @@ -1,5 +1,5 @@ // NoSQL wrapper for Cloudflare Workers -// Uses Durable Objects for storage +// Uses KV namespaces for storage // Returns Promises that the handler will resolve class nosql { @@ -12,31 +12,59 @@ class nosql { if (!nosql.instance) { nosql.instance = new nosql(); } - + if (entry && entry.env) { nosql.instance.env = entry.env; + // Share env globally so bundled copies of this module (inlined by esbuild + // into 
function.js) can also reach the live KV bindings. + globalThis._nosqlEnv = entry.env; } } _get_table(tableName) { - // Don't cache stubs - they are request-scoped and cannot be reused - // Always create a fresh stub for each request - if (!this.env) { + // Fall back to the global env bridge for copies of this class that were + // inlined by esbuild into a separate bundle (e.g. function.js) and + // therefore have a different static `instance` from the one initialized + // by handler.js via `import('./nosql.js')`. + const env = this.env || globalThis._nosqlEnv; + if (!env) { throw new Error(`nosql env not initialized for table ${tableName}`); } - - if (!this.env.DURABLE_STORE) { - // Debug: log what we have - const envKeys = Object.keys(this.env || {}); - const durableStoreType = typeof this.env.DURABLE_STORE; + + const table = env[tableName]; + if (!table || typeof table.get !== 'function' || typeof table.put !== 'function') { + const envKeys = Object.keys(env || {}); throw new Error( - `DURABLE_STORE binding not found. env keys: [${envKeys.join(', ')}], DURABLE_STORE type: ${durableStoreType}` + `KV binding '${tableName}' not found. env keys: [${envKeys.join(', ')}]` ); } - - // Get a Durable Object ID based on the table name and create a fresh stub - const id = this.env.DURABLE_STORE.idFromName(tableName); - return this.env.DURABLE_STORE.get(id); + + return table; + } + + _key(primaryKey, secondaryKey) { + return `${primaryKey[1]}#${secondaryKey[1]}`; + } + + _indexKey(primaryKey) { + return `__sebs_idx__${primaryKey[1]}`; + } + + async _readIndex(table, primaryKey) { + const raw = await table.get(this._indexKey(primaryKey)); + if (raw === null) { + return []; + } + try { + const parsed = JSON.parse(raw); + return Array.isArray(parsed) ? 
parsed : []; + } catch { + return []; + } + } + + async _writeIndex(table, primaryKey, values) { + await table.put(this._indexKey(primaryKey), JSON.stringify(values)); } // Async methods - build.js will patch function.js to await these @@ -45,18 +73,28 @@ class nosql { keyData[primaryKey[0]] = primaryKey[1]; keyData[secondaryKey[0]] = secondaryKey[1]; - const durableObjStub = this._get_table(tableName); - const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; - - await durableObjStub.put(compositeKey, keyData); + const table = this._get_table(tableName); + await table.put(this._key(primaryKey, secondaryKey), JSON.stringify(keyData)); + + const index = await this._readIndex(table, primaryKey); + if (!index.includes(secondaryKey[1])) { + index.push(secondaryKey[1]); + await this._writeIndex(table, primaryKey, index); + } } async get(tableName, primaryKey, secondaryKey) { - const durableObjStub = this._get_table(tableName); - const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; + const table = this._get_table(tableName); + const raw = await table.get(this._key(primaryKey, secondaryKey)); + if (raw === null) { + return null; + } - const result = await durableObjStub.get(compositeKey); - return result || null; + try { + return JSON.parse(raw); + } catch { + return raw; + } } async update(tableName, primaryKey, secondaryKey, updates) { @@ -66,25 +104,41 @@ class nosql { } async query(tableName, primaryKey, secondaryKeyName) { - const durableObjStub = this._get_table(tableName); - const prefix = `${primaryKey[1]}#`; - - // List all keys with the prefix - const allEntries = await durableObjStub.list({ prefix }); + const table = this._get_table(tableName); + let secondaryKeys = await this._readIndex(table, primaryKey); + + // Fallback for legacy namespaces without explicit index key. 
+ if (secondaryKeys.length === 0) { + const listed = await table.list({ prefix: `${primaryKey[1]}#` }); + secondaryKeys = (listed.keys || []).map((k) => k.name.split('#').slice(1).join('#')); + } + const results = []; - - for (const [key, value] of allEntries) { - results.push(value); + + for (const secondaryValue of secondaryKeys) { + const raw = await table.get(`${primaryKey[1]}#${secondaryValue}`); + if (raw === null) { + continue; + } + try { + results.push(JSON.parse(raw)); + } catch { + results.push(raw); + } } - + return results; } async delete(tableName, primaryKey, secondaryKey) { - const durableObjStub = this._get_table(tableName); - const compositeKey = `${primaryKey[1]}#${secondaryKey[1]}`; + const table = this._get_table(tableName); + await table.delete(this._key(primaryKey, secondaryKey)); - await durableObjStub.delete(compositeKey); + const index = await this._readIndex(table, primaryKey); + const next = index.filter((v) => v !== secondaryKey[1]); + if (next.length !== index.length) { + await this._writeIndex(table, primaryKey, next); + } } static get_instance() { diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 19eff8baf..6674e34d5 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -9,7 +9,7 @@ except ImportError: # Pyodide (Python native workers) doesn't support resource module HAS_RESOURCE = False -from workers import WorkerEntrypoint, Response, DurableObject +from workers import WorkerEntrypoint, Response from js import fetch as js_fetch, URL ## sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) @@ -26,10 +26,6 @@ """ -class KVApiObject(DurableObject): - def __getattr__(self, name): - return getattr(self.ctx.storage, name) - class Default(WorkerEntrypoint): async def fetch(self, request, env): try: @@ -196,5 +192,3 @@ def make_benchmark_func(): ##print() with 
open("/tmp/function.py", "w") as wf: wf.write(new_source) - - diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index 105590ad5..b5374cbd7 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -122,10 +122,13 @@ def init_instance(entry: WorkerEntrypoint): nosql_kv.instance.env = entry.env def key_maker(self, key1, key2): - return f"({key1[0]},{str(key1[1])})+({key2[0]},{key2[1]})" + return f"{key1[1]}#{key2[1]}" def key_maker_partial(self, key1, key2): - return f"({key1[0]},{str(key1[1])})+({key2[0]}" + return f"{key1[1]}#" + + def index_key(self, primary_key): + return f"__sebs_idx__{primary_key[1]}" def get_table(self, table_name): return getattr(self.env, (table_name)) @@ -137,11 +140,23 @@ def insert( secondary_key: Tuple[str, str], data: dict, ): - put_res = ( - run_sync(self.get_table(table_name).put( + key_data = {**data} + key_data[primary_key[0]] = primary_key[1] + key_data[secondary_key[0]] = secondary_key[1] + put_res = run_sync( + self.get_table(table_name).put( self.key_maker(primary_key, secondary_key), - json.dumps(data)) - )) + json.dumps(key_data), + ) + ) + + idx_raw = run_sync(self.get_table(table_name).get(self.index_key(primary_key))) + idx = [] + if idx_raw is not None and len(idx_raw) > 0: + idx = json.loads(idx_raw) + if secondary_key[1] not in idx: + idx.append(secondary_key[1]) + run_sync(self.get_table(table_name).put(self.index_key(primary_key), json.dumps(idx))) return def update( @@ -151,11 +166,18 @@ def update( secondary_key: Tuple[str, str], data: dict, ): + existing = self.get(table_name, primary_key, secondary_key) + if existing is None: + existing = {} + merged = {**existing, **data} + merged[primary_key[0]] = primary_key[1] + merged[secondary_key[0]] = secondary_key[1] put_res = run_sync( self.get_table(table_name).put( self.key_maker(primary_key, secondary_key), - json.dumps(data) - )) + json.dumps(merged), + 
) + ) return def get( @@ -165,7 +187,11 @@ def get( self.get_table(table_name).get( self.key_maker(primary_key, secondary_key) )) - return get_res + if get_res is None: + return None + if isinstance(get_res, dict): + return get_res + return json.loads(get_res) """ This query must involve partition key - it does not scan across partitions. @@ -174,30 +200,34 @@ def get( def query( self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str ) -> List[dict]: - _options = {"prefix" : self.key_maker_partial(primary_key, (secondary_key_name,) )} - list_res = run_sync(self.get_table(table_name).list(options=_options)) + idx_raw = run_sync(self.get_table(table_name).get(self.index_key(primary_key))) + idx = [] + if idx_raw is not None and len(idx_raw) > 0: + idx = json.loads(idx_raw) - keys = [] - for key in list_res.keys: - keys.append(key.name) - ##print("keys", keys) - assert len(keys) <= 100 - - - # todo: please use bulk sometime (it didn't work when i tried it) res = [] - for key in keys: - + for secondary_key_value in idx: + key = f"{primary_key[1]}#{secondary_key_value}" get_res = run_sync(self.get_table(table_name).get(key)) - get_res = get_res.replace("\'", "\"") - ##print("gr", get_res) - - res.append(json.loads(get_res)) + if get_res is None: + continue + if isinstance(get_res, dict): + res.append(get_res) + else: + res.append(json.loads(get_res)) return res def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): run_sync(self.get_table(table_name).delete(self.key_maker(primary_key, secondary_key))) + idx_raw = run_sync(self.get_table(table_name).get(self.index_key(primary_key))) + idx = [] + if idx_raw is not None and len(idx_raw) > 0: + idx = json.loads(idx_raw) + if secondary_key[1] in idx: + idx = [v for v in idx if v != secondary_key[1]] + run_sync(self.get_table(table_name).put(self.index_key(primary_key), json.dumps(idx))) + return @staticmethod @@ -209,4 +239,4 @@ def get_instance(): -nosql = 
nosql_do +nosql = nosql_kv diff --git a/configs/systems.json b/configs/systems.json index f65174f0c..b5bda5336 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -448,6 +448,7 @@ } }, "images": [], + "supported_variants": ["default", "cloudflare"], "deployment": { "files": [ "handler.py", @@ -481,6 +482,7 @@ } }, "images": [], + "supported_variants": ["default", "cloudflare"], "deployment": { "files": [ "handler.js", diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index a9b9a7be4..407ff0346 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -19,6 +19,64 @@ from sebs.faas.function import Function, ExecutionResult, Trigger, FunctionConfig from sebs.faas.system import System from sebs.faas.config import Resources +from sebs.sebs_types import Language + + +class _CloudflareContainerAdapter: + """Duck-typed adapter that satisfies benchmark.build()'s container_client contract. + + benchmark.build() calls container_client.build_base_image() when + container_deployment=True and asserts the client is not None. Cloudflare + builds its container images inside package_code (via containers.py), not + through a registry-backed DockerContainer, so this adapter bridges the gap + without touching the framework. + """ + + def __init__(self, containers_deployment: CloudflareContainersDeployment): + self._containers = containers_deployment + # Populated by build_base_image() so create_function() can find the dir. + self.last_directory: Optional[str] = None + + def build_base_image( + self, + directory: str, + language, # sebs.sebs_types.Language enum + language_version: str, + architecture: str, + benchmark: str, + is_cached: bool, + builder_image: str, + ) -> Tuple[bool, str, float]: + """Delegate to containers.package_code; match benchmark.build() contract. + + Returns (rebuilt, image_tag, size_mb) so that: + _, self._container_uri, self._code_size = container_client.build_base_image(...) 
+ works correctly in benchmark.build(). + """ + dir_result, size_bytes, image_tag = self._containers.package_code( + directory, + language.value, # Language enum → str + language_version, + architecture, + benchmark, + ) + self.last_directory = dir_result + size_mb = size_bytes / 1024.0 / 1024.0 + return (True, image_tag, size_mb) + + def push_to_registry( + self, + benchmark: str, + language_name: str, + language_version: str, + architecture: str, + ) -> str: + """Return the local Docker image tag (Cloudflare containers use wrangler, not a registry).""" + image_name = ( + f"{benchmark.replace('.', '-')}-{language_name}-" + f"{language_version.replace('.', '')}" + ) + return f"{image_name}:latest" class Cloudflare(System): @@ -76,6 +134,8 @@ def __init__( self._containers_deployment = CloudflareContainersDeployment( self.logging, sebs_config, docker_client, self.system_resources ) + # Adapter so benchmark.build() can call container_client.build_base_image() + self._container_adapter = _CloudflareContainerAdapter(self._containers_deployment) def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): """ @@ -128,6 +188,17 @@ def initialize_resources(self, select_prefix: Optional[str] = None): f"Continuing without R2 storage - only benchmarks that don't require storage will work." ) + @property + def container_client(self) -> _CloudflareContainerAdapter: + """Return the Cloudflare-specific container build adapter. + + Overrides System.container_client (which returns None) so that + benchmark.build() can drive container image builds via + _CloudflareContainerAdapter.build_base_image() without needing an + external container registry. 
+ """ + return self._container_adapter + def _verify_credentials(self): """Verify that the Cloudflare API credentials are valid.""" # Check if credentials are set @@ -180,45 +251,37 @@ def _get_deployment_handler(self, container_deployment: bool): def package_code( self, directory: str, - language_name: str, + language: Language, language_version: str, architecture: str, benchmark: str, is_cached: bool, - container_deployment: bool, - ) -> Tuple[str, int, str]: + ) -> Tuple[str, int]: """ - Package code for Cloudflare Workers deployment using Wrangler. + Package code for native Cloudflare Workers deployment using Wrangler. - Uses Wrangler CLI to bundle dependencies and prepare for deployment. - Delegates to either CloudflareWorkersDeployment or CloudflareContainersDeployment - based on the deployment type. + Called by benchmark.build() via the non-container path. Container + builds are driven by _CloudflareContainerAdapter.build_base_image() + through the container_client property instead. Args: directory: Path to the code directory - language_name: Programming language name + language: Programming language enum language_version: Programming language version architecture: Target architecture (not used for Workers) benchmark: Benchmark name is_cached: Whether the code is cached - container_deployment: Whether to deploy as container Returns: - Tuple of (package_path, package_size, container_uri) + Tuple of (package_path, package_size) """ - handler = self._get_deployment_handler(container_deployment) - - # Container deployment flow - build Docker image - if container_deployment: - self.logging.info(f"Building container image for {benchmark}") - return handler.package_code( - directory, language_name, language_version, architecture, benchmark - ) - - # Native worker deployment flow - return handler.package_code( - directory, language_name, language_version, benchmark, is_cached + # Native worker deployment flow — always the cloudflare variant. 
+ # workers.py returns a 3-tuple (path, size, ""); drop the unused 3rd element. + pkg_path, pkg_size, _ = self._workers_deployment.package_code( + directory, language.value, language_version, benchmark, is_cached, + language_variant="cloudflare", ) + return (pkg_path, pkg_size) def _get_auth_headers(self) -> Dict[str, str]: """Get authentication headers for Cloudflare API requests.""" @@ -263,10 +326,11 @@ def _generate_wrangler_toml( Returns: Path to the generated wrangler.toml file """ + language_variant = code_package.language_variant if code_package else "cloudflare" handler = self._get_deployment_handler(container_deployment) return handler.generate_wrangler_toml( worker_name, package_dir, language, account_id, - benchmark_name, code_package, container_uri + benchmark_name, code_package, container_uri, language_variant, ) def create_function( @@ -290,7 +354,17 @@ def create_function( Returns: CloudflareWorker instance """ + # For container builds benchmark.build() goes through container_client.build_base_image(), + # which does NOT set code_package._code_location. Fall back to the directory that + # _CloudflareContainerAdapter stored during its last build_base_image() call. package = code_package.code_location + if package is None and container_deployment: + package = self._container_adapter.last_directory + if package is None: + raise RuntimeError( + f"Code location is not set for {code_package.benchmark}. " + "The build step may not have completed successfully." 
+ ) benchmark = code_package.benchmark language = code_package.language_name language_runtime = code_package.language_version @@ -408,13 +482,15 @@ def _create_or_update_worker( # Deploy using Wrangler in container self.logging.info(f"Deploying worker {worker_name} using Wrangler in container...") + language_variant = code_package.language_variant if code_package else "cloudflare" + try: - # For container deployments, always use wrangler (not pywrangler) - # For native deployments, use wrangler for nodejs, pywrangler for python - if container_deployment or language == "nodejs": - output = cli.wrangler_deploy(container_package_path, env=env) - else: # python native + # pywrangler is used exclusively for the Pyodide (python cloudflare) variant. + # All other cases — nodejs, containers, or non-cloudflare python — use wrangler. + if not container_deployment and language == "python" and language_variant == "cloudflare": output = cli.pywrangler_deploy(container_package_path, env=env) + else: + output = cli.wrangler_deploy(container_package_path, env=env) self.logging.info(f"Worker {worker_name} deployed successfully") self.logging.debug(f"Wrangler deploy output: {output}") @@ -537,6 +613,8 @@ def update_function( """ worker = cast(CloudflareWorker, function) package = code_package.code_location + if package is None and container_deployment: + package = self._container_adapter.last_directory language = code_package.language_name benchmark = code_package.benchmark @@ -598,10 +676,13 @@ def default_function_name(self, code_package: Benchmark, resources=None) -> str: Default function name """ # Cloudflare Worker names must be lowercase and can contain hyphens - return ( + name = ( f"{code_package.benchmark}-{code_package.language_name}-" f"{code_package.language_version.replace('.', '')}" ).lower() + if code_package.language_variant != "default": + name = f"{name}-{code_package.language_variant}" + return name @staticmethod def format_function_name(name: str, 
container_deployment: bool = False) -> str: diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 5b1fd9fd4..5aa84936f 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -67,6 +67,7 @@ def generate_wrangler_toml( benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_uri: str = "", + language_variant: str = "default", ) -> str: """ Generate a wrangler.toml configuration file for container workers. @@ -104,22 +105,20 @@ def generate_wrangler_toml( self.logging.warning("Using standard-4 instance type for high resource benchmark") config['containers'][0]['instance_type'] = "standard-4" - # Add nosql table bindings if benchmark uses them + # Add nosql KV namespace bindings if benchmark uses them if code_package and code_package.uses_nosql: # Get registered nosql tables for this benchmark nosql_storage = self.system_resources.get_nosql_storage() - if nosql_storage.retrieve_cache(benchmark_name): - nosql_tables = nosql_storage._tables.get(benchmark_name, {}) - - # Add durable object bindings for each nosql table - for table_name in nosql_tables.keys(): - config['durable_objects']['bindings'].append({ - 'name': table_name, - 'class_name': 'KVApiObject' - }) - - # Update migrations to include KVApiObject - config['migrations'][0]['new_sqlite_classes'].append('KVApiObject') + benchmark_for_nosql = benchmark_name or code_package.benchmark + if nosql_storage.retrieve_cache(benchmark_for_nosql): + nosql_tables = nosql_storage.get_tables(benchmark_for_nosql) + if nosql_tables: + config['kv_namespaces'] = config.get('kv_namespaces', []) + for table_name, namespace_id in nosql_tables.items(): + config['kv_namespaces'].append({ + 'binding': table_name, + 'id': namespace_id, + }) # Add environment variables if benchmark_name or (code_package and code_package.uses_nosql): @@ -127,7 +126,7 @@ def generate_wrangler_toml( if benchmark_name: config['vars']['BENCHMARK_NAME'] = benchmark_name if 
code_package and code_package.uses_nosql: - config['vars']['NOSQL_STORAGE_DATABASE'] = "durable_objects" + config['vars']['NOSQL_STORAGE_DATABASE'] = "kvstore" # Add R2 bucket binding try: diff --git a/sebs/cloudflare/durable_objects.py b/sebs/cloudflare/durable_objects.py deleted file mode 100644 index 258886cf3..000000000 --- a/sebs/cloudflare/durable_objects.py +++ /dev/null @@ -1,229 +0,0 @@ -import json -from collections import defaultdict -from typing import Dict, Optional, Tuple - -from sebs.cloudflare.config import CloudflareCredentials -from sebs.faas.nosql import NoSQLStorage -from sebs.faas.config import Resources -from sebs.cache import Cache - - -class DurableObjects(NoSQLStorage): - """ - Cloudflare Durable Objects implementation for NoSQL storage. - - Note: Durable Objects are not managed via API like DynamoDB or CosmosDB. - Instead, they are defined in the Worker code and wrangler.toml, and accessed - via bindings in the Worker environment. This implementation provides a minimal - interface to satisfy SeBS requirements by tracking table names without actual - API-based table creation. 
- """ - - @staticmethod - def typename() -> str: - return "Cloudflare.DurableObjects" - - @staticmethod - def deployment_name() -> str: - return "cloudflare" - - def __init__( - self, - region: str, - cache_client: Cache, - resources: Resources, - credentials: CloudflareCredentials, - ): - super().__init__(region, cache_client, resources) - self._credentials = credentials - # Tables are just logical names - Durable Objects are accessed via Worker bindings - self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) - - def _get_auth_headers(self) -> dict[str, str]: - """Get authentication headers for Cloudflare API requests.""" - if self._credentials.api_token: - return { - "Authorization": f"Bearer {self._credentials.api_token}", - "Content-Type": "application/json", - } - elif self._credentials.email and self._credentials.api_key: - return { - "X-Auth-Email": self._credentials.email, - "X-Auth-Key": self._credentials.api_key, - "Content-Type": "application/json", - } - else: - raise RuntimeError("Invalid Cloudflare credentials configuration") - - def get_tables(self, benchmark: str) -> Dict[str, str]: - """ - Get all tables for a benchmark. - - :param benchmark: benchmark name - :return: dictionary mapping table names to their IDs - """ - return self._tables[benchmark] - - def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: - """ - Get the full table name for a benchmark table. - - :param benchmark: benchmark name - :param table: table name - :return: full table name or None if not found - """ - if benchmark not in self._tables: - return None - - if table not in self._tables[benchmark]: - return None - - return self._tables[benchmark][table] - - def retrieve_cache(self, benchmark: str) -> bool: - """ - Retrieve cached table information. 
- - :param benchmark: benchmark name - :return: True if cache was found and loaded - """ - if benchmark in self._tables: - return True - - cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) - if cached_storage is not None: - self._tables[benchmark] = cached_storage["tables"] - self.logging.info(f"Retrieved cached Durable Objects tables for {benchmark}") - return True - - return False - - def update_cache(self, benchmark: str): - """ - Update cache with current table information. - - :param benchmark: benchmark name - """ - self.cache_client.update_nosql( - self.deployment_name(), - benchmark, - { - "tables": self._tables[benchmark], - }, - ) - self.logging.info(f"Updated cache for Durable Objects tables for {benchmark}") - - def create_table( - self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None - ) -> str: - """ - Register a table name for a benchmark. - - Note: Durable Objects don't have traditional table creation via API. - They are defined in the Worker code and wrangler.toml, and accessed via - bindings. This method just tracks the logical table name for the wrapper - to use when accessing the Durable Object binding. - - :param benchmark: benchmark name - :param name: table name - :param primary_key: primary key field name - :param secondary_key: optional secondary key field name - :return: table name (same as input name - used directly as binding name) - """ - # For Cloudflare, table names are used directly as the binding names - # in the wrapper code, so we just use the simple name - self._tables[benchmark][name] = name - - self.logging.info( - f"Registered Durable Object table '{name}' for benchmark {benchmark}" - ) - - return name - - def write_to_table( - self, - benchmark: str, - table: str, - data: dict, - primary_key: Tuple[str, str], - secondary_key: Optional[Tuple[str, str]] = None, - ): - """ - Write data to a table (Durable Object). 
- - Note: Cloudflare Durable Objects can only be written to from within the Worker, - not via external API calls. Data seeding for benchmarks is not supported. - Benchmarks that require pre-populated data (like test/small sizes of crud-api) - will return empty results. Use 'large' size which creates its own data. - - :param benchmark: benchmark name - :param table: table name - :param data: data to write - :param primary_key: primary key (field_name, value) - :param secondary_key: optional secondary key (field_name, value) - """ - table_name = self._get_table_name(benchmark, table) - - if not table_name: - raise ValueError(f"Table {table} not found for benchmark {benchmark}") - - # Silently skip data seeding for Cloudflare Durable Objects - # This is a platform limitation - pass - - def clear_table(self, name: str) -> str: - """ - Clear all data from a table. - - Note: Durable Object data is managed within the Worker. - - :param name: table name - :return: table name - """ - self.logging.warning(f"Durable Objects data is managed within the Worker") - return name - - def remove_table(self, name: str) -> str: - """ - Remove a table from tracking. 
- - :param name: table name - :return: table name - """ - # Remove from internal tracking - two-step approach to avoid mutation during iteration - benchmark_to_modify = None - table_key_to_delete = None - - # Step 1: Find the benchmark and table_key without deleting - for benchmark, tables in list(self._tables.items()): - if name in tables.values(): - # Find the table key - for table_key, table_name in list(tables.items()): - if table_name == name: - benchmark_to_modify = benchmark - table_key_to_delete = table_key - break - break - - # Step 2: Perform deletion after iteration - if benchmark_to_modify is not None and table_key_to_delete is not None: - del self._tables[benchmark_to_modify][table_key_to_delete] - - self.logging.info(f"Removed Durable Objects table {name} from tracking") - return name - - def envs(self) -> dict: - """ - Get environment variables for accessing Durable Objects. - - Durable Objects are accessed via bindings in the Worker environment, - which are configured in wrangler.toml. We set a marker environment - variable so the wrapper knows Durable Objects are available. - - :return: dictionary of environment variables - """ - # Set a marker that Durable Objects are enabled - # The actual bindings (DURABLE_STORE, etc.) are configured in wrangler.toml - return { - "NOSQL_STORAGE_DATABASE": "durable_objects" - } diff --git a/sebs/cloudflare/kvstore.py b/sebs/cloudflare/kvstore.py new file mode 100644 index 000000000..edc9ab6dc --- /dev/null +++ b/sebs/cloudflare/kvstore.py @@ -0,0 +1,359 @@ +import hashlib +import json +import re +from collections import defaultdict +from typing import Dict, List, Optional, Tuple +from urllib.parse import quote + +import requests + +from sebs.cache import Cache +from sebs.cloudflare.config import CloudflareCredentials +from sebs.faas.config import Resources +from sebs.faas.nosql import NoSQLStorage + + +class KVStore(NoSQLStorage): + """ + Cloudflare KV-backed NoSQL storage for SeBS. 
+ + This implementation maps every benchmark table to one KV namespace. + Data is stored as JSON values under composite keys: + # + """ + + NAMESPACE_ID_PATTERN = re.compile(r"^[a-fA-F0-9]{32}$") + + @staticmethod + def typename() -> str: + return "Cloudflare.KVStore" + + @staticmethod + def deployment_name() -> str: + return "cloudflare" + + def __init__( + self, + region: str, + cache_client: Cache, + resources: Resources, + credentials: CloudflareCredentials, + ): + super().__init__(region, cache_client, resources) + self._credentials = credentials + # benchmark -> logical table name -> KV namespace id + self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) + + def _account_id(self) -> str: + account_id = self._credentials.account_id + if not account_id: + raise RuntimeError("Cloudflare account ID is required for KV operations") + return account_id + + def _kv_api_base(self) -> str: + return f"https://api.cloudflare.com/client/v4/accounts/{self._account_id()}/storage/kv/namespaces" + + def _get_auth_headers(self, content_type: str = "application/json") -> dict[str, str]: + """Get authentication headers for Cloudflare API requests.""" + if self._credentials.api_token: + return { + "Authorization": f"Bearer {self._credentials.api_token}", + "Content-Type": content_type, + } + elif self._credentials.email and self._credentials.api_key: + return { + "X-Auth-Email": self._credentials.email, + "X-Auth-Key": self._credentials.api_key, + "Content-Type": content_type, + } + else: + raise RuntimeError("Invalid Cloudflare credentials configuration") + + @classmethod + def _is_namespace_id(cls, value: str) -> bool: + return bool(cls.NAMESPACE_ID_PATTERN.fullmatch(value)) + + def _resource_id(self) -> str: + if self._cloud_resources.has_resources_id: + return self._cloud_resources.resources_id + return "default" + + @staticmethod + def _sanitize_component(value: str) -> str: + sanitized = re.sub(r"[^A-Za-z0-9_-]", "-", value) + return sanitized.strip("-") or "default" 
+ + def _namespace_title(self, benchmark: str, table: str) -> str: + title = ( + f"sebs-nosql-{self._sanitize_component(self._resource_id())}-" + f"{self._sanitize_component(benchmark)}-{self._sanitize_component(table)}" + ) + # Cloudflare KV namespace title has length constraints. Keep a deterministic suffix if truncated. + max_len = 100 + if len(title) > max_len: + digest = hashlib.sha1(title.encode("utf-8")).hexdigest()[:12] + title = f"{title[: max_len - 13]}-{digest}" + return title + + def _list_namespaces(self) -> List[dict]: + namespaces: List[dict] = [] + page = 1 + per_page = 100 + + while True: + response = requests.get( + self._kv_api_base(), + params={"page": page, "per_page": per_page}, + headers=self._get_auth_headers(), + ) + response.raise_for_status() + payload = response.json() + + if not payload.get("success"): + raise RuntimeError(f"Failed to list KV namespaces: {payload.get('errors')}") + + page_items = payload.get("result", []) + namespaces.extend(page_items) + + page_info = payload.get("result_info", {}) or {} + total_pages = int(page_info.get("total_pages", 1)) + if page >= total_pages: + break + page += 1 + + return namespaces + + def _find_namespace_id_by_title(self, title: str) -> Optional[str]: + for namespace in self._list_namespaces(): + if namespace.get("title") == title: + return namespace.get("id") + return None + + def _delete_namespace(self, namespace_id: str) -> None: + response = requests.delete( + f"{self._kv_api_base()}/{namespace_id}", + headers=self._get_auth_headers(), + ) + if response.status_code == 404: + return + response.raise_for_status() + + if response.content: + payload = response.json() + if not payload.get("success"): + raise RuntimeError(f"Failed to delete KV namespace {namespace_id}: {payload.get('errors')}") + + @staticmethod + def _compose_key( + primary_key: Tuple[str, str], secondary_key: Optional[Tuple[str, str]] = None + ) -> str: + if secondary_key is None: + return str(primary_key[1]) + return 
f"{primary_key[1]}#{secondary_key[1]}" + + @staticmethod + def _index_key(primary_value: str) -> str: + return f"__sebs_idx__{primary_value}" + + def _read_index(self, namespace_id: str, primary_value: str) -> List[str]: + response = requests.get( + f"{self._kv_api_base()}/{namespace_id}/values/{quote(self._index_key(primary_value), safe='')}", + headers=self._get_auth_headers(), + ) + if response.status_code == 404: + return [] + response.raise_for_status() + + raw = response.text + if not raw: + return [] + + try: + parsed = json.loads(raw) + except Exception: + return [] + + if not isinstance(parsed, list): + return [] + + return [str(v) for v in parsed] + + def _write_index(self, namespace_id: str, primary_value: str, values: List[str]) -> None: + response = requests.put( + f"{self._kv_api_base()}/{namespace_id}/values/{quote(self._index_key(primary_value), safe='')}", + data=json.dumps(values, separators=(",", ":")).encode("utf-8"), + headers=self._get_auth_headers(content_type="text/plain;charset=UTF-8"), + ) + response.raise_for_status() + + def _get_tables(self) -> Dict[str, List[str]]: + tables = self.cache_client.get_nosql_configs(self.deployment_name()) + return {benchmark: list(v.values()) for benchmark, v in tables.items()} + + def get_tables(self, benchmark: str) -> Dict[str, str]: + return self._tables[benchmark] + + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + if benchmark not in self._tables: + return None + if table not in self._tables[benchmark]: + return None + return self._tables[benchmark][table] + + def retrieve_cache(self, benchmark: str) -> bool: + if benchmark in self._tables: + return True + + cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) + if cached_storage is None: + return False + + cached_tables = cached_storage.get("tables", {}) + if not isinstance(cached_tables, dict): + return False + + # Ignore legacy Durable Objects cache entries (table -> table name). 
+ if cached_tables and not all( + isinstance(v, str) and self._is_namespace_id(v) for v in cached_tables.values() + ): + self.logging.warning( + f"Ignoring legacy/non-KV cache for benchmark {benchmark}; creating KV namespaces." + ) + return False + + self._tables[benchmark] = cached_tables + self.logging.info(f"Retrieved cached KV namespace mappings for {benchmark}") + return True + + def update_cache(self, benchmark: str): + self.cache_client.update_nosql( + self.deployment_name(), + benchmark, + {"tables": self._tables[benchmark]}, + ) + self.logging.info(f"Updated cache for KV namespace mappings for {benchmark}") + + def create_table( + self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + ) -> str: + # Unused in KV namespace allocation, kept for interface compatibility + _ = primary_key, secondary_key + + existing = self._get_table_name(benchmark, name) + if existing: + return existing + + namespace_title = self._namespace_title(benchmark, name) + + existing_namespace_id = self._find_namespace_id_by_title(namespace_title) + if existing_namespace_id: + self._tables[benchmark][name] = existing_namespace_id + self.logging.info( + f"Reusing existing KV namespace '{namespace_title}' ({existing_namespace_id})" + ) + return existing_namespace_id + + response = requests.post( + self._kv_api_base(), + json={"title": namespace_title}, + headers=self._get_auth_headers(), + ) + + # A concurrent run may have created it after our lookup. 
+ if response.status_code >= 400: + existing_namespace_id = self._find_namespace_id_by_title(namespace_title) + if existing_namespace_id: + self._tables[benchmark][name] = existing_namespace_id + return existing_namespace_id + response.raise_for_status() + + payload = response.json() + if not payload.get("success"): + raise RuntimeError( + f"Failed to create KV namespace {namespace_title}: {payload.get('errors')}" + ) + + namespace_id = payload.get("result", {}).get("id") + if not namespace_id: + raise RuntimeError(f"Cloudflare KV API did not return namespace id for {namespace_title}") + + self._tables[benchmark][name] = namespace_id + self.logging.info( + f"Created KV namespace '{namespace_title}' ({namespace_id}) for benchmark {benchmark}" + ) + return namespace_id + + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + namespace_id = self._get_table_name(benchmark, table) + if not namespace_id: + raise ValueError(f"Table {table} not found for benchmark {benchmark}") + + record = dict(data) + for key in (primary_key, secondary_key): + if key is not None: + record[key[0]] = key[1] + + composite_key = self._compose_key(primary_key, secondary_key) + value = json.dumps(record, separators=(",", ":"), default=str) + + response = requests.put( + f"{self._kv_api_base()}/{namespace_id}/values/{quote(composite_key, safe='')}", + data=value.encode("utf-8"), + headers=self._get_auth_headers(content_type="text/plain;charset=UTF-8"), + ) + response.raise_for_status() + + if secondary_key is not None: + primary_value = str(primary_key[1]) + secondary_value = str(secondary_key[1]) + index_values = self._read_index(namespace_id, primary_value) + if secondary_value not in index_values: + index_values.append(secondary_value) + self._write_index(namespace_id, primary_value, index_values) + + def clear_table(self, name: str) -> str: + self.logging.warning( + "Cloudflare KV 
clear_table is not implemented. Use remove_table() + create_table() instead." + ) + return name + + def remove_table(self, name: str) -> str: + benchmark_to_modify: Optional[str] = None + logical_name_to_delete: Optional[str] = None + namespace_id_to_delete: Optional[str] = None + + for benchmark, tables in list(self._tables.items()): + for logical_name, namespace_id in list(tables.items()): + if name == logical_name or name == namespace_id: + benchmark_to_modify = benchmark + logical_name_to_delete = logical_name + namespace_id_to_delete = namespace_id + break + if namespace_id_to_delete: + break + + # Also allow direct removal by namespace id when not present in local mapping. + if namespace_id_to_delete is None and self._is_namespace_id(name): + namespace_id_to_delete = name + + if namespace_id_to_delete is None: + self.logging.warning(f"KV table '{name}' not found in local mapping.") + return name + + self._delete_namespace(namespace_id_to_delete) + + if benchmark_to_modify is not None and logical_name_to_delete is not None: + del self._tables[benchmark_to_modify][logical_name_to_delete] + + self.logging.info(f"Removed KV namespace {namespace_id_to_delete}") + return name + + def envs(self) -> dict: + return {"NOSQL_STORAGE_DATABASE": "kvstore"} diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index 1b3d9dbc7..1a76475bc 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -5,7 +5,7 @@ from sebs.cache import Cache from sebs.cloudflare.config import CloudflareConfig from sebs.cloudflare.r2 import R2 -from sebs.cloudflare.durable_objects import DurableObjects +from sebs.cloudflare.kvstore import KVStore from sebs.faas.resources import SystemResources from sebs.faas.storage import PersistentStorage from sebs.faas.nosql import NoSQLStorage @@ -79,15 +79,14 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStor def get_nosql_storage(self) -> NoSQLStorage: """ - Get Cloudflare Durable 
Objects storage instance. + Get Cloudflare KV storage instance. - Durable Objects provide stateful storage for Workers. - Note: This is a minimal implementation to satisfy SeBS requirements. + KV namespaces provide key-value storage for Workers. Returns: - DurableObjects storage instance + KVStore storage instance """ - return DurableObjects( + return KVStore( region=self._config.region, cache_client=self._cache_client, resources=self._config.resources, diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index f78f0aad1..8f5d90d17 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -62,6 +62,7 @@ def generate_wrangler_toml( benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_uri: str = "", + language_variant: str = "cloudflare", ) -> str: """ Generate a wrangler.toml configuration file for native workers. @@ -86,14 +87,21 @@ def generate_wrangler_toml( ) with open(template_path, 'rb') as f: config = tomllib.load(f) + + # Native workers no longer require Durable Object bindings for NoSQL. + config.pop('durable_objects', None) + config.pop('migrations', None) # Update basic configuration config['name'] = worker_name - config['main'] = "dist/handler.js" if language == "nodejs" else "handler.py" config['account_id'] = account_id - - # Add language-specific configuration + + # Add language- and variant-specific configuration. + # For Node.js workers, we always bundle through build.js into dist/, + # regardless of language variant (default/cloudflare), because the + # wrangler entrypoint points to dist/handler.js. 
if language == "nodejs": + config['main'] = "dist/handler.js" config['compatibility_flags'] = ["nodejs_compat"] config['no_bundle'] = True config['build'] = {'command': 'node build.js'} @@ -110,7 +118,24 @@ def generate_wrangler_toml( } ] elif language == "python": + config['main'] = "handler.py" config['compatibility_flags'] = ["python_workers"] + else: + config['main'] = "dist/handler.js" if language == "nodejs" else "handler.py" + + # Add NoSQL KV namespace bindings if benchmark uses them + if code_package and code_package.uses_nosql: + benchmark_for_nosql = benchmark_name or code_package.benchmark + nosql_storage = self.system_resources.get_nosql_storage() + if nosql_storage.retrieve_cache(benchmark_for_nosql): + nosql_tables = nosql_storage.get_tables(benchmark_for_nosql) + if nosql_tables: + config['kv_namespaces'] = [] + for table_name, namespace_id in nosql_tables.items(): + config['kv_namespaces'].append({ + 'binding': table_name, + 'id': namespace_id, + }) # Add environment variables if benchmark_name or (code_package and code_package.uses_nosql): @@ -118,7 +143,7 @@ def generate_wrangler_toml( if benchmark_name: config['vars']['BENCHMARK_NAME'] = benchmark_name if code_package and code_package.uses_nosql: - config['vars']['NOSQL_STORAGE_DATABASE'] = "durable_objects" + config['vars']['NOSQL_STORAGE_DATABASE'] = "kvstore" # Add R2 bucket binding try: @@ -158,6 +183,7 @@ def package_code( language_version: str, benchmark: str, is_cached: bool, + language_variant: str = "cloudflare", ) -> Tuple[str, int, str]: """ Package code for native Cloudflare Workers deployment. 
@@ -196,7 +222,7 @@ def package_code( # Install esbuild as a dev dependency (needed by build.js) self.logging.info("Installing esbuild for custom build script...") - cli.execute(f"cd {container_path} && npm install --save-dev esbuild") + cli.execute(f"cd {container_path} && npm install --force esbuild") self.logging.info("esbuild installed successfully") # Download node_modules back to host @@ -225,7 +251,7 @@ def package_code( try: cli.upload_package(directory, container_path) - cli.execute(f"cd {container_path} && npm install --save-dev esbuild") + cli.execute(f"cd {container_path} && npm install --force esbuild") # Download node_modules back to host bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") @@ -249,18 +275,11 @@ def package_code( shutil.move(src, dest) self.logging.info(f"move {src} to {dest}") - # move function_cloudflare.py into function.py - function_cloudflare_file = os.path.join(directory, "function_cloudflare.py") - if os.path.exists(function_cloudflare_file): - src = function_cloudflare_file - dest = os.path.join(directory, "function.py") - shutil.move(src, dest) - self.logging.info(f"move {src} to {dest}") - - if os.path.exists(requirements_file): - with open(requirements_file, 'r') as reqf: - reqtext = reqf.read() - supported_pkg = \ + if language_variant == "cloudflare": + if os.path.exists(requirements_file): + with open(requirements_file, 'r') as reqf: + reqtext = reqf.read() + supported_pkg = \ ['affine', 'aiohappyeyeballs', 'aiohttp', 'aiosignal', 'altair', 'annotated-types',\ 'anyio', 'apsw', 'argon2-cffi', 'argon2-cffi-bindings', 'asciitree', 'astropy', 'astropy_iers_data',\ 'asttokens', 'async-timeout', 'atomicwrites', 'attrs', 'audioop-lts', 'autograd', 'awkward-cpp', 'b2d',\ @@ -291,15 +310,15 @@ def package_code( 'tomli-w', 'toolz', 'tqdm', 'traitlets', 'traits', 'tree-sitter', 'tree-sitter-go', 'tree-sitter-java', 'tree-sitter-python',\ 'tskit', 'typing-extensions', 'tzdata', 'ujson', 'uncertainties', 
'unyt', 'urllib3', 'vega-datasets', 'vrplib', 'wcwidth',\ 'webencodings', 'wordcloud', 'wrapt', 'xarray', 'xgboost', 'xlrd', 'xxhash', 'xyzservices', 'yarl', 'yt', 'zengl', 'zfpy', 'zstandard'] - needed_pkg = [] - for pkg in supported_pkg: - if pkg.lower() in reqtext.lower(): - needed_pkg.append(pkg) - - project_file = os.path.join(directory, "pyproject.toml") - depstr = str(needed_pkg).replace("\'", "\"") - with open(project_file, 'w') as pf: - pf.write(f""" + needed_pkg = [] + for pkg in supported_pkg: + if pkg.lower() in reqtext.lower(): + needed_pkg.append(pkg) + + project_file = os.path.join(directory, "pyproject.toml") + depstr = str(needed_pkg).replace("\'", "\"") + with open(project_file, 'w') as pf: + pf.write(f""" [project] name = "{benchmark.replace(".", "-")}-python-{language_version.replace(".", "")}" version = "0.1.0" @@ -312,18 +331,18 @@ def package_code( "workers-py", "workers-runtime-sdk" ] - """) - # move into function dir - funcdir = os.path.join(directory, "function") - if not os.path.exists(funcdir): - os.makedirs(funcdir) - - dont_move = ["handler.py", "function", "python_modules", "pyproject.toml"] - for thing in os.listdir(directory): - if thing not in dont_move: - src = os.path.join(directory, thing) - dest = os.path.join(directory, "function", thing) - shutil.move(src, dest) + """) + # Pyodide Workers require all function files in a function/ subdir + funcdir = os.path.join(directory, "function") + if not os.path.exists(funcdir): + os.makedirs(funcdir) + + dont_move = ["handler.py", "function", "python_modules", "pyproject.toml"] + for thing in os.listdir(directory): + if thing not in dont_move: + src = os.path.join(directory, thing) + dest = os.path.join(directory, "function", thing) + shutil.move(src, dest) # Create package structure CONFIG_FILES = { diff --git a/sebs/faas/function.py b/sebs/faas/function.py index 651c1f65c..fb93bd39b 100644 --- a/sebs/faas/function.py +++ b/sebs/faas/function.py @@ -545,6 +545,7 @@ class Python(Enum): 
DEFAULT = "default" PYPY = "pypy" + CLOUDFLARE = "cloudflare" class NodeJS(Enum): """Node.js runtime variants.""" @@ -552,6 +553,7 @@ class NodeJS(Enum): DEFAULT = "default" BUN = "bun" LLRT = "llrt" + CLOUDFLARE = "cloudflare" @classmethod def for_language(cls, language: Language, val: str) -> Enum: diff --git a/sebs/utils.py b/sebs/utils.py index 538194ae8..6204d0d9b 100644 --- a/sebs/utils.py +++ b/sebs/utils.py @@ -528,7 +528,7 @@ def has_platform(name: str) -> bool: import google.cloud.devtools # noqa: F401 return True - elif name in ("local", "openwhisk"): + elif name in ("local", "openwhisk", "cloudflare"): # these don't have specific dependencies return True else: diff --git a/templates/wrangler-worker.toml b/templates/wrangler-worker.toml index b11821281..fd36127a9 100644 --- a/templates/wrangler-worker.toml +++ b/templates/wrangler-worker.toml @@ -5,12 +5,3 @@ name = "PLACEHOLDER_WORKER_NAME" main = "PLACEHOLDER_MAIN_FILE" compatibility_date = "2025-11-18" account_id = "PLACEHOLDER_ACCOUNT_ID" - -# Durable Object binding for NoSQL storage -[[durable_objects.bindings]] -name = "DURABLE_STORE" -class_name = "KVApiObject" - -[[migrations]] -tag = "v3" -new_classes = ["KVApiObject"] From fa7b2f58c7e17eeaf8617c91cd6526b4e3cd9a9d Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 26 Mar 2026 12:34:32 +0100 Subject: [PATCH 071/230] refactor(nosql_kv): simplify index check by removing redundant conditions --- benchmarks/wrappers/cloudflare/python/nosql.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/nosql.py b/benchmarks/wrappers/cloudflare/python/nosql.py index b5374cbd7..7a91a94f1 100644 --- a/benchmarks/wrappers/cloudflare/python/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/nosql.py @@ -152,7 +152,7 @@ def insert( idx_raw = run_sync(self.get_table(table_name).get(self.index_key(primary_key))) idx = [] - if idx_raw is not None and len(idx_raw) > 0: + if idx_raw: idx = json.loads(idx_raw) if 
secondary_key[1] not in idx: idx.append(secondary_key[1]) @@ -202,7 +202,7 @@ def query( ) -> List[dict]: idx_raw = run_sync(self.get_table(table_name).get(self.index_key(primary_key))) idx = [] - if idx_raw is not None and len(idx_raw) > 0: + if idx_raw: idx = json.loads(idx_raw) res = [] @@ -222,7 +222,7 @@ def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: T idx_raw = run_sync(self.get_table(table_name).get(self.index_key(primary_key))) idx = [] - if idx_raw is not None and len(idx_raw) > 0: + if idx_raw: idx = json.loads(idx_raw) if secondary_key[1] in idx: idx = [v for v in idx if v != secondary_key[1]] From 1311f20f7146f186250bf065ffa770c1ed698823 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 26 Mar 2026 15:07:24 +0100 Subject: [PATCH 072/230] feat(cloudflare): Enhance HTTP invocation with User-Agent header and error handling for Cloudflare http invocation protection --- sebs/cloudflare/triggers.py | 75 +++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index cecd0338f..16c980c6e 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -1,5 +1,8 @@ from typing import Optional import concurrent.futures +import json +from datetime import datetime +from io import BytesIO from sebs.faas.function import Trigger, ExecutionResult @@ -32,16 +35,72 @@ def url(self) -> str: def url(self, url: str): self._url = url - def sync_invoke(self, payload: dict) -> ExecutionResult: + def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> ExecutionResult: """ - Synchronously invoke a Cloudflare Worker via HTTP. - - Args: - payload: The payload to send to the worker - - Returns: - ExecutionResult with performance metrics extracted from the response + Invoke a Cloudflare Worker via HTTP POST. + + Overrides the base implementation to add a browser-like User-Agent header. 
+ Cloudflare's bot-protection returns error 1010 for requests that look like + automated tools (empty or libcurl User-Agent), so we must set one explicitly. + """ + import pycurl + + c = pycurl.Curl() + c.setopt(pycurl.HTTPHEADER, [ + "Content-Type: application/json", + # Cloudflare bot-protection (error 1010) blocks requests with no/tool UA. + "User-Agent: Mozilla/5.0 (compatible; SeBS/1.0; +https://github.com/spcl/serverless-benchmarks)", + ]) + c.setopt(pycurl.POST, 1) + c.setopt(pycurl.URL, url) + if not verify_ssl: + c.setopt(pycurl.SSL_VERIFYHOST, 0) + c.setopt(pycurl.SSL_VERIFYPEER, 0) + data = BytesIO() + c.setopt(pycurl.WRITEFUNCTION, data.write) + + c.setopt(pycurl.POSTFIELDS, json.dumps(payload)) + begin = datetime.now() + c.perform() + end = datetime.now() + status_code = c.getinfo(pycurl.RESPONSE_CODE) + conn_time = c.getinfo(pycurl.PRETRANSFER_TIME) + receive_time = c.getinfo(pycurl.STARTTRANSFER_TIME) + c.close() + + try: + output = json.loads(data.getvalue()) + if "body" in output: + if isinstance(output["body"], dict): + output = output["body"] + else: + output = json.loads(output["body"]) + + if status_code != 200: + self.logging.error(f"Invocation on URL {url} failed!") + self.logging.error(f"Output: {output}") + raise RuntimeError(f"Failed invocation of function! 
Output: {output}") + + self.logging.debug("Invoke of function was successful") + result = ExecutionResult.from_times(begin, end) + result.times.http_startup = conn_time + result.times.http_first_byte_return = receive_time + if "request_id" not in output: + raise RuntimeError(f"Cannot process allocation with output: {output}") + result.request_id = output["request_id"] + result.parse_benchmark_output(output) + return result + except json.decoder.JSONDecodeError: + self.logging.error(f"Invocation on URL {url} failed!") + raw = data.getvalue() + if raw: + self.logging.error(f"Output: {raw.decode()}") + else: + self.logging.error("No output provided!") + raise RuntimeError(f"Failed invocation of function! Output: {raw.decode()}") + + def sync_invoke(self, payload: dict) -> ExecutionResult: + """Synchronously invoke a Cloudflare Worker via HTTP.""" self.logging.debug(f"Invoke function {self.url}") result = self._http_invoke(payload, self.url) From 02cb35ad1333e983efc4c85c4101d0132d419bb6 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 26 Mar 2026 16:22:54 +0100 Subject: [PATCH 073/230] feat(cloudflare): Add support for benchmark validation based on language and deployment type --- sebs/cloudflare/cloudflare.py | 45 +++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 407ff0346..baeb92b29 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -87,6 +87,18 @@ class Cloudflare(System): low-latency serverless execution globally. """ + # Benchmarks supported per (language, container_deployment) combination. + # Keys are (language_name, container_deployment). + # A value of None means all benchmarks are supported. + # Benchmark IDs are matched against the numeric prefix of the benchmark name + # (e.g. "110" matches "110.dynamic-html"). 
+ SUPPORTED_BENCHMARKS: Dict[Tuple[str, bool], Optional[List[str]]] = { + ("python", False): ["110", "120", "130", "210", "311", "501", "502", "503"], + ("nodejs", False): ["110", "120", "130", "311"], + ("python", True): None, # all benchmarks supported + ("nodejs", True): ["110", "120", "130", "210", "311"], + } + _config: CloudflareConfig @staticmethod @@ -105,6 +117,39 @@ def function_type() -> "Type[Function]": def config(self) -> CloudflareConfig: return self._config + def is_benchmark_supported(self, benchmark_name: str, language: str, container_deployment: bool) -> bool: + """Return True if the benchmark is supported for the given language/deployment type. + + Args: + benchmark_name: Full benchmark name, e.g. "110.dynamic-html" + language: Language name, e.g. "python" or "nodejs" + container_deployment: Whether this is a container deployment + + Returns: + True if supported, False otherwise + """ + allowed = self.SUPPORTED_BENCHMARKS.get((language, container_deployment)) + if allowed is None: + # None means all benchmarks are supported + return True + # Match by numeric prefix (the part before the first dot) + prefix = benchmark_name.split(".")[0] + return prefix in allowed + + def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) -> Function: + """Override to validate benchmark support before building/deploying.""" + language = code_package.language_name + container_deployment = code_package.container_deployment + benchmark_name = code_package.benchmark + if not self.is_benchmark_supported(benchmark_name, language, container_deployment): + deployment_type = "container" if container_deployment else "worker" + raise RuntimeError( + f"Benchmark '{benchmark_name}' is not supported for " + f"{language} {deployment_type} deployments on Cloudflare. 
" + f"Supported benchmarks: {self.SUPPORTED_BENCHMARKS.get((language, container_deployment))}" + ) + return super().get_function(code_package, func_name) + def __init__( self, sebs_config: SeBSConfig, From 60aa6318a0f05aac356a52bedc78dfbf1f582dc9 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 26 Mar 2026 18:16:58 +0100 Subject: [PATCH 074/230] fix(cloudflare): Update deployment logic for Python workers and ensure package directory creation --- sebs/cloudflare/cloudflare.py | 7 ++++--- sebs/cloudflare/workers.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index baeb92b29..d8952e41a 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -530,9 +530,10 @@ def _create_or_update_worker( language_variant = code_package.language_variant if code_package else "cloudflare" try: - # pywrangler is used exclusively for the Pyodide (python cloudflare) variant. - # All other cases — nodejs, containers, or non-cloudflare python — use wrangler. - if not container_deployment and language == "python" and language_variant == "cloudflare": + # pywrangler is used for all native Python workers (packages must be + # synced via pyproject.toml before wrangler uploads the bundle). + # All other cases — nodejs, containers — use wrangler directly. 
+ if not container_deployment and language == "python": output = cli.pywrangler_deploy(container_package_path, env=env) else: output = cli.wrangler_deploy(container_package_path, env=env) diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 8f5d90d17..667d95aaa 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -164,6 +164,7 @@ def generate_wrangler_toml( # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") + os.makedirs(package_dir, exist_ok=True) try: # Try tomli_w (writes binary) with open(toml_path, 'wb') as f: @@ -275,7 +276,7 @@ def package_code( shutil.move(src, dest) self.logging.info(f"move {src} to {dest}") - if language_variant == "cloudflare": + if language_variant in ("cloudflare", "default"): if os.path.exists(requirements_file): with open(requirements_file, 'r') as reqf: reqtext = reqf.read() From cf9d333916cf92c114509488beb9444e8e6285ae Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 29 Mar 2026 18:13:28 +0200 Subject: [PATCH 075/230] refactor(cloudflare): abstract nodejs worker build into Dockerfile.worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the two-phase Node.js build pipeline (esbuild bundling + __require→import post-processing) out of the SeBS orchestration code and into a dedicated Dockerfile, mirroring how container deployments already work. 
Changes: - Extract __require-patching logic from build.js into a standalone postprocess.js - Add dockerfiles/cloudflare/nodejs/Dockerfile.worker that runs npm install, build.js, and postprocess.js as explicit RUN layers - Replace the CLI-container npm-install/download loop in workers.py with _build_worker_and_extract_dist(), which builds Dockerfile.worker via docker-py and extracts only dist/ back to the host — identical pattern to containers.py - Remove wrangler.toml build command (build is now pre-done before upload) - Remove explicit cli.execute() build calls from cloudflare.py - Register postprocess.js in configs/systems.json deployment files so it is copied into the package directory Both worker and container deployments now follow the same shape: package_code (Dockerfile) → upload directory → wrangler deploy The only difference is which Dockerfile is selected. --- .../wrappers/cloudflare/nodejs/build.js | 49 ------ .../wrappers/cloudflare/nodejs/postprocess.js | 86 ++++++++++ configs/systems.json | 1 + .../cloudflare/nodejs/Dockerfile.worker | 16 ++ sebs/cloudflare/workers.py | 151 ++++++++++-------- 5 files changed, 186 insertions(+), 117 deletions(-) create mode 100644 benchmarks/wrappers/cloudflare/nodejs/postprocess.js create mode 100644 dockerfiles/cloudflare/nodejs/Dockerfile.worker diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index 834ec5c16..fd6cc5e08 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -104,55 +104,6 @@ async function customBuild() { mainFields: ['module', 'main'], treeShaking: true, }); - - // POST-PROCESS: Replace dynamic requires with static imports - console.log('Post-processing to fix node: module imports...'); - - for (const jsFile of jsFiles) { - const outPath = join(outDir, relative(srcDir, jsFile)); - - if (fs.existsSync(outPath)) { - let content = fs.readFileSync(outPath, 'utf-8'); - - // Find all 
node: modules being dynamically required - const nodeModules = new Set(); - const requireRegex = /__require\d*\("(node:[^"]+)"\)/g; - let match; - while ((match = requireRegex.exec(content)) !== null) { - nodeModules.add(match[1]); - } - - if (nodeModules.size > 0) { - // Generate static imports at the top - let imports = ''; - const mapping = {}; - let i = 0; - for (const mod of nodeModules) { - const varName = `__node_${mod.replace('node:', '').replace(/[^a-z0-9]/gi, '_')}_${i++}`; - imports += `import * as ${varName} from '${mod}';\n`; - mapping[mod] = varName; - } - - // Add cache object - imports += '\nconst __node_cache = {\n'; - for (const [mod, varName] of Object.entries(mapping)) { - imports += ` '${mod}': ${varName},\n`; - } - imports += '};\n\n'; - - // Replace all __require calls with cache lookups - content = content.replace(/__require(\d*)\("(node:[^"]+)"\)/g, (match, num, mod) => { - return `__node_cache['${mod}']`; - }); - - // Prepend imports to the file - content = imports + content; - - fs.writeFileSync(outPath, content, 'utf-8'); - console.log(`✓ Fixed ${nodeModules.size} node: imports in ${relative(srcDir, jsFile)}`); - } - } - } } // Copy non-JS files (templates, etc.) diff --git a/benchmarks/wrappers/cloudflare/nodejs/postprocess.js b/benchmarks/wrappers/cloudflare/nodejs/postprocess.js new file mode 100644 index 000000000..2c06ce414 --- /dev/null +++ b/benchmarks/wrappers/cloudflare/nodejs/postprocess.js @@ -0,0 +1,86 @@ +/** + * Post-processing step: replace esbuild's dynamic __require("node:…") helper + * calls in the bundled dist/ output with static ESM imports. + * + * esbuild bundles dependencies that themselves call require() at runtime, + * turning them into __require("node:fs") style calls. Cloudflare Workers + * run in an ESM-only environment, so these must be hoisted to top-level + * import statements that wrangler / the runtime can resolve. 
+ * + * Only the top-level requires in *source* files can be handled by esbuild's + * own external/format options, which is why this step is needed separately. + * + * Usage: node postprocess.js (run from the worker package directory) + */ + +'use strict'; + +const fs = require('fs'); +const { join, relative } = require('path'); + +function getAllJsFiles(dir, fileList = []) { + if (!fs.existsSync(dir)) return fileList; + const files = fs.readdirSync(dir, { withFileTypes: true }); + for (const file of files) { + const filePath = join(dir, file.name); + if (file.isDirectory()) { + getAllJsFiles(filePath, fileList); + } else if (file.name.endsWith('.js')) { + fileList.push(filePath); + } + } + return fileList; +} + +const distDir = './dist'; +const jsFiles = getAllJsFiles(distDir); + +let totalFixed = 0; + +for (const filePath of jsFiles) { + let content = fs.readFileSync(filePath, 'utf-8'); + + // Collect all unique node: modules required via esbuild's __require helper. + const nodeModules = new Set(); + const requireRegex = /__require\d*\("(node:[^"]+)"\)/g; + let match; + while ((match = requireRegex.exec(content)) !== null) { + nodeModules.add(match[1]); + } + + if (nodeModules.size === 0) continue; + + // Build static import declarations and a lookup cache object. + let imports = ''; + const mapping = {}; + let i = 0; + for (const mod of nodeModules) { + const varName = `__node_${mod.replace('node:', '').replace(/[^a-z0-9]/gi, '_')}_${i++}`; + imports += `import * as ${varName} from '${mod}';\n`; + mapping[mod] = varName; + } + + imports += '\nconst __node_cache = {\n'; + for (const [mod, varName] of Object.entries(mapping)) { + imports += ` '${mod}': ${varName},\n`; + } + imports += '};\n\n'; + + // Replace every __require("node:…") call with a cache lookup. + content = content.replace(/__require(\d*)\("(node:[^"]+)"\)/g, (_match, _num, mod) => { + return `__node_cache['${mod}']`; + }); + + // Prepend the import block. 
+ content = imports + content; + + fs.writeFileSync(filePath, content, 'utf-8'); + console.log(`✓ Fixed ${nodeModules.size} node: import(s) in ${relative(distDir, filePath)}`); + totalFixed++; +} + +if (totalFixed === 0) { + console.log('No __require node: calls found — nothing to patch.'); +} else { + console.log(`✓ Post-processing complete (${totalFixed} file(s) patched).`); +} diff --git a/configs/systems.json b/configs/systems.json index b5bda5336..56196ba6d 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -489,6 +489,7 @@ "storage.js", "nosql.js", "build.js", + "postprocess.js", "request-polyfill.js" ], "packages": { diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile.worker b/dockerfiles/cloudflare/nodejs/Dockerfile.worker new file mode 100644 index 000000000..6fb2b7149 --- /dev/null +++ b/dockerfiles/cloudflare/nodejs/Dockerfile.worker @@ -0,0 +1,16 @@ +FROM node:20-slim + +WORKDIR /worker + +# Copy source files (node_modules and dist are excluded via .dockerignore) +COPY . . + +# Install production dependencies + esbuild (needed by build.js) +RUN npm install --production && npm install --force esbuild + +# Bundle all source files with esbuild into dist/ +RUN node build.js + +# Patch esbuild's dynamic __require("node:…") calls to static ESM imports +# so that the Cloudflare Workers ESM runtime can resolve built-in modules. 
+RUN node postprocess.js diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 667d95aaa..83d12965d 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -104,7 +104,6 @@ def generate_wrangler_toml( config['main'] = "dist/handler.js" config['compatibility_flags'] = ["nodejs_compat"] config['no_bundle'] = True - config['build'] = {'command': 'node build.js'} config['rules'] = [ { 'type': 'ESModule', @@ -199,74 +198,13 @@ def package_code( Returns: Tuple of (package_path, package_size, container_uri) """ - # Install dependencies + # Install dependencies and bundle if language_name == "nodejs": - package_file = os.path.join(directory, "package.json") - node_modules = os.path.join(directory, "node_modules") - - # Only install if package.json exists and node_modules doesn't - if os.path.exists(package_file) and not os.path.exists(node_modules): - self.logging.info(f"Installing Node.js dependencies in {directory}") - # Use CLI container for npm install - no Node.js/npm needed on host - cli = self._get_cli() - container_path = f"/tmp/npm_install/{os.path.basename(directory)}" - - try: - # Upload package directory to container - cli.upload_package(directory, container_path) - - # Install production dependencies - self.logging.info("Installing npm dependencies in container...") - output = cli.npm_install(container_path) - self.logging.info("npm install completed successfully") - self.logging.debug(f"npm output: {output}") - - # Install esbuild as a dev dependency (needed by build.js) - self.logging.info("Installing esbuild for custom build script...") - cli.execute(f"cd {container_path} && npm install --force esbuild") - self.logging.info("esbuild installed successfully") - - # Download node_modules back to host - bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") - file_obj = io.BytesIO() - for chunk in bits: - file_obj.write(chunk) - file_obj.seek(0) - with tarfile.open(fileobj=file_obj) as tar: - 
tar.extractall(directory) - - self.logging.info(f"Downloaded node_modules to {directory}") - - except Exception as e: - self.logging.error(f"npm install in container failed: {e}") - raise RuntimeError(f"Failed to install Node.js dependencies: {e}") - elif os.path.exists(node_modules): - self.logging.info(f"Node.js dependencies already installed in {directory}") - - # Ensure esbuild is available even for cached installations - esbuild_path = os.path.join(node_modules, "esbuild") - if not os.path.exists(esbuild_path): - self.logging.info("Installing esbuild for custom build script...") - cli = self._get_cli() - container_path = f"/tmp/npm_install/{os.path.basename(directory)}" - - try: - cli.upload_package(directory, container_path) - cli.execute(f"cd {container_path} && npm install --force esbuild") - - # Download node_modules back to host - bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") - file_obj = io.BytesIO() - for chunk in bits: - file_obj.write(chunk) - file_obj.seek(0) - with tarfile.open(fileobj=file_obj) as tar: - tar.extractall(directory) - - self.logging.info("esbuild installed successfully") - except Exception as e: - self.logging.error(f"Failed to install esbuild: {e}") - raise RuntimeError(f"Failed to install esbuild: {e}") + # Build via Dockerfile.worker (npm install + esbuild + __require patching), + # then extract the produced dist/ back into the package directory. + # This mirrors how container deployments use their Dockerfile — the only + # difference is which Dockerfile is selected. + self._build_worker_and_extract_dist(directory, is_cached) elif language_name == "python": requirements_file = os.path.join(directory, "requirements.txt") @@ -383,6 +321,83 @@ def package_code( return (directory, total_size, "") + def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> None: + """Build the Node.js worker bundle via Dockerfile.worker and extract dist/. 
+ + Runs npm install, esbuild (build.js), and the __require→import post- + processing step (postprocess.js) inside a throwaway Docker image built + from Dockerfile.worker. Only the resulting dist/ directory is extracted + back to *directory*; intermediate artifacts (node_modules, build image) + stay inside Docker. + + If *is_cached* is True and dist/ already exists the build is skipped. + """ + import docker as docker_module + + dist_dir = os.path.join(directory, "dist") + if is_cached and os.path.exists(dist_dir): + self.logging.info("Cached dist/ found — skipping worker bundle build.") + return + + dockerfile_src = os.path.join( + os.path.dirname(__file__), "..", "..", + "dockerfiles", "cloudflare", "nodejs", "Dockerfile.worker" + ) + dockerfile_dest = os.path.join(directory, "Dockerfile.worker") + dockerignore_dest = os.path.join(directory, ".dockerignore") + + # Keep the build context lean: exclude generated / heavy artifacts. + dockerignore_content = "node_modules\ndist\nDockerfile.worker\n.dockerignore\n" + shutil.copy2(dockerfile_src, dockerfile_dest) + with open(dockerignore_dest, "w") as f: + f.write(dockerignore_content) + + # Use base directory name + pid for a unique, collision-free tag. + image_tag = f"sebs-worker-build-{os.path.basename(directory)}-{os.getpid()}:latest" + + try: + self.logging.info(f"Building worker bundle via Dockerfile.worker in {directory}") + _, build_logs = self.docker_client.images.build( + path=directory, + dockerfile="Dockerfile.worker", + tag=image_tag, + rm=True, + ) + for log in build_logs: + if "stream" in log: + self.logging.debug(log["stream"].strip()) + elif "error" in log: + raise RuntimeError(f"Docker build error: {log['error']}") + + # Extract dist/ from the built image. 
+ self.logging.info("Extracting built dist/ from worker build image...") + container = self.docker_client.containers.create(image_tag) + try: + bits, _ = container.get_archive("/worker/dist") + file_obj = io.BytesIO() + for chunk in bits: + file_obj.write(chunk) + file_obj.seek(0) + if os.path.exists(dist_dir): + shutil.rmtree(dist_dir) + with tarfile.open(fileobj=file_obj) as tar: + tar.extractall(directory) + self.logging.info(f"dist/ extracted to {directory}") + finally: + container.remove() + + except docker_module.errors.BuildError as e: + raise RuntimeError(f"Worker bundle build failed: {e}") + finally: + # Remove the temporary files we injected into the build context. + for tmp in (dockerfile_dest, dockerignore_dest): + if os.path.exists(tmp): + os.remove(tmp) + try: + self.docker_client.images.remove(image_tag, force=True) + except Exception: + pass + def shutdown(self): """Shutdown CLI container if initialized.""" if self._cli is not None: From cf5a547c9bf803142facb947d8d89a905bdb6f1a Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 13 Apr 2026 15:54:29 +0200 Subject: [PATCH 076/230] fix(cloudflare): Implement health check pings to keep container warm during provisioning --- sebs/cloudflare/cloudflare.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index d8952e41a..7b6dd5eb7 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -550,10 +550,28 @@ def _create_or_update_worker( worker_name, worker_url ) - # The container binding needs time to propagate before first invocation + # Keep the container warm for a minimum provisioning window. + # A flat sleep lets the Durable Object hibernate, which causes the + # container runtime to reject the next start() call. Instead we + # ping /health every few seconds so the DO stays alive. 
if container_deployment: - self.logging.info("Waiting 60 seconds for container to be fully provisioned (can sometimes take a bit longer)...") - time.sleep(60) + warm_seconds = 60 + ping_interval = 5 + account_id = env.get('CLOUDFLARE_ACCOUNT_ID') + worker_url = self._build_workers_dev_url(worker_name, account_id) + health_url = f"{worker_url}/health" + self.logging.info( + f"Keeping container warm for {warm_seconds}s " + f"(pinging {health_url} every {ping_interval}s)..." + ) + deadline = time.time() + warm_seconds + while time.time() < deadline: + try: + requests.get(health_url, timeout=10) + except Exception: + pass + remaining = deadline - time.time() + time.sleep(min(ping_interval, max(0, remaining))) return {"success": True, "output": output} From 91bb9a166f878340095b097c18f6f70ded728d89 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 13 Apr 2026 18:05:54 +0200 Subject: [PATCH 077/230] feat(cloudflare): Enhance language configuration with variant support for Cloudflare deployments --- .../100.webapps/110.dynamic-html/config.json | 16 +- .../100.webapps/120.uploader/config.json | 16 +- .../100.webapps/130.crud-api/config.json | 16 +- .../210.thumbnailer/config.json | 18 +- .../220.video-processing/config.json | 10 +- .../300.utilities/311.compression/config.json | 17 +- .../411.image-recognition/config.json | 11 +- .../501.graph-pagerank/config.json | 11 +- .../500.scientific/502.graph-mst/config.json | 10 +- .../500.scientific/503.graph-bfs/config.json | 11 +- .../504.dna-visualisation/config.json | 10 +- docs/build.md | 59 +++++- sebs/benchmark.py | 193 +++++++++-------- sebs/regression.py | 199 +++++++++++++++++- 14 files changed, 495 insertions(+), 102 deletions(-) diff --git a/benchmarks/100.webapps/110.dynamic-html/config.json b/benchmarks/100.webapps/110.dynamic-html/config.json index 2f8e4f6dc..dc5160394 100644 --- a/benchmarks/100.webapps/110.dynamic-html/config.json +++ b/benchmarks/100.webapps/110.dynamic-html/config.json @@ -2,8 +2,20 @@ "timeout": 
10, "memory": 128, "languages": [ - "python", - "nodejs", + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + { + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, "java" ], "modules": [] diff --git a/benchmarks/100.webapps/120.uploader/config.json b/benchmarks/100.webapps/120.uploader/config.json index b8bc9f0f9..0bece4e5b 100644 --- a/benchmarks/100.webapps/120.uploader/config.json +++ b/benchmarks/100.webapps/120.uploader/config.json @@ -2,8 +2,20 @@ "timeout": 30, "memory": 128, "languages": [ - { "language": "python", "variants": ["default", "cloudflare"] }, - { "language": "nodejs", "variants": ["default", "cloudflare"] } + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": {"workers": "cloudflare", "containers": "default"} + } + }, + { + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": {"workers": "cloudflare", "containers": "default"} + } + } ], "modules": ["storage"] } diff --git a/benchmarks/100.webapps/130.crud-api/config.json b/benchmarks/100.webapps/130.crud-api/config.json index 25c6cb05e..66532ad84 100644 --- a/benchmarks/100.webapps/130.crud-api/config.json +++ b/benchmarks/100.webapps/130.crud-api/config.json @@ -2,8 +2,20 @@ "timeout": 30, "memory": 128, "languages": [ - "python", - "nodejs" + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + { + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": "default" + } + } ], "modules": [ "nosql" diff --git a/benchmarks/200.multimedia/210.thumbnailer/config.json b/benchmarks/200.multimedia/210.thumbnailer/config.json index 7ba71f4dd..6bb025a9b 100644 --- a/benchmarks/200.multimedia/210.thumbnailer/config.json +++ b/benchmarks/200.multimedia/210.thumbnailer/config.json @@ -1,7 +1,23 @@ { "timeout": 60, "memory": 256, - "languages": ["python", "nodejs", "cpp"], + 
"languages": [ + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + { + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + "cpp" + ], "modules": ["storage"], "cpp_dependencies": ["sdk", "opencv", "libjpeg-turbo", "boost"] } diff --git a/benchmarks/200.multimedia/220.video-processing/config.json b/benchmarks/200.multimedia/220.video-processing/config.json index 94ede7925..d9596b9ac 100644 --- a/benchmarks/200.multimedia/220.video-processing/config.json +++ b/benchmarks/200.multimedia/220.video-processing/config.json @@ -1,6 +1,14 @@ { "timeout": 60, "memory": 512, - "languages": ["python"], + "languages": [ + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + } + ], "modules": ["storage"] } diff --git a/benchmarks/300.utilities/311.compression/config.json b/benchmarks/300.utilities/311.compression/config.json index 3f0f9238b..0b20dbd82 100644 --- a/benchmarks/300.utilities/311.compression/config.json +++ b/benchmarks/300.utilities/311.compression/config.json @@ -2,8 +2,21 @@ "timeout": 60, "memory": 256, "languages": [ - "python", - { "language": "nodejs", "variants": ["default", "cloudflare"] } + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + { + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": {"workers": "cloudflare", "containers": "default"} + } + } ], "modules": ["storage"] } + diff --git a/benchmarks/400.inference/411.image-recognition/config.json b/benchmarks/400.inference/411.image-recognition/config.json index a0c9c607e..a5c9cbb95 100644 --- a/benchmarks/400.inference/411.image-recognition/config.json +++ b/benchmarks/400.inference/411.image-recognition/config.json @@ -1,7 +1,16 @@ { "timeout": 60, "memory": 512, - "languages": ["python", "cpp"], + "languages": [ + { + "language": "python", + "variants": { + "default": "default", 
+ "cloudflare": "default" + } + }, + "cpp" + ], "modules": ["storage"], "cpp_dependencies": ["sdk", "torch", "opencv"] } diff --git a/benchmarks/500.scientific/501.graph-pagerank/config.json b/benchmarks/500.scientific/501.graph-pagerank/config.json index 90e8c7f86..2fbbec8c8 100644 --- a/benchmarks/500.scientific/501.graph-pagerank/config.json +++ b/benchmarks/500.scientific/501.graph-pagerank/config.json @@ -1,7 +1,16 @@ { "timeout": 120, "memory": 512, - "languages": ["python", "cpp"], + "languages": [ + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + "cpp" + ], "modules": [], "cpp_dependencies": ["igraph"] } diff --git a/benchmarks/500.scientific/502.graph-mst/config.json b/benchmarks/500.scientific/502.graph-mst/config.json index e80fb4351..9749feb38 100644 --- a/benchmarks/500.scientific/502.graph-mst/config.json +++ b/benchmarks/500.scientific/502.graph-mst/config.json @@ -1,6 +1,14 @@ { "timeout": 120, "memory": 512, - "languages": ["python"], + "languages": [ + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + } + ], "modules": [] } diff --git a/benchmarks/500.scientific/503.graph-bfs/config.json b/benchmarks/500.scientific/503.graph-bfs/config.json index 90e8c7f86..2fbbec8c8 100644 --- a/benchmarks/500.scientific/503.graph-bfs/config.json +++ b/benchmarks/500.scientific/503.graph-bfs/config.json @@ -1,7 +1,16 @@ { "timeout": 120, "memory": 512, - "languages": ["python", "cpp"], + "languages": [ + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + }, + "cpp" + ], "modules": [], "cpp_dependencies": ["igraph"] } diff --git a/benchmarks/500.scientific/504.dna-visualisation/config.json b/benchmarks/500.scientific/504.dna-visualisation/config.json index ff297ac5b..158e2ff59 100644 --- a/benchmarks/500.scientific/504.dna-visualisation/config.json +++ b/benchmarks/500.scientific/504.dna-visualisation/config.json @@ 
-1,6 +1,14 @@ { "timeout": 60, "memory": 2048, - "languages": ["python"], + "languages": [ + { + "language": "python", + "variants": { + "default": "default", + "cloudflare": "default" + } + } + ], "modules": ["storage"] } diff --git a/docs/build.md b/docs/build.md index c6c7f4bab..671a45904 100644 --- a/docs/build.md +++ b/docs/build.md @@ -92,23 +92,70 @@ additive on top of it. ### 1. Declaring variants in a benchmark (`config.json`) A benchmark opts into variant support by using the extended language object syntax in its -`config.json`. The legacy string form (`"python"`) implies only the `"default"` variant. +`config.json`. The legacy string form (`"python"`) implies only the `"default"` variant +and should be kept for languages that have no variant-specific code or configuration. ```json { "timeout": 10, "memory": 128, "languages": [ - { "language": "nodejs", "variants": ["default", "bun", "llrt"] }, - { "language": "python", "variants": ["default", "pypy"] } + "java", + { + "language": "nodejs", + "variants": { + "default": "default", + "bun": "bun", + "llrt": "llrt" + } + }, + { + "language": "python", + "variants": { + "default": "default", + "pypy": "pypy" + } + } ], "modules": [] } ``` +The `variants` field is a **dict** mapping each variant name to the source overlay directory +to apply for that variant (see [section 2](#2-variant-source-code-inside-a-benchmark) below). +The special sentinel value `"default"` means *use the base language directory without any +overlay* — no files are copied from a sub-directory. + SeBS validates this at startup: if you request a variant that is not listed here, the run is rejected with an error. +#### Deployment-mode-split variants + +Some variants behave differently depending on whether the function is deployed as a **code +package** (workers) or as a **container image**. 
For those cases the overlay directory can be +specified per deployment mode using a nested dict: + +```json +{ + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": {"workers": "cloudflare", "containers": "default"} + } +} +``` + +The inner dict must use the keys `"workers"` and/or `"containers"`. A missing key means the +variant is not supported in that deployment mode and SeBS will raise an error if it is +requested. A value of `"default"` means no overlay is applied for that mode (the base +language files are used unchanged). + +This is useful when a variant requires platform-specific source changes for one deployment +mode but can reuse the standard implementation for the other. For example, the `cloudflare` +variant of benchmarks that target Cloudflare Workers uses a Pyodide-aware implementation for +the `workers` mode, but falls back to the standard CPython implementation (`"default"`) for +the `containers` mode. + --- ### 2. Variant source code inside a benchmark @@ -120,7 +167,11 @@ language directory of the benchmark: benchmarks//// ``` -Two strategies are supported: +The overlay directory name comes from the value in the `variants` dict (or the inner +`workers`/`containers` value for deployment-mode-split variants). When that value is +`"default"`, no sub-directory is consulted and the base language files are used as-is. + +Two strategies are supported for non-`"default"` overlay directories: #### Patch variant (small targeted changes) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 3a94ee074..e2ae1f798 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -39,65 +39,90 @@ class LanguageSpec: """ Represents a language with its supported variants for a benchmark. - Parses the config language settings, supports both the legacy format - (e.g. "python") and the new dict format: + Parses the config.json ``languages`` entries. 
Supports three formats: - {"language": "nodejs", "variants": ["default", "bun", "llrt"]} + * Legacy string: ``"python"`` + → treated as ``{"default": "default"}`` - The legacy format is treated as having just the "default" variant. - """ + * New dict with simple (non-deployment-split) variants:: - def __init__(self, language: "Language", variants: List[str]): - """Initialize a language specification. + {"language": "nodejs", "variants": {"default": "default", "bun": "bun"}} - Args: - language: The programming language - variants: List of supported runtime variants for this language - """ + The value for each key is either the literal sentinel ``"default"`` + (meaning: use the base language directory, no overlay) or a subdirectory + name to use as an overlay (e.g. ``"cloudflare"`` → ``nodejs/cloudflare/``). + + * New dict with deployment-mode-split variants:: + + { + "language": "nodejs", + "variants": { + "default": "default", + "cloudflare": {"workers": "cloudflare", "containers": "default"} + } + } + + When the value is itself a dict, the keys are deployment modes + (``"workers"`` / ``"containers"``) and the values follow the same + sentinel / subdirectory convention. A missing mode key means the + benchmark is not supported in that deployment mode. + """ + + def __init__(self, language: "Language", variants: Dict[str, Any]): self._language = language self._variants = variants @property def language(self) -> "Language": - """Get the programming language. - - Returns: - Language: The programming language - """ return self._language @property - def variants(self) -> List[str]: - """Get the list of supported runtime variants. - - Returns: - List[str]: List of variant names (e.g., ["default", "pypy"]) - """ + def variants(self) -> Dict[str, Any]: + """Variant map: variant name → directory name or deployment-mode dict.""" return self._variants - @staticmethod - def deserialize(val) -> LanguageSpec: - """Deserialize a language specification from config. 
+ def resolve_dir(self, variant: str, container_deployment: bool) -> str: + """Return the source subdirectory name for *variant* + deployment mode. - Args: - val: Either a string (legacy format) or dict with language and variants + Returns ``"default"`` (sentinel) when the base language directory should + be used without any overlay. Returns a subdirectory name (e.g. + ``"cloudflare"``) when an overlay should be applied from that subdir. - Returns: - LanguageSpec: Deserialized language specification + Raises ``RuntimeError`` when the variant or deployment mode is not + supported. """ + entry = self._variants.get(variant) + if entry is None: + raise RuntimeError( + f"Variant '{variant}' not declared for language {self._language.value}" + ) + if isinstance(entry, dict): + mode = "containers" if container_deployment else "workers" + dir_name = entry.get(mode) + if dir_name is None: + raise RuntimeError( + f"Variant '{variant}' does not support deployment mode '{mode}' " + f"for language {self._language.value}" + ) + return dir_name + return entry # str: "default" or a subdir name + + @staticmethod + def deserialize(val) -> "LanguageSpec": if isinstance(val, str): - return LanguageSpec(Language.deserialize(val), ["default"]) - return LanguageSpec( - Language.deserialize(val["language"]), - val.get("variants", ["default"]), - ) + # Legacy: "python" → only the default variant + return LanguageSpec(Language.deserialize(val), {"default": "default"}) + variants = val.get("variants") + if variants is None: + variants = {"default": "default"} + elif isinstance(variants, list): + # Old list format: ["default", "cloudflare"] + # Each name maps to itself ("default" stays as the sentinel). + variants = {v: v for v in variants} + # else: already the new dict format + return LanguageSpec(Language.deserialize(val["language"]), variants) def serialize(self) -> dict: - """Serialize the language specification to a dictionary. 
- - Returns: - dict: Dictionary with language and variants keys - """ return { "language": self._language.value, "variants": self._variants, @@ -217,13 +242,20 @@ def supported_variants(self, language: Language) -> List[str]: or [] if the language has no implementation in this benchmark.""" for spec in self._language_specs: if spec.language == language: - return spec.variants + return list(spec.variants.keys()) return [] def supports(self, language: Language, variant: str) -> bool: """Return True when language + variant combination is declared in config.json.""" return variant in self.supported_variants(language) + def get_language_spec(self, language: Language) -> "LanguageSpec": + """Return the LanguageSpec for *language*, raising if not found.""" + for spec in self._language_specs: + if spec.language == language: + return spec + raise RuntimeError(f"Language {language.value} not declared in benchmark config") + @staticmethod def deserialize(json_object: dict) -> BenchmarkConfig: """ @@ -811,52 +843,49 @@ def copy_code(self, output_dir: str) -> None: shutil.copy2(nodejs_package_json, os.path.join(output_dir, "package.json")) if self._language_variant != "default": - variant_dir = os.path.join(path, self._language_variant) - if not os.path.isdir(variant_dir): - raise RuntimeError( - "Variant directory not found for benchmark {} language {} " - "variant {}: {}".format( - self.benchmark, self.language_name, self._language_variant, variant_dir - ) - ) + lang_spec = self.benchmark_config.get_language_spec(self.language) + overlay_dir_name = lang_spec.resolve_dir( + self._language_variant, self._container_deployment + ) - patch_file = os.path.join(variant_dir, "patch.diff") - if os.path.exists(patch_file): - # Patch-based variant: a unified diff (patch.diff) is applied on top of the - # default implementation. Use this when the variant only needs small - # targeted changes to the base code (e.g. 
swapping async I/O for sync I/O - # in a runtime that lacks full async support). - # Apply unified diff on top of the already-copied base files - import patch_ng - - pset = patch_ng.fromfile(patch_file) - if not pset or not pset.apply(strip=1, root=output_dir): + if overlay_dir_name != "default": + variant_dir = os.path.join(path, overlay_dir_name) + if not os.path.isdir(variant_dir): raise RuntimeError( - "Failed to apply patch {} for variant {}".format( - patch_file, self._language_variant + "Variant directory not found for benchmark {} language {} " + "variant {}: {}".format( + self.benchmark, self.language_name, self._language_variant, variant_dir + ) + ) + + patch_file = os.path.join(variant_dir, "patch.diff") + if os.path.exists(patch_file): + import patch_ng + + pset = patch_ng.fromfile(patch_file) + if not pset or not pset.apply(strip=1, root=output_dir): + raise RuntimeError( + "Failed to apply patch {} for variant {}".format( + patch_file, self._language_variant + ) + ) + self.logging.info( + "Applied patch for variant {} ({})".format(self._language_variant, patch_file) + ) + else: + for file_type in FILES[self.language]: + for f in glob.glob(os.path.join(variant_dir, file_type)): + shutil.copy2(f, output_dir) + nodejs_variant_pkg = os.path.join( + variant_dir, f"package.json.{self.language_version}" + ) + if os.path.exists(nodejs_variant_pkg): + shutil.copy2(nodejs_variant_pkg, os.path.join(output_dir, "package.json")) + self.logging.info( + "Applied file overlay for variant {} (dir: {})".format( + self._language_variant, overlay_dir_name ) ) - self.logging.info( - "Applied patch for variant {} ({})".format(self._language_variant, patch_file) - ) - else: - # Overlay-based variant: the variant directory contains a complete - # replacement set of source files that fully override the default - # implementation. All files from the variant directory are copied - # on top of the already-placed base files. 
Use this when the variant - # is substantially different from the default (e.g. a full rewrite). - for file_type in FILES[self.language]: - for f in glob.glob(os.path.join(variant_dir, file_type)): - shutil.copy2(f, output_dir) - # version-specific package.json override for Node.js - nodejs_variant_pkg = os.path.join( - variant_dir, f"package.json.{self.language_version}" - ) - if os.path.exists(nodejs_variant_pkg): - shutil.copy2(nodejs_variant_pkg, os.path.join(output_dir, "package.json")) - self.logging.info( - "Applied file overlay for variant {}".format(self._language_variant) - ) def add_benchmark_data(self, output_dir: str) -> None: """Add benchmark-specific data and assets to output directory. diff --git a/sebs/regression.py b/sebs/regression.py index 53336a2a7..c0809fdd1 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -25,7 +25,7 @@ import testtools import threading from time import sleep -from typing import cast, Dict, Optional, Set, TYPE_CHECKING +from typing import cast, Dict, Optional, Set, Tuple, TYPE_CHECKING from sebs.faas.function import Trigger from sebs.utils import ColoredWrapper @@ -77,6 +77,35 @@ architectures_openwhisk = ["x64"] deployments_openwhisk = ["container"] +# Cloudflare-specific configurations +architectures_cloudflare = ["x64"] + +# Cloudflare workers benchmarks per language +benchmarks_cloudflare_python_workers = [ + "110.dynamic-html", + "120.uploader", + "130.crud-api", + "210.thumbnailer", + "311.compression", + "501.graph-pagerank", + "502.graph-mst", + "503.graph-bfs", +] +benchmarks_cloudflare_python_containers = benchmarks_python # all benchmarks supported +benchmarks_cloudflare_nodejs_workers = [ + "110.dynamic-html", + "120.uploader", + "130.crud-api", + "311.compression", +] +benchmarks_cloudflare_nodejs_containers = [ + "110.dynamic-html", + "120.uploader", + "130.crud-api", + "210.thumbnailer", + "311.compression", +] + # User-defined config passed during initialization, set in regression_suite() 
cloud_config: Optional[dict] = None @@ -1052,6 +1081,130 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): return deployment_client +class CloudflareTestSequencePythonWorkers( + unittest.TestCase, + metaclass=TestSequenceMeta, + benchmarks=benchmarks_cloudflare_python_workers, + architectures=architectures_cloudflare, + deployments=["workers"], + deployment_name="cloudflare", + triggers=[Trigger.TriggerType.HTTP], +): + """Test suite for Python benchmarks on Cloudflare Workers.""" + + def get_deployment(self, benchmark_name, architecture, deployment_type): + deployment_name = "cloudflare" + assert cloud_config, "Cloud configuration is required" + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = False + config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" + + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with CloudflareTestSequencePythonWorkers.lock: + deployment_client.initialize(resource_prefix="regr") + return deployment_client + + +class CloudflareTestSequencePythonContainers( + unittest.TestCase, + metaclass=TestSequenceMeta, + benchmarks=benchmarks_cloudflare_python_containers, + architectures=architectures_cloudflare, + deployments=["container"], + deployment_name="cloudflare", + triggers=[Trigger.TriggerType.HTTP], +): + """Test suite for Python benchmarks on Cloudflare Containers.""" + + def get_deployment(self, benchmark_name, architecture, deployment_type): + deployment_name = "cloudflare" + assert cloud_config, "Cloud configuration is required" + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = True + 
config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" + + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with CloudflareTestSequencePythonContainers.lock: + deployment_client.initialize(resource_prefix="regr") + return deployment_client + + +class CloudflareTestSequenceNodejsWorkers( + unittest.TestCase, + metaclass=TestSequenceMeta, + benchmarks=benchmarks_cloudflare_nodejs_workers, + architectures=architectures_cloudflare, + deployments=["workers"], + deployment_name="cloudflare", + triggers=[Trigger.TriggerType.HTTP], +): + """Test suite for Node.js benchmarks on Cloudflare Workers.""" + + def get_deployment(self, benchmark_name, architecture, deployment_type): + deployment_name = "cloudflare" + assert cloud_config, "Cloud configuration is required" + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = False + config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" + + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with CloudflareTestSequenceNodejsWorkers.lock: + deployment_client.initialize(resource_prefix="regr") + return deployment_client + + +class CloudflareTestSequenceNodejsContainers( + unittest.TestCase, + metaclass=TestSequenceMeta, + benchmarks=benchmarks_cloudflare_nodejs_containers, + architectures=architectures_cloudflare, + deployments=["container"], + deployment_name="cloudflare", + triggers=[Trigger.TriggerType.HTTP], +): + """Test suite for Node.js benchmarks on Cloudflare Containers.""" + + def get_deployment(self, benchmark_name, architecture, 
deployment_type): + deployment_name = "cloudflare" + assert cloud_config, "Cloud configuration is required" + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = True + config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" + + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with CloudflareTestSequenceNodejsContainers.lock: + deployment_client.initialize(resource_prefix="regr") + return deployment_client + + # Stream result handler for concurrent test execution # Based on https://stackoverflow.com/questions/22484805/ # a-simple-working-example-for-testtools-concurrentstreamtestsuite @@ -1163,6 +1316,22 @@ def filter_out_benchmarks( if (deployment_name == "gcp" and language == "python" and language_version in ["3.8", "3.9", "3.10", "3.11", "3.12"]): return "411.image-recognition" not in benchmark + + # Cloudflare: only certain benchmarks are supported per language/deployment-type. + # Mirrors Cloudflare.SUPPORTED_BENCHMARKS in sebs/cloudflare/cloudflare.py. + # None means all benchmarks are supported for that combination. 
+ if deployment_name == "cloudflare": + _CF_SUPPORTED: Dict[Tuple[str, bool], Optional[Set[str]]] = { + ("python", False): {"110", "120", "130", "210", "311", "501", "502", "503"}, + ("nodejs", False): {"110", "120", "130", "311"}, + ("python", True): None, # all supported + ("nodejs", True): {"110", "120", "130", "210", "311"}, + } + is_container = deployment_type == "container" + allowed = _CF_SUPPORTED.get((language, is_container)) + if allowed is not None: + benchmark_id = benchmark.split(".")[0] + return benchmark_id in allowed # fmt: on # All other benchmarks are supported @@ -1261,6 +1430,34 @@ def regression_suite( unittest.defaultTestLoader.loadTestsFromTestCase(OpenWhiskTestSequenceJava) ) + # Add Cloudflare tests if requested + if "cloudflare" in providers: + assert ( + "cloudflare" in cloud_config["deployment"] + ), "Cloudflare provider requested but not in deployment config" + if language == "python": + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequencePythonWorkers + ) + ) + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequencePythonContainers + ) + ) + elif language == "nodejs": + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequenceNodejsWorkers + ) + ) + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequenceNodejsContainers + ) + ) + # Prepare the list of tests to run tests = [] # mypy is confused here about the type From 8ea37c560138f146ee080b10ffc3e84357701eb7 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 13 Apr 2026 18:52:05 +0200 Subject: [PATCH 078/230] feat(cloudflare): Implement multipart upload support for R2 storage in Python and Node.js workers --- .../cloudflare/nodejs/container/worker.js | 62 ++++++++-- .../cloudflare/python/container/storage.py | 115 ++++++++++++++---- sebs/benchmark.py | 15 ++- sebs/cloudflare/cloudflare.py | 2 +- sebs/cloudflare/containers.py | 10 +- 
sebs/cloudflare/triggers.py | 43 ++++++- 6 files changed, 199 insertions(+), 48 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index ba5eabcae..a22764d88 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -320,18 +320,57 @@ async function handleR2Request(request, env) { } } - // All other R2 operations require both bucket and key - if (!bucket || !key) { + // Multipart upload routes only need 'key' (bucket is implicit in the R2 binding) + if (url.pathname === '/r2/multipart-init') { + // Initiate a multipart upload; returns { key, uploadId } + const contentType = url.searchParams.get('contentType') || 'application/octet-stream'; + console.log(`[worker.js /r2/multipart-init] key=${key}, contentType=${contentType}`); + const multipart = await env.R2.createMultipartUpload(key, { + httpMetadata: { contentType } + }); + console.log(`[worker.js /r2/multipart-init] uploadId=${multipart.uploadId}`); + return new Response(JSON.stringify({ + key: multipart.key, + uploadId: multipart.uploadId + }), { headers: { 'Content-Type': 'application/json' } }); + + } else if (url.pathname === '/r2/multipart-part') { + // Upload one part; returns { partNumber, etag } + const uploadId = url.searchParams.get('uploadId'); + const partNumber = parseInt(url.searchParams.get('partNumber'), 10); + console.log(`[worker.js /r2/multipart-part] key=${key}, uploadId=${uploadId}, partNumber=${partNumber}`); + const multipart = env.R2.resumeMultipartUpload(key, uploadId); + const part = await multipart.uploadPart(partNumber, request.body); + console.log(`[worker.js /r2/multipart-part] uploaded part ${part.partNumber}, etag=${part.etag}`); + return new Response(JSON.stringify({ + partNumber: part.partNumber, + etag: part.etag + }), { headers: { 'Content-Type': 'application/json' } }); + + } else if (url.pathname === 
'/r2/multipart-complete') { + // Complete a multipart upload; body is JSON { parts: [{ partNumber, etag }] } + const uploadId = url.searchParams.get('uploadId'); + console.log(`[worker.js /r2/multipart-complete] key=${key}, uploadId=${uploadId}`); + const { parts } = await request.json(); + const multipart = env.R2.resumeMultipartUpload(key, uploadId); + const obj = await multipart.complete(parts); + console.log(`[worker.js /r2/multipart-complete] completed, size=${obj ? obj.size : '?'}`); + return new Response(JSON.stringify({ key: key }), { + headers: { 'Content-Type': 'application/json' } + }); + } + + // Download and upload require a key (bucket is implicit in the R2 binding) + if (!key) { return new Response(JSON.stringify({ - error: 'Missing bucket or key parameter' + error: 'Missing key parameter' }), { status: 400, headers: { 'Content-Type': 'application/json' } }); } - + if (url.pathname === '/r2/download') { - // Download from R2 const object = await env.R2.get(key); if (!object) { @@ -352,16 +391,19 @@ async function handleR2Request(request, env) { }); } else if (url.pathname === '/r2/upload') { - // Upload to R2 + // Upload to R2 — stream request.body directly to avoid buffering large payloads in Worker memory console.log(`[worker.js /r2/upload] bucket=${bucket}, key=${key}`); console.log(`[worker.js /r2/upload] env.R2 exists:`, !!env.R2); - const data = await request.arrayBuffer(); - console.log(`[worker.js /r2/upload] Received ${data.byteLength} bytes`); + const contentLength = request.headers.get('Content-Length'); + console.log(`[worker.js /r2/upload] Content-Length: ${contentLength}`); // Use the key as-is (container already generates unique keys if needed) try { - const putResult = await env.R2.put(key, data); - console.log(`[worker.js /r2/upload] R2.put() returned:`, putResult); + const putResult = await env.R2.put(key, request.body, { + httpMetadata: { contentType: request.headers.get('Content-Type') || 'application/octet-stream' } + }); + const 
size = putResult ? putResult.size : '(unknown)'; + console.log(`[worker.js /r2/upload] R2.put() succeeded, size=${size}`); console.log(`[worker.js /r2/upload] Successfully uploaded to R2 with key=${key}`); } catch (error) { console.error(`[worker.js /r2/upload] R2.put() error:`, error); diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index 53ab90d54..19c21f337 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -3,11 +3,23 @@ Uses HTTP proxy to access R2 storage through the Worker's R2 binding """ import io +import mimetypes import os import json import urllib.request import urllib.parse +def _guess_content_type(name: str) -> str: + """Infer MIME type from a file name, falling back to application/octet-stream.""" + ct, _ = mimetypes.guess_type(name) + return ct or 'application/octet-stream' + +# Cloudflare Workers enforce a 100 MB request body limit at the edge. +# Use multipart upload for payloads larger than this threshold so that +# each individual request stays well below that limit. 
+_MULTIPART_THRESHOLD = 10 * 1024 * 1024 # 10 MB +_PART_SIZE = 10 * 1024 * 1024 # 10 MB per part (R2 min is 5 MB) + class storage: """R2 storage client for containers using HTTP proxy to Worker""" instance = None @@ -44,6 +56,76 @@ def unique_name(name): name_part, extension = os.path.splitext(name) return f'{name_part}.{str(uuid.uuid4()).split("-")[0]}{extension}' + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _post_json(self, url: str, body: bytes = b'', content_type: str = 'application/octet-stream'): + """POST *body* to *url* and return the parsed JSON response.""" + req = urllib.request.Request(url, data=body, method='POST') + req.add_header('Content-Type', content_type) + with urllib.request.urlopen(req) as resp: + return json.loads(resp.read().decode('utf-8')) + + def _upload_bytes(self, key: str, data: bytes, content_type: str = 'application/octet-stream') -> str: + """Upload *data* to the exact R2 *key* via the worker proxy. + + Uses a single PUT for small payloads and R2 multipart upload for + payloads that exceed _MULTIPART_THRESHOLD (to stay under Cloudflare's + 100 MB per-request edge limit). + + Returns the R2 key. + """ + if len(data) <= _MULTIPART_THRESHOLD: + return self._single_upload(key, data, content_type) + return self._multipart_upload(key, data, content_type) + + def _single_upload(self, key: str, data: bytes, content_type: str = 'application/octet-stream') -> str: + params = urllib.parse.urlencode({'key': key}) + url = f"{storage.worker_url}/r2/upload?{params}" + result = self._post_json(url, data, content_type) + return result['key'] + + def _multipart_upload(self, key: str, data: bytes, content_type: str = 'application/octet-stream') -> str: + """Split *data* into ≤_PART_SIZE chunks and use R2 multipart upload.""" + # 1. 
Initiate + params = urllib.parse.urlencode({'key': key, 'contentType': content_type}) + init_url = f"{storage.worker_url}/r2/multipart-init?{params}" + init = self._post_json(init_url) + upload_id = init['uploadId'] + upload_key = init['key'] + print(f"[storage] multipart upload initiated: key={upload_key}, uploadId={upload_id}, " + f"total={len(data):,} bytes, parts={-(-len(data)//_PART_SIZE)}") + + # 2. Upload parts + completed_parts = [] + for part_num, offset in enumerate(range(0, len(data), _PART_SIZE), start=1): + chunk = data[offset:offset + _PART_SIZE] + params = urllib.parse.urlencode({ + 'key': upload_key, + 'uploadId': upload_id, + 'partNumber': part_num, + }) + part_url = f"{storage.worker_url}/r2/multipart-part?{params}" + part = self._post_json(part_url, chunk) + completed_parts.append({'partNumber': part['partNumber'], 'etag': part['etag']}) + print(f"[storage] uploaded part {part_num}, etag={part['etag']}") + + # 3. Complete + params = urllib.parse.urlencode({'key': upload_key, 'uploadId': upload_id}) + complete_url = f"{storage.worker_url}/r2/multipart-complete?{params}" + result = self._post_json( + complete_url, + json.dumps({'parts': completed_parts}).encode('utf-8'), + content_type='application/json', + ) + print(f"[storage] multipart upload complete: key={result['key']}") + return result['key'] + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + def upload_stream(self, bucket: str, key: str, data): """Upload data to R2 via worker proxy""" if not self.r2_enabled: @@ -61,17 +143,8 @@ def upload_stream(self, bucket: str, key: str, data): if isinstance(data, str): data = data.encode('utf-8') - # Upload via worker proxy - params = urllib.parse.urlencode({'bucket': bucket, 'key': key}) - url = f"{storage.worker_url}/r2/upload?{params}" - - req = urllib.request.Request(url, data=data, method='POST') - req.add_header('Content-Type', 
'application/octet-stream') - try: - with urllib.request.urlopen(req) as response: - result = json.loads(response.read().decode('utf-8')) - return result['key'] + return self._upload_bytes(key, data, _guess_content_type(key)) except Exception as e: print(f"R2 upload error: {e}") raise RuntimeError(f"Failed to upload to R2: {e}") @@ -104,12 +177,14 @@ def upload(self, bucket, key, filepath): """Upload file from disk with unique key generation""" # Generate unique key to avoid conflicts unique_key = self.unique_name(key) - + content_type = _guess_content_type(filepath) with open(filepath, 'rb') as f: data = f.read() - # Upload with the unique key - self._upload_with_key(bucket, unique_key, data) - return unique_key + try: + self._upload_bytes(unique_key, data, content_type) + except Exception as e: + raise RuntimeError(f"Failed to upload to R2: {e}") + return unique_key def _upload_with_key(self, bucket: str, key: str, data): """Upload data to R2 via worker proxy with exact key (internal method)""" @@ -128,17 +203,9 @@ def _upload_with_key(self, bucket: str, key: str, data): if isinstance(data, str): data = data.encode('utf-8') - # Upload via worker proxy with exact key - params = urllib.parse.urlencode({'bucket': bucket, 'key': key}) - url = f"{storage.worker_url}/r2/upload?{params}" - - req = urllib.request.Request(url, data=data, method='POST') - req.add_header('Content-Type', 'application/octet-stream') - try: - with urllib.request.urlopen(req) as response: - result = json.loads(response.read().decode('utf-8')) - print(f"[storage._upload_with_key] Upload successful, key={result['key']}") + result_key = self._upload_bytes(key, data, _guess_content_type(key)) + print(f"[storage._upload_with_key] Upload successful, key={result_key}") except Exception as e: print(f"R2 upload error: {e}") raise RuntimeError(f"Failed to upload to R2: {e}") diff --git a/sebs/benchmark.py b/sebs/benchmark.py index e2ae1f798..33019df33 100644 --- a/sebs/benchmark.py +++ 
b/sebs/benchmark.py @@ -585,7 +585,8 @@ def hash(self) -> str: """ path = os.path.join(self.benchmark_path, self.language_name) self._hash_value = Benchmark.hash_directory( - path, self._deployment_name, self.language, self._language_variant + path, self._deployment_name, self.language, self._language_variant, + container_deployment=self._container_deployment, ) return self._hash_value @@ -690,7 +691,8 @@ def __init__( @staticmethod def hash_directory( - directory: str, deployment: str, language: Language, variant: str = "default" + directory: str, deployment: str, language: Language, variant: str = "default", + container_deployment: bool = False, ): """ Compute MD5 hash of an entire directory. @@ -757,6 +759,15 @@ def hash_directory( else: with open(f, "rb") as opened_file: hash_sum.update(opened_file.read()) + # For Cloudflare Python containers, also hash the nodejs/container worker.js + # since containers.py always copies it into the build directory regardless of language. + if deployment == "cloudflare" and language == Language.PYTHON and container_deployment: + nodejs_worker = get_resource_path( + "benchmarks", "wrappers", "cloudflare", "nodejs", "container", "worker.js" + ) + if os.path.isfile(str(nodejs_worker)): + with open(str(nodejs_worker), "rb") as f: + hash_sum.update(f.read()) return hash_sum.hexdigest() def serialize(self) -> dict: diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 7b6dd5eb7..9baa8799b 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -543,7 +543,7 @@ def _create_or_update_worker( # The container binding needs time to propagate before first invocation if container_deployment: - self.logging.info("Waiting for container Durable Object to initialize...") + self.logging.info("Waiting for container worker to initialize...") account_id = env.get('CLOUDFLARE_ACCOUNT_ID') worker_url = self._build_workers_dev_url(worker_name, account_id) 
self._containers_deployment.wait_for_durable_object_ready( diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 5aa84936f..2d42c80da 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -470,7 +470,7 @@ def wait_for_durable_object_ready( max_wait_seconds: int = 400 ) -> bool: """ - Wait for container Durable Object to be fully provisioned and ready. + Wait for container worker to be fully provisioned and ready. Args: worker_name: Name of the worker @@ -483,7 +483,7 @@ def wait_for_durable_object_ready( wait_interval = 10 start_time = time.time() - self.logging.info("Checking container Durable Object readiness via health endpoint...") + self.logging.info("Checking container worker readiness via health endpoint...") consecutive_failures = 0 max_consecutive_failures = 5 @@ -498,13 +498,13 @@ def wait_for_durable_object_ready( # 200 = ready if response.status_code == 200: - self.logging.info("Container Durable Object is ready!") + self.logging.info("Container worker is ready!") return True # 503 = not ready yet elif response.status_code == 503: elapsed = int(time.time() - start_time) self.logging.info( - f"Container Durable Object not ready yet (503 Service Unavailable)... " + f"Container worker not ready yet (503 Service Unavailable)... " f"({elapsed}s elapsed, will retry)" ) # Other errors @@ -521,7 +521,7 @@ def wait_for_durable_object_ready( time.sleep(wait_interval) self.logging.warning( - f"Container Durable Object may not be fully ready after {max_wait_seconds}s. " + f"Container worker may not be fully ready after {max_wait_seconds}s. " "First invocation may still experience initialization delay." 
) return False diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 16c980c6e..f07185472 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -1,12 +1,18 @@ from typing import Optional import concurrent.futures import json +import time from datetime import datetime from io import BytesIO from sebs.faas.function import Trigger, ExecutionResult +class ContainerProvisioningError(RuntimeError): + """Raised when Cloudflare reports the container is still provisioning.""" + pass + + class HTTPTrigger(Trigger): """ HTTP trigger for Cloudflare Workers. @@ -91,21 +97,46 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec result.parse_benchmark_output(output) return result except json.decoder.JSONDecodeError: - self.logging.error(f"Invocation on URL {url} failed!") raw = data.getvalue() - if raw: - self.logging.error(f"Output: {raw.decode()}") + raw_text = raw.decode() if raw else "" + provisioning_phrases = ( + "no Container instance available", + "provisioning the Container", + "currently provisioning", + ) + if any(p.lower() in raw_text.lower() for p in provisioning_phrases): + self.logging.info(f"Container still provisioning (URL {url}): {raw_text[:120]}") + raise ContainerProvisioningError( + f"Container not yet available: {raw_text[:200]}" + ) + self.logging.error(f"Invocation on URL {url} failed!") + if raw_text: + self.logging.error(f"Output: {raw_text}") else: self.logging.error("No output provided!") - raise RuntimeError(f"Failed invocation of function! Output: {raw.decode()}") + raise RuntimeError(f"Failed invocation of function! 
Output: {raw_text}") def sync_invoke(self, payload: dict) -> ExecutionResult: """Synchronously invoke a Cloudflare Worker via HTTP.""" self.logging.debug(f"Invoke function {self.url}") - result = self._http_invoke(payload, self.url) + max_provisioning_retries = 6 + provisioning_retry_wait = 30 # seconds between retries + for attempt in range(max_provisioning_retries + 1): + try: + result = self._http_invoke(payload, self.url) + break + except ContainerProvisioningError: + if attempt < max_provisioning_retries: + self.logging.info( + f"Container still provisioning, waiting {provisioning_retry_wait}s before retry " + f"(attempt {attempt + 1}/{max_provisioning_retries})..." + ) + time.sleep(provisioning_retry_wait) + else: + raise # Extract measurement data from the response if available - if result.output and 'result' in result.output: + if result.output and 'result' in result.output: # type: ignore[union-attr] result_data = result.output['result'] if isinstance(result_data, dict) and 'measurement' in result_data: measurement = result_data['measurement'] From a60e5d4174bc2da64f59a370367be581c87b76ab Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 13 Apr 2026 20:49:30 +0200 Subject: [PATCH 079/230] feat(cloudflare): Enable parallel downloads in download_directory method for improved performance --- .../cloudflare/python/container/storage.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index 19c21f337..5ec17d859 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -220,8 +220,10 @@ def download(self, bucket, key, filepath): def download_directory(self, bucket, prefix, local_path): """ Download all files with a given prefix to a local directory. - Lists objects via /r2/list endpoint and downloads each one. 
+ Lists objects via /r2/list endpoint and downloads each one in parallel. """ + import concurrent.futures + if not storage.worker_url: raise RuntimeError("Worker URL not set - cannot access R2") @@ -241,26 +243,25 @@ def download_directory(self, bucket, prefix, local_path): objects = result.get('objects', []) print(f"Found {len(objects)} objects with prefix '{prefix}'") - - # Download each object - for obj in objects: + + def _download_one(obj): obj_key = obj['key'] - # Create local file path by removing the prefix relative_path = obj_key if prefix and obj_key.startswith(prefix): relative_path = obj_key[len(prefix):].lstrip('/') - local_file_path = os.path.join(local_path, relative_path) - - # Create directory structure if needed local_dir = os.path.dirname(local_file_path) if local_dir: os.makedirs(local_dir, exist_ok=True) - - # Download the file print(f"Downloading {obj_key} to {local_file_path}") self.download(bucket, obj_key, local_file_path) - + + # Download all objects in parallel (up to 16 concurrent) + with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor: + futures = [executor.submit(_download_one, obj) for obj in objects] + for fut in concurrent.futures.as_completed(futures): + fut.result() # re-raise any exception + return local_path except Exception as e: From 18070f02cfa5cbb195a8aa9535b14c660c977b37 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 13 Apr 2026 21:04:26 +0200 Subject: [PATCH 080/230] fix(regression): Update benchmark filtering to use test_benchmark for accuracy --- sebs/regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sebs/regression.py b/sebs/regression.py index c0809fdd1..2f06646ed 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -1469,8 +1469,9 @@ def regression_suite( # Remove unsupported benchmarks test_architecture = getattr(test, test_name).test_architecture # type: ignore test_deployment_type = getattr(test, test_name).test_deployment_type # type: ignore + 
test_benchmark = getattr(test, test_name).test_benchmark # type: ignore if not filter_out_benchmarks( - test_name, + test_benchmark, test.deployment_name, # type: ignore language, language_version, From 044b9eff991b0b520b2e185f8438cc1f8e806291 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 13 Apr 2026 21:11:41 +0200 Subject: [PATCH 081/230] fix(cloudflare): Enhance error handling for container provisioning in HTTPTrigger --- sebs/cloudflare/triggers.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index f07185472..ace4122ec 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -82,6 +82,17 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec else: output = json.loads(output["body"]) + if status_code == 502: + self.logging.info(f"Container returned 502 (still starting?), will retry...") + raise ContainerProvisioningError(f"502 gateway error from container worker") + + # Check for Cloudflare error code 1042 (worker not ready) in JSON response + if isinstance(output, dict) and "error code" in str(output): + error_str = str(output) + if "1042" in error_str: + self.logging.info(f"Worker returned error 1042 (not ready yet), will retry...") + raise ContainerProvisioningError(f"Error 1042 from worker: {error_str}") + if status_code != 200: self.logging.error(f"Invocation on URL {url} failed!") self.logging.error(f"Output: {output}") @@ -104,7 +115,7 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec "provisioning the Container", "currently provisioning", ) - if any(p.lower() in raw_text.lower() for p in provisioning_phrases): + if status_code == 502 or any(p.lower() in raw_text.lower() for p in provisioning_phrases): self.logging.info(f"Container still provisioning (URL {url}): {raw_text[:120]}") raise ContainerProvisioningError( f"Container not yet available: {raw_text[:200]}" From 
7ac2b8c6629a7b9ad368518585460e3ef5f83bb3 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 16 Apr 2026 14:28:20 +0200 Subject: [PATCH 082/230] fix(regression): Update benchmark filtering logic to correctly extract benchmark IDs --- sebs/regression.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sebs/regression.py b/sebs/regression.py index 2f06646ed..5f63936c2 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -1330,7 +1330,9 @@ def filter_out_benchmarks( is_container = deployment_type == "container" allowed = _CF_SUPPORTED.get((language, is_container)) if allowed is not None: - benchmark_id = benchmark.split(".")[0] + # benchmark is the test method name, e.g. "test_cloudflare_120.uploader_x64_workers" + # Extract the numeric benchmark prefix (e.g. "120") from before the first "." + benchmark_id = benchmark.split(".")[-2].split("_")[-1] if "." in benchmark else benchmark.split("_")[-1] return benchmark_id in allowed # fmt: on @@ -1469,9 +1471,8 @@ def regression_suite( # Remove unsupported benchmarks test_architecture = getattr(test, test_name).test_architecture # type: ignore test_deployment_type = getattr(test, test_name).test_deployment_type # type: ignore - test_benchmark = getattr(test, test_name).test_benchmark # type: ignore if not filter_out_benchmarks( - test_benchmark, + test_name, test.deployment_name, # type: ignore language, language_version, From 78b29790d4aa2c124ef3d52b1cddc33224958184 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 16 Apr 2026 14:28:29 +0200 Subject: [PATCH 083/230] fix(config): Correct cloudflare worker variant to use default configuration --- benchmarks/300.utilities/311.compression/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/300.utilities/311.compression/config.json b/benchmarks/300.utilities/311.compression/config.json index 0b20dbd82..d69311ae3 100644 --- a/benchmarks/300.utilities/311.compression/config.json +++ 
b/benchmarks/300.utilities/311.compression/config.json @@ -13,7 +13,7 @@ "language": "nodejs", "variants": { "default": "default", - "cloudflare": {"workers": "cloudflare", "containers": "default"} + "cloudflare": {"workers": "default", "containers": "default"} } } ], From 6d7e4d03c4b4880cb4d612e78d7144d93c6b43eb Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 17 Apr 2026 11:15:15 +0200 Subject: [PATCH 084/230] fix(cloudflare): Remove hardcoded language-variant from Cloudflare test configurations and enhance deployment type handling --- sebs/regression.py | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/sebs/regression.py b/sebs/regression.py index 5f63936c2..16607d0de 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -1099,7 +1099,6 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = False - config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1130,7 +1129,6 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = True - config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1161,7 +1159,6 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = False - 
config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1192,7 +1189,6 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = True - config_copy["experiments"]["runtime"]["language-variant"] = "cloudflare" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1346,6 +1342,7 @@ def regression_suite( providers: Set[str], deployment_config: dict, benchmark_name: Optional[str] = None, + deployment_type: Optional[str] = None, ): """Create and run a regression test suite for specified cloud providers. @@ -1438,27 +1435,31 @@ def regression_suite( "cloudflare" in cloud_config["deployment"] ), "Cloudflare provider requested but not in deployment config" if language == "python": - suite.addTest( - unittest.defaultTestLoader.loadTestsFromTestCase( - CloudflareTestSequencePythonWorkers + if deployment_type != "container": + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequencePythonWorkers + ) ) - ) - suite.addTest( - unittest.defaultTestLoader.loadTestsFromTestCase( - CloudflareTestSequencePythonContainers + if deployment_type != "workers": + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequencePythonContainers + ) ) - ) elif language == "nodejs": - suite.addTest( - unittest.defaultTestLoader.loadTestsFromTestCase( - CloudflareTestSequenceNodejsWorkers + if deployment_type != "container": + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequenceNodejsWorkers + ) ) - ) - suite.addTest( - unittest.defaultTestLoader.loadTestsFromTestCase( - 
CloudflareTestSequenceNodejsContainers + if deployment_type != "workers": + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase( + CloudflareTestSequenceNodejsContainers + ) ) - ) # Prepare the list of tests to run tests = [] @@ -1486,7 +1487,7 @@ def regression_suite( if not benchmark_name or (benchmark_name and benchmark_name in test_name): # Set up test instance with client and config test.client = sebs_client # type: ignore - test.experiment_config = experiment_config.copy() # type: ignore + test.experiment_config = copy.deepcopy(experiment_config) # type: ignore tests.append(test) else: print(f"Skip test {test_name}") From 2cc8f938657cf1f909cb3fb2d551e58d032f734c Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 17 Apr 2026 11:27:02 +0200 Subject: [PATCH 085/230] fix(cloudflare): Improve handling of Cloudflare error code 1042 for worker readiness and CPU time limit --- sebs/cloudflare/triggers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index ace4122ec..aa5f17faf 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -86,12 +86,12 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec self.logging.info(f"Container returned 502 (still starting?), will retry...") raise ContainerProvisioningError(f"502 gateway error from container worker") - # Check for Cloudflare error code 1042 (worker not ready) in JSON response - if isinstance(output, dict) and "error code" in str(output): - error_str = str(output) - if "1042" in error_str: - self.logging.info(f"Worker returned error 1042 (not ready yet), will retry...") - raise ContainerProvisioningError(f"Error 1042 from worker: {error_str}") + # Check for Cloudflare error code 1042 (CPU time limit / worker not ready) + # Output may be a plain string like "error code: 1042" rather than a dict. 
+ output_str = str(output) + if "1042" in output_str and "error code" in output_str: + self.logging.info(f"Worker returned error 1042 (CPU time limit), will retry...") + raise ContainerProvisioningError(f"Error 1042 from worker: {output_str}") if status_code != 200: self.logging.error(f"Invocation on URL {url} failed!") @@ -115,6 +115,9 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec "provisioning the Container", "currently provisioning", ) + if "1042" in raw_text and "error code" in raw_text: + self.logging.info(f"Worker returned error 1042 (CPU time limit), will retry...") + raise ContainerProvisioningError(f"Error 1042 from worker: {raw_text[:200]}") if status_code == 502 or any(p.lower() in raw_text.lower() for p in provisioning_phrases): self.logging.info(f"Container still provisioning (URL {url}): {raw_text[:120]}") raise ContainerProvisioningError( From 3f8e69ce5bcf9cbdb8259c616b0289c57f4211db Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 17 Apr 2026 12:48:17 +0200 Subject: [PATCH 086/230] fix(cloudflare): Enhance content type inference and improve upload key handling in storage modules --- .../cloudflare/nodejs/container/storage.js | 167 +++++++++++------- .../cloudflare/nodejs/container/worker.js | 39 +++- .../cloudflare/python/container/storage.py | 4 +- 3 files changed, 143 insertions(+), 67 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js index f05d2fb14..e9f630187 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js @@ -2,6 +2,14 @@ const fs = require('fs'); const path = require('path'); const uuid = require('uuid'); +const MULTIPART_THRESHOLD = 10 * 1024 * 1024; +const PART_SIZE = 10 * 1024 * 1024; + +function isRetryableSingleUploadError(error) { + const message = error?.message || ''; + return /HTTP 4(?:08|13|29)|request 
body|payload|too large|content length|body size|stream/i.test(message); +} + /** * Storage module for Cloudflare Node.js Containers * Uses HTTP proxy to access R2 storage through the Worker's R2 binding @@ -12,9 +20,6 @@ class storage { this.r2_enabled = true; } - static worker_url = null; // Set by handler from X-Worker-URL header - - static worker_url = null; // Set by handler from X-Worker-URL header static unique_name(name) { @@ -41,47 +46,109 @@ class storage { return storage.instance; } - async upload_stream(bucket, key, data) { - if (!this.r2_enabled) { - console.log('Warning: R2 not configured, skipping upload'); - return key; + _toBuffer(data) { + if (Buffer.isBuffer(data)) { + return data; + } + if (typeof data === 'string') { + return Buffer.from(data, 'utf-8'); } + if (data instanceof ArrayBuffer) { + return Buffer.from(data); + } + return Buffer.from(String(data), 'utf-8'); + } - if (!storage.worker_url) { - throw new Error('Worker URL not set - cannot access R2'); + async _postJson(url, body = Buffer.alloc(0), contentType = null) { + const options = { + method: 'POST', + body, + }; + + if (contentType) { + options.headers = { 'Content-Type': contentType }; } - const unique_key = storage.unique_name(key); + const response = await fetch(url, options); - // Convert data to Buffer if needed - let buffer; - if (Buffer.isBuffer(data)) { - buffer = data; - } else if (typeof data === 'string') { - buffer = Buffer.from(data, 'utf-8'); - } else if (data instanceof ArrayBuffer) { - buffer = Buffer.from(data); - } else { - buffer = Buffer.from(String(data), 'utf-8'); + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${await response.text()}`); } - // Upload via worker proxy - const params = new URLSearchParams({ bucket, key: unique_key }); + return response.json(); + } + + async _single_upload(key, buffer) { + const params = new URLSearchParams({ key }); const url = `${storage.worker_url}/r2/upload?${params}`; + const result = await 
this._postJson(url, buffer); + return result.key; + } - try { - const response = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/octet-stream' }, - body: buffer, + async _multipart_upload(key, buffer) { + const initParams = new URLSearchParams({ key }); + const initUrl = `${storage.worker_url}/r2/multipart-init?${initParams}`; + const init = await this._postJson(initUrl); + const uploadId = init.uploadId; + const uploadKey = init.key; + const completedParts = []; + + for (let offset = 0, partNumber = 1; offset < buffer.length; offset += PART_SIZE, partNumber += 1) { + const chunk = buffer.subarray(offset, offset + PART_SIZE); + const partParams = new URLSearchParams({ + key: uploadKey, + uploadId, + partNumber: String(partNumber), }); + const partUrl = `${storage.worker_url}/r2/multipart-part?${partParams}`; + const part = await this._postJson(partUrl, chunk, 'application/octet-stream'); + completedParts.push({ partNumber: part.partNumber, etag: part.etag }); + } - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${await response.text()}`); + const completeParams = new URLSearchParams({ key: uploadKey, uploadId }); + const completeUrl = `${storage.worker_url}/r2/multipart-complete?${completeParams}`; + const result = await this._postJson( + completeUrl, + Buffer.from(JSON.stringify({ parts: completedParts }), 'utf-8'), + 'application/json' + ); + return result.key; + } + + async _upload_bytes(key, buffer) { + if (buffer.length > MULTIPART_THRESHOLD) { + return this._multipart_upload(key, buffer); + } + + try { + return await this._single_upload(key, buffer); + } catch (error) { + if (!isRetryableSingleUploadError(error)) { + throw error; } - const result = await response.json(); - return result.key; + console.warn( + `[storage] single upload failed for ${key}; retrying with multipart upload: ${error.message}` + ); + return this._multipart_upload(key, buffer); + } + } + + async upload_stream(bucket, key, data) { + if 
(!this.r2_enabled) { + console.log('Warning: R2 not configured, skipping upload'); + return key; + } + + if (!storage.worker_url) { + throw new Error('Worker URL not set - cannot access R2'); + } + + const unique_key = storage.unique_name(key); + const buffer = this._toBuffer(data); + + try { + return await this._upload_bytes(unique_key, buffer); } catch (error) { console.error('R2 upload error:', error); throw new Error(`Failed to upload to R2: ${error.message}`); @@ -152,41 +219,13 @@ class storage { console.log(`[storage._upload_stream_with_key] Worker URL: ${storage.worker_url}`); - // Convert data to Buffer if needed - let buffer; - if (Buffer.isBuffer(data)) { - buffer = data; - } else if (typeof data === 'string') { - buffer = Buffer.from(data, 'utf-8'); - } else if (data instanceof ArrayBuffer) { - buffer = Buffer.from(data); - } else { - buffer = Buffer.from(String(data), 'utf-8'); - } - - // Upload via worker proxy - const params = new URLSearchParams({ bucket, key }); - const url = `${storage.worker_url}/r2/upload?${params}`; - console.log(`[storage._upload_stream_with_key] Uploading to URL: ${url}, buffer size: ${buffer.length}`); + const buffer = this._toBuffer(data); + console.log(`[storage._upload_stream_with_key] Uploading key=${key}, buffer size: ${buffer.length}`); try { - const response = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/octet-stream' }, - body: buffer, - }); - - console.log(`[storage._upload_stream_with_key] Response status: ${response.status}`); - - if (!response.ok) { - const errorText = await response.text(); - console.error(`[storage._upload_stream_with_key] Upload failed: ${response.status} - ${errorText}`); - throw new Error(`HTTP ${response.status}: ${errorText}`); - } - - const result = await response.json(); - console.log(`[storage._upload_stream_with_key] Upload successful, returned key: ${result.key}`); - return result.key; + const resultKey = await this._upload_bytes(key, buffer); + 
console.log(`[storage._upload_stream_with_key] Upload successful, returned key: ${resultKey}`); + return resultKey; } catch (error) { console.error('R2 upload error:', error); throw new Error(`Failed to upload to R2: ${error.message}`); diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index a22764d88..97d937fe6 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -110,6 +110,40 @@ export default { } }; +const MIME_TYPES = { + '.bin': 'application/octet-stream', + '.csv': 'text/csv', + '.gif': 'image/gif', + '.htm': 'text/html', + '.html': 'text/html', + '.jpeg': 'image/jpeg', + '.jpg': 'image/jpeg', + '.json': 'application/json', + '.mov': 'video/quicktime', + '.mp3': 'audio/mpeg', + '.mp4': 'video/mp4', + '.pdf': 'application/pdf', + '.png': 'image/png', + '.svg': 'image/svg+xml', + '.txt': 'text/plain', + '.wav': 'audio/wav', + '.webm': 'video/webm', + '.xml': 'application/xml', + '.zip': 'application/zip', +}; + +function inferContentTypeFromKey(key) { + if (!key) { + return 'application/octet-stream'; + } + const dot = key.lastIndexOf('.'); + if (dot < 0) { + return 'application/octet-stream'; + } + const extension = key.slice(dot).toLowerCase(); + return MIME_TYPES[extension] || 'application/octet-stream'; +} + /** * Handle NoSQL (KV namespace) requests proxied from the container * Routes: @@ -323,7 +357,7 @@ async function handleR2Request(request, env) { // Multipart upload routes only need 'key' (bucket is implicit in the R2 binding) if (url.pathname === '/r2/multipart-init') { // Initiate a multipart upload; returns { key, uploadId } - const contentType = url.searchParams.get('contentType') || 'application/octet-stream'; + const contentType = url.searchParams.get('contentType') || inferContentTypeFromKey(key); console.log(`[worker.js /r2/multipart-init] key=${key}, 
contentType=${contentType}`); const multipart = await env.R2.createMultipartUpload(key, { httpMetadata: { contentType } @@ -399,8 +433,9 @@ async function handleR2Request(request, env) { // Use the key as-is (container already generates unique keys if needed) try { + const contentType = request.headers.get('Content-Type') || inferContentTypeFromKey(key); const putResult = await env.R2.put(key, request.body, { - httpMetadata: { contentType: request.headers.get('Content-Type') || 'application/octet-stream' } + httpMetadata: { contentType } }); const size = putResult ? putResult.size : '(unknown)'; console.log(`[worker.js /r2/upload] R2.put() succeeded, size=${size}`); diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index 5ec17d859..bb8066d16 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -142,9 +142,11 @@ def upload_stream(self, bucket: str, key: str, data): # Convert to bytes if needed if isinstance(data, str): data = data.encode('utf-8') + + unique_key = self.unique_name(key) try: - return self._upload_bytes(key, data, _guess_content_type(key)) + return self._upload_bytes(unique_key, data, _guess_content_type(unique_key)) except Exception as e: print(f"R2 upload error: {e}") raise RuntimeError(f"Failed to upload to R2: {e}") From c8ca384dec2009c292df47a668018b4553349176 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 17 Apr 2026 13:34:04 +0200 Subject: [PATCH 087/230] fix(cloudflare): Update storage documentation to clarify container upload behavior and KVStore integration --- docs/storage.md | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/storage.md b/docs/storage.md index 1a4ee4573..d09105989 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -149,18 +149,36 @@ R2 configuration is handled automatically by SeBS when deploying to 
Cloudflare W **Limitations:** - Geographic location hints (locationHint) are not currently supported. R2 buckets are created with Cloudflare's automatic location selection, which places data near where it's most frequently accessed. -### Durable Objects for NoSQL +### Container Upload Behavior (R2 Proxy) -Cloudflare Durable Objects provide stateful storage for NoSQL operations required by benchmarks like the CRUD API (130.crud-api). +For Cloudflare container deployments, benchmark code does not talk to R2 directly. Instead, container wrappers call the Worker proxy endpoints (`/r2/upload`, `/r2/multipart-init`, `/r2/multipart-part`, `/r2/multipart-complete`). + +**Upload strategy:** +- Small payloads use a single upload request. +- Large payloads use multipart upload (10 MB threshold, 10 MB part size in current wrappers). +- Node.js container wrapper retries with multipart when single-upload fails with size/body-limit style errors. + +**Object keys and uniqueness:** +- Container wrappers generate unique output keys (suffix based on UUID fragment) before upload. +- This avoids collisions and keeps run-specific output objects distinct in regression and repeated invocations. + +**Content-Type behavior:** +- Stored object metadata should reflect the real file type (for example `image/jpeg`, `image/png`) when inferable. +- In the Node.js container path, the Worker proxy infers content type from the object key extension when the caller omits it. +- In the Python container path, the wrapper infers content type and passes it to the Worker proxy. +- Multipart part transport may still use `application/octet-stream`; this is expected for chunk transport and does not imply final object metadata must be octet-stream. + +### KVStore for NoSQL + +Cloudflare KV namespaces are used for NoSQL operations required by benchmarks such as CRUD API (130.crud-api). 
**Key Features:** -- Strongly consistent storage -- Low-latency access from Workers -- Built-in coordination primitives -- Global replication +- Native Workers integration through KV bindings +- Simple key-value interface compatible with SeBS NoSQL wrapper operations +- Global edge distribution for read-heavy access patterns **Usage:** -SeBS configures Durable Objects bindings automatically when deploying container-based Workers that require NoSQL storage. The benchmark wrappers handle the interaction with Durable Objects through the standard SeBS storage interface. +SeBS configures KV namespace bindings automatically for Cloudflare deployments that require NoSQL storage. Benchmark wrappers access KV through the standard SeBS NoSQL interface (insert/update/get/query/delete). ## Lifecycle Management From b71e2c8eefe940a8ac2ab77d9225635b87983f4d Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 17 Apr 2026 15:06:06 +0200 Subject: [PATCH 088/230] fix(cli): Add deployment type option to regression command for better test targeting --- sebs/cli.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sebs/cli.py b/sebs/cli.py index 6de1d4dca..4ab62e52f 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -469,7 +469,13 @@ def package( default=os.path.join(os.path.curdir, "regression-output"), help="Output directory for results.", ) -def regression(benchmark_input_size, benchmark_name, storage_configuration, **kwargs): +@click.option( + "--deployment-type", + default=None, + type=click.Choice(["workers", "container"]), + help="Limit regression to a specific deployment type (workers or container).", +) +def regression(benchmark_input_size, benchmark_name, storage_configuration, deployment_type, **kwargs): """Run regression test suite across benchmarks.""" # for regression, deployment client is initialized locally # disable default initialization @@ -484,6 +490,7 @@ def regression(benchmark_input_size, benchmark_name, storage_configuration, **kw 
set((config["deployment"]["name"],)), config, benchmark_name, + deployment_type, ) From 53a1cd28cdf1c8fb156a8ece3724879f268817b7 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 17 Apr 2026 15:06:16 +0200 Subject: [PATCH 089/230] fix(cloudflare): Pin workers-py to version 1.8.0 to avoid broken import issues --- dockerfiles/cloudflare/Dockerfile.manage | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dockerfiles/cloudflare/Dockerfile.manage b/dockerfiles/cloudflare/Dockerfile.manage index ac18ac336..88b122ba3 100644 --- a/dockerfiles/cloudflare/Dockerfile.manage +++ b/dockerfiles/cloudflare/Dockerfile.manage @@ -20,9 +20,10 @@ RUN apt-get clean && apt-get update \ RUN npm install -g wrangler # Install uv (fast Python package installer) and pywrangler -# uv install script puts the binary in ~/.local/bin by default (not ~/.cargo/bin) +# Pin workers-py to 1.8.0: 1.9.x introduced a broken import (rich.logging.Console) +# which does not exist in any version of rich. Remove the pin once upstream fixes it. 
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ - /root/.local/bin/uv tool install workers-py + /root/.local/bin/uv tool install 'workers-py==1.8.0' # Add paths to environment ENV PATH="/root/.local/bin:/root/.local/share/uv/tools/workers-py/bin:${PATH}" From 32debbf23224dbd8e8913945dbbd556c9eee6dfd Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 11:16:17 +0200 Subject: [PATCH 090/230] fix(cloudflare): Correct typo in typename and implement download method for R2 with error handling --- sebs/cloudflare/r2.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 45a1167c6..7b9e21272 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -11,7 +11,7 @@ class R2(PersistentStorage): @staticmethod def typename() -> str: - return "Cloudlfare.R2" + return "Cloudflare.R2" @staticmethod def deployment_name() -> str: @@ -168,10 +168,21 @@ def download(self, bucket_name: str, key: str, filepath: str) -> None: :param key: storage source filepath :param filepath: local destination filepath """ - # R2 requires S3-compatible access for object operations - # For now, this is not fully implemented - self.logging.warning(f"download not fully implemented for R2 bucket {bucket_name}") - pass + s3_client = self._get_s3_client() + if s3_client is None: + self.logging.warning(f"Cannot download {key} from R2 - S3 client not available") + return + + try: + dirname = os.path.dirname(filepath) + if dirname: + os.makedirs(dirname, exist_ok=True) + s3_client.download_file(bucket_name, key, filepath) + self.logging.debug( + f"Downloaded {key} from R2 bucket {bucket_name} to {filepath}" + ) + except Exception as e: + self.logging.warning(f"Failed to download {key} from R2: {e}") def upload(self, bucket_name: str, filepath: str, key: str): """ From e22bb62e3311fd744404f3536974db4093be64e7 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 11:19:58 +0200 Subject: [PATCH 
091/230] fix(docs): Update comments to clarify Cloudflare Workers differences for Node.js and Python implementations --- .../100.webapps/120.uploader/nodejs/cloudflare/function.js | 4 ++++ .../100.webapps/120.uploader/python/cloudflare/function.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js b/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js index 5efc8103f..fe27944fd 100644 --- a/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js +++ b/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js @@ -1,4 +1,8 @@ // Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. +// Cloudflare Workers differ from the default Node.js version: Workers require +// ES module syntax (no CommonJS `require`) and do not ship the `request` npm +// package, so we use the platform-native `fetch` API and buffer the response +// into /tmp instead of piping a stream. import * as fs from 'node:fs'; import * as path from 'node:path'; import { storage } from './storage'; diff --git a/benchmarks/100.webapps/120.uploader/python/cloudflare/function.py b/benchmarks/100.webapps/120.uploader/python/cloudflare/function.py index 98372cf0f..e4028b14f 100644 --- a/benchmarks/100.webapps/120.uploader/python/cloudflare/function.py +++ b/benchmarks/100.webapps/120.uploader/python/cloudflare/function.py @@ -1,3 +1,8 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. +# Cloudflare Workers differ from the default Python version: the Workers +# Python runtime is Pyodide-based and does not support `urllib.request`, so +# we download via Pyodide's async `pyfetch` and wrap it with `run_sync` to +# keep the synchronous handler signature. 
import datetime import os From acf2e33f005a13d3339f1d7071185200e8814d01 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 13:27:38 +0200 Subject: [PATCH 092/230] fix(cloudflare): Improve request handling and error responses in container server --- .../cloudflare/nodejs/container/handler.js | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index 9b8b25e19..7980a3f62 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -59,19 +59,21 @@ try { const PORT = process.env.PORT || 8080; const server = http.createServer(async (req, res) => { - // Handle favicon requests - if (req.url.includes('favicon')) { - res.writeHead(200); - res.end('None'); - return; - } - try { // Get unique request ID from Cloudflare (CF-Ray header) const crypto = require('crypto'); const reqId = req.headers['cf-ray'] || crypto.randomUUID(); - // Extract Worker URL from header for R2 and NoSQL proxy + // Extract Worker URL from header for R2 and NoSQL proxy. + // + // Containers run in a separate runtime from Workers and cannot access R2 or + // KV bindings directly — those bindings only exist in the Worker's `env`. + // To let the benchmark code reach storage, worker.js injects its own public + // origin into the X-Worker-URL header before forwarding the request here. + // The container-side storage/nosql modules use this URL to call back into + // the Worker over HTTP (e.g. POST ${workerUrl}/r2/upload), and worker.js + // intercepts those paths (/r2/*, /nosql/*) and performs the binding call + // on the container's behalf. 
const workerUrl = req.headers['x-worker-url']; if (workerUrl) { if (storage && storage.storage && storage.storage.set_worker_url) { @@ -100,6 +102,9 @@ const server = http.createServer(async (req, res) => { event = JSON.parse(body); } catch (e) { console.error('Failed to parse JSON body:', e); + res.writeHead(400, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ error: 'Invalid JSON body', message: e.message })); + return; } } From e4b2abfd424fd5d28d4c7eeaded64a58e22dfe49 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 13:48:43 +0200 Subject: [PATCH 093/230] fix(cloudflare): Enhance debugging by utilizing Node.js debuglog for response logging --- .../cloudflare/nodejs/container/handler.js | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index 7980a3f62..820037e0d 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -2,6 +2,7 @@ // This handler is used when deploying as a container worker const http = require('http'); +const debug = require('util').debuglog('sebs'); // Monkey-patch the 'request' library to always include a User-Agent header // This is needed because Wikimedia (and other sites) require a User-Agent @@ -122,21 +123,9 @@ const server = http.createServer(async (req, res) => { event['request-id'] = reqId; event['income-timestamp'] = incomeTimestamp; - // For debugging: check /tmp directory before and after benchmark - const fs = require('fs'); - // Call the benchmark function const ret = await benchmarkHandler(event); - - // Check what was downloaded - const tmpFiles = fs.readdirSync('/tmp'); - for (const file of tmpFiles) { - const filePath = `/tmp/${file}`; - const stats = fs.statSync(filePath); - if (stats.size < 500) { - const content = fs.readFileSync(filePath, 'utf8'); - } - } 
+ // Calculate elapsed time const end = Date.now() / 1000; @@ -156,7 +145,8 @@ const server = http.createServer(async (req, res) => { const memory_mb = memUsage.heapUsed / 1024 / 1024; log_data.measurement.memory_used_mb = memory_mb; - console.log('Sending response with log_data:', log_data); + // Gated behind Node.js' built-in debuglog — enable with NODE_DEBUG=sebs + debug('Sending response with log_data: %o', log_data); // Send response matching Python handler format exactly if (event.html) { From 28d90d764076330566211b2dd234fc78d1b199fd Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 13:50:29 +0200 Subject: [PATCH 094/230] fix(nosql): Update documentation and streamline query method by removing debug logs --- .../cloudflare/nodejs/container/nosql.js | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js index 3469bf6b9..b704b0157 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js @@ -1,6 +1,16 @@ /** - * NoSQL module for Cloudflare Node.js Containers - * Uses HTTP proxy to access Durable Objects through the Worker's binding + * NoSQL module for Cloudflare Node.js Containers. + * + * On Cloudflare, NoSQL storage is mapped to KVStore. KVStore + * bindings only exist inside the Worker runtime, so a container cannot talk + * to them directly. Instead, the container forwards each operation over HTTP + * to the parent Worker (see worker.js), which holds the KVStore + * binding and performs the actual read/write. + * + * Because of this, the HTTP endpoint depends on the Worker's URL, which is + * not known ahead of time. The handler receives it via the X-Worker-URL + * header on the incoming request and installs it here through + * set_worker_url() before any NoSQL call is made. 
*/ class nosql { @@ -90,12 +100,7 @@ class nosql { secondary_key_name: secondaryKeyName, }; const result = await this._make_request('query', params); - console.error(`[nosql.query] result:`, JSON.stringify(result)); - console.error(`[nosql.query] result.items:`, result.items); - console.error(`[nosql.query] Array.isArray(result.items):`, Array.isArray(result.items)); - const items = result.items || []; - console.error(`[nosql.query] returning items:`, items); - return items; + return result.items || []; } async delete(tableName, primaryKey, secondaryKey) { From 7517eaac7bf8689c1ee83784e208991653d240cb Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 14:24:47 +0200 Subject: [PATCH 095/230] refactor(docker): Moved Dockerfiles for Node.js and Python functions --- .../cloudflare/nodejs/{Dockerfile.worker => Dockerfile.build} | 0 dockerfiles/cloudflare/nodejs/{Dockerfile => Dockerfile.function} | 0 dockerfiles/cloudflare/python/{Dockerfile => Dockerfile.function} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename dockerfiles/cloudflare/nodejs/{Dockerfile.worker => Dockerfile.build} (100%) rename dockerfiles/cloudflare/nodejs/{Dockerfile => Dockerfile.function} (100%) rename dockerfiles/cloudflare/python/{Dockerfile => Dockerfile.function} (100%) diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile.worker b/dockerfiles/cloudflare/nodejs/Dockerfile.build similarity index 100% rename from dockerfiles/cloudflare/nodejs/Dockerfile.worker rename to dockerfiles/cloudflare/nodejs/Dockerfile.build diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile b/dockerfiles/cloudflare/nodejs/Dockerfile.function similarity index 100% rename from dockerfiles/cloudflare/nodejs/Dockerfile rename to dockerfiles/cloudflare/nodejs/Dockerfile.function diff --git a/dockerfiles/cloudflare/python/Dockerfile b/dockerfiles/cloudflare/python/Dockerfile.function similarity index 100% rename from dockerfiles/cloudflare/python/Dockerfile rename to 
dockerfiles/cloudflare/python/Dockerfile.function From 6eae47cea5f18f50dc5a3ad4df4cf37ebacdedcc Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 14:42:36 +0200 Subject: [PATCH 096/230] feat(docker): Add Dockerfile.build for Python worker validation and update references in deployment logic --- .../cloudflare/python/Dockerfile.build | 32 +++++++ sebs/cloudflare/containers.py | 4 +- sebs/cloudflare/workers.py | 92 +++++++++++++++++-- 3 files changed, 116 insertions(+), 12 deletions(-) create mode 100644 dockerfiles/cloudflare/python/Dockerfile.build diff --git a/dockerfiles/cloudflare/python/Dockerfile.build b/dockerfiles/cloudflare/python/Dockerfile.build new file mode 100644 index 000000000..33c5ff04f --- /dev/null +++ b/dockerfiles/cloudflare/python/Dockerfile.build @@ -0,0 +1,32 @@ +FROM python:3.11-slim + +# curl + ca-certificates are required by the uv installer; git is sometimes +# pulled in by workers-py when it resolves VCS-declared deps. +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl ca-certificates git \ + && rm -rf /var/lib/apt/lists/* + +# Install uv (fast Python package manager) and the workers-py toolchain, +# which provides pywrangler for Pyodide-based Cloudflare Worker deploys. +# Pinned to 1.8.0 to match Dockerfile.manage — 1.9.x introduced a broken +# import (rich.logging.Console). Remove the pin once upstream fixes it. +RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ + /root/.local/bin/uv tool install 'workers-py==1.8.0' + +ENV PATH="/root/.local/bin:/root/.local/share/uv/tools/workers-py/bin:${PATH}" + +WORKDIR /worker + +# Build context is populated by sebs; .venv / python_modules / Dockerfile.build +# itself are excluded via the .dockerignore injected alongside this file. +COPY . . + +# Validate that the generated pyproject.toml parses and that the workers-py +# toolchain is installed and callable. 
Pyodide package resolution itself +# happens at deploy time inside pywrangler (Cloudflare re-resolves Pyodide +# packages server-side), so there is no vendored dist/ to extract — this +# image exists purely for early failure detection on malformed packages. +RUN if [ -f pyproject.toml ]; then \ + python -c "import tomllib; tomllib.load(open('pyproject.toml','rb'))" && \ + pywrangler --version; \ + fi diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 2d42c80da..25b90b7fc 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -196,7 +196,7 @@ def package_code( ) # Copy container wrapper files to the package directory - # Copy Dockerfile from dockerfiles/cloudflare/{language}/ + # Copy Dockerfile.function from dockerfiles/cloudflare/{language}/ dockerfile_src = os.path.join( os.path.dirname(__file__), "..", @@ -204,7 +204,7 @@ def package_code( "dockerfiles", "cloudflare", language_name, - "Dockerfile" + "Dockerfile.function" ) dockerfile_dest = os.path.join(directory, "Dockerfile") if os.path.exists(dockerfile_src): diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 83d12965d..16f0369c3 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -200,10 +200,10 @@ def package_code( """ # Install dependencies and bundle if language_name == "nodejs": - # Build via Dockerfile.worker (npm install + esbuild + __require patching), + # Build via Dockerfile.build (npm install + esbuild + __require patching), # then extract the produced dist/ back into the package directory. - # This mirrors how container deployments use their Dockerfile — the only - # difference is which Dockerfile is selected. + # This mirrors how container deployments use their Dockerfile.function — the + # only difference is which Dockerfile is selected. 
self._build_worker_and_extract_dist(directory, is_cached) elif language_name == "python": @@ -283,6 +283,11 @@ def package_code( dest = os.path.join(directory, "function", thing) shutil.move(src, dest) + # Early validation: build Dockerfile.build to confirm the + # generated pyproject.toml parses and the workers-py toolchain + # is wired up. Deploy still runs pywrangler from Dockerfile.manage. + self._build_python_worker(directory, is_cached) + # Create package structure CONFIG_FILES = { "nodejs": ["handler.js", "package.json", "node_modules"], @@ -322,11 +327,11 @@ def package_code( return (directory, total_size, "") def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> None: - """Build the Node.js worker bundle via Dockerfile.worker and extract dist/. + """Build the Node.js worker bundle via Dockerfile.build and extract dist/. Runs npm install, esbuild (build.js), and the __require→import post- processing step (postprocess.js) inside a throwaway Docker image built - from Dockerfile.worker. Only the resulting dist/ directory is extracted + from Dockerfile.build. Only the resulting dist/ directory is extracted back to *directory*; intermediate artifacts (node_modules, build image) stay inside Docker. @@ -341,13 +346,13 @@ def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> Non dockerfile_src = os.path.join( os.path.dirname(__file__), "..", "..", - "dockerfiles", "cloudflare", "nodejs", "Dockerfile.worker" + "dockerfiles", "cloudflare", "nodejs", "Dockerfile.build" ) - dockerfile_dest = os.path.join(directory, "Dockerfile.worker") + dockerfile_dest = os.path.join(directory, "Dockerfile.build") dockerignore_dest = os.path.join(directory, ".dockerignore") # Keep the build context lean: exclude generated / heavy artifacts. 
- dockerignore_content = "node_modules\ndist\nDockerfile.worker\n.dockerignore\n" + dockerignore_content = "node_modules\ndist\nDockerfile.build\n.dockerignore\n" shutil.copy2(dockerfile_src, dockerfile_dest) with open(dockerignore_dest, "w") as f: f.write(dockerignore_content) @@ -356,10 +361,10 @@ def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> Non image_tag = f"sebs-worker-build-{os.path.basename(directory)}-{os.getpid()}:latest" try: - self.logging.info(f"Building worker bundle via Dockerfile.worker in {directory}") + self.logging.info(f"Building worker bundle via Dockerfile.build in {directory}") _, build_logs = self.docker_client.images.build( path=directory, - dockerfile="Dockerfile.worker", + dockerfile="Dockerfile.build", tag=image_tag, rm=True, ) @@ -398,6 +403,73 @@ def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> Non except Exception: pass + def _build_python_worker(self, directory: str, is_cached: bool) -> None: + """Validate a Python worker package via Dockerfile.build. + + Mirrors _build_worker_and_extract_dist for structural symmetry with + the Node.js flow and with Dockerfile.build layouts in other clouds. + Unlike Node.js (which needs esbuild + __require→import patching), + Pyodide Worker deploys don't require a vendored bundle — Cloudflare + resolves Pyodide packages server-side at deploy time via pywrangler. + So this image only validates that the generated pyproject.toml parses + and that workers-py is callable; nothing is extracted. + + A marker file is used for caching: once validation succeeds it is + skipped on subsequent builds of the same directory. 
+ """ + import docker as docker_module + + marker = os.path.join(directory, ".build-validated") + if is_cached and os.path.exists(marker): + self.logging.info("Cached Python build marker — skipping validation.") + return + + dockerfile_src = os.path.join( + os.path.dirname(__file__), "..", "..", + "dockerfiles", "cloudflare", "python", "Dockerfile.build" + ) + dockerfile_dest = os.path.join(directory, "Dockerfile.build") + dockerignore_dest = os.path.join(directory, ".dockerignore") + + dockerignore_content = ( + "python_modules\n.venv\nDockerfile.build\n.dockerignore\n" + ) + shutil.copy2(dockerfile_src, dockerfile_dest) + with open(dockerignore_dest, "w") as f: + f.write(dockerignore_content) + + image_tag = f"sebs-python-build-{os.path.basename(directory)}-{os.getpid()}:latest" + + try: + self.logging.info( + f"Validating Python worker via Dockerfile.build in {directory}" + ) + _, build_logs = self.docker_client.images.build( + path=directory, + dockerfile="Dockerfile.build", + tag=image_tag, + rm=True, + ) + for log in build_logs: + if "stream" in log: + self.logging.debug(log["stream"].strip()) + elif "error" in log: + raise RuntimeError(f"Docker build error: {log['error']}") + + with open(marker, "w") as f: + f.write("ok") + + except docker_module.errors.BuildError as e: + raise RuntimeError(f"Python worker validation failed: {e}") + finally: + for tmp in (dockerfile_dest, dockerignore_dest): + if os.path.exists(tmp): + os.remove(tmp) + try: + self.docker_client.images.remove(image_tag, force=True) + except Exception: + pass + def shutdown(self): """Shutdown CLI container if initialized.""" if self._cli is not None: From 6a3a8db2dd613bb902681d8649f97182f70a4178 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 14:56:19 +0200 Subject: [PATCH 097/230] docs(storage): Clarify proxy usage for Cloudflare R2 and explain storage wrapper limitations --- docs/storage.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/storage.md b/docs/storage.md 
index d09105989..fc544063b 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -153,6 +153,9 @@ R2 configuration is handled automatically by SeBS when deploying to Cloudflare W For Cloudflare container deployments, benchmark code does not talk to R2 directly. Instead, container wrappers call the Worker proxy endpoints (`/r2/upload`, `/r2/multipart-init`, `/r2/multipart-part`, `/r2/multipart-complete`). +**Why a proxy (and not a direct storage wrapper like other platforms)?** +On other platforms (AWS/GCP/Azure), the storage wrapper can be an SDK call because the function runtime and object store share a credential/SDK surface. Cloudflare R2 is different: the supported access path for Workers is the R2 **binding** (`env.R2_BUCKET`), which is a runtime object injected only inside the Worker runtime. A Cloudflare container runs in a separate runtime and has no access to that binding, so a container-side "storage wrapper" has nowhere to call. The only direct alternative is R2's S3-compatible HTTPS API, which would require provisioning R2 access keys and shipping them into each container — a second credential model that diverges from how the native Worker benchmarks talk to R2. Routing container storage calls through the parent Worker keeps a single code path and single credential model for both deployment types; the container-side `storage.js` wrapper still exists and still exposes the SeBS storage interface, it just implements those operations by forwarding to the Worker that holds the binding. + **Upload strategy:** - Small payloads use a single upload request. - Large payloads use multipart upload (10 MB threshold, 10 MB part size in current wrappers). 
From 95fdaba89cde32f4dddc12a35a431175c3676595 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 14:56:31 +0200 Subject: [PATCH 098/230] fix(storage): Enhance debugging by replacing console logs with debuglog for R2 operations --- .../cloudflare/nodejs/container/storage.js | 69 +++++++++++++------ 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js index e9f630187..a858db54f 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js @@ -1,7 +1,12 @@ const fs = require('fs'); const path = require('path'); const uuid = require('uuid'); +const debug = require('util').debuglog('sebs'); +// Cloudflare Workers enforce a 100 MB request body limit at the edge. +// Payloads above MULTIPART_THRESHOLD (10 MB) use multipart upload so that +// each individual request stays well below that limit. R2 requires parts +// of at least 5 MB. const MULTIPART_THRESHOLD = 10 * 1024 * 1024; const PART_SIZE = 10 * 1024 * 1024; @@ -11,8 +16,26 @@ function isRetryableSingleUploadError(error) { } /** - * Storage module for Cloudflare Node.js Containers - * Uses HTTP proxy to access R2 storage through the Worker's R2 binding + * Storage module for Cloudflare Node.js Containers. + * + * On Cloudflare, object storage (R2) is normally accessed through a Worker + * binding (`env.R2`). That binding only exists inside the Worker + * runtime, so a container cannot talk to R2 directly the way a Lambda or + * Cloud Function talks to S3/GCS with a regular SDK. Instead, the container + * forwards each storage operation over HTTP to the parent Worker (see + * worker.js), which holds the R2 binding and performs the actual + * get/put/list/multipart calls. 
+ * + * R2 does expose an S3-compatible HTTPS API that a container could call + * without a Worker proxy, but that path requires provisioning and injecting + * R2 access keys into the container and diverges from how the Worker-based + * benchmarks access R2. Routing through the Worker keeps a single code path + * and credential model for both deployment types. + * + * Because of this, the HTTP endpoint depends on the Worker's URL, which is + * not known ahead of time. The handler receives it via the X-Worker-URL + * header on the incoming request and installs it here through + * set_worker_url() before any storage call is made. */ class storage { @@ -127,8 +150,10 @@ class storage { throw error; } - console.warn( - `[storage] single upload failed for ${key}; retrying with multipart upload: ${error.message}` + debug( + '[storage] single upload failed for %s; retrying with multipart upload: %s', + key, + error.message ); return this._multipart_upload(key, buffer); } @@ -136,7 +161,7 @@ class storage { async upload_stream(bucket, key, data) { if (!this.r2_enabled) { - console.log('Warning: R2 not configured, skipping upload'); + debug('R2 not configured, skipping upload'); return key; } @@ -150,7 +175,7 @@ class storage { try { return await this._upload_bytes(unique_key, buffer); } catch (error) { - console.error('R2 upload error:', error); + debug('R2 upload error: %o', error); throw new Error(`Failed to upload to R2: ${error.message}`); } } @@ -182,7 +207,7 @@ class storage { const arrayBuffer = await response.arrayBuffer(); return Buffer.from(arrayBuffer); } catch (error) { - console.error('R2 download error:', error); + debug('R2 download error: %o', error); throw new Error(`Failed to download from R2: ${error.message}`); } } @@ -199,35 +224,38 @@ class storage { return [unique_key, uploadPromise]; } - console.error(`!!! 
[storage.upload] File not found: ${filepath}`); + debug('[storage.upload] File not found: %s', filepath); throw new Error(`upload(): file not found: ${filepath}`); } async _upload_stream_with_key(bucket, key, data) { - // Internal method that uploads with exact key (no unique naming) - console.log(`[storage._upload_stream_with_key] Starting upload: bucket=${bucket}, key=${key}, data_size=${data.length}`); - + debug( + '[storage._upload_stream_with_key] Starting upload: bucket=%s, key=%s, data_size=%d', + bucket, + key, + data.length + ); + if (!this.r2_enabled) { - console.log('Warning: R2 not configured, skipping upload'); + debug('R2 not configured, skipping upload'); return key; } if (!storage.worker_url) { - console.error('[storage._upload_stream_with_key] Worker URL not set!'); throw new Error('Worker URL not set - cannot access R2'); } - - console.log(`[storage._upload_stream_with_key] Worker URL: ${storage.worker_url}`); + + debug('[storage._upload_stream_with_key] Worker URL: %s', storage.worker_url); const buffer = this._toBuffer(data); - console.log(`[storage._upload_stream_with_key] Uploading key=${key}, buffer size: ${buffer.length}`); + debug('[storage._upload_stream_with_key] Uploading key=%s, buffer size: %d', key, buffer.length); try { const resultKey = await this._upload_bytes(key, buffer); - console.log(`[storage._upload_stream_with_key] Upload successful, returned key: ${resultKey}`); + debug('[storage._upload_stream_with_key] Upload successful, returned key: %s', resultKey); return resultKey; } catch (error) { - console.error('R2 upload error:', error); + debug('R2 upload error: %o', error); throw new Error(`Failed to upload to R2: ${error.message}`); } } @@ -246,9 +274,8 @@ class storage { } async download_directory(bucket, prefix, out_path) { - // List all objects with the prefix and download each one if (!this.r2_enabled) { - console.log('Warning: R2 not configured, skipping download_directory'); + debug('R2 not configured, skipping 
download_directory'); return; } @@ -281,7 +308,7 @@ class storage { await this.download(bucket, file_name, path.join(out_path, file_name)); } } catch (error) { - console.error('R2 download_directory error:', error); + debug('R2 download_directory error: %o', error); throw new Error(`Failed to download directory from R2: ${error.message}`); } } From 6b9434e4ec51391e19f1462dafbde9e04a7590bf Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 15:43:11 +0200 Subject: [PATCH 099/230] docs(cloudflare): Add deployment architecture and detailed flow for script-based and container-based Workers --- docs/platforms.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/docs/platforms.md b/docs/platforms.md index 2b7ac8948..8e4b50f33 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -228,6 +228,51 @@ Cloudflare Workers support multiple languages through different deployment metho SeBS uses a containerized CLI approach for Cloudflare deployments, eliminating the need to install Node.js, npm, wrangler, pywrangler, or uv on your host system. The CLI container (`sebs/manage.cloudflare`) is automatically built on first use and contains all necessary tools. This ensures consistent behavior across platforms and simplifies setup—only Docker is required. +### Deployment Architecture + +SeBS supports two deployment paths for Cloudflare: **script-based Workers** (native Workers runtime) and **container-based Workers** (Cloudflare's managed container runtime, fronted by a Durable-Object-backed Worker). Both paths share the same credentials, R2/KV resources, and HTTP trigger; they differ only in how code is packaged and which Cloudflare runtime executes it. The deployment type is controlled by the benchmark's `container_deployment` flag. + +#### Python modules (`sebs/cloudflare/`) + +| File | Responsibility | +|------|----------------| +| `cloudflare.py` | `Cloudflare(System)` facade. 
Verifies credentials, enforces `SUPPORTED_BENCHMARKS`, resolves the `workers.dev` URL, and dispatches `package_code`/`create_function`/`update_function` to the correct handler via `_get_deployment_handler(container_deployment)`. | +| `workers.py` | `CloudflareWorkersDeployment` — native script packaging. Node.js is bundled with esbuild via `nodejs/Dockerfile.build`; Python generates a `pyproject.toml` and is validated via `python/Dockerfile.build` (Pyodide resolution happens server-side at deploy time). | +| `containers.py` | `CloudflareContainersDeployment` — container packaging. Copies the per-language `Dockerfile.function` into the code directory, injects the `worker.js` orchestrator (Node-only, required by `@cloudflare/containers`), merges `package.json`, runs `npm install`, and builds a local image as a cache anchor. | +| `cli.py` | `CloudflareCLI` — runs the `manage.cloudflare` Docker container with the Docker socket mounted and exposes `wrangler_deploy`, `pywrangler_deploy`, `npm_install`, `docker_build`, `upload_package`. Used by both deployment handlers; `cloudflare.py` never calls `wrangler` directly. | +| `config.py` | `CloudflareCredentials` / `CloudflareConfig` — API token, account ID, R2 keys. | +| `resources.py` | `CloudflareSystemResources` — factories for R2 and KV/Durable Objects. | +| `function.py` | `CloudflareWorker(Function)` — cached function metadata. | +| `triggers.py` | `HTTPTrigger` — invokes the deployed Worker at `https://{name}.{account}.workers.dev`. | +| `r2.py`, `kvstore.py` | Object and NoSQL storage clients. | + +Wrangler templates live at the repo root under `templates/wrangler-worker.toml` and `templates/wrangler-container.toml`. + +#### Dockerfiles (`dockerfiles/cloudflare/`) + +| File | Purpose | +|------|---------| +| `Dockerfile.manage` | Builds the `manage.cloudflare` CLI image (Node + global `wrangler` + `pywrangler` via `uv` + Docker CLI). Driven by `cli.py`. 
| +| `nodejs/Dockerfile.build` | Ephemeral build image for **script-based** Node.js workers. Produces the bundled `dist/` that `workers.py` extracts back to the host package. | +| `python/Dockerfile.build` | Ephemeral validation image for **script-based** Python workers — confirms `pywrangler` accepts the generated `pyproject.toml`. | +| `nodejs/Dockerfile.function` | Runtime image for **container-based** Node.js functions. Parameterized via `ARG BASE_IMAGE` from `config/systems.json`. Copied into the package by `containers.py` and rebuilt by `wrangler deploy`. | +| `python/Dockerfile.function` | Runtime image for **container-based** Python functions. Same parameterization. | + +#### Script-based flow (`container_deployment=false`) + +1. `benchmark.build()` → `Cloudflare.package_code` → `CloudflareWorkersDeployment.package_code` (builds via `Dockerfile.build`). +2. `Cloudflare.create_function` → `_create_or_update_worker` renders `templates/wrangler-worker.toml` into the package. +3. `CloudflareCLI.wrangler_deploy` (Node) or `pywrangler_deploy` (Python) deploys via the `manage.cloudflare` container. +4. `HTTPTrigger` is attached using the `workers.dev` URL. + +#### Container-based flow (`container_deployment=true`) + +1. `benchmark.build()` calls `container_client.build_base_image()` on the `_CloudflareContainerAdapter` in `cloudflare.py`, which delegates to `CloudflareContainersDeployment.package_code`. It copies `{language}/Dockerfile.function` as `Dockerfile` (patching `BASE_IMAGE`), adds `worker.js`, merges `package.json`, runs `npm install` in the CLI container, and builds a local Docker image. +2. `Cloudflare.create_function` → `_create_or_update_worker` renders `templates/wrangler-container.toml`. +3. `CloudflareCLI.wrangler_deploy` invokes wrangler, which rebuilds the image from `Dockerfile` and pushes it to Cloudflare's managed registry, creating a Durable-Object-backed container worker. +4. 
`wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the DO warm before the first measured invocation. +5. `HTTPTrigger` is attached using the `workers.dev` URL. + ### Trigger Support - **HTTP Trigger**: ✅ Fully supported - Workers are automatically accessible at `https://{name}.{account}.workers.dev` From 8966909f13389b9d5e097934a5f6d12e01e2e1d4 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 15:45:19 +0200 Subject: [PATCH 100/230] refactor(handler): Move all imports to the top. --- .../wrappers/cloudflare/nodejs/container/handler.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index 820037e0d..948722f3c 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -2,17 +2,18 @@ // This handler is used when deploying as a container worker const http = require('http'); +const crypto = require('crypto'); +const Module = require('module'); const debug = require('util').debuglog('sebs'); // Monkey-patch the 'request' library to always include a User-Agent header // This is needed because Wikimedia (and other sites) require a User-Agent try { - const Module = require('module'); const originalRequire = Module.prototype.require; - + Module.prototype.require = function(id) { const module = originalRequire.apply(this, arguments); - + if (id === 'request') { // Wrap the request function to inject default headers const originalRequest = module; @@ -34,7 +35,7 @@ try { }); return wrappedRequest; } - + return module; }; } catch (e) { @@ -62,7 +63,6 @@ const PORT = process.env.PORT || 8080; const server = http.createServer(async (req, res) => { try { // Get unique request ID from Cloudflare (CF-Ray header) - const crypto = require('crypto'); const reqId = 
req.headers['cf-ray'] || crypto.randomUUID(); // Extract Worker URL from header for R2 and NoSQL proxy. From 3665df803a6ccad2ffedb62df2b491f1183cbfdc Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 20 Apr 2026 16:10:27 +0200 Subject: [PATCH 101/230] refactor(storage): Remove content type inference from upload functions and update documentation --- .../cloudflare/nodejs/container/worker.js | 46 ++----------------- .../cloudflare/python/container/storage.py | 29 +++++------- docs/storage.md | 6 --- 3 files changed, 14 insertions(+), 67 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index 97d937fe6..80b9be82a 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -110,40 +110,6 @@ export default { } }; -const MIME_TYPES = { - '.bin': 'application/octet-stream', - '.csv': 'text/csv', - '.gif': 'image/gif', - '.htm': 'text/html', - '.html': 'text/html', - '.jpeg': 'image/jpeg', - '.jpg': 'image/jpeg', - '.json': 'application/json', - '.mov': 'video/quicktime', - '.mp3': 'audio/mpeg', - '.mp4': 'video/mp4', - '.pdf': 'application/pdf', - '.png': 'image/png', - '.svg': 'image/svg+xml', - '.txt': 'text/plain', - '.wav': 'audio/wav', - '.webm': 'video/webm', - '.xml': 'application/xml', - '.zip': 'application/zip', -}; - -function inferContentTypeFromKey(key) { - if (!key) { - return 'application/octet-stream'; - } - const dot = key.lastIndexOf('.'); - if (dot < 0) { - return 'application/octet-stream'; - } - const extension = key.slice(dot).toLowerCase(); - return MIME_TYPES[extension] || 'application/octet-stream'; -} - /** * Handle NoSQL (KV namespace) requests proxied from the container * Routes: @@ -357,11 +323,8 @@ async function handleR2Request(request, env) { // Multipart upload routes only need 'key' (bucket is implicit in the R2 binding) if (url.pathname === '/r2/multipart-init') { // 
Initiate a multipart upload; returns { key, uploadId } - const contentType = url.searchParams.get('contentType') || inferContentTypeFromKey(key); - console.log(`[worker.js /r2/multipart-init] key=${key}, contentType=${contentType}`); - const multipart = await env.R2.createMultipartUpload(key, { - httpMetadata: { contentType } - }); + console.log(`[worker.js /r2/multipart-init] key=${key}`); + const multipart = await env.R2.createMultipartUpload(key); console.log(`[worker.js /r2/multipart-init] uploadId=${multipart.uploadId}`); return new Response(JSON.stringify({ key: multipart.key, @@ -433,10 +396,7 @@ async function handleR2Request(request, env) { // Use the key as-is (container already generates unique keys if needed) try { - const contentType = request.headers.get('Content-Type') || inferContentTypeFromKey(key); - const putResult = await env.R2.put(key, request.body, { - httpMetadata: { contentType } - }); + const putResult = await env.R2.put(key, request.body); const size = putResult ? putResult.size : '(unknown)'; console.log(`[worker.js /r2/upload] R2.put() succeeded, size=${size}`); console.log(`[worker.js /r2/upload] Successfully uploaded to R2 with key=${key}`); diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index bb8066d16..d386d4cc9 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -3,17 +3,11 @@ Uses HTTP proxy to access R2 storage through the Worker's R2 binding """ import io -import mimetypes import os import json import urllib.request import urllib.parse -def _guess_content_type(name: str) -> str: - """Infer MIME type from a file name, falling back to application/octet-stream.""" - ct, _ = mimetypes.guess_type(name) - return ct or 'application/octet-stream' - # Cloudflare Workers enforce a 100 MB request body limit at the edge. 
# Use multipart upload for payloads larger than this threshold so that # each individual request stays well below that limit. @@ -67,7 +61,7 @@ def _post_json(self, url: str, body: bytes = b'', content_type: str = 'applicati with urllib.request.urlopen(req) as resp: return json.loads(resp.read().decode('utf-8')) - def _upload_bytes(self, key: str, data: bytes, content_type: str = 'application/octet-stream') -> str: + def _upload_bytes(self, key: str, data: bytes) -> str: """Upload *data* to the exact R2 *key* via the worker proxy. Uses a single PUT for small payloads and R2 multipart upload for @@ -77,19 +71,19 @@ def _upload_bytes(self, key: str, data: bytes, content_type: str = 'application/ Returns the R2 key. """ if len(data) <= _MULTIPART_THRESHOLD: - return self._single_upload(key, data, content_type) - return self._multipart_upload(key, data, content_type) + return self._single_upload(key, data) + return self._multipart_upload(key, data) - def _single_upload(self, key: str, data: bytes, content_type: str = 'application/octet-stream') -> str: + def _single_upload(self, key: str, data: bytes) -> str: params = urllib.parse.urlencode({'key': key}) url = f"{storage.worker_url}/r2/upload?{params}" - result = self._post_json(url, data, content_type) + result = self._post_json(url, data) return result['key'] - def _multipart_upload(self, key: str, data: bytes, content_type: str = 'application/octet-stream') -> str: + def _multipart_upload(self, key: str, data: bytes) -> str: """Split *data* into ≤_PART_SIZE chunks and use R2 multipart upload.""" # 1. 
Initiate - params = urllib.parse.urlencode({'key': key, 'contentType': content_type}) + params = urllib.parse.urlencode({'key': key}) init_url = f"{storage.worker_url}/r2/multipart-init?{params}" init = self._post_json(init_url) upload_id = init['uploadId'] @@ -146,11 +140,11 @@ def upload_stream(self, bucket: str, key: str, data): unique_key = self.unique_name(key) try: - return self._upload_bytes(unique_key, data, _guess_content_type(unique_key)) + return self._upload_bytes(unique_key, data) except Exception as e: print(f"R2 upload error: {e}") raise RuntimeError(f"Failed to upload to R2: {e}") - + def download_stream(self, bucket: str, key: str) -> bytes: """Download data from R2 via worker proxy""" if not self.r2_enabled: @@ -179,11 +173,10 @@ def upload(self, bucket, key, filepath): """Upload file from disk with unique key generation""" # Generate unique key to avoid conflicts unique_key = self.unique_name(key) - content_type = _guess_content_type(filepath) with open(filepath, 'rb') as f: data = f.read() try: - self._upload_bytes(unique_key, data, content_type) + self._upload_bytes(unique_key, data) except Exception as e: raise RuntimeError(f"Failed to upload to R2: {e}") return unique_key @@ -206,7 +199,7 @@ def _upload_with_key(self, bucket: str, key: str, data): data = data.encode('utf-8') try: - result_key = self._upload_bytes(key, data, _guess_content_type(key)) + result_key = self._upload_bytes(key, data) print(f"[storage._upload_with_key] Upload successful, key={result_key}") except Exception as e: print(f"R2 upload error: {e}") diff --git a/docs/storage.md b/docs/storage.md index fc544063b..c1a4f7c92 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -165,12 +165,6 @@ On other platforms (AWS/GCP/Azure), the storage wrapper can be an SDK call becau - Container wrappers generate unique output keys (suffix based on UUID fragment) before upload. 
- This avoids collisions and keeps run-specific output objects distinct in regression and repeated invocations. -**Content-Type behavior:** -- Stored object metadata should reflect the real file type (for example `image/jpeg`, `image/png`) when inferable. -- In the Node.js container path, the Worker proxy infers content type from the object key extension when the caller omits it. -- In the Python container path, the wrapper infers content type and passes it to the Worker proxy. -- Multipart part transport may still use `application/octet-stream`; this is expected for chunk transport and does not imply final object metadata must be octet-stream. - ### KVStore for NoSQL Cloudflare KV namespaces are used for NoSQL operations required by benchmarks such as CRUD API (130.crud-api). From 2dafdf977a18cb1d2cc84bab78cbc35bbf0ad518 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 11:27:33 +0200 Subject: [PATCH 102/230] docs(build): Enhance documentation for build process and clarify Workers compatibility --- .../wrappers/cloudflare/nodejs/build.js | 103 ++++++++++++++---- 1 file changed, 81 insertions(+), 22 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index fd6cc5e08..c130d781a 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -1,14 +1,42 @@ +/** + * build.js — Convert a Node.js benchmark into a Cloudflare Workers-compatible bundle. + * + * Why this exists: + * Cloudflare Workers do not ship a Node.js runtime or a filesystem at deploy + * time: there is no `node_modules` directory, no `require()` resolution, and + * only a curated subset of Node built-ins is available (and only when opted + * in via the `node:` prefix and the `nodejs_compat` compatibility flag). + * Our SeBS benchmarks, however, are authored as regular Node.js code. 
This + * script bridges that gap by bundling the benchmark + its dependencies into + * a single ESM module that the Workers runtime can load. + * + * High-level pipeline: + * 1. Discover source files under the wrapper directory (skipping tests, + * node_modules, dotfiles, and the previous build output). + * 2. Run esbuild on every JS/TS entry point with a Workers-friendly config + * (ESM output, neutral platform, ES2020 target, tree-shaking). + * 3. Apply the `nodeBuiltinsPlugin` to rewrite imports so that: + * - Node built-ins always use the `node:` prefix required by Workers. + * - `cloudflare:*` imports stay external (resolved by the runtime). + * - The legacy `request` npm module is swapped for a fetch-based + * polyfill, since it cannot run under Workers. + * 4. Copy any non-code assets (templates, SQL, etc.) into `dist/` unchanged. + */ + const { build } = require('esbuild'); const fs = require('fs'); const { join, extname, dirname, relative } = require('path'); +// Recursively collect every file that should be part of the Workers bundle. +// Excludes test directories, node_modules, build artifacts, and this script +// itself so that only benchmark sources and the wrapper code get processed. 
function getAllFiles(dir, fileList = []) { const files = fs.readdirSync(dir, { withFileTypes: true }); for (const file of files) { const filePath = join(dir, file.name); if (file.isDirectory()) { - if (file.name !== 'node_modules' && - file.name !== 'test' && + if (file.name !== 'node_modules' && + file.name !== 'test' && file.name !== 'tests' && file.name !== '__tests__' && file.name !== 'dist' && @@ -16,7 +44,7 @@ function getAllFiles(dir, fileList = []) { getAllFiles(filePath, fileList); } } else { - if (!file.name.includes('.test.') && + if (!file.name.includes('.test.') && !file.name.includes('.spec.') && file.name !== 'build.js' && file.name !== 'wrangler.toml') { @@ -35,26 +63,36 @@ function copyFile(src, dest) { fs.copyFileSync(src, dest); } +// esbuild plugin that rewrites module imports so the output works on the +// Cloudflare Workers runtime. Workers only accept Node built-ins via the +// `node:` prefix (with the `nodejs_compat` flag enabled on the Worker), do +// not support arbitrary npm packages that rely on Node's networking stack, +// and resolve their own `cloudflare:*` imports at runtime. const nodeBuiltinsPlugin = { name: 'node-builtins-external', setup(build) { const { resolve } = require('path'); - - // Keep node: prefixed modules external + + // Imports already using the `node:` or `cloudflare:` prefix are provided + // by the Workers runtime itself — leave them external so esbuild does not + // try to bundle them (which would fail, since they are not on disk). build.onResolve({ filter: /^(node:|cloudflare:)/ }, (args) => { return { path: args.path, external: true }; }); - - // Map bare node built-in names to node: versions and keep external + + // Benchmarks commonly `require('fs')`, `require('path')`, etc. Workers + // reject those bare specifiers; rewrite them to the `node:`-prefixed + // form and mark them external so the runtime resolves them. 
build.onResolve({ filter: /^(fs|querystring|path|crypto|stream|buffer|util|events|http|https|net|tls|zlib|os|child_process|tty|assert|url)$/ }, (args) => { return { path: 'node:' + args.path, external: true }; }); - - // Polyfill 'request' module with fetch-based implementation + + // The `request` npm module depends on Node's http/https clients and is + // incompatible with Workers. Redirect every `require('request')` to our + // fetch-based shim so benchmark code can keep the same call sites. build.onResolve({ filter: /^request$/ }, (args) => { - // Get the directory where build.js is located (wrapper directory) const wrapperDir = __dirname; - return { + return { path: resolve(wrapperDir, 'request-polyfill.js') }; }); @@ -65,26 +103,45 @@ const nodeBuiltinsPlugin = { async function customBuild() { const srcDir = './'; const outDir = './dist'; - + + // Start from a clean output directory so stale artifacts from a previous + // build cannot leak into the Worker upload. if (fs.existsSync(outDir)) { fs.rmSync(outDir, { recursive: true }); } fs.mkdirSync(outDir, { recursive: true }); - + try { const files = getAllFiles(srcDir); - - const jsFiles = files.filter(f => + + // Split discovered files: code goes through esbuild, everything else + // (JSON fixtures, templates, SQL, binary assets, ...) is copied verbatim. + const jsFiles = files.filter(f => ['.js', '.ts', '.jsx', '.tsx'].includes(extname(f)) ); - - const otherFiles = files.filter(f => + + const otherFiles = files.filter(f => !['.js', '.ts', '.jsx', '.tsx'].includes(extname(f)) ); - + console.log('Building JS files:', jsFiles); - + if (jsFiles.length > 0) { + // esbuild options chosen for Workers compatibility: + // - format: 'esm' Workers modules must be ES modules. + // - platform: 'neutral' Avoid Node- or browser-specific resolution; + // the plugin above handles Node built-ins + // explicitly. + // - target: 'es2020' Matches the V8 version used by Workers. 
+ // - bundle + treeShaking Flattens dependencies into one module and + // drops dead code to stay under Workers' + // script size limit. + // - define.__dirname Node's `__dirname` does not exist in + // Workers; stub it with a harmless constant + // so benchmark code that references it still + // compiles. + // - define.global Workers expose `globalThis` rather than + // `global`; alias the two for compatibility. await build({ entryPoints: jsFiles, bundle: true, @@ -105,15 +162,17 @@ async function customBuild() { treeShaking: true, }); } - - // Copy non-JS files (templates, etc.) + + // Non-code assets (e.g. HTML/CSS templates, JSON payloads) need to ship + // alongside the bundle at their original relative paths so the worker + // can read them via the runtime's asset APIs. for (const file of otherFiles) { const relativePath = relative(srcDir, file); const destPath = join(outDir, relativePath); copyFile(file, destPath); console.log(`Copied: ${relativePath}`); } - + console.log('✓ Build completed successfully'); } catch (error) { console.error('Build failed:', error); From eb21ce54981c02b52f6eabaf2efd6c488f6f94b1 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:00:36 +0200 Subject: [PATCH 103/230] feat(handler): Implement advanceWorkersClock function to manage timing in Cloudflare Workers --- .../wrappers/cloudflare/nodejs/handler.js | 323 ++++++++---------- 1 file changed, 150 insertions(+), 173 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index df0cee97b..bcf7876be 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -1,5 +1,21 @@ import { DurableObject } from "cloudflare:workers"; +// Cloudflare Workers freezes Date.now() / performance.now() between I/O +// operations as a timing-sidechannel mitigation, so wall-clock time does +// not advance inside pure-compute sections. 
To record a meaningful +// compute_time, we issue a throwaway self-fetch that triggers I/O and +// unfreezes the clock before we sample it. +// Docs: https://developers.cloudflare.com/workers/reference/security-model/#step-1-disallow-timers-and-multi-threading +async function advanceWorkersClock(request) { + try { + const url = new URL(request.url); + url.pathname = '/favicon'; + await fetch(url.toString(), { method: 'HEAD' }); + } catch (e) { + // Ignore — we only care about the side effect of performing I/O. + } +} + // Durable Object class for KV API compatibility export class KVApiObject extends DurableObject { constructor(state, env) { @@ -40,213 +56,174 @@ export default { return new Response('None'); } - // Get unique request ID from Cloudflare (CF-Ray header) - const req_id = request.headers.get('CF-Ray') || crypto.randomUUID(); - - // Start timing measurements - const start = performance.now(); - const begin = Date.now() / 1000; + // Get unique request ID from Cloudflare (CF-Ray header) + const req_id = request.headers.get('CF-Ray') || crypto.randomUUID(); + // Start timing measurements + const start = performance.now(); + const begin = Date.now() / 1000; - // Parse JSON body first (similar to Azure handler which uses req.body) - const req_text = await request.text(); - let event = {}; - if (req_text && req_text.length > 0) { - try { - event = JSON.parse(req_text); - } catch (e) { - // If body isn't JSON, keep event empty - event = {}; - } - } - // Parse query string into event (URL parameters override/merge with body) - // This makes it compatible with both input formats - const urlParts = request.url.split('?'); - if (urlParts.length > 1) { - const query = urlParts[1]; - const pairs = query.split('&'); - for (const p of pairs) { - const [k, v] = p.split('='); + // Parse JSON body first (similar to Azure handler which uses req.body) + const req_text = await request.text(); + let event = {}; + if (req_text && req_text.length > 0) { try { - if (v === 
undefined) { - event[k] = null; - } else if (!Number.isNaN(Number(v)) && Number.isFinite(Number(v))) { - // mirror Python attempt to convert to int - const n = Number(v); - event[k] = Number.isInteger(n) ? parseInt(v, 10) : n; - } else { - event[k] = decodeURIComponent(v); - } + event = JSON.parse(req_text); } catch (e) { - event[k] = v; + // If body isn't JSON, keep event empty + event = {}; } } - } - - // Set timestamps - const income_timestamp = Math.floor(Date.now() / 1000); - event['request-id'] = req_id; - event['income-timestamp'] = income_timestamp; - // Load the benchmark function module and initialize storage if available - // With nodejs_compat enabled, we can use require() for CommonJS modules - let funcModule; - try { - // Fallback to dynamic import for ES modules - funcModule = await import('./function.js'); - } catch (e2) { - throw new Error('Failed to import benchmark function module: ' + e2.message); - } - - // Initialize storage - try function module first, then fall back to wrapper storage - try { - if (funcModule && funcModule.storage && typeof funcModule.storage.init_instance === 'function') { - funcModule.storage.init_instance({ env, request }); - } else { - // Function doesn't export storage, so initialize wrapper storage directly - try { - const storageModule = await import('./storage.js'); - if (storageModule && storageModule.storage && typeof storageModule.storage.init_instance === 'function') { - storageModule.storage.init_instance({ env, request }); + // Parse query string into event (URL parameters override/merge with body) + // This makes it compatible with both input formats + const urlParts = request.url.split('?'); + if (urlParts.length > 1) { + const query = urlParts[1]; + const pairs = query.split('&'); + for (const p of pairs) { + const [k, v] = p.split('='); + try { + if (v === undefined) { + event[k] = null; + } else if (!Number.isNaN(Number(v)) && Number.isFinite(Number(v))) { + // mirror Python attempt to convert to int + 
const n = Number(v); + event[k] = Number.isInteger(n) ? parseInt(v, 10) : n; + } else { + event[k] = decodeURIComponent(v); + } + } catch (e) { + event[k] = v; } - } catch (storageErr) { - // Ignore errors from storage initialization } } - } catch (e) { - // don't fail the request if storage init isn't available - } - // Initialize nosql if environment variable is set - if (env.NOSQL_STORAGE_DATABASE) { + // Set timestamps + const income_timestamp = Math.floor(Date.now() / 1000); + event['request-id'] = req_id; + event['income-timestamp'] = income_timestamp; + + // Load the benchmark function module and initialize storage if available + // With nodejs_compat enabled, we can use require() for CommonJS modules + let funcModule; + try { + // Fallback to dynamic import for ES modules + funcModule = await import('./function.js'); + } catch (e2) { + throw new Error('Failed to import benchmark function module: ' + e2.message); + } + + try { - const nosqlModule = await import('./nosql.js'); - if (nosqlModule && nosqlModule.nosql && typeof nosqlModule.nosql.init_instance === 'function') { - nosqlModule.nosql.init_instance({ env, request }); + const storageModule = await import('./storage.js'); + if (storageModule && storageModule.storage && typeof storageModule.storage.init_instance === 'function') { + storageModule.storage.init_instance({ env, request }); + } else { + console.warn('storage module imported but storage.init_instance is missing; skipping storage setup'); } } catch (e) { - // nosql module might not exist for all benchmarks - console.log('Could not initialize nosql:', e.message); + // storage module may not be bundled for benchmarks that don't need it } - } - // Execute the benchmark handler - let ret; - try { - // Wrap the handler execution to handle sync-style async code - // The benchmark code calls async nosql methods but doesn't await them - // We need to serialize the execution - if (funcModule && typeof funcModule.handler === 'function') { - // Create a 
promise-aware execution context - const handler = funcModule.handler; - - // Execute handler - it will return { result: [Promise, Promise, ...] } - ret = await Promise.resolve(handler(event)); - - // Deeply resolve all promises in the result - if (ret && ret.result && Array.isArray(ret.result)) { - ret.result = await Promise.all(ret.result.map(async item => await Promise.resolve(item))); - } - } else if (funcModule && funcModule.default && typeof funcModule.default.handler === 'function') { - const handler = funcModule.default.handler; - ret = await Promise.resolve(handler(event)); - - if (ret && ret.result && Array.isArray(ret.result)) { - ret.result = await Promise.all(ret.result.map(async item => await Promise.resolve(item))); + if (env.NOSQL_STORAGE_DATABASE) { + try { + const nosqlModule = await import('./nosql.js'); + if (nosqlModule && nosqlModule.nosql && typeof nosqlModule.nosql.init_instance === 'function') { + nosqlModule.nosql.init_instance({ env, request }); + } else { + console.warn('nosql module imported but nosql.init_instance is missing; skipping nosql setup'); + } + } catch (e) { + // nosql module might not exist for all benchmarks + console.log('Could not initialize nosql:', e.message); } - } else { - throw new Error('benchmark handler function not found'); } - } catch (err) { - // Trigger a fetch request to update the timer before measuring - // Time measurements only update after a fetch request or R2 operation + + // Execute the benchmark handler. Benchmarks expose `handler` either as a + // named export (`exports.handler` / `export const handler`) or nested + // under a default export (`export default { handler }`). 
+ let ret; try { - // Fetch the worker's own URL with favicon to minimize overhead - const finalUrl = new URL(request.url); - finalUrl.pathname = '/favicon'; - await fetch(finalUrl.toString(), { method: 'HEAD' }); - } catch (e) { - // Ignore fetch errors + const handler = + (funcModule && typeof funcModule.handler === 'function' && funcModule.handler) || + (funcModule && funcModule.default && typeof funcModule.default.handler === 'function' && funcModule.default.handler); + if (!handler) { + throw new Error('benchmark handler function not found'); + } + ret = await handler(event); + } catch (err) { + await advanceWorkersClock(request); + // Calculate timing even for errors + const end = Date.now() / 1000; + const elapsed = performance.now() - start; + const micro = elapsed * 1000; // Convert milliseconds to microseconds + + // Mirror Python behavior: return structured error payload + const errorPayload = JSON.stringify({ + begin: begin, + end: end, + compute_time: micro, + results_time: 0, + result: { output: null }, + is_cold: false, + is_cold_worker: false, + container_id: '0', + environ_container_id: 'no_id', + request_id: '0', + error: String(err && err.message ? err.message : err), + stack: err && err.stack ? err.stack : undefined, + event: event, + env: env, + }); + return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); } - // Calculate timing even for errors + + await advanceWorkersClock(request); + + // Now read the updated timer const end = Date.now() / 1000; const elapsed = performance.now() - start; const micro = elapsed * 1000; // Convert milliseconds to microseconds - - // Mirror Python behavior: return structured error payload - const errorPayload = JSON.stringify({ + + // Build log_data similar to Python handler + const log_data = { output: ret && ret.result !== undefined ? 
ret.result : ret }; + if (ret && ret.measurement !== undefined) { + log_data.measurement = ret.measurement; + } else { + log_data.measurement = {}; + } + + // Add memory usage to measurement + const memUsage = process.memoryUsage(); + const memory_mb = memUsage.heapUsed / 1024 / 1024; + log_data.measurement.memory_used_mb = memory_mb; + + if (event.logs !== undefined) { + log_data.time = 0; + } + + if (event.html) { + return new Response(String(ret && ret.result !== undefined ? ret.result : ''), { + headers: { 'Content-Type': 'text/html; charset=utf-8' }, + }); + } + + const responseBody = JSON.stringify({ begin: begin, end: end, compute_time: micro, results_time: 0, - result: { output: null }, + result: log_data, is_cold: false, is_cold_worker: false, container_id: '0', environ_container_id: 'no_id', - request_id: '0', - error: String(err && err.message ? err.message : err), - stack: err && err.stack ? err.stack : undefined, - event: event, - env: env, + request_id: req_id, }); - return new Response(errorPayload, { status: 500, headers: { 'Content-Type': 'application/json' } }); - } - - // Trigger a fetch request to update the timer before measuring - // Time measurements only update after a fetch request or R2 operation - try { - // Fetch the worker's own URL with favicon to minimize overhead - const finalUrl = new URL(request.url); - finalUrl.pathname = '/favicon'; - await fetch(finalUrl.toString(), { method: 'HEAD' }); - } catch (e) { - // Ignore fetch errors - } - - // Now read the updated timer - const end = Date.now() / 1000; - const elapsed = performance.now() - start; - const micro = elapsed * 1000; // Convert milliseconds to microseconds - - // Build log_data similar to Python handler - const log_data = { output: ret && ret.result !== undefined ? 
ret.result : ret }; - if (ret && ret.measurement !== undefined) { - log_data.measurement = ret.measurement; - } else { - log_data.measurement = {}; - } - - // Add memory usage to measurement - const memUsage = process.memoryUsage(); - const memory_mb = memUsage.heapUsed / 1024 / 1024; - log_data.measurement.memory_used_mb = memory_mb; - - if (event.logs !== undefined) { - log_data.time = 0; - } - - if (event.html) { - return new Response(String(ret && ret.result !== undefined ? ret.result : ''), { - headers: { 'Content-Type': 'text/html; charset=utf-8' }, - }); - } - - const responseBody = JSON.stringify({ - begin: begin, - end: end, - compute_time: micro, - results_time: 0, - result: log_data, - is_cold: false, - is_cold_worker: false, - container_id: '0', - environ_container_id: 'no_id', - request_id: req_id, - }); - return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); + return new Response(responseBody, { headers: { 'Content-Type': 'application/json' } }); } catch (topLevelError) { // Catch any uncaught errors (module loading, syntax errors, etc.) 
// Try to include timing if available From 0be87068a6b865e21063b8448381d144e4ebbc50 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:05:13 +0200 Subject: [PATCH 104/230] docs(nosql): Add clarification on resource access in Cloudflare Workers --- benchmarks/wrappers/cloudflare/nodejs/nosql.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/wrappers/cloudflare/nodejs/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/nosql.js index b12dfa8b1..4fe3c80b5 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/nosql.js +++ b/benchmarks/wrappers/cloudflare/nodejs/nosql.js @@ -31,6 +31,12 @@ class nosql { throw new Error(`nosql env not initialized for table ${tableName}`); } + // Unlike AWS/Azure/GCP where you instantiate a client SDK and address + // resources by name, Cloudflare Workers expose every bound resource + // (KV namespace, R2 bucket, D1 database, queue, etc.) as a property on + // the `env` object passed into the fetch handler. The property name is + // the binding name declared in wrangler.toml, so looking up a KV + // namespace by its table name is simply `env[tableName]`. 
const table = env[tableName]; if (!table || typeof table.get !== 'function' || typeof table.put !== 'function') { const envKeys = Object.keys(env || {}); From 5908cefec52f65f6e5c648c2ad6d1e477f837955 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:07:52 +0200 Subject: [PATCH 105/230] docs(nosql): Update module description to clarify HTTP POST operations and server-side implementation --- benchmarks/wrappers/cloudflare/python/container/nosql.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/container/nosql.py b/benchmarks/wrappers/cloudflare/python/container/nosql.py index 936a49901..5a414d21a 100644 --- a/benchmarks/wrappers/cloudflare/python/container/nosql.py +++ b/benchmarks/wrappers/cloudflare/python/container/nosql.py @@ -1,6 +1,11 @@ """ -NoSQL module for Cloudflare Python Containers -Uses HTTP proxy to access Durable Objects through the Worker's binding +NoSQL module for Cloudflare Python Containers. + +Issues HTTP POSTs to {worker_url}/nosql/. The server side is +implemented in benchmarks/wrappers/cloudflare/nodejs/container/worker.js +(handleNoSQLRequest), which is copied into every container project at deploy +time by sebs/cloudflare/containers.py because @cloudflare/containers is +Node.js-only and wraps Python containers as well. 
""" import json import urllib.request From 2040294c658d52e97231f34ff9ebb225ee28f45b Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:11:14 +0200 Subject: [PATCH 106/230] refactor(storage): Simplify list URL request by removing unnecessary User-Agent header --- benchmarks/wrappers/cloudflare/python/container/storage.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index d386d4cc9..d639754e8 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -230,10 +230,7 @@ def download_directory(self, bucket, prefix, local_path): list_url = f"{storage.worker_url}/r2/list?{params}" try: - req = urllib.request.Request(list_url) - req.add_header('User-Agent', 'SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2') - - with urllib.request.urlopen(req) as response: + with urllib.request.urlopen(list_url) as response: result = json.loads(response.read().decode('utf-8')) objects = result.get('objects', []) From 6d8a1c61656ed847323984d564712178b439c316 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:18:26 +0200 Subject: [PATCH 107/230] refactor(handler): Remove unused functions and clean up code in handler.py --- .../wrappers/cloudflare/python/handler.py | 50 ------------------- 1 file changed, 50 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 6674e34d5..a29ea2a2e 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -63,9 +63,6 @@ async def fetch2(self, request, env): except IndexError: event[param[0]] = None - - - ## note: time fixed in worker income_timestamp = datetime.datetime.now().timestamp() @@ -145,50 +142,3 @@ async def fetch2(self, request, 
env): 'environ_container_id': "no_id", 'request_id': req_id })) - - -### ---------- old ------- - -def import_from_path(module_name, file_path): - spec = importlib.util.spec_from_file_location(module_name, file_path) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - return module - - -working_dir = os.path.dirname(__file__) - -class MakeAsync(ast.NodeTransformer): - def visit_FunctionDef(self, node): - if node.name != "handler": - return node - return ast.AsyncFunctionDef( - name=node.name, - args=node.args, - body=node.body, - decorator_list=node.decorator_list, - returns=node.returns, - type_params=node.type_params) - -class AddAwait(ast.NodeTransformer): - to_find = ["upload_stream", "download_stream", "upload", "download", "download_directory"] - - def visit_Call(self, node): - if isinstance(node.func, ast.Attribute) and node.func.attr in self.to_find: - #print(ast.dump(node.func, indent=2)) - return ast.Await(value=node) - - return node - -def make_benchmark_func(): - with open(working_dir +"/function/function.py") as f: - module = ast.parse(f.read()) - module = ast.fix_missing_locations(MakeAsync().visit(module)) - module = ast.fix_missing_locations(AddAwait().visit(module)) - new_source = ast.unparse(module) - ##print("new_source:") - ##print(new_source) - ##print() - with open("/tmp/function.py", "w") as wf: - wf.write(new_source) From 6a5ef6b419be20eb7066f88195fefc7831bebc8c Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:20:20 +0200 Subject: [PATCH 108/230] docs(worker): Update comments to clarify the purpose and usage of worker.js --- benchmarks/wrappers/cloudflare/nodejs/container/worker.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index 80b9be82a..a46146d22 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ 
b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -1,3 +1,9 @@ +// Shared container orchestrator for both Node.js and Python container benchmarks. +// @cloudflare/containers is Node.js-only, so this single worker.js fronts the +// Durable-Object-backed container regardless of the in-container handler +// language. See sebs/cloudflare/containers.py, which copies this file from +// benchmarks/wrappers/cloudflare/nodejs/container/ into every container build +// directory (Python builds included). import { Container, getContainer } from "@cloudflare/containers"; // Container wrapper class From 9ed8f9c255badbb92b8634606b21800e15540a4f Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:21:29 +0200 Subject: [PATCH 109/230] docs(benchmark): Enhance comments to clarify handling of worker.js and variant types --- sebs/benchmark.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 33019df33..299734303 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -759,8 +759,11 @@ def hash_directory( else: with open(f, "rb") as opened_file: hash_sum.update(opened_file.read()) - # For Cloudflare Python containers, also hash the nodejs/container worker.js - # since containers.py always copies it into the build directory regardless of language. + # For Cloudflare Python containers, also hash the nodejs/container worker.js. + # worker.js is shared between Node.js and Python container builds + # (@cloudflare/containers is Node.js-only), so containers.py copies it from + # nodejs/container/ into every container build directory regardless of language. + # Python's wrapper glob would otherwise miss it and stale builds wouldn't invalidate. 
if deployment == "cloudflare" and language == Language.PYTHON and container_deployment: nodejs_worker = get_resource_path( "benchmarks", "wrappers", "cloudflare", "nodejs", "container", "worker.js" @@ -869,6 +872,14 @@ def copy_code(self, output_dir: str) -> None: ) ) + # Variants come in two flavors and this is where we split between them: + # 1. Patch-based (patch.diff present): apply a unified diff on top of the + # already-copied base files. Use when the variant only needs small, + # targeted edits to the default implementation (e.g. swapping async I/O + # for sync I/O in a runtime that lacks full async support). + # 2. Copy-based (no patch.diff): overlay the variant directory's files on + # top of the base files, replacing any that collide. Use when the + # variant diverges enough that a patch would be unwieldy. patch_file = os.path.join(variant_dir, "patch.diff") if os.path.exists(patch_file): import patch_ng From 26ea6011c6ab4b016eb0d856c41cb686970f617b Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:23:05 +0200 Subject: [PATCH 110/230] refactor(cli): Update deployment type options in regression command --- sebs/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sebs/cli.py b/sebs/cli.py index 4ab62e52f..5cfbe44d1 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -472,8 +472,8 @@ def package( @click.option( "--deployment-type", default=None, - type=click.Choice(["workers", "container"]), - help="Limit regression to a specific deployment type (workers or container).", + type=click.Choice(["functions", "containers"]), + help="Limit regression to a specific deployment type (functions or containers).", ) def regression(benchmark_input_size, benchmark_name, storage_configuration, deployment_type, **kwargs): """Run regression test suite across benchmarks.""" @@ -491,6 +491,7 @@ def regression(benchmark_input_size, benchmark_name, storage_configuration, depl config, benchmark_name, deployment_type, + benchmark_input_size, ) 
From f14bce96f4a4c4c4dc284f6fdce96642e527403d Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:26:08 +0200 Subject: [PATCH 111/230] feat(dependencies): Add patch-ng and tomli dependencies for compatibility --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 76f832751..6670f0751 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,9 @@ dependencies = [ "pycurl>=7.43", "click>=7.1.2", "rich", + "patch-ng", + "tomli ; python_version < '3.11'", + "tomli_w", # Storage & Local "minio==5.0.10", From 5a1fdcc5ff4cec6235ffaab92d5482018fc8adec Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:34:15 +0200 Subject: [PATCH 112/230] feat(cloudflare): changes in workers.py including moving the templates into the cloudflare directory, defining the pyodide packages in a seperate file and better handling the requirements in python alltogether --- pyproject.toml | 4 + sebs/cloudflare/pyodide_packages.py | 73 +++++++++++ .../templates}/wrangler-container.toml | 0 .../templates}/wrangler-worker.toml | 0 sebs/cloudflare/workers.py | 115 ++++++++---------- 5 files changed, 129 insertions(+), 63 deletions(-) create mode 100644 sebs/cloudflare/pyodide_packages.py rename {templates => sebs/cloudflare/templates}/wrangler-container.toml (100%) rename {templates => sebs/cloudflare/templates}/wrangler-worker.toml (100%) diff --git a/pyproject.toml b/pyproject.toml index 6670f0751..806f1d425 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,6 +104,7 @@ packages = [ "sebs.aws", "sebs.azure", "sebs.gcp", + "sebs.cloudflare", "sebs.local", "sebs.openwhisk", "sebs.faas", @@ -147,6 +148,9 @@ sebs = ["py.typed"] "sebs.dockerfiles" = [ "**/*", ] +"sebs.cloudflare" = [ + "templates/*.toml", +] "sebs.tools" = [ "**/*.py", ] diff --git a/sebs/cloudflare/pyodide_packages.py b/sebs/cloudflare/pyodide_packages.py new file mode 100644 index 000000000..acae0bc41 --- /dev/null +++ 
b/sebs/cloudflare/pyodide_packages.py @@ -0,0 +1,73 @@ +""" +Pyodide packages supported by Cloudflare Python Workers. + +See https://developers.cloudflare.com/workers/languages/python/packages/ for the +authoritative list. Names use the canonical PyPI distribution casing so the +generated pyproject.toml mirrors what pyodide publishes. +""" + +from typing import FrozenSet, Optional + + +SUPPORTED_PYODIDE_PACKAGES: FrozenSet[str] = frozenset({ + "affine", "aiohappyeyeballs", "aiohttp", "aiosignal", "altair", + "annotated-types", "anyio", "apsw", "argon2-cffi", "argon2-cffi-bindings", + "asciitree", "astropy", "astropy_iers_data", "asttokens", "async-timeout", + "atomicwrites", "attrs", "audioop-lts", "autograd", "awkward-cpp", "b2d", + "bcrypt", "beautifulsoup4", "bilby.cython", "biopython", "bitarray", + "bitstring", "bleach", "blosc2", "bokeh", "boost-histogram", "brotli", + "cachetools", "casadi", "cbor-diag", "certifi", "cffi", "cffi_example", + "cftime", "charset-normalizer", "clarabel", "click", "cligj", "clingo", + "cloudpickle", "cmyt", "cobs", "colorspacious", "contourpy", "coolprop", + "coverage", "cramjam", "crc32c", "cryptography", "css-inline", "cssselect", + "cvxpy-base", "cycler", "cysignals", "cytoolz", "decorator", "demes", + "deprecation", "diskcache", "distlib", "distro", "docutils", "donfig", + "ewah_bool_utils", "exceptiongroup", "executing", "fastapi", "fastcan", + "fastparquet", "fiona", "fonttools", "freesasa", "frozenlist", "fsspec", + "future", "galpy", "gmpy2", "gsw", "h11", "h3", "h5py", "highspy", + "html5lib", "httpcore", "httpx", "idna", "igraph", "imageio", "imgui-bundle", + "iminuit", "iniconfig", "inspice", "ipython", "jedi", "Jinja2", "jiter", + "joblib", "jsonpatch", "jsonpointer", "jsonschema", "jsonschema_specifications", + "kiwisolver", "lakers-python", "lazy_loader", "lazy-object-proxy", "libcst", + "lightgbm", "logbook", "lxml", "lz4", "MarkupSafe", "matplotlib", + "matplotlib-inline", "memory-allocator", "micropip", "mmh3", 
"more-itertools", + "mpmath", "msgpack", "msgspec", "msprime", "multidict", "munch", "mypy", + "narwhals", "ndindex", "netcdf4", "networkx", "newick", "nh3", "nlopt", + "nltk", "numcodecs", "numpy", "openai", "opencv-python", "optlang", "orjson", + "packaging", "pandas", "parso", "patsy", "pcodec", "peewee", "pi-heif", + "Pillow", "pillow-heif", "pkgconfig", "platformdirs", "pluggy", "ply", + "pplpy", "primecountpy", "prompt_toolkit", "propcache", "protobuf", + "pure-eval", "py", "pyclipper", "pycparser", "pycryptodome", "pydantic", + "pydantic_core", "pyerfa", "pygame-ce", "Pygments", "pyheif", "pyiceberg", + "pyinstrument", "pylimer-tools", "PyMuPDF", "pynacl", "pyodide-http", + "pyodide-unix-timezones", "pyparsing", "pyrsistent", "pysam", "pyshp", + "pytaglib", "pytest", "pytest-asyncio", "pytest-benchmark", "pytest_httpx", + "python-calamine", "python-dateutil", "python-flint", "python-magic", + "python-sat", "python-solvespace", "pytz", "pywavelets", "pyxel", "pyxirr", + "pyyaml", "rasterio", "rateslib", "rebound", "reboundx", "referencing", + "regex", "requests", "retrying", "rich", "river", "RobotRaconteur", + "rpds-py", "ruamel.yaml", "rustworkx", "scikit-image", "scikit-learn", + "scipy", "screed", "setuptools", "shapely", "simplejson", "sisl", "six", + "smart-open", "sniffio", "sortedcontainers", "soundfile", "soupsieve", + "sourmash", "soxr", "sparseqr", "sqlalchemy", "stack-data", "starlette", + "statsmodels", "strictyaml", "svgwrite", "swiglpk", "sympy", "tblib", + "termcolor", "texttable", "texture2ddecoder", "threadpoolctl", "tiktoken", + "tomli", "tomli-w", "toolz", "tqdm", "traitlets", "traits", "tree-sitter", + "tree-sitter-go", "tree-sitter-java", "tree-sitter-python", "tskit", + "typing-extensions", "tzdata", "ujson", "uncertainties", "unyt", "urllib3", + "vega-datasets", "vrplib", "wcwidth", "webencodings", "wordcloud", "wrapt", + "xarray", "xgboost", "xlrd", "xxhash", "xyzservices", "yarl", "yt", "zengl", + "zfpy", "zstandard", +}) + + 
+_CANONICAL_BY_LOWER = {name.lower(): name for name in SUPPORTED_PYODIDE_PACKAGES} + + +def get_canonical_pyodide_name(name: str) -> Optional[str]: + """Return the canonical Pyodide package name for ``name`` (O(1) lookup). + + Matching is case-insensitive. Returns ``None`` if the package is not + supported by the Cloudflare Python Workers runtime. + """ + return _CANONICAL_BY_LOWER.get(name.lower()) diff --git a/templates/wrangler-container.toml b/sebs/cloudflare/templates/wrangler-container.toml similarity index 100% rename from templates/wrangler-container.toml rename to sebs/cloudflare/templates/wrangler-container.toml diff --git a/templates/wrangler-worker.toml b/sebs/cloudflare/templates/wrangler-worker.toml similarity index 100% rename from templates/wrangler-worker.toml rename to sebs/cloudflare/templates/wrangler-worker.toml diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 16f0369c3..278d3ae27 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -6,6 +6,7 @@ """ import os +import re import shutil import json import io @@ -23,6 +24,8 @@ from sebs.benchmark import Benchmark from sebs.cloudflare.cli import CloudflareCLI +from sebs.cloudflare.pyodide_packages import get_canonical_pyodide_name +from sebs.utils import get_resource_path class CloudflareWorkersDeployment: @@ -80,18 +83,13 @@ def generate_wrangler_toml( """ # Load template template_path = os.path.join( - os.path.dirname(__file__), - "../..", - "templates", + os.path.dirname(__file__), + "templates", "wrangler-worker.toml" ) with open(template_path, 'rb') as f: config = tomllib.load(f) - # Native workers no longer require Durable Object bindings for NoSQL. 
- config.pop('durable_objects', None) - config.pop('migrations', None) - # Update basic configuration config['name'] = worker_name config['account_id'] = account_id @@ -218,59 +216,52 @@ def package_code( if os.path.exists(requirements_file): with open(requirements_file, 'r') as reqf: reqtext = reqf.read() - supported_pkg = \ -['affine', 'aiohappyeyeballs', 'aiohttp', 'aiosignal', 'altair', 'annotated-types',\ -'anyio', 'apsw', 'argon2-cffi', 'argon2-cffi-bindings', 'asciitree', 'astropy', 'astropy_iers_data',\ -'asttokens', 'async-timeout', 'atomicwrites', 'attrs', 'audioop-lts', 'autograd', 'awkward-cpp', 'b2d',\ -'bcrypt', 'beautifulsoup4', 'bilby.cython', 'biopython', 'bitarray', 'bitstring', 'bleach', 'blosc2', 'bokeh',\ -'boost-histogram', 'brotli', 'cachetools', 'casadi', 'cbor-diag', 'certifi', 'cffi', 'cffi_example', 'cftime',\ -'charset-normalizer', 'clarabel', 'click', 'cligj', 'clingo', 'cloudpickle', 'cmyt', 'cobs', 'colorspacious',\ -'contourpy', 'coolprop', 'coverage', 'cramjam', 'crc32c', 'cryptography', 'css-inline', 'cssselect', 'cvxpy-base', 'cycler',\ -'cysignals', 'cytoolz', 'decorator', 'demes', 'deprecation', 'diskcache', 'distlib', 'distro', 'docutils', 'donfig',\ -'ewah_bool_utils', 'exceptiongroup', 'executing', 'fastapi', 'fastcan', 'fastparquet', 'fiona', 'fonttools', 'freesasa',\ -'frozenlist', 'fsspec', 'future', 'galpy', 'gmpy2', 'gsw', 'h11', 'h3', 'h5py', 'highspy', 'html5lib', 'httpcore',\ -'httpx', 'idna', 'igraph', 'imageio', 'imgui-bundle', 'iminuit', 'iniconfig', 'inspice', 'ipython', 'jedi', 'Jinja2',\ -'jiter', 'joblib', 'jsonpatch', 'jsonpointer', 'jsonschema', 'jsonschema_specifications', 'kiwisolver',\ -'lakers-python', 'lazy_loader', 'lazy-object-proxy', 'libcst', 'lightgbm', 'logbook', 'lxml', 'lz4', 'MarkupSafe',\ -'matplotlib', 'matplotlib-inline', 'memory-allocator', 'micropip', 'mmh3', 'more-itertools', 'mpmath',\ -'msgpack', 'msgspec', 'msprime', 'multidict', 'munch', 'mypy', 'narwhals', 'ndindex', 'netcdf4', 
'networkx',\ -'newick', 'nh3', 'nlopt', 'nltk', 'numcodecs', 'numpy', 'openai', 'opencv-python', 'optlang', 'orjson',\ -'packaging', 'pandas', 'parso', 'patsy', 'pcodec', 'peewee', 'pi-heif', 'Pillow', 'pillow-heif', 'pkgconfig',\ -'platformdirs', 'pluggy', 'ply', 'pplpy', 'primecountpy', 'prompt_toolkit', 'propcache', 'protobuf', 'pure-eval',\ -'py', 'pyclipper', 'pycparser', 'pycryptodome', 'pydantic', 'pydantic_core', 'pyerfa', 'pygame-ce', 'Pygments',\ -'pyheif', 'pyiceberg', 'pyinstrument', 'pylimer-tools', 'PyMuPDF', 'pynacl', 'pyodide-http', 'pyodide-unix-timezones',\ -'pyparsing', 'pyrsistent', 'pysam', 'pyshp', 'pytaglib', 'pytest', 'pytest-asyncio', 'pytest-benchmark', 'pytest_httpx',\ -'python-calamine', 'python-dateutil', 'python-flint', 'python-magic', 'python-sat', 'python-solvespace', 'pytz', 'pywavelets',\ -'pyxel', 'pyxirr', 'pyyaml', 'rasterio', 'rateslib', 'rebound', 'reboundx', 'referencing', 'regex', 'requests',\ -'retrying', 'rich', 'river', 'RobotRaconteur', 'rpds-py', 'ruamel.yaml', 'rustworkx', 'scikit-image', 'scikit-learn',\ -'scipy', 'screed', 'setuptools', 'shapely', 'simplejson', 'sisl', 'six', 'smart-open', 'sniffio', 'sortedcontainers',\ -'soundfile', 'soupsieve', 'sourmash', 'soxr', 'sparseqr', 'sqlalchemy', 'stack-data', 'starlette', 'statsmodels', 'strictyaml',\ -'svgwrite', 'swiglpk', 'sympy', 'tblib', 'termcolor', 'texttable', 'texture2ddecoder', 'threadpoolctl', 'tiktoken', 'tomli',\ -'tomli-w', 'toolz', 'tqdm', 'traitlets', 'traits', 'tree-sitter', 'tree-sitter-go', 'tree-sitter-java', 'tree-sitter-python',\ -'tskit', 'typing-extensions', 'tzdata', 'ujson', 'uncertainties', 'unyt', 'urllib3', 'vega-datasets', 'vrplib', 'wcwidth',\ -'webencodings', 'wordcloud', 'wrapt', 'xarray', 'xgboost', 'xlrd', 'xxhash', 'xyzservices', 'yarl', 'yt', 'zengl', 'zfpy', 'zstandard'] needed_pkg = [] - for pkg in supported_pkg: - if pkg.lower() in reqtext.lower(): - needed_pkg.append(pkg) + unsupported = [] + seen = set() + for raw_line in 
reqtext.splitlines(): + line = raw_line.split("#", 1)[0].strip() + if not line: + continue + name = re.split(r"[<>=!~;\s\[]", line, maxsplit=1)[0].strip() + if not name: + continue + canonical = get_canonical_pyodide_name(name) + if canonical is None: + unsupported.append(name) + continue + if canonical not in seen: + needed_pkg.append(canonical) + seen.add(canonical) + if unsupported: + raise RuntimeError( + "The following packages from requirements.txt are not " + "supported by the Cloudflare Python Workers (Pyodide) " + f"runtime: {', '.join(unsupported)}. See " + "https://developers.cloudflare.com/workers/languages/python/packages/ " + "for the list of supported packages." + ) project_file = os.path.join(directory, "pyproject.toml") - depstr = str(needed_pkg).replace("\'", "\"") - with open(project_file, 'w') as pf: - pf.write(f""" -[project] -name = "{benchmark.replace(".", "-")}-python-{language_version.replace(".", "")}" -version = "0.1.0" -description = "dummy description" -requires-python = ">={language_version}" -dependencies = {depstr} - -[dependency-groups] -dev = [ - "workers-py", - "workers-runtime-sdk" -] - """) + pyproject_config = { + "project": { + "name": f"{benchmark.replace('.', '-')}-python-" + f"{language_version.replace('.', '')}", + "version": "0.1.0", + "description": "dummy description", + "requires-python": f">={language_version}", + "dependencies": needed_pkg, + }, + "dependency-groups": { + "dev": ["workers-py", "workers-runtime-sdk"], + }, + } + try: + with open(project_file, 'wb') as pf: + tomli_w.dump(pyproject_config, pf) + except TypeError: + with open(project_file, 'w') as pf: + pf.write(tomli_w.dumps(pyproject_config)) # Pyodide Workers require all function files in a function/ subdir funcdir = os.path.join(directory, "function") if not os.path.exists(funcdir): @@ -344,9 +335,8 @@ def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> Non self.logging.info("Cached dist/ found — skipping worker bundle 
build.") return - dockerfile_src = os.path.join( - os.path.dirname(__file__), "..", "..", - "dockerfiles", "cloudflare", "nodejs", "Dockerfile.build" + dockerfile_src = str( + get_resource_path("dockerfiles", "cloudflare", "nodejs", "Dockerfile.build") ) dockerfile_dest = os.path.join(directory, "Dockerfile.build") dockerignore_dest = os.path.join(directory, ".dockerignore") @@ -424,9 +414,8 @@ def _build_python_worker(self, directory: str, is_cached: bool) -> None: self.logging.info("Cached Python build marker — skipping validation.") return - dockerfile_src = os.path.join( - os.path.dirname(__file__), "..", "..", - "dockerfiles", "cloudflare", "python", "Dockerfile.build" + dockerfile_src = str( + get_resource_path("dockerfiles", "cloudflare", "python", "Dockerfile.build") ) dockerfile_dest = os.path.join(directory, "Dockerfile.build") dockerignore_dest = os.path.join(directory, ".dockerignore") From bcd3dae77d1a82c626ec56525e2a0cbf64f0dbfa Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:36:03 +0200 Subject: [PATCH 113/230] refactor(regression): streamline Cloudflare benchmark configurations and update input size handling --- sebs/regression.py | 62 +++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/sebs/regression.py b/sebs/regression.py index 16607d0de..9ef28fc0d 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -80,34 +80,10 @@ # Cloudflare-specific configurations architectures_cloudflare = ["x64"] -# Cloudflare workers benchmarks per language -benchmarks_cloudflare_python_workers = [ - "110.dynamic-html", - "120.uploader", - "130.crud-api", - "210.thumbnailer", - "311.compression", - "501.graph-pagerank", - "502.graph-mst", - "503.graph-bfs", -] -benchmarks_cloudflare_python_containers = benchmarks_python # all benchmarks supported -benchmarks_cloudflare_nodejs_workers = [ - "110.dynamic-html", - "120.uploader", - "130.crud-api", - "311.compression", -] 
-benchmarks_cloudflare_nodejs_containers = [ - "110.dynamic-html", - "120.uploader", - "130.crud-api", - "210.thumbnailer", - "311.compression", -] - # User-defined config passed during initialization, set in regression_suite() cloud_config: Optional[dict] = None +# Input size for benchmark test data ("test" | "small" | "large"), set in regression_suite() +benchmark_input_size: str = "test" class TestSequenceMeta(type): @@ -233,7 +209,7 @@ def test(self): # Prepare input data for the benchmark input_config = benchmark.prepare_input( deployment_client.system_resources, - size="test", + size=benchmark_input_size, replace_existing=experiment_config.update_storage, ) @@ -1084,7 +1060,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): class CloudflareTestSequencePythonWorkers( unittest.TestCase, metaclass=TestSequenceMeta, - benchmarks=benchmarks_cloudflare_python_workers, + benchmarks=benchmarks_python, architectures=architectures_cloudflare, deployments=["workers"], deployment_name="cloudflare", @@ -1114,7 +1090,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): class CloudflareTestSequencePythonContainers( unittest.TestCase, metaclass=TestSequenceMeta, - benchmarks=benchmarks_cloudflare_python_containers, + benchmarks=benchmarks_python, architectures=architectures_cloudflare, deployments=["container"], deployment_name="cloudflare", @@ -1144,7 +1120,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): class CloudflareTestSequenceNodejsWorkers( unittest.TestCase, metaclass=TestSequenceMeta, - benchmarks=benchmarks_cloudflare_nodejs_workers, + benchmarks=benchmarks_nodejs, architectures=architectures_cloudflare, deployments=["workers"], deployment_name="cloudflare", @@ -1174,7 +1150,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): class CloudflareTestSequenceNodejsContainers( unittest.TestCase, metaclass=TestSequenceMeta, - 
benchmarks=benchmarks_cloudflare_nodejs_containers, + benchmarks=benchmarks_nodejs, architectures=architectures_cloudflare, deployments=["container"], deployment_name="cloudflare", @@ -1314,17 +1290,11 @@ def filter_out_benchmarks( return "411.image-recognition" not in benchmark # Cloudflare: only certain benchmarks are supported per language/deployment-type. - # Mirrors Cloudflare.SUPPORTED_BENCHMARKS in sebs/cloudflare/cloudflare.py. # None means all benchmarks are supported for that combination. if deployment_name == "cloudflare": - _CF_SUPPORTED: Dict[Tuple[str, bool], Optional[Set[str]]] = { - ("python", False): {"110", "120", "130", "210", "311", "501", "502", "503"}, - ("nodejs", False): {"110", "120", "130", "311"}, - ("python", True): None, # all supported - ("nodejs", True): {"110", "120", "130", "210", "311"}, - } + from sebs.cloudflare.cloudflare import Cloudflare is_container = deployment_type == "container" - allowed = _CF_SUPPORTED.get((language, is_container)) + allowed = Cloudflare.SUPPORTED_BENCHMARKS.get((language, is_container)) if allowed is not None: # benchmark is the test method name, e.g. "test_cloudflare_120.uploader_x64_workers" # Extract the numeric benchmark prefix (e.g. "120") from before the first "." @@ -1343,6 +1313,7 @@ def regression_suite( deployment_config: dict, benchmark_name: Optional[str] = None, deployment_type: Optional[str] = None, + input_size: str = "test", ): """Create and run a regression test suite for specified cloud providers. 
@@ -1366,9 +1337,10 @@ def regression_suite( # Create the test suite suite = unittest.TestSuite() - # Make cloud_config available to test classes - global cloud_config + # Make cloud_config and input size available to test classes + global cloud_config, benchmark_input_size cloud_config = deployment_config + benchmark_input_size = input_size # Extract runtime configuration language = experiment_config["runtime"]["language"] @@ -1435,26 +1407,26 @@ def regression_suite( "cloudflare" in cloud_config["deployment"] ), "Cloudflare provider requested but not in deployment config" if language == "python": - if deployment_type != "container": + if deployment_type != "containers": suite.addTest( unittest.defaultTestLoader.loadTestsFromTestCase( CloudflareTestSequencePythonWorkers ) ) - if deployment_type != "workers": + if deployment_type != "functions": suite.addTest( unittest.defaultTestLoader.loadTestsFromTestCase( CloudflareTestSequencePythonContainers ) ) elif language == "nodejs": - if deployment_type != "container": + if deployment_type != "containers": suite.addTest( unittest.defaultTestLoader.loadTestsFromTestCase( CloudflareTestSequenceNodejsWorkers ) ) - if deployment_type != "workers": + if deployment_type != "functions": suite.addTest( unittest.defaultTestLoader.loadTestsFromTestCase( CloudflareTestSequenceNodejsContainers From 4f7a279168cb0400e28ca958771defdd65dffc9e Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:36:19 +0200 Subject: [PATCH 114/230] refactor(nosql): simplify constructor by removing unnecessary comment --- benchmarks/wrappers/cloudflare/nodejs/container/nosql.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js b/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js index b704b0157..f529e682b 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/nosql.js @@ -14,10 +14,8 @@ */ 
class nosql { - constructor() { - // Container accesses Durable Objects through worker.js proxy - } - + constructor() {} + static worker_url = null; // Set by handler from X-Worker-URL header static init_instance(entry) { From 8acec87dd9768dfcaba10ed805406f3721aa76cc Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:36:36 +0200 Subject: [PATCH 115/230] fix(docs): update Wrangler template paths to reflect new directory structure --- docs/platforms.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/platforms.md b/docs/platforms.md index 8e4b50f33..39c8cb5c5 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -246,7 +246,7 @@ SeBS supports two deployment paths for Cloudflare: **script-based Workers** (nat | `triggers.py` | `HTTPTrigger` — invokes the deployed Worker at `https://{name}.{account}.workers.dev`. | | `r2.py`, `kvstore.py` | Object and NoSQL storage clients. | -Wrangler templates live at the repo root under `templates/wrangler-worker.toml` and `templates/wrangler-container.toml`. +Wrangler templates live alongside the deployment code at `sebs/cloudflare/templates/wrangler-worker.toml` and `sebs/cloudflare/templates/wrangler-container.toml` so they ship with the pip-packaged `sebs`. #### Dockerfiles (`dockerfiles/cloudflare/`) @@ -261,14 +261,14 @@ Wrangler templates live at the repo root under `templates/wrangler-worker.toml` #### Script-based flow (`container_deployment=false`) 1. `benchmark.build()` → `Cloudflare.package_code` → `CloudflareWorkersDeployment.package_code` (builds via `Dockerfile.build`). -2. `Cloudflare.create_function` → `_create_or_update_worker` renders `templates/wrangler-worker.toml` into the package. +2. `Cloudflare.create_function` → `_create_or_update_worker` renders `sebs/cloudflare/templates/wrangler-worker.toml` into the package. 3. `CloudflareCLI.wrangler_deploy` (Node) or `pywrangler_deploy` (Python) deploys via the `manage.cloudflare` container. 4. 
`HTTPTrigger` is attached using the `workers.dev` URL. #### Container-based flow (`container_deployment=true`) 1. `benchmark.build()` calls `container_client.build_base_image()` on the `_CloudflareContainerAdapter` in `cloudflare.py`, which delegates to `CloudflareContainersDeployment.package_code`. It copies `{language}/Dockerfile.function` as `Dockerfile` (patching `BASE_IMAGE`), adds `worker.js`, merges `package.json`, runs `npm install` in the CLI container, and builds a local Docker image. -2. `Cloudflare.create_function` → `_create_or_update_worker` renders `templates/wrangler-container.toml`. +2. `Cloudflare.create_function` → `_create_or_update_worker` renders `sebs/cloudflare/templates/wrangler-container.toml`. 3. `CloudflareCLI.wrangler_deploy` invokes wrangler, which rebuilds the image from `Dockerfile` and pushes it to Cloudflare's managed registry, creating a Durable-Object-backed container worker. 4. `wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the DO warm before the first measured invocation. 5. `HTTPTrigger` is attached using the `workers.dev` URL. From 812fe7223e26f2dc86e470425f3fb2feac467ce6 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:36:52 +0200 Subject: [PATCH 116/230] refactor(cli): streamline Dockerfile path resolution using get_resource_path --- sebs/cloudflare/cli.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 426db5cac..c4d4117ff 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -6,7 +6,7 @@ import docker from sebs.config import SeBSConfig -from sebs.utils import LoggingBase +from sebs.utils import LoggingBase, get_resource_path class CloudflareCLI(LoggingBase): @@ -39,23 +39,22 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client): logging.info(f"Pull failed: {pull_error}. 
Building image locally...") # Find the Dockerfile path - dockerfile_path = os.path.join( - os.path.dirname(__file__), - "..", - "..", - "dockerfiles", - "cloudflare", - "Dockerfile.manage" + dockerfile_path = str( + get_resource_path( + "dockerfiles", "cloudflare", "Dockerfile.manage" + ) ) - + if not os.path.exists(dockerfile_path): raise RuntimeError( f"Dockerfile not found at {dockerfile_path}. " "Cannot build Cloudflare CLI container." ) - - # Build the image - build_path = os.path.join(os.path.dirname(__file__), "..", "..") + + # Build context must contain dockerfiles/entrypoint.sh (COPY'd by + # Dockerfile.manage). In git mode this is the repo root; in package- + # install mode it is the sebs/ package dir — both hold via get_resource_path(). + build_path = str(get_resource_path()) logging.info(f"Building {full_image_name} from {dockerfile_path}...") try: From d8f08db9c814876f34e36d5cbf1bc17acff8e20e Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 13:37:18 +0200 Subject: [PATCH 117/230] refactor(containers): simplify Dockerfile path resolution using get_resource_path + template path directory update --- sebs/cloudflare/containers.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 25b90b7fc..47ab87c05 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -28,6 +28,7 @@ from sebs.benchmark import Benchmark from sebs.cloudflare.cli import CloudflareCLI +from sebs.utils import get_resource_path class CloudflareContainersDeployment: @@ -86,9 +87,8 @@ def generate_wrangler_toml( """ # Load template template_path = os.path.join( - os.path.dirname(__file__), - "../..", - "templates", + os.path.dirname(__file__), + "templates", "wrangler-container.toml" ) with open(template_path, 'rb') as f: @@ -197,14 +197,10 @@ def package_code( # Copy container wrapper files to the package directory # Copy Dockerfile.function from 
dockerfiles/cloudflare/{language}/ - dockerfile_src = os.path.join( - os.path.dirname(__file__), - "..", - "..", - "dockerfiles", - "cloudflare", - language_name, - "Dockerfile.function" + dockerfile_src = str( + get_resource_path( + "dockerfiles", "cloudflare", language_name, "Dockerfile.function" + ) ) dockerfile_dest = os.path.join(directory, "Dockerfile") if os.path.exists(dockerfile_src): From 97bb6741adb45a3336d5221bba1c36164cf9eb40 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 15:41:33 +0200 Subject: [PATCH 118/230] docs(cloudflare): enhance authentication section with detailed API token and legacy key usage instructions --- docs/platforms.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/platforms.md b/docs/platforms.md index 39c8cb5c5..03ee2737d 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -183,16 +183,31 @@ Cloudflare offers a free tier for Workers with generous limits for development a ### Credentials -You can authenticate with Cloudflare using an API token (recommended) or email + API key. Additionally, you need your account ID which can be found in the Cloudflare dashboard. +SeBS supports both authentication methods Cloudflare offers. Both are +functionally equivalent for SeBS: every API call, R2 upload, KV +operation, and `wrangler` invocation works with either. Pick based on +your Cloudflare account, not on SeBS features: + +- **API Token (recommended)**: A scoped credential you mint in the + Cloudflare dashboard. It can be limited to the permissions SeBS needs + and revoked independently, so this is the safest default for most + users. +- **Email + Global API Key (legacy)**: Your account email plus the + Global API Key from the Cloudflare dashboard. SeBS still supports this + path for older setups and accounts that cannot use scoped tokens, but + it grants broad account access and should be handled more carefully. 
+ +Regardless of which method you choose, you also need your account ID +from the Cloudflare dashboard. You can pass credentials using environment variables: ```bash -# Option 1: Using API Token (recommended) +# Option 1: API Token (recommended) export CLOUDFLARE_API_TOKEN="your-api-token" export CLOUDFLARE_ACCOUNT_ID="your-account-id" -# Option 2: Using Email + API Key +# Option 2: Email + Global API Key (legacy) export CLOUDFLARE_EMAIL="your-email@example.com" export CLOUDFLARE_API_KEY="your-global-api-key" export CLOUDFLARE_ACCOUNT_ID="your-account-id" From e69c8366f6a336cd85f375c0a58a0103c5d70c31 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 16:00:07 +0200 Subject: [PATCH 119/230] refactor(containers): update Cloudflare CLI initialization and improve R2 bucket binding error handling --- sebs/cloudflare/containers.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 47ab87c05..a39fcbf5a 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -53,7 +53,7 @@ def __init__(self, logging, system_config, docker_client, system_resources): def _get_cli(self) -> CloudflareCLI: """Get or initialize the Cloudflare CLI container.""" if self._cli is None: - self._cli = CloudflareCLI(self.system_config, self.docker_client) + self._cli = CloudflareCLI.get_instance(self.system_config, self.docker_client) # Verify wrangler is available version = self._cli.check_wrangler_version() self.logging.info(f"Cloudflare CLI container ready: {version}") @@ -129,21 +129,19 @@ def generate_wrangler_toml( config['vars']['NOSQL_STORAGE_DATABASE'] = "kvstore" # Add R2 bucket binding - try: - from sebs.faas.config import Resources - storage = self.system_resources.get_storage() - bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) - if bucket_name: - config['r2_buckets'] = [{ - 'binding': 'R2', - 'bucket_name': 
bucket_name - }] - self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") - except Exception as e: - self.logging.warning( - f"R2 bucket binding not configured: {e}. " - f"Benchmarks requiring file access will not work properly." + from sebs.faas.config import Resources + storage = self.system_resources.get_storage() + bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + if not bucket_name: + raise RuntimeError( + "R2 bucket binding not configured: benchmarks bucket name is empty. " + "Benchmarks requiring file access will not work properly." ) + config['r2_buckets'] = [{ + 'binding': 'R2', + 'bucket_name': bucket_name + }] + self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") @@ -185,9 +183,7 @@ def package_code( self.logging.info(f"Packaging container for {language_name} {language_version}") # Get wrapper directory for container files - wrapper_base = os.path.join( - os.path.dirname(__file__), "..", "..", "benchmarks", "wrappers", "cloudflare" - ) + wrapper_base = str(get_resource_path("benchmarks", "wrappers", "cloudflare")) wrapper_container_dir = os.path.join(wrapper_base, language_name, "container") if not os.path.exists(wrapper_container_dir): From 42eef0ebec1820f6bc7bba08f6f79e3fb7e8dbe0 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 16:05:41 +0200 Subject: [PATCH 120/230] docs(kvstore): enhance KVStore class documentation with detailed namespace and key mapping explanations --- sebs/cloudflare/kvstore.py | 52 +++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/sebs/cloudflare/kvstore.py b/sebs/cloudflare/kvstore.py index edc9ab6dc..11c71c7e3 100644 --- a/sebs/cloudflare/kvstore.py +++ b/sebs/cloudflare/kvstore.py @@ -17,9 +17,55 @@ class KVStore(NoSQLStorage): """ Cloudflare KV-backed NoSQL storage for SeBS. 
- This implementation maps every benchmark table to one KV namespace. - Data is stored as JSON values under composite keys: - # + Cloudflare KV is a flat key-value store: there are no tables, schemas, or + secondary indexes. The SeBS NoSQL abstraction (modelled after DynamoDB / + Cosmos DB / Datastore) is therefore layered on top of KV as follows. + + Table -> namespace mapping + -------------------------- + Each (benchmark, logical table) pair is mapped to exactly one KV namespace + -- the coarsest isolation unit KV offers. Namespaces are titled + + sebs-nosql--- + + with each component sanitized to ``[A-Za-z0-9_-]`` and a SHA1 suffix + appended when the title would exceed Cloudflare's 100-character limit + (see ``_namespace_title``). A one-namespace-per-table layout is used + instead of packing multiple tables into a shared namespace because: + + * Workers bind namespaces by id, so one binding per table is the natural + way to expose the logical table to the benchmark code. + * ``cleanup_tables`` / ``remove_table`` can drop a whole table by deleting + its namespace -- KV has no bulk-delete-by-prefix primitive. + * Key collisions between benchmarks or logical tables are impossible. + + Key mapping + ----------- + Items are stored as JSON values under composite keys: + + # (when a secondary key exists) + (otherwise) + + The primary and secondary key fields are also written back into the JSON + value so that clients reading an item do not have to re-parse the key. + + Secondary-key indices + --------------------- + KV exposes a ``list`` API, but from inside a Worker it is paginated, + eventually consistent, and scales with the total namespace size -- not + with the number of items under a given primary key. DynamoDB-style query + patterns ("give me every item with primary key = X") would therefore be + prohibitively expensive if implemented via ``list``. 
+ + To support those queries with point reads only, ``write_to_table`` + additionally maintains a per-primary-key index entry: + + __sebs_idx__ -> JSON array of secondary-key values + + A query then becomes one ``GET`` for the index followed by one ``GET`` per + secondary value. The index is only written when a secondary key is + supplied; tables without a secondary key do not need it. The matching + read path lives in ``benchmarks/wrappers/cloudflare/*/nosql.*``. """ NAMESPACE_ID_PATTERN = re.compile(r"^[a-fA-F0-9]{32}$") From 4a7f03668984db8ee0565ceb01ad2869b1a43c72 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 16:07:26 +0200 Subject: [PATCH 121/230] refactor(r2): improve error handling and logging in list_bucket method --- sebs/cloudflare/r2.py | 49 ++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 7b9e21272..a57b686a0 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -237,13 +237,6 @@ def upload_bytes(self, bucket_name: str, key: str, data: bytes): except Exception as e: self.logging.warning(f"Failed to upload bytes to R2: {e}") - """ - Retrieves list of files in a bucket. - - :param bucket_name: - :return: list of files in a given bucket - """ - def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: """ Retrieves list of files in a bucket using S3-compatible API. @@ -289,41 +282,35 @@ def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: try: response = requests.get(list_buckets_uri, headers=self._get_auth_headers()) - - # Log detailed error information + if response.status_code == 403: try: error_data = response.json() - self.logging.error( - f"403 Forbidden accessing R2 buckets. " - f"Response: {error_data}. " - f"Your API token may need 'R2 Read and Write' permissions." - ) - except: - self.logging.error( - f"403 Forbidden accessing R2 buckets. 
" - f"Your API token may need 'R2 Read and Write' permissions." - ) - return [] - + detail = f"Response: {error_data}. " + except ValueError: + detail = "" + raise RuntimeError( + f"403 Forbidden accessing R2 buckets. {detail}" + "Your API token may need 'R2 Read and Write' permissions." + ) + response.raise_for_status() - + data = response.json() - + if not data.get("success"): - self.logging.error(f"Failed to list R2 buckets: {data.get('errors')}") - return [] - - # Extract bucket names from response + raise RuntimeError( + f"Failed to list R2 buckets: {data.get('errors')}" + ) + buckets = data.get("result", {}).get("buckets", []) bucket_names = [bucket["name"] for bucket in buckets] - + self.logging.info(f"Found {len(bucket_names)} R2 buckets") return bucket_names - + except requests.exceptions.RequestException as e: - self.logging.error(f"Error listing R2 buckets: {e}") - return [] + raise RuntimeError(f"Error listing R2 buckets: {e}") from e def exists_bucket(self, bucket_name: str) -> bool: """ From cd1def7b836eade35bcda4131981a6d78082424d Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 16:39:05 +0200 Subject: [PATCH 122/230] refactor(containers): update Cloudflare CLI initialization to use constructor directly --- sebs/cloudflare/containers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index a39fcbf5a..7dff7215a 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -53,7 +53,7 @@ def __init__(self, logging, system_config, docker_client, system_resources): def _get_cli(self) -> CloudflareCLI: """Get or initialize the Cloudflare CLI container.""" if self._cli is None: - self._cli = CloudflareCLI.get_instance(self.system_config, self.docker_client) + self._cli = CloudflareCLI(self.system_config, self.docker_client) # Verify wrangler is available version = self._cli.check_wrangler_version() self.logging.info(f"Cloudflare CLI container 
ready: {version}") From 9b9894af780c6692c5f669ba5b9b16972282ac1d Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 22 Apr 2026 17:23:48 +0200 Subject: [PATCH 123/230] feat(cloudflare): enhance variant selection logic for Cloudflare deployments --- sebs/benchmark.py | 36 +++++++++++++++++++++++++++++++++-- sebs/cloudflare/cloudflare.py | 16 +++++++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 299734303..fcca285a7 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -502,6 +502,38 @@ def language_variant(self) -> str: """ return self._language_variant + def select_variant(self, variant: str) -> None: + """Switch the active language variant and refresh the cache state. + + Should be called before build() whenever the deployment platform + needs to override the variant that was set from the experiment config. + Re-queries the cache with the updated variant key and re-applies + the update_code flag if it was set. + + Args: + variant: New variant name (e.g. "cloudflare"). + + Raises: + RuntimeError: If the variant is not declared for this benchmark. 
+ """ + if not self.benchmark_config.supports(self.language, variant): + raise RuntimeError( + f"Variant '{variant}' is not declared for benchmark " + f"{self.benchmark} language {self.language_name}" + ) + self._language_variant = variant + self._output_dir = os.path.join( + self._output_dir_base, + self._language.value, + self._language_variant, + self._language_version, + self._architecture, + "container" if self._container_deployment else "package", + ) + self.query_cache() + if self._experiment_config.update_code: + self._is_cached_valid = False + @property def cache_language_key(self) -> str: """ @@ -661,9 +693,9 @@ def __init__( self._docker_client = docker_client self._system_config = system_config self._code_location: Optional[str] = None + self._output_dir_base = os.path.join(output_dir, f"{benchmark}_code") self._output_dir = os.path.join( - output_dir, - f"{benchmark}_code", + self._output_dir_base, self._language.value, self._language_variant, self._language_version, diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 9baa8799b..57dac8680 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -137,7 +137,7 @@ def is_benchmark_supported(self, benchmark_name: str, language: str, container_d return prefix in allowed def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) -> Function: - """Override to validate benchmark support before building/deploying.""" + """Override to validate benchmark support and auto-select cloudflare variant.""" language = code_package.language_name container_deployment = code_package.container_deployment benchmark_name = code_package.benchmark @@ -148,6 +148,20 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) f"{language} {deployment_type} deployments on Cloudflare. 
" f"Supported benchmarks: {self.SUPPORTED_BENCHMARKS.get((language, container_deployment))}" ) + + # For workers deployments, auto-promote the variant from "default" to + # "cloudflare" when the benchmark's config.json declares a "cloudflare" + # variant. Benchmark.__init__ sets the variant from the experiment config + # (CLI --language-variant flag), which defaults to "default". Promoting + # here ensures copy_code() applies the cloudflare/ source overlay and the + # cache key reflects the correct variant. + if ( + not container_deployment + and code_package.language_variant == "default" + and code_package.benchmark_config.supports(code_package.language, self.name()) + ): + code_package.select_variant(self.name()) + return super().get_function(code_package, func_name) def __init__( From 6d94293da25545068975673f0f8e2e663e816cc9 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 23 Apr 2026 12:42:53 +0200 Subject: [PATCH 124/230] refactor(cloudflare): restructure Dockerfile.build for Node.js and Python, add installer scripts --- configs/systems.json | 4 +- .../cloudflare/nodejs/Dockerfile.build | 23 +-- .../cloudflare/python/Dockerfile.build | 24 +-- dockerfiles/cloudflare_nodejs_installer.sh | 10 + dockerfiles/cloudflare_python_installer.sh | 11 ++ docs/platforms.md | 4 +- sebs/cloudflare/workers.py | 171 ++---------------- 7 files changed, 55 insertions(+), 192 deletions(-) create mode 100644 dockerfiles/cloudflare_nodejs_installer.sh create mode 100644 dockerfiles/cloudflare_python_installer.sh diff --git a/configs/systems.json b/configs/systems.json index 56196ba6d..0d648c198 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -447,7 +447,7 @@ "3.12": "python:3.12-slim" } }, - "images": [], + "images": ["build"], "supported_variants": ["default", "cloudflare"], "deployment": { "files": [ @@ -481,7 +481,7 @@ "20": "node:20-slim" } }, - "images": [], + "images": ["build"], "supported_variants": ["default", "cloudflare"], "deployment": { "files": [ diff 
--git a/dockerfiles/cloudflare/nodejs/Dockerfile.build b/dockerfiles/cloudflare/nodejs/Dockerfile.build index 6fb2b7149..652f4eb6f 100644 --- a/dockerfiles/cloudflare/nodejs/Dockerfile.build +++ b/dockerfiles/cloudflare/nodejs/Dockerfile.build @@ -1,16 +1,13 @@ -FROM node:20-slim +ARG BASE_IMAGE=node:20-slim +FROM ${BASE_IMAGE} -WORKDIR /worker +# Install esbuild globally once — benchmark source arrives via bind-mount. +RUN npm install -g esbuild -# Copy source files (node_modules and dist are excluded via .dockerignore) -COPY . . +RUN mkdir -p /sebs/ +COPY dockerfiles/cloudflare_nodejs_installer.sh /sebs/installer.sh +COPY dockerfiles/entrypoint.sh /sebs/entrypoint.sh +RUN chmod +x /sebs/installer.sh /sebs/entrypoint.sh -# Install production dependencies + esbuild (needed by build.js) -RUN npm install --production && npm install --force esbuild - -# Bundle all source files with esbuild into dist/ -RUN node build.js - -# Patch esbuild's dynamic __require("node:…") calls to static ESM imports -# so that the Cloudflare Workers ESM runtime can resolve built-in modules. -RUN node postprocess.js +CMD ["/bin/bash", "/sebs/installer.sh"] +ENTRYPOINT ["/sebs/entrypoint.sh"] diff --git a/dockerfiles/cloudflare/python/Dockerfile.build b/dockerfiles/cloudflare/python/Dockerfile.build index 33c5ff04f..5d591883e 100644 --- a/dockerfiles/cloudflare/python/Dockerfile.build +++ b/dockerfiles/cloudflare/python/Dockerfile.build @@ -1,4 +1,5 @@ -FROM python:3.11-slim +ARG BASE_IMAGE=python:3.11-slim +FROM ${BASE_IMAGE} # curl + ca-certificates are required by the uv installer; git is sometimes # pulled in by workers-py when it resolves VCS-declared deps. @@ -15,18 +16,11 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ ENV PATH="/root/.local/bin:/root/.local/share/uv/tools/workers-py/bin:${PATH}" -WORKDIR /worker +# Benchmark source arrives via bind-mount at /mnt/function — no COPY . . here. 
+RUN mkdir -p /sebs/ +COPY dockerfiles/cloudflare_python_installer.sh /sebs/installer.sh +COPY dockerfiles/entrypoint.sh /sebs/entrypoint.sh +RUN chmod +x /sebs/installer.sh /sebs/entrypoint.sh -# Build context is populated by sebs; .venv / python_modules / Dockerfile.build -# itself are excluded via the .dockerignore injected alongside this file. -COPY . . - -# Validate that the generated pyproject.toml parses and that the workers-py -# toolchain is installed and callable. Pyodide package resolution itself -# happens at deploy time inside pywrangler (Cloudflare re-resolves Pyodide -# packages server-side), so there is no vendored dist/ to extract — this -# image exists purely for early failure detection on malformed packages. -RUN if [ -f pyproject.toml ]; then \ - python -c "import tomllib; tomllib.load(open('pyproject.toml','rb'))" && \ - pywrangler --version; \ - fi +CMD ["/bin/bash", "/sebs/installer.sh"] +ENTRYPOINT ["/sebs/entrypoint.sh"] diff --git a/dockerfiles/cloudflare_nodejs_installer.sh b/dockerfiles/cloudflare_nodejs_installer.sh new file mode 100644 index 000000000..dfe4482b5 --- /dev/null +++ b/dockerfiles/cloudflare_nodejs_installer.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +cd /mnt/function + +npm install --production +npm install --force esbuild + +node build.js +node postprocess.js diff --git a/dockerfiles/cloudflare_python_installer.sh b/dockerfiles/cloudflare_python_installer.sh new file mode 100644 index 000000000..884afff0b --- /dev/null +++ b/dockerfiles/cloudflare_python_installer.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +cd /mnt/function + +if [ -f pyproject.toml ]; then + python -c "import tomllib; tomllib.load(open('pyproject.toml','rb'))" + pywrangler --version +fi + +touch .build-validated diff --git a/docs/platforms.md b/docs/platforms.md index 03ee2737d..7d79ed0d5 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -254,7 +254,7 @@ SeBS supports two deployment paths for Cloudflare: **script-based Workers** (nat | 
`cloudflare.py` | `Cloudflare(System)` facade. Verifies credentials, enforces `SUPPORTED_BENCHMARKS`, resolves the `workers.dev` URL, and dispatches `package_code`/`create_function`/`update_function` to the correct handler via `_get_deployment_handler(container_deployment)`. | | `workers.py` | `CloudflareWorkersDeployment` — native script packaging. Node.js is bundled with esbuild via `nodejs/Dockerfile.build`; Python generates a `pyproject.toml` and is validated via `python/Dockerfile.build` (Pyodide resolution happens server-side at deploy time). | | `containers.py` | `CloudflareContainersDeployment` — container packaging. Copies the per-language `Dockerfile.function` into the code directory, injects the `worker.js` orchestrator (Node-only, required by `@cloudflare/containers`), merges `package.json`, runs `npm install`, and builds a local image as a cache anchor. | -| `cli.py` | `CloudflareCLI` — runs the `manage.cloudflare` Docker container with the Docker socket mounted and exposes `wrangler_deploy`, `pywrangler_deploy`, `npm_install`, `docker_build`, `upload_package`. Used by both deployment handlers; `cloudflare.py` never calls `wrangler` directly. | +| `cli.py` | `CloudflareCLI` — runs the `manage.cloudflare` Docker container with the Docker socket mounted and exposes `wrangler_deploy`, `pywrangler_deploy`, `docker_build`, `upload_package`. Used by both deployment handlers; `cloudflare.py` never calls `wrangler` directly. | | `config.py` | `CloudflareCredentials` / `CloudflareConfig` — API token, account ID, R2 keys. | | `resources.py` | `CloudflareSystemResources` — factories for R2 and KV/Durable Objects. | | `function.py` | `CloudflareWorker(Function)` — cached function metadata. | @@ -282,7 +282,7 @@ Wrangler templates live alongside the deployment code at `sebs/cloudflare/templa #### Container-based flow (`container_deployment=true`) -1. 
`benchmark.build()` calls `container_client.build_base_image()` on the `_CloudflareContainerAdapter` in `cloudflare.py`, which delegates to `CloudflareContainersDeployment.package_code`. It copies `{language}/Dockerfile.function` as `Dockerfile` (patching `BASE_IMAGE`), adds `worker.js`, merges `package.json`, runs `npm install` in the CLI container, and builds a local Docker image. +1. `benchmark.build()` calls `container_client.build_base_image()` on the `_CloudflareContainerAdapter` in `cloudflare.py`, which delegates to `CloudflareContainersDeployment.package_code`. It copies `{language}/Dockerfile.function` as `Dockerfile`, adds `worker.js`, merges `package.json`, and runs `npm install` in the CLI container. The correct `BASE_IMAGE` is passed via Docker build args (resolved from `systems.json`) rather than patching the Dockerfile. 2. `Cloudflare.create_function` → `_create_or_update_worker` renders `sebs/cloudflare/templates/wrangler-container.toml`. 3. `CloudflareCLI.wrangler_deploy` invokes wrangler, which rebuilds the image from `Dockerfile` and pushes it to Cloudflare's managed registry, creating a Durable-Object-backed container worker. 4. `wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the DO warm before the first measured invocation. 
diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 278d3ae27..dbb2b386d 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -9,8 +9,7 @@ import re import shutil import json -import io -import tarfile +from importlib.resources import files try: import tomllib # Python 3.11+ except ImportError: @@ -25,7 +24,6 @@ from sebs.benchmark import Benchmark from sebs.cloudflare.cli import CloudflareCLI from sebs.cloudflare.pyodide_packages import get_canonical_pyodide_name -from sebs.utils import get_resource_path class CloudflareWorkersDeployment: @@ -82,11 +80,7 @@ def generate_wrangler_toml( Path to the generated wrangler.toml file """ # Load template - template_path = os.path.join( - os.path.dirname(__file__), - "templates", - "wrangler-worker.toml" - ) + template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-worker.toml") with open(template_path, 'rb') as f: config = tomllib.load(f) @@ -196,13 +190,14 @@ def package_code( Returns: Tuple of (package_path, package_size, container_uri) """ - # Install dependencies and bundle + # Install dependencies and bundle. + # Dependency installation (npm install / pip install) is handled by + # Benchmark.install_dependencies() via the canonical SeBS build-image + # pipeline (bind-mount + /sebs/installer.sh). package_code only needs + # to do the language-specific file preparation that happens before or + # after that step. if language_name == "nodejs": - # Build via Dockerfile.build (npm install + esbuild + __require patching), - # then extract the produced dist/ back into the package directory. - # This mirrors how container deployments use their Dockerfile.function — the - # only difference is which Dockerfile is selected. 
- self._build_worker_and_extract_dist(directory, is_cached) + pass # install_dependencies handles npm install + esbuild bundle elif language_name == "python": requirements_file = os.path.join(directory, "requirements.txt") @@ -274,10 +269,8 @@ def package_code( dest = os.path.join(directory, "function", thing) shutil.move(src, dest) - # Early validation: build Dockerfile.build to confirm the - # generated pyproject.toml parses and the workers-py toolchain - # is wired up. Deploy still runs pywrangler from Dockerfile.manage. - self._build_python_worker(directory, is_cached) + # Validation (pyproject.toml parse + pywrangler check) is + # performed by install_dependencies via cloudflare_python_installer.sh. # Create package structure CONFIG_FILES = { @@ -317,148 +310,6 @@ def package_code( return (directory, total_size, "") - def _build_worker_and_extract_dist(self, directory: str, is_cached: bool) -> None: - """Build the Node.js worker bundle via Dockerfile.build and extract dist/. - - Runs npm install, esbuild (build.js), and the __require→import post- - processing step (postprocess.js) inside a throwaway Docker image built - from Dockerfile.build. Only the resulting dist/ directory is extracted - back to *directory*; intermediate artifacts (node_modules, build image) - stay inside Docker. - - If *is_cached* is True and dist/ already exists the build is skipped. - """ - import docker as docker_module - - dist_dir = os.path.join(directory, "dist") - if is_cached and os.path.exists(dist_dir): - self.logging.info("Cached dist/ found — skipping worker bundle build.") - return - - dockerfile_src = str( - get_resource_path("dockerfiles", "cloudflare", "nodejs", "Dockerfile.build") - ) - dockerfile_dest = os.path.join(directory, "Dockerfile.build") - dockerignore_dest = os.path.join(directory, ".dockerignore") - - # Keep the build context lean: exclude generated / heavy artifacts. 
- dockerignore_content = "node_modules\ndist\nDockerfile.build\n.dockerignore\n" - shutil.copy2(dockerfile_src, dockerfile_dest) - with open(dockerignore_dest, "w") as f: - f.write(dockerignore_content) - - # Use base directory name + pid for a unique, collision-free tag. - image_tag = f"sebs-worker-build-{os.path.basename(directory)}-{os.getpid()}:latest" - - try: - self.logging.info(f"Building worker bundle via Dockerfile.build in {directory}") - _, build_logs = self.docker_client.images.build( - path=directory, - dockerfile="Dockerfile.build", - tag=image_tag, - rm=True, - ) - for log in build_logs: - if "stream" in log: - self.logging.debug(log["stream"].strip()) - elif "error" in log: - raise RuntimeError(f"Docker build error: {log['error']}") - - # Extract dist/ from the built image. - self.logging.info("Extracting built dist/ from worker build image...") - container = self.docker_client.containers.create(image_tag) - try: - bits, _ = container.get_archive("/worker/dist") - file_obj = io.BytesIO() - for chunk in bits: - file_obj.write(chunk) - file_obj.seek(0) - if os.path.exists(dist_dir): - shutil.rmtree(dist_dir) - with tarfile.open(fileobj=file_obj) as tar: - tar.extractall(directory) - self.logging.info(f"dist/ extracted to {directory}") - finally: - container.remove() - - except docker_module.errors.BuildError as e: - raise RuntimeError(f"Worker bundle build failed: {e}") - finally: - # Remove the temporary files we injected into the build context. - for tmp in (dockerfile_dest, dockerignore_dest): - if os.path.exists(tmp): - os.remove(tmp) - try: - self.docker_client.images.remove(image_tag, force=True) - except Exception: - pass - - def _build_python_worker(self, directory: str, is_cached: bool) -> None: - """Validate a Python worker package via Dockerfile.build. - - Mirrors _build_worker_and_extract_dist for structural symmetry with - the Node.js flow and with Dockerfile.build layouts in other clouds. 
- Unlike Node.js (which needs esbuild + __require→import patching), - Pyodide Worker deploys don't require a vendored bundle — Cloudflare - resolves Pyodide packages server-side at deploy time via pywrangler. - So this image only validates that the generated pyproject.toml parses - and that workers-py is callable; nothing is extracted. - - A marker file is used for caching: once validation succeeds it is - skipped on subsequent builds of the same directory. - """ - import docker as docker_module - - marker = os.path.join(directory, ".build-validated") - if is_cached and os.path.exists(marker): - self.logging.info("Cached Python build marker — skipping validation.") - return - - dockerfile_src = str( - get_resource_path("dockerfiles", "cloudflare", "python", "Dockerfile.build") - ) - dockerfile_dest = os.path.join(directory, "Dockerfile.build") - dockerignore_dest = os.path.join(directory, ".dockerignore") - - dockerignore_content = ( - "python_modules\n.venv\nDockerfile.build\n.dockerignore\n" - ) - shutil.copy2(dockerfile_src, dockerfile_dest) - with open(dockerignore_dest, "w") as f: - f.write(dockerignore_content) - - image_tag = f"sebs-python-build-{os.path.basename(directory)}-{os.getpid()}:latest" - - try: - self.logging.info( - f"Validating Python worker via Dockerfile.build in {directory}" - ) - _, build_logs = self.docker_client.images.build( - path=directory, - dockerfile="Dockerfile.build", - tag=image_tag, - rm=True, - ) - for log in build_logs: - if "stream" in log: - self.logging.debug(log["stream"].strip()) - elif "error" in log: - raise RuntimeError(f"Docker build error: {log['error']}") - - with open(marker, "w") as f: - f.write("ok") - - except docker_module.errors.BuildError as e: - raise RuntimeError(f"Python worker validation failed: {e}") - finally: - for tmp in (dockerfile_dest, dockerignore_dest): - if os.path.exists(tmp): - os.remove(tmp) - try: - self.docker_client.images.remove(image_tag, force=True) - except Exception: - pass - def 
shutdown(self): """Shutdown CLI container if initialized.""" if self._cli is not None: From 31624ec1cf3797b88b88be4ef9f90051f7d6b319 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 23 Apr 2026 12:43:08 +0200 Subject: [PATCH 125/230] refactor(cloudflare): update base images for Python and Node.js to use slim variants --- configs/systems.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/systems.json b/configs/systems.json index 0d648c198..7cbde7ef3 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -431,11 +431,11 @@ "python": { "base_images": { "x64": { - "3.8": "ubuntu:22.04", - "3.9": "ubuntu:22.04", - "3.10": "ubuntu:22.04", - "3.11": "ubuntu:22.04", - "3.12": "ubuntu:22.04" + "3.8": "python:3.8-slim", + "3.9": "python:3.9-slim", + "3.10": "python:3.10-slim", + "3.11": "python:3.11-slim", + "3.12": "python:3.12-slim" } }, "container_images": { @@ -471,8 +471,8 @@ "nodejs": { "base_images": { "x64": { - "18": "ubuntu:22.04", - "20": "ubuntu:22.04" + "18": "node:18-slim", + "20": "node:20-slim" } }, "container_images": { From 0253844962ec96efea3f5dac0133233e9fa3dae3 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 23 Apr 2026 12:43:26 +0200 Subject: [PATCH 126/230] refactor(cloudflare): enhance Dockerfile.build for Node.js and Python with gosu installation and user privileges --- dockerfiles/cloudflare/nodejs/Dockerfile.build | 9 +++++++++ dockerfiles/cloudflare/python/Dockerfile.build | 10 +++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/dockerfiles/cloudflare/nodejs/Dockerfile.build b/dockerfiles/cloudflare/nodejs/Dockerfile.build index 652f4eb6f..204434e76 100644 --- a/dockerfiles/cloudflare/nodejs/Dockerfile.build +++ b/dockerfiles/cloudflare/nodejs/Dockerfile.build @@ -1,6 +1,14 @@ ARG BASE_IMAGE=node:20-slim FROM ${BASE_IMAGE} +# useradd, groupmod, gosu (needed by entrypoint.sh to drop privileges) +RUN apt-get update && apt-get install -y --no-install-recommends \ + passwd curl 
ca-certificates \ + && rm -rf /var/lib/apt/lists/* +ENV GOSU_VERSION=1.14 +RUN curl -o /usr/local/bin/gosu -SL "https://github.com/tianon/gosu/releases/download/${GOSU_VERSION}/gosu-$(dpkg --print-architecture)" \ + && chmod +x /usr/local/bin/gosu + # Install esbuild globally once — benchmark source arrives via bind-mount. RUN npm install -g esbuild @@ -9,5 +17,6 @@ COPY dockerfiles/cloudflare_nodejs_installer.sh /sebs/installer.sh COPY dockerfiles/entrypoint.sh /sebs/entrypoint.sh RUN chmod +x /sebs/installer.sh /sebs/entrypoint.sh +ENV PATH=/usr/sbin:$PATH CMD ["/bin/bash", "/sebs/installer.sh"] ENTRYPOINT ["/sebs/entrypoint.sh"] diff --git a/dockerfiles/cloudflare/python/Dockerfile.build b/dockerfiles/cloudflare/python/Dockerfile.build index 5d591883e..283b088e3 100644 --- a/dockerfiles/cloudflare/python/Dockerfile.build +++ b/dockerfiles/cloudflare/python/Dockerfile.build @@ -1,11 +1,15 @@ ARG BASE_IMAGE=python:3.11-slim FROM ${BASE_IMAGE} -# curl + ca-certificates are required by the uv installer; git is sometimes +# useradd, groupmod, gosu (needed by entrypoint.sh to drop privileges) +# curl + ca-certificates are also required by the uv installer; git is sometimes # pulled in by workers-py when it resolves VCS-declared deps. RUN apt-get update && apt-get install -y --no-install-recommends \ - curl ca-certificates git \ + passwd curl ca-certificates git \ && rm -rf /var/lib/apt/lists/* +ENV GOSU_VERSION=1.14 +RUN curl -o /usr/local/bin/gosu -SL "https://github.com/tianon/gosu/releases/download/${GOSU_VERSION}/gosu-$(dpkg --print-architecture)" \ + && chmod +x /usr/local/bin/gosu # Install uv (fast Python package manager) and the workers-py toolchain, # which provides pywrangler for Pyodide-based Cloudflare Worker deploys. 
@@ -14,7 +18,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ /root/.local/bin/uv tool install 'workers-py==1.8.0' -ENV PATH="/root/.local/bin:/root/.local/share/uv/tools/workers-py/bin:${PATH}" +ENV PATH="/root/.local/bin:/root/.local/share/uv/tools/workers-py/bin:/usr/sbin:${PATH}" # Benchmark source arrives via bind-mount at /mnt/function — no COPY . . here. RUN mkdir -p /sebs/ From b84c330e0b33cee96ca91df6ccb605f20b7c82f3 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 23 Apr 2026 17:14:24 +0200 Subject: [PATCH 127/230] refactor(cloudflare): simplify Docker image handling and improve singleton pattern in CloudflareCLI --- sebs/cloudflare/cli.py | 100 ++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 67 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index c4d4117ff..8e4b966c0 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -2,84 +2,59 @@ import logging import os import tarfile +from typing import Optional import docker from sebs.config import SeBSConfig -from sebs.utils import LoggingBase, get_resource_path +from sebs.utils import LoggingBase class CloudflareCLI(LoggingBase): """ Manages a Docker container with Cloudflare Wrangler and related tools pre-installed. - + This approach isolates Cloudflare CLI tools (wrangler, pywrangler) from the host system, avoiding global npm/uv installations and ensuring consistent behavior across platforms. """ + _instance: Optional["CloudflareCLI"] = None + + @staticmethod + def get_instance( + system_config: SeBSConfig, docker_client: docker.client + ) -> "CloudflareCLI": + """Return the shared CloudflareCLI instance, creating it on first use. + + Container and native workers deployments share one underlying CLI + container so that combined runs don't spawn duplicates. 
+ """ + if CloudflareCLI._instance is None: + CloudflareCLI._instance = CloudflareCLI(system_config, docker_client) + return CloudflareCLI._instance + def __init__(self, system_config: SeBSConfig, docker_client: docker.client): super().__init__() + self._stopped = False repo_name = system_config.docker_repository() image_name = "manage.cloudflare" - full_image_name = repo_name + ":" + image_name - - # Try to get the image, pull if not found, build if pull fails try: - docker_client.images.get(full_image_name) - logging.info(f"Using existing Docker image: {full_image_name}") + docker_client.images.get(repo_name + ":" + image_name) except docker.errors.ImageNotFound: - # Try to pull the image first try: - logging.info(f"Pulling Docker image {full_image_name}...") - docker_client.images.pull(repo_name, image_name) - logging.info(f"Successfully pulled {full_image_name}") - except docker.errors.APIError as pull_error: - # If pull fails, try to build the image locally - logging.info(f"Pull failed: {pull_error}. Building image locally...") - - # Find the Dockerfile path - dockerfile_path = str( - get_resource_path( - "dockerfiles", "cloudflare", "Dockerfile.manage" + logging.info( + "Docker pull of image {repo}:{image}".format( + repo=repo_name, image=image_name ) ) + docker_client.images.pull(repo_name, image_name) + except docker.errors.APIError: + raise RuntimeError("Docker pull of image {} failed!".format(image_name)) - if not os.path.exists(dockerfile_path): - raise RuntimeError( - f"Dockerfile not found at {dockerfile_path}. " - "Cannot build Cloudflare CLI container." - ) - - # Build context must contain dockerfiles/entrypoint.sh (COPY'd by - # Dockerfile.manage). In git mode this is the repo root; in package- - # install mode it is the sebs/ package dir — both hold via get_resource_path(). 
- build_path = str(get_resource_path()) - logging.info(f"Building {full_image_name} from {dockerfile_path}...") - - try: - image, build_logs = docker_client.images.build( - path=build_path, - dockerfile=dockerfile_path, - tag=full_image_name, - rm=True, - pull=True - ) - - # Log build output - for log in build_logs: - if 'stream' in log: - logging.debug(log['stream'].strip()) - - logging.info(f"Successfully built {full_image_name}") - except docker.errors.BuildError as build_error: - raise RuntimeError( - f"Failed to build Docker image {full_image_name}: {build_error}" - ) - # Start the container in detached mode self.docker_instance = docker_client.containers.run( - image=full_image_name, + image=repo_name + ":" + image_name, command="/bin/bash", environment={ "CONTAINER_UID": str(os.getuid()), @@ -213,20 +188,6 @@ def pywrangler_deploy(self, package_dir: str, env: dict = None) -> str: out = self.execute(cmd, env=env) return out.decode("utf-8") - def npm_install(self, package_dir: str) -> str: - """ - Run npm install in a directory. - - Args: - package_dir: Path to package directory in container - - Returns: - npm output - """ - cmd = "cd {} && npm install".format(package_dir) - out = self.execute(cmd) - return out.decode("utf-8") - def docker_build(self, package_dir: str, image_tag: str) -> str: """ Build a Docker image for container deployment. @@ -243,6 +204,11 @@ def docker_build(self, package_dir: str, image_tag: str) -> str: return out.decode("utf-8") def shutdown(self): - """Shutdown Docker instance.""" + """Shutdown Docker instance. 
Idempotent — safe to call multiple times.""" + if self._stopped: + return + self._stopped = True self.logging.info("Stopping Cloudflare CLI Docker instance") self.docker_instance.stop() + if CloudflareCLI._instance is self: + CloudflareCLI._instance = None From b7ee302629562a6f4510f56a5a65e09354cde4c4 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 23 Apr 2026 17:24:02 +0200 Subject: [PATCH 128/230] refactor(cloudflare): update Docker socket mount comment for clarity and remove unused docker_build method --- sebs/cloudflare/cli.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 8e4b966c0..c30e6e53f 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -62,7 +62,8 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client): "CONTAINER_USER": "docker_user", }, volumes={ - # Mount Docker socket for wrangler container deployments + # Mount Docker socket so wrangler can build and push images to + # Cloudflare's registry during `wrangler deploy` for container workers. "/var/run/docker.sock": {"bind": "/var/run/docker.sock", "mode": "rw"} }, remove=True, @@ -188,21 +189,6 @@ def pywrangler_deploy(self, package_dir: str, env: dict = None) -> str: out = self.execute(cmd, env=env) return out.decode("utf-8") - def docker_build(self, package_dir: str, image_tag: str) -> str: - """ - Build a Docker image for container deployment. - - Args: - package_dir: Path to package directory in container - image_tag: Tag for the Docker image - - Returns: - Docker build output - """ - cmd = "cd {} && docker build --no-cache -t {} .".format(package_dir, image_tag) - out = self.execute(cmd) - return out.decode("utf-8") - def shutdown(self): """Shutdown Docker instance. 
Idempotent — safe to call multiple times.""" if self._stopped: From 8bc3595c700fbf4ccafbf39b14352ef4133d3cfe Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 23 Apr 2026 17:27:51 +0200 Subject: [PATCH 129/230] refactor(cloudflare): enhance container image handling and add worker readiness check --- sebs/cloudflare/cloudflare.py | 51 ++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 57dac8680..8c23bbbf5 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -71,7 +71,22 @@ def push_to_registry( language_version: str, architecture: str, ) -> str: - """Return the local Docker image tag (Cloudflare containers use wrangler, not a registry).""" + """ + Return a local cache label for the container image. + + Cloudflare container workers do not use a conventional image registry. + Instead, `wrangler deploy` reads `./Dockerfile` directly from the + package directory, builds the image, and pushes it to Cloudflare's + managed registry — all in one step. SeBS therefore never needs to + push an image to an external registry before deployment; this method + exists only to satisfy the `ContainerSystemInterface` contract and to + provide a stable cache key that `Benchmark` uses to detect whether a + previously-built image is still valid. + + The returned string is a local image tag of the form + ``--:latest``. It is + NOT a pushable URI and is not passed to any registry client. + """ image_name = ( f"{benchmark.replace('.', '-')}-{language_name}-" f"{language_version.replace('.', '')}" @@ -555,14 +570,19 @@ def _create_or_update_worker( self.logging.info(f"Worker {worker_name} deployed successfully") self.logging.debug(f"Wrangler deploy output: {output}") - # The container binding needs time to propagate before first invocation + # Wait for the worker to become reachable before returning. 
+ # Container workers expose /health; native workers are probed + # with a lightweight GET to confirm edge propagation. + account_id_val = env.get('CLOUDFLARE_ACCOUNT_ID') + worker_url = self._build_workers_dev_url(worker_name, account_id_val) + if container_deployment: self.logging.info("Waiting for container worker to initialize...") - account_id = env.get('CLOUDFLARE_ACCOUNT_ID') - worker_url = self._build_workers_dev_url(worker_name, account_id) self._containers_deployment.wait_for_durable_object_ready( worker_name, worker_url ) + else: + self._wait_for_worker_ready(worker_name, worker_url) # Keep the container warm for a minimum provisioning window. # A flat sleep lets the Durable Object hibernate, which causes the @@ -594,6 +614,29 @@ def _create_or_update_worker( self.logging.error(error_msg) raise RuntimeError(error_msg) + def _wait_for_worker_ready( + self, worker_name: str, worker_url: str, + max_wait_seconds: int = 60, poll_interval: int = 5 + ) -> None: + """Poll a native worker until it responds, confirming edge propagation.""" + self.logging.info( + f"Waiting up to {max_wait_seconds}s for worker {worker_name} to become reachable..." + ) + start = time.time() + while time.time() - start < max_wait_seconds: + try: + resp = requests.get(worker_url, timeout=10) + if resp.status_code not in (502, 503, 522, 524): + self.logging.info(f"Worker {worker_name} is reachable (HTTP {resp.status_code}).") + return + except requests.exceptions.RequestException: + pass + time.sleep(poll_interval) + self.logging.warning( + f"Worker {worker_name} not confirmed reachable after {max_wait_seconds}s; " + "proceeding anyway — invocation retries will handle residual propagation delay." + ) + def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: """Fetch the workers.dev subdomain for the given account. 
From 4574d8ccc73a1d434043018513cb9b7c346d2bf1 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 13:59:14 +0200 Subject: [PATCH 130/230] refactor(containers): get singleton cli instance, get template files using files() resource helper routine, pass BASE_IMAGE as a build arg, override the wrapper files staged by add_deployment_files() those are worker specific, copy benchmark & wrapper into function/ subdirectory such that we can execute it as a package, add @cloudflare/containers to the already existing package.json or create one (we always need that for worker.js orchestration), do not manually override the torch version in the requirements.txt --- .../cloudflare/python/container/handler.py | 6 +- sebs/cloudflare/cloudflare.py | 3 +- sebs/cloudflare/containers.py | 247 +++++++----------- 3 files changed, 94 insertions(+), 162 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index 810c26ee3..1bef40216 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -51,17 +51,17 @@ def patched_urlopen(url, data=None, timeout=None, **kwargs): print("Monkey-patched urllib.request.urlopen to add User-Agent header") # Import the benchmark handler function -from function import handler as benchmark_handler +from function.function import handler as benchmark_handler # Import storage and nosql if available try: - import storage + from function import storage except ImportError: storage = None print("Storage module not available") try: - import nosql + from function import nosql except ImportError: nosql = None print("NoSQL module not available") diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 8c23bbbf5..181c46444 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -171,8 +171,7 @@ def get_function(self, code_package: Benchmark, 
func_name: Optional[str] = None) # here ensures copy_code() applies the cloudflare/ source overlay and the # cache key reflects the correct variant. if ( - not container_deployment - and code_package.language_variant == "default" + code_package.language_variant == "default" and code_package.benchmark_config.supports(code_package.language, self.name()) ): code_package.select_variant(self.name()) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 7dff7215a..9ced6e60d 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -9,9 +9,10 @@ import shutil import json import io -import re + import time import tarfile +from importlib.resources import files try: import tomllib # Python 3.11+ except ImportError: @@ -48,12 +49,13 @@ def __init__(self, logging, system_config, docker_client, system_resources): self.system_config = system_config self.docker_client = docker_client self.system_resources = system_resources + self._base_image: Optional[str] = None self._cli: Optional[CloudflareCLI] = None def _get_cli(self) -> CloudflareCLI: """Get or initialize the Cloudflare CLI container.""" if self._cli is None: - self._cli = CloudflareCLI(self.system_config, self.docker_client) + self._cli = CloudflareCLI.get_instance(self.system_config, self.docker_client) # Verify wrangler is available version = self._cli.check_wrangler_version() self.logging.info(f"Cloudflare CLI container ready: {version}") @@ -86,18 +88,18 @@ def generate_wrangler_toml( Path to the generated wrangler.toml file """ # Load template - template_path = os.path.join( - os.path.dirname(__file__), - "templates", - "wrangler-container.toml" - ) + template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-container.toml") with open(template_path, 'rb') as f: config = tomllib.load(f) - + # Update basic configuration config['name'] = worker_name config['account_id'] = account_id - + + # Pass BASE_IMAGE as a build arg so wrangler uses the correct base 
image + if self._base_image: + config['containers'][0]['build_args'] = {"BASE_IMAGE": self._base_image} + # Update container configuration with instance type if needed if benchmark_name and ("411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or @@ -181,17 +183,38 @@ def package_code( Tuple of (package_path, package_size, container_uri) """ self.logging.info(f"Packaging container for {language_name} {language_version}") - + # Get wrapper directory for container files wrapper_base = str(get_resource_path("benchmarks", "wrappers", "cloudflare")) wrapper_container_dir = os.path.join(wrapper_base, language_name, "container") - + if not os.path.exists(wrapper_container_dir): raise RuntimeError( f"Container wrapper directory not found: {wrapper_container_dir}" ) - - # Copy container wrapper files to the package directory + + # Overwrite the wrapper files staged by add_deployment_files() with the + # container-specific versions before doing anything else. + if language_name == "python": + for f in ["handler.py", "storage.py", "nosql.py"]: + src = os.path.join(wrapper_container_dir, f) + if os.path.exists(src): + shutil.copy2(src, os.path.join(directory, f)) + + # For Python: move benchmark code into function/ so that relative imports + # work natively, matching the workers and AWS layout. + # handler.py and requirements.txt* stay at the top level. 
+ if language_name == "python": + func_dir = os.path.join(directory, "function") + os.makedirs(func_dir, exist_ok=True) + open(os.path.join(func_dir, "__init__.py"), "w").close() + dont_move = {"function", "handler.py"} + for item in os.listdir(directory): + if item in dont_move or item.startswith("requirements"): + continue + shutil.move(os.path.join(directory, item), os.path.join(func_dir, item)) + self.logging.info(f"Moved {item} into function/ package") + # Copy Dockerfile.function from dockerfiles/cloudflare/{language}/ dockerfile_src = str( get_resource_path( @@ -200,10 +223,6 @@ def package_code( ) dockerfile_dest = os.path.join(directory, "Dockerfile") if os.path.exists(dockerfile_src): - # Read Dockerfile and update BASE_IMAGE based on language version - with open(dockerfile_src, 'r') as f: - dockerfile_content = f.read() - # Get base image from systems.json for container deployments container_images = self.system_config.benchmark_container_images( "cloudflare", language_name, architecture @@ -213,114 +232,54 @@ def package_code( raise RuntimeError( f"No container base image found in systems.json for {language_name} {language_version} on {architecture}" ) - - # Replace BASE_IMAGE default value in ARG line - dockerfile_content = re.sub( - r'ARG BASE_IMAGE=.*', - f'ARG BASE_IMAGE={base_image}', - dockerfile_content - ) - - # Write modified Dockerfile - with open(dockerfile_dest, 'w') as f: - f.write(dockerfile_content) - + self._base_image = base_image + + shutil.copy2(dockerfile_src, dockerfile_dest) self.logging.info(f"Copied Dockerfile from {dockerfile_src}") - else: - raise RuntimeError(f"Dockerfile not found at {dockerfile_src}") - - # Copy handler and utility files from wrapper/container - # Note: ALL containers use worker.js for orchestration (@cloudflare/containers is Node.js only) - # The handler inside the container can be Python or Node.js - container_files = ["handler.py" if language_name == "python" else "handler.js"] - - # For worker.js 
orchestration file, always use the nodejs version + + # For nodejs, copy the container handler (no function/ subdir for nodejs). + if language_name == "nodejs": + handler_file = "handler.js" + shutil.copy2( + os.path.join(wrapper_container_dir, handler_file), + os.path.join(directory, handler_file), + ) + self.logging.info(f"Copied container {handler_file}") + nodejs_wrapper_dir = os.path.join(wrapper_base, "nodejs", "container") worker_js_src = os.path.join(nodejs_wrapper_dir, "worker.js") - worker_js_dest = os.path.join(directory, "worker.js") if os.path.exists(worker_js_src): - shutil.copy2(worker_js_src, worker_js_dest) + shutil.copy2(worker_js_src, os.path.join(directory, "worker.js")) self.logging.info(f"Copied worker.js orchestration file from nodejs/container") - - # Copy storage and nosql utilities from language-specific wrapper - if language_name == "nodejs": - container_files.extend(["storage.js", "nosql.js"]) - else: - container_files.extend(["storage.py", "nosql.py"]) - - for file in container_files: - src = os.path.join(wrapper_container_dir, file) - dest = os.path.join(directory, file) - if os.path.exists(src): - shutil.copy2(src, dest) - self.logging.info(f"Copied container file: {file}") - - # Check if benchmark has init.sh and copy it (needed for some benchmarks like video-processing) - # Look in both the benchmark root and the language-specific directory + + # Copy init.sh if the benchmark needs it (e.g. 
video-processing downloads ffmpeg) from sebs.utils import find_benchmark benchmark_path = find_benchmark(benchmark, "benchmarks") if benchmark_path: - paths = [ - benchmark_path, - os.path.join(benchmark_path, language_name), - ] - for path in paths: + for path in [benchmark_path, os.path.join(benchmark_path, language_name)]: init_sh = os.path.join(path, "init.sh") if os.path.exists(init_sh): shutil.copy2(init_sh, os.path.join(directory, "init.sh")) self.logging.info(f"Copied init.sh from {path}") break - - # For Python containers, fix relative imports in benchmark code - # Containers use flat structure, so "from . import storage" must become "import storage" - if language_name == "python": - for item in os.listdir(directory): - if item.endswith('.py') and item not in ['handler.py', 'storage.py', 'nosql.py', 'worker.py']: - file_path = os.path.join(directory, item) - with open(file_path, 'r') as f: - content = f.read() - # Fix relative imports - content = re.sub(r'from \. import ', 'import ', content) - with open(file_path, 'w') as f: - f.write(content) - - # For Node.js containers, transform benchmark code to be async-compatible - # The container wrapper uses async HTTP calls, but benchmarks expect sync - elif language_name == "nodejs": - for item in os.listdir(directory): - if item.endswith('.js') and item not in ['handler.js', 'storage.js', 'nosql.js', 'worker.js', 'build.js', 'request-polyfill.js']: - file_path = os.path.join(directory, item) - # Could add transformations here if needed - pass - - # Prepare package.json for container orchestration - # ALL containers need @cloudflare/containers for worker.js orchestration - worker_package_json = { - "name": f"{benchmark}-worker", - "version": "1.0.0", - "dependencies": { - "@cloudflare/containers": "*" - } - } - + + # ALL containers need @cloudflare/containers for worker.js orchestration. + # For nodejs benchmarks, preserve the existing package.json and add the + # dependency. 
For Python, create a minimal package.json with just the dep. + package_json_path = os.path.join(directory, "package.json") if language_name == "nodejs": - # Read the benchmark's package.json if it exists and merge dependencies - benchmark_package_file = os.path.join(directory, "package.json") - if os.path.exists(benchmark_package_file): - with open(benchmark_package_file, 'r') as f: - benchmark_package = json.load(f) - # Merge dependencies - if "dependencies" in benchmark_package: - worker_package_json["dependencies"].update(benchmark_package["dependencies"]) - - # Write the combined package.json - with open(benchmark_package_file, 'w') as f: - json.dump(worker_package_json, f, indent=2) - else: # Python containers also need package.json for worker.js orchestration - # Create package.json just for @cloudflare/containers (Python code in container) - package_json_path = os.path.join(directory, "package.json") - with open(package_json_path, 'w') as f: - json.dump(worker_package_json, f, indent=2) + if not os.path.exists(package_json_path): + raise RuntimeError( + f"package.json not found at {package_json_path} " + f"for nodejs benchmark '{benchmark}'" + ) + with open(package_json_path, 'r') as f: + package_json = json.load(f) + else: + package_json = {} + package_json.setdefault("dependencies", {})["@cloudflare/containers"] = "*" + with open(package_json_path, 'w') as f: + json.dump(package_json, f, indent=2) # Install Node.js dependencies for wrangler deployment # Note: These are needed for wrangler to bundle worker.js, not for the container @@ -352,58 +311,31 @@ def package_code( self.logging.error(f"npm install failed: {e}") raise RuntimeError(f"Failed to install Node.js dependencies: {e}") - # For Python containers, also handle Python requirements + # For Python containers, promote the versioned requirements.txt to requirements.txt if language_name == "python": - # Python requirements will be installed in the Dockerfile - # Rename version-specific requirements.txt 
to requirements.txt requirements_file = os.path.join(directory, "requirements.txt") versioned_requirements = os.path.join(directory, f"requirements.txt.{language_version}") - if os.path.exists(versioned_requirements): shutil.copy2(versioned_requirements, requirements_file) self.logging.info(f"Copied requirements.txt.{language_version} to requirements.txt") - - # Fix torch wheel URLs for container compatibility - # Replace direct wheel URLs with proper torch installation - with open(requirements_file, 'r') as f: - content = f.read() - - # Replace torch wheel URLs with proper installation commands - modified = False - if 'download.pytorch.org/whl' in content: - # Replace direct wheel URL with pip-installable torch - content = re.sub( - r'https://download\.pytorch\.org/whl/[^\s]+\.whl', - 'torch', - content - ) - modified = True - - if modified: - with open(requirements_file, 'w') as f: - f.write(content) - self.logging.info("Fixed torch URLs in requirements.txt for container compatibility") - elif not os.path.exists(requirements_file): - # Create empty requirements.txt if none exists - with open(requirements_file, 'w') as f: - f.write("") + open(requirements_file, "w").close() self.logging.info("Created empty requirements.txt") - # Build Docker image locally for cache compatibility - # wrangler will re-build/push during deployment from the Dockerfile - image_tag = self._build_container_image_local(directory, benchmark, language_name, language_version) - + # Deterministic image tag used as the container_uri label. wrangler reads + # the Dockerfile directly during deploy, so no local image is required. 
+ image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" + image_tag = f"{image_name}:latest" + # Calculate package size (approximate, as it's a source directory) total_size = 0 for dirpath, dirnames, filenames in os.walk(directory): for filename in filenames: filepath = os.path.join(dirpath, filename) total_size += os.path.getsize(filepath) - - self.logging.info(f"Container package prepared with local image: {image_tag}") - - # Return local image tag (wrangler will rebuild from Dockerfile during deploy) + + self.logging.info(f"Container package prepared (image tag: {image_tag})") + return (directory, total_size, image_tag) def _build_container_image_local( @@ -412,25 +344,27 @@ def _build_container_image_local( benchmark: str, language_name: str, language_version: str, + base_image: str = "", ) -> str: """ Build a Docker image locally for cache purposes. wrangler will rebuild from Dockerfile during deployment. - + Returns the local image tag. """ # Generate image tag image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" image_tag = f"{image_name}:latest" - + self.logging.info(f"Building local container image: {image_tag}") - + + buildargs = {"BASE_IMAGE": base_image} if base_image else {} + try: - # Build the Docker image using docker-py - # nocache=True ensures handler changes are picked up _, build_logs = self.docker_client.images.build( path=directory, tag=image_tag, + buildargs=buildargs, nocache=True, rm=True ) @@ -512,11 +446,10 @@ def wait_for_durable_object_ready( time.sleep(wait_interval) - self.logging.warning( - f"Container worker may not be fully ready after {max_wait_seconds}s. " - "First invocation may still experience initialization delay." + raise RuntimeError( + f"Container worker {worker_name} did not become ready after {max_wait_seconds}s. " + "Deployment cannot proceed without a healthy container." 
) - return False def shutdown(self): """Shutdown CLI container if initialized.""" From 220574cc8b552c4b8f0a6752961972506465b391 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 14:14:32 +0200 Subject: [PATCH 131/230] refactor(config): update cloudflare variant structure and add requirements files for Python dependencies --- benchmarks/400.inference/411.image-recognition/config.json | 2 +- .../python/cloudflare/requirements.txt.3.11 | 6 ++++++ .../python/cloudflare/requirements.txt.3.12 | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.11 create mode 100644 benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.12 diff --git a/benchmarks/400.inference/411.image-recognition/config.json b/benchmarks/400.inference/411.image-recognition/config.json index a5c9cbb95..6db214c0b 100644 --- a/benchmarks/400.inference/411.image-recognition/config.json +++ b/benchmarks/400.inference/411.image-recognition/config.json @@ -6,7 +6,7 @@ "language": "python", "variants": { "default": "default", - "cloudflare": "default" + "cloudflare": {"workers": "default", "containers": "cloudflare"} } }, "cpp" diff --git a/benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.11 b/benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.11 new file mode 100644 index 000000000..c3e648b75 --- /dev/null +++ b/benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.11 @@ -0,0 +1,6 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. 
+pillow==10.3.0 +torch==2.0.0 +torchvision==0.15.1 +# prevent installing numpy 2.0 +numpy==1.24.0 diff --git a/benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.12 b/benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.12 new file mode 100644 index 000000000..c3e648b75 --- /dev/null +++ b/benchmarks/400.inference/411.image-recognition/python/cloudflare/requirements.txt.3.12 @@ -0,0 +1,6 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. +pillow==10.3.0 +torch==2.0.0 +torchvision==0.15.1 +# prevent installing numpy 2.0 +numpy==1.24.0 From 4a3fc618dec5a816d409d801015b23cc0d79a5a6 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 14:50:46 +0200 Subject: [PATCH 132/230] refactor(cloudflare): enhance container image deployment by building locally, pushing to cloudflare registry, then specifying the location of the registry url in the toml. also moved the npm install for the @cloudflare/containers package to the dockerfile.manage. 
--- dockerfiles/cloudflare/Dockerfile.manage | 4 +- sebs/cloudflare/cloudflare.py | 12 ++++- sebs/cloudflare/containers.py | 64 ++++++++---------------- 3 files changed, 34 insertions(+), 46 deletions(-) diff --git a/dockerfiles/cloudflare/Dockerfile.manage b/dockerfiles/cloudflare/Dockerfile.manage index 88b122ba3..46ffc5ab1 100644 --- a/dockerfiles/cloudflare/Dockerfile.manage +++ b/dockerfiles/cloudflare/Dockerfile.manage @@ -16,8 +16,8 @@ RUN apt-get clean && apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install wrangler globally -RUN npm install -g wrangler +# Install wrangler and @cloudflare/containers globally +RUN npm install -g wrangler @cloudflare/containers # Install uv (fast Python package installer) and pywrangler # Pin workers-py to 1.8.0: 1.9.x introduced a broken import (rich.logging.Console) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 181c46444..58ee9b3d0 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -547,6 +547,16 @@ def _create_or_update_worker( handler = self._get_deployment_handler(container_deployment) cli = handler._get_cli() + # Push the locally-built container image to Cloudflare's registry so that + # wrangler deploy can reference it directly instead of rebuilding from the + # Dockerfile. The registry URI replaces the local tag for wrangler.toml. 
+ if container_deployment and container_uri: + self.logging.info(f"Pushing container image {container_uri} to Cloudflare registry...") + container_uri = cli.containers_push(container_uri, env=env) + self.logging.info(f"Image pushed to: {container_uri}") + # Regenerate wrangler.toml now that we have the registry URI + self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) + # Upload package directory to container container_package_path = f"/tmp/workers/{worker_name}" self.logging.info(f"Uploading package to container: {container_package_path}") @@ -577,7 +587,7 @@ def _create_or_update_worker( if container_deployment: self.logging.info("Waiting for container worker to initialize...") - self._containers_deployment.wait_for_durable_object_ready( + self._containers_deployment.wait_for_container_worker_ready( worker_name, worker_url ) else: diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 9ced6e60d..07279892c 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -8,10 +8,8 @@ import os import shutil import json -import io import time -import tarfile from importlib.resources import files try: import tomllib # Python 3.11+ @@ -96,9 +94,14 @@ def generate_wrangler_toml( config['name'] = worker_name config['account_id'] = account_id - # Pass BASE_IMAGE as a build arg so wrangler uses the correct base image - if self._base_image: - config['containers'][0]['build_args'] = {"BASE_IMAGE": self._base_image} + if container_uri and container_uri.startswith("registry.cloudflare.com"): + # Pre-built image already pushed to Cloudflare registry — point wrangler + # at it directly so it skips the Docker build step entirely. + config['containers'][0]['image'] = container_uri + else: + # Fallback: let wrangler build from the local Dockerfile. 
+ if self._base_image: + config['containers'][0]['build_args'] = {"BASE_IMAGE": self._base_image} # Update container configuration with instance type if needed if benchmark_name and ("411.image-recognition" in benchmark_name or @@ -280,37 +283,7 @@ def package_code( package_json.setdefault("dependencies", {})["@cloudflare/containers"] = "*" with open(package_json_path, 'w') as f: json.dump(package_json, f, indent=2) - - # Install Node.js dependencies for wrangler deployment - # Note: These are needed for wrangler to bundle worker.js, not for the container - # The container also installs them during Docker build - self.logging.info(f"Installing Node.js dependencies for wrangler deployment in {directory}") - cli = self._get_cli() - container_path = f"/tmp/container_npm/{os.path.basename(directory)}" - - try: - # Upload package directory to CLI container - cli.upload_package(directory, container_path) - - # Install production dependencies - output = cli.execute(f"cd {container_path} && npm install --production") - self.logging.info("npm install completed successfully") - self.logging.debug(f"npm output: {output.decode('utf-8')}") - - # Download node_modules back to host for wrangler - bits, stat = cli.docker_instance.get_archive(f"{container_path}/node_modules") - file_obj = io.BytesIO() - for chunk in bits: - file_obj.write(chunk) - file_obj.seek(0) - with tarfile.open(fileobj=file_obj) as tar: - tar.extractall(directory) - - self.logging.info(f"Downloaded node_modules to {directory} for wrangler deployment") - except Exception as e: - self.logging.error(f"npm install failed: {e}") - raise RuntimeError(f"Failed to install Node.js dependencies: {e}") - + # For Python containers, promote the versioned requirements.txt to requirements.txt if language_name == "python": requirements_file = os.path.join(directory, "requirements.txt") @@ -322,10 +295,12 @@ def package_code( open(requirements_file, "w").close() self.logging.info("Created empty requirements.txt") - # 
Deterministic image tag used as the container_uri label. wrangler reads - # the Dockerfile directly during deploy, so no local image is required. - image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" - image_tag = f"{image_name}:latest" + # Build the image locally. cache.py requires docker_client.images.get() to + # succeed for container deployments, and the local image is what we push to + # Cloudflare's registry during deploy (wrangler containers push). + image_tag = self._build_container_image_local( + directory, benchmark, language_name, language_version, self._base_image or "" + ) # Calculate package size (approximate, as it's a source directory) total_size = 0 @@ -347,8 +322,11 @@ def _build_container_image_local( base_image: str = "", ) -> str: """ - Build a Docker image locally for cache purposes. - wrangler will rebuild from Dockerfile during deployment. + Build the container image locally. + + The local image is pushed to Cloudflare's registry via + `wrangler containers push` during deployment, so wrangler deploy can + reference it directly without rebuilding from the Dockerfile. Returns the local image tag. """ @@ -389,7 +367,7 @@ def _build_container_image_local( self.logging.error(error_msg) raise RuntimeError(error_msg) - def wait_for_durable_object_ready( + def wait_for_container_worker_ready( self, worker_name: str, worker_url: str, From 33441b21ffab17ba24678980979a3032b2ff611e Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 15:20:20 +0200 Subject: [PATCH 133/230] refactor(containers): streamline local container image build process using subprocess and remove unused docker client code. 
crossbuild to linux/amd64 as cloudflare registry needs this --- sebs/cloudflare/containers.py | 55 +++++++++++++---------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 07279892c..10eff812e 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -8,6 +8,7 @@ import os import shutil import json +import subprocess import time from importlib.resources import files @@ -22,7 +23,6 @@ import toml as tomli_w from typing import Optional, Tuple -import docker import requests from sebs.benchmark import Benchmark @@ -299,7 +299,7 @@ def package_code( # succeed for container deployments, and the local image is what we push to # Cloudflare's registry during deploy (wrangler containers push). image_tag = self._build_container_image_local( - directory, benchmark, language_name, language_version, self._base_image or "" + directory, benchmark, language_name, language_version ) # Calculate package size (approximate, as it's a source directory) @@ -319,7 +319,6 @@ def _build_container_image_local( benchmark: str, language_name: str, language_version: str, - base_image: str = "", ) -> str: """ Build the container image locally. 
@@ -334,38 +333,26 @@ def _build_container_image_local( image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" image_tag = f"{image_name}:latest" - self.logging.info(f"Building local container image: {image_tag}") - - buildargs = {"BASE_IMAGE": base_image} if base_image else {} + self.logging.info(f"Building container image {image_tag} for linux/amd64...") + + result = subprocess.run( + [ + "docker", "buildx", "build", + "--platform", "linux/amd64", + "--load", + "--no-cache", + "-t", image_tag, + directory, + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + self.logging.error(result.stderr) + raise RuntimeError(f"Docker build failed for {image_tag}:\n{result.stderr}") - try: - _, build_logs = self.docker_client.images.build( - path=directory, - tag=image_tag, - buildargs=buildargs, - nocache=True, - rm=True - ) - - # Log build output - for log in build_logs: - if 'stream' in log: - self.logging.debug(log['stream'].strip()) - elif 'error' in log: - self.logging.error(log['error']) - - self.logging.info(f"Local container image built: {image_tag}") - - return image_tag - - except docker.errors.BuildError as e: - error_msg = f"Docker build failed for {image_tag}: {e}" - self.logging.error(error_msg) - raise RuntimeError(error_msg) - except Exception as e: - error_msg = f"Unexpected error building Docker image {image_tag}: {e}" - self.logging.error(error_msg) - raise RuntimeError(error_msg) + self.logging.info(f"Container image built: {image_tag}") + return image_tag def wait_for_container_worker_ready( self, From ac51c3e04f6eb0767fa81d863f4bfca083ccffc3 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 15:38:07 +0200 Subject: [PATCH 134/230] refactor(credentials): enhance documentation for Cloudflare API credentials, detailing authentication methods and requirements --- sebs/cloudflare/config.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git 
a/sebs/cloudflare/config.py b/sebs/cloudflare/config.py index b75c52ad8..bf1b34bd4 100644 --- a/sebs/cloudflare/config.py +++ b/sebs/cloudflare/config.py @@ -9,11 +9,24 @@ class CloudflareCredentials(Credentials): """ Cloudflare API credentials. - - Requires: - - API token or email + global API key - - Account ID - - Optional: R2 S3-compatible credentials for file uploads + + Two mutually exclusive authentication methods are supported; both are + functionally equivalent for every SeBS operation (API calls, R2, KV, + wrangler): + + - **API Token** (recommended): a scoped, revocable token created in the + Cloudflare dashboard. Env: ``CLOUDFLARE_API_TOKEN``. + - **Email + Global API Key** (legacy): the account email plus the + Global API Key. Grants broad account access; use only when scoped + tokens are not available. Env: ``CLOUDFLARE_EMAIL`` + + ``CLOUDFLARE_API_KEY``. + + Both methods additionally require ``CLOUDFLARE_ACCOUNT_ID``. + Optional R2 S3-compatible credentials (``CLOUDFLARE_R2_ACCESS_KEY_ID``, + ``CLOUDFLARE_R2_SECRET_ACCESS_KEY``) are needed for file uploads. + + See ``docs/platforms.md`` (Cloudflare Workers → Credentials) for full + setup instructions. 
""" def __init__(self, api_token: Optional[str] = None, email: Optional[str] = None, From 2ce61f3ef9ad68f669980adac5cfaf5716c94357 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 15:38:50 +0200 Subject: [PATCH 135/230] refactor(r2): improve error handling in list_bucket method for S3 client availability --- sebs/cloudflare/r2.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index a57b686a0..1a03ab3d7 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -247,25 +247,25 @@ def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: """ s3_client = self._get_s3_client() if s3_client is None: - self.logging.warning(f"Cannot list R2 bucket {bucket_name} - S3 client not available") - return [] - + raise RuntimeError( + f"Cannot list R2 bucket {bucket_name} - S3 client not available. " + "Ensure CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY are set." + ) + try: - # List objects with optional prefix paginator = s3_client.get_paginator('list_objects_v2') page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) - + files = [] for page in page_iterator: if 'Contents' in page: for obj in page['Contents']: files.append(obj['Key']) - + return files - + except Exception as e: - self.logging.warning(f"Failed to list R2 bucket {bucket_name}: {str(e)}") - return [] + raise RuntimeError(f"Failed to list R2 bucket {bucket_name}: {str(e)}") from e def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: """ From 184695c6a59903fd7da23d7b48ec3576e0a6b488 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 15:39:14 +0200 Subject: [PATCH 136/230] refactor(workers): update Cloudflare CLI initialization to use get_instance method --- sebs/cloudflare/workers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index dbb2b386d..1a373de26 100644 --- 
a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -48,7 +48,7 @@ def __init__(self, logging, system_config, docker_client, system_resources): def _get_cli(self) -> CloudflareCLI: """Get or initialize the Cloudflare CLI container.""" if self._cli is None: - self._cli = CloudflareCLI(self.system_config, self.docker_client) + self._cli = CloudflareCLI.get_instance(self.system_config, self.docker_client) # Verify wrangler is available version = self._cli.check_wrangler_version() self.logging.info(f"Cloudflare CLI container ready: {version}") From 38cd1c038b7339e116272cc049a88cd67deb08fe Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 24 Apr 2026 15:39:39 +0200 Subject: [PATCH 137/230] refactor(platforms): update build image descriptions and enhance build process documentation for Cloudflare Workers --- docs/platforms.md | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/platforms.md b/docs/platforms.md index 7d79ed0d5..4d8023eab 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -268,17 +268,18 @@ Wrangler templates live alongside the deployment code at `sebs/cloudflare/templa | File | Purpose | |------|---------| | `Dockerfile.manage` | Builds the `manage.cloudflare` CLI image (Node + global `wrangler` + `pywrangler` via `uv` + Docker CLI). Driven by `cli.py`. | -| `nodejs/Dockerfile.build` | Ephemeral build image for **script-based** Node.js workers. Produces the bundled `dist/` that `workers.py` extracts back to the host package. | -| `python/Dockerfile.build` | Ephemeral validation image for **script-based** Python workers — confirms `pywrangler` accepts the generated `pyproject.toml`. | +| `nodejs/Dockerfile.build` | Build image for **script-based** Node.js workers. Pulled once per session; benchmark source is bind-mounted to `/mnt/function` at build time and `cloudflare_nodejs_installer.sh` runs `npm install`, `esbuild`, and the benchmark's `build.js`/`postprocess.js` inside it. 
| +| `python/Dockerfile.build` | Build image for **script-based** Python workers. Pulled once per session; benchmark source is bind-mounted to `/mnt/function` at build time and `cloudflare_python_installer.sh` validates that `pywrangler` accepts the generated `pyproject.toml`. | | `nodejs/Dockerfile.function` | Runtime image for **container-based** Node.js functions. Parameterized via `ARG BASE_IMAGE` from `config/systems.json`. Copied into the package by `containers.py` and rebuilt by `wrangler deploy`. | | `python/Dockerfile.function` | Runtime image for **container-based** Python functions. Same parameterization. | #### Script-based flow (`container_deployment=false`) -1. `benchmark.build()` → `Cloudflare.package_code` → `CloudflareWorkersDeployment.package_code` (builds via `Dockerfile.build`). -2. `Cloudflare.create_function` → `_create_or_update_worker` renders `sebs/cloudflare/templates/wrangler-worker.toml` into the package. -3. `CloudflareCLI.wrangler_deploy` (Node) or `pywrangler_deploy` (Python) deploys via the `manage.cloudflare` container. -4. `HTTPTrigger` is attached using the `workers.dev` URL. +1. `benchmark.build()` → `CloudflareWorkersDeployment.package_code` copies source files into the package directory. +2. `Benchmark.install_dependencies()` pulls the matching `spcleth/serverless-benchmarks:build.cloudflare..` build image (see [Build Images](#build-images) below), bind-mounts the package directory to `/mnt/function`, and runs `/sebs/installer.sh` (`cloudflare_nodejs_installer.sh` or `cloudflare_python_installer.sh`) inside the container. +3. `Cloudflare.create_function` → `_create_or_update_worker` renders `sebs/cloudflare/templates/wrangler-worker.toml` into the package. +4. `CloudflareCLI.wrangler_deploy` (Node) or `pywrangler_deploy` (Python) deploys via the `manage.cloudflare` container. +5. `HTTPTrigger` is attached using the `workers.dev` URL. 
#### Container-based flow (`container_deployment=true`) @@ -288,6 +289,22 @@ Wrangler templates live alongside the deployment code at `sebs/cloudflare/templa 4. `wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the DO warm before the first measured invocation. 5. `HTTPTrigger` is attached using the `workers.dev` URL. +### Build Images + +Script-based Worker builds use pre-built build images that are pulled once and reused across all benchmarks via bind-mounts — this is the same pattern SeBS uses for other platforms (see [build.md](build.md)). The images are tagged `spcleth/serverless-benchmarks:build.cloudflare..` (e.g. `build.cloudflare.nodejs.18`, `build.cloudflare.python.3.12`) and are available on Docker Hub. + +To build and push updated images yourself (e.g. after modifying a `Dockerfile.build` or an installer script): + +```bash +# Build all Cloudflare toolchain images locally +sebs docker build --deployment cloudflare + +# Push them to Docker Hub (requires push access to the repository) +sebs docker push --deployment cloudflare +``` + +To use a different Docker Hub repository, change `['general']['docker_repository']` in `configs/systems.json`. 
+ ### Trigger Support - **HTTP Trigger**: ✅ Fully supported - Workers are automatically accessible at `https://{name}.{account}.workers.dev` From 360b2a2034f2cde32fb5a6043679bcf45bff201a Mon Sep 17 00:00:00 2001 From: laurin Date: Sat, 25 Apr 2026 13:09:13 +0200 Subject: [PATCH 138/230] refactor(cli): enhance Docker image handling and add containers_push method for Cloudflare registry --- sebs/cloudflare/cli.py | 69 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index c30e6e53f..d8d37a77e 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -7,7 +7,7 @@ import docker from sebs.config import SeBSConfig -from sebs.utils import LoggingBase +from sebs.utils import LoggingBase, get_resource_path class CloudflareCLI(LoggingBase): @@ -39,18 +39,41 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client): repo_name = system_config.docker_repository() image_name = "manage.cloudflare" + full_image_name = repo_name + ":" + image_name try: - docker_client.images.get(repo_name + ":" + image_name) + docker_client.images.get(full_image_name) except docker.errors.ImageNotFound: try: - logging.info( - "Docker pull of image {repo}:{image}".format( - repo=repo_name, image=image_name - ) - ) + logging.info(f"Pulling Docker image {full_image_name}...") docker_client.images.pull(repo_name, image_name) - except docker.errors.APIError: - raise RuntimeError("Docker pull of image {} failed!".format(image_name)) + except docker.errors.APIError as pull_error: + logging.info(f"Pull failed: {pull_error}. Building image locally...") + dockerfile_path = str( + get_resource_path("dockerfiles", "cloudflare", "Dockerfile.manage") + ) + if not os.path.exists(dockerfile_path): + raise RuntimeError( + f"Dockerfile not found at {dockerfile_path}. " + "Cannot build Cloudflare CLI container." 
+ ) + build_path = str(get_resource_path()) + logging.info(f"Building {full_image_name} from {dockerfile_path}...") + try: + _, build_logs = docker_client.images.build( + path=build_path, + dockerfile=dockerfile_path, + tag=full_image_name, + rm=True, + pull=True, + ) + for log in build_logs: + if "stream" in log: + logging.debug(log["stream"].strip()) + logging.info(f"Successfully built {full_image_name}") + except docker.errors.BuildError as build_error: + raise RuntimeError( + f"Failed to build Docker image {full_image_name}: {build_error}" + ) # Start the container in detached mode self.docker_instance = docker_client.containers.run( @@ -159,6 +182,34 @@ def check_pywrangler_version(self) -> str: out = self.execute("pywrangler --version") return out.decode("utf-8").strip() + def containers_push(self, tag: str, env: dict = None) -> str: + """ + Push a locally-built image to Cloudflare's container registry. + + The image must already exist locally (built by docker_client.images.build). + The manage container shares the host Docker socket, so it can see and push + local images directly. + + Args: + tag: Local image tag (e.g. my-bench-python-312:latest) + env: Environment variables (must include CLOUDFLARE_API_TOKEN and + CLOUDFLARE_ACCOUNT_ID) + + Returns: + Registry URI (registry.cloudflare.com//:) + """ + out = self.execute(f"wrangler containers push {tag}", env=env) + output = out.decode("utf-8") + for line in output.splitlines(): + if "registry.cloudflare.com" in line: + parts = line.split() + for part in parts: + if part.startswith("registry.cloudflare.com"): + return part.strip() + raise RuntimeError( + f"Could not parse registry URI from wrangler containers push output:\n{output}" + ) + def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: """ Deploy a worker using wrangler. 
From 51f1c3b27f477d147d5c8bf23cf98277a8fa0295 Mon Sep 17 00:00:00 2001 From: laurin Date: Sat, 25 Apr 2026 13:09:25 +0200 Subject: [PATCH 139/230] refactor(cloudflare): adjust wrangler.toml generation order to utilize container registry URI --- sebs/cloudflare/cloudflare.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 58ee9b3d0..f880bec87 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -529,12 +529,9 @@ def _create_or_update_worker( Returns: Worker deployment result """ - # Generate wrangler.toml for this worker - self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) - # Set up environment for Wrangler CLI in container env = {} - + if self.config.credentials.api_token: env['CLOUDFLARE_API_TOKEN'] = self.config.credentials.api_token elif self.config.credentials.email and self.config.credentials.api_key: @@ -549,13 +546,15 @@ def _create_or_update_worker( # Push the locally-built container image to Cloudflare's registry so that # wrangler deploy can reference it directly instead of rebuilding from the - # Dockerfile. The registry URI replaces the local tag for wrangler.toml. + # Dockerfile. Must happen before generating wrangler.toml so the registry + # URI is written in from the start. 
if container_deployment and container_uri: self.logging.info(f"Pushing container image {container_uri} to Cloudflare registry...") container_uri = cli.containers_push(container_uri, env=env) self.logging.info(f"Image pushed to: {container_uri}") - # Regenerate wrangler.toml now that we have the registry URI - self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) + + # Generate wrangler.toml for this worker (uses registry URI if available) + self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) # Upload package directory to container container_package_path = f"/tmp/workers/{worker_name}" From 76d5d1736fec361bb9cf5db86aecfa05970a980c Mon Sep 17 00:00:00 2001 From: laurin Date: Sat, 25 Apr 2026 13:12:06 +0200 Subject: [PATCH 140/230] refactor(cloudflare): simplify worker URL generation and update logging level for configuration updates --- sebs/cloudflare/cloudflare.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index f880bec87..cbf66ca80 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -695,15 +695,7 @@ def _build_workers_dev_url(self, worker_name: str, account_id: Optional[str]) -> """ if account_id: sub = self._get_workers_dev_subdomain(account_id) - if sub: - return f"https://{worker_name}.{sub}.workers.dev" - else: - # fallback: some code historically used account_id in the host - self.logging.warning( - "Using account ID in workers.dev URL as a fallback. " - "Enable the workers.dev subdomain in Cloudflare for proper URLs." 
- ) - return f"https://{worker_name}.{account_id}.workers.dev" + return f"https://{worker_name}.{sub}.workers.dev" # Last fallback: plain workers.dev (may not resolve without a subdomain) self.logging.warning( "No account ID available; using https://{name}.workers.dev which may not be reachable." @@ -788,7 +780,7 @@ def update_function_configuration( # For environment variables or KV namespaces, we would use the API here # For now, we'll just log that configuration update was requested - self.logging.info( + self.logging.warning( f"Configuration update requested for worker {worker.name}. " "Note: Cloudflare Workers have limited runtime configuration options." ) From 4af9a0fa4df3f0c6d6b3282213e3aaa93b9dad0e Mon Sep 17 00:00:00 2001 From: laurin Date: Sat, 25 Apr 2026 13:12:49 +0200 Subject: [PATCH 141/230] fix(cloudflare): raise error for missing workers.dev subdomain instead of logging warning --- sebs/cloudflare/cloudflare.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index cbf66ca80..92ff25706 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -672,11 +672,10 @@ def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: self._workers_dev_subdomain = sub return sub else: - self.logging.warning( + raise RuntimeError( "Could not find workers.dev subdomain in API response; " "please enable the workers.dev subdomain in your Cloudflare dashboard." 
) - return None else: self.logging.warning( f"Failed to fetch workers.dev subdomain: {resp.status_code} - {resp.text}" From d6baba304b25c9806803854dea05c17b52140ba7 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 26 Apr 2026 15:06:41 +0200 Subject: [PATCH 142/230] fix(cli): ensure @cloudflare/container npm dependency is installed before deploying with wrangler --- sebs/cloudflare/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index d8d37a77e..762b6e7ae 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -221,7 +221,7 @@ def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: Returns: Deployment output """ - cmd = "cd {} && wrangler deploy".format(package_dir) + cmd = "cd {} && npm install && wrangler deploy".format(package_dir) out = self.execute(cmd, env=env) return out.decode("utf-8") From cec0f9d1c72e69b18134fbb0f7a5264fae96ddb3 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 26 Apr 2026 15:06:55 +0200 Subject: [PATCH 143/230] fix(containers): update image tag generation to use timestamp for versioning --- sebs/cloudflare/containers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 10eff812e..a16cc1e1d 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -331,7 +331,8 @@ def _build_container_image_local( """ # Generate image tag image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" - image_tag = f"{image_name}:latest" + version_tag = time.strftime("%Y%m%d-%H%M%S") + image_tag = f"{image_name}:{version_tag}" self.logging.info(f"Building container image {image_tag} for linux/amd64...") From 89270cf5200d5ee324c59d72468fd128fad8b0c7 Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 26 Apr 2026 15:19:01 +0200 Subject: [PATCH 144/230] refactor(cloudflare): enhance container deployment flow with local image 
build and registry push details --- docs/platforms.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/platforms.md b/docs/platforms.md index 4d8023eab..f80efdb07 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -283,11 +283,17 @@ Wrangler templates live alongside the deployment code at `sebs/cloudflare/templa #### Container-based flow (`container_deployment=true`) -1. `benchmark.build()` calls `container_client.build_base_image()` on the `_CloudflareContainerAdapter` in `cloudflare.py`, which delegates to `CloudflareContainersDeployment.package_code`. It copies `{language}/Dockerfile.function` as `Dockerfile`, adds `worker.js`, merges `package.json`, and runs `npm install` in the CLI container. The correct `BASE_IMAGE` is passed via Docker build args (resolved from `systems.json`) rather than patching the Dockerfile. -2. `Cloudflare.create_function` → `_create_or_update_worker` renders `sebs/cloudflare/templates/wrangler-container.toml`. -3. `CloudflareCLI.wrangler_deploy` invokes wrangler, which rebuilds the image from `Dockerfile` and pushes it to Cloudflare's managed registry, creating a Durable-Object-backed container worker. -4. `wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the DO warm before the first measured invocation. -5. `HTTPTrigger` is attached using the `workers.dev` URL. +1. **Local image build** — `benchmark.build()` calls `container_client.build_base_image()` on the `_CloudflareContainerAdapter` in `cloudflare.py`, which delegates to `CloudflareContainersDeployment.package_code`. It copies `{language}/Dockerfile.function` as `Dockerfile`, adds `worker.js`, merges `package.json`, and builds a local Docker image tagged `{image_name}:{timestamp}` (e.g. `my-benchmark-python-312:20260426-130338`). The correct `BASE_IMAGE` is passed via Docker build args (resolved from `systems.json`).
A timestamp tag is used instead of `:latest` because Cloudflare's registry explicitly rejects `:latest` tags. + +2. **Registry push** — `Cloudflare.create_function` → `_create_or_update_worker` calls `CloudflareCLI.containers_push({image_name}:{timestamp})`, which runs `wrangler containers push` inside the `manage.cloudflare` container. Wrangler uploads the locally-built image to Cloudflare's managed registry and returns the full registry URI: `registry.cloudflare.com/{account_id}/{image_name}:{timestamp}`. + +3. **`wrangler.toml` generation** — `_generate_wrangler_toml` renders `sebs/cloudflare/templates/wrangler-container.toml`. The template defaults to `image = "./Dockerfile"` (a local build path). When a registry URI is available, `containers.py` replaces this field with the registry URI (`config['containers'][0]['image'] = container_uri`), so wrangler points directly at the pre-pushed image and skips rebuilding the Dockerfile entirely. + +4. **Deploy** — `CloudflareCLI.wrangler_deploy` runs `npm install && wrangler deploy` inside the `manage.cloudflare` container. `npm install` materializes `node_modules/@cloudflare/containers` (listed in `package.json`) so that wrangler's bundler can resolve the `worker.js` import. Wrangler then deploys the Worker script and creates the Durable-Object-backed container worker backed by the registry image. + +5. `wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the Durable Object alive during the container provisioning window before the first measured invocation. + +6. `HTTPTrigger` is attached using the `workers.dev` URL.
### Build Images From fad8b4078e23e81228fecb658cad7434996a724b Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 26 Apr 2026 20:43:50 +0200 Subject: [PATCH 145/230] refactor: formatted using black --- sebs/benchmark.py | 14 +- sebs/cli.py | 4 +- sebs/cloudflare/cli.py | 32 ++- sebs/cloudflare/cloudflare.py | 193 ++++++++++------ sebs/cloudflare/config.py | 56 +++-- sebs/cloudflare/containers.py | 124 ++++++----- sebs/cloudflare/function.py | 8 +- sebs/cloudflare/kvstore.py | 8 +- sebs/cloudflare/pyodide_packages.py | 331 +++++++++++++++++++++++----- sebs/cloudflare/r2.py | 130 +++++------ sebs/cloudflare/triggers.py | 78 +++---- sebs/cloudflare/workers.py | 81 ++++--- sebs/config.py | 8 +- sebs/faas/config.py | 5 +- 14 files changed, 681 insertions(+), 391 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 47bc1acad..c3f401784 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -618,7 +618,10 @@ def hash(self) -> str: """ path = os.path.join(self.benchmark_path, self.language_name) self._hash_value = Benchmark.hash_directory( - path, self._deployment_name, self.language, self._language_variant, + path, + self._deployment_name, + self.language, + self._language_variant, container_deployment=self._container_deployment, ) return self._hash_value @@ -727,7 +730,10 @@ def __init__( @staticmethod def hash_directory( - directory: str, deployment: str, language: Language, variant: str = "default", + directory: str, + deployment: str, + language: Language, + variant: str = "default", container_deployment: bool = False, ): """ @@ -928,7 +934,9 @@ def copy_code(self, output_dir: str) -> None: ) ) self.logging.info( - "Applied patch for variant {} ({})".format(self._language_variant, patch_file) + "Applied patch for variant {} ({})".format( + self._language_variant, patch_file + ) ) else: for file_type in FILES[self.language]: diff --git a/sebs/cli.py b/sebs/cli.py index 65751b2d4..999d3d031 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -189,7 +189,9 @@ 
def parse_common_params( # Only override container_deployment if explicitly set via CLI # If not in config, use CLI default (False) if container_deployment or "container_deployment" not in config_obj.get("experiments", {}): - update_nested_dict(config_obj, ["experiments", "container_deployment"], container_deployment) + update_nested_dict( + config_obj, ["experiments", "container_deployment"], container_deployment + ) # set the path the configuration was loaded from update_nested_dict(config_obj, ["deployment", "local", "path"], config) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 762b6e7ae..26d4cdd6a 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -21,9 +21,7 @@ class CloudflareCLI(LoggingBase): _instance: Optional["CloudflareCLI"] = None @staticmethod - def get_instance( - system_config: SeBSConfig, docker_client: docker.client - ) -> "CloudflareCLI": + def get_instance(system_config: SeBSConfig, docker_client: docker.client) -> "CloudflareCLI": """Return the shared CloudflareCLI instance, creating it on first use. Container and native workers deployments share one underlying CLI @@ -95,9 +93,9 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client): detach=True, tty=True, ) - + self.logging.info(f"Started Cloudflare CLI container: {self.docker_instance.id}.") - + # Wait for container to be ready while True: try: @@ -115,20 +113,20 @@ def execute(self, cmd: str, env: dict = None): """ Execute the given command in Cloudflare CLI container. Throws an exception on failure (commands are expected to execute successfully). - + Args: cmd: Shell command to execute env: Optional environment variables dict - + Returns: Command output as bytes """ # Wrap command in sh -c to support shell features like cd, pipes, etc. 
shell_cmd = ["/bin/sh", "-c", cmd] exit_code, out = self.docker_instance.exec_run( - shell_cmd, + shell_cmd, user="root", # Run as root since entrypoint creates docker_user but we don't wait for it - environment=env + environment=env, ) if exit_code != 0: raise RuntimeError( @@ -141,7 +139,7 @@ def execute(self, cmd: str, env: dict = None): def upload_package(self, directory: str, dest: str): """ Upload a directory to the Docker container. - + This is not an efficient and memory-intensive implementation. So far, we didn't have very large functions that require many gigabytes. @@ -156,7 +154,7 @@ def upload_package(self, directory: str, dest: str): with tarfile.open(fileobj=handle, mode="w:gz") as tar: for f in os.listdir(directory): tar.add(os.path.join(directory, f), arcname=f) - + # Move to the beginning of memory before writing handle.seek(0) self.execute("mkdir -p {}".format(dest)) @@ -165,7 +163,7 @@ def upload_package(self, directory: str, dest: str): def check_wrangler_version(self) -> str: """ Check wrangler version. - + Returns: Version string """ @@ -175,7 +173,7 @@ def check_wrangler_version(self) -> str: def check_pywrangler_version(self) -> str: """ Check pywrangler version. - + Returns: Version string """ @@ -213,11 +211,11 @@ def containers_push(self, tag: str, env: dict = None) -> str: def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: """ Deploy a worker using wrangler. - + Args: package_dir: Path to package directory in container env: Environment variables for deployment - + Returns: Deployment output """ @@ -228,11 +226,11 @@ def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: def pywrangler_deploy(self, package_dir: str, env: dict = None) -> str: """ Deploy a Python worker using pywrangler. 
- + Args: package_dir: Path to package directory in container env: Environment variables for deployment - + Returns: Deployment output """ diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 92ff25706..05c78935a 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -40,7 +40,7 @@ def __init__(self, containers_deployment: CloudflareContainersDeployment): def build_base_image( self, directory: str, - language, # sebs.sebs_types.Language enum + language, # sebs.sebs_types.Language enum language_version: str, architecture: str, benchmark: str, @@ -55,7 +55,7 @@ def build_base_image( """ dir_result, size_bytes, image_tag = self._containers.package_code( directory, - language.value, # Language enum → str + language.value, # Language enum → str language_version, architecture, benchmark, @@ -88,8 +88,7 @@ def push_to_registry( NOT a pushable URI and is not passed to any registry client. """ image_name = ( - f"{benchmark.replace('.', '-')}-{language_name}-" - f"{language_version.replace('.', '')}" + f"{benchmark.replace('.', '-')}-{language_name}-" f"{language_version.replace('.', '')}" ) return f"{image_name}:latest" @@ -110,8 +109,8 @@ class Cloudflare(System): SUPPORTED_BENCHMARKS: Dict[Tuple[str, bool], Optional[List[str]]] = { ("python", False): ["110", "120", "130", "210", "311", "501", "502", "503"], ("nodejs", False): ["110", "120", "130", "311"], - ("python", True): None, # all benchmarks supported - ("nodejs", True): ["110", "120", "130", "210", "311"], + ("python", True): None, # all benchmarks supported + ("nodejs", True): ["110", "120", "130", "210", "311"], } _config: CloudflareConfig @@ -132,7 +131,9 @@ def function_type() -> "Type[Function]": def config(self) -> CloudflareConfig: return self._config - def is_benchmark_supported(self, benchmark_name: str, language: str, container_deployment: bool) -> bool: + def is_benchmark_supported( + self, benchmark_name: str, language: str, container_deployment: 
bool + ) -> bool: """Return True if the benchmark is supported for the given language/deployment type. Args: @@ -170,9 +171,8 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) # (CLI --language-variant flag), which defaults to "default". Promoting # here ensures copy_code() applies the cloudflare/ source overlay and the # cache key reflects the correct variant. - if ( - code_package.language_variant == "default" - and code_package.benchmark_config.supports(code_package.language, self.name()) + if code_package.language_variant == "default" and code_package.benchmark_config.supports( + code_package.language, self.name() ): code_package.select_variant(self.name()) @@ -195,11 +195,11 @@ def __init__( self.logging_handlers = logger_handlers self._config = config self._api_base_url = "https://api.cloudflare.com/client/v4" - # cached workers.dev subdomain for the account + # cached workers.dev subdomain for the account # This is different from the account ID and is required to build # public worker URLs like ..workers.dev self._workers_dev_subdomain: Optional[str] = None - + # Initialize deployment handlers self._workers_deployment = CloudflareWorkersDeployment( self.logging, sebs_config, docker_client, self.system_resources @@ -275,7 +275,9 @@ def container_client(self) -> _CloudflareContainerAdapter: def _verify_credentials(self): """Verify that the Cloudflare API credentials are valid.""" # Check if credentials are set - if not self.config.credentials.api_token and not (self.config.credentials.email and self.config.credentials.api_key): + if not self.config.credentials.api_token and not ( + self.config.credentials.email and self.config.credentials.api_key + ): raise RuntimeError( "Cloudflare API credentials are not set. Please set CLOUDFLARE_API_TOKEN " "and CLOUDFLARE_ACCOUNT_ID environment variables." 
@@ -291,10 +293,16 @@ def _verify_credentials(self): # Log credential type being used (without exposing the actual token) if self.config.credentials.api_token: - token_preview = self.config.credentials.api_token[:8] + "..." if len(self.config.credentials.api_token) > 8 else "***" + token_preview = ( + self.config.credentials.api_token[:8] + "..." + if len(self.config.credentials.api_token) > 8 + else "***" + ) self.logging.info(f"Using API Token authentication (starts with: {token_preview})") else: - self.logging.info(f"Using Email + API Key authentication (email: {self.config.credentials.email})") + self.logging.info( + f"Using Email + API Key authentication (email: {self.config.credentials.email})" + ) response = requests.get(f"{self._api_base_url}/user/tokens/verify", headers=headers) @@ -305,13 +313,13 @@ def _verify_credentials(self): ) self.logging.info("Cloudflare credentials verified successfully") - + def _get_deployment_handler(self, container_deployment: bool): """Get the appropriate deployment handler based on deployment type. - + Args: container_deployment: Whether this is a container deployment - + Returns: CloudflareWorkersDeployment or CloudflareContainersDeployment """ @@ -320,7 +328,6 @@ def _get_deployment_handler(self, container_deployment: bool): else: return self._workers_deployment - def package_code( self, directory: str, @@ -351,7 +358,11 @@ def package_code( # Native worker deployment flow — always the cloudflare variant. # workers.py returns a 3-tuple (path, size, ""); drop the unused 3rd element. 
pkg_path, pkg_size, _ = self._workers_deployment.package_code( - directory, language.value, language_version, benchmark, is_cached, + directory, + language.value, + language_version, + benchmark, + is_cached, language_variant="cloudflare", ) return (pkg_path, pkg_size) @@ -402,8 +413,14 @@ def _generate_wrangler_toml( language_variant = code_package.language_variant if code_package else "cloudflare" handler = self._get_deployment_handler(container_deployment) return handler.generate_wrangler_toml( - worker_name, package_dir, language, account_id, - benchmark_name, code_package, container_uri, language_variant, + worker_name, + package_dir, + language, + account_id, + benchmark_name, + code_package, + container_uri, + language_variant, ) def create_function( @@ -469,7 +486,16 @@ def create_function( self.logging.info(f"Creating new worker {func_name}") # Create the worker with all package files - self._create_or_update_worker(func_name, package, account_id, language, benchmark, code_package, container_deployment, container_uri) + self._create_or_update_worker( + func_name, + package, + account_id, + language, + benchmark, + code_package, + container_deployment, + container_uri, + ) worker = CloudflareWorker( func_name, @@ -512,7 +538,15 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: return None def _create_or_update_worker( - self, worker_name: str, package_dir: str, account_id: str, language: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, container_uri: str = "" + self, + worker_name: str, + package_dir: str, + account_id: str, + language: str, + benchmark_name: Optional[str] = None, + code_package: Optional[Benchmark] = None, + container_deployment: bool = False, + container_uri: str = "", ) -> dict: """Create or update a Cloudflare Worker using Wrangler CLI in container. 
@@ -533,12 +567,12 @@ def _create_or_update_worker( env = {} if self.config.credentials.api_token: - env['CLOUDFLARE_API_TOKEN'] = self.config.credentials.api_token + env["CLOUDFLARE_API_TOKEN"] = self.config.credentials.api_token elif self.config.credentials.email and self.config.credentials.api_key: - env['CLOUDFLARE_EMAIL'] = self.config.credentials.email - env['CLOUDFLARE_API_KEY'] = self.config.credentials.api_key + env["CLOUDFLARE_EMAIL"] = self.config.credentials.email + env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key - env['CLOUDFLARE_ACCOUNT_ID'] = account_id + env["CLOUDFLARE_ACCOUNT_ID"] = account_id # Get CLI container instance from appropriate deployment handler handler = self._get_deployment_handler(container_deployment) @@ -554,7 +588,16 @@ def _create_or_update_worker( self.logging.info(f"Image pushed to: {container_uri}") # Generate wrangler.toml for this worker (uses registry URI if available) - self._generate_wrangler_toml(worker_name, package_dir, language, account_id, benchmark_name, code_package, container_deployment, container_uri) + self._generate_wrangler_toml( + worker_name, + package_dir, + language, + account_id, + benchmark_name, + code_package, + container_deployment, + container_uri, + ) # Upload package directory to container container_package_path = f"/tmp/workers/{worker_name}" @@ -581,17 +624,15 @@ def _create_or_update_worker( # Wait for the worker to become reachable before returning. # Container workers expose /health; native workers are probed # with a lightweight GET to confirm edge propagation. 
- account_id_val = env.get('CLOUDFLARE_ACCOUNT_ID') + account_id_val = env.get("CLOUDFLARE_ACCOUNT_ID") worker_url = self._build_workers_dev_url(worker_name, account_id_val) if container_deployment: self.logging.info("Waiting for container worker to initialize...") - self._containers_deployment.wait_for_container_worker_ready( - worker_name, worker_url - ) + self._containers_deployment.wait_for_container_worker_ready(worker_name, worker_url) else: self._wait_for_worker_ready(worker_name, worker_url) - + # Keep the container warm for a minimum provisioning window. # A flat sleep lets the Durable Object hibernate, which causes the # container runtime to reject the next start() call. Instead we @@ -599,7 +640,7 @@ def _create_or_update_worker( if container_deployment: warm_seconds = 60 ping_interval = 5 - account_id = env.get('CLOUDFLARE_ACCOUNT_ID') + account_id = env.get("CLOUDFLARE_ACCOUNT_ID") worker_url = self._build_workers_dev_url(worker_name, account_id) health_url = f"{worker_url}/health" self.logging.info( @@ -623,8 +664,7 @@ def _create_or_update_worker( raise RuntimeError(error_msg) def _wait_for_worker_ready( - self, worker_name: str, worker_url: str, - max_wait_seconds: int = 60, poll_interval: int = 5 + self, worker_name: str, worker_url: str, max_wait_seconds: int = 60, poll_interval: int = 5 ) -> None: """Poll a native worker until it responds, confirming edge propagation.""" self.logging.info( @@ -635,7 +675,9 @@ def _wait_for_worker_ready( try: resp = requests.get(worker_url, timeout=10) if resp.status_code not in (502, 503, 522, 524): - self.logging.info(f"Worker {worker_name} is reachable (HTTP {resp.status_code}).") + self.logging.info( + f"Worker {worker_name} is reachable (HTTP {resp.status_code})." 
+ ) return except requests.exceptions.RequestException: pass @@ -747,19 +789,28 @@ def update_function( # Containers don't support runtime memory configuration changes # Detect container deployment by checking if worker name starts with "container-" is_container = worker.name.startswith("container-") - + if is_container: - self.logging.info(f"Skipping redeployment for container worker {worker.name} - containers don't support runtime memory updates") + self.logging.info( + f"Skipping redeployment for container worker {worker.name} - containers don't support runtime memory updates" + ) else: - self._create_or_update_worker(worker.name, package, account_id, language, benchmark, code_package, container_deployment, container_uri) + self._create_or_update_worker( + worker.name, + package, + account_id, + language, + benchmark, + code_package, + container_deployment, + container_uri, + ) self.logging.info(f"Updated worker {worker.name}") # Update configuration if needed (no-op for containers since they don't support runtime memory changes) self.update_function_configuration(worker, code_package) - def update_function_configuration( - self, cached_function: Function, benchmark: Benchmark - ): + def update_function_configuration(self, cached_function: Function, benchmark: Benchmark): """ Update the configuration of a Cloudflare Worker. 
@@ -823,15 +874,15 @@ def format_function_name(name: str, container_deployment: bool = False) -> str: Formatted name """ # Convert to lowercase and replace invalid characters - formatted = name.lower().replace('_', '-').replace('.', '-') + formatted = name.lower().replace("_", "-").replace(".", "-") # Remove any characters that aren't alphanumeric or hyphen - formatted = ''.join(c for c in formatted if c.isalnum() or c == '-') + formatted = "".join(c for c in formatted if c.isalnum() or c == "-") # Remove leading/trailing hyphens - formatted = formatted.strip('-') + formatted = formatted.strip("-") # Ensure container worker names don't start with a digit (Cloudflare requirement) # Only add prefix for container workers to differentiate from native workers if container_deployment and formatted and formatted[0].isdigit(): - formatted = 'container-' + formatted + formatted = "container-" + formatted return formatted def enforce_cold_start(self, functions: List[Function], code_package: Benchmark): @@ -850,7 +901,6 @@ def enforce_cold_start(self, functions: List[Function], code_package: Benchmark) "Cloudflare Workers do not support forced cold starts. " "Workers are automatically instantiated on-demand at edge locations." 
) - def download_metrics( self, @@ -880,8 +930,7 @@ def download_metrics( return self.logging.info( - f"Extracting metrics from {len(requests)} invocations " - f"of worker {function_name}" + f"Extracting metrics from {len(requests)} invocations " f"of worker {function_name}" ) # Aggregate statistics from all requests @@ -924,31 +973,31 @@ def download_metrics( result.billing.gb_seconds = int(gb_seconds * 1_000_000) # micro GB-seconds # Calculate statistics - metrics['cloudflare'] = { - 'total_invocations': total_invocations, - 'cold_starts': cold_starts, - 'warm_starts': warm_starts, - 'data_source': 'response_measurements', - 'note': 'Per-invocation metrics extracted from benchmark response' + metrics["cloudflare"] = { + "total_invocations": total_invocations, + "cold_starts": cold_starts, + "warm_starts": warm_starts, + "data_source": "response_measurements", + "note": "Per-invocation metrics extracted from benchmark response", } if cpu_times: - metrics['cloudflare']['avg_cpu_time_us'] = sum(cpu_times) // len(cpu_times) - metrics['cloudflare']['min_cpu_time_us'] = min(cpu_times) - metrics['cloudflare']['max_cpu_time_us'] = max(cpu_times) - metrics['cloudflare']['cpu_time_measurements'] = len(cpu_times) + metrics["cloudflare"]["avg_cpu_time_us"] = sum(cpu_times) // len(cpu_times) + metrics["cloudflare"]["min_cpu_time_us"] = min(cpu_times) + metrics["cloudflare"]["max_cpu_time_us"] = max(cpu_times) + metrics["cloudflare"]["cpu_time_measurements"] = len(cpu_times) if wall_times: - metrics['cloudflare']['avg_wall_time_us'] = sum(wall_times) // len(wall_times) - metrics['cloudflare']['min_wall_time_us'] = min(wall_times) - metrics['cloudflare']['max_wall_time_us'] = max(wall_times) - metrics['cloudflare']['wall_time_measurements'] = len(wall_times) + metrics["cloudflare"]["avg_wall_time_us"] = sum(wall_times) // len(wall_times) + metrics["cloudflare"]["min_wall_time_us"] = min(wall_times) + metrics["cloudflare"]["max_wall_time_us"] = max(wall_times) + 
metrics["cloudflare"]["wall_time_measurements"] = len(wall_times) if memory_values: - metrics['cloudflare']['avg_memory_mb'] = sum(memory_values) / len(memory_values) - metrics['cloudflare']['min_memory_mb'] = min(memory_values) - metrics['cloudflare']['max_memory_mb'] = max(memory_values) - metrics['cloudflare']['memory_measurements'] = len(memory_values) + metrics["cloudflare"]["avg_memory_mb"] = sum(memory_values) / len(memory_values) + metrics["cloudflare"]["min_memory_mb"] = min(memory_values) + metrics["cloudflare"]["max_memory_mb"] = max(memory_values) + metrics["cloudflare"]["memory_measurements"] = len(memory_values) self.logging.info( f"Extracted metrics from {total_invocations} invocations: " @@ -963,9 +1012,7 @@ def download_metrics( avg_wall_ms = sum(wall_times) / len(wall_times) / 1000.0 self.logging.info(f"Average wall time: {avg_wall_ms:.2f} ms") - def create_trigger( - self, function: Function, trigger_type: Trigger.TriggerType - ) -> Trigger: + def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) -> Trigger: """ Create a trigger for a Cloudflare Worker. @@ -1002,7 +1049,7 @@ def shutdown(self) -> None: self.config.update_cache(self.cache_client) finally: self.cache_client.unlock() - + # Shutdown deployment handler CLI containers self._workers_deployment.shutdown() self._containers_deployment.shutdown() diff --git a/sebs/cloudflare/config.py b/sebs/cloudflare/config.py index bf1b34bd4..2219a8def 100644 --- a/sebs/cloudflare/config.py +++ b/sebs/cloudflare/config.py @@ -28,12 +28,18 @@ class CloudflareCredentials(Credentials): See ``docs/platforms.md`` (Cloudflare Workers → Credentials) for full setup instructions. 
""" - - def __init__(self, api_token: Optional[str] = None, email: Optional[str] = None, - api_key: Optional[str] = None, account_id: Optional[str] = None, - r2_access_key_id: Optional[str] = None, r2_secret_access_key: Optional[str] = None): + + def __init__( + self, + api_token: Optional[str] = None, + email: Optional[str] = None, + api_key: Optional[str] = None, + account_id: Optional[str] = None, + r2_access_key_id: Optional[str] = None, + r2_secret_access_key: Optional[str] = None, + ): super().__init__() - + self._api_token = api_token self._email = email self._api_key = api_key @@ -77,7 +83,7 @@ def initialize(dct: dict) -> "CloudflareCredentials": dct.get("api_key"), dct.get("account_id"), dct.get("r2_access_key_id"), - dct.get("r2_secret_access_key") + dct.get("r2_secret_access_key"), ) @staticmethod @@ -98,7 +104,7 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden api_token=os.environ["CLOUDFLARE_API_TOKEN"], account_id=os.environ.get("CLOUDFLARE_ACCOUNT_ID"), r2_access_key_id=os.environ.get("CLOUDFLARE_R2_ACCESS_KEY_ID"), - r2_secret_access_key=os.environ.get("CLOUDFLARE_R2_SECRET_ACCESS_KEY") + r2_secret_access_key=os.environ.get("CLOUDFLARE_R2_SECRET_ACCESS_KEY"), ) elif "CLOUDFLARE_EMAIL" in os.environ and "CLOUDFLARE_API_KEY" in os.environ: ret = CloudflareCredentials( @@ -106,7 +112,7 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden api_key=os.environ["CLOUDFLARE_API_KEY"], account_id=os.environ.get("CLOUDFLARE_ACCOUNT_ID"), r2_access_key_id=os.environ.get("CLOUDFLARE_R2_ACCESS_KEY_ID"), - r2_secret_access_key=os.environ.get("CLOUDFLARE_R2_SECRET_ACCESS_KEY") + r2_secret_access_key=os.environ.get("CLOUDFLARE_R2_SECRET_ACCESS_KEY"), ) else: raise RuntimeError( @@ -124,14 +130,15 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden raise RuntimeError( f"Cloudflare login credentials do not match the account {account_id} in cache!" 
) - + ret.logging_handlers = handlers return ret def update_cache(self, cache: Cache): if self._account_id: - cache.update_config(val=self._account_id, - keys=["cloudflare", "credentials", "account_id"]) + cache.update_config( + val=self._account_id, keys=["cloudflare", "credentials", "account_id"] + ) def serialize(self) -> dict: out = {} @@ -144,7 +151,7 @@ class CloudflareResources(Resources): """ Resources for Cloudflare Workers deployment. """ - + def __init__(self): super().__init__(name="cloudflare") self._namespace_id: Optional[str] = None @@ -165,10 +172,10 @@ def namespace_id(self, value: str): def initialize(res: Resources, dct: dict): ret = cast(CloudflareResources, res) super(CloudflareResources, CloudflareResources).initialize(ret, dct) - + if "namespace_id" in dct: ret._namespace_id = dct["namespace_id"] - + return ret def serialize(self) -> dict: @@ -181,8 +188,7 @@ def update_cache(self, cache: Cache): super().update_cache(cache) if self._namespace_id: cache.update_config( - val=self._namespace_id, - keys=["cloudflare", "resources", "namespace_id"] + val=self._namespace_id, keys=["cloudflare", "resources", "namespace_id"] ) @staticmethod @@ -200,7 +206,9 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Resour if "resources" in config: CloudflareResources.initialize(ret, config["resources"]) ret.logging_handlers = handlers - ret.logging.info("No cached resources for Cloudflare found, using user configuration.") + ret.logging.info( + "No cached resources for Cloudflare found, using user configuration." + ) else: CloudflareResources.initialize(ret, {}) ret.logging_handlers = handlers @@ -213,7 +221,7 @@ class CloudflareConfig(Config): """ Configuration for Cloudflare Workers platform. 
""" - + def __init__(self, credentials: CloudflareCredentials, resources: CloudflareResources): super().__init__(name="cloudflare") self._credentials = credentials @@ -240,13 +248,15 @@ def initialize(cfg: Config, dct: dict): @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config: cached_config = cache.get_config("cloudflare") - credentials = cast(CloudflareCredentials, - CloudflareCredentials.deserialize(config, cache, handlers)) - resources = cast(CloudflareResources, - CloudflareResources.deserialize(config, cache, handlers)) + credentials = cast( + CloudflareCredentials, CloudflareCredentials.deserialize(config, cache, handlers) + ) + resources = cast( + CloudflareResources, CloudflareResources.deserialize(config, cache, handlers) + ) config_obj = CloudflareConfig(credentials, resources) config_obj.logging_handlers = handlers - + # Load cached values if cached_config: config_obj.logging.info("Using cached config for Cloudflare") diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index a16cc1e1d..b02447d9f 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -12,6 +12,7 @@ import time from importlib.resources import files + try: import tomllib # Python 3.11+ except ImportError: @@ -87,29 +88,31 @@ def generate_wrangler_toml( """ # Load template template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-container.toml") - with open(template_path, 'rb') as f: + with open(template_path, "rb") as f: config = tomllib.load(f) # Update basic configuration - config['name'] = worker_name - config['account_id'] = account_id + config["name"] = worker_name + config["account_id"] = account_id if container_uri and container_uri.startswith("registry.cloudflare.com"): # Pre-built image already pushed to Cloudflare registry — point wrangler # at it directly so it skips the Docker build step entirely. 
- config['containers'][0]['image'] = container_uri + config["containers"][0]["image"] = container_uri else: # Fallback: let wrangler build from the local Dockerfile. if self._base_image: - config['containers'][0]['build_args'] = {"BASE_IMAGE": self._base_image} + config["containers"][0]["build_args"] = {"BASE_IMAGE": self._base_image} # Update container configuration with instance type if needed - if benchmark_name and ("411.image-recognition" in benchmark_name or - "311.compression" in benchmark_name or - "504.dna-visualisation" in benchmark_name): + if benchmark_name and ( + "411.image-recognition" in benchmark_name + or "311.compression" in benchmark_name + or "504.dna-visualisation" in benchmark_name + ): self.logging.warning("Using standard-4 instance type for high resource benchmark") - config['containers'][0]['instance_type'] = "standard-4" - + config["containers"][0]["instance_type"] = "standard-4" + # Add nosql KV namespace bindings if benchmark uses them if code_package and code_package.uses_nosql: # Get registered nosql tables for this benchmark @@ -118,23 +121,26 @@ def generate_wrangler_toml( if nosql_storage.retrieve_cache(benchmark_for_nosql): nosql_tables = nosql_storage.get_tables(benchmark_for_nosql) if nosql_tables: - config['kv_namespaces'] = config.get('kv_namespaces', []) + config["kv_namespaces"] = config.get("kv_namespaces", []) for table_name, namespace_id in nosql_tables.items(): - config['kv_namespaces'].append({ - 'binding': table_name, - 'id': namespace_id, - }) - + config["kv_namespaces"].append( + { + "binding": table_name, + "id": namespace_id, + } + ) + # Add environment variables if benchmark_name or (code_package and code_package.uses_nosql): - config['vars'] = {} + config["vars"] = {} if benchmark_name: - config['vars']['BENCHMARK_NAME'] = benchmark_name + config["vars"]["BENCHMARK_NAME"] = benchmark_name if code_package and code_package.uses_nosql: - config['vars']['NOSQL_STORAGE_DATABASE'] = "kvstore" - + 
config["vars"]["NOSQL_STORAGE_DATABASE"] = "kvstore" + # Add R2 bucket binding from sebs.faas.config import Resources + storage = self.system_resources.get_storage() bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) if not bucket_name: @@ -142,23 +148,20 @@ def generate_wrangler_toml( "R2 bucket binding not configured: benchmarks bucket name is empty. " "Benchmarks requiring file access will not work properly." ) - config['r2_buckets'] = [{ - 'binding': 'R2', - 'bucket_name': bucket_name - }] + config["r2_buckets"] = [{"binding": "R2", "bucket_name": bucket_name}] self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") - + # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") try: # Try tomli_w (writes binary) - with open(toml_path, 'wb') as f: + with open(toml_path, "wb") as f: tomli_w.dump(config, f) except TypeError: # Fallback to toml library (writes text) - with open(toml_path, 'w') as f: + with open(toml_path, "w") as f: f.write(tomli_w.dumps(config)) - + self.logging.info(f"Generated wrangler.toml at {toml_path}") return toml_path @@ -172,7 +175,7 @@ def package_code( ) -> Tuple[str, int, str]: """ Package code for Cloudflare container worker deployment. - + Builds a Docker image and returns the image tag for deployment. Args: @@ -192,9 +195,7 @@ def package_code( wrapper_container_dir = os.path.join(wrapper_base, language_name, "container") if not os.path.exists(wrapper_container_dir): - raise RuntimeError( - f"Container wrapper directory not found: {wrapper_container_dir}" - ) + raise RuntimeError(f"Container wrapper directory not found: {wrapper_container_dir}") # Overwrite the wrapper files staged by add_deployment_files() with the # container-specific versions before doing anything else. 
@@ -220,9 +221,7 @@ def package_code( # Copy Dockerfile.function from dockerfiles/cloudflare/{language}/ dockerfile_src = str( - get_resource_path( - "dockerfiles", "cloudflare", language_name, "Dockerfile.function" - ) + get_resource_path("dockerfiles", "cloudflare", language_name, "Dockerfile.function") ) dockerfile_dest = os.path.join(directory, "Dockerfile") if os.path.exists(dockerfile_src): @@ -257,6 +256,7 @@ def package_code( # Copy init.sh if the benchmark needs it (e.g. video-processing downloads ffmpeg) from sebs.utils import find_benchmark + benchmark_path = find_benchmark(benchmark, "benchmarks") if benchmark_path: for path in [benchmark_path, os.path.join(benchmark_path, language_name)]: @@ -276,12 +276,12 @@ def package_code( f"package.json not found at {package_json_path} " f"for nodejs benchmark '{benchmark}'" ) - with open(package_json_path, 'r') as f: + with open(package_json_path, "r") as f: package_json = json.load(f) else: package_json = {} package_json.setdefault("dependencies", {})["@cloudflare/containers"] = "*" - with open(package_json_path, 'w') as f: + with open(package_json_path, "w") as f: json.dump(package_json, f, indent=2) # For Python containers, promote the versioned requirements.txt to requirements.txt @@ -294,7 +294,7 @@ def package_code( elif not os.path.exists(requirements_file): open(requirements_file, "w").close() self.logging.info("Created empty requirements.txt") - + # Build the image locally. cache.py requires docker_client.images.get() to # succeed for container deployments, and the local image is what we push to # Cloudflare's registry during deploy (wrangler containers push). @@ -312,7 +312,7 @@ def package_code( self.logging.info(f"Container package prepared (image tag: {image_tag})") return (directory, total_size, image_tag) - + def _build_container_image_local( self, directory: str, @@ -330,7 +330,9 @@ def _build_container_image_local( Returns the local image tag. 
""" # Generate image tag - image_name = f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" + image_name = ( + f"{benchmark.replace('.', '-')}-{language_name}-{language_version.replace('.', '')}" + ) version_tag = time.strftime("%Y%m%d-%H%M%S") image_tag = f"{image_name}:{version_tag}" @@ -338,11 +340,15 @@ def _build_container_image_local( result = subprocess.run( [ - "docker", "buildx", "build", - "--platform", "linux/amd64", + "docker", + "buildx", + "build", + "--platform", + "linux/amd64", "--load", "--no-cache", - "-t", image_tag, + "-t", + image_tag, directory, ], capture_output=True, @@ -356,10 +362,7 @@ def _build_container_image_local( return image_tag def wait_for_container_worker_ready( - self, - worker_name: str, - worker_url: str, - max_wait_seconds: int = 400 + self, worker_name: str, worker_url: str, max_wait_seconds: int = 400 ) -> bool: """ Wait for container worker to be fully provisioned and ready. @@ -374,20 +377,17 @@ def wait_for_container_worker_ready( """ wait_interval = 10 start_time = time.time() - + self.logging.info("Checking container worker readiness via health endpoint...") - + consecutive_failures = 0 max_consecutive_failures = 5 - + while time.time() - start_time < max_wait_seconds: try: # Use health check endpoint - response = requests.get( - f"{worker_url}/health", - timeout=60 - ) - + response = requests.get(f"{worker_url}/health", timeout=60) + # 200 = ready if response.status_code == 200: self.logging.info("Container worker is ready!") @@ -401,17 +401,21 @@ def wait_for_container_worker_ready( ) # Other errors else: - self.logging.warning(f"Unexpected status {response.status_code}: {response.text[:100]}") - + self.logging.warning( + f"Unexpected status {response.status_code}: {response.text[:100]}" + ) + except requests.exceptions.Timeout: elapsed = int(time.time() - start_time) - self.logging.info(f"Health check timeout (container may be starting)... 
({elapsed}s elapsed)") + self.logging.info( + f"Health check timeout (container may be starting)... ({elapsed}s elapsed)" + ) except requests.exceptions.RequestException as e: elapsed = int(time.time() - start_time) self.logging.debug(f"Connection error ({elapsed}s): {str(e)[:100]}") - + time.sleep(wait_interval) - + raise RuntimeError( f"Container worker {worker_name} did not become ready after {max_wait_seconds}s. " "Deployment cannot proceed without a healthy container." diff --git a/sebs/cloudflare/function.py b/sebs/cloudflare/function.py index cd422dc30..355d9536a 100644 --- a/sebs/cloudflare/function.py +++ b/sebs/cloudflare/function.py @@ -6,10 +6,10 @@ class CloudflareWorker(Function): """ Cloudflare Workers function implementation. - + A Cloudflare Worker is a serverless function that runs on Cloudflare's edge network. """ - + def __init__( self, name: str, @@ -52,10 +52,10 @@ def deserialize(cached_config: dict) -> "CloudflareWorker": cfg, cached_config.get("account_id"), ) - + for trigger in cached_config["triggers"]: trigger_type = HTTPTrigger if trigger["type"] == HTTPTrigger.typename() else None assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) ret.add_trigger(trigger_type.deserialize(trigger)) - + return ret diff --git a/sebs/cloudflare/kvstore.py b/sebs/cloudflare/kvstore.py index 11c71c7e3..4601087b9 100644 --- a/sebs/cloudflare/kvstore.py +++ b/sebs/cloudflare/kvstore.py @@ -187,7 +187,9 @@ def _delete_namespace(self, namespace_id: str) -> None: if response.content: payload = response.json() if not payload.get("success"): - raise RuntimeError(f"Failed to delete KV namespace {namespace_id}: {payload.get('errors')}") + raise RuntimeError( + f"Failed to delete KV namespace {namespace_id}: {payload.get('errors')}" + ) @staticmethod def _compose_key( @@ -321,7 +323,9 @@ def create_table( namespace_id = payload.get("result", {}).get("id") if not namespace_id: - raise RuntimeError(f"Cloudflare KV API did not return namespace id for 
{namespace_title}") + raise RuntimeError( + f"Cloudflare KV API did not return namespace id for {namespace_title}" + ) self._tables[benchmark][name] = namespace_id self.logging.info( diff --git a/sebs/cloudflare/pyodide_packages.py b/sebs/cloudflare/pyodide_packages.py index acae0bc41..d4e89320e 100644 --- a/sebs/cloudflare/pyodide_packages.py +++ b/sebs/cloudflare/pyodide_packages.py @@ -9,56 +9,287 @@ from typing import FrozenSet, Optional -SUPPORTED_PYODIDE_PACKAGES: FrozenSet[str] = frozenset({ - "affine", "aiohappyeyeballs", "aiohttp", "aiosignal", "altair", - "annotated-types", "anyio", "apsw", "argon2-cffi", "argon2-cffi-bindings", - "asciitree", "astropy", "astropy_iers_data", "asttokens", "async-timeout", - "atomicwrites", "attrs", "audioop-lts", "autograd", "awkward-cpp", "b2d", - "bcrypt", "beautifulsoup4", "bilby.cython", "biopython", "bitarray", - "bitstring", "bleach", "blosc2", "bokeh", "boost-histogram", "brotli", - "cachetools", "casadi", "cbor-diag", "certifi", "cffi", "cffi_example", - "cftime", "charset-normalizer", "clarabel", "click", "cligj", "clingo", - "cloudpickle", "cmyt", "cobs", "colorspacious", "contourpy", "coolprop", - "coverage", "cramjam", "crc32c", "cryptography", "css-inline", "cssselect", - "cvxpy-base", "cycler", "cysignals", "cytoolz", "decorator", "demes", - "deprecation", "diskcache", "distlib", "distro", "docutils", "donfig", - "ewah_bool_utils", "exceptiongroup", "executing", "fastapi", "fastcan", - "fastparquet", "fiona", "fonttools", "freesasa", "frozenlist", "fsspec", - "future", "galpy", "gmpy2", "gsw", "h11", "h3", "h5py", "highspy", - "html5lib", "httpcore", "httpx", "idna", "igraph", "imageio", "imgui-bundle", - "iminuit", "iniconfig", "inspice", "ipython", "jedi", "Jinja2", "jiter", - "joblib", "jsonpatch", "jsonpointer", "jsonschema", "jsonschema_specifications", - "kiwisolver", "lakers-python", "lazy_loader", "lazy-object-proxy", "libcst", - "lightgbm", "logbook", "lxml", "lz4", "MarkupSafe", "matplotlib", - 
"matplotlib-inline", "memory-allocator", "micropip", "mmh3", "more-itertools", - "mpmath", "msgpack", "msgspec", "msprime", "multidict", "munch", "mypy", - "narwhals", "ndindex", "netcdf4", "networkx", "newick", "nh3", "nlopt", - "nltk", "numcodecs", "numpy", "openai", "opencv-python", "optlang", "orjson", - "packaging", "pandas", "parso", "patsy", "pcodec", "peewee", "pi-heif", - "Pillow", "pillow-heif", "pkgconfig", "platformdirs", "pluggy", "ply", - "pplpy", "primecountpy", "prompt_toolkit", "propcache", "protobuf", - "pure-eval", "py", "pyclipper", "pycparser", "pycryptodome", "pydantic", - "pydantic_core", "pyerfa", "pygame-ce", "Pygments", "pyheif", "pyiceberg", - "pyinstrument", "pylimer-tools", "PyMuPDF", "pynacl", "pyodide-http", - "pyodide-unix-timezones", "pyparsing", "pyrsistent", "pysam", "pyshp", - "pytaglib", "pytest", "pytest-asyncio", "pytest-benchmark", "pytest_httpx", - "python-calamine", "python-dateutil", "python-flint", "python-magic", - "python-sat", "python-solvespace", "pytz", "pywavelets", "pyxel", "pyxirr", - "pyyaml", "rasterio", "rateslib", "rebound", "reboundx", "referencing", - "regex", "requests", "retrying", "rich", "river", "RobotRaconteur", - "rpds-py", "ruamel.yaml", "rustworkx", "scikit-image", "scikit-learn", - "scipy", "screed", "setuptools", "shapely", "simplejson", "sisl", "six", - "smart-open", "sniffio", "sortedcontainers", "soundfile", "soupsieve", - "sourmash", "soxr", "sparseqr", "sqlalchemy", "stack-data", "starlette", - "statsmodels", "strictyaml", "svgwrite", "swiglpk", "sympy", "tblib", - "termcolor", "texttable", "texture2ddecoder", "threadpoolctl", "tiktoken", - "tomli", "tomli-w", "toolz", "tqdm", "traitlets", "traits", "tree-sitter", - "tree-sitter-go", "tree-sitter-java", "tree-sitter-python", "tskit", - "typing-extensions", "tzdata", "ujson", "uncertainties", "unyt", "urllib3", - "vega-datasets", "vrplib", "wcwidth", "webencodings", "wordcloud", "wrapt", - "xarray", "xgboost", "xlrd", "xxhash", "xyzservices", 
"yarl", "yt", "zengl", - "zfpy", "zstandard", -}) +SUPPORTED_PYODIDE_PACKAGES: FrozenSet[str] = frozenset( + { + "affine", + "aiohappyeyeballs", + "aiohttp", + "aiosignal", + "altair", + "annotated-types", + "anyio", + "apsw", + "argon2-cffi", + "argon2-cffi-bindings", + "asciitree", + "astropy", + "astropy_iers_data", + "asttokens", + "async-timeout", + "atomicwrites", + "attrs", + "audioop-lts", + "autograd", + "awkward-cpp", + "b2d", + "bcrypt", + "beautifulsoup4", + "bilby.cython", + "biopython", + "bitarray", + "bitstring", + "bleach", + "blosc2", + "bokeh", + "boost-histogram", + "brotli", + "cachetools", + "casadi", + "cbor-diag", + "certifi", + "cffi", + "cffi_example", + "cftime", + "charset-normalizer", + "clarabel", + "click", + "cligj", + "clingo", + "cloudpickle", + "cmyt", + "cobs", + "colorspacious", + "contourpy", + "coolprop", + "coverage", + "cramjam", + "crc32c", + "cryptography", + "css-inline", + "cssselect", + "cvxpy-base", + "cycler", + "cysignals", + "cytoolz", + "decorator", + "demes", + "deprecation", + "diskcache", + "distlib", + "distro", + "docutils", + "donfig", + "ewah_bool_utils", + "exceptiongroup", + "executing", + "fastapi", + "fastcan", + "fastparquet", + "fiona", + "fonttools", + "freesasa", + "frozenlist", + "fsspec", + "future", + "galpy", + "gmpy2", + "gsw", + "h11", + "h3", + "h5py", + "highspy", + "html5lib", + "httpcore", + "httpx", + "idna", + "igraph", + "imageio", + "imgui-bundle", + "iminuit", + "iniconfig", + "inspice", + "ipython", + "jedi", + "Jinja2", + "jiter", + "joblib", + "jsonpatch", + "jsonpointer", + "jsonschema", + "jsonschema_specifications", + "kiwisolver", + "lakers-python", + "lazy_loader", + "lazy-object-proxy", + "libcst", + "lightgbm", + "logbook", + "lxml", + "lz4", + "MarkupSafe", + "matplotlib", + "matplotlib-inline", + "memory-allocator", + "micropip", + "mmh3", + "more-itertools", + "mpmath", + "msgpack", + "msgspec", + "msprime", + "multidict", + "munch", + "mypy", + "narwhals", + "ndindex", + 
"netcdf4", + "networkx", + "newick", + "nh3", + "nlopt", + "nltk", + "numcodecs", + "numpy", + "openai", + "opencv-python", + "optlang", + "orjson", + "packaging", + "pandas", + "parso", + "patsy", + "pcodec", + "peewee", + "pi-heif", + "Pillow", + "pillow-heif", + "pkgconfig", + "platformdirs", + "pluggy", + "ply", + "pplpy", + "primecountpy", + "prompt_toolkit", + "propcache", + "protobuf", + "pure-eval", + "py", + "pyclipper", + "pycparser", + "pycryptodome", + "pydantic", + "pydantic_core", + "pyerfa", + "pygame-ce", + "Pygments", + "pyheif", + "pyiceberg", + "pyinstrument", + "pylimer-tools", + "PyMuPDF", + "pynacl", + "pyodide-http", + "pyodide-unix-timezones", + "pyparsing", + "pyrsistent", + "pysam", + "pyshp", + "pytaglib", + "pytest", + "pytest-asyncio", + "pytest-benchmark", + "pytest_httpx", + "python-calamine", + "python-dateutil", + "python-flint", + "python-magic", + "python-sat", + "python-solvespace", + "pytz", + "pywavelets", + "pyxel", + "pyxirr", + "pyyaml", + "rasterio", + "rateslib", + "rebound", + "reboundx", + "referencing", + "regex", + "requests", + "retrying", + "rich", + "river", + "RobotRaconteur", + "rpds-py", + "ruamel.yaml", + "rustworkx", + "scikit-image", + "scikit-learn", + "scipy", + "screed", + "setuptools", + "shapely", + "simplejson", + "sisl", + "six", + "smart-open", + "sniffio", + "sortedcontainers", + "soundfile", + "soupsieve", + "sourmash", + "soxr", + "sparseqr", + "sqlalchemy", + "stack-data", + "starlette", + "statsmodels", + "strictyaml", + "svgwrite", + "swiglpk", + "sympy", + "tblib", + "termcolor", + "texttable", + "texture2ddecoder", + "threadpoolctl", + "tiktoken", + "tomli", + "tomli-w", + "toolz", + "tqdm", + "traitlets", + "traits", + "tree-sitter", + "tree-sitter-go", + "tree-sitter-java", + "tree-sitter-python", + "tskit", + "typing-extensions", + "tzdata", + "ujson", + "uncertainties", + "unyt", + "urllib3", + "vega-datasets", + "vrplib", + "wcwidth", + "webencodings", + "wordcloud", + "wrapt", + "xarray", 
+ "xgboost", + "xlrd", + "xxhash", + "xyzservices", + "yarl", + "yt", + "zengl", + "zfpy", + "zstandard", + } +) _CANONICAL_BY_LOWER = {name.lower(): name for name in SUPPORTED_PYODIDE_PACKAGES} diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 1a03ab3d7..164e3d431 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -8,6 +8,8 @@ from sebs.cache import Cache from typing import List, Optional + + class R2(PersistentStorage): @staticmethod def typename() -> str: @@ -56,12 +58,12 @@ def _get_auth_headers(self) -> dict[str, str]: def _get_s3_client(self): """ Get or initialize the S3-compatible client for R2 operations. - + :return: boto3 S3 client or None if credentials not available """ if self._s3_client is not None: return self._s3_client - + # Check if we have S3-compatible credentials if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: self.logging.warning( @@ -69,28 +71,26 @@ def _get_s3_client(self): "Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY environment variables." ) return None - + try: import boto3 from botocore.config import Config - + account_id = self._credentials.account_id - + self._s3_client = boto3.client( - 's3', - endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com', + "s3", + endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com", aws_access_key_id=self._credentials.r2_access_key_id, aws_secret_access_key=self._credentials.r2_secret_access_key, - config=Config(signature_version='s3v4'), - region_name='auto' + config=Config(signature_version="s3v4"), + region_name="auto", ) - + return self._s3_client - + except ImportError: - self.logging.warning( - "boto3 not available. Install with: pip install boto3" - ) + self.logging.warning("boto3 not available. 
Install with: pip install boto3") return None def correct_name(self, name: str) -> str: @@ -102,24 +102,20 @@ def _create_bucket( for bucket_name in buckets: if name in bucket_name: self.logging.info( - "Bucket {} for {} already exists, skipping.".format( - bucket_name, name - ) + "Bucket {} for {} already exists, skipping.".format(bucket_name, name) ) return bucket_name account_id = self._credentials.account_id - create_bucket_uri = ( - f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" - ) + create_bucket_uri = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" # R2 API only accepts "name" parameter - locationHint is optional and must be one of: # "apac", "eeur", "enam", "weur", "wnam" # WARNING: locationHint is not currently supported by SeBS. Buckets are created # with Cloudflare's automatic location selection. params = {"name": name} - + self.logging.warning( f"Creating R2 bucket '{name}' without locationHint. " "Geographic location is determined automatically by Cloudflare." @@ -129,7 +125,7 @@ def _create_bucket( create_bucket_response = requests.post( create_bucket_uri, json=params, headers=self._get_auth_headers() ) - + # Log the response for debugging if create_bucket_response.status_code >= 400: try: @@ -143,9 +139,9 @@ def _create_bucket( f"R2 bucket creation failed. 
Status: {create_bucket_response.status_code}, " f"Response: {create_bucket_response.text}" ) - + create_bucket_response.raise_for_status() - + bucket_info_json = create_bucket_response.json() if not bucket_info_json.get("success"): @@ -155,7 +151,7 @@ def _create_bucket( bucket_name = bucket_info_json.get("result", {}).get("name", name) self.logging.info(f"Created R2 bucket {bucket_name}") return bucket_name - + except requests.exceptions.RequestException as e: self.logging.error(f"Error creating R2 bucket {name}: {e}") raise @@ -178,16 +174,14 @@ def download(self, bucket_name: str, key: str, filepath: str) -> None: if dirname: os.makedirs(dirname, exist_ok=True) s3_client.download_file(bucket_name, key, filepath) - self.logging.debug( - f"Downloaded {key} from R2 bucket {bucket_name} to {filepath}" - ) + self.logging.debug(f"Downloaded {key} from R2 bucket {bucket_name} to {filepath}") except Exception as e: self.logging.warning(f"Failed to download {key} from R2: {e}") def upload(self, bucket_name: str, filepath: str, key: str): """ Upload a file to R2 bucket using the S3-compatible API. - + Requires S3 credentials to be configured for the R2 bucket. :param bucket_name: R2 bucket name @@ -198,24 +192,20 @@ def upload(self, bucket_name: str, filepath: str, key: str): if s3_client is None: self.logging.warning(f"Cannot upload {filepath} to R2 - S3 client not available") return - + try: - with open(filepath, 'rb') as f: - s3_client.put_object( - Bucket=bucket_name, - Key=key, - Body=f - ) - + with open(filepath, "rb") as f: + s3_client.put_object(Bucket=bucket_name, Key=key, Body=f) + self.logging.debug(f"Uploaded {filepath} to R2 bucket {bucket_name} as {key}") - + except Exception as e: self.logging.warning(f"Failed to upload {filepath} to R2: {e}") - + def upload_bytes(self, bucket_name: str, key: str, data: bytes): """ Upload bytes directly to R2 bucket using the S3-compatible API. 
- + :param bucket_name: R2 bucket name :param key: R2 destination key/path :param data: bytes to upload @@ -224,23 +214,19 @@ def upload_bytes(self, bucket_name: str, key: str, data: bytes): if s3_client is None: self.logging.warning(f"Cannot upload bytes to R2 - S3 client not available") return - + try: - s3_client.put_object( - Bucket=bucket_name, - Key=key, - Body=data - ) - + s3_client.put_object(Bucket=bucket_name, Key=key, Body=data) + self.logging.debug(f"Uploaded {len(data)} bytes to R2 bucket {bucket_name} as {key}") - + except Exception as e: self.logging.warning(f"Failed to upload bytes to R2: {e}") def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: """ Retrieves list of files in a bucket using S3-compatible API. - + :param bucket_name: :param prefix: optional prefix filter :return: list of files in a given bucket @@ -253,14 +239,14 @@ def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: ) try: - paginator = s3_client.get_paginator('list_objects_v2') + paginator = s3_client.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix) files = [] for page in page_iterator: - if 'Contents' in page: - for obj in page['Contents']: - files.append(obj['Key']) + if "Contents" in page: + for obj in page["Contents"]: + files.append(obj["Key"]) return files @@ -270,16 +256,14 @@ def list_bucket(self, bucket_name: str, prefix: str = "") -> List[str]: def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: """ List all R2 buckets in the account. 
- + :param bucket_name: optional filter (not used for R2) :return: list of bucket names """ account_id = self._credentials.account_id - - list_buckets_uri = ( - f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" - ) - + + list_buckets_uri = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets" + try: response = requests.get(list_buckets_uri, headers=self._get_auth_headers()) @@ -299,9 +283,7 @@ def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: data = response.json() if not data.get("success"): - raise RuntimeError( - f"Failed to list R2 buckets: {data.get('errors')}" - ) + raise RuntimeError(f"Failed to list R2 buckets: {data.get('errors')}") buckets = data.get("result", {}).get("buckets", []) bucket_names = [bucket["name"] for bucket in buckets] @@ -315,7 +297,7 @@ def list_buckets(self, bucket_name: Optional[str] = None) -> List[str]: def exists_bucket(self, bucket_name: str) -> bool: """ Check if a bucket exists. - + :param bucket_name: :return: True if bucket exists """ @@ -325,7 +307,7 @@ def exists_bucket(self, bucket_name: str) -> bool: def clean_bucket(self, bucket_name: str): """ Remove all objects from a bucket. - + :param bucket_name: """ self.logging.warning(f"clean_bucket not fully implemented for R2 bucket {bucket_name}") @@ -334,33 +316,33 @@ def clean_bucket(self, bucket_name: str): def remove_bucket(self, bucket: str): """ Delete a bucket. 
- + :param bucket: """ account_id = self._credentials.account_id - + delete_bucket_uri = ( f"https://api.cloudflare.com/client/v4/accounts/{account_id}/r2/buckets/{bucket}" ) - + try: response = requests.delete(delete_bucket_uri, headers=self._get_auth_headers()) response.raise_for_status() - + data = response.json() - + if data.get("success"): self.logging.info(f"Successfully deleted R2 bucket {bucket}") else: self.logging.error(f"Failed to delete R2 bucket {bucket}: {data.get('errors')}") - + except requests.exceptions.RequestException as e: self.logging.error(f"Error deleting R2 bucket {bucket}: {e}") def uploader_func(self, bucket_idx: int, file: str, filepath: str) -> None: """ Upload a file to a bucket (used for parallel uploads). - + :param bucket_idx: index of the bucket/prefix to upload to :param file: destination file name/key :param filepath: source file path @@ -373,12 +355,14 @@ def uploader_func(self, bucket_idx: int, file: str, filepath: str) -> None: key = os.path.join(self.input_prefixes[bucket_idx], file) bucket_name = self.get_bucket(Resources.StorageBucketType.BENCHMARKS) - + # Check if file already exists (if not replacing existing files) if not self.replace_existing: for f in self.input_prefixes_files[bucket_idx]: if key == f: - self.logging.info(f"Skipping upload of {filepath} to {bucket_name} (already exists)") + self.logging.info( + f"Skipping upload of {filepath} to {bucket_name} (already exists)" + ) return # Upload the file diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index aa5f17faf..d0dd8f6fc 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -10,6 +10,7 @@ class ContainerProvisioningError(RuntimeError): """Raised when Cloudflare reports the container is still provisioning.""" + pass @@ -18,7 +19,7 @@ class HTTPTrigger(Trigger): HTTP trigger for Cloudflare Workers. Workers are automatically accessible via HTTPS endpoints. 
""" - + def __init__(self, worker_name: str, url: Optional[str] = None): super().__init__() self.worker_name = worker_name @@ -52,11 +53,14 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec import pycurl c = pycurl.Curl() - c.setopt(pycurl.HTTPHEADER, [ - "Content-Type: application/json", - # Cloudflare bot-protection (error 1010) blocks requests with no/tool UA. - "User-Agent: Mozilla/5.0 (compatible; SeBS/1.0; +https://github.com/spcl/serverless-benchmarks)", - ]) + c.setopt( + pycurl.HTTPHEADER, + [ + "Content-Type: application/json", + # Cloudflare bot-protection (error 1010) blocks requests with no/tool UA. + "User-Agent: Mozilla/5.0 (compatible; SeBS/1.0; +https://github.com/spcl/serverless-benchmarks)", + ], + ) c.setopt(pycurl.POST, 1) c.setopt(pycurl.URL, url) if not verify_ssl: @@ -85,14 +89,14 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec if status_code == 502: self.logging.info(f"Container returned 502 (still starting?), will retry...") raise ContainerProvisioningError(f"502 gateway error from container worker") - + # Check for Cloudflare error code 1042 (CPU time limit / worker not ready) # Output may be a plain string like "error code: 1042" rather than a dict. 
output_str = str(output) if "1042" in output_str and "error code" in output_str: self.logging.info(f"Worker returned error 1042 (CPU time limit), will retry...") raise ContainerProvisioningError(f"Error 1042 from worker: {output_str}") - + if status_code != 200: self.logging.error(f"Invocation on URL {url} failed!") self.logging.error(f"Output: {output}") @@ -118,11 +122,11 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec if "1042" in raw_text and "error code" in raw_text: self.logging.info(f"Worker returned error 1042 (CPU time limit), will retry...") raise ContainerProvisioningError(f"Error 1042 from worker: {raw_text[:200]}") - if status_code == 502 or any(p.lower() in raw_text.lower() for p in provisioning_phrases): + if status_code == 502 or any( + p.lower() in raw_text.lower() for p in provisioning_phrases + ): self.logging.info(f"Container still provisioning (URL {url}): {raw_text[:120]}") - raise ContainerProvisioningError( - f"Container not yet available: {raw_text[:200]}" - ) + raise ContainerProvisioningError(f"Container not yet available: {raw_text[:200]}") self.logging.error(f"Invocation on URL {url} failed!") if raw_text: self.logging.error(f"Output: {raw_text}") @@ -148,40 +152,40 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: time.sleep(provisioning_retry_wait) else: raise - + # Extract measurement data from the response if available - if result.output and 'result' in result.output: # type: ignore[union-attr] - result_data = result.output['result'] - if isinstance(result_data, dict) and 'measurement' in result_data: - measurement = result_data['measurement'] - + if result.output and "result" in result.output: # type: ignore[union-attr] + result_data = result.output["result"] + if isinstance(result_data, dict) and "measurement" in result_data: + measurement = result_data["measurement"] + # Extract timing metrics if provided by the benchmark if isinstance(measurement, dict): # CPU time in microseconds - 
if 'cpu_time_us' in measurement: - result.provider_times.execution = measurement['cpu_time_us'] - elif 'cpu_time_ms' in measurement: - result.provider_times.execution = int(measurement['cpu_time_ms'] * 1000) - + if "cpu_time_us" in measurement: + result.provider_times.execution = measurement["cpu_time_us"] + elif "cpu_time_ms" in measurement: + result.provider_times.execution = int(measurement["cpu_time_ms"] * 1000) + # Wall time in microseconds - if 'wall_time_us' in measurement: - result.times.benchmark = measurement['wall_time_us'] - elif 'wall_time_ms' in measurement: - result.times.benchmark = int(measurement['wall_time_ms'] * 1000) - + if "wall_time_us" in measurement: + result.times.benchmark = measurement["wall_time_us"] + elif "wall_time_ms" in measurement: + result.times.benchmark = int(measurement["wall_time_ms"] * 1000) + # Cold/warm start detection - if 'is_cold' in measurement: - result.stats.cold_start = measurement['is_cold'] - + if "is_cold" in measurement: + result.stats.cold_start = measurement["is_cold"] + # Memory usage if available - if 'memory_used_mb' in measurement: - result.stats.memory_used = measurement['memory_used_mb'] - + if "memory_used_mb" in measurement: + result.stats.memory_used = measurement["memory_used_mb"] + # Store the full measurement for later analysis - result.output['measurement'] = measurement - + result.output["measurement"] = measurement + self.logging.debug(f"Extracted measurements: {measurement}") - + return result def async_invoke(self, payload: dict) -> concurrent.futures.Future: diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 1a373de26..121cfdb4b 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -10,6 +10,7 @@ import shutil import json from importlib.resources import files + try: import tomllib # Python 3.11+ except ImportError: @@ -81,38 +82,30 @@ def generate_wrangler_toml( """ # Load template template_path = files("sebs.cloudflare").joinpath("templates", 
"wrangler-worker.toml") - with open(template_path, 'rb') as f: + with open(template_path, "rb") as f: config = tomllib.load(f) # Update basic configuration - config['name'] = worker_name - config['account_id'] = account_id + config["name"] = worker_name + config["account_id"] = account_id # Add language- and variant-specific configuration. # For Node.js workers, we always bundle through build.js into dist/, # regardless of language variant (default/cloudflare), because the # wrangler entrypoint points to dist/handler.js. if language == "nodejs": - config['main'] = "dist/handler.js" - config['compatibility_flags'] = ["nodejs_compat"] - config['no_bundle'] = True - config['rules'] = [ - { - 'type': 'ESModule', - 'globs': ['**/*.js'], - 'fallthrough': True - }, - { - 'type': 'Text', - 'globs': ['**/*.html'], - 'fallthrough': True - } + config["main"] = "dist/handler.js" + config["compatibility_flags"] = ["nodejs_compat"] + config["no_bundle"] = True + config["rules"] = [ + {"type": "ESModule", "globs": ["**/*.js"], "fallthrough": True}, + {"type": "Text", "globs": ["**/*.html"], "fallthrough": True}, ] elif language == "python": - config['main'] = "handler.py" - config['compatibility_flags'] = ["python_workers"] + config["main"] = "handler.py" + config["compatibility_flags"] = ["python_workers"] else: - config['main'] = "dist/handler.js" if language == "nodejs" else "handler.py" + config["main"] = "dist/handler.js" if language == "nodejs" else "handler.py" # Add NoSQL KV namespace bindings if benchmark uses them if code_package and code_package.uses_nosql: @@ -121,50 +114,50 @@ def generate_wrangler_toml( if nosql_storage.retrieve_cache(benchmark_for_nosql): nosql_tables = nosql_storage.get_tables(benchmark_for_nosql) if nosql_tables: - config['kv_namespaces'] = [] + config["kv_namespaces"] = [] for table_name, namespace_id in nosql_tables.items(): - config['kv_namespaces'].append({ - 'binding': table_name, - 'id': namespace_id, - }) - + 
config["kv_namespaces"].append( + { + "binding": table_name, + "id": namespace_id, + } + ) + # Add environment variables if benchmark_name or (code_package and code_package.uses_nosql): - config['vars'] = {} + config["vars"] = {} if benchmark_name: - config['vars']['BENCHMARK_NAME'] = benchmark_name + config["vars"]["BENCHMARK_NAME"] = benchmark_name if code_package and code_package.uses_nosql: - config['vars']['NOSQL_STORAGE_DATABASE'] = "kvstore" - + config["vars"]["NOSQL_STORAGE_DATABASE"] = "kvstore" + # Add R2 bucket binding try: from sebs.faas.config import Resources + storage = self.system_resources.get_storage() bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) if bucket_name: - config['r2_buckets'] = [{ - 'binding': 'R2', - 'bucket_name': bucket_name - }] + config["r2_buckets"] = [{"binding": "R2", "bucket_name": bucket_name}] self.logging.info(f"R2 bucket '{bucket_name}' will be bound to worker as 'R2'") except Exception as e: self.logging.warning( f"R2 bucket binding not configured: {e}. " f"Benchmarks requiring file access will not work properly." 
) - + # Write wrangler.toml to package directory toml_path = os.path.join(package_dir, "wrangler.toml") os.makedirs(package_dir, exist_ok=True) try: # Try tomli_w (writes binary) - with open(toml_path, 'wb') as f: + with open(toml_path, "wb") as f: tomli_w.dump(config, f) except TypeError: # Fallback to toml library (writes text) - with open(toml_path, 'w') as f: + with open(toml_path, "w") as f: f.write(tomli_w.dumps(config)) - + self.logging.info(f"Generated wrangler.toml at {toml_path}") return toml_path @@ -209,7 +202,7 @@ def package_code( if language_variant in ("cloudflare", "default"): if os.path.exists(requirements_file): - with open(requirements_file, 'r') as reqf: + with open(requirements_file, "r") as reqf: reqtext = reqf.read() needed_pkg = [] unsupported = [] @@ -241,7 +234,7 @@ def package_code( pyproject_config = { "project": { "name": f"{benchmark.replace('.', '-')}-python-" - f"{language_version.replace('.', '')}", + f"{language_version.replace('.', '')}", "version": "0.1.0", "description": "dummy description", "requires-python": f">={language_version}", @@ -252,10 +245,10 @@ def package_code( }, } try: - with open(project_file, 'wb') as pf: + with open(project_file, "wb") as pf: tomli_w.dump(pyproject_config, pf) except TypeError: - with open(project_file, 'w') as pf: + with open(project_file, "w") as pf: pf.write(tomli_w.dumps(pyproject_config)) # Pyodide Workers require all function files in a function/ subdir funcdir = os.path.join(directory, "function") @@ -306,7 +299,9 @@ def package_code( total_size += os.path.getsize(filepath) mbytes = total_size / 1024.0 / 1024.0 - self.logging.info(f"Worker package size: {mbytes:.2f} MB (Python: missing vendored modules)") + self.logging.info( + f"Worker package size: {mbytes:.2f} MB (Python: missing vendored modules)" + ) return (directory, total_size, "") diff --git a/sebs/config.py b/sebs/config.py index a9d164028..10d852e32 100644 --- a/sebs/config.py +++ b/sebs/config.py @@ -209,9 +209,11 @@ def 
benchmark_container_images( self, deployment_name: str, language_name: str, architecture: str ) -> Dict[str, str]: """Get container base images for container deployments.""" - return self._system_config[deployment_name]["languages"][language_name].get( - "container_images", {} - ).get(architecture, {}) + return ( + self._system_config[deployment_name]["languages"][language_name] + .get("container_images", {}) + .get(architecture, {}) + ) def version(self) -> str: """Get the SeBS framework version. diff --git a/sebs/faas/config.py b/sebs/faas/config.py index d1fa07a93..7da30433f 100644 --- a/sebs/faas/config.py +++ b/sebs/faas/config.py @@ -431,14 +431,15 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> "Confi from sebs.openwhisk.config import OpenWhiskConfig implementations["openwhisk"] = OpenWhiskConfig.deserialize - + # Cloudflare is available by default (like local) try: from sebs.cloudflare.config import CloudflareConfig + implementations["cloudflare"] = CloudflareConfig.deserialize except ImportError: pass - + func = implementations.get(name) assert func, "Unknown config type!" 
return func(config[name] if name in config else config, cache, handlers) From fbe8185a667b4efadcae5128c815af946efc2016 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 09:32:06 +0200 Subject: [PATCH 146/230] fix(handler): update return structure to include result object --- .../100.webapps/120.uploader/nodejs/cloudflare/function.js | 2 +- benchmarks/wrappers/cloudflare/nodejs/handler.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js b/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js index fe27944fd..586129827 100644 --- a/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js +++ b/benchmarks/100.webapps/120.uploader/nodejs/cloudflare/function.js @@ -31,5 +31,5 @@ export const handler = async function(event) { ); await uploadPromise; - return {bucket: bucket, url: url, key: keyName}; + return {result: {bucket: bucket, url: url, key: keyName}}; }; diff --git a/benchmarks/wrappers/cloudflare/nodejs/handler.js b/benchmarks/wrappers/cloudflare/nodejs/handler.js index bcf7876be..80170344a 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/handler.js @@ -188,7 +188,7 @@ export default { const micro = elapsed * 1000; // Convert milliseconds to microseconds // Build log_data similar to Python handler - const log_data = { output: ret && ret.result !== undefined ? ret.result : ret }; + const log_data = { result: ret && ret.result !== undefined ? 
ret.result : ret }; if (ret && ret.measurement !== undefined) { log_data.measurement = ret.measurement; } else { From bf1b0ac79a504b53819341393a824e4a69ba965d Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 09:40:06 +0200 Subject: [PATCH 147/230] fix(handler): update log_data structure to use 'result' key instead of 'output' --- benchmarks/wrappers/cloudflare/nodejs/container/handler.js | 2 +- benchmarks/wrappers/cloudflare/python/container/handler.py | 2 +- benchmarks/wrappers/cloudflare/python/handler.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js index 948722f3c..dd4df7318 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/handler.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/handler.js @@ -133,7 +133,7 @@ const server = http.createServer(async (req, res) => { const micro = elapsed * 1000; // Convert milliseconds to microseconds // Build log_data similar to native handler - const log_data = { output: ret && ret.result !== undefined ? ret.result : ret }; + const log_data = { result: ret && ret.result !== undefined ? 
ret.result : ret }; if (ret && ret.measurement !== undefined) { log_data.measurement = ret.measurement; } else { diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index 1bef40216..8ae89e6c0 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -138,7 +138,7 @@ def handle_request(self): # Prepare response matching native handler format exactly log_data = { - 'output': result['result'] + 'result': result['result'] } if 'measurement' in result: log_data['measurement'] = result['measurement'] diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index a29ea2a2e..65376c6df 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -92,7 +92,7 @@ async def fetch2(self, request, env): ret = function.handler(event) log_data = { - 'output': ret['result'] + 'result': ret['result'] } if 'measurement' in ret: log_data['measurement'] = ret['measurement'] From 584a3f8b3e905a0b08ad2b79c5d5599a27eaff34 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 09:46:41 +0200 Subject: [PATCH 148/230] fix(build): extend node built-ins filter to include 'constants' and handle 'graceful-fs' redirection --- benchmarks/wrappers/cloudflare/nodejs/build.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/build.js b/benchmarks/wrappers/cloudflare/nodejs/build.js index c130d781a..886955a94 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/build.js +++ b/benchmarks/wrappers/cloudflare/nodejs/build.js @@ -83,7 +83,7 @@ const nodeBuiltinsPlugin = { // Benchmarks commonly `require('fs')`, `require('path')`, etc. 
Workers // reject those bare specifiers; rewrite them to the `node:`-prefixed // form and mark them external so the runtime resolves them. - build.onResolve({ filter: /^(fs|querystring|path|crypto|stream|buffer|util|events|http|https|net|tls|zlib|os|child_process|tty|assert|url)$/ }, (args) => { + build.onResolve({ filter: /^(fs|querystring|path|crypto|stream|buffer|util|events|http|https|net|tls|zlib|os|child_process|tty|assert|url|constants)$/ }, (args) => { return { path: 'node:' + args.path, external: true }; }); @@ -96,6 +96,13 @@ const nodeBuiltinsPlugin = { path: resolve(wrapperDir, 'request-polyfill.js') }; }); + + // `graceful-fs` monkey-patches the `fs` module at runtime, which Workers + // rejects ("object is not extensible"). Redirect it straight to node:fs + // so the patching never runs and consumers get the same API. + build.onResolve({ filter: /^graceful-fs$/ }, () => { + return { path: 'node:fs', external: true }; + }); } }; From bd2111d5fb94dd79f7c500d007a5a170a99dabfc Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 10:33:34 +0200 Subject: [PATCH 149/230] fix(workers): streamline package code logic and ensure proper handling of requirements --- sebs/cloudflare/workers.py | 42 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 121cfdb4b..5ddd7733a 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -201,10 +201,10 @@ def package_code( self.logging.info(f"move {src} to {dest}") if language_variant in ("cloudflare", "default"): + needed_pkg = [] if os.path.exists(requirements_file): with open(requirements_file, "r") as reqf: reqtext = reqf.read() - needed_pkg = [] unsupported = [] seen = set() for raw_line in reqtext.splitlines(): @@ -230,26 +230,26 @@ def package_code( "for the list of supported packages." 
) - project_file = os.path.join(directory, "pyproject.toml") - pyproject_config = { - "project": { - "name": f"{benchmark.replace('.', '-')}-python-" - f"{language_version.replace('.', '')}", - "version": "0.1.0", - "description": "dummy description", - "requires-python": f">={language_version}", - "dependencies": needed_pkg, - }, - "dependency-groups": { - "dev": ["workers-py", "workers-runtime-sdk"], - }, - } - try: - with open(project_file, "wb") as pf: - tomli_w.dump(pyproject_config, pf) - except TypeError: - with open(project_file, "w") as pf: - pf.write(tomli_w.dumps(pyproject_config)) + project_file = os.path.join(directory, "pyproject.toml") + pyproject_config = { + "project": { + "name": f"{benchmark.replace('.', '-')}-python-" + f"{language_version.replace('.', '')}", + "version": "0.1.0", + "description": "dummy description", + "requires-python": f">={language_version}", + "dependencies": needed_pkg, + }, + "dependency-groups": { + "dev": ["workers-py", "workers-runtime-sdk"], + }, + } + try: + with open(project_file, "wb") as pf: + tomli_w.dump(pyproject_config, pf) + except TypeError: + with open(project_file, "w") as pf: + pf.write(tomli_w.dumps(pyproject_config)) # Pyodide Workers require all function files in a function/ subdir funcdir = os.path.join(directory, "function") if not os.path.exists(funcdir): From c476ac80d2f2ae8e47aa1d5b287be8146efc22fb Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 10:54:40 +0200 Subject: [PATCH 150/230] fix(storage): simplify data handling in aupload_stream by removing base64 conversion --- benchmarks/wrappers/cloudflare/python/storage.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index e7968eb5a..cabdb7184 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -1,11 +1,7 @@ import io import os import 
uuid -import asyncio -import base64 -from pyodide.ffi import to_js, jsnull, run_sync, JsProxy -from pyodide.webloop import WebLoop -import js +from pyodide.ffi import to_js, jsnull, run_sync from workers import WorkerEntrypoint @@ -77,18 +73,10 @@ def upload_stream(self, bucket, key, data): async def aupload_stream(self, bucket, key, data): unique_key = storage.unique_name(key) - # Handle BytesIO objects - extract bytes if hasattr(data, 'getvalue'): data = data.getvalue() - # Convert bytes to Blob using base64 encoding as intermediate step if isinstance(data, bytes): - # Encode as base64 - b64_str = base64.b64encode(data).decode('ascii') - # Create a Response from base64, then get the blob - # This creates a proper JavaScript Blob that R2 will accept - response = await js.fetch(f"data:application/octet-stream;base64,{b64_str}") - blob = await response.blob() - data_js = blob + data_js = to_js(data) else: data_js = str(data) bobj = self.get_bucket(bucket) From b9eb11f90c1d7f19593bc90266ff929c0636a4e1 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 12:52:56 +0200 Subject: [PATCH 151/230] fix(benchmarks): update subproject commit reference to latest version --- benchmarks-data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks-data b/benchmarks-data index 269ac284f..30ca2f5c5 160000 --- a/benchmarks-data +++ b/benchmarks-data @@ -1 +1 @@ -Subproject commit 269ac284fa3ae58f2fcb444b3a83e2255028c20f +Subproject commit 30ca2f5c533c3f441deb5e05fc03a39fe65f9948 From e34c67d9431ba9a0fab24cfbd1cc2746f17ccf4d Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 13:17:18 +0200 Subject: [PATCH 152/230] fix(cli): add 'cloudflare' option to deployment platforms in CLI commands --- sebs/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sebs/cli.py b/sebs/cli.py index 999d3d031..052418e19 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -1005,7 +1005,7 @@ def docker_cmd(): @click.option( "--deployment", 
default=None, - type=click.Choice(["local", "aws", "azure", "gcp", "openwhisk"]), + type=click.Choice(["local", "aws", "azure", "gcp", "openwhisk", "cloudflare"]), help="Deployment platform to build images for", ) @click.option( @@ -1087,7 +1087,7 @@ def docker_build( @click.option( "--deployment", default=None, - type=click.Choice(["local", "aws", "azure", "gcp", "openwhisk"]), + type=click.Choice(["local", "aws", "azure", "gcp", "openwhisk", "cloudflare"]), help="Deployment platform to push images for", ) @click.option( From cd5992bf09c2ba419bcb2e2b0ab1920e8568d910 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 13:56:22 +0200 Subject: [PATCH 153/230] fix(cloudflare): enhance CLI container management for thread safety and simplify shutdown process --- docs/platforms.md | 18 ++++++++- sebs/cloudflare/cli.py | 71 +++++++++++++++-------------------- sebs/cloudflare/cloudflare.py | 1 - sebs/cloudflare/containers.py | 7 ++-- sebs/cloudflare/workers.py | 7 ++-- 5 files changed, 53 insertions(+), 51 deletions(-) diff --git a/docs/platforms.md b/docs/platforms.md index fa27f6fb9..557ab5277 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -279,7 +279,23 @@ Cloudflare Workers support multiple languages through different deployment metho ### CLI Container -SeBS uses a containerized CLI approach for Cloudflare deployments, eliminating the need to install Node.js, npm, wrangler, pywrangler, or uv on your host system. The CLI container (`sebs/manage.cloudflare`) is automatically built on first use and contains all necessary tools. This ensures consistent behavior across platforms and simplifies setup—only Docker is required. +SeBS uses a containerized CLI approach for Cloudflare deployments, eliminating the need to install Node.js, npm, wrangler, pywrangler, or uv on your host system. The CLI container (`spcleth/serverless-benchmarks:manage.cloudflare`) is pulled from Docker Hub on first use and contains all necessary tools. 
This ensures consistent behavior across platforms and simplifies setup — only Docker is required. + +To build and push an updated `manage.cloudflare` image (developers only): + +```bash +sebs docker build --deployment cloudflare --image-type manage +sebs docker push --deployment cloudflare --image-type manage +``` + +#### Shared singleton and lifecycle + +`CloudflareCLI` is a process-wide singleton: both the script-based (`workers.py`) and container-based (`containers.py`) deployment handlers share a single `manage.cloudflare` Docker container. The first call to `CloudflareCLI.get_instance()` starts the container and registers a shutdown hook via `atexit`; subsequent calls from any handler or thread return the already-running instance. + +This has two consequences: + +- **Thread safety during creation** — `get_instance()` uses a double-checked lock so that when multiple benchmarks run in parallel (e.g. during `sebs regression`), only one thread starts the container while the others wait. +- **Lifecycle** — individual deployment handlers (and `Cloudflare.shutdown()`) drop their local reference to the instance but do not stop the container. The container is stopped exactly once at process exit by the `atexit` hook, regardless of whether SeBS is invoked directly (`sebs benchmark invoke`) or through the regression suite. 
### Deployment Architecture diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 26d4cdd6a..2e90eddb9 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -1,13 +1,15 @@ +import atexit import io import logging import os import tarfile +import threading from typing import Optional import docker from sebs.config import SeBSConfig -from sebs.utils import LoggingBase, get_resource_path +from sebs.utils import LoggingBase class CloudflareCLI(LoggingBase): @@ -19,6 +21,7 @@ class CloudflareCLI(LoggingBase): """ _instance: Optional["CloudflareCLI"] = None + _lock: threading.Lock = threading.Lock() @staticmethod def get_instance(system_config: SeBSConfig, docker_client: docker.client) -> "CloudflareCLI": @@ -26,9 +29,13 @@ def get_instance(system_config: SeBSConfig, docker_client: docker.client) -> "Cl Container and native workers deployments share one underlying CLI container so that combined runs don't spawn duplicates. + Thread-safe: the first caller builds the container; concurrent callers wait. 
""" if CloudflareCLI._instance is None: - CloudflareCLI._instance = CloudflareCLI(system_config, docker_client) + with CloudflareCLI._lock: + if CloudflareCLI._instance is None: + CloudflareCLI._instance = CloudflareCLI(system_config, docker_client) + atexit.register(CloudflareCLI.shutdown_instance) return CloudflareCLI._instance def __init__(self, system_config: SeBSConfig, docker_client: docker.client): @@ -37,41 +44,16 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client): repo_name = system_config.docker_repository() image_name = "manage.cloudflare" - full_image_name = repo_name + ":" + image_name try: - docker_client.images.get(full_image_name) + docker_client.images.get(repo_name + ":" + image_name) except docker.errors.ImageNotFound: try: - logging.info(f"Pulling Docker image {full_image_name}...") - docker_client.images.pull(repo_name, image_name) - except docker.errors.APIError as pull_error: - logging.info(f"Pull failed: {pull_error}. Building image locally...") - dockerfile_path = str( - get_resource_path("dockerfiles", "cloudflare", "Dockerfile.manage") + logging.info( + "Docker pull of image {repo}:{image}".format(repo=repo_name, image=image_name) ) - if not os.path.exists(dockerfile_path): - raise RuntimeError( - f"Dockerfile not found at {dockerfile_path}. " - "Cannot build Cloudflare CLI container." 
- ) - build_path = str(get_resource_path()) - logging.info(f"Building {full_image_name} from {dockerfile_path}...") - try: - _, build_logs = docker_client.images.build( - path=build_path, - dockerfile=dockerfile_path, - tag=full_image_name, - rm=True, - pull=True, - ) - for log in build_logs: - if "stream" in log: - logging.debug(log["stream"].strip()) - logging.info(f"Successfully built {full_image_name}") - except docker.errors.BuildError as build_error: - raise RuntimeError( - f"Failed to build Docker image {full_image_name}: {build_error}" - ) + docker_client.images.pull(repo_name, image_name) + except docker.errors.APIError: + raise RuntimeError("Docker pull of image {} failed!".format(image_name)) # Start the container in detached mode self.docker_instance = docker_client.containers.run( @@ -238,12 +220,19 @@ def pywrangler_deploy(self, package_dir: str, env: dict = None) -> str: out = self.execute(cmd, env=env) return out.decode("utf-8") - def shutdown(self): - """Shutdown Docker instance. Idempotent — safe to call multiple times.""" - if self._stopped: - return - self._stopped = True - self.logging.info("Stopping Cloudflare CLI Docker instance") - self.docker_instance.stop() - if CloudflareCLI._instance is self: + @staticmethod + def shutdown_instance(): + """Stop the shared CLI container and clear the singleton. + + Call this once at process teardown, after all parallel benchmarks + have finished. Individual deployment handlers must NOT call this — + they should just drop their local reference. 
+ """ + with CloudflareCLI._lock: + instance = CloudflareCLI._instance CloudflareCLI._instance = None + + if instance is not None and not instance._stopped: + instance._stopped = True + instance.logging.info("Stopping Cloudflare CLI Docker instance") + instance.docker_instance.stop() diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 05c78935a..90403f4b2 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1050,6 +1050,5 @@ def shutdown(self) -> None: finally: self.cache_client.unlock() - # Shutdown deployment handler CLI containers self._workers_deployment.shutdown() self._containers_deployment.shutdown() diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index b02447d9f..d42b8a40a 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -422,7 +422,6 @@ def wait_for_container_worker_ready( ) def shutdown(self): - """Shutdown CLI container if initialized.""" - if self._cli is not None: - self._cli.shutdown() - self._cli = None + """Drop the local CLI reference. The shared container is owned by CloudflareCLI; + call CloudflareCLI.shutdown_instance() once at process teardown.""" + self._cli = None diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 5ddd7733a..d16d0793d 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -306,7 +306,6 @@ def package_code( return (directory, total_size, "") def shutdown(self): - """Shutdown CLI container if initialized.""" - if self._cli is not None: - self._cli.shutdown() - self._cli = None + """Drop the local CLI reference. 
The shared container is owned by CloudflareCLI; + call CloudflareCLI.shutdown_instance() once at process teardown.""" + self._cli = None From 6500fd26a4e06978dd082e708a0855ca7ec91642 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 13:56:47 +0200 Subject: [PATCH 154/230] fix(cloudflare): add manage image to systems.json --- configs/systems.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configs/systems.json b/configs/systems.json index 78211ed94..0f0086315 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -522,6 +522,11 @@ } } }, + "images": { + "manage": { + "username": "docker_user" + } + }, "architecture": ["x64"], "deployments": ["package", "container"] } From 16e7454bc2d364a98a7570aed9c0cfbe519155ca Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 14:02:11 +0200 Subject: [PATCH 155/230] fix(docs): clarify terminology for Cloudflare Workers in platforms.md --- docs/platforms.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/platforms.md b/docs/platforms.md index 557ab5277..7f36e1e1c 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -217,6 +217,9 @@ or in the JSON input configuration: ## Cloudflare Workers +> [!NOTE] +> **Terminology mapping**: SeBS uses the term *function* throughout its CLI and configuration. On Cloudflare, the equivalent unit of deployment is a **Worker**. Wherever SeBS refers to a function (e.g. `--function-name`, `create_function`, `CloudflareWorker`), it refers to a Cloudflare Worker script deployed to `{name}.{account}.workers.dev`. + Cloudflare offers a free tier for Workers with generous limits for development and testing. To use Cloudflare Workers with SeBS, you need to create a Cloudflare account and obtain API credentials. 
### Credentials From d3e65686feaf1defcb9db647a80780059e6eabbe Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 14:05:04 +0200 Subject: [PATCH 156/230] fix(docs): add wall-clock timing explanation for Cloudflare Workers in platforms.md --- docs/platforms.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/platforms.md b/docs/platforms.md index 7f36e1e1c..96a3f29d7 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -387,6 +387,7 @@ To use a different Docker Hub repository, change `['general']['docker_repository | standard-2 | 1 | 6 GiB | 12 GB | | standard-3 | 2 | 8 GiB | 16 GB | | standard-4 | 4 | 12 GiB | 20 GB | +- **Wall-Clock Timing**: Cloudflare Workers freezes `Date.now()` and `performance.now()` between I/O operations as a timing side-channel mitigation, so the clock does not advance inside pure-compute sections. To record a meaningful wall-clock `compute_time`, the handler issues a throwaway self-fetch (a `HEAD /favicon` request) before sampling the end time. This I/O call unfreezes the timer. See the [Cloudflare security model docs](https://developers.cloudflare.com/workers/reference/security-model/#step-1-disallow-timers-and-multi-threading) for details. - **Metrics Collection**: Uses response-based per-invocation metrics. During each function invocation, the worker handler measures performance metrics (CPU time, wall time, memory usage) and embeds them directly in the JSON response. SeBS extracts these metrics immediately from each response. When `download_metrics()` is called for postprocessing, it only aggregates the metrics that were already collected during invocations—no additional data is fetched from external services. This approach provides immediate per-invocation granularity without delays. Note that while Cloudflare does expose an Analytics Engine, it only provides aggregated metrics without individual request-level data, making it unsuitable for detailed benchmarking purposes. 
### Storage Configuration From 9681b6ebcfe132fb1ae64573c4f940d9ae5cd7fa Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 14:21:47 +0200 Subject: [PATCH 157/230] refactor(cloudflare): remove container warm-up logic from Cloudflare class --- sebs/cloudflare/cloudflare.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 90403f4b2..bf7ebd06f 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -633,29 +633,6 @@ def _create_or_update_worker( else: self._wait_for_worker_ready(worker_name, worker_url) - # Keep the container warm for a minimum provisioning window. - # A flat sleep lets the Durable Object hibernate, which causes the - # container runtime to reject the next start() call. Instead we - # ping /health every few seconds so the DO stays alive. - if container_deployment: - warm_seconds = 60 - ping_interval = 5 - account_id = env.get("CLOUDFLARE_ACCOUNT_ID") - worker_url = self._build_workers_dev_url(worker_name, account_id) - health_url = f"{worker_url}/health" - self.logging.info( - f"Keeping container warm for {warm_seconds}s " - f"(pinging {health_url} every {ping_interval}s)..." 
- ) - deadline = time.time() + warm_seconds - while time.time() < deadline: - try: - requests.get(health_url, timeout=10) - except Exception: - pass - remaining = deadline - time.time() - time.sleep(min(ping_interval, max(0, remaining))) - return {"success": True, "output": output} except RuntimeError as e: From 4d2db1f8f3bf2cd135baf01f94b676ca03b3da65 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 14:30:51 +0200 Subject: [PATCH 158/230] refactor: linting with flake8 --- sebs/cloudflare/cli.py | 3 ++- sebs/cloudflare/cloudflare.py | 29 ++++++++++++++--------------- sebs/cloudflare/containers.py | 8 +++----- sebs/cloudflare/function.py | 3 +-- sebs/cloudflare/kvstore.py | 14 +++++++++----- sebs/cloudflare/r2.py | 8 ++++---- sebs/cloudflare/resources.py | 2 -- sebs/cloudflare/triggers.py | 15 ++++++++------- sebs/cloudflare/workers.py | 4 ++-- sebs/regression.py | 7 +++++-- 10 files changed, 48 insertions(+), 45 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 2e90eddb9..4d7e1d9f1 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -107,7 +107,8 @@ def execute(self, cmd: str, env: dict = None): shell_cmd = ["/bin/sh", "-c", cmd] exit_code, out = self.docker_instance.exec_run( shell_cmd, - user="root", # Run as root since entrypoint creates docker_user but we don't wait for it + # Run as root since entrypoint creates docker_user but we don't wait for it + user="root", environment=env, ) if exit_code != 0: diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index bf7ebd06f..58d01da88 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1,7 +1,5 @@ -import os import uuid import time -from datetime import datetime from typing import cast, Dict, List, Optional, Tuple, Type import docker @@ -162,7 +160,8 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) raise RuntimeError( f"Benchmark '{benchmark_name}' is not supported for " 
f"{language} {deployment_type} deployments on Cloudflare. " - f"Supported benchmarks: {self.SUPPORTED_BENCHMARKS.get((language, container_deployment))}" + "Supported benchmarks: " + f"{self.SUPPORTED_BENCHMARKS.get((language, container_deployment))}" ) # For workers deployments, auto-promote the variant from "default" to @@ -257,8 +256,9 @@ def initialize_resources(self, select_prefix: Optional[str] = None): except Exception as e: self.logging.warning( f"R2 storage initialization failed: {e}. " - f"R2 must be enabled in your Cloudflare dashboard to use storage-dependent benchmarks. " - f"Continuing without R2 storage - only benchmarks that don't require storage will work." + "R2 must be enabled in your Cloudflare dashboard " + "to use storage-dependent benchmarks. " + "Continuing without R2 - only benchmarks that don't require storage will work." ) @property @@ -308,8 +308,9 @@ def _verify_credentials(self): if response.status_code != 200: raise RuntimeError( - f"Failed to verify Cloudflare credentials: {response.status_code} - {response.text}\n" - f"Please check that your CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID are correct." + f"Failed to verify Cloudflare credentials: " + f"{response.status_code} - {response.text}\n" + "Please check that your CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID are correct." 
) self.logging.info("Cloudflare credentials verified successfully") @@ -529,7 +530,7 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: if response.status_code == 200: try: return response.json().get("result") - except: + except Exception: return None elif response.status_code == 404: return None @@ -607,8 +608,6 @@ def _create_or_update_worker( # Deploy using Wrangler in container self.logging.info(f"Deploying worker {worker_name} using Wrangler in container...") - language_variant = code_package.language_variant if code_package else "cloudflare" - try: # pywrangler is used for all native Python workers (packages must be # synced via pyproject.toml before wrangler uploads the bundle). @@ -729,8 +728,6 @@ def cached_function(self, function: Function): Args: function: The cached function """ - from sebs.cloudflare.triggers import HTTPTrigger - for trigger in function.triggers(Trigger.TriggerType.HTTP): trigger.logging_handlers = self.logging_handlers @@ -769,7 +766,8 @@ def update_function( if is_container: self.logging.info( - f"Skipping redeployment for container worker {worker.name} - containers don't support runtime memory updates" + f"Skipping redeployment for container worker {worker.name} - " + "containers don't support runtime memory updates" ) else: self._create_or_update_worker( @@ -784,7 +782,7 @@ def update_function( ) self.logging.info(f"Updated worker {worker.name}") - # Update configuration if needed (no-op for containers since they don't support runtime memory changes) + # Update configuration if needed (no-op for containers: no runtime memory changes) self.update_function_configuration(worker, code_package) def update_function_configuration(self, cached_function: Function, benchmark: Benchmark): @@ -845,7 +843,8 @@ def format_function_name(name: str, container_deployment: bool = False) -> str: Args: name: The original name - container_deployment: Whether this is a container worker (adds 'w-' prefix if name starts with 
digit) + container_deployment: Whether this is a container worker + (adds 'w-' prefix if name starts with digit) Returns: Formatted name diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index d42b8a40a..6d933813d 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -232,7 +232,8 @@ def package_code( base_image = container_images.get(language_version) if not base_image: raise RuntimeError( - f"No container base image found in systems.json for {language_name} {language_version} on {architecture}" + f"No container base image found in systems.json for " + f"{language_name} {language_version} on {architecture}" ) self._base_image = base_image @@ -252,7 +253,7 @@ def package_code( worker_js_src = os.path.join(nodejs_wrapper_dir, "worker.js") if os.path.exists(worker_js_src): shutil.copy2(worker_js_src, os.path.join(directory, "worker.js")) - self.logging.info(f"Copied worker.js orchestration file from nodejs/container") + self.logging.info("Copied worker.js orchestration file from nodejs/container") # Copy init.sh if the benchmark needs it (e.g. 
video-processing downloads ffmpeg) from sebs.utils import find_benchmark @@ -380,9 +381,6 @@ def wait_for_container_worker_ready( self.logging.info("Checking container worker readiness via health endpoint...") - consecutive_failures = 0 - max_consecutive_failures = 5 - while time.time() - start_time < max_wait_seconds: try: # Use health check endpoint diff --git a/sebs/cloudflare/function.py b/sebs/cloudflare/function.py index 355d9536a..cbf25a9fb 100644 --- a/sebs/cloudflare/function.py +++ b/sebs/cloudflare/function.py @@ -1,4 +1,4 @@ -from typing import Optional, cast +from typing import Optional from sebs.faas.function import Function, FunctionConfig @@ -39,7 +39,6 @@ def serialize(self) -> dict: @staticmethod def deserialize(cached_config: dict) -> "CloudflareWorker": - from sebs.faas.function import Trigger from sebs.cloudflare.triggers import HTTPTrigger cfg = FunctionConfig.deserialize(cached_config["config"]) diff --git a/sebs/cloudflare/kvstore.py b/sebs/cloudflare/kvstore.py index 4601087b9..9e8bf923a 100644 --- a/sebs/cloudflare/kvstore.py +++ b/sebs/cloudflare/kvstore.py @@ -97,7 +97,8 @@ def _account_id(self) -> str: return account_id def _kv_api_base(self) -> str: - return f"https://api.cloudflare.com/client/v4/accounts/{self._account_id()}/storage/kv/namespaces" + account = self._account_id() + return f"https://api.cloudflare.com/client/v4/accounts/{account}/storage/kv/namespaces" def _get_auth_headers(self, content_type: str = "application/json") -> dict[str, str]: """Get authentication headers for Cloudflare API requests.""" @@ -134,7 +135,7 @@ def _namespace_title(self, benchmark: str, table: str) -> str: f"sebs-nosql-{self._sanitize_component(self._resource_id())}-" f"{self._sanitize_component(benchmark)}-{self._sanitize_component(table)}" ) - # Cloudflare KV namespace title has length constraints. Keep a deterministic suffix if truncated. + # KV namespace title has length constraints; keep a deterministic suffix if truncated. 
max_len = 100 if len(title) > max_len: digest = hashlib.sha1(title.encode("utf-8")).hexdigest()[:12] @@ -204,8 +205,9 @@ def _index_key(primary_value: str) -> str: return f"__sebs_idx__{primary_value}" def _read_index(self, namespace_id: str, primary_value: str) -> List[str]: + index_key = quote(self._index_key(primary_value), safe="") response = requests.get( - f"{self._kv_api_base()}/{namespace_id}/values/{quote(self._index_key(primary_value), safe='')}", + f"{self._kv_api_base()}/{namespace_id}/values/{index_key}", headers=self._get_auth_headers(), ) if response.status_code == 404: @@ -227,8 +229,9 @@ def _read_index(self, namespace_id: str, primary_value: str) -> List[str]: return [str(v) for v in parsed] def _write_index(self, namespace_id: str, primary_value: str, values: List[str]) -> None: + index_key = quote(self._index_key(primary_value), safe="") response = requests.put( - f"{self._kv_api_base()}/{namespace_id}/values/{quote(self._index_key(primary_value), safe='')}", + f"{self._kv_api_base()}/{namespace_id}/values/{index_key}", data=json.dumps(values, separators=(",", ":")).encode("utf-8"), headers=self._get_auth_headers(content_type="text/plain;charset=UTF-8"), ) @@ -370,7 +373,8 @@ def write_to_table( def clear_table(self, name: str) -> str: self.logging.warning( - "Cloudflare KV clear_table is not implemented. Use remove_table() + create_table() instead." + "clear_table is not implemented for Cloudflare KV. " + "Use remove_table() + create_table() instead." ) return name diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 164e3d431..e1d97df79 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -1,4 +1,3 @@ -import json import os import requests @@ -68,7 +67,8 @@ def _get_s3_client(self): if not self._credentials.r2_access_key_id or not self._credentials.r2_secret_access_key: self.logging.warning( "R2 S3-compatible API credentials not configured. 
" - "Set CLOUDFLARE_R2_ACCESS_KEY_ID and CLOUDFLARE_R2_SECRET_ACCESS_KEY environment variables." + "Set CLOUDFLARE_R2_ACCESS_KEY_ID and " + "CLOUDFLARE_R2_SECRET_ACCESS_KEY environment variables." ) return None @@ -134,7 +134,7 @@ def _create_bucket( f"R2 bucket creation failed. Status: {create_bucket_response.status_code}, " f"Response: {error_data}" ) - except: + except Exception: self.logging.error( f"R2 bucket creation failed. Status: {create_bucket_response.status_code}, " f"Response: {create_bucket_response.text}" @@ -212,7 +212,7 @@ def upload_bytes(self, bucket_name: str, key: str, data: bytes): """ s3_client = self._get_s3_client() if s3_client is None: - self.logging.warning(f"Cannot upload bytes to R2 - S3 client not available") + self.logging.warning("Cannot upload bytes to R2 - S3 client not available") return try: diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index 1a76475bc..96966d804 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -1,5 +1,4 @@ import docker - from typing import Optional from sebs.cache import Cache @@ -10,7 +9,6 @@ from sebs.faas.storage import PersistentStorage from sebs.faas.nosql import NoSQLStorage from sebs.utils import LoggingHandlers -import json class CloudflareSystemResources(SystemResources): diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index d0dd8f6fc..3ce45ef0d 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -58,7 +58,8 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec [ "Content-Type: application/json", # Cloudflare bot-protection (error 1010) blocks requests with no/tool UA. 
- "User-Agent: Mozilla/5.0 (compatible; SeBS/1.0; +https://github.com/spcl/serverless-benchmarks)", + "User-Agent: Mozilla/5.0 (compatible; SeBS/1.0; " + "+https://github.com/spcl/serverless-benchmarks)", ], ) c.setopt(pycurl.POST, 1) @@ -87,14 +88,14 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec output = json.loads(output["body"]) if status_code == 502: - self.logging.info(f"Container returned 502 (still starting?), will retry...") - raise ContainerProvisioningError(f"502 gateway error from container worker") + self.logging.info("Container returned 502 (still starting?), will retry...") + raise ContainerProvisioningError("502 gateway error from container worker") # Check for Cloudflare error code 1042 (CPU time limit / worker not ready) # Output may be a plain string like "error code: 1042" rather than a dict. output_str = str(output) if "1042" in output_str and "error code" in output_str: - self.logging.info(f"Worker returned error 1042 (CPU time limit), will retry...") + self.logging.info("Worker returned error 1042 (CPU time limit), will retry...") raise ContainerProvisioningError(f"Error 1042 from worker: {output_str}") if status_code != 200: @@ -120,7 +121,7 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec "currently provisioning", ) if "1042" in raw_text and "error code" in raw_text: - self.logging.info(f"Worker returned error 1042 (CPU time limit), will retry...") + self.logging.info("Worker returned error 1042 (CPU time limit), will retry...") raise ContainerProvisioningError(f"Error 1042 from worker: {raw_text[:200]}") if status_code == 502 or any( p.lower() in raw_text.lower() for p in provisioning_phrases @@ -146,8 +147,8 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: except ContainerProvisioningError: if attempt < max_provisioning_retries: self.logging.info( - f"Container still provisioning, waiting {provisioning_retry_wait}s before retry " - f"(attempt {attempt + 
1}/{max_provisioning_retries})..." + f"Container still provisioning, waiting {provisioning_retry_wait}s " + f"before retry (attempt {attempt + 1}/{max_provisioning_retries})..." ) time.sleep(provisioning_retry_wait) else: diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index d16d0793d..bff4d168e 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -8,7 +8,6 @@ import os import re import shutil -import json from importlib.resources import files try: @@ -288,7 +287,8 @@ def package_code( ) raise RuntimeError( f"Handler file {handler_file} not found in {directory}. " - f"Available files: {', '.join(os.listdir(directory)) if os.path.exists(directory) else 'none'}" + f"Available files: " + f"{', '.join(os.listdir(directory)) if os.path.exists(directory) else 'none'}" ) # Calculate total size of the package directory diff --git a/sebs/regression.py b/sebs/regression.py index 2c7591926..21fe43278 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -25,7 +25,7 @@ import testtools import threading from time import sleep -from typing import cast, Dict, Optional, Set, Tuple, TYPE_CHECKING +from typing import cast, Dict, Optional, Set, TYPE_CHECKING from sebs.faas.function import Trigger from sebs.utils import ColoredWrapper, SensitiveDataFilter, LoggingBase @@ -1416,7 +1416,10 @@ def filter_out_benchmarks( if allowed is not None: # benchmark is the test method name, e.g. "test_cloudflare_120.uploader_x64_workers" # Extract the numeric benchmark prefix (e.g. "120") from before the first "." - benchmark_id = benchmark.split(".")[-2].split("_")[-1] if "." in benchmark else benchmark.split("_")[-1] + if "." 
in benchmark: + benchmark_id = benchmark.split(".")[-2].split("_")[-1] + else: + benchmark_id = benchmark.split("_")[-1] return benchmark_id in allowed # fmt: on From eda4d0fc1a62da4e967c82d545f2d5b56387f8b5 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:29:26 +0200 Subject: [PATCH 159/230] refactor(cloudflare): update type hints and improve error handling in various modules --- sebs/benchmark.py | 5 +-- sebs/cloudflare/cli.py | 8 ++--- sebs/cloudflare/cloudflare.py | 57 ++++++++++++++++++++++++++--------- sebs/cloudflare/containers.py | 26 ++++++++++------ sebs/cloudflare/r2.py | 4 +-- sebs/cloudflare/resources.py | 21 +++++++------ sebs/cloudflare/triggers.py | 10 ++++++ sebs/cloudflare/workers.py | 9 +++--- 8 files changed, 94 insertions(+), 46 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index c3f401784..49e624ef3 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -624,6 +624,7 @@ def hash(self) -> str: self._language_variant, container_deployment=self._container_deployment, ) + assert self._hash_value is not None return self._hash_value @hash.setter # noqa: A003 @@ -811,8 +812,8 @@ def hash_directory( "benchmarks", "wrappers", "cloudflare", "nodejs", "container", "worker.js" ) if os.path.isfile(str(nodejs_worker)): - with open(str(nodejs_worker), "rb") as f: - hash_sum.update(f.read()) + with open(str(nodejs_worker), "rb") as worker_file: + hash_sum.update(worker_file.read()) return hash_sum.hexdigest() def serialize(self) -> dict: diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 4d7e1d9f1..c2d2d3541 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -91,7 +91,7 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client): def typename() -> str: return "Cloudflare.CLI" - def execute(self, cmd: str, env: dict = None): + def execute(self, cmd: str, env: Optional[dict] = None): """ Execute the given command in Cloudflare CLI container. 
Throws an exception on failure (commands are expected to execute successfully). @@ -163,7 +163,7 @@ def check_pywrangler_version(self) -> str: out = self.execute("pywrangler --version") return out.decode("utf-8").strip() - def containers_push(self, tag: str, env: dict = None) -> str: + def containers_push(self, tag: str, env: Optional[dict] = None) -> str: """ Push a locally-built image to Cloudflare's container registry. @@ -191,7 +191,7 @@ def containers_push(self, tag: str, env: dict = None) -> str: f"Could not parse registry URI from wrangler containers push output:\n{output}" ) - def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: + def wrangler_deploy(self, package_dir: str, env: Optional[dict] = None) -> str: """ Deploy a worker using wrangler. @@ -206,7 +206,7 @@ def wrangler_deploy(self, package_dir: str, env: dict = None) -> str: out = self.execute(cmd, env=env) return out.decode("utf-8") - def pywrangler_deploy(self, package_dir: str, env: dict = None) -> str: + def pywrangler_deploy(self, package_dir: str, env: Optional[dict] = None) -> str: """ Deploy a Python worker using pywrangler. diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 58d01da88..25a41d97f 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1,3 +1,4 @@ +import os import uuid import time from typing import cast, Dict, List, Optional, Tuple, Type @@ -175,6 +176,13 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) ): code_package.select_variant(self.name()) + # The cache stores functions under their formatted name (e.g. + # "container-311-compression-nodejs-18"), but callers pass the + # unformatted default name. Format it here so the cache lookup in + # super().get_function() finds the right entry. 
+ if func_name is not None: + func_name = self.format_function_name(func_name, container_deployment) + return super().get_function(code_package, func_name) def __init__( @@ -209,7 +217,12 @@ def __init__( # Adapter so benchmark.build() can call container_client.build_base_image() self._container_adapter = _CloudflareContainerAdapter(self._containers_deployment) - def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): + def initialize( + self, + config: Dict[str, str] = {}, + resource_prefix: Optional[str] = None, + quiet: bool = False, + ): """ Initialize the Cloudflare Workers platform. @@ -221,7 +234,7 @@ def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] self._verify_credentials() self.initialize_resources(select_prefix=resource_prefix) - def initialize_resources(self, select_prefix: Optional[str] = None): + def initialize_resources(self, select_prefix: Optional[str] = None, quiet: bool = False): """ Initialize Cloudflare resources. @@ -262,7 +275,7 @@ def initialize_resources(self, select_prefix: Optional[str] = None): ) @property - def container_client(self) -> _CloudflareContainerAdapter: + def container_client(self) -> _CloudflareContainerAdapter: # type: ignore[override] """Return the Cloudflare-specific container build adapter. Overrides System.container_client (which returns None) so that @@ -393,7 +406,7 @@ def _generate_wrangler_toml( benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, - container_uri: str = "", + container_uri: Optional[str] = None, ) -> str: """ Generate wrangler.toml by delegating to the appropriate deployment handler. @@ -429,7 +442,7 @@ def create_function( code_package: Benchmark, func_name: str, container_deployment: bool, - container_uri: str, + container_uri: str | None, ) -> CloudflareWorker: """ Create a new Cloudflare Worker. 
@@ -446,16 +459,21 @@ def create_function( CloudflareWorker instance """ # For container builds benchmark.build() goes through container_client.build_base_image(), - # which does NOT set code_package._code_location. Fall back to the directory that - # _CloudflareContainerAdapter stored during its last build_base_image() call. + # which does NOT set code_package._code_location. Fall back in order: + # 1. _CloudflareContainerAdapter.last_directory (set when build actually ran this session) + # 2. code_package._output_dir (the on-disk build directory from a previous session — + # build() leaves it in place when the image cache is valid and the build is skipped) package = code_package.code_location if package is None and container_deployment: package = self._container_adapter.last_directory - if package is None: - raise RuntimeError( - f"Code location is not set for {code_package.benchmark}. " - "The build step may not have completed successfully." - ) + if package is None and container_deployment: + output_dir = code_package._output_dir + if os.path.isdir(output_dir): + package = output_dir + self.logging.info( + f"Using existing output directory for {code_package.benchmark}: {package}" + ) + benchmark = code_package.benchmark language = code_package.language_name language_runtime = code_package.language_version @@ -470,6 +488,12 @@ def create_function( # Check if worker already exists existing_worker = self._get_worker(func_name, account_id) + if package is None: + raise RuntimeError( + f"Code location is not set for {code_package.benchmark}. " + "The build step may not have completed successfully." 
+ ) + if existing_worker: self.logging.info(f"Worker {func_name} already exists, updating it") worker = CloudflareWorker( @@ -547,7 +571,7 @@ def _create_or_update_worker( benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, container_deployment: bool = False, - container_uri: str = "", + container_uri: str | None = None, ) -> dict: """Create or update a Cloudflare Worker using Wrangler CLI in container. @@ -736,7 +760,7 @@ def update_function( function: Function, code_package: Benchmark, container_deployment: bool, - container_uri: str, + container_uri: str | None, ): """ Update an existing Cloudflare Worker. @@ -770,6 +794,11 @@ def update_function( "containers don't support runtime memory updates" ) else: + if package is None: + raise RuntimeError( + f"Code location is not set for {benchmark}. " + "The build step may not have completed successfully." + ) self._create_or_update_worker( worker.name, package, diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 6d933813d..ec971c8b8 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -16,12 +16,11 @@ try: import tomllib # Python 3.11+ except ImportError: - import tomli as tomllib # Fallback for older Python + import tomli as tomllib # type: ignore[no-redef] # Fallback for older Python try: import tomli_w except ImportError: - # Fallback to basic TOML writing if tomli_w not available - import toml as tomli_w + import toml as tomli_w # type: ignore[no-redef, import-untyped] from typing import Optional, Tuple import requests @@ -68,7 +67,7 @@ def generate_wrangler_toml( account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, - container_uri: str = "", + container_uri: Optional[str] = None, language_variant: str = "default", ) -> str: """ @@ -88,7 +87,7 @@ def generate_wrangler_toml( """ # Load template template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-container.toml") - 
with open(template_path, "rb") as f: + with template_path.open("rb") as f: config = tomllib.load(f) # Update basic configuration @@ -204,6 +203,15 @@ def package_code( src = os.path.join(wrapper_container_dir, f) if os.path.exists(src): shutil.copy2(src, os.path.join(directory, f)) + elif language_name == "nodejs": + # add_deployment_files() stages the ESM workers variants of + # storage.js and nosql.js; replace them with the CJS container + # versions so the Node.js HTTP server can require() them. + for f in ["storage.js", "nosql.js"]: + src = os.path.join(wrapper_container_dir, f) + if os.path.exists(src): + shutil.copy2(src, os.path.join(directory, f)) + self.logging.info(f"Replaced {f} with container-specific version") # For Python: move benchmark code into function/ so that relative imports # work natively, matching the workers and AWS layout. @@ -277,13 +285,13 @@ def package_code( f"package.json not found at {package_json_path} " f"for nodejs benchmark '{benchmark}'" ) - with open(package_json_path, "r") as f: - package_json = json.load(f) + with open(package_json_path, "r") as pkg_r: + package_json = json.load(pkg_r) else: package_json = {} package_json.setdefault("dependencies", {})["@cloudflare/containers"] = "*" - with open(package_json_path, "w") as f: - json.dump(package_json, f, indent=2) + with open(package_json_path, "w") as pkg_w: + json.dump(package_json, pkg_w, indent=2) # For Python containers, promote the versioned requirements.txt to requirements.txt if language_name == "python": diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index e1d97df79..b7cb64c7a 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -97,9 +97,9 @@ def correct_name(self, name: str) -> str: return name def _create_bucket( - self, name: str, buckets: list[str] = [], randomize_name: bool = False + self, name: str, buckets: Optional[List[str]] = None, randomize_name: bool = False ) -> str: - for bucket_name in buckets: + for bucket_name in (buckets or 
[]): if name in bucket_name: self.logging.info( "Bucket {} for {} already exists, skipping.".format(bucket_name, name) diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index 96966d804..595289cd2 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -1,8 +1,8 @@ import docker -from typing import Optional +from typing import Optional, cast from sebs.cache import Cache -from sebs.cloudflare.config import CloudflareConfig +from sebs.cloudflare.config import CloudflareConfig, CloudflareCredentials from sebs.cloudflare.r2 import R2 from sebs.cloudflare.kvstore import KVStore from sebs.faas.resources import SystemResources @@ -33,19 +33,20 @@ def __init__( @property def config(self) -> CloudflareConfig: - return self._config + return cast(CloudflareConfig, self._config) def _get_auth_headers(self) -> dict[str, str]: """Get authentication headers for Cloudflare API requests.""" - if self._config.credentials.api_token: + credentials = cast(CloudflareCredentials, self._config.credentials) + if credentials.api_token: return { - "Authorization": f"Bearer {self._config.credentials.api_token}", + "Authorization": f"Bearer {credentials.api_token}", "Content-Type": "application/json", } - elif self._config.credentials.email and self._config.credentials.api_key: + elif credentials.email and credentials.api_key: return { - "X-Auth-Email": self._config.credentials.email, - "X-Auth-Key": self._config.credentials.api_key, + "X-Auth-Email": credentials.email, + "X-Auth-Key": credentials.api_key, "Content-Type": "application/json", } else: @@ -72,7 +73,7 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStor cache_client=self._cache_client, resources=self._config.resources, replace_existing=replace_existing, - credentials=self._config.credentials, + credentials=cast(CloudflareCredentials, self._config.credentials), ) def get_nosql_storage(self) -> NoSQLStorage: @@ -88,5 +89,5 @@ def get_nosql_storage(self) -> 
NoSQLStorage: region=self._config.region, cache_client=self._cache_client, resources=self._config.resources, - credentials=self._config.credentials, + credentials=cast(CloudflareCredentials, self._config.credentials), ) diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 3ce45ef0d..4bd1fb3b7 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -98,6 +98,14 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec self.logging.info("Worker returned error 1042 (CPU time limit), will retry...") raise ContainerProvisioningError(f"Error 1042 from worker: {output_str}") + container_not_ready_phrases = ( + "The container is not running", + "Failed to start container", + ) + if any(p in output_str for p in container_not_ready_phrases): + self.logging.info("Container not yet running, will retry...") + raise ContainerProvisioningError(f"Container startup error: {output_str[:200]}") + if status_code != 200: self.logging.error(f"Invocation on URL {url} failed!") self.logging.error(f"Output: {output}") @@ -119,6 +127,8 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec "no Container instance available", "provisioning the Container", "currently provisioning", + "The container is not running", + "Failed to start container", ) if "1042" in raw_text and "error code" in raw_text: self.logging.info("Worker returned error 1042 (CPU time limit), will retry...") diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index bff4d168e..b3d400e11 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -13,12 +13,11 @@ try: import tomllib # Python 3.11+ except ImportError: - import tomli as tomllib # Fallback for older Python + import tomli as tomllib # type: ignore[no-redef] # Fallback for older Python try: import tomli_w except ImportError: - # Fallback to basic TOML writing if tomli_w not available - import toml as tomli_w + import toml as tomli_w 
# type: ignore[no-redef, import-untyped] from typing import Optional, Tuple from sebs.benchmark import Benchmark @@ -62,7 +61,7 @@ def generate_wrangler_toml( account_id: str, benchmark_name: Optional[str] = None, code_package: Optional[Benchmark] = None, - container_uri: str = "", + container_uri: Optional[str] = None, language_variant: str = "cloudflare", ) -> str: """ @@ -81,7 +80,7 @@ def generate_wrangler_toml( """ # Load template template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-worker.toml") - with open(template_path, "rb") as f: + with template_path.open("rb") as f: config = tomllib.load(f) # Update basic configuration From 305c4fb169b55040bf3b07eebb89f9fd2d8c37ff Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:34:55 +0200 Subject: [PATCH 160/230] feat(storage): add downloadDirectory method to facilitate directory downloads --- benchmarks/wrappers/cloudflare/nodejs/container/storage.js | 4 ++++ benchmarks/wrappers/cloudflare/nodejs/storage.js | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js index a858db54f..f69aa5748 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/storage.js @@ -313,6 +313,10 @@ class storage { } } + async downloadDirectory(bucket, prefix, out_path) { + return this.download_directory(bucket, prefix, out_path); + } + uploadStream(bucket, key) { // Return [stream, promise, unique_key] to match native wrapper API const unique_key = storage.unique_name(key); diff --git a/benchmarks/wrappers/cloudflare/nodejs/storage.js b/benchmarks/wrappers/cloudflare/nodejs/storage.js index a49cc3347..3ba303839 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/storage.js +++ b/benchmarks/wrappers/cloudflare/nodejs/storage.js @@ -127,6 +127,10 @@ class storage { } } + async downloadDirectory(bucket, prefix, out_path) { + 
return this.download_directory(bucket, prefix, out_path); + } + async upload_stream(__bucket, key, data) { const instance = storage.instance || this; const unique_key = storage.unique_name(key); From 1e402e477fd8c77ee3a0ceb6f1465ec3201152b9 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:35:09 +0200 Subject: [PATCH 161/230] black reformat --- sebs/cloudflare/r2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index b7cb64c7a..4450f3948 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -99,7 +99,7 @@ def correct_name(self, name: str) -> str: def _create_bucket( self, name: str, buckets: Optional[List[str]] = None, randomize_name: bool = False ) -> str: - for bucket_name in (buckets or []): + for bucket_name in buckets or []: if name in bucket_name: self.logging.info( "Bucket {} for {} already exists, skipping.".format(bucket_name, name) From 93406968595cd7f567bb5675607fc4bf5613e00e Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:39:57 +0200 Subject: [PATCH 162/230] refactor(cloudflare): enhance function cache handling and redeployment logic --- sebs/cloudflare/cloudflare.py | 58 +++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 25a41d97f..e1def3e2d 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -747,7 +747,9 @@ def cached_function(self, function: Function): """ Handle a function retrieved from cache. - Refreshes triggers and logging handlers. + Refreshes triggers and logging handlers, and verifies the worker still + exists on Cloudflare. If it has been deleted remotely, clear the hash + so the caller's hash-mismatch path triggers a full redeployment. 
Args: function: The cached function @@ -755,6 +757,15 @@ def cached_function(self, function: Function): for trigger in function.triggers(Trigger.TriggerType.HTTP): trigger.logging_handlers = self.logging_handlers + worker = cast(CloudflareWorker, function) + account_id = worker.account_id or self.config.credentials.account_id + if account_id and not self._get_worker(worker.name, account_id): + self.logging.info( + f"Cached worker {worker.name} no longer exists on Cloudflare " + "— will redeploy." + ) + function.code_package_hash = "" + def update_function( self, function: Function, @@ -783,33 +794,26 @@ def update_function( if not account_id: raise RuntimeError("Account ID is required to update worker") - # For container deployments, skip redeployment if code hasn't changed - # Containers don't support runtime memory configuration changes - # Detect container deployment by checking if worker name starts with "container-" - is_container = worker.name.startswith("container-") - - if is_container: - self.logging.info( - f"Skipping redeployment for container worker {worker.name} - " - "containers don't support runtime memory updates" - ) - else: - if package is None: - raise RuntimeError( - f"Code location is not set for {benchmark}. " - "The build step may not have completed successfully." - ) - self._create_or_update_worker( - worker.name, - package, - account_id, - language, - benchmark, - code_package, - container_deployment, - container_uri, + if package is None and container_deployment: + output_dir = code_package._output_dir + if os.path.isdir(output_dir): + package = output_dir + if package is None: + raise RuntimeError( + f"Code location is not set for {benchmark}. " + "The build step may not have completed successfully." 
) - self.logging.info(f"Updated worker {worker.name}") + self._create_or_update_worker( + worker.name, + package, + account_id, + language, + benchmark, + code_package, + container_deployment, + container_uri, + ) + self.logging.info(f"Updated worker {worker.name}") # Update configuration if needed (no-op for containers: no runtime memory changes) self.update_function_configuration(worker, code_package) From 867f6b7bc787b98696500feb2f1797ae3da2d93b Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:46:26 +0200 Subject: [PATCH 163/230] refactor(cloudflare): streamline logging message for worker redeployment --- sebs/cloudflare/cloudflare.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index e1def3e2d..d9d17f0d1 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -761,8 +761,7 @@ def cached_function(self, function: Function): account_id = worker.account_id or self.config.credentials.account_id if account_id and not self._get_worker(worker.name, account_id): self.logging.info( - f"Cached worker {worker.name} no longer exists on Cloudflare " - "— will redeploy." + f"Cached worker {worker.name} no longer exists on Cloudflare " "— will redeploy." 
) function.code_package_hash = "" From c076d79ef7084fcc51ce10c65b7da48bc6f3e52e Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:49:02 +0200 Subject: [PATCH 164/230] refactor(mypy): add missing imports configuration for docker module --- .mypy.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mypy.ini b/.mypy.ini index 5d02772f9..7d2fe13ff 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -3,6 +3,9 @@ [mypy-docker] ignore_missing_imports = True +[mypy-docker.*] +ignore_missing_imports = True + [mypy-tzlocal] ignore_missing_imports = True From 7c8867bbb4ab3e33cde839beafb4581881a2eb73 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 15:52:56 +0200 Subject: [PATCH 165/230] refactor(cloudflare): update docker client type hints for consistency --- sebs/cloudflare/cli.py | 4 ++-- sebs/cloudflare/cloudflare.py | 2 +- sebs/cloudflare/containers.py | 4 ++-- sebs/cloudflare/resources.py | 2 +- sebs/cloudflare/workers.py | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index c2d2d3541..0ee3adddd 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -24,7 +24,7 @@ class CloudflareCLI(LoggingBase): _lock: threading.Lock = threading.Lock() @staticmethod - def get_instance(system_config: SeBSConfig, docker_client: docker.client) -> "CloudflareCLI": + def get_instance(system_config: SeBSConfig, docker_client: docker.client.DockerClient) -> "CloudflareCLI": """Return the shared CloudflareCLI instance, creating it on first use. 
Container and native workers deployments share one underlying CLI @@ -38,7 +38,7 @@ def get_instance(system_config: SeBSConfig, docker_client: docker.client) -> "Cl atexit.register(CloudflareCLI.shutdown_instance) return CloudflareCLI._instance - def __init__(self, system_config: SeBSConfig, docker_client: docker.client): + def __init__(self, system_config: SeBSConfig, docker_client: docker.client.DockerClient): super().__init__() self._stopped = False diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index d9d17f0d1..7a1cebfa1 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -190,7 +190,7 @@ def __init__( sebs_config: SeBSConfig, config: CloudflareConfig, cache_client: Cache, - docker_client: docker.client, + docker_client: docker.client.DockerClient, logger_handlers: LoggingHandlers, ): super().__init__( diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index ec971c8b8..13afd7f59 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -14,7 +14,7 @@ from importlib.resources import files try: - import tomllib # Python 3.11+ + import tomllib # type: ignore[import-not-found] # Python 3.11+ except ImportError: import tomli as tomllib # type: ignore[no-redef] # Fallback for older Python try: @@ -86,7 +86,7 @@ def generate_wrangler_toml( Path to the generated wrangler.toml file """ # Load template - template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-container.toml") + template_path = files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-container.toml") with template_path.open("rb") as f: config = tomllib.load(f) diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index 595289cd2..fd88f7723 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -24,7 +24,7 @@ def __init__( self, config: CloudflareConfig, cache_client: Cache, - docker_client: docker.client, + docker_client: 
docker.client.DockerClient, logging_handlers: LoggingHandlers, ): super().__init__(config, cache_client, docker_client) diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index b3d400e11..fd8d93621 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -11,7 +11,7 @@ from importlib.resources import files try: - import tomllib # Python 3.11+ + import tomllib # type: ignore[import-not-found] # Python 3.11+ except ImportError: import tomli as tomllib # type: ignore[no-redef] # Fallback for older Python try: @@ -79,7 +79,7 @@ def generate_wrangler_toml( Path to the generated wrangler.toml file """ # Load template - template_path = files("sebs.cloudflare").joinpath("templates", "wrangler-worker.toml") + template_path = files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-worker.toml") with template_path.open("rb") as f: config = tomllib.load(f) From 617e7d2ddcf4bfc8066e179c90c5426de7085655 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 16:04:24 +0200 Subject: [PATCH 166/230] refactor(triggers): increase provisioning retries and wait time for HTTP trigger --- sebs/cloudflare/containers.py | 2 +- sebs/cloudflare/triggers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 13afd7f59..6455e97ad 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -384,7 +384,7 @@ def wait_for_container_worker_ready( Returns: True if ready, False if timeout """ - wait_interval = 10 + wait_interval = 20 start_time = time.time() self.logging.info("Checking container worker readiness via health endpoint...") diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 4bd1fb3b7..8b35be973 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -148,8 +148,8 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec def sync_invoke(self, payload: dict) 
-> ExecutionResult: """Synchronously invoke a Cloudflare Worker via HTTP.""" self.logging.debug(f"Invoke function {self.url}") - max_provisioning_retries = 6 - provisioning_retry_wait = 30 # seconds between retries + max_provisioning_retries = 10 + provisioning_retry_wait = 60 # seconds between retries for attempt in range(max_provisioning_retries + 1): try: result = self._http_invoke(payload, self.url) From 88e232b3b602c8f62af95f86fce75610db4fe1c9 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 16:23:28 +0200 Subject: [PATCH 167/230] refactor(cli): update Docker image tag handling to include versioning --- .../cloudflare/python/container/storage.py | 5 +---- sebs/cloudflare/cli.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index d639754e8..8c9a32fcb 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -238,10 +238,7 @@ def download_directory(self, bucket, prefix, local_path): def _download_one(obj): obj_key = obj['key'] - relative_path = obj_key - if prefix and obj_key.startswith(prefix): - relative_path = obj_key[len(prefix):].lstrip('/') - local_file_path = os.path.join(local_path, relative_path) + local_file_path = os.path.join(local_path, obj_key) local_dir = os.path.dirname(local_file_path) if local_dir: os.makedirs(local_dir, exist_ok=True) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 0ee3adddd..f55c2851f 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -43,21 +43,25 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client.Docke self._stopped = False repo_name = system_config.docker_repository() + sebs_version = system_config.version() image_name = "manage.cloudflare" + versioned_tag = f"{image_name}-{sebs_version}" try: - 
docker_client.images.get(repo_name + ":" + image_name) + docker_client.images.get(repo_name + ":" + versioned_tag) except docker.errors.ImageNotFound: + logging.info( + "Docker pull of image {repo}:{tag}".format(repo=repo_name, tag=versioned_tag) + ) try: - logging.info( - "Docker pull of image {repo}:{image}".format(repo=repo_name, image=image_name) + docker_client.images.pull(repo_name, tag=versioned_tag) + except (docker.errors.APIError, docker.errors.ImageNotFound) as e: + raise RuntimeError( + "Docker pull of image {}:{} failed: {}".format(repo_name, versioned_tag, e) ) - docker_client.images.pull(repo_name, image_name) - except docker.errors.APIError: - raise RuntimeError("Docker pull of image {} failed!".format(image_name)) # Start the container in detached mode self.docker_instance = docker_client.containers.run( - image=repo_name + ":" + image_name, + image=repo_name + ":" + versioned_tag, command="/bin/bash", environment={ "CONTAINER_UID": str(os.getuid()), From 606e1acc312943925de62a0f321cd5708de296c9 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 16:54:11 +0200 Subject: [PATCH 168/230] black --- sebs/cloudflare/cli.py | 4 +++- sebs/cloudflare/containers.py | 4 +++- sebs/cloudflare/workers.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index f55c2851f..4cb27c238 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -24,7 +24,9 @@ class CloudflareCLI(LoggingBase): _lock: threading.Lock = threading.Lock() @staticmethod - def get_instance(system_config: SeBSConfig, docker_client: docker.client.DockerClient) -> "CloudflareCLI": + def get_instance( + system_config: SeBSConfig, docker_client: docker.client.DockerClient + ) -> "CloudflareCLI": """Return the shared CloudflareCLI instance, creating it on first use. 
Container and native workers deployments share one underlying CLI diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 6455e97ad..be8cdd248 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -86,7 +86,9 @@ def generate_wrangler_toml( Path to the generated wrangler.toml file """ # Load template - template_path = files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-container.toml") + template_path = ( + files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-container.toml") + ) with template_path.open("rb") as f: config = tomllib.load(f) diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index fd8d93621..5da444c7f 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -79,7 +79,9 @@ def generate_wrangler_toml( Path to the generated wrangler.toml file """ # Load template - template_path = files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-worker.toml") + template_path = ( + files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-worker.toml") + ) with template_path.open("rb") as f: config = tomllib.load(f) From c123da6079ba31030ae70666c08f62599b0d4d0b Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 16:56:53 +0200 Subject: [PATCH 169/230] refactor(cli): add assertion for output type in command execution --- sebs/cloudflare/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 4cb27c238..848bda444 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -117,6 +117,7 @@ def execute(self, cmd: str, env: Optional[dict] = None): user="root", environment=env, ) + assert isinstance(out, bytes) if exit_code != 0: raise RuntimeError( "Command {} failed at Cloudflare CLI docker!\n Output {}".format( From c9c53720a75f66ee53caae72b825cb7b873d832b Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 27 Apr 2026 17:07:46 +0200 Subject: [PATCH 170/230] 
refactor(cloudflare): enhance docstrings for clarity and consistency across modules --- sebs/benchmark.py | 4 ++++ sebs/cloudflare/__init__.py | 2 ++ sebs/cloudflare/cli.py | 4 ++++ sebs/cloudflare/cloudflare.py | 8 ++++++++ sebs/cloudflare/config.py | 30 ++++++++++++++++++++++++++++++ sebs/cloudflare/containers.py | 2 +- sebs/cloudflare/function.py | 6 ++++++ sebs/cloudflare/kvstore.py | 28 ++++++++++++++++++++++++++++ sebs/cloudflare/r2.py | 11 +++++++++++ sebs/cloudflare/resources.py | 4 ++++ sebs/cloudflare/triggers.py | 9 +++++++++ sebs/cloudflare/workers.py | 2 +- sebs/regression.py | 4 ++++ 13 files changed, 112 insertions(+), 2 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 49e624ef3..b02afe8f7 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -70,11 +70,13 @@ class LanguageSpec: """ def __init__(self, language: "Language", variants: Dict[str, Any]): + """Store the language and its variant-to-directory mapping.""" self._language = language self._variants = variants @property def language(self) -> "Language": + """The programming language this spec applies to.""" return self._language @property @@ -110,6 +112,7 @@ def resolve_dir(self, variant: str, container_deployment: bool) -> str: @staticmethod def deserialize(val) -> "LanguageSpec": + """Build a LanguageSpec from a config.json language entry (string or dict).""" if isinstance(val, str): # Legacy: "python" → only the default variant return LanguageSpec(Language.deserialize(val), {"default": "default"}) @@ -124,6 +127,7 @@ def deserialize(val) -> "LanguageSpec": return LanguageSpec(Language.deserialize(val["language"]), variants) def serialize(self) -> dict: + """Return a serializable dict of the language and its variant mapping.""" return { "language": self._language.value, "variants": self._variants, diff --git a/sebs/cloudflare/__init__.py b/sebs/cloudflare/__init__.py index 5a2c557d3..d8d5c47e6 100644 --- a/sebs/cloudflare/__init__.py +++ b/sebs/cloudflare/__init__.py @@ 
-1,3 +1,5 @@ +"""Cloudflare Workers serverless platform implementation.""" + from sebs.cloudflare.cloudflare import Cloudflare from sebs.cloudflare.config import CloudflareConfig diff --git a/sebs/cloudflare/cli.py b/sebs/cloudflare/cli.py index 848bda444..6f738a870 100644 --- a/sebs/cloudflare/cli.py +++ b/sebs/cloudflare/cli.py @@ -1,3 +1,5 @@ +"""Cloudflare CLI container management for wrangler-based deployments.""" + import atexit import io import logging @@ -41,6 +43,7 @@ def get_instance( return CloudflareCLI._instance def __init__(self, system_config: SeBSConfig, docker_client: docker.client.DockerClient): + """Pull the manage image if needed and start the CLI container.""" super().__init__() self._stopped = False @@ -95,6 +98,7 @@ def __init__(self, system_config: SeBSConfig, docker_client: docker.client.Docke @staticmethod def typename() -> str: + """Return the canonical type name for this class.""" return "Cloudflare.CLI" def execute(self, cmd: str, env: Optional[dict] = None): diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 7a1cebfa1..15009e699 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1,3 +1,5 @@ +"""Cloudflare Workers platform implementation for SeBS.""" + import os import uuid import time @@ -32,6 +34,7 @@ class _CloudflareContainerAdapter: """ def __init__(self, containers_deployment: CloudflareContainersDeployment): + """Initialize the adapter with the given containers deployment handler.""" self._containers = containers_deployment # Populated by build_base_image() so create_function() can find the dir. 
self.last_directory: Optional[str] = None @@ -116,18 +119,22 @@ class Cloudflare(System): @staticmethod def name(): + """Return the platform name used in configuration and cache keys.""" return "cloudflare" @staticmethod def typename(): + """Return the human-readable type name for this platform.""" return "Cloudflare" @staticmethod def function_type() -> "Type[Function]": + """Return the Function subclass used by this platform.""" return CloudflareWorker @property def config(self) -> CloudflareConfig: + """Return the Cloudflare-specific platform configuration.""" return self._config def is_benchmark_supported( @@ -193,6 +200,7 @@ def __init__( docker_client: docker.client.DockerClient, logger_handlers: LoggingHandlers, ): + """Initialize the Cloudflare platform with credentials and deployment handlers.""" super().__init__( sebs_config, cache_client, diff --git a/sebs/cloudflare/config.py b/sebs/cloudflare/config.py index 2219a8def..c8a7a3dd3 100644 --- a/sebs/cloudflare/config.py +++ b/sebs/cloudflare/config.py @@ -1,3 +1,5 @@ +"""Configuration classes for the Cloudflare Workers platform.""" + import os from typing import Optional, cast @@ -38,6 +40,7 @@ def __init__( r2_access_key_id: Optional[str] = None, r2_secret_access_key: Optional[str] = None, ): + """Store Cloudflare API credentials supplied at construction time.""" super().__init__() self._api_token = api_token @@ -49,34 +52,42 @@ def __init__( @staticmethod def typename() -> str: + """Return the canonical type name for this credentials class.""" return "Cloudflare.Credentials" @property def api_token(self) -> Optional[str]: + """Scoped API token for Cloudflare authentication.""" return self._api_token @property def email(self) -> Optional[str]: + """Account email used with the Global API Key authentication method.""" return self._email @property def api_key(self) -> Optional[str]: + """Global API Key used with the email authentication method.""" return self._api_key @property def account_id(self) -> 
Optional[str]: + """Cloudflare account ID required for all API operations.""" return self._account_id @property def r2_access_key_id(self) -> Optional[str]: + """S3-compatible access key ID for R2 bucket operations.""" return self._r2_access_key_id @property def r2_secret_access_key(self) -> Optional[str]: + """S3-compatible secret access key for R2 bucket operations.""" return self._r2_secret_access_key @staticmethod def initialize(dct: dict) -> "CloudflareCredentials": + """Build a CloudflareCredentials instance from a plain dictionary.""" return CloudflareCredentials( dct.get("api_token"), dct.get("email"), @@ -88,6 +99,7 @@ def initialize(dct: dict) -> "CloudflareCredentials": @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Credentials: + """Load credentials from config dict, falling back to environment variables.""" cached_config = cache.get_config("cloudflare") ret: CloudflareCredentials account_id: Optional[str] = None @@ -135,12 +147,14 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden return ret def update_cache(self, cache: Cache): + """Persist the account ID to the local cache.""" if self._account_id: cache.update_config( val=self._account_id, keys=["cloudflare", "credentials", "account_id"] ) def serialize(self) -> dict: + """Return a serializable dict of non-secret credential fields.""" out = {} if self._account_id: out["account_id"] = self._account_id @@ -153,23 +167,28 @@ class CloudflareResources(Resources): """ def __init__(self): + """Initialize Cloudflare resources with no namespace ID assigned.""" super().__init__(name="cloudflare") self._namespace_id: Optional[str] = None @staticmethod def typename() -> str: + """Return the canonical type name for this resources class.""" return "Cloudflare.Resources" @property def namespace_id(self) -> Optional[str]: + """KV namespace ID associated with this resource deployment.""" return self._namespace_id @namespace_id.setter def 
namespace_id(self, value: str): + """Set the KV namespace ID for this resource deployment.""" self._namespace_id = value @staticmethod def initialize(res: Resources, dct: dict): + """Populate a CloudflareResources instance from a config dictionary.""" ret = cast(CloudflareResources, res) super(CloudflareResources, CloudflareResources).initialize(ret, dct) @@ -179,12 +198,14 @@ def initialize(res: Resources, dct: dict): return ret def serialize(self) -> dict: + """Return a serializable dict of Cloudflare resource fields.""" out = {**super().serialize()} if self._namespace_id: out["namespace_id"] = self._namespace_id return out def update_cache(self, cache: Cache): + """Persist resource IDs to the local cache.""" super().update_cache(cache) if self._namespace_id: cache.update_config( @@ -193,6 +214,7 @@ def update_cache(self, cache: Cache): @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Resources: + """Load resources from cached or user-provided configuration.""" ret = CloudflareResources() cached_config = cache.get_config("cloudflare") @@ -223,30 +245,36 @@ class CloudflareConfig(Config): """ def __init__(self, credentials: CloudflareCredentials, resources: CloudflareResources): + """Initialize configuration with the given credentials and resources.""" super().__init__(name="cloudflare") self._credentials = credentials self._resources = resources @staticmethod def typename() -> str: + """Return the canonical type name for this configuration class.""" return "Cloudflare.Config" @property def credentials(self) -> CloudflareCredentials: + """Cloudflare API credentials for this configuration.""" return self._credentials @property def resources(self) -> CloudflareResources: + """Cloudflare resource identifiers for this deployment.""" return self._resources @staticmethod def initialize(cfg: Config, dct: dict): + """Apply region and other fields from a config dictionary to an existing instance.""" config = cast(CloudflareConfig, 
cfg) # Cloudflare Workers are globally distributed, no region needed config._region = dct.get("region", "global") @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config: + """Build a CloudflareConfig from user config and cache, resolving credentials.""" cached_config = cache.get_config("cloudflare") credentials = cast( CloudflareCredentials, CloudflareCredentials.deserialize(config, cache, handlers) @@ -269,11 +297,13 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config return config_obj def update_cache(self, cache: Cache): + """Persist region, credentials, and resources to the local cache.""" cache.update_config(val=self.region, keys=["cloudflare", "region"]) self.credentials.update_cache(cache) self.resources.update_cache(cache) def serialize(self) -> dict: + """Return a serializable dict of the full Cloudflare configuration.""" out = { "name": "cloudflare", "region": self._region, diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index be8cdd248..433e26523 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -16,7 +16,7 @@ try: import tomllib # type: ignore[import-not-found] # Python 3.11+ except ImportError: - import tomli as tomllib # type: ignore[no-redef] # Fallback for older Python + import tomli as tomllib # type: ignore[no-redef, import-not-found] # Fallback for older Python try: import tomli_w except ImportError: diff --git a/sebs/cloudflare/function.py b/sebs/cloudflare/function.py index cbf25a9fb..203aa3416 100644 --- a/sebs/cloudflare/function.py +++ b/sebs/cloudflare/function.py @@ -1,3 +1,5 @@ +"""Cloudflare Workers function and trigger definitions.""" + from typing import Optional from sebs.faas.function import Function, FunctionConfig @@ -20,6 +22,7 @@ def __init__( cfg: FunctionConfig, account_id: Optional[str] = None, ): + """Create a CloudflareWorker with the given script ID, runtime, and account.""" 
super().__init__(benchmark, name, code_package_hash, cfg) self.script_id = script_id self.runtime = runtime @@ -27,9 +30,11 @@ def __init__( @staticmethod def typename() -> str: + """Return the canonical type name for this function class.""" return "Cloudflare.Worker" def serialize(self) -> dict: + """Return a serializable dict including script ID, runtime, and account.""" return { **super().serialize(), "script_id": self.script_id, @@ -39,6 +44,7 @@ def serialize(self) -> dict: @staticmethod def deserialize(cached_config: dict) -> "CloudflareWorker": + """Reconstruct a CloudflareWorker from a cached configuration dict.""" from sebs.cloudflare.triggers import HTTPTrigger cfg = FunctionConfig.deserialize(cached_config["config"]) diff --git a/sebs/cloudflare/kvstore.py b/sebs/cloudflare/kvstore.py index 9e8bf923a..163df9685 100644 --- a/sebs/cloudflare/kvstore.py +++ b/sebs/cloudflare/kvstore.py @@ -1,3 +1,5 @@ +"""Cloudflare KV namespace-backed NoSQL storage implementation.""" + import hashlib import json import re @@ -72,10 +74,12 @@ class KVStore(NoSQLStorage): @staticmethod def typename() -> str: + """Return the canonical type name for this storage class.""" return "Cloudflare.KVStore" @staticmethod def deployment_name() -> str: + """Return the deployment platform name.""" return "cloudflare" def __init__( @@ -85,18 +89,21 @@ def __init__( resources: Resources, credentials: CloudflareCredentials, ): + """Initialize KV storage with Cloudflare credentials.""" super().__init__(region, cache_client, resources) self._credentials = credentials # benchmark -> logical table name -> KV namespace id self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) def _account_id(self) -> str: + """Return the account ID, raising if not configured.""" account_id = self._credentials.account_id if not account_id: raise RuntimeError("Cloudflare account ID is required for KV operations") return account_id def _kv_api_base(self) -> str: + """Return the base URL for the Cloudflare KV 
namespace API.""" account = self._account_id() return f"https://api.cloudflare.com/client/v4/accounts/{account}/storage/kv/namespaces" @@ -118,19 +125,23 @@ def _get_auth_headers(self, content_type: str = "application/json") -> dict[str, @classmethod def _is_namespace_id(cls, value: str) -> bool: + """Return True if value matches the 32-character hex namespace ID pattern.""" return bool(cls.NAMESPACE_ID_PATTERN.fullmatch(value)) def _resource_id(self) -> str: + """Return the resource prefix used in namespace titles.""" if self._cloud_resources.has_resources_id: return self._cloud_resources.resources_id return "default" @staticmethod def _sanitize_component(value: str) -> str: + """Replace characters not allowed in KV namespace titles with hyphens.""" sanitized = re.sub(r"[^A-Za-z0-9_-]", "-", value) return sanitized.strip("-") or "default" def _namespace_title(self, benchmark: str, table: str) -> str: + """Build a deterministic KV namespace title for the given benchmark and table.""" title = ( f"sebs-nosql-{self._sanitize_component(self._resource_id())}-" f"{self._sanitize_component(benchmark)}-{self._sanitize_component(table)}" @@ -143,6 +154,7 @@ def _namespace_title(self, benchmark: str, table: str) -> str: return title def _list_namespaces(self) -> List[dict]: + """Fetch all KV namespaces for the account, following pagination.""" namespaces: List[dict] = [] page = 1 per_page = 100 @@ -171,12 +183,14 @@ def _list_namespaces(self) -> List[dict]: return namespaces def _find_namespace_id_by_title(self, title: str) -> Optional[str]: + """Return the namespace ID whose title matches, or None if not found.""" for namespace in self._list_namespaces(): if namespace.get("title") == title: return namespace.get("id") return None def _delete_namespace(self, namespace_id: str) -> None: + """Delete the KV namespace with the given ID, ignoring 404 responses.""" response = requests.delete( f"{self._kv_api_base()}/{namespace_id}", headers=self._get_auth_headers(), @@ -196,15 
+210,18 @@ def _delete_namespace(self, namespace_id: str) -> None: def _compose_key( primary_key: Tuple[str, str], secondary_key: Optional[Tuple[str, str]] = None ) -> str: + """Build the KV storage key from primary and optional secondary key tuples.""" if secondary_key is None: return str(primary_key[1]) return f"{primary_key[1]}#{secondary_key[1]}" @staticmethod def _index_key(primary_value: str) -> str: + """Return the KV key used to store the secondary-key index for a primary value.""" return f"__sebs_idx__{primary_value}" def _read_index(self, namespace_id: str, primary_value: str) -> List[str]: + """Fetch the list of secondary-key values stored in the index for primary_value.""" index_key = quote(self._index_key(primary_value), safe="") response = requests.get( f"{self._kv_api_base()}/{namespace_id}/values/{index_key}", @@ -229,6 +246,7 @@ def _read_index(self, namespace_id: str, primary_value: str) -> List[str]: return [str(v) for v in parsed] def _write_index(self, namespace_id: str, primary_value: str, values: List[str]) -> None: + """Persist the secondary-key index for primary_value to KV storage.""" index_key = quote(self._index_key(primary_value), safe="") response = requests.put( f"{self._kv_api_base()}/{namespace_id}/values/{index_key}", @@ -238,13 +256,16 @@ def _write_index(self, namespace_id: str, primary_value: str, values: List[str]) response.raise_for_status() def _get_tables(self) -> Dict[str, List[str]]: + """Return all cached table names grouped by benchmark.""" tables = self.cache_client.get_nosql_configs(self.deployment_name()) return {benchmark: list(v.values()) for benchmark, v in tables.items()} def get_tables(self, benchmark: str) -> Dict[str, str]: + """Return the table-name-to-namespace-ID mapping for the given benchmark.""" return self._tables[benchmark] def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + """Return the namespace ID for the given benchmark and logical table name, or None.""" if benchmark not in 
self._tables: return None if table not in self._tables[benchmark]: @@ -252,6 +273,7 @@ def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: return self._tables[benchmark][table] def retrieve_cache(self, benchmark: str) -> bool: + """Load cached KV namespace mappings for a benchmark; return True if found.""" if benchmark in self._tables: return True @@ -277,6 +299,7 @@ def retrieve_cache(self, benchmark: str) -> bool: return True def update_cache(self, benchmark: str): + """Persist the current KV namespace mappings for a benchmark to the cache.""" self.cache_client.update_nosql( self.deployment_name(), benchmark, @@ -287,6 +310,7 @@ def update_cache(self, benchmark: str): def create_table( self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None ) -> str: + """Create or reuse a KV namespace for the given benchmark and table name.""" # Unused in KV namespace allocation, kept for interface compatibility _ = primary_key, secondary_key @@ -344,6 +368,7 @@ def write_to_table( primary_key: Tuple[str, str], secondary_key: Optional[Tuple[str, str]] = None, ): + """Write a record to the KV namespace, updating the secondary-key index if needed.""" namespace_id = self._get_table_name(benchmark, table) if not namespace_id: raise ValueError(f"Table {table} not found for benchmark {benchmark}") @@ -372,6 +397,7 @@ def write_to_table( self._write_index(namespace_id, primary_value, index_values) def clear_table(self, name: str) -> str: + """Log a warning; KV does not support bulk clear — use remove_table + create_table.""" self.logging.warning( "clear_table is not implemented for Cloudflare KV. " "Use remove_table() + create_table() instead." 
@@ -379,6 +405,7 @@ def clear_table(self, name: str) -> str: return name def remove_table(self, name: str) -> str: + """Delete the KV namespace identified by logical name or namespace ID.""" benchmark_to_modify: Optional[str] = None logical_name_to_delete: Optional[str] = None namespace_id_to_delete: Optional[str] = None @@ -410,4 +437,5 @@ def remove_table(self, name: str) -> str: return name def envs(self) -> dict: + """Return environment variables required by benchmarks to access KV storage.""" return {"NOSQL_STORAGE_DATABASE": "kvstore"} diff --git a/sebs/cloudflare/r2.py b/sebs/cloudflare/r2.py index 4450f3948..4e0d65790 100644 --- a/sebs/cloudflare/r2.py +++ b/sebs/cloudflare/r2.py @@ -1,3 +1,5 @@ +"""Cloudflare R2 object storage implementation.""" + import os import requests @@ -10,20 +12,26 @@ class R2(PersistentStorage): + """Cloudflare R2 object storage backend for SeBS benchmarks.""" + @staticmethod def typename() -> str: + """Return the canonical type name for this storage class.""" return "Cloudflare.R2" @staticmethod def deployment_name() -> str: + """Return the deployment platform name.""" return "cloudflare" @property def replace_existing(self) -> bool: + """Whether existing objects should be overwritten on upload.""" return self._replace_existing @replace_existing.setter def replace_existing(self, val: bool): + """Set whether existing objects should be overwritten on upload.""" self._replace_existing = val def __init__( @@ -34,6 +42,7 @@ def __init__( replace_existing: bool, credentials: CloudflareCredentials, ): + """Initialize R2 storage with Cloudflare credentials.""" super().__init__(region, cache_client, resources, replace_existing) self._credentials = credentials self._s3_client = None @@ -94,11 +103,13 @@ def _get_s3_client(self): return None def correct_name(self, name: str) -> str: + """Return the bucket name unchanged; R2 does not require name transformations.""" return name def _create_bucket( self, name: str, buckets: 
Optional[List[str]] = None, randomize_name: bool = False ) -> str: + """Create an R2 bucket, reusing an existing one if the name is already present.""" for bucket_name in buckets or []: if name in bucket_name: self.logging.info( diff --git a/sebs/cloudflare/resources.py b/sebs/cloudflare/resources.py index fd88f7723..77e2988a2 100644 --- a/sebs/cloudflare/resources.py +++ b/sebs/cloudflare/resources.py @@ -1,3 +1,5 @@ +"""Cloudflare system resources manager.""" + import docker from typing import Optional, cast @@ -27,12 +29,14 @@ def __init__( docker_client: docker.client.DockerClient, logging_handlers: LoggingHandlers, ): + """Initialize Cloudflare system resources with config and logging handlers.""" super().__init__(config, cache_client, docker_client) self._config = config self.logging_handlers = logging_handlers @property def config(self) -> CloudflareConfig: + """Return the Cloudflare-specific platform configuration.""" return cast(CloudflareConfig, self._config) def _get_auth_headers(self) -> dict[str, str]: diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 8b35be973..1311c1e99 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -1,3 +1,5 @@ +"""HTTP trigger implementation for Cloudflare Workers.""" + from typing import Optional import concurrent.futures import json @@ -21,25 +23,30 @@ class HTTPTrigger(Trigger): """ def __init__(self, worker_name: str, url: Optional[str] = None): + """Initialize the HTTP trigger with the worker name and optional URL.""" super().__init__() self.worker_name = worker_name self._url = url @staticmethod def typename() -> str: + """Return the canonical type name for this trigger class.""" return "Cloudflare.HTTPTrigger" @staticmethod def trigger_type() -> Trigger.TriggerType: + """Return the trigger type enum value.""" return Trigger.TriggerType.HTTP @property def url(self) -> str: + """HTTPS endpoint URL for invoking the worker.""" assert self._url is not None, "HTTP trigger 
URL has not been set" return self._url @url.setter def url(self, url: str): + """Set the HTTPS endpoint URL for the worker.""" self._url = url def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> ExecutionResult: @@ -208,6 +215,7 @@ def async_invoke(self, payload: dict) -> concurrent.futures.Future: return fut def serialize(self) -> dict: + """Return a serializable dict with the trigger type, worker name, and URL.""" return { "type": self.typename(), "worker_name": self.worker_name, @@ -216,5 +224,6 @@ def serialize(self) -> dict: @staticmethod def deserialize(obj: dict) -> "HTTPTrigger": + """Reconstruct an HTTPTrigger from a serialized dict.""" trigger = HTTPTrigger(obj["worker_name"], obj.get("url")) return trigger diff --git a/sebs/cloudflare/workers.py b/sebs/cloudflare/workers.py index 5da444c7f..4add1ccc9 100644 --- a/sebs/cloudflare/workers.py +++ b/sebs/cloudflare/workers.py @@ -13,7 +13,7 @@ try: import tomllib # type: ignore[import-not-found] # Python 3.11+ except ImportError: - import tomli as tomllib # type: ignore[no-redef] # Fallback for older Python + import tomli as tomllib # type: ignore[no-redef, import-not-found] # Fallback for older Python try: import tomli_w except ImportError: diff --git a/sebs/regression.py b/sebs/regression.py index 21fe43278..ae1aee166 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -1181,6 +1181,7 @@ class CloudflareTestSequencePythonWorkers( """Test suite for Python benchmarks on Cloudflare Workers.""" def get_deployment(self, benchmark_name, architecture, deployment_type): + """Return an initialized Cloudflare deployment client for Python workers.""" deployment_name = "cloudflare" assert cloud_config, "Cloud configuration is required" @@ -1211,6 +1212,7 @@ class CloudflareTestSequencePythonContainers( """Test suite for Python benchmarks on Cloudflare Containers.""" def get_deployment(self, benchmark_name, architecture, deployment_type): + """Return an initialized Cloudflare deployment 
client for Python containers.""" deployment_name = "cloudflare" assert cloud_config, "Cloud configuration is required" @@ -1241,6 +1243,7 @@ class CloudflareTestSequenceNodejsWorkers( """Test suite for Node.js benchmarks on Cloudflare Workers.""" def get_deployment(self, benchmark_name, architecture, deployment_type): + """Return an initialized Cloudflare deployment client for Node.js workers.""" deployment_name = "cloudflare" assert cloud_config, "Cloud configuration is required" @@ -1271,6 +1274,7 @@ class CloudflareTestSequenceNodejsContainers( """Test suite for Node.js benchmarks on Cloudflare Containers.""" def get_deployment(self, benchmark_name, architecture, deployment_type): + """Return an initialized Cloudflare deployment client for Node.js containers.""" deployment_name = "cloudflare" assert cloud_config, "Cloud configuration is required" From 2053171c153193e32c174a64e3eaf816440fdd72 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 29 Apr 2026 15:13:13 +0200 Subject: [PATCH 171/230] refactor(health-check): enhance comments for clarity on health check endpoint and provisioning logic --- .../wrappers/cloudflare/nodejs/container/worker.js | 12 +++++++++++- sebs/cloudflare/containers.py | 10 +++++++++- sebs/cloudflare/triggers.py | 10 +++++++++- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index a46146d22..96fe1a335 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -16,7 +16,17 @@ export default { async fetch(request, env) { const url = new URL(request.url); - // Health check endpoint + // Health check endpoint — used by the SeBS harness (sebs/cloudflare/containers.py) + // to detect when the worker is provisioned before running benchmarks. 
+ // + // Why a dedicated endpoint: a first successful benchmark HTTP response (HTTP 200) would + // also confirm the worker is up, but container startup time is highly variable, so the + // harness cannot know how long to wait before attempting that first call. The /health + // endpoint gives a defined starting point: a 200 here means (1) the Cloudflare Worker + // itself is reachable AND (2) the Durable Object / container binding is instantiated. + // Only a short gap remains until the benchmark handler is fully ready, which + // sync_invoke retries in triggers.py can cover cheaply — instead of retrying across + // the entire variable-length provisioning window. if (url.pathname === '/health' || url.pathname === '/_health') { try { const containerId = 'default'; diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 433e26523..33b6de4e3 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -378,13 +378,21 @@ def wait_for_container_worker_ready( """ Wait for container worker to be fully provisioned and ready. + Polls /health instead of issuing a benchmark request directly because container + startup time is highly variable. A 200 from /health confirms that (1) the Worker + is reachable and (2) the Durable Object / container binding is instantiated — at + which point only a short gap remains before the benchmark handler is fully ready. + That residual gap is covered by the sync_invoke retry loop in triggers.py, keeping + the retry window small and predictable rather than spanning the entire provisioning + window from scratch. 
+ Args: worker_name: Name of the worker worker_url: URL of the worker max_wait_seconds: Maximum time to wait in seconds Returns: - True if ready, False if timeout + True if ready, raises RuntimeError on timeout """ wait_interval = 20 start_time = time.time() diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 1311c1e99..1e05990c4 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -153,7 +153,15 @@ def _http_invoke(self, payload: dict, url: str, verify_ssl: bool = True) -> Exec raise RuntimeError(f"Failed invocation of function! Output: {raw_text}") def sync_invoke(self, payload: dict) -> ExecutionResult: - """Synchronously invoke a Cloudflare Worker via HTTP.""" + """ + Synchronously invoke a Cloudflare Worker via HTTP. + + Retries on ContainerProvisioningError to cover the short gap between the /health + check passing (Worker + Durable Object up) and the benchmark handler being fully + ready. The /health gate in containers.py absorbs the unpredictable bulk of + container startup; the retry budget here only needs to bridge the remaining, + much shorter window. 
+ """ self.logging.debug(f"Invoke function {self.url}") max_provisioning_retries = 10 provisioning_retry_wait = 60 # seconds between retries From a083ef38c8666d4f11f17d3a6fc7ec029d0e3510 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 30 Apr 2026 13:49:26 +0200 Subject: [PATCH 172/230] refactor(cloudflare): replace container_deployment with system_variant for clarity --- sebs/benchmark.py | 6 +++--- sebs/cloudflare/cloudflare.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index e8b5e3fd9..7d9d84640 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -533,7 +533,7 @@ def select_variant(self, variant: str) -> None: self._language_variant, self._language_version, self._architecture, - "container" if self._container_deployment else "package", + "container" if self._system_variant.is_container else "package", ) self.query_cache() if self._experiment_config.update_code: @@ -616,7 +616,7 @@ def hash(self) -> str: self._deployment_name, self.language, self._language_variant, - container_deployment=self._container_deployment, + container_deployment=self._system_variant.is_container, ) assert self._hash_value is not None return self._hash_value @@ -895,7 +895,7 @@ def copy_code(self, output_dir: str) -> None: if self._language_variant != "default": lang_spec = self.benchmark_config.get_language_spec(self.language) overlay_dir_name = lang_spec.resolve_dir( - self._language_variant, self._container_deployment + self._language_variant, self._system_variant.is_container ) if overlay_dir_name != "default": diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 15009e699..3d66a5a2f 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -18,6 +18,7 @@ from sebs.config import SeBSConfig from sebs.utils import LoggingHandlers from sebs.faas.function import Function, ExecutionResult, Trigger, FunctionConfig +from sebs.experiments.config import 
SystemVariant from sebs.faas.system import System from sebs.faas.config import Resources from sebs.sebs_types import Language @@ -161,7 +162,7 @@ def is_benchmark_supported( def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) -> Function: """Override to validate benchmark support and auto-select cloudflare variant.""" language = code_package.language_name - container_deployment = code_package.container_deployment + container_deployment = code_package.system_variant.is_container benchmark_name = code_package.benchmark if not self.is_benchmark_supported(benchmark_name, language, container_deployment): deployment_type = "container" if container_deployment else "worker" @@ -449,7 +450,7 @@ def create_function( self, code_package: Benchmark, func_name: str, - container_deployment: bool, + system_variant: SystemVariant, container_uri: str | None, ) -> CloudflareWorker: """ @@ -460,12 +461,13 @@ def create_function( Args: code_package: Benchmark containing the function code func_name: Name of the worker - container_deployment: Whether to deploy as container + system_variant: Selected deployment variant container_uri: URI of container image Returns: CloudflareWorker instance """ + container_deployment = system_variant.is_container # For container builds benchmark.build() goes through container_client.build_base_image(), # which does NOT set code_package._code_location. Fall back in order: # 1. 
_CloudflareContainerAdapter.last_directory (set when build actually ran this session) @@ -513,7 +515,7 @@ def create_function( function_cfg, account_id, ) - self.update_function(worker, code_package, container_deployment, container_uri) + self.update_function(worker, code_package, system_variant, container_uri) worker.updated_code = True else: self.logging.info(f"Creating new worker {func_name}") @@ -777,7 +779,7 @@ def update_function( self, function: Function, code_package: Benchmark, - container_deployment: bool, + system_variant: SystemVariant, container_uri: str | None, ): """ @@ -786,9 +788,10 @@ def update_function( Args: function: Existing function instance to update code_package: New benchmark containing the function code - container_deployment: Whether to deploy as container + system_variant: Selected deployment variant container_uri: URI of container image """ + container_deployment = system_variant.is_container worker = cast(CloudflareWorker, function) package = code_package.code_location if package is None and container_deployment: From e1c937a492e2cc6916b979b2f8b488fb3e51a7b7 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 6 May 2026 17:15:22 +0200 Subject: [PATCH 173/230] refactor(cloudflare): update deployment types to reflect correct options for Cloudflare --- configs/systems.json | 2 +- sebs/experiments/config.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/systems.json b/configs/systems.json index 4eaaf73c9..3506e3d35 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -546,6 +546,6 @@ } }, "architecture": ["x64"], - "deployments": ["package", "container"] + "deployments": ["workers", "container"] } } diff --git a/sebs/experiments/config.py b/sebs/experiments/config.py index d88ee6ead..dd881f99f 100644 --- a/sebs/experiments/config.py +++ b/sebs/experiments/config.py @@ -34,6 +34,8 @@ class SystemVariant: # GCP specific "function-gen1", "function-gen2", + # Cloudflare worker deployment + "workers", ] 
def __init__(self, value: str): From e9f95b7040026d8449949363999c0a103b1dc093 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 7 May 2026 18:06:32 +0200 Subject: [PATCH 174/230] refactor(cloudflare): enhance container deployment logic and readiness checks --- .../cloudflare/nodejs/container/worker.js | 55 +----- docs/platforms.md | 2 +- sebs/cloudflare/cloudflare.py | 164 +++++++++++++++++- sebs/cloudflare/containers.py | 71 +------- sebs/cloudflare/triggers.py | 15 +- 5 files changed, 174 insertions(+), 133 deletions(-) diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index 96fe1a335..bd47ea538 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -15,60 +15,7 @@ export class ContainerWorker extends Container { export default { async fetch(request, env) { const url = new URL(request.url); - - // Health check endpoint — used by the SeBS harness (sebs/cloudflare/containers.py) - // to detect when the worker is provisioned before running benchmarks. - // - // Why a dedicated endpoint: a first successful benchmark HTTP response (HTTP 200) would - // also confirm the worker is up, but container startup time is highly variable, so the - // harness cannot know how long to wait before attempting that first call. The /health - // endpoint gives a defined starting point: a 200 here means (1) the Cloudflare Worker - // itself is reachable AND (2) the Durable Object / container binding is instantiated. - // Only a short gap remains until the benchmark handler is fully ready, which - // sync_invoke retries in triggers.py can cover cheaply — instead of retrying across - // the entire variable-length provisioning window. 
- if (url.pathname === '/health' || url.pathname === '/_health') { - try { - const containerId = 'default'; - const id = env.CONTAINER_WORKER.idFromName(containerId); - const stub = env.CONTAINER_WORKER.get(id); - - // Make a simple GET request to the root path to verify container is responsive - const healthRequest = new Request('http://localhost/', { - method: 'GET', - headers: { - 'X-Health-Check': 'true' - } - }); - - const response = await stub.fetch(healthRequest); - - // Container is ready if it responds (even with an error from the benchmark handler) - // A 500 from the handler means the container is running, just not a valid benchmark request - if (response.status >= 200 && response.status < 600) { - return new Response('OK', { status: 200 }); - } else { - return new Response(JSON.stringify({ - error: 'Container not responding', - status: response.status - }), { - status: 503, - headers: { 'Content-Type': 'application/json' } - }); - } - - } catch (error) { - return new Response(JSON.stringify({ - error: 'Container failed to start', - details: error.message, - stack: error.stack - }), { - status: 503, - headers: { 'Content-Type': 'application/json' } - }); - } - } - + try { // Handle NoSQL proxy requests - intercept BEFORE forwarding to container if (url.pathname.startsWith('/nosql/')) { diff --git a/docs/platforms.md b/docs/platforms.md index e4da555d4..c63acf6d4 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -468,7 +468,7 @@ Wrangler templates live alongside the deployment code at `sebs/cloudflare/templa 4. **Deploy** — `CloudflareCLI.wrangler_deploy` runs `npm install && wrangler deploy` inside the `manage.cloudflare` container. `npm install` materializes `node_modules/@cloudflare/containers` (listed in `package.json`) so that wrangler's bundler can resolve the `worker.js` import. Wrangler then deploys the Worker script and creates the Durable-Object-backed container worker backed by the registry image. -5. 
`wait_for_durable_object_ready` polls `/health` until the container reports healthy, then SeBS pings `/health` for ~60 s to keep the Durable Object alive during the container provisioning window before the first measured invocation. +5. **Rollout and instance readiness wait** — `wrangler deploy` compares the newly pushed registry image digest against the image currently running in the container worker. If the digest has changed, Cloudflare starts a rollout: it pulls the new image, replaces running instances, and sets `active_rollout_id` on the container application record for the duration. SeBS polls `GET /accounts/{id}/containers/applications/{uuid}` every 20 s in two phases: first it waits for `active_rollout_id` to disappear (rollout complete, can take up to 10 minutes for large containers), then it waits for `health.instances.healthy >= max_instances`. The `health.instances` sub-object tracks runtime state and is not formally documented by Cloudflare (derived from observed API responses): `starting` = still booting (image pull + Firecracker init), `healthy` = passed health check and ready to serve, `active` = currently handling a request (always 0 until the first invocation). The readiness threshold is `max_instances`, not the top-level `instances` field — in practice `instances = max_instances + 1` because Cloudflare counts one extra Durable Object coordination instance that never appears as healthy. Only once an instance is confirmed running does SeBS proceed to invoke the benchmark, avoiding the "no Container instance available" Durable Object error that would otherwise occur on cold starts. If wrangler reported "no changes" (digest unchanged), no rollout is started and this wait is skipped entirely. 6. `HTTPTrigger` is attached using the `workers.dev` URL. 
diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 3d66a5a2f..04603c3ba 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -639,10 +639,9 @@ def _create_or_update_worker( self.logging.info(f"Uploading package to container: {container_package_path}") cli.upload_package(package_dir, container_package_path) - # Deploy using Wrangler in container - self.logging.info(f"Deploying worker {worker_name} using Wrangler in container...") - try: + self.logging.info(f"Deploying worker {worker_name} using Wrangler in container...") + # pywrangler is used for all native Python workers (packages must be # synced via pyproject.toml before wrangler uploads the bundle). # All other cases — nodejs, containers — use wrangler directly. @@ -655,14 +654,28 @@ def _create_or_update_worker( self.logging.debug(f"Wrangler deploy output: {output}") # Wait for the worker to become reachable before returning. - # Container workers expose /health; native workers are probed - # with a lightweight GET to confirm edge propagation. account_id_val = env.get("CLOUDFLARE_ACCOUNT_ID") worker_url = self._build_workers_dev_url(worker_name, account_id_val) if container_deployment: - self.logging.info("Waiting for container worker to initialize...") - self._containers_deployment.wait_for_container_worker_ready(worker_name, worker_url) + container_name = self._containers_deployment._container_name_from_worker(worker_name) + # Cloudflare compares the newly pushed registry image against the + # image currently running in the container worker. If the image digest + # has changed, wrangler deploy triggers a rollout: Cloudflare pulls the + # new image, replaces the running instances, and sets active_rollout_id + # on the container application record until the rollout finishes. + # If nothing changed (same digest), wrangler reports "no changes" and + # no rollout is started — the container is already on the correct image. 
+ if "no changes" in output.lower(): + self.logging.info( + f"Container {container_name} unchanged, skipping readiness wait." + ) + else: + # A rollout is in progress. Poll the Cloudflare REST API until + # active_rollout_id disappears, which signals that all container + # instances have been replaced and are serving the new image. + self.logging.info("Waiting for container rollout to complete...") + self._wait_for_container_rollout(container_name, account_id) else: self._wait_for_worker_ready(worker_name, worker_url) @@ -697,6 +710,143 @@ def _wait_for_worker_ready( "proceeding anyway — invocation retries will handle residual propagation delay." ) + def _get_container_id(self, container_name: str, account_id: str) -> Optional[str]: + """Resolve a container name to its UUID via the Cloudflare REST API. + + Lists all container applications for the account and returns the UUID + of the one whose name matches container_name, or None if not found yet. + """ + url = f"{self._api_base_url}/accounts/{account_id}/containers/applications" + headers = self._get_auth_headers() + try: + resp = requests.get(url, headers=headers, timeout=30) + if resp.status_code != 200: + return None + items = resp.json().get("result", []) + for item in items: + if item.get("name") == container_name: + return item.get("id") + except requests.exceptions.RequestException: + pass + return None + + def _wait_for_container_rollout( + self, + container_name: str, + account_id: str, + max_wait_seconds: int = 900, + poll_interval: int = 20, + ) -> None: + """Poll the Cloudflare API until the container has rolled out and an instance is running. + + This covers two sequential phases using the same + GET /accounts/{id}/containers/applications/{uuid} endpoint: + + Phase 1 — Rollout: Cloudflare pulls the new image and replaces instances. + active_rollout_id is set for the duration. Large containers (e.g. ML inference + images) can take up to 10 minutes. Do not lower max_wait_seconds aggressively. 
+ + Phase 2 — Instance readiness: After the rollout finishes, Cloudflare must start + at least one container instance before it can accept requests. The top-level + `instances` field is the configured/desired count. Runtime state lives under + `health.instances`: `starting` = still booting, `healthy` = passed health check + and ready to serve, `active` = currently handling a request (always 0 until the + first invocation). We wait until `health.instances.healthy >= max_instances`. + Note: the top-level `instances` field equals `max_instances + 1` because + Cloudflare adds one extra Durable Object coordination instance that never + becomes healthy — `max_instances` is the correct readiness threshold. + This avoids the + first benchmark invocation hitting a "no Container instance available" error + from the Durable Object. + + Args: + container_name: Cloudflare container name (e.g. my-worker-containerworker) + account_id: Cloudflare account ID + max_wait_seconds: Maximum seconds to wait (covers both phases) + poll_interval: Seconds between polls + """ + headers = self._get_auth_headers() + start = time.time() + container_id: Optional[str] = None + rollout_complete = False + + while time.time() - start < max_wait_seconds: + elapsed = int(time.time() - start) + try: + if container_id is None: + container_id = self._get_container_id(container_name, account_id) + if container_id is None: + self.logging.info( + f"Container {container_name} not registered yet... 
({elapsed}s elapsed)" + ) + time.sleep(poll_interval) + continue + self.logging.info(f"Resolved container ID: {container_id}") + + url = f"{self._api_base_url}/accounts/{account_id}/containers/applications/{container_id}" + resp = requests.get(url, headers=headers, timeout=30) + if resp.status_code == 200: + data = resp.json().get("result", resp.json()) + active_rollout = data.get("active_rollout_id") + + if active_rollout: + self.logging.info( + f"Container {container_name} rollout in progress " + f"(rollout_id={active_rollout}, {elapsed}s elapsed)" + ) + else: + if not rollout_complete: + self.logging.info( + f"Container {container_name} rollout complete, " + "waiting for an instance to start..." + ) + rollout_complete = True + + # Phase 2: wait for at least one healthy instance so the + # first benchmark invocation does not hit a cold Durable Object. + # The top-level `instances` field is the configured/desired count, + # not the runtime state. Actual readiness is in health.instances: + # healthy — booted, passed health check, ready to serve (what we need > 0) + # starting — still booting (image pull + firecracker init) + # active — currently handling a request (always 0 until first invocation) + # The top-level `instances` field equals max_instances + 1 in practice: + # Cloudflare appears to count one extra Durable Object coordination + # instance that never appears as healthy. The `health.instances` + # sub-object tracks runtime state per instance (not formally documented + # by Cloudflare at time of writing, derived from observed API responses): + # healthy — passed health check, ready to serve requests + # starting — still booting (image pull + firecracker init) + # active — currently handling a request (0 until first invocation) + # Use max_instances as the readiness threshold since that is the + # configured number of workload instances. 
+ max_instances = data.get("max_instances", 0) + health_instances = data.get("health", {}).get("instances", {}) + healthy = health_instances.get("healthy", 0) + starting = health_instances.get("starting", 0) + self.logging.debug(f"Container {container_name} health: {health_instances}") + if max_instances > 0 and healthy >= max_instances: + self.logging.info( + f"Container {container_name} is ready " + f"({healthy}/{max_instances} instances healthy)." + ) + return + self.logging.info( + f"Container {container_name} awaiting all instances to become healthy " + f"(healthy={healthy}/{max_instances}, starting={starting}, {elapsed}s elapsed)" + ) + else: + self.logging.info( + f"Unexpected API response {resp.status_code} ({elapsed}s elapsed)" + ) + except requests.exceptions.RequestException as e: + self.logging.debug(f"API request failed ({elapsed}s): {e}") + + time.sleep(poll_interval) + + raise RuntimeError( + f"Container {container_name} did not become ready after {max_wait_seconds}s." + ) + def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: """Fetch the workers.dev subdomain for the given account. diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index 33b6de4e3..d4e67b299 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -23,7 +23,6 @@ import toml as tomli_w # type: ignore[no-redef, import-untyped] from typing import Optional, Tuple -import requests from sebs.benchmark import Benchmark from sebs.cloudflare.cli import CloudflareCLI @@ -372,70 +371,16 @@ def _build_container_image_local( self.logging.info(f"Container image built: {image_tag}") return image_tag - def wait_for_container_worker_ready( - self, worker_name: str, worker_url: str, max_wait_seconds: int = 400 - ) -> bool: - """ - Wait for container worker to be fully provisioned and ready. - - Polls /health instead of issuing a benchmark request directly because container - startup time is highly variable. 
A 200 from /health confirms that (1) the Worker - is reachable and (2) the Durable Object / container binding is instantiated — at - which point only a short gap remains before the benchmark handler is fully ready. - That residual gap is covered by the sync_invoke retry loop in triggers.py, keeping - the retry window small and predictable rather than spanning the entire provisioning - window from scratch. - - Args: - worker_name: Name of the worker - worker_url: URL of the worker - max_wait_seconds: Maximum time to wait in seconds + @staticmethod + def _container_name_from_worker(worker_name: str) -> str: + """Return the Cloudflare container name for a given worker name. - Returns: - True if ready, raises RuntimeError on timeout + Cloudflare appends the Durable Object class name (lowercased) to the worker + name to form the container name, e.g.: + worker: container-311-compression-nodejs-18 + container: container-311-compression-nodejs-18-containerworker """ - wait_interval = 20 - start_time = time.time() - - self.logging.info("Checking container worker readiness via health endpoint...") - - while time.time() - start_time < max_wait_seconds: - try: - # Use health check endpoint - response = requests.get(f"{worker_url}/health", timeout=60) - - # 200 = ready - if response.status_code == 200: - self.logging.info("Container worker is ready!") - return True - # 503 = not ready yet - elif response.status_code == 503: - elapsed = int(time.time() - start_time) - self.logging.info( - f"Container worker not ready yet (503 Service Unavailable)... " - f"({elapsed}s elapsed, will retry)" - ) - # Other errors - else: - self.logging.warning( - f"Unexpected status {response.status_code}: {response.text[:100]}" - ) - - except requests.exceptions.Timeout: - elapsed = int(time.time() - start_time) - self.logging.info( - f"Health check timeout (container may be starting)... 
({elapsed}s elapsed)" - ) - except requests.exceptions.RequestException as e: - elapsed = int(time.time() - start_time) - self.logging.debug(f"Connection error ({elapsed}s): {str(e)[:100]}") - - time.sleep(wait_interval) - - raise RuntimeError( - f"Container worker {worker_name} did not become ready after {max_wait_seconds}s. " - "Deployment cannot proceed without a healthy container." - ) + return f"{worker_name}-containerworker" def shutdown(self): """Drop the local CLI reference. The shared container is owned by CloudflareCLI; diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 1e05990c4..5b7a208cb 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -156,15 +156,14 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: """ Synchronously invoke a Cloudflare Worker via HTTP. - Retries on ContainerProvisioningError to cover the short gap between the /health - check passing (Worker + Durable Object up) and the benchmark handler being fully - ready. The /health gate in containers.py absorbs the unpredictable bulk of - container startup; the retry budget here only needs to bridge the remaining, - much shorter window. + For container workers, the deployment path already waits until an instance + is running before returning, so provisioning retries here are a last-resort + safety net only (e.g. the instance was recycled between deployment and the + first invocation). 
""" self.logging.debug(f"Invoke function {self.url}") - max_provisioning_retries = 10 - provisioning_retry_wait = 60 # seconds between retries + max_provisioning_retries = 2 + provisioning_retry_wait = 15 # seconds between retries for attempt in range(max_provisioning_retries + 1): try: result = self._http_invoke(payload, self.url) @@ -172,7 +171,7 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: except ContainerProvisioningError: if attempt < max_provisioning_retries: self.logging.info( - f"Container still provisioning, waiting {provisioning_retry_wait}s " + f"Container not yet ready, waiting {provisioning_retry_wait}s " f"before retry (attempt {attempt + 1}/{max_provisioning_retries})..." ) time.sleep(provisioning_retry_wait) From 960d27fcc901c37464b9ab13e58b23d068f3c051 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 8 May 2026 11:00:34 +0200 Subject: [PATCH 175/230] feat(cloudflare): add compression benchmark using fflate for Cloudflare Workers --- .../300.utilities/311.compression/config.json | 2 +- .../nodejs/cloudflare/function.js | 100 ++++++++++++++++++ .../nodejs/cloudflare/package.json | 9 ++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 benchmarks/300.utilities/311.compression/nodejs/cloudflare/function.js create mode 100644 benchmarks/300.utilities/311.compression/nodejs/cloudflare/package.json diff --git a/benchmarks/300.utilities/311.compression/config.json b/benchmarks/300.utilities/311.compression/config.json index d69311ae3..0b20dbd82 100644 --- a/benchmarks/300.utilities/311.compression/config.json +++ b/benchmarks/300.utilities/311.compression/config.json @@ -13,7 +13,7 @@ "language": "nodejs", "variants": { "default": "default", - "cloudflare": {"workers": "default", "containers": "default"} + "cloudflare": {"workers": "cloudflare", "containers": "default"} } } ], diff --git a/benchmarks/300.utilities/311.compression/nodejs/cloudflare/function.js 
b/benchmarks/300.utilities/311.compression/nodejs/cloudflare/function.js new file mode 100644 index 000000000..8ffba4601 --- /dev/null +++ b/benchmarks/300.utilities/311.compression/nodejs/cloudflare/function.js @@ -0,0 +1,100 @@ +// Cloudflare Workers variant: replaces archiver (which relies on Node streams +// and prototype inheritance that breaks under Workers) with fflate, a pure-JS +// zip library that runs without any Node-specific APIs. +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { zipSync, strToU8 } from 'fflate'; +import { v4 as uuidv4 } from 'uuid'; +import { storage } from './storage'; + +let storage_handler = new storage(); + +function parseDirectory(directory) { + let size = 0; + function walkDir(dir) { + const files = fs.readdirSync(dir); + for (const file of files) { + const filepath = path.join(dir, file); + const stat = fs.statSync(filepath); + if (stat.isDirectory()) { + walkDir(filepath); + } else { + size += stat.size; + } + } + } + walkDir(directory); + return size; +} + +function collectFiles(directory) { + const result = {}; + function walkDir(dir, prefix) { + const files = fs.readdirSync(dir); + for (const file of files) { + const filepath = path.join(dir, file); + const relPath = prefix ? 
`${prefix}/${file}` : file; + const stat = fs.statSync(filepath); + if (stat.isDirectory()) { + walkDir(filepath, relPath); + } else { + result[relPath] = [fs.readFileSync(filepath), { level: 9 }]; + } + } + } + walkDir(directory, ''); + return result; +} + +export const handler = async function(event) { + const bucket = event.bucket.bucket; + const input_prefix = event.bucket.input; + const output_prefix = event.bucket.output; + const key = event.object.key; + + const download_path = path.join('/tmp', `${key}-${uuidv4()}`); + fs.mkdirSync(download_path, { recursive: true }); + + const s3_download_begin = Date.now(); + await storage_handler.downloadDirectory(bucket, path.join(input_prefix, key), download_path); + const s3_download_stop = Date.now(); + + const size = parseDirectory(download_path); + + const compress_begin = Date.now(); + const archive_name = `${key}.zip`; + const archive_path = path.join('/tmp', archive_name); + const files = collectFiles(download_path); + const zipped = zipSync(files); + fs.writeFileSync(archive_path, zipped); + const compress_end = Date.now(); + + const archive_size = fs.statSync(archive_path).size; + + const s3_upload_begin = Date.now(); + const [key_name, uploadPromise] = storage_handler.upload( + bucket, + path.join(output_prefix, archive_name), + archive_path + ); + await uploadPromise; + const s3_upload_stop = Date.now(); + + const download_time = (s3_download_stop - s3_download_begin) * 1000; + const upload_time = (s3_upload_stop - s3_upload_begin) * 1000; + const process_time = (compress_end - compress_begin) * 1000; + + return { + result: { + bucket: bucket, + key: key_name + }, + measurement: { + download_time: download_time, + download_size: size, + upload_time: upload_time, + upload_size: archive_size, + compute_time: process_time + } + }; +}; diff --git a/benchmarks/300.utilities/311.compression/nodejs/cloudflare/package.json b/benchmarks/300.utilities/311.compression/nodejs/cloudflare/package.json new file mode 100644 
index 000000000..0f455e790 --- /dev/null +++ b/benchmarks/300.utilities/311.compression/nodejs/cloudflare/package.json @@ -0,0 +1,9 @@ +{ + "name": "compression-benchmark", + "version": "1.0.0", + "description": "Compression benchmark for serverless platforms", + "dependencies": { + "fflate": "^0.8.2", + "uuid": "^10.0.0" + } +} From f6c5d2a2e4f3a3b15af8d0ace5de18dbc48527c6 Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 8 May 2026 11:12:30 +0200 Subject: [PATCH 176/230] refactor(cloudflare): black formatting --- sebs/cloudflare/cloudflare.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 04603c3ba..33bf405da 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -658,7 +658,9 @@ def _create_or_update_worker( worker_url = self._build_workers_dev_url(worker_name, account_id_val) if container_deployment: - container_name = self._containers_deployment._container_name_from_worker(worker_name) + container_name = self._containers_deployment._container_name_from_worker( + worker_name + ) # Cloudflare compares the newly pushed registry image against the # image currently running in the container worker. 
If the image digest # has changed, wrangler deploy triggers a rollout: Cloudflare pulls the From 1c66b921e8be0a04edaac34ecf662769534eb0ba Mon Sep 17 00:00:00 2001 From: laurin Date: Fri, 8 May 2026 11:18:15 +0200 Subject: [PATCH 177/230] refactor(cloudflare): shorten too long lines --- sebs/cloudflare/cloudflare.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 33bf405da..fd2cca4f9 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -785,7 +785,10 @@ def _wait_for_container_rollout( continue self.logging.info(f"Resolved container ID: {container_id}") - url = f"{self._api_base_url}/accounts/{account_id}/containers/applications/{container_id}" + url = ( + f"{self._api_base_url}/accounts/{account_id}" + f"/containers/applications/{container_id}" + ) resp = requests.get(url, headers=headers, timeout=30) if resp.status_code == 200: data = resp.json().get("result", resp.json()) @@ -808,9 +811,9 @@ def _wait_for_container_rollout( # first benchmark invocation does not hit a cold Durable Object. # The top-level `instances` field is the configured/desired count, # not the runtime state. Actual readiness is in health.instances: - # healthy — booted, passed health check, ready to serve (what we need > 0) + # healthy — passed health check, ready to serve # starting — still booting (image pull + firecracker init) - # active — currently handling a request (always 0 until first invocation) + # active — currently handling a request (0 until first invocation) # The top-level `instances` field equals max_instances + 1 in practice: # Cloudflare appears to count one extra Durable Object coordination # instance that never appears as healthy. 
The `health.instances` @@ -834,7 +837,8 @@ def _wait_for_container_rollout( return self.logging.info( f"Container {container_name} awaiting all instances to become healthy " - f"(healthy={healthy}/{max_instances}, starting={starting}, {elapsed}s elapsed)" + f"(healthy={healthy}/{max_instances}, starting={starting}, " + f"{elapsed}s elapsed)" ) else: self.logging.info( From ed3f23b375244f40731e10b508b2999ce35fa9f9 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 11 May 2026 11:12:53 +0200 Subject: [PATCH 178/230] feat(cloudflare): add CI support for cloudflare. unified the config to all the other platforms. removed specific cloudflare config. --- .github/workflows/_regression-job.yml | 8 ++++++++ .github/workflows/regression.yml | 8 ++++++++ configs/cloudflare-test.json | 26 -------------------------- configs/example.json | 2 ++ 4 files changed, 18 insertions(+), 26 deletions(-) delete mode 100644 configs/cloudflare-test.json diff --git a/.github/workflows/_regression-job.yml b/.github/workflows/_regression-job.yml index 296b2221f..6c6f34c52 100644 --- a/.github/workflows/_regression-job.yml +++ b/.github/workflows/_regression-job.yml @@ -49,6 +49,14 @@ jobs: echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> $GITHUB_ENV echo "AWS_DEFAULT_REGION=${{ secrets.AWS_DEFAULT_REGION || 'us-east-1' }}" >> $GITHUB_ENV + - name: Setup Cloudflare credentials + if: inputs.platform == 'cloudflare' + run: | + echo "CLOUDFLARE_API_TOKEN=${{ secrets.CLOUDFLARE_API_TOKEN }}" >> $GITHUB_ENV + echo "CLOUDFLARE_ACCOUNT_ID=${{ secrets.CLOUDFLARE_ACCOUNT_ID }}" >> $GITHUB_ENV + echo "CLOUDFLARE_R2_ACCESS_KEY_ID=${{ secrets.CLOUDFLARE_R2_ACCESS_KEY_ID }}" >> $GITHUB_ENV + echo "CLOUDFLARE_R2_SECRET_ACCESS_KEY=${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }}" >> $GITHUB_ENV + - name: Install uv uses: astral-sh/setup-uv@v4 diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 429ae1e8e..7724faae6 100644 --- a/.github/workflows/regression.yml 
+++ b/.github/workflows/regression.yml @@ -64,6 +64,14 @@ jobs: language: java version: "17" architecture: "x64" + - platform: cloudflare + language: python + version: "3.11" + architecture: "x64" + - platform: cloudflare + language: nodejs + version: "18" + architecture: "x64" fail-fast: false uses: ./.github/workflows/_regression-job.yml diff --git a/configs/cloudflare-test.json b/configs/cloudflare-test.json deleted file mode 100644 index 275aa021f..000000000 --- a/configs/cloudflare-test.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "experiments": { - "deployment": "cloudflare", - "update_code": false, - "update_storage": false, - "download_results": false, - "architecture": "x64", - "container_deployment": false, - "runtime": { - "language": "nodejs", - "version": "18" - } - }, - "deployment": { - "name": "cloudflare", - "cloudflare": { - "credentials": { - "api_token": "", - "account_id": "", - "r2_access_key_id": "", - "r2_secret_access_key": "" - } - }, - "container": false - } -} diff --git a/configs/example.json b/configs/example.json index a3f3f9b20..44bdba6b6 100644 --- a/configs/example.json +++ b/configs/example.json @@ -99,6 +99,8 @@ "type": "minio" } }, + "cloudflare": { + }, "openwhisk": { "shutdownStorage": false, "removeCluster": false, From c60557ea69621b0438a65ac74fe582e1f8a92673 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 11 May 2026 11:44:58 +0200 Subject: [PATCH 179/230] fix(cloudflare): update Python version to 3.12 in regression tests --- .github/workflows/regression.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 7724faae6..14f4b1fa4 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -66,7 +66,7 @@ jobs: architecture: "x64" - platform: cloudflare language: python - version: "3.11" + version: "3.12" architecture: "x64" - platform: cloudflare language: nodejs From 3fcfb19f3eb2442bc8d99b656fa9ce8c792742fe Mon 
Sep 17 00:00:00 2001 From: laurin Date: Mon, 11 May 2026 12:39:00 +0200 Subject: [PATCH 180/230] feat(cloudflare): add system_variant configuration for Cloudflare test sequences --- sebs/regression.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sebs/regression.py b/sebs/regression.py index f6f414c6e..42bb2c10b 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -1212,6 +1212,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = False + config_copy["experiments"]["system_variant"] = "workers" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1243,6 +1244,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = True + config_copy["experiments"]["system_variant"] = "container" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1274,6 +1276,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = False + config_copy["experiments"]["system_variant"] = "workers" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( @@ -1305,6 +1308,7 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): config_copy = copy.deepcopy(cloud_config) config_copy["experiments"]["architecture"] = architecture config_copy["experiments"]["container_deployment"] = True + 
config_copy["experiments"]["system_variant"] = "container" f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( From 6d27ff14b07f303a2a8fb4f8daaacab4a6cbe9b6 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 11 May 2026 13:21:52 +0200 Subject: [PATCH 181/230] feat(cloudflare): add max_instances configuration for Cloudflare container deployments --- configs/example.json | 1 + sebs/cloudflare/cloudflare.py | 2 ++ sebs/cloudflare/config.py | 8 ++++++++ sebs/cloudflare/containers.py | 2 ++ 4 files changed, 13 insertions(+) diff --git a/configs/example.json b/configs/example.json index 44bdba6b6..d6ff19775 100644 --- a/configs/example.json +++ b/configs/example.json @@ -100,6 +100,7 @@ } }, "cloudflare": { + "max_instances": 1 }, "openwhisk": { "shutdownStorage": false, diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index fd2cca4f9..c8eef2287 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -623,6 +623,8 @@ def _create_or_update_worker( self.logging.info(f"Image pushed to: {container_uri}") # Generate wrangler.toml for this worker (uses registry URI if available) + if container_deployment: + self._containers_deployment.max_instances = self.config.max_instances self._generate_wrangler_toml( worker_name, package_dir, diff --git a/sebs/cloudflare/config.py b/sebs/cloudflare/config.py index c8a7a3dd3..d18c8a56b 100644 --- a/sebs/cloudflare/config.py +++ b/sebs/cloudflare/config.py @@ -249,6 +249,7 @@ def __init__(self, credentials: CloudflareCredentials, resources: CloudflareReso super().__init__(name="cloudflare") self._credentials = credentials self._resources = resources + self._max_instances: int = 10 @staticmethod def typename() -> str: @@ -265,12 +266,19 @@ def resources(self) -> CloudflareResources: """Cloudflare resource identifiers for this deployment.""" return self._resources + @property + def max_instances(self) -> int: 
+ """Maximum number of container instances for container deployments.""" + return self._max_instances + @staticmethod def initialize(cfg: Config, dct: dict): """Apply region and other fields from a config dictionary to an existing instance.""" config = cast(CloudflareConfig, cfg) # Cloudflare Workers are globally distributed, no region needed config._region = dct.get("region", "global") + if "max_instances" in dct: + config._max_instances = int(dct["max_instances"]) @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config: diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index d4e67b299..adbd79ed4 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -48,6 +48,7 @@ def __init__(self, logging, system_config, docker_client, system_resources): self.system_resources = system_resources self._base_image: Optional[str] = None self._cli: Optional[CloudflareCLI] = None + self.max_instances: int = 10 def _get_cli(self) -> CloudflareCLI: """Get or initialize the Cloudflare CLI container.""" @@ -94,6 +95,7 @@ def generate_wrangler_toml( # Update basic configuration config["name"] = worker_name config["account_id"] = account_id + config["containers"][0]["max_instances"] = self.max_instances if container_uri and container_uri.startswith("registry.cloudflare.com"): # Pre-built image already pushed to Cloudflare registry — point wrangler From 0cabc2f53b27ab9468d9b256f4017b61aaed4e2a Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 10:45:09 +0200 Subject: [PATCH 182/230] [workflows] Add FSM state machine abstraction for workflow definitions Co-authored-by: Larissa Schmid Co-authored-by: Marcin Copik --- benchmarks-data | 2 +- sebs/faas/fsm.py | 218 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 sebs/faas/fsm.py diff --git a/benchmarks-data b/benchmarks-data index 30ca2f5c5..dec81ad59 160000 --- 
a/benchmarks-data +++ b/benchmarks-data @@ -1 +1 @@ -Subproject commit 30ca2f5c533c3f441deb5e05fc03a39fe65f9948 +Subproject commit dec81ad593a6b0b921d1104956cd9e2bbbd65586 diff --git a/sebs/faas/fsm.py b/sebs/faas/fsm.py new file mode 100644 index 000000000..039457cd4 --- /dev/null +++ b/sebs/faas/fsm.py @@ -0,0 +1,218 @@ +from abc import ABC +from abc import abstractmethod +from typing import Optional, List, Callable, Union, Dict, Type, Tuple +import json + + +class State(ABC): + def __init__(self, name: str): + self.name = name + + @staticmethod + def deserialize(name: str, payload: dict) -> "State": + cls = _STATE_TYPES[payload["type"]] + return cls.deserialize(name, payload) + + +class Task(State): + def __init__(self, name: str, func_name: str, next: Optional[str], failure: Optional[str]): + self.name = name + self.func_name = func_name + self.next = next + self.failure = failure + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Task": + return cls( + name=name, + func_name=payload["func_name"], + next=payload.get("next"), + failure=payload.get("failure"), + ) + + +class Switch(State): + class Case: + def __init__(self, var: str, op: str, val: str, next: str): + self.var = var + self.op = op + self.val = val + self.next = next + + @staticmethod + def deserialize(payload: dict) -> "Switch.Case": + return Switch.Case(**payload) + + def __init__(self, name: str, cases: List[Case], default: Optional[str]): + self.name = name + self.cases = cases + self.default = default + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Switch": + cases = [Switch.Case.deserialize(c) for c in payload["cases"]] + + return cls(name=name, cases=cases, default=payload["default"]) + + +class Parallel(State): + def __init__(self, name: str, funcs: List, next: Optional[str]): + self.name = name + self.funcs = funcs + self.next = next + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Parallel": + return cls(name=name, 
funcs=payload.get("parallel_functions"), next=payload.get("next")) + + +class Map(State): + def __init__( + self, + name: str, + funcs: List, + array: str, + root: str, + next: Optional[str], + common_params: Optional[str], + ): + self.name = name + self.funcs = funcs + self.array = array + self.root = root + self.next = next + self.common_params = common_params + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Map": + return cls( + name=name, + funcs=payload["states"], + array=payload["array"], + root=payload["root"], + next=payload.get("next"), + common_params=payload.get("common_params"), + ) + + +class Repeat(State): + def __init__(self, name: str, func_name: str, count: int, next: Optional[str]): + self.name = name + self.func_name = func_name + self.count = count + self.next = next + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Repeat": + return cls( + name=name, + func_name=payload["func_name"], + count=payload["count"], + next=payload.get("next"), + ) + + +class Loop(State): + def __init__(self, name: str, func_name: str, array: str, next: Optional[str]): + self.name = name + self.func_name = func_name + self.array = array + self.next = next + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Loop": + return cls( + name=name, + func_name=payload["func_name"], + array=payload["array"], + next=payload.get("next"), + ) + + +_STATE_TYPES: Dict[str, Type[State]] = { + "task": Task, + "switch": Switch, + "map": Map, + "repeat": Repeat, + "loop": Loop, + "parallel": Parallel, +} + + +class Generator(ABC): + def __init__(self, export_func: Callable[[dict], str] = json.dumps): + self._export_func = export_func + + def parse(self, path: str): + with open(path) as f: + definition = json.load(f) + + self.states = {n: State.deserialize(n, s) for n, s in definition["states"].items()} + self.root = self.states[definition["root"]] + + def generate(self) -> str: + states = list(self.states.values()) + payloads = 
[] + for s in states: + obj = self.encode_state(s) + if isinstance(obj, dict): + payloads.append(obj) + elif isinstance(obj, list): + payloads += obj + else: + raise ValueError("Unknown encoded state returned.") + + definition = self.postprocess(payloads) + + return self._export_func(definition) + + def postprocess(self, payloads: List[dict]) -> dict: + return payloads + + def encode_state(self, state: State) -> Union[dict, List[dict]]: + if isinstance(state, Task): + return self.encode_task(state) + elif isinstance(state, Switch): + return self.encode_switch(state) + elif isinstance(state, Map): + return self.encode_map(state) + elif isinstance(state, Repeat): + return self.encode_repeat(state) + elif isinstance(state, Loop): + return self.encode_loop(state) + elif isinstance(state, Parallel): + return self.encode_parallel(state) + else: + raise ValueError(f"Unknown state of type {type(state)}.") + + @abstractmethod + def encode_task(self, state: Task) -> Union[dict, List[dict]]: + pass + + @abstractmethod + def encode_switch(self, state: Switch) -> Union[dict, List[dict]]: + pass + + @abstractmethod + def encode_map(self, state: Map) -> Union[dict, List[dict]]: + pass + + @abstractmethod + def encode_parallel(self, state: Parallel) -> Union[dict, List[dict]]: + pass + + def encode_repeat(self, state: Repeat) -> Union[dict, List[dict]]: + tasks = [] + for i in range(state.count): + name = state.name if i == 0 else f"{state.name}_{i}" + next = state.next if i == state.count - 1 else f"{state.name}_{i+1}" + task = Task(name, state.func_name, next, None) + + res = self.encode_task(task) + tasks += res if isinstance(res, list) else [res] + + return tasks + + @abstractmethod + def encode_loop(self, state: Loop) -> Union[dict, List[dict]]: + pass From c4400c68cdcc72d03fd2cfef67581cfd99f02c27 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 10:45:17 +0200 Subject: [PATCH 183/230] [workflows] Add Workflow base class and get_workflow to FaaS abstractions 
Co-authored-by: Larissa Schmid Co-authored-by: Marcin Copik --- sebs/faas/function.py | 7 +++++ sebs/faas/system.py | 62 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/sebs/faas/function.py b/sebs/faas/function.py index 4e1d3c8d0..5a1ff71dc 100644 --- a/sebs/faas/function.py +++ b/sebs/faas/function.py @@ -915,3 +915,10 @@ def deserialize(cached_config: dict) -> "Function": Function: New instance with the deserialized data """ pass + + +class Workflow(Function): + @staticmethod + @abstractmethod + def deserialize(cached_config: dict) -> "Workflow": + pass diff --git a/sebs/faas/system.py b/sebs/faas/system.py index 291e9ce75..e687e3987 100644 --- a/sebs/faas/system.py +++ b/sebs/faas/system.py @@ -24,7 +24,7 @@ from sebs.faas.container import DockerContainer from sebs.faas.resources import SystemResources from sebs.faas.config import Resources -from sebs.faas.function import Function, Trigger, ExecutionResult +from sebs.faas.function import Function, Trigger, ExecutionResult, Workflow from sebs.utils import LoggingBase from sebs.sebs_types import Language from .config import Config @@ -180,6 +180,66 @@ def function_type() -> "Type[Function]": """ pass + @staticmethod + def workflow_type() -> "Type[Workflow]": + raise NotImplementedError("Workflows not supported on this platform") + + def create_workflow(self, code_package: Benchmark, workflow_name: str) -> Workflow: + raise NotImplementedError("Workflows not supported on this platform") + + def update_workflow(self, workflow: Workflow, code_package: Benchmark): + raise NotImplementedError("Workflows not supported on this platform") + + def get_workflow( + self, code_package: Benchmark, workflow_name: Optional[str] = None + ) -> Workflow: + if not workflow_name: + workflow_name = self.default_function_name(code_package) + + rebuilt, _, system_variant, container_uri = code_package.build( + self.package_code, self.container_client, 
self.finalize_container_build(), + is_workflow=True, + ) + + functions = code_package.functions + if not functions or workflow_name not in functions: + self.logging.info( + f"Creating new workflow! Reason: workflow {workflow_name} not found in cache." + ) + workflow = self.create_workflow(code_package, workflow_name) + self.cache_client.add_function( + deployment_name=self.name(), + language_name=code_package.language_name, + code_package=code_package, + function=workflow, + ) + code_package.query_cache() + return workflow + + cached_workflow = functions[workflow_name] + workflow = self.workflow_type().deserialize(cached_workflow) + self.cached_function(workflow) + self.logging.info(f"Using cached workflow {workflow_name}") + + if workflow.code_package_hash != code_package.hash or rebuilt: + self.logging.info( + f"Cached workflow {workflow_name} with hash " + f"{workflow.code_package_hash} is not up to date with " + f"current build {code_package.hash}, updating!" + ) + self.update_workflow(workflow, code_package) + workflow.code_package_hash = code_package.hash + workflow.updated_code = True + self.cache_client.add_function( + deployment_name=self.name(), + language_name=code_package.language_name, + code_package=code_package, + function=workflow, + ) + code_package.query_cache() + + return workflow + def find_deployments(self) -> List[str]: """ Find existing deployments in the cloud platform. 
From 859e71fbb0b6e8cb395cf301216f14ad59e33047 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 10:45:26 +0200 Subject: [PATCH 184/230] [workflows] Add workflow build support, Redis helpers, and deployment config Co-authored-by: Marcin Copik --- configs/example.json | 4 ++++ configs/systems.json | 5 ++++- sebs/benchmark.py | 28 ++++++++++++++++++++++++++-- sebs/utils.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/configs/example.json b/configs/example.json index d6ff19775..4d8fe7410 100644 --- a/configs/example.json +++ b/configs/example.json @@ -48,6 +48,10 @@ "region": "us-east-1", "lambda-role": "", "resources": { + "redis": { + "host": "ec2-54-86-32-136.compute-1.amazonaws.com", + "password": "xB46z3u9I6WJ" + }, "use-function-url": true, "function-url-auth-type": "NONE" } diff --git a/configs/systems.json b/configs/systems.json index 3506e3d35..147576b38 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -102,11 +102,14 @@ "deployment": { "files": [ "handler.py", + "handler_workflow.py", "storage.py", "nosql.py", "setup.py" ], - "packages": [], + "packages": [ + "redis" + ], "module_packages": {} } }, diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 7d9d84640..7cebf36e6 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -633,6 +633,20 @@ def hash(self, val: str): """ self._hash_value = val + def get_code_files(self, include_config=True): + FILES = { + "python": ["*.py"], + "nodejs": ["*.js"], + } + if include_config: + FILES["python"] += ["requirements.txt*", "*.json"] + FILES["nodejs"] += ["package.json", "*.json"] + + path = os.path.join(self.benchmark_path, self.language_name) + for file_type in FILES.get(self.language_name, []): + for f in glob.glob(os.path.join(path, file_type)): + yield f + def __init__( self, benchmark: str, @@ -984,7 +998,7 @@ def add_benchmark_data(self, output_dir: str) -> None: "init.sh failed (exit {}): {}".format(result.returncode, 
output) ) - def add_deployment_files(self, output_dir: str) -> None: + def add_deployment_files(self, output_dir: str, is_workflow: bool = False) -> None: """Add deployment-specific wrapper files to output directory. Copies platform-specific wrapper files (handlers, adapters) that @@ -995,6 +1009,7 @@ def add_deployment_files(self, output_dir: str) -> None: Args: output_dir: Directory where deployment files should be added + is_workflow: If True, use handler_workflow.py as handler.py """ handlers_dir = get_resource_path( "benchmarks", "wrappers", self._deployment_name, self.language_name @@ -1014,6 +1029,14 @@ def add_deployment_files(self, output_dir: str) -> None: if not os.path.exists(destination): shutil.copy2(file, destination) + if self.language_name == "python": + handler_path = os.path.join(output_dir, "handler.py") + handler_workflow_path = os.path.join(output_dir, "handler_workflow.py") + if is_workflow and os.path.exists(handler_workflow_path): + os.replace(handler_workflow_path, handler_path) + elif os.path.exists(handler_workflow_path): + os.remove(handler_workflow_path) + def add_deployment_package_python(self, output_dir: str) -> None: """Add Python deployment packages to requirements file. @@ -1450,6 +1473,7 @@ def build( container_client: DockerContainer | None, container_build_step: Callable[[str, Language, str, str, str, bool], Tuple[str, float]] | None, + is_workflow: bool = False, ) -> Tuple[bool, str | None, SystemVariant, str | None]: """Build the complete benchmark deployment package. 
def download_measurements(redis, workflow_name: str, after: float, **static_args):
    """Collect, and remove from Redis, the measurements stored for a workflow.

    Every key matching ``<workflow_name>/*`` is read and then deleted,
    whether or not its value ends up in the result.

    Args:
        redis: client object providing ``scan_iter``, ``get`` and ``delete``.
        workflow_name: key prefix under which the measurements were stored.
        after: only measurements whose ``"start"`` value is strictly greater
            than this are returned.
        **static_args: extra fields merged into every returned measurement;
            they override measurement fields of the same name.

    Returns:
        List of measurement dictionaries.
    """
    payloads = []
    for key in redis.scan_iter(match=f"{workflow_name}/*"):
        raw = redis.get(key)
        redis.delete(key)
        if not raw:
            continue

        try:
            measurement = json.loads(raw)
        except json.JSONDecodeError:
            # Not a measurement we wrote; it has already been removed above.
            continue

        # Valid JSON that is not a measurement record (wrong type or no start
        # time) is skipped like undecodable data. Raising here would abort the
        # scan after earlier keys were already deleted, losing their values.
        if not isinstance(measurement, dict) or "start" not in measurement:
            continue

        if measurement["start"] > after:
            payloads.append({**measurement, **static_args})
    return payloads
@staticmethod
def format_resource_name(name: str) -> str:
    """Return ``name`` with every ``-`` and ``.`` replaced by ``_``."""
    return name.translate(str.maketrans("-.", "__"))
def create_workflow(self, code_package: Benchmark, workflow_name: str) -> "Function":
    """Deploy a benchmark as a Step Functions state machine.

    One Lambda function is created per source file of the benchmark, the
    benchmark's ``definition.json`` is translated into a state machine
    definition that references those functions, and the state machine is
    created. If a state machine with that name already exists, its
    definition is updated instead.

    Args:
        code_package: benchmark whose code and ``definition.json`` are deployed.
        workflow_name: requested name; it is passed through
            ``AWS.format_resource_name`` before use.

    Returns:
        The workflow object with a ``WorkflowLibraryTrigger`` attached.

    Raises:
        ValueError: if the benchmark has no ``definition.json``.
    """
    import re
    from sebs.aws.workflow import SFNWorkflow
    from sebs.aws.generator import SFNGenerator
    from sebs.aws.triggers import WorkflowLibraryTrigger

    workflow_name = AWS.format_resource_name(workflow_name)

    definition_path = os.path.join(code_package.benchmark_path, "definition.json")
    if not os.path.exists(definition_path):
        raise ValueError(f"No workflow definition found for {workflow_name}")

    # One function per source file, named after the file without its extension.
    code_files = list(code_package.get_code_files(include_config=False))
    func_names = [os.path.splitext(os.path.basename(p))[0] for p in code_files]
    funcs = [
        self.create_function(
            code_package,
            # "___" separates the workflow name from the function name.
            workflow_name + "___" + fn,
            code_package.system_variant,
            None,
        )
        for fn in func_names
    ]

    # The generator maps each function name to the ARN of its Lambda function.
    gen = SFNGenerator({n: f.arn for (n, f) in zip(func_names, funcs)})
    gen.parse(definition_path)
    definition = gen.generate()

    try:
        ret = self.get_sfn_client().create_state_machine(
            name=workflow_name,
            definition=definition,
            roleArn=self.config.resources.lambda_role(self.session),
        )
        self.logging.info(f"Creating workflow {workflow_name}")
        workflow = SFNWorkflow(
            workflow_name,
            funcs,
            code_package.benchmark,
            ret["stateMachineArn"],
            code_package.hash,
            FunctionConfig.from_benchmark(code_package),
        )
    except self.get_sfn_client().exceptions.StateMachineAlreadyExists as e:
        # The first single-quoted substring of the error text is used as the
        # ARN of the existing state machine.
        # NOTE(review): assumes the service's error message quotes the ARN - confirm.
        match = re.search("'([^']*)'", str(e))
        if not match:
            raise
        # group() includes the surrounding quotes; strip them.
        arn = match.group()[1:-1]
        self.logging.info(f"Workflow {workflow_name} exists on AWS, updating.")
        workflow = SFNWorkflow(
            workflow_name, funcs, code_package.benchmark, arn, code_package.hash,
            FunctionConfig.from_benchmark(code_package),
        )
        self._update_workflow_definition(workflow, code_package)
        workflow.updated_code = True

    trigger = WorkflowLibraryTrigger(workflow.arn, self)
    trigger.logging_handlers = self.logging_handlers
    workflow.add_trigger(trigger)
    return workflow
workflow.functions = funcs + self.logging.info("Published new workflow code") + def _enforce_cold_start(self, function: Function, code_package: Benchmark) -> None: """Enforce cold start for a single function. diff --git a/sebs/aws/config.py b/sebs/aws/config.py index 83230641a..259e0dc67 100644 --- a/sebs/aws/config.py +++ b/sebs/aws/config.py @@ -494,13 +494,19 @@ def lambda_role(self, boto3_session: boto3.session.Session) -> str: "Effect": "Allow", "Principal": {"Service": "lambda.amazonaws.com"}, "Action": "sts:AssumeRole", - } + }, + { + "Effect": "Allow", + "Principal": {"Service": "states.amazonaws.com"}, + "Action": "sts:AssumeRole", + }, ], } role_name = "sebs-lambda-role" attached_policies = [ "arn:aws:iam::aws:policy/AmazonS3FullAccess", "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", + "arn:aws:iam::aws:policy/service-role/AWSLambdaRole", ] try: out = iam_client.get_role(RoleName=role_name) @@ -1036,6 +1042,7 @@ def initialize(res: Resources, dct: dict) -> None: for key, value in dct["function-urls"].items(): ret._function_urls[key] = AWSResources.FunctionURL.deserialize(value) + ret._redis = dct.get("redis") ret._use_function_url = dct.get("use-function-url", True) auth_type_str = dct.get("function-url-auth-type", "NONE") ret.function_url_auth_type = FunctionURLAuthType.from_string(auth_type_str) @@ -1121,6 +1128,8 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Resour AWSResources.initialize(ret, cached_config["resources"]) ret.logging_handlers = handlers ret.logging.info("Using cached resources for AWS") + if "resources" in config and "redis" in config["resources"]: + ret._redis = config["resources"]["redis"] else: # Check for new config if "resources" in config: @@ -1184,6 +1193,20 @@ def resources(self) -> AWSResources: """ return self._resources + @property + def redis_host(self) -> str | None: + redis_cfg = getattr(self._resources, "_redis", None) + if redis_cfg: + return redis_cfg.get("host") + 
def postprocess(self, payloads: List[dict]) -> dict:
    """Assemble encoded states into a Step Functions definition.

    Each payload carries its state name under ``"Name"``. In the definition
    the name becomes the key in ``"States"`` and is left out of the state
    body. The execution starts at ``self.root.name``.

    Args:
        payloads: encoded states, each a dict with a ``"Name"`` entry.

    Returns:
        Dictionary with ``"Comment"``, ``"StartAt"`` and ``"States"``.
    """
    # Build new state bodies instead of deleting "Name" in place: the caller's
    # payloads stay intact and can be post-processed again.
    state_payloads = {
        p["Name"]: {key: value for key, value in p.items() if key != "Name"}
        for p in payloads
    }
    definition = {
        "Comment": "SeBS auto-generated benchmark",
        "StartAt": self.root.name,
        "States": state_payloads,
    }

    return definition
def encode_map(self, state: "Map") -> Union[dict, List[dict]]:
    """Encode a map state as a Step Functions ``Map`` with a one-task iterator.

    The iterator contains a single ``Task`` state under a randomly generated
    name. The task runs the function named by the first entry of
    ``state.funcs``: for a dict, the ``"func_name"`` of its first value; for
    a sequence, its first element.

    Args:
        state: map state providing ``name``, ``funcs``, ``array`` and ``next``.

    Returns:
        The state payload, terminated by ``"Next"`` when ``state.next`` is
        set and by ``"End"`` otherwise.
    """
    iterator_state = "func_" + str(uuid.uuid4())[:8]

    funcs = state.funcs
    if isinstance(funcs, dict):
        func_name = next(iter(funcs.values()))["func_name"]
    else:
        func_name = funcs[0]

    worker = {
        "Type": "Task",
        "Resource": self._func_arns[func_name],
        "End": True,
    }
    payload: Dict[str, Any] = {
        "Name": state.name,
        "Type": "Map",
        "ItemsPath": "$." + state.array,
        "Iterator": {
            "StartAt": iterator_state,
            "States": {iterator_state: worker},
        },
    }

    payload.update({"Next": state.next} if state.next else {"End": True})
    return payload
class WorkflowLibraryTrigger(LibraryTrigger):
    """Library trigger that runs a Step Functions state machine.

    ``self.name`` is passed to Step Functions as the state machine ARN.
    """

    def sync_invoke(self, payload: dict) -> ExecutionResult:
        """Start an execution and block until it leaves the RUNNING state.

        The benchmark payload is wrapped as
        ``{"payload": payload, "request_id": <8-char id>}`` before it is sent.
        The times recorded in the result cover only the ``start_execution``
        call, not the execution itself.

        Args:
            payload: benchmark input handed to the workflow.

        Returns:
            Execution result carrying the generated request id;
            ``stats.failure`` is set when the execution did not succeed.
        """
        self.logging.debug(f"Invoke workflow {self.name}")

        request_id = str(uuid.uuid4())[0:8]
        sfn_input = {"payload": payload, "request_id": request_id}

        client = self._deployment_client.get_sfn_client()
        begin = datetime.datetime.now()
        ret = client.start_execution(stateMachineArn=self.name, input=json.dumps(sfn_input))
        end = datetime.datetime.now()

        aws_result = ExecutionResult.from_times(begin, end)
        aws_result.request_id = request_id
        execution_arn = ret["executionArn"]

        # Poll once per second until the execution is no longer running.
        while True:
            status = client.describe_execution(executionArn=execution_arn)["status"]
            if status != "RUNNING":
                break
            time.sleep(1)

        # Any outcome other than SUCCEEDED (FAILED, TIMED_OUT, ABORTED, ...)
        # means the workflow did not complete and must not count as a success.
        if status != "SUCCEEDED":
            self.logging.error(f"Invocation of {self.name} failed")
            self.logging.error(f"Input: {payload}")
            self.logging.error(f"Status: {status}")
            aws_result.stats.failure = True

        return aws_result

    def async_invoke(self, payload: dict):
        """Not supported for workflows; always raises ``NotImplementedError``."""
        raise NotImplementedError("Async invocation is not implemented for workflows")

    @staticmethod
    def typename() -> str:
        """Return the type name of this trigger."""
        return "AWS.WorkflowLibraryTrigger"

    @staticmethod
    def trigger_type() -> Trigger.TriggerType:
        """Return the trigger category (library)."""
        return Trigger.TriggerType.LIBRARY

    def serialize(self) -> dict:
        """Return the cacheable representation: trigger type and name."""
        return {"type": "Library", "name": self.name}

    @staticmethod
    def deserialize(obj: dict) -> "WorkflowLibraryTrigger":
        """Rebuild a trigger from ``serialize`` output, without a deployment client."""
        return WorkflowLibraryTrigger(obj["name"])
@staticmethod
def get_full_workflow_name(project_name: str, location: str, workflow_name: str) -> str:
    """Build the resource path ``projects/<p>/locations/<l>/workflows/<w>``."""
    return "projects/{}/locations/{}/workflows/{}".format(
        project_name, location, workflow_name
    )
diff --git a/sebs/gcp/generator.py b/sebs/gcp/generator.py new file mode 100644 index 000000000..223c0c4b9 --- /dev/null +++ b/sebs/gcp/generator.py @@ -0,0 +1,101 @@ +import uuid +from typing import Dict, Union, List + +from sebs.faas.fsm import Generator, State, Task, Switch, Map, Repeat, Loop + + +class GCPGenerator(Generator): + def __init__(self, workflow_name: str, func_triggers: Dict[str, str]): + super().__init__() + self._workflow_name = workflow_name + self._func_triggers = func_triggers + self._map_funcs: Dict[str, str] = dict() + + def postprocess(self, payloads: List[dict]) -> dict: + payloads.append({"final": {"return": ["${res}"]}}) + + definition = {"main": {"params": ["res"], "steps": payloads}} + + return definition + + def encode_task(self, state: Task) -> Union[dict, List[dict]]: + url = self._func_triggers[state.func_name] + + return [ + { + state.name: { + "call": "http.post", + "args": {"url": url, "body": "${res}"}, + "result": "res", + } + }, + {"assign_res_" + state.name: {"assign": [{"res": "${res.body}"}]}}, + ] + + def encode_switch(self, state: Switch) -> Union[dict, List[dict]]: + return { + state.name: { + "switch": [self._encode_case(c) for c in state.cases], + "next": state.default, + } + } + + def _encode_case(self, case: Switch.Case) -> dict: + cond = "res." + case.var + " " + case.op + " " + str(case.val) + return {"condition": "${" + cond + "}", "next": case.next} + + def encode_map(self, state: Map) -> Union[dict, List[dict]]: + id = self._workflow_name + "_" + "map" + str(uuid.uuid4())[0:8] + self._map_funcs[id] = self._func_triggers[state.func_name] + + return { + state.name: { + "call": "experimental.executions.map", + "args": {"workflow_id": id, "arguments": "${res." 
def encode_loop(self, state: "Loop") -> Union[dict, List[dict]]:
    """Encode a loop state as a GCP Workflows ``for`` step.

    Each element of the array is POSTed, one after another, to the trigger
    URL of ``state.func_name``. The step does not store the HTTP responses.

    Args:
        state: loop state providing ``name``, ``func_name`` and ``array``.

    Returns:
        Single-entry dictionary mapping the state name to its ``for`` step.
    """
    url = self._func_triggers[state.func_name]

    return {
        state.name: {
            "for": {
                "value": "val",
                "index": "idx",
                # The array is a field of the workflow state ``res``, as in
                # encode_map; without the prefix the expression would refer
                # to a variable that is never declared.
                "in": "${res." + state.array + "}",
                "steps": [
                    {
                        "body": {
                            "call": "http.post",
                            "args": {"url": url, "body": "${val}"},
                        }
                    }
                ],
            }
        }
    }
execution_finished = execution.state != Execution.State.ACTIVE + + if not execution_finished: + time.sleep(10) + elif execution.state == Execution.State.FAILED: + self.logging.error(f"Invocation of {self.name} failed") + self.logging.error(f"Input: {payload}") + gcp_result.stats.failure = True + return gcp_result + + return gcp_result + + def async_invoke(self, payload: dict): + raise NotImplementedError("Async invocation is not implemented for workflows") + + @staticmethod + def typename() -> str: + return "GCP.WorkflowLibraryTrigger" + + @staticmethod + def trigger_type() -> Trigger.TriggerType: + return Trigger.TriggerType.LIBRARY + + def serialize(self) -> dict: + return {"type": "Library", "name": self.name} + + @staticmethod + def deserialize(obj: dict) -> "WorkflowLibraryTrigger": + return WorkflowLibraryTrigger(obj["name"]) + + class HTTPTrigger(Trigger): """HTTP endpoint trigger for Cloud Functions invocation. diff --git a/sebs/gcp/workflow.py b/sebs/gcp/workflow.py new file mode 100644 index 000000000..374ac9cd6 --- /dev/null +++ b/sebs/gcp/workflow.py @@ -0,0 +1,60 @@ +from typing import List, cast, Optional + +from sebs.faas.function import FunctionConfig, Workflow +from sebs.gcp.function import GCPFunction +from sebs.gcp.storage import GCPStorage + + +class GCPWorkflow(Workflow): + def __init__( + self, + name: str, + functions: List[GCPFunction], + benchmark: str, + code_package_hash: str, + cfg: FunctionConfig, + bucket: Optional[str] = None, + ): + super().__init__(benchmark, name, code_package_hash, cfg) + self.functions = functions + self.bucket = bucket + + @staticmethod + def typename() -> str: + return "GCP.GCPWorkflow" + + def serialize(self) -> dict: + return { + **super().serialize(), + "functions": [f.serialize() for f in self.functions], + "bucket": self.bucket, + } + + @staticmethod + def deserialize(cached_config: dict) -> "GCPWorkflow": + from sebs.faas.function import Trigger + from sebs.gcp.triggers import WorkflowLibraryTrigger, 
HTTPTrigger + + cfg = FunctionConfig.deserialize(cached_config["config"]) + funcs = [GCPFunction.deserialize(f) for f in cached_config["functions"]] + ret = GCPWorkflow( + cached_config["name"], + funcs, + cached_config["benchmark"], + cached_config["hash"], + cfg, + cached_config["bucket"], + ) + for trigger in cached_config["triggers"]: + trigger_type = cast( + Trigger, + {"Library": WorkflowLibraryTrigger, "HTTP": HTTPTrigger}.get(trigger["type"]), + ) + assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) + ret.add_trigger(trigger_type.deserialize(trigger)) + return ret + + def code_bucket(self, benchmark: str, storage_client: GCPStorage): + if not self.bucket: + self.bucket, idx = storage_client.add_input_bucket(benchmark) + return self.bucket From cc117ac45dbc5302ee8eb6930a06be028e13f273 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 10:45:47 +0200 Subject: [PATCH 187/230] [workflows] Add AWS and GCP workflow handler wrappers Co-authored-by: Larissa Schmid Co-authored-by: Marcin Copik --- .../wrappers/aws/python/handler_workflow.py | 93 +++++++++++++++++++ .../wrappers/gcp/python/handler_workflow.py | 86 +++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 benchmarks/wrappers/aws/python/handler_workflow.py create mode 100644 benchmarks/wrappers/gcp/python/handler_workflow.py diff --git a/benchmarks/wrappers/aws/python/handler_workflow.py b/benchmarks/wrappers/aws/python/handler_workflow.py new file mode 100644 index 000000000..18a892989 --- /dev/null +++ b/benchmarks/wrappers/aws/python/handler_workflow.py @@ -0,0 +1,93 @@ +import datetime +import io +import json +import os +import sys +import uuid +import importlib + +# Add current directory to allow location of packages +sys.path.append(os.path.join(os.path.dirname(__file__), ".python_packages/lib/site-packages")) + +from redis import Redis + + +def probe_cold_start(): + is_cold = False + fname = os.path.join("/tmp", "cold_run") + if not 
def handler(event, context):
    """Run one workflow function and record its measurement in Redis.

    The Lambda function name has the form ``<workflow>___<function>``; the
    part after the separator selects the module ``function.<function>`` whose
    ``handler`` is called with the benchmark payload.

    Accepted event shapes:
      * dict with ``"payload"``: the payload is unwrapped and the request id
        is taken from ``"request_id"``;
      * any other dict: the dict itself is the payload and the request id is
        popped from ``"__request_id"``;
      * anything else: the event is the payload.
    In every case the Lambda request id is the fallback request id.

    Returns:
        The function's result; a dict result additionally carries the request
        id under ``"__request_id"`` for the next state.
    """
    start = datetime.datetime.now().timestamp()
    os.environ["STORAGE_UPLOAD_BYTES"] = "0"
    os.environ["STORAGE_DOWNLOAD_BYTES"] = "0"

    req_id = context.aws_request_id

    if isinstance(event, dict) and "payload" in event:
        func_payload = event["payload"]
        request_id = event.get("request_id", req_id)
    elif isinstance(event, dict):
        request_id = event.pop("__request_id", req_id)
        func_payload = event
    else:
        func_payload = event
        request_id = req_id

    workflow_name, func_name = context.function_name.split("___")
    function = importlib.import_module(f"function.{func_name}")
    res = function.handler(func_payload)

    end = datetime.datetime.now().timestamp()

    is_cold, container_id = probe_cold_start()
    measurement = {
        "func": func_name,
        "start": start,
        "end": end,
        "is_cold": is_cold,
        "container_id": container_id,
        "provider.request_id": context.aws_request_id,
    }

    func_res = os.getenv("SEBS_FUNCTION_RESULT")
    if func_res:
        measurement["result"] = json.loads(func_res)

    bytes_upload = os.getenv("STORAGE_UPLOAD_BYTES", 0)
    if bytes_upload:
        measurement["blob.upload"] = int(bytes_upload)

    bytes_download = os.getenv("STORAGE_DOWNLOAD_BYTES", 0)
    if bytes_download:
        measurement["blob.download"] = int(bytes_download)

    measurement_json = json.dumps(measurement)

    # Storing the measurement is best effort and must never fail the
    # function, but the failure is reported so lost measurements can be
    # traced. The double-brace host and password below are placeholders that
    # are replaced with real values when the code package is built.
    try:
        redis = Redis(
            host={{REDIS_HOST}},
            port=6379,
            decode_responses=True,
            socket_connect_timeout=10,
            password={{REDIS_PASSWORD}},
        )

        key = os.path.join(workflow_name, func_name, request_id, str(uuid.uuid4())[0:8])
        redis.set(key, measurement_json)
    except Exception as e:
        print(f"Failed to store measurement in Redis: {e!r}", file=sys.stderr)

    if isinstance(res, dict):
        res["__request_id"] = request_id
    return res
def probe_cold_start():
    """Report whether this is the first invocation in the current container.

    The marker file ``/tmp/cold_run`` is created on the first call and holds
    a random 8-character container id.

    Returns:
        Tuple ``(is_cold, container_id)``: ``is_cold`` is True only on the
        call that created the marker; ``container_id`` is the marker content.
    """
    marker = os.path.join("/tmp", "cold_run")

    if os.path.exists(marker):
        with open(marker, "r") as f:
            return False, f.read()

    container_id = str(uuid.uuid4())[0:8]
    with open(marker, "a") as f:
        f.write(container_id)
    return True, container_id
int(bytes_upload) + + bytes_download = os.getenv("STORAGE_DOWNLOAD_BYTES", 0) + if bytes_download: + payload["blob.download"] = int(bytes_download) + + payload = json.dumps(payload) + + redis = Redis(host={{REDIS_HOST}}, + port=6379, + decode_responses=True, + socket_connect_timeout=10, + password={{REDIS_PASSWORD}}) + + req_id = event["request_id"] + key = os.path.join(workflow_name, func_name, req_id, str(uuid.uuid4())[0:8]) + redis.set(key, payload) + + return res From 82991ea052c973e4eaf6541f8ade55020a152ad6 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 10:45:55 +0200 Subject: [PATCH 188/230] [workflows] Add Azure Durable Functions workflow handler wrappers Co-authored-by: Larissa Schmid --- benchmarks/wrappers/azure/python/fsm.py | 1 + .../wrappers/azure/python/handler_workflow.py | 83 +++++ .../wrappers/azure/python/main_workflow.py | 74 +++++ .../wrappers/azure/python/run_subworkflow.py | 224 ++++++++++++++ .../wrappers/azure/python/run_workflow.py | 284 ++++++++++++++++++ 5 files changed, 666 insertions(+) create mode 100644 benchmarks/wrappers/azure/python/fsm.py create mode 100644 benchmarks/wrappers/azure/python/handler_workflow.py create mode 100644 benchmarks/wrappers/azure/python/main_workflow.py create mode 100644 benchmarks/wrappers/azure/python/run_subworkflow.py create mode 100644 benchmarks/wrappers/azure/python/run_workflow.py diff --git a/benchmarks/wrappers/azure/python/fsm.py b/benchmarks/wrappers/azure/python/fsm.py new file mode 100644 index 000000000..30c579a5c --- /dev/null +++ b/benchmarks/wrappers/azure/python/fsm.py @@ -0,0 +1 @@ +../../../../sebs/faas/fsm.py \ No newline at end of file diff --git a/benchmarks/wrappers/azure/python/handler_workflow.py b/benchmarks/wrappers/azure/python/handler_workflow.py new file mode 100644 index 000000000..143b5287f --- /dev/null +++ b/benchmarks/wrappers/azure/python/handler_workflow.py @@ -0,0 +1,83 @@ +import datetime +import json +import os +import uuid +import importlib + +import 
import azure.functions as func
from redis import Redis


def probe_cold_start():
    """Report whether this call is the first one made in the current container.

    A marker file under ``/tmp`` records a container identifier. When the
    marker is missing, a fresh 8-character identifier is generated, stored in
    the marker, and the call is reported as cold; otherwise the stored
    identifier is read back.

    Returns:
        Tuple ``(is_cold, container_id)``.
    """
    marker = os.path.join("/tmp", "cold_run")
    if os.path.exists(marker):
        with open(marker, "r") as f:
            return False, f.read()

    container_id = str(uuid.uuid4())[0:8]
    with open(marker, "a") as f:
        f.write(container_id)
    return True, container_id


def main(event, context: func.Context):
    """Run one workflow function and publish its measurements to Redis.

    The benchmark module ``<func_name>/<func_name>.py`` (``func_name`` is the
    name of the directory containing this file) is loaded and its ``handler``
    is called with ``event["payload"]``. Afterwards a JSON measurement record
    (timestamps, cold-start flag, container id, invocation id and, when set,
    the function result and transferred bytes) is stored in Redis under
    ``<workflow>/<func_name>/<request_id>/<random suffix>``.

    Args:
        event: Dictionary with at least ``"payload"`` (must itself be a
            dictionary, see the FIXME below) and ``"request_id"``.
        context: Azure Functions invocation context; only ``invocation_id``
            is read.

    Returns:
        Whatever the benchmark's ``handler`` returned.
    """
    # ``import importlib`` alone does not guarantee that the ``importlib.util``
    # submodule is loaded, so import it explicitly before using it below.
    import importlib.util

    start = datetime.datetime.now().timestamp()
    # Reset the transfer counters before the benchmark runs.
    # NOTE(review): presumably the benchmark's storage wrapper updates these
    # variables during the call - confirm.
    os.environ["STORAGE_UPLOAD_BYTES"] = "0"
    os.environ["STORAGE_DOWNLOAD_BYTES"] = "0"

    workflow_name = os.getenv("APPSETTING_WEBSITE_SITE_NAME")
    func_name = os.path.basename(os.path.dirname(__file__))

    # FIXME: sort out workflow and function request id
    # this only works on benchmarks where payload is dict
    event["payload"]["request-id"] = context.invocation_id

    spec = importlib.util.spec_from_file_location(
        f"{func_name}.{func_name}", f"{func_name}/{func_name}.py"
    )
    function = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(function)

    res = function.handler(event["payload"])

    end = datetime.datetime.now().timestamp()

    is_cold, container_id = probe_cold_start()
    payload = {
        "func": func_name,
        "start": start,
        "end": end,
        "is_cold": is_cold,
        "container_id": container_id,
        "provider.request_id": context.invocation_id
    }

    func_res = os.getenv("SEBS_FUNCTION_RESULT")
    if func_res:
        payload["result"] = json.loads(func_res)

    # Copy the byte counters into the record when they are set and non-empty.
    for env_name, field in (
        ("STORAGE_UPLOAD_BYTES", "blob.upload"),
        ("STORAGE_DOWNLOAD_BYTES", "blob.download"),
    ):
        transferred = os.getenv(env_name, 0)
        if transferred:
            payload[field] = int(transferred)

    payload = json.dumps(payload)

    # NOTE(review): the double-brace names look like template placeholders
    # that are substituted before deployment - confirm.
    redis = Redis(host={{REDIS_HOST}},
      port=6379,
      decode_responses=True,
      socket_connect_timeout=10,
      password={{REDIS_PASSWORD}})

    req_id = event["request_id"]
    key = os.path.join(workflow_name, func_name, req_id, str(uuid.uuid4())[0:8])
    redis.set(key, payload)

    return res
import json
import sys
import os
import uuid
import operator
import logging
import datetime

import azure.durable_functions as df
from redis import Redis

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(dir_path, os.path.pardir))

from .fsm import *


# Comparison operators a switch case may name in its ``op`` field.
_COMPARATORS = {
    "<": operator.lt,
    "<=": operator.le,
    "==": operator.eq,
    ">=": operator.ge,
    ">": operator.gt
}


def get_var(obj, path: str):
    """Return the value reached by following the dot-separated ``path`` in ``obj``."""
    for name in path.split("."):
        obj = obj[name]
    return obj


def set_var(obj, val, path: str):
    """Store ``val`` at the dot-separated ``path`` inside ``obj`` (in place)."""
    *parents, last = path.split(".")
    for name in parents:
        obj = obj[name]
    obj[last] = val


def _element_payloads(res, array, common_params):
    """Build one payload per element of a map phase.

    Without ``common_params`` each element is its own payload. Otherwise each
    payload is a dictionary holding the element under ``"array_element"`` plus
    every comma-separated path of ``common_params`` looked up in ``res``.
    """
    if not common_params:
        return list(array)

    names = common_params.split(",")
    payloads = []
    for elem in array:
        payload = {"array_element": elem}
        for name in names:
            payload[name] = get_var(res, name)
        payloads.append(payload)
    return payloads


def _branch_task(context, first_state, subworkflow, payload, request_id):
    """Schedule one unit of work of a parallel branch.

    A branch whose first state has a successor runs as a ``run_subworkflow``
    sub-orchestration; a single-state branch runs as a plain activity.
    """
    task_input = {"payload": payload, "request_id": request_id}
    if first_state.next:
        # The sub-orchestrator reads its state machine from its input, so the
        # raw (serialized) definition is passed along with the payload.
        task_input["root"] = subworkflow["root"]
        task_input["states"] = subworkflow["states"]
        return context.call_sub_orchestrator("run_subworkflow", task_input)
    return context.call_activity(first_state.func_name, task_input)


def handler(context: df.DurableOrchestrationContext):
    """Orchestrate a sub-workflow whose definition arrives in the input.

    The orchestration input is a dictionary with ``"payload"`` (the running
    value), ``"request_id"``, ``"states"`` (serialized states by name) and
    ``"root"`` (name of the first state). States are executed one after the
    other until a state has no successor.

    Returns:
        The running value after the last state.

    Raises:
        ValueError: for a state type this orchestrator does not handle.
    """
    orchestration_input = context.get_input()
    res = orchestration_input["payload"]
    request_id = orchestration_input["request_id"]
    states = {n: State.deserialize(n, s)
              for n, s in orchestration_input["states"].items()}
    current = states[orchestration_input["root"]]

    while current:
        logging.info(current.name)

        if isinstance(current, Task):
            # NOTE(review): unlike run_workflow.py, the task's failure
            # transition is not handled here - confirm this is intended.
            task_input = {"payload": res, "request_id": request_id}
            res = yield context.call_activity(current.func_name, task_input)
            current = states.get(current.next, None)
        elif isinstance(current, Switch):
            # The first matching case wins; fall back to the default state.
            target = None
            for case in current.cases:
                if _COMPARATORS[case.op](get_var(res, case.var), case.val):
                    target = states[case.next]
                    break

            if not target and current.default:
                target = states[current.default]
            current = target
        elif isinstance(current, Map):
            array = get_var(res, current.array)
            tasks = [
                context.call_activity(
                    current.func_name, {"payload": payload, "request_id": request_id}
                )
                for payload in _element_payloads(res, array, current.common_params)
            ]

            map_res = yield context.task_all(tasks)

            # The results replace the input array in the running value.
            set_var(res, map_res, current.array)
            current = states.get(current.next, None)
        elif isinstance(current, Repeat):
            for _ in range(current.count):
                task_input = {"payload": res, "request_id": request_id}
                res = yield context.call_activity(current.func_name, task_input)

            current = states.get(current.next, None)
        elif isinstance(current, Loop):
            # Sequential traversal; the activity results are discarded.
            array = get_var(res, current.array)
            for elem in array:
                task_input = {"payload": elem, "request_id": request_id}
                yield context.call_activity(current.func_name, task_input)

            current = states.get(current.next, None)
        elif isinstance(current, Parallel):
            parallel_tasks = []
            first_states = []
            # Function name of a branch -> indices of its tasks in parallel_tasks.
            state_to_result = {}
            for subworkflow in current.funcs:
                parallel_states = {n: State.deserialize(n, s)
                                   for n, s in subworkflow["states"].items()}

                first_state = parallel_states[subworkflow["root"]]
                first_states.append(first_state)
                state_to_result[first_state.func_name] = []

                if isinstance(first_state, Task):
                    payloads = [res]
                elif isinstance(first_state, Map):
                    payloads = _element_payloads(
                        res, get_var(res, first_state.array), first_state.common_params
                    )
                else:
                    raise ValueError(f"Undefined state: {first_state}")

                for payload in payloads:
                    parallel_tasks.append(
                        _branch_task(context, first_state, subworkflow, payload, request_id)
                    )
                    state_to_result[first_state.func_name].append(len(parallel_tasks)-1)

            map_res = yield context.task_all(parallel_tasks)
            res = {}

            for state in first_states:
                outputs = [map_res[index] for index in state_to_result[state.func_name]]
                # A map branch always yields a list (also for zero or one
                # element); a task branch yields its single result.
                res[state.func_name] = outputs if isinstance(state, Map) else outputs[0]

            current = states.get(current.next, None)

        else:
            raise ValueError(f"Undefined state: {current}")

    return res


main = df.Orchestrator.create(handler)
import json
import sys
import os
import uuid
import operator
import logging
import datetime

import azure.durable_functions as df
from redis import Redis

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(dir_path, os.path.pardir))

from .fsm import *


# Comparison operators a switch case may name in its ``op`` field.
_COMPARATORS = {
    "<": operator.lt,
    "<=": operator.le,
    "==": operator.eq,
    ">=": operator.ge,
    ">": operator.gt
}


def get_var(obj, path: str):
    """Return the value reached by following the dot-separated ``path`` in ``obj``."""
    for name in path.split("."):
        obj = obj[name]
    return obj


def set_var(obj, val, path: str):
    """Store ``val`` at the dot-separated ``path`` inside ``obj`` (in place)."""
    *parents, last = path.split(".")
    for name in parents:
        obj = obj[name]
    obj[last] = val


def _element_payloads(res, array, common_params):
    """Build one payload per element of a map phase.

    Without ``common_params`` each element is its own payload. Otherwise each
    payload is a dictionary holding the element under ``"array_element"`` plus
    every comma-separated path of ``common_params`` looked up in ``res``.
    """
    if not common_params:
        return list(array)

    names = common_params.split(",")
    payloads = []
    for elem in array:
        payload = {"array_element": elem}
        for name in names:
            payload[name] = get_var(res, name)
        payloads.append(payload)
    return payloads


def _branch_task(context, first_state, subworkflow, payload, request_id):
    """Schedule one unit of work of a parallel branch.

    A branch whose first state has a successor runs as a ``run_subworkflow``
    sub-orchestration; a single-state branch runs as a plain activity.
    """
    task_input = {"payload": payload, "request_id": request_id}
    if first_state.next:
        task_input["root"] = subworkflow["root"]
        task_input["states"] = subworkflow["states"]
        return context.call_sub_orchestrator("run_subworkflow", task_input)
    return context.call_activity(first_state.func_name, task_input)


def handler(context: df.DurableOrchestrationContext):
    """Orchestrate the workflow described by ``definition.json``.

    The orchestration input is a dictionary with ``"payload"`` (the initial
    running value) and ``"request_id"``. States are executed one after the
    other, starting at the definition's ``"root"``, until a state has no
    successor. Finally a measurement record is written to Redis under
    ``<workflow>/run_workflow/<request_id>/<random suffix>``; its ``end`` is
    ``start`` plus the time spent in this function between awaited tasks,
    i.e. the time waiting for activities is excluded.

    Returns:
        The running value after the last state.

    Raises:
        ValueError: for a state type this orchestrator does not handle.
    """
    def now():
        return datetime.datetime.now().timestamp()

    # NOTE(review): orchestrator code is replayed by Durable Functions and the
    # wall clock differs between replays - confirm which replay these
    # timestamps are meant to describe.
    start = now()
    ts = start
    duration = 0

    with open("definition.json") as f:
        definition = json.load(f)

    states = {n: State.deserialize(n, s)
              for n, s in definition["states"].items()}
    current = states[definition["root"]]
    orchestration_input = context.get_input()

    logging.info("START")
    res = orchestration_input["payload"]
    request_id = orchestration_input["request_id"]

    while current:
        logging.info(current.name)

        if isinstance(current, Task):
            task_input = {"payload": res, "request_id": request_id}

            duration += (now() - ts)

            try:
                res = yield context.call_activity(current.func_name, task_input)
            except Exception:
                # Only ``Exception`` is intercepted: a bare ``except`` would
                # also swallow ``GeneratorExit`` raised at the ``yield``.
                if current.failure is None:
                    raise
                current = states.get(current.failure, None)
            else:
                current = states.get(current.next, None)

            ts = now()

        elif isinstance(current, Switch):
            # The first matching case wins; fall back to the default state.
            target = None
            for case in current.cases:
                if _COMPARATORS[case.op](get_var(res, case.var), case.val):
                    target = states[case.next]
                    break

            if not target and current.default:
                target = states[current.default]
            current = target
        elif isinstance(current, Map):
            map_states = {n: State.deserialize(n, s) for n, s in current.funcs.items()}
            first_state = map_states[current.root]

            array = get_var(res, current.array)
            payloads = _element_payloads(res, array, current.common_params)
            if first_state.next:
                # Multi-state map body: each element proceeds through its own
                # sub-orchestration so it can continue with the next step as
                # soon as it finished. ``new_uuid`` gives replay-safe ids.
                tasks = [
                    context.call_sub_orchestrator(
                        "run_subworkflow",
                        {
                            "payload": payload,
                            "request_id": request_id,
                            "root": current.root,
                            "states": current.funcs,
                        },
                        context.new_uuid(),
                    )
                    for payload in payloads
                ]
            else:
                tasks = [
                    context.call_activity(
                        first_state.func_name,
                        {"payload": payload, "request_id": request_id},
                    )
                    for payload in payloads
                ]

            duration += (now() - ts)
            map_res = yield context.task_all(tasks)
            ts = now()

            # The results replace the input array in the running value.
            set_var(res, map_res, current.array)
            current = states.get(current.next, None)
        elif isinstance(current, Repeat):
            for _ in range(current.count):
                task_input = {"payload": res, "request_id": request_id}

                duration += (now() - ts)
                res = yield context.call_activity(current.func_name, task_input)
                ts = now()

            current = states.get(current.next, None)
        elif isinstance(current, Loop):
            # Sequential traversal; the activity results are discarded.
            array = get_var(res, current.array)
            for elem in array:
                task_input = {"payload": elem, "request_id": request_id}

                duration += (now() - ts)
                yield context.call_activity(current.func_name, task_input)
                ts = now()

            current = states.get(current.next, None)

        elif isinstance(current, Parallel):
            parallel_tasks = []
            first_states = []
            # Function name of a branch -> indices of its tasks in parallel_tasks.
            state_to_result = {}
            for subworkflow in current.funcs:
                parallel_states = {n: State.deserialize(n, s)
                                   for n, s in subworkflow["states"].items()}

                first_state = parallel_states[subworkflow["root"]]
                first_states.append(first_state)
                state_to_result[first_state.func_name] = []

                if isinstance(first_state, Task):
                    payloads = [res]
                elif isinstance(first_state, Map):
                    payloads = _element_payloads(
                        res, get_var(res, first_state.array), first_state.common_params
                    )
                else:
                    raise ValueError(f"Undefined state: {first_state}")

                for payload in payloads:
                    parallel_tasks.append(
                        _branch_task(context, first_state, subworkflow, payload, request_id)
                    )
                    state_to_result[first_state.func_name].append(len(parallel_tasks)-1)

            duration += (now() - ts)
            map_res = yield context.task_all(parallel_tasks)
            ts = now()
            res = {}

            for state in first_states:
                outputs = [map_res[index] for index in state_to_result[state.func_name]]
                # A map branch always yields a list (also for zero or one
                # element); a task branch yields its single result.
                res[state.func_name] = outputs if isinstance(state, Map) else outputs[0]

            current = states.get(current.next, None)

        else:
            raise ValueError(f"Undefined state: {current}")

    workflow_name = os.getenv("APPSETTING_WEBSITE_SITE_NAME")
    func_name = "run_workflow"

    payload = {
        "func": func_name,
        "start": start,
        "end": start+duration
    }

    payload = json.dumps(payload)

    # NOTE(review): the double-brace names look like template placeholders
    # that are substituted before deployment - confirm.
    redis = Redis(host={{REDIS_HOST}},
      port=6379,
      decode_responses=True,
      socket_connect_timeout=10,
      password={{REDIS_PASSWORD}})

    key = os.path.join(workflow_name, func_name, request_id, str(uuid.uuid4())[0:8])
    redis.set(key, payload)

    return res


main = df.Orchestrator.create(handler)
Redis can be hosted as follows: +```bash +docker run --network=host --name redis -d redis redis-server --save 60 1 --loglevel warning --requirepass {yourpassword} +``` + +### Usage + +To execute a workflow, the host address and password of the redis instance must be given as part of the config file for the respective platform: + +```json +"resources": { + "redis": { + "host": "1.1.1.1", + "password": "yourpassword" + } +} +``` + +Our workflow benchmarks are provided in the benchmarks folder (benchmarks/600.workflows). To execute a given workflow, use the following command, with "http" triggers for Azure and "library" for AWS and GCP: + +``` +./sebs.py benchmark workflow {workflow-name} --config {path/to/config.json} --deployment {platform-name} --verbose {input-size} --trigger {library|http} --repetitions 1 +``` + +### Definition + +Workflows have been adopted by all major cloud providers, but their implementations are significantly different in capabilities, differing not only in APIs and syntax provided, but also in programming models. We define a workflow model based on Petri Nets, and define workflows using a JSON syntax. The general structure of a workflow definition looks like this: + +```json +{ + "root": "first_state", + "states": { + } +} +``` + +`root` defines the initial state to start the workflow from, while `states` holds a dictionary of `(name, state)` tuples. The following state types are supported. + +#### Task + +A task state is the most basic state: it executes a serverless function. + +```json +{ + "type": "task", + "func_name": "compute", + "next": "postprocess_compute" +}, +``` + +`func_name` is the name of the file in the benchmark directory, `next` sets the state with which to follow. 
+ + +#### Map + +A map state takes a list as input and processes each element in parallel using the given functions: + +```json +{ +"type": "map", +"array": "customers", +"root": "shorten", +"next": "list_emails", +"states": { + "shorten": { + "type": "task", + "func_name": "short" + } +} +} +``` + +`array` defines the list to be processed, while `root` defines which of the functions given in `states` should be called first. `func_name` is the name of the file in the benchmark directory. In contrast to a `task`'s function, this one receives only an element of the given array, not the entire running variable. Other fields required from the running variable can be given using the `common_params` entry, a comma-separated list of variable paths; in that case the function receives a dictionary that holds the element under `array_element` together with the listed fields. + +#### Loop + +The loop phase is similar to map but traverses the given input array sequentially. Thus, loop encodes tasks that cannot be parallelized due to existing dependencies. + +#### Repeat + +A repeat phase executes a function a given number of times. This syntactic sugar eases modeling a chain of tasks. + +```json +{ +"type": "repeat", +"func_name": "process", +"count": 10 +} +``` + +#### Switch + +A switch state makes it possible to encode basic control flow. + +```json +{ + "type": "switch", + "cases": [ + { + "var": "people.number", + "op": "<", + "val": 10, + "next": "few_people" + }, + { + "var": "people.number", + "op": ">=", + "val": 10, + "next": "many_people" + } + ], + "default": "few_people" +} +``` + +This state translates to the following Python code: +```python +if people.number < 10: + few_people() +elif people.number >= 10: + many_people() +else: + few_people() +``` + +#### Parallel + +This higher-level phase corresponds to parallel routing: it executes several sub-workflows concurrently. Each sub-workflow can consist of any of the phases presented above. All sub-workflows receive the complete output of the previous phase as input.
@benchmark.command()
@click.argument("benchmark", type=str)
@click.argument("benchmark-input-size", type=click.Choice(["test", "small", "large"]))
@click.option("--repetitions", default=5, type=int, help="Number of experimental repetitions.")
@click.option(
    "--trigger",
    type=click.Choice(["library", "http"]),
    default="http",
    help="Workflow trigger to be used.",
)
@click.option(
    "--workflow-name",
    default=None,
    type=str,
    help="Override workflow name for random generation.",
)
@common_params
def workflow(benchmark, benchmark_input_size, repetitions, trigger, workflow_name, **kwargs):
    """Invoke a workflow benchmark and measure performance."""
    # The docstring above is the command's help text; keep it unchanged.
    import pandas as pd
    from sebs.utils import connect_to_redis_cache, download_measurements

    (config, output_dir, logging_filename, sebs_client, deployment_client) = parse_common_params(
        **kwargs
    )
    log = sebs_client.logging

    experiment_config = sebs_client.get_experiment_config(config["experiments"])
    benchmark_obj = sebs_client.get_benchmark(
        benchmark,
        deployment_client,
        experiment_config,
        logging_filename=logging_filename,
    )

    # An explicit --workflow-name wins over the deployment's default name.
    wf = deployment_client.get_workflow(
        benchmark_obj,
        workflow_name or deployment_client.default_function_name(benchmark_obj),
    )

    input_config = benchmark_obj.prepare_input(
        deployment_client.system_resources,
        size=benchmark_input_size,
        replace_existing=experiment_config.update_storage,
    )

    # Redis is optional: without a configured host, or when the connection
    # fails, the run continues but no per-function measurements are fetched.
    redis = None
    redis_host = getattr(deployment_client.config, "redis_host", None)
    if redis_host:
        try:
            redis = connect_to_redis_cache(redis_host)
        except Exception as e:
            log.warning(f"Could not connect to Redis ({e}), skipping measurements")

    result = sebs.experiments.ExperimentResult(experiment_config, deployment_client.config)
    result.begin()

    # Reuse an existing trigger of the requested type, otherwise create one.
    trigger_type = Trigger.TriggerType.get(trigger)
    existing = wf.triggers(trigger_type)
    if len(existing) == 0:
        active_trigger = deployment_client.create_trigger(wf, trigger_type)
    else:
        active_trigger = existing[0]

    measurements = []
    for i in range(repetitions):
        log.info(f"Beginning repetition {i + 1}/{repetitions}")
        ret = active_trigger.sync_invoke(input_config)
        if ret.stats.failure:
            log.info(f"Failure on repetition {i + 1}/{repetitions}")

        if redis:
            measurements += download_measurements(redis, wf.name, result.begin_time, rep=i)
        result.add_invocation(wf, ret)
    result.end()

    # Per-function measurements go to <output>/results/<workflow>/<platform>.csv.
    if measurements:
        csv_path = os.path.join(output_dir, "results", wf.name, deployment_client.name() + ".csv")
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        pd.DataFrame(measurements).to_csv(csv_path, index=False)

    result_file = os.path.join(output_dir, "experiments.json")
    with open(result_file, "w") as out_f:
        out_f.write(sebs.utils.serialize(result))
    log.info("Save results to {}".format(os.path.abspath(result_file)))
benchmarks/600.workflows/610.gen/config.json | 7 + .../600.workflows/610.gen/definition.json | 54 +++ benchmarks/600.workflows/610.gen/input.py | 5 + .../610.gen/python/few_people.py | 5 + .../610.gen/python/get_astros.py | 8 + .../610.gen/python/many_people.py | 5 + .../610.gen/python/map_astros.py | 7 + .../610.gen/python/process_astros.py | 5 + .../610.gen/python/requirements.txt | 1 + .../600.workflows/6100.1000-genome/LICENSE | 201 ++++++++++ .../600.workflows/6100.1000-genome/README.md | 1 + .../6100.1000-genome/config.json | 6 + .../6100.1000-genome/definition.json | 71 ++++ .../600.workflows/6100.1000-genome/input.py | 66 +++ .../6100.1000-genome/python/frequency.py | 272 +++++++++++++ .../6100.1000-genome/python/individuals.py | 116 ++++++ .../python/individuals_merge.py | 90 +++++ .../python/mutation_overlap.py | 379 ++++++++++++++++++ .../6100.1000-genome/python/requirements.txt | 3 + .../6100.1000-genome/python/sifting.py | 74 ++++ .../6101.1000-genome-individuals/config.json | 6 + .../definition.json | 17 + .../6101.1000-genome-individuals/input.py | 65 +++ .../python/individuals.py | 114 ++++++ .../python/requirements.txt | 3 + .../600.workflows/620.func-invo/config.json | 6 + .../620.func-invo/definition.json | 15 + .../600.workflows/620.func-invo/input.py | 16 + .../600.workflows/620.func-invo/python/gen.py | 14 + .../620.func-invo/python/process.py | 14 + .../6200.trip-booking/config.json | 6 + .../6200.trip-booking/definition.json | 41 ++ .../600.workflows/6200.trip-booking/input.py | 50 +++ .../6200.trip-booking/python/cancel_flight.py | 16 + .../6200.trip-booking/python/cancel_hotel.py | 15 + .../6200.trip-booking/python/cancel_rental.py | 16 + .../6200.trip-booking/python/confirm.py | 42 ++ .../python/reserve_flight.py | 40 ++ .../6200.trip-booking/python/reserve_hotel.py | 35 ++ .../python/reserve_rental.py | 33 ++ .../630.parallel-sleep/config.json | 6 + .../630.parallel-sleep/definition.json | 21 + 
.../600.workflows/630.parallel-sleep/input.py | 34 ++ .../630.parallel-sleep/python/generate.py | 12 + .../630.parallel-sleep/python/process.py | 7 + .../631.parallel-download/config.json | 6 + .../631.parallel-download/definition.json | 21 + .../631.parallel-download/input.py | 48 +++ .../631.parallel-download/python/generate.py | 8 + .../631.parallel-download/python/process.py | 11 + .../640.selfish-detour/config.json | 6 + .../640.selfish-detour/definition.json | 9 + .../600.workflows/640.selfish-detour/input.py | 12 + .../640.selfish-detour/package.sh | 11 + .../640.selfish-detour/python/measure.py | 37 ++ .../640.selfish-detour/selfish-detour.c | 138 +++++++ benchmarks/600.workflows/650.vid/LICENSE | 22 + benchmarks/600.workflows/650.vid/README.md | 3 + benchmarks/600.workflows/650.vid/config.json | 6 + .../600.workflows/650.vid/definition.json | 26 ++ benchmarks/600.workflows/650.vid/input.py | 30 ++ .../600.workflows/650.vid/python/analyse.py | 80 ++++ .../600.workflows/650.vid/python/decode.py | 67 ++++ .../650.vid/python/requirements.txt | 1 + .../600.workflows/650.vid/python/summarize.py | 18 + .../600.workflows/660.map-reduce/config.json | 6 + .../660.map-reduce/definition.json | 38 ++ .../600.workflows/660.map-reduce/input.py | 35 ++ .../660.map-reduce/python/map.py | 36 ++ .../660.map-reduce/python/reduce.py | 23 ++ .../660.map-reduce/python/shuffle.py | 27 ++ .../660.map-reduce/python/split.py | 53 +++ benchmarks/600.workflows/670.auth/config.json | 5 + .../600.workflows/670.auth/definition.json | 9 + benchmarks/600.workflows/670.auth/input.py | 21 + .../600.workflows/670.auth/python/auth.py | 39 ++ .../670.auth/python/requirements.txt | 1 + benchmarks/600.workflows/680.excamera/LICENSE | 22 + .../600.workflows/680.excamera/README.md | 3 + .../600.workflows/680.excamera/config.json | 6 + .../680.excamera/definition.json | 39 ++ .../600.workflows/680.excamera/input.py | 44 ++ .../680.excamera/python/encode.py | 88 ++++ .../680.excamera/python/rebase.py 
| 115 ++++++ .../680.excamera/python/reencode.py | 103 +++++ .../680.excamera/python/requirements.txt | 1 + .../680.excamera/python/split.py | 26 ++ benchmarks/600.workflows/690.ml/config.json | 6 + .../600.workflows/690.ml/definition.json | 21 + benchmarks/600.workflows/690.ml/input.py | 25 ++ .../600.workflows/690.ml/python/generate.py | 54 +++ .../600.workflows/690.ml/python/package.sh | 25 ++ .../690.ml/python/requirements.txt | 2 + .../600.workflows/690.ml/python/train.py | 72 ++++ 95 files changed, 3526 insertions(+), 1 deletion(-) create mode 100644 benchmarks/600.workflows/610.gen/config.json create mode 100644 benchmarks/600.workflows/610.gen/definition.json create mode 100644 benchmarks/600.workflows/610.gen/input.py create mode 100644 benchmarks/600.workflows/610.gen/python/few_people.py create mode 100644 benchmarks/600.workflows/610.gen/python/get_astros.py create mode 100644 benchmarks/600.workflows/610.gen/python/many_people.py create mode 100644 benchmarks/600.workflows/610.gen/python/map_astros.py create mode 100644 benchmarks/600.workflows/610.gen/python/process_astros.py create mode 100644 benchmarks/600.workflows/610.gen/python/requirements.txt create mode 100644 benchmarks/600.workflows/6100.1000-genome/LICENSE create mode 100644 benchmarks/600.workflows/6100.1000-genome/README.md create mode 100644 benchmarks/600.workflows/6100.1000-genome/config.json create mode 100644 benchmarks/600.workflows/6100.1000-genome/definition.json create mode 100644 benchmarks/600.workflows/6100.1000-genome/input.py create mode 100644 benchmarks/600.workflows/6100.1000-genome/python/frequency.py create mode 100644 benchmarks/600.workflows/6100.1000-genome/python/individuals.py create mode 100644 benchmarks/600.workflows/6100.1000-genome/python/individuals_merge.py create mode 100644 benchmarks/600.workflows/6100.1000-genome/python/mutation_overlap.py create mode 100644 benchmarks/600.workflows/6100.1000-genome/python/requirements.txt create mode 100644 
benchmarks/600.workflows/6100.1000-genome/python/sifting.py create mode 100644 benchmarks/600.workflows/6101.1000-genome-individuals/config.json create mode 100644 benchmarks/600.workflows/6101.1000-genome-individuals/definition.json create mode 100644 benchmarks/600.workflows/6101.1000-genome-individuals/input.py create mode 100644 benchmarks/600.workflows/6101.1000-genome-individuals/python/individuals.py create mode 100644 benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt create mode 100644 benchmarks/600.workflows/620.func-invo/config.json create mode 100644 benchmarks/600.workflows/620.func-invo/definition.json create mode 100644 benchmarks/600.workflows/620.func-invo/input.py create mode 100644 benchmarks/600.workflows/620.func-invo/python/gen.py create mode 100644 benchmarks/600.workflows/620.func-invo/python/process.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/config.json create mode 100644 benchmarks/600.workflows/6200.trip-booking/definition.json create mode 100644 benchmarks/600.workflows/6200.trip-booking/input.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/cancel_flight.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/cancel_hotel.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/cancel_rental.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/confirm.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/reserve_flight.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/reserve_hotel.py create mode 100644 benchmarks/600.workflows/6200.trip-booking/python/reserve_rental.py create mode 100644 benchmarks/600.workflows/630.parallel-sleep/config.json create mode 100644 benchmarks/600.workflows/630.parallel-sleep/definition.json create mode 100644 benchmarks/600.workflows/630.parallel-sleep/input.py create mode 100644 benchmarks/600.workflows/630.parallel-sleep/python/generate.py 
create mode 100644 benchmarks/600.workflows/630.parallel-sleep/python/process.py create mode 100644 benchmarks/600.workflows/631.parallel-download/config.json create mode 100644 benchmarks/600.workflows/631.parallel-download/definition.json create mode 100644 benchmarks/600.workflows/631.parallel-download/input.py create mode 100644 benchmarks/600.workflows/631.parallel-download/python/generate.py create mode 100644 benchmarks/600.workflows/631.parallel-download/python/process.py create mode 100644 benchmarks/600.workflows/640.selfish-detour/config.json create mode 100644 benchmarks/600.workflows/640.selfish-detour/definition.json create mode 100644 benchmarks/600.workflows/640.selfish-detour/input.py create mode 100644 benchmarks/600.workflows/640.selfish-detour/package.sh create mode 100644 benchmarks/600.workflows/640.selfish-detour/python/measure.py create mode 100644 benchmarks/600.workflows/640.selfish-detour/selfish-detour.c create mode 100644 benchmarks/600.workflows/650.vid/LICENSE create mode 100644 benchmarks/600.workflows/650.vid/README.md create mode 100644 benchmarks/600.workflows/650.vid/config.json create mode 100644 benchmarks/600.workflows/650.vid/definition.json create mode 100644 benchmarks/600.workflows/650.vid/input.py create mode 100644 benchmarks/600.workflows/650.vid/python/analyse.py create mode 100644 benchmarks/600.workflows/650.vid/python/decode.py create mode 100644 benchmarks/600.workflows/650.vid/python/requirements.txt create mode 100644 benchmarks/600.workflows/650.vid/python/summarize.py create mode 100644 benchmarks/600.workflows/660.map-reduce/config.json create mode 100644 benchmarks/600.workflows/660.map-reduce/definition.json create mode 100644 benchmarks/600.workflows/660.map-reduce/input.py create mode 100644 benchmarks/600.workflows/660.map-reduce/python/map.py create mode 100644 benchmarks/600.workflows/660.map-reduce/python/reduce.py create mode 100644 benchmarks/600.workflows/660.map-reduce/python/shuffle.py create mode 
100644 benchmarks/600.workflows/660.map-reduce/python/split.py create mode 100644 benchmarks/600.workflows/670.auth/config.json create mode 100644 benchmarks/600.workflows/670.auth/definition.json create mode 100644 benchmarks/600.workflows/670.auth/input.py create mode 100644 benchmarks/600.workflows/670.auth/python/auth.py create mode 100644 benchmarks/600.workflows/670.auth/python/requirements.txt create mode 100644 benchmarks/600.workflows/680.excamera/LICENSE create mode 100644 benchmarks/600.workflows/680.excamera/README.md create mode 100644 benchmarks/600.workflows/680.excamera/config.json create mode 100644 benchmarks/600.workflows/680.excamera/definition.json create mode 100644 benchmarks/600.workflows/680.excamera/input.py create mode 100644 benchmarks/600.workflows/680.excamera/python/encode.py create mode 100644 benchmarks/600.workflows/680.excamera/python/rebase.py create mode 100644 benchmarks/600.workflows/680.excamera/python/reencode.py create mode 100644 benchmarks/600.workflows/680.excamera/python/requirements.txt create mode 100644 benchmarks/600.workflows/680.excamera/python/split.py create mode 100644 benchmarks/600.workflows/690.ml/config.json create mode 100644 benchmarks/600.workflows/690.ml/definition.json create mode 100644 benchmarks/600.workflows/690.ml/input.py create mode 100644 benchmarks/600.workflows/690.ml/python/generate.py create mode 100644 benchmarks/600.workflows/690.ml/python/package.sh create mode 100644 benchmarks/600.workflows/690.ml/python/requirements.txt create mode 100644 benchmarks/600.workflows/690.ml/python/train.py diff --git a/benchmarks-data b/benchmarks-data index dec81ad59..30ca2f5c5 160000 --- a/benchmarks-data +++ b/benchmarks-data @@ -1 +1 @@ -Subproject commit dec81ad593a6b0b921d1104956cd9e2bbbd65586 +Subproject commit 30ca2f5c533c3f441deb5e05fc03a39fe65f9948 diff --git a/benchmarks/600.workflows/610.gen/config.json b/benchmarks/600.workflows/610.gen/config.json new file mode 100644 index 
000000000..b0cea12a6 --- /dev/null +++ b/benchmarks/600.workflows/610.gen/config.json @@ -0,0 +1,7 @@ +{ + "timeout": 120, + "memory": 128, + "languages": ["python"], + "modules": [], + "cpp_dependencies": [] +} diff --git a/benchmarks/600.workflows/610.gen/definition.json b/benchmarks/600.workflows/610.gen/definition.json new file mode 100644 index 000000000..fcdf203bc --- /dev/null +++ b/benchmarks/600.workflows/610.gen/definition.json @@ -0,0 +1,54 @@ +{ + "root": "get_astros", + "states": { + "get_astros": { + "type": "task", + "func_name": "get_astros", + "next": "select_astros_number" + }, + "select_astros_number": { + "type": "switch", + "cases": [ + { + "var": "astros.number", + "op": "<", + "val": 10, + "next": "few_people" + }, + { + "var": "astros.number", + "op": ">=", + "val": 10, + "next": "many_people" + } + ], + "default": "few_people" + }, + "few_people": { + "type": "task", + "func_name": "few_people", + "next": "map_astros" + }, + "many_people": { + "type": "task", + "func_name": "many_people", + "next": "map_astros" + }, + "map_astros": { + "type": "map", + "array": "astros.people", + "root": "map_astros", + "next": "process_astros", + "states": { + "map_astros": { + "type": "task", + "func_name": "map_astros" + } + } + }, + "process_astros": { + "type": "task", + "func_name": "process_astros" + } + } +} diff --git a/benchmarks/600.workflows/610.gen/input.py b/benchmarks/600.workflows/610.gen/input.py new file mode 100644 index 000000000..1300f159d --- /dev/null +++ b/benchmarks/600.workflows/610.gen/input.py @@ -0,0 +1,5 @@ +def buckets_count(): + return (0, 0) + +def generate_input(data_dir, size, bucket, input_buckets, output_buckets, upload_func, nosql_func): + return dict() \ No newline at end of file diff --git a/benchmarks/600.workflows/610.gen/python/few_people.py b/benchmarks/600.workflows/610.gen/python/few_people.py new file mode 100644 index 000000000..9c70d9fbc --- /dev/null +++ 
b/benchmarks/600.workflows/610.gen/python/few_people.py @@ -0,0 +1,5 @@ +def handler(event): + return { + "many_astros": False, + **event + } \ No newline at end of file diff --git a/benchmarks/600.workflows/610.gen/python/get_astros.py b/benchmarks/600.workflows/610.gen/python/get_astros.py new file mode 100644 index 000000000..627c65231 --- /dev/null +++ b/benchmarks/600.workflows/610.gen/python/get_astros.py @@ -0,0 +1,8 @@ +import requests + +def handler(event): + res = requests.get("http://api.open-notify.org/astros.json") + + return { + "astros": res.json() + } \ No newline at end of file diff --git a/benchmarks/600.workflows/610.gen/python/many_people.py b/benchmarks/600.workflows/610.gen/python/many_people.py new file mode 100644 index 000000000..2d339f325 --- /dev/null +++ b/benchmarks/600.workflows/610.gen/python/many_people.py @@ -0,0 +1,5 @@ +def handler(event): + return { + "many_astros": True, + **event + } \ No newline at end of file diff --git a/benchmarks/600.workflows/610.gen/python/map_astros.py b/benchmarks/600.workflows/610.gen/python/map_astros.py new file mode 100644 index 000000000..5cfa4631b --- /dev/null +++ b/benchmarks/600.workflows/610.gen/python/map_astros.py @@ -0,0 +1,7 @@ +def handler(elem): + name = elem["name"] + parts = name.split(" ", 1) + name = " ".join(reversed(parts)) + elem["name_rev"] = name + + return elem \ No newline at end of file diff --git a/benchmarks/600.workflows/610.gen/python/process_astros.py b/benchmarks/600.workflows/610.gen/python/process_astros.py new file mode 100644 index 000000000..a981660e0 --- /dev/null +++ b/benchmarks/600.workflows/610.gen/python/process_astros.py @@ -0,0 +1,5 @@ +def handler(arr): + return { + "astros": arr, + "done": True + } \ No newline at end of file diff --git a/benchmarks/600.workflows/610.gen/python/requirements.txt b/benchmarks/600.workflows/610.gen/python/requirements.txt new file mode 100644 index 000000000..f2293605c --- /dev/null +++ 
b/benchmarks/600.workflows/610.gen/python/requirements.txt @@ -0,0 +1 @@ +requests diff --git a/benchmarks/600.workflows/6100.1000-genome/LICENSE b/benchmarks/600.workflows/6100.1000-genome/LICENSE new file mode 100644 index 000000000..8c0e74641 --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2007-2012 University Of Southern California + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/benchmarks/600.workflows/6100.1000-genome/README.md b/benchmarks/600.workflows/6100.1000-genome/README.md new file mode 100644 index 000000000..0719dbc0a --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/README.md @@ -0,0 +1 @@ +The implementation of the 1000Genome benchmark is based on the implementation of the 1000Genome workflow here: https://github.com/pegasus-isi/1000genome-workflow, with the license provided. 
diff --git a/benchmarks/600.workflows/6100.1000-genome/config.json b/benchmarks/600.workflows/6100.1000-genome/config.json new file mode 100644 index 000000000..e14b3b052 --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 540, + "memory": 2048, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/600.workflows/6100.1000-genome/definition.json b/benchmarks/600.workflows/6100.1000-genome/definition.json new file mode 100644 index 000000000..e242fd1fb --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/definition.json @@ -0,0 +1,71 @@ +{ + "root": "individuals", + "states": { + "individuals": { + "type": "map", + "root": "individuals_func", + "array": "blob", + "common_params": "benchmark_bucket,bucket,columns,columns_bucket,populations,sifting_input,individuals_file", + "next": "merge_and_sifting", + "states": { + "individuals_func": { + "type": "task", + "func_name": "individuals" + } + } + }, + "merge_and_sifting": { + "type": "parallel", + "parallel_functions": [{ + "root": "individuals_merge", + "states": { + "individuals_merge": { + "type": "task", + "func_name": "individuals_merge" + }}}, + { + "root": "sifting", + "states": { + "sifting": { + "type": "task", + "func_name": "sifting" + } + }}], + "next": "frequency_and_overlap" + }, + "frequency_and_overlap": { + "type": "parallel", + "parallel_functions": [{ + "root": "frequency", + "states": { + "frequency": { + "type": "map", + "root": "frequency_func", + "array": "sifting.populations", + "common_params": "sifting,individuals_merge", + "states": { + "frequency_func": { + "type": "task", + "func_name": "frequency" + } + } + }}}, + { + "root": "mutation_overlap", + "states": { + "mutation_overlap": { + "type": "map", + "array": "sifting.populations", + "common_params": "sifting,individuals_merge", + "root": "mutation_overlap_func", + "states": { + "mutation_overlap_func": { + "type": "task", + "func_name": 
"mutation_overlap" + } + } + } + }}] + } + } +} diff --git a/benchmarks/600.workflows/6100.1000-genome/input.py b/benchmarks/600.workflows/6100.1000-genome/input.py new file mode 100644 index 000000000..def8d0195 --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/input.py @@ -0,0 +1,66 @@ +import os +import re +import uuid +import io + +size_generators = { + "test" : (1), + "small": (5), + "small-10": (10), + "large": (10), +} + +def buckets_count(): + return (1, 1) + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + files = ["ALL.chr21.1250.vcf", "ALL.chr21.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.annotation.vcf", "columns.txt", "AFR", "ALL", "AMR", "EAS", "EUR", "GBR", "SAS"] + for name in files: + #if name != "ALL.chr21.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.annotation.vcf": + path = os.path.join(data_dir, name) + upload_func(0, name, path) + + num_individuals_jobs = size_generators[size] + + blobs = [] + start_bytes = 0 + with open(os.path.join(data_dir, files[0]), "r") as f: + content = f.readlines() + #TODO potentially change if input file with different number of lines is to be processed. + range_per_job = 1250 / num_individuals_jobs + for i in range(0, num_individuals_jobs): + #actually split file; return it afterwards. see e.g. split.py in 660.map-reduce. + #regex = re.compile('(?!#)') + start = i * range_per_job + end = i * range_per_job + range_per_job + #print("start: ", start, "end: ", end, "range_per_job: ", range_per_job, "num_individuals_jobs: ", num_individuals_jobs) + #data = list(filter(regex.match, content[int(start):int(end)])) + data = content[int(start):int(end)] + #name with start and end lines is not needed as all individuals jobs can just read their entire file. 
+ name = str(uuid.uuid4())[:8] + + upload_data = io.BytesIO() + upload_data.writelines((val).encode("utf-8") for val in data) + upload_data.seek(0) + #name = client.upload_stream(output_bucket, name, upload_data) + #TODO keep track of start + stop bytes and return them. + nbytes = upload_data.getbuffer().nbytes + + output = { + "start_bytes": start_bytes, + "end_bytes": start_bytes + nbytes - 1 + } + + blobs.append(output) + start_bytes += nbytes + + return { + "bucket": output_buckets[0], + "blob": blobs, + "individuals_file": files[0], + "benchmark_bucket": benchmarks_bucket, + "columns_bucket": input_buckets[0], + "columns": files[2], + "populations": files[3:9], + "sifting_input": files[1], + } diff --git a/benchmarks/600.workflows/6100.1000-genome/python/frequency.py b/benchmarks/600.workflows/6100.1000-genome/python/frequency.py new file mode 100644 index 000000000..969a76de2 --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/python/frequency.py @@ -0,0 +1,272 @@ +import time + +tic = time.perf_counter() +import numpy as np +from random import sample +import os.path +import matplotlib + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import collections +from collections import Counter + +import datetime + +import os +from . 
import storage + + +class ReadData: + def read_names(self, POP, pop_dir, columns_file): + tic = time.perf_counter() + namefile = pop_dir + POP + f = open(namefile, 'r') + text = f.read() + f.close() + text = text.split() + all_ids = text[0:] + file = columns_file + f = open(file, 'r') + text = f.read() + f.close() + genome_ids = text.split() + + ids = list(set(all_ids) & set(genome_ids)) + return ids + + def read_rs_numbers(self, siftfile, SIFT): + ## NB This file is in the format of: + ## line number, rs number, ENSG number, SIFT, Phenotype + tic = time.perf_counter() + rs_numbers = [] + variations = {} + map_variations = {} + all_variations = [] + sift_file = open(siftfile, 'r') + for item in sift_file: + item = item.split() + if len(item) > 2: + rs_numbers.append(item[1]) + map_variations[item[1]] = item[2] + variations[item[0]] = item[2] + + return rs_numbers, map_variations + + def read_individuals(self, ids, rs_numbers, data_dir, chrom, individuals_merge_filename): + tic = time.perf_counter() + mutation_index_array = [] + for name in ids: + filename = data_dir + individuals_merge_filename + '/' + chrom + '.' 
+ name + f = open(filename, 'r') + text = [] + for item in f: + item = item.split() + try: + text.append(item[1]) + except IndexError as e: + print("ERROR({}): while reading {}: (item: {})".format(str(e), filename, item)) + sifted_mutations = list(set(rs_numbers).intersection(text)) + mutation_index_array.append(sifted_mutations) + + return mutation_index_array + + +class Results: + + def overlap_ind(self, ids, mutation_index_array, n_runs, n_indiv): + n_p = len(mutation_index_array) + tic = time.perf_counter() + list_p = np.linspace(0, n_p - 1, n_p).astype(int) + mutation_overlap = [] + random_indiv = [] + for run in range(n_runs): + randomized_list = sample(list(list_p), n_p) + result = Counter() + r_ids = [] + for pq in range(n_indiv): + if 2 * pq >= len(randomized_list): + break + b_multiset = collections.Counter(mutation_index_array[randomized_list[2 * pq]]) + r_ids.append(ids[randomized_list[2 * pq]]) + result = result + b_multiset + random_indiv.append(r_ids) + mutation_overlap.append(result) + return mutation_overlap, random_indiv + + def histogram_overlap(self, mutation_overlap, n_runs): + tic = time.perf_counter() + histogram_overlap = [] + for run in range(n_runs): + final_counts = [count for item, count in mutation_overlap[run].items()] + histogram_overlap.append(collections.Counter(final_counts)) + return histogram_overlap + + +class PlotData: + + def plot_histogram_overlap(self, POP, histogram_overlap, outputFile, n_runs): + tic = time.perf_counter() + for run in range(n_runs): + output = outputFile + str(run) + '.png' + final_counts = [count for item, count in histogram_overlap[run].items()] + N = len(final_counts) + x = range(N) + width = 1 / 1.5 + bar1 = plt.bar(x, final_counts, width, color="grey") + plt.ylabel('Mutations') + plt.xlabel('Individuals') + plt.xticks(np.arange(1, N + 1)) + plt.savefig(output) + plt.close() + + +class WriteData: + + def write_histogram_overlap(self, histogram_overlapfile, histogram_overlap, n_runs, n_indiv): + tic = 
time.perf_counter() + for run in range(n_runs): + overlapfile = histogram_overlapfile + str(run) + '.txt' + f = open(overlapfile, 'w') + f.write('Number Individuals - Number Mutations \n') + for i in range(1, n_indiv + 1): + if i in histogram_overlap[run]: + f.write(str(i) + '-' + str(histogram_overlap[run][i]) + '\n') + else: + f.write(str(i) + '-' + str(0) + '\n') + f.close() + + + def write_mutation_overlap(self, mutation_overlapfile, mutation_overlap, n_runs): + tic = time.perf_counter() + for run in range(n_runs): + overlapfile = mutation_overlapfile + str(run) + '.txt' + f = open(overlapfile, 'w') + f.write('Mutation Index- Number Overlapings \n') + for key, count in mutation_overlap[run].items(): + f.write(key + '-' + str(count) + '\n') + f.close() + + def write_random_indiv(self, randomindiv_file, random_indiv, n_runs): + tic = time.perf_counter() + for run in range(n_runs): + randomfile = randomindiv_file + str(run) + '.txt' + f = open(randomfile, 'w') + f.write('Individuals \n') + for item in random_indiv[run]: + f.write("%s\n" % item) + f.close() + + def write_mutation_index_array(self, mutation_index_array_file, mutation_index_array): + tic = time.perf_counter() + f = open(mutation_index_array_file, "w") + for item in mutation_index_array: + f.write("%s\n" % item) + f.close() + + def write_map_variations(self, map_variations_file, map_variations): + tic = time.perf_counter() + f = open(map_variations_file, 'w') + for key, count in map_variations.items(): + f.write(key + '\t' + str(count) + '\n') + f.close() + + +def handler(event): + POP = event["array_element"] + benchmark_bucket = event["sifting"]["benchmark_bucket"] + output_bucket = event["sifting"]["output_bucket"] + input_bucket = event["sifting"]["input_bucket"] + sifting_filename = event["sifting"]["output_sifting"] + individuals_merge_filename = event["individuals_merge"]["merge_outputfile_name"] + + #download files + siftfile = os.path.join("/tmp", "sifting.txt") + individuals_merge_file = 
os.path.join("/tmp", "individuals_merge.tar.gz") + pop_file = os.path.join("/tmp", POP) + columns_file = os.path.join("/tmp", "columns.txt") + + client = storage.storage.get_instance() + client.download(benchmark_bucket, output_bucket + '/' + sifting_filename, siftfile) + client.download(benchmark_bucket, output_bucket + '/' + individuals_merge_filename, individuals_merge_file) + client.download(benchmark_bucket, input_bucket + '/' + POP, pop_file) + client.download(benchmark_bucket, input_bucket + '/' + "columns.txt", columns_file) + + #chromosome number, doesn't matter here - just used for naming + c = 21 + + SIFT = 'NO-SIFT' + n_runs = 1000 + n_indiv = 52 + + data_dir = '/tmp/' + pop_dir = '/tmp/' + outdata_dir = "/tmp/chr{0}-{1}-freq/output_no_sift/".format(str(c), str(POP)) + plot_dir = "/tmp/chr{0}-{1}-freq/plots_no_sift/".format(str(c), str(POP)) + + if not os.path.exists(outdata_dir): + os.makedirs(outdata_dir, exist_ok=True) + if not os.path.exists(plot_dir): + os.makedirs(plot_dir, exist_ok=True) + + OutputFormat = '.png' + chrom = 'chr' + str(c) + + font = {'family': 'serif', 'size': 14} + plt.rc('font', **font) + + # untar input data + import tarfile + + tar = tarfile.open(individuals_merge_file) + tar.extractall(path='/tmp/' + individuals_merge_filename) + tar.close() + + rd = ReadData() + res = Results() + wr = WriteData() + pd = PlotData() + + histogram_overlapfile = outdata_dir + 'Histogram_mutation_overlap_chr' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '_' + mutation_overlapfile = outdata_dir + 'Mutation_overlap_chr' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '_' + mutation_index_array_file = outdata_dir + 'mutation_index_array' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + histogram_overlap_plot = plot_dir + 'Frequency_mutations' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + map_variations_file = outdata_dir + 'map_variations' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + + randomindiv_file = outdata_dir + 'random_indiv' + 
str(c) + '_s' + \ + str(SIFT) + '_' + POP + '_' + + ids = rd.read_names(POP, pop_dir, columns_file) + n_pairs = len(ids) / 2 + + rs_numbers, map_variations = rd.read_rs_numbers(siftfile, SIFT) + mutation_index_array = rd.read_individuals(ids, rs_numbers, data_dir, chrom, individuals_merge_filename) + + wr.write_map_variations(map_variations_file, map_variations) + wr.write_mutation_index_array(mutation_index_array_file, mutation_index_array) + + mutation_overlap, random_indiv = res.overlap_ind(ids, mutation_index_array, n_runs, n_indiv) + histogram_overlap = res.histogram_overlap(mutation_overlap, n_runs) + + wr.write_mutation_overlap(mutation_overlapfile, mutation_overlap, n_runs) + wr.write_histogram_overlap(histogram_overlapfile, histogram_overlap, n_runs, n_indiv) + wr.write_random_indiv(randomindiv_file, random_indiv, n_runs) + + pd.plot_histogram_overlap(POP, histogram_overlap, histogram_overlap_plot, n_runs) + + # gen final output + tar = tarfile.open('/tmp/chr%s-%s-freq.tar.gz' % (c, POP), 'w:gz') + tar.add(outdata_dir) + tar.add(plot_dir) + tar.close() + result_name = client.upload(benchmark_bucket, output_bucket + '/' + 'chr%s-%s-freq.tar.gz' % (c, POP), '/tmp/chr%s-%s-freq.tar.gz' % (c, POP)) + result_name = result_name.replace(output_bucket + '/', '') + + return { + "output_frequency": result_name + } diff --git a/benchmarks/600.workflows/6100.1000-genome/python/individuals.py b/benchmarks/600.workflows/6100.1000-genome/python/individuals.py new file mode 100644 index 000000000..e156d0f5b --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/python/individuals.py @@ -0,0 +1,116 @@ +import os +import uuid +import tarfile +import shutil +import re +from . 
import storage +import datetime + +client = storage.storage.get_instance() + + +def compress(output, input_dir): + with tarfile.open(output, "w:gz") as file: + file.add(input_dir, arcname=os.path.basename(input_dir)) + +def readfile(file): + with open(file, 'r') as f: + content = f.readlines() + return content + +def handler(event): + benchmark_bucket = event["benchmark_bucket"] + individuals_bucket = event["bucket"] + individuals_input = event["individuals_file"] + + start_bytes = event["array_element"]["start_bytes"] + end_bytes = event["array_element"]["end_bytes"] + + columns = event["columns"] + columns_bucket = event["columns_bucket"] + columns_path = os.path.join("/tmp", "columns.txt") + + client = storage.storage.get_instance() + client.download(benchmark_bucket, columns_bucket + '/' + columns, columns_path) + data = client.download_within_range(benchmark_bucket, columns_bucket + '/' + individuals_input, start_bytes, end_bytes) + + + ndir = 'chr{}n-{}/'.format(21, individuals_input) + ndir = os.path.join("/tmp", ndir) + os.makedirs(ndir, exist_ok=True) + + + regex = re.compile('(?!#)') + #print("data: ", data) + data = data.split("\n") + data = list(filter(lambda line: regex.match(line) and line != "", data)) + + chrp_data = {} + columndata = readfile(columns_path)[0].rstrip('\n').split('\t') + + start_data = 9 # where the real data start, the first 0|1, 1|1, 1|0 or 0|0 + # position of the last element (normally equals to len(data[0].split(' ')) + #end_data = 2504 + end_data = len(columndata) - start_data + + for i in range(0, end_data): + col = i + start_data + name = columndata[col] + + filename = "{}/chr{}.{}".format(ndir, "21", name) + chrp_data[i] = [] + + with open(filename, 'w') as f: + zeilennummer = 0 + for line in data: + zeilennummer += 1 + try: + first = line.split('\t')[col] # first =`echo $l | cut -d -f$i` + except Exception as e: + print("faulty line at col = ", col, "zeilennummer:", zeilennummer, " line : ", line) + raise e + #second =`echo 
$l | cut -d -f 2, 3, 4, 5, 8 --output-delimiter = ' '` + second = line.split('\t')[0:8] + # We select the one we want + second = [elem for id, elem in enumerate(second) if id in [1, 2, 3, 4, 7]] + af_value = second[4].split(';')[8].split('=')[1] + # We replace with AF_Value + second[4] = af_value + try: + if ',' in af_value: + # We only keep the first value if more than one (that's what awk is doing) + af_value = float(af_value.split(',')[0]) + else: + af_value = float(af_value) + + elem = first.split('|') + # We skip some lines that do not meet these conditions + if af_value >= 0.5 and elem[0] == '0': + chrp_data[i].append(second) + elif af_value < 0.5 and elem[0] == '1': + chrp_data[i].append(second) + else: + continue + + f.write("{0} {1} {2} {3} {4}\n".format( + second[0], second[1], second[2], second[3], second[4]) + ) + except ValueError: + continue + + outputfile = "chr{}n-{}.tar.gz".format(21, individuals_input) + + # tar -zcf .. /$outputfile . + compress(os.path.join("/tmp/", outputfile), ndir) + outputfile_name = client.upload(benchmark_bucket, individuals_bucket + '/' + outputfile, os.path.join("/tmp/", outputfile)) + outputfile_name = outputfile_name.replace(individuals_bucket + '/', '') + + # Cleaning temporary files + try: + shutil.rmtree(ndir) + except OSError as e: + print("Error: %s : %s" % (ndir, e.strerror)) + + return { + "individuals_output": outputfile_name + } diff --git a/benchmarks/600.workflows/6100.1000-genome/python/individuals_merge.py b/benchmarks/600.workflows/6100.1000-genome/python/individuals_merge.py new file mode 100644 index 000000000..7a563366b --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/python/individuals_merge.py @@ -0,0 +1,90 @@ +import os +from . 
import storage +import time +import tarfile +import tempfile +import shutil +import datetime + +def handler(event): + benchmark_bucket = event["benchmark_bucket"] + individuals_output_bucket = event["bucket"] + filenames = [] + for elem in event["blob"]: + filenames.append(elem["individuals_output"]) + + #download files + client = storage.storage.get_instance() + for file in filenames: + client.download(benchmark_bucket, individuals_output_bucket + '/' + file, os.path.join('/tmp', file)) + + #call merging with c and directories. + outputfile_name, outputfile = merging(21, filenames) + + #upload outputfile + outputfile_name = client.upload(benchmark_bucket, individuals_output_bucket + '/' + outputfile_name, outputfile) + outputfile_name = outputfile_name.replace(individuals_output_bucket + '/', '') + + + return { + "merge_outputfile_name": outputfile_name + } + +def compress(archive, input_dir): + with tarfile.open(archive, "w:gz") as f: + f.add(input_dir, arcname="") + +def extract_all(archive, output_dir): + with tarfile.open(archive, "r:*") as f: + f.extractall(output_dir) + flist = f.getnames() + if flist[0] == '': + flist = flist[1:] + return flist + +def readfile(filename): + with open(filename, 'r') as f: + content = f.readlines() + return content + +def writefile(filename, content): + with open(filename, 'w') as f: + f.writelines(content) + +def merging(c, tar_files): + tic = time.perf_counter() + + + merged_dir = "merged_chr{}".format(c) + merged_dir = os.path.join("/tmp", merged_dir) + os.makedirs(merged_dir, exist_ok=True) + + data = {} + + for tar in tar_files: + tic_iter = time.perf_counter() + os.makedirs("/tmp/temp_dir", exist_ok=True) + with tempfile.TemporaryDirectory(dir="/tmp/temp_dir") as temp_dir: + for filename in extract_all(os.path.join("/tmp", tar), temp_dir): + content = readfile(os.path.join(temp_dir, filename)) + if filename in data: + data[filename] += content + else: + data[filename] = content + + + for filename,content in data.items(): 
+ writefile(os.path.join(merged_dir, filename), content) + + outputfile_name = "chr{}n.tar.gz".format(c) + outputfile = os.path.join("/tmp", outputfile_name) + + compress(outputfile, merged_dir) + + # Cleaning temporary files + try: + shutil.rmtree(merged_dir) + except OSError as e: + print("Error: %s : %s" % (merged_dir, e.strerror)) + + return outputfile_name, outputfile diff --git a/benchmarks/600.workflows/6100.1000-genome/python/mutation_overlap.py b/benchmarks/600.workflows/6100.1000-genome/python/mutation_overlap.py new file mode 100644 index 000000000..2c377e47c --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/python/mutation_overlap.py @@ -0,0 +1,379 @@ +import time + +tic = time.perf_counter() +import numpy as np +from random import sample +import os +import os.path +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import itertools +from matplotlib import pyplot +import matplotlib as mpl +import collections +from collections import Counter +import datetime + +import os +from . 
import storage + + +class ReadData : + def read_names(self, POP, pop_dir, columns_file) : + tic = time.perf_counter() + namefile = pop_dir + POP + f = open(namefile, 'r') + text = f.read() + f.close() + text = text.split() + all_ids = text[0:] + file = columns_file + f = open(file, 'r') + text = f.read() + f.close() + genome_ids = text.split() + + ids = list(set(all_ids) & set(genome_ids)) + + return ids + + def read_rs_numbers(self, siftfile, SIFT) : + ## NB This file is in the format of: + ## line number, rs number, ENSG number, SIFT, Phenotype + tic = time.perf_counter() + rs_numbers = [] + variations = {} + map_variations = {} + all_variations = [] + sift_file = open(siftfile,'r') + for item in sift_file: + item = item.split() + if len(item) > 2: + rs_numbers.append(item[1]) + map_variations[item[1]] = item[2] + + return rs_numbers, map_variations + + def read_individuals(self, ids, rs_numbers, data_dir, chrom, individuals_merge_filename) : + tic = time.perf_counter() + mutation_index_array = [] + total_mutations={} + total_mutations_list =[] + for name in ids : + filename = data_dir + individuals_merge_filename + '/' + chrom + '.' 
+ name + f = open(filename, 'r') + text = f.read() + f.close() + text = text.split() + sifted_mutations = list(set(rs_numbers).intersection(text)) + mutation_index_array.append(sifted_mutations) + total_mutations[name]= len(sifted_mutations) + total_mutations_list.append(len(sifted_mutations)) + + return mutation_index_array, total_mutations, total_mutations_list + + def read_pairs_overlap(self, indpairsfile) : + tic = time.perf_counter() + pairs_overlap = np.loadtxt(indpairsfile, unpack=True) + pairs_overlap = np.transpose(pairs_overlap) + + return pairs_overlap + + +class Results : + + def group_indivuals(self, total_mutations_list, n_runs) : + tic = time.perf_counter() + n_group = 26 + random_mutations_list= [] + for run in range(n_runs): + random_mutations_list.append(sample(total_mutations_list, n_group)) + return random_mutations_list + + def pair_individuals(self, mutation_index_array, n_runs) : + tic = time.perf_counter() + + n_p = len(mutation_index_array) + n_pairs = int(round(n_p/2)) + list_p = np.linspace(0, n_p - 1, n_p).astype(int) + pairs_overlap = np.zeros((n_runs, n_pairs)) + for run in range(n_runs) : + randomized_list = sample(list(list_p) , n_p) + for pq in range(n_pairs) : + array1 = mutation_index_array[randomized_list[2*pq]] + + array2 = mutation_index_array[randomized_list[2*pq]] + pair_array = set(array1) & set(array2) + pairs_overlap[run][pq] = len(pair_array) + + return pairs_overlap + + def total_pair_individuals (self, mutation_index_array) : + tic = time.perf_counter() + n_p = len(mutation_index_array) + total_pairs_overlap = np.zeros((n_p, n_p)) + simetric_overlap = np.zeros((n_p, n_p)) + for run in range(n_p): + array1 = mutation_index_array[run] + start = run +1 + for pq in range(start, n_p) : + array2 = mutation_index_array[pq] + pairs_array = set(array1) & set(array2) + total_pairs_overlap[run][pq]=len(pairs_array) + simetric_overlap[run][pq] = len(pairs_array) + simetric_overlap[pq][run]= len(pairs_array) + + return 
total_pairs_overlap , simetric_overlap + + def half_pair_individuals(self, mutation_index_array) : + tic = time.perf_counter() + n_p = len(mutation_index_array) + n_pairs = int(round(n_p/2)) + pairs_overlap = np.zeros((n_pairs, n_pairs)) + for run in range(n_pairs): + array1 = mutation_index_array[run] + index =0 + for pq in range(n_pairs+1, n_p): + array2 = mutation_index_array[pq] + pairs_array = set(array1) & set(array2) + pairs_overlap[run][index]=len(pairs_array) + + return pairs_overlap + + def gene_pairs(self, mutation_index_array) : + + tic = time.perf_counter() + n_p = len(mutation_index_array) + gene_pair_list = {} + for pp in range(n_p) : + pairs = itertools.combinations(mutation_index_array[pp], 2) + for pair in pairs : + key = str(pair) + if key not in gene_pair_list : gene_pair_list[key] = 1 + else : gene_pair_list[key] += 1 + + + return gene_pair_list + +class PlotData : + + def individual_overlap(self, POP, pairs_overlap, outputFile, c, SIFT) : + tic = time.perf_counter() + + pairs_overlap = np.array(pairs_overlap) + + min_p = np.min(pairs_overlap) + max_p = np.max(pairs_overlap) + nbins = int(max_p) + 1 + n_runs = len(pairs_overlap) + + + nbins = int(np.max(pairs_overlap)) + bin_centres = np.linspace(0, nbins, nbins) + bin_edges = np.linspace(-0.5, nbins + 0.5, nbins + 1) + + fig = plt.figure(frameon=False, figsize=(10, 9)) + ax = fig.add_subplot(111) + hists = [] + max_h = 0 + for run in range(n_runs) : + h, edges = np.histogram(pairs_overlap[run], bins = bin_edges) + ax.plot(bin_centres, h, alpha = 0.5) + if len(h) > 0: + max_h = max(max_h, max(h)) + + plt.xlabel('Number of overlapping gene mutations', fontsize = 24) + plt.ylabel(r'frequency', fontsize = 28) + text1 = 'population ' + POP + '\n' +\ + 'chromosome ' + str(c) + '\n' + \ + 'SIFT < ' + str(SIFT) + '\n' + \ + str(n_runs) + ' runs' + plt.text(.95, .95, text1, fontsize = 24, + verticalalignment='top', horizontalalignment='right', + transform = ax.transAxes) + plt.savefig(outputFile) + 
plt.close() + + def total_colormap_overlap(self, POP, total_pairs_overlap, outputFile): + tic = time.perf_counter() + fig = plt.figure() + cmap = mpl.colors.ListedColormap(['blue','black','red', 'green', 'pink']) + img = pyplot.imshow(total_pairs_overlap,interpolation='nearest', cmap = cmap, origin='lower') + pyplot.colorbar(img,cmap=cmap) + + plt.savefig(outputFile) + plt.close() + + +class WriteData : + def write_pair_individuals(self, indpairsfile, pairs_overlap) : + tic = time.perf_counter() + np.savetxt(indpairsfile, pairs_overlap, fmt = '%i') + + def write_gene_pairs(self, genepairsfile, gene_pair_list) : + tic = time.perf_counter() + f = open(genepairsfile, 'w') + for key, count in gene_pair_list.items() : + f.write(key + '\t' + str(count) + '\n') + f.close() + + def write_total_indiv(self, total_mutations_filename, total_mutations) : + tic = time.perf_counter() + f = open(total_mutations_filename, 'w') + for key, count in total_mutations.items() : + f.write(key + '\t' + str(count) + '\n') + f.close() + + def write_random_mutations_list(self, random_mutations_filename, random_mutations_list, n_runs) : + for run in range(n_runs): + filename= random_mutations_filename +'_run_' + str(run) + '.txt' + f = open(filename, 'w') + f.writelines(["%s\n" % item for item in random_mutations_list[run]]) + + def write_mutation_index_array(self, mutation_index_array_file, mutation_index_array): + f=open(mutation_index_array_file,"w") + for item in mutation_index_array: + f.write("%s\n" % item) + f.close() + + def write_map_variations(self, map_variations_file, map_variations) : + tic = time.perf_counter() + f = open(map_variations_file, 'w') + for key, count in map_variations.items() : + f.write(key + '\t' + str(count) + '\n') + f.close() + + + +def handler(event): + POP = event["array_element"] + benchmark_bucket = event["sifting"]["benchmark_bucket"] + output_bucket = event["sifting"]["output_bucket"] + input_bucket = event["sifting"]["input_bucket"] + sifting_filename = 
event["sifting"]["output_sifting"] + individuals_merge_filename = event["individuals_merge"]["merge_outputfile_name"] + + + #download files + siftfile = os.path.join("/tmp", "sifting.txt") + individuals_merge_file = os.path.join("/tmp", "individuals_merge.tar.gz") + pop_file = os.path.join("/tmp", POP) + columns_file = os.path.join("/tmp", "columns.txt") + + client = storage.storage.get_instance() + client.download(benchmark_bucket, output_bucket + '/' + sifting_filename, siftfile) + client.download(benchmark_bucket, output_bucket + '/' + individuals_merge_filename, individuals_merge_file) + client.download(benchmark_bucket, input_bucket + '/' + POP, pop_file) + client.download(benchmark_bucket, input_bucket + '/' + "columns.txt", columns_file) + #chromosome no, doesn't matter. + c = 21 + + SIFT = 'NO-SIFT' + n_runs = 1 + + data_dir = '/tmp/' + pop_dir = '/tmp/' + outdata_dir = "/tmp/chr{0}-{1}/output_no_sift/".format(str(c), str(POP)) + plots_dir = "/tmp/chr{0}-{1}/plots_no_sift/".format(str(c), str(POP)) + + if not os.path.exists(outdata_dir): + os.makedirs(outdata_dir, exist_ok=True) + if not os.path.exists(plots_dir): + os.makedirs(plots_dir, exist_ok=True) + + OutputFormat = '.png' + chrom = 'chr' + str(c) + + font = {'family':'serif', + 'size':14 } + plt.rc('font', **font) + + + # untar input data + import tarfile + tar = tarfile.open(individuals_merge_file) + tar.extractall(path='/tmp/' + individuals_merge_filename) + tar.close() + + tic = time.perf_counter() + + rd = ReadData() + res = Results() + wr = WriteData() + pd = PlotData() + + half_indpairsfile = outdata_dir + 'individual_half_pairs_overlap_chr' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + total_indpairsfile = outdata_dir + 'total_individual_pairs_overlap_chr' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + genepairsfile = outdata_dir + 'gene_pairs_count_chr' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + random_indpairsfile = outdata_dir + '100_individual_overlap_chr' + 
str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + + colormap = plots_dir + 'colormap_distribution_c' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + OutputFormat + half_overlap = plots_dir + 'half_distribution_c' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + OutputFormat + total_overlap = plots_dir + 'total_distribution_c' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + OutputFormat + random_overlap = plots_dir + '100_distribution_c' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + OutputFormat + + total_mutations_filename = outdata_dir + 'total_mutations_individual' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + random_mutations_filename = outdata_dir + 'random_mutations_individual' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + + mutation_index_array_file = outdata_dir + 'mutation_index_array' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + + map_variations_file = outdata_dir + 'map_variations' + str(c) + '_s' + \ + str(SIFT) + '_' + POP + '.txt' + + + + ids = rd.read_names(POP, pop_dir, columns_file) + n_pairs = len(ids)/2 + + + rs_numbers, map_variations = rd.read_rs_numbers(siftfile, SIFT) + mutation_index_array, total_mutations, total_mutations_list = rd.read_individuals(ids, rs_numbers, data_dir, chrom, individuals_merge_filename) + wr.write_total_indiv(total_mutations_filename, total_mutations) + wr.write_map_variations(map_variations_file, map_variations) + + #cross-correlations mutations overlapping + half_pairs_overlap = res.half_pair_individuals(mutation_index_array) + total_pairs_overlap, simetric_overlap = res.total_pair_individuals(mutation_index_array) + random_pairs_overlap = res.pair_individuals(mutation_index_array, n_runs) + + wr.write_mutation_index_array(mutation_index_array_file, mutation_index_array) + wr.write_pair_individuals(half_indpairsfile, half_pairs_overlap) + wr.write_pair_individuals(total_indpairsfile, total_pairs_overlap) + wr.write_pair_individuals(random_indpairsfile, random_pairs_overlap,) + + 
pd.individual_overlap(POP, half_pairs_overlap, half_overlap, c, SIFT) + pd.individual_overlap(POP, simetric_overlap, total_overlap, c, SIFT) + pd.individual_overlap(POP, random_pairs_overlap, random_overlap, c, SIFT) + pd.total_colormap_overlap(POP, total_pairs_overlap, colormap) + + #list of frecuency of mutations in 26 individuals + random_mutations_list=res.group_indivuals(total_mutations_list, n_runs) + wr.write_random_mutations_list(random_mutations_filename, random_mutations_list, n_runs) + + # gen overlapping + gene_pair_list = res.gene_pairs(mutation_index_array) + wr.write_gene_pairs(genepairsfile, gene_pair_list) + + # gen final output + tar = tarfile.open('/tmp/chr%s-%s.tar.gz' % (c, POP), 'w:gz') + tar.add(outdata_dir) + tar.add(plots_dir) + tar.close() + result_name = client.upload(benchmark_bucket, output_bucket + '/' + 'chr%s-%s.tar.gz' % (c, POP), '/tmp/chr%s-%s.tar.gz' % (c, POP)) + result_name = result_name.replace(output_bucket + '/', '') + + return { + "output_mutation_overlap": result_name + } diff --git a/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt b/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt new file mode 100644 index 000000000..c357805d6 --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt @@ -0,0 +1,3 @@ +#numpy==1.17 +numpy==1.25 #1.16 works on Azure, but not AWS +matplotlib diff --git a/benchmarks/600.workflows/6100.1000-genome/python/sifting.py b/benchmarks/600.workflows/6100.1000-genome/python/sifting.py new file mode 100644 index 000000000..2add45bdb --- /dev/null +++ b/benchmarks/600.workflows/6100.1000-genome/python/sifting.py @@ -0,0 +1,74 @@ +import os +import re +from . 
import storage +import subprocess +import datetime + +def readfile(file): + with open(file, 'r') as f: + content = f.readlines() + return content + +def handler(event): + benchmark_bucket = event["benchmark_bucket"] + input_bucket = event["columns_bucket"] + input_filename = event["sifting_input"] + inputfile = os.path.join("/tmp", "sifting_file.vcf") + + output_bucket = event["bucket"] + + + client = storage.storage.get_instance() + client.download(benchmark_bucket, input_bucket + '/' + input_filename, inputfile) + + #c is the chromosome number - doesn't matter here. + c = 21 + final_name = 'sifted.SIFT.chr{}.txt'.format(c) + final = os.path.join("/tmp", final_name) + + rawdata = readfile(inputfile) + + + r1 = re.compile('.*(#).*') + header = len(list(filter(r1.match, rawdata[:1000]))) + + siftfile = 'SIFT.chr{}.vcf'.format(c) + siftfile = os.path.join("/tmp", siftfile) + with open(siftfile, 'w') as f: + subprocess.run(["grep -n \"deleterious\|tolerated\" {}".format(inputfile)], shell=True, stdout=f) + + data_temp = readfile(siftfile) + + r3 = re.compile('.*(rs).*') + data = list(filter(r3.match, data_temp)) + + + with open(final, 'w') as f: + for l in data: + line = str(int(l.split('\t')[0].split(':')[0]) - int(header)) + id = l.split('\t')[2] + + sifts = l.split('\t')[7].split('|') + sifts = sifts[4] + ' ' + sifts[16] + ' ' + sifts[17] + sifts = sifts.replace('(', ' ').replace(')', '') + + temp = (line + ' ' + id + ' ' + sifts).split(' ') + + if temp[3] == '' or temp[4] == '': + f.write("{} {} {}\n".format(temp[0], temp[1], temp[2])) + elif temp[5] == '': + f.write("{} {} {} {}\n".format(temp[0], temp[1], temp[2], temp[4])) + else: + f.write("{} {} {} {} {}\n".format(temp[0], temp[1], temp[2], temp[4], temp[6])) + + os.remove(siftfile) + final_name = client.upload(benchmark_bucket, output_bucket + '/' + final_name, final) + final_name = final_name.replace(output_bucket + '/', '') + + return { + "output_bucket": output_bucket, + "benchmark_bucket": 
benchmark_bucket, + "output_sifting": final_name, + "populations": event["populations"], + "input_bucket": input_bucket + } diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/config.json b/benchmarks/600.workflows/6101.1000-genome-individuals/config.json new file mode 100644 index 000000000..e14b3b052 --- /dev/null +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 540, + "memory": 2048, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json b/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json new file mode 100644 index 000000000..1f5852d22 --- /dev/null +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json @@ -0,0 +1,17 @@ +{ + "root": "individuals_state", + "states": { + "individuals_state": { + "type": "map", + "root": "individuals", + "array": "blob", + "common_params": "bucket,columns,columns_bucket,populations,sifting_input,individuals_file", + "states": { + "individuals": { + "type": "task", + "func_name": "individuals" + } + } + } + } +} diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/input.py b/benchmarks/600.workflows/6101.1000-genome-individuals/input.py new file mode 100644 index 000000000..c30c5bdcc --- /dev/null +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/input.py @@ -0,0 +1,65 @@ +import os +import re +import uuid +import io + +size_generators = { + "test" : (1), + "small": (5), + "small-10": (10), + "small-20": (20), + "large": (10), +} + +def buckets_count(): + return (1, 1) + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + files = ["ALL.chr21.1250.vcf", "ALL.chr21.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.annotation.vcf", "columns.txt", "AFR", "ALL", "AMR", "EAS", "EUR", "GBR", "SAS"] + for name in files: + if name == "ALL.chr21.1250.vcf" or name 
== "columns.txt": + #if name != "ALL.chr21.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.annotation.vcf": + path = os.path.join(data_dir, name) + upload_func(0, name, path) + + num_individuals_jobs = size_generators[size] + + blobs = [] + start_bytes = 0 + with open(os.path.join(data_dir, files[0]), "r") as f: + content = f.readlines() + range_per_job = 1250 / num_individuals_jobs + for i in range(0, num_individuals_jobs): + #actually split file; return it afterwards. see e.g. split.py in 660.map-reduce. + #regex = re.compile('(?!#)') + start = i * range_per_job + end = i * range_per_job + range_per_job + #print("start: ", start, "end: ", end, "range_per_job: ", range_per_job, "num_individuals_jobs: ", num_individuals_jobs) + #data = list(filter(regex.match, content[int(start):int(end)])) + data = content[int(start):int(end)] + #name with start and end lines is not needed as all individuals jobs can just read their entire file. + name = str(uuid.uuid4())[:8] + + upload_data = io.BytesIO() + upload_data.writelines((val).encode("utf-8") for val in data) + upload_data.seek(0) + nbytes = upload_data.getbuffer().nbytes + + output = { + "start_bytes": start_bytes, + "end_bytes": start_bytes + nbytes - 1 + } + + blobs.append(output) + start_bytes += nbytes + + return { + "bucket": output_buckets[0], + "blob": blobs, + "individuals_file": files[0], + "benchmark_bucket": benchmarks_bucket, + "columns_bucket": input_buckets[0], + "columns": files[2], + "populations": files[3:9], + "sifting_input": files[1], + } diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/python/individuals.py b/benchmarks/600.workflows/6101.1000-genome-individuals/python/individuals.py new file mode 100644 index 000000000..f02c3b789 --- /dev/null +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/python/individuals.py @@ -0,0 +1,114 @@ +import os +import uuid +import tarfile +import shutil +import re +from . 
import storage + +client = storage.storage.get_instance() + +def compress(output, input_dir): + with tarfile.open(output, "w:gz") as file: + file.add(input_dir, arcname=os.path.basename(input_dir)) + + +def readfile(file): + with open(file, 'r') as f: + content = f.readlines() + return content + +def handler(event): + benchmark_bucket = event["benchmark_bucket"] + individuals_bucket = event["bucket"] + individuals_input = event["individuals_file"] + + start_bytes = event["array_element"]["start_bytes"] + end_bytes = event["array_element"]["end_bytes"] + + columns = event["columns"] + columns_bucket = event["columns_bucket"] + columns_path = os.path.join("/tmp", "columns.txt") + + client = storage.storage.get_instance() + client.download(benchmark_bucket, columns_bucket + '/' + columns, columns_path) + data = client.download_within_range(benchmark_bucket, columns_bucket + '/' + individuals_input, start_bytes, end_bytes) + + ndir = 'chr{}n-{}/'.format(21, individuals_input) + ndir = os.path.join("/tmp", ndir) + os.makedirs(ndir, exist_ok=True) + + + regex = re.compile('(?!#)') + #print("data: ", data) + data = data.split("\n") + data = list(filter(lambda line: regex.match(line) and line != "", data)) + + chrp_data = {} + columndata = readfile(columns_path)[0].rstrip('\n').split('\t') + + start_data = 9 # where the real data start, the first 0|1, 1|1, 1|0 or 0|0 + # position of the last element (normally equals to len(data[0].split(' ')) + #end_data = 2504 + end_data = len(columndata) - start_data + + for i in range(0, end_data): + col = i + start_data + name = columndata[col] + + filename = "{}/chr{}.{}".format(ndir, "21", name) + chrp_data[i] = [] + + with open(filename, 'w') as f: + zeilennummer = 0 + for line in data: + zeilennummer += 1 + try: + first = line.split('\t')[col] # first =`echo $l | cut -d -f$i` + except Exception as e: + print("faulty line at col = ", col, "zeilennummer:", zeilennummer, " line : ", line) + raise e + #second =`echo $l | cut -d -f 2, 
3, 4, 5, 8 --output-delimiter = ' '` + second = line.split('\t')[0:8] + # We select the one we want + second = [elem for id, elem in enumerate(second) if id in [1, 2, 3, 4, 7]] + af_value = second[4].split(';')[8].split('=')[1] + # We replace with AF_Value + second[4] = af_value + try: + if ',' in af_value: + # We only keep the first value if more than one (that's what awk is doing) + af_value = float(af_value.split(',')[0]) + else: + af_value = float(af_value) + + elem = first.split('|') + # We skip some lines that do not meet these conditions + if af_value >= 0.5 and elem[0] == '0': + chrp_data[i].append(second) + elif af_value < 0.5 and elem[0] == '1': + chrp_data[i].append(second) + else: + continue + + f.write("{0} {1} {2} {3} {4}\n".format( + second[0], second[1], second[2], second[3], second[4]) + ) + except ValueError: + continue + + outputfile = "chr{}n-{}.tar.gz".format(21, individuals_input) + + # tar -zcf .. /$outputfile . + compress(os.path.join("/tmp/", outputfile), ndir) + outputfile_name = client.upload(benchmark_bucket, individuals_bucket + '/' + outputfile, os.path.join("/tmp/", outputfile)) + outputfile_name = outputfile_name.replace(individuals_bucket + '/', '') + + # Cleaning temporary files + try: + shutil.rmtree(ndir) + except OSError as e: + print("Error: %s : %s" % (ndir, e.strerror)) + + return { + "individuals_output": outputfile_name + } diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt b/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt new file mode 100644 index 000000000..5453e2d48 --- /dev/null +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt @@ -0,0 +1,3 @@ +#numpy==1.17 +numpy==1.18 #1.16 works on Azure, but not AWS +matplotlib diff --git a/benchmarks/600.workflows/620.func-invo/config.json b/benchmarks/600.workflows/620.func-invo/config.json new file mode 100644 index 000000000..d6d184e8a --- /dev/null +++ 
b/benchmarks/600.workflows/620.func-invo/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 256, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.workflows/620.func-invo/definition.json b/benchmarks/600.workflows/620.func-invo/definition.json new file mode 100644 index 000000000..c64b88349 --- /dev/null +++ b/benchmarks/600.workflows/620.func-invo/definition.json @@ -0,0 +1,15 @@ +{ + "root": "gen", + "states": { + "gen": { + "type": "task", + "func_name": "gen", + "next": "process" + }, + "process": { + "type": "repeat", + "func_name": "process", + "count": 8 + } + } +} diff --git a/benchmarks/600.workflows/620.func-invo/input.py b/benchmarks/600.workflows/620.func-invo/input.py new file mode 100644 index 000000000..afefd5d9a --- /dev/null +++ b/benchmarks/600.workflows/620.func-invo/input.py @@ -0,0 +1,16 @@ +size_generators = { + 'test' : 10, + 'small' : 2**5, + 'large': 2**20, + '2e5': 2**5, + '2e8': 2**8, + '2e10': 2**10, + '2e12': 2**12, + '2e14': 2**14, + '2e16': 2**16, + '2e18': 2**18, + '2e18-1000': (2**18)-1000 +} + +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): + return { 'size': size_generators[size] } diff --git a/benchmarks/600.workflows/620.func-invo/python/gen.py b/benchmarks/600.workflows/620.func-invo/python/gen.py new file mode 100644 index 000000000..60c328fee --- /dev/null +++ b/benchmarks/600.workflows/620.func-invo/python/gen.py @@ -0,0 +1,14 @@ +from random import shuffle + +def handler(event): + size = int(event["size"]) + elems = list(range(size)) + shuffle(elems) + + data = "" + for i in elems: + data += str(i % 255) + if len(data) > size: + break + + return {'len' : data[:size]} diff --git a/benchmarks/600.workflows/620.func-invo/python/process.py b/benchmarks/600.workflows/620.func-invo/python/process.py new file mode 100644 index 000000000..084062854 --- /dev/null +++ b/benchmarks/600.workflows/620.func-invo/python/process.py @@ -0,0 +1,14 
@@ +from random import shuffle + +def handler(event): + size = len(event['len']) + elems = list(range(size)) + shuffle(elems) + + data = "" + for i in elems: + data += str(i % 255) + if len(data) > size: + break + + return {'len' : data[:size]} diff --git a/benchmarks/600.workflows/6200.trip-booking/config.json b/benchmarks/600.workflows/6200.trip-booking/config.json new file mode 100644 index 000000000..3dd2268cd --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 256, + "languages": ["python"], + "modules": ["nosql"] +} diff --git a/benchmarks/600.workflows/6200.trip-booking/definition.json b/benchmarks/600.workflows/6200.trip-booking/definition.json new file mode 100644 index 000000000..1d9885750 --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/definition.json @@ -0,0 +1,41 @@ +{ + "root": "hotel", + "states": { + "hotel": { + "type": "task", + "func_name": "reserve_hotel", + "next": "rental" + }, + "rental": { + "type": "task", + "func_name": "reserve_rental", + "next": "flight", + "failure": "cancel_hotel" + }, + "flight": { + "type": "task", + "func_name": "reserve_flight", + "next": "confirm", + "failure": "cancel_rental" + }, + "confirm": { + "type": "task", + "func_name": "confirm", + "failure": "cancel_flight" + }, + "cancel_flight": { + "type": "task", + "func_name": "cancel_flight", + "next": "cancel_rental" + }, + "cancel_rental": { + "type": "task", + "func_name": "cancel_rental", + "next": "cancel_hotel" + }, + "cancel_hotel": { + "type": "task", + "func_name": "cancel_hotel" + } + } +} diff --git a/benchmarks/600.workflows/6200.trip-booking/input.py b/benchmarks/600.workflows/6200.trip-booking/input.py new file mode 100644 index 000000000..4c261f755 --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/input.py @@ -0,0 +1,50 @@ + +def allocate_nosql() -> dict: + + return { + "flights": { + "primary_key": "trip_id", + "secondary_key": "flight_id" + }, + 
"car_rentals": { + "primary_key": "trip_id", + "secondary_key": "rental_id" + }, + "hotel_booking": { + "primary_key": "trip_id", + "secondary_key": "booking_id" + } + } + +def generate_input( + data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func +): + + input_config = {} + + # test - invoke a single trip, succeed + # small - fail in the middle + # large - fail at the last step + + trip_details = { + "flight_depart": "ZRH", + "flight_arrive": "KTW", + "flight_date": "2020-08-22T13:00:00", + "hotel_stars": "3", + "hotel_nights": "3", + "hotel_distance": "1500", + "hotel_price_max": "150", + "rental_class": "compact", + "rental_price_max": "100", + "rental_duration": 3, + "rental_requests": ["full_tank", "CDW", "assistance"] + } + + size_results = { + "test": {"result": "success"}, + "small": {"result": "failure", "reason": "hotel"}, + "large": {"result": "failure", "reason": "confirm"} + } + trip_details["expected_result"] = size_results[size] + + return trip_details diff --git a/benchmarks/600.workflows/6200.trip-booking/python/cancel_flight.py b/benchmarks/600.workflows/6200.trip-booking/python/cancel_flight.py new file mode 100644 index 000000000..f94da3dfe --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/cancel_flight.py @@ -0,0 +1,16 @@ +from . import nosql + +nosql_client = nosql.nosql.get_instance() + + +def handler(event): + + trip_id = event["trip_id"] + + # Confirm flight + nosql_table_name = "flights" + flight_id = event["flight_id"] + nosql_client.delete(nosql_table_name, ("trip_id", trip_id), ("flight_id", flight_id)) + + event.pop("flight_id") + return event diff --git a/benchmarks/600.workflows/6200.trip-booking/python/cancel_hotel.py b/benchmarks/600.workflows/6200.trip-booking/python/cancel_hotel.py new file mode 100644 index 000000000..e1f69077d --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/cancel_hotel.py @@ -0,0 +1,15 @@ +from . 
import nosql + +nosql_client = nosql.nosql.get_instance() + + +def handler(event): + + trip_id = event["trip_id"] + + # Cancel hotel booking + nosql_table_name = "hotel_booking" + booking_id = event["booking_id"] + nosql_client.delete(nosql_table_name, ("trip_id", trip_id), ("booking_id", booking_id)) + + return {"trip_id": trip_id, "status": "failure"} diff --git a/benchmarks/600.workflows/6200.trip-booking/python/cancel_rental.py b/benchmarks/600.workflows/6200.trip-booking/python/cancel_rental.py new file mode 100644 index 000000000..f8ff38044 --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/cancel_rental.py @@ -0,0 +1,16 @@ +from . import nosql + +nosql_client = nosql.nosql.get_instance() + + +def handler(event): + + trip_id = event["trip_id"] + + # Cancel car rental + nosql_table_name = "car_rentals" + rental_id = event["rental_id"] + nosql_client.delete(nosql_table_name, ("trip_id", trip_id), ("rental_id", rental_id)) + + event.pop("rental_id") + return event diff --git a/benchmarks/600.workflows/6200.trip-booking/python/confirm.py b/benchmarks/600.workflows/6200.trip-booking/python/confirm.py new file mode 100644 index 000000000..3a555f6a4 --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/confirm.py @@ -0,0 +1,42 @@ +from . 
import nosql + +nosql_client = nosql.nosql.get_instance() + + +def handler(event): + + expected_result = event["expected_result"] + if expected_result["result"] == "failure" and expected_result["reason"] == "confirm": + raise RuntimeError("Failed to confirm the booking!") + + trip_id = event["trip_id"] + + # Confirm flight + nosql_table_name = "flights" + flight_id = event["flight_id"] + nosql_client.update( + nosql_table_name, + ("trip_id", trip_id), + ("flight_id", flight_id), + {"status": "booked"}, + ) + + # Confirm car rental + nosql_table_name = "car_rentals" + nosql_client.update( + nosql_table_name, + ("trip_id", trip_id), + ("rental_id", event["rental_id"]), + {"status": "booked"}, + ) + + # Confirm hotel booking + nosql_table_name = "hotel_booking" + nosql_client.update( + nosql_table_name, + ("trip_id", trip_id), + ("booking_id", event["booking_id"]), + {"status": "booked"}, + ) + + return {"trip_id": trip_id, "status": "success"} diff --git a/benchmarks/600.workflows/6200.trip-booking/python/reserve_flight.py b/benchmarks/600.workflows/6200.trip-booking/python/reserve_flight.py new file mode 100644 index 000000000..fe55ed0c1 --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/reserve_flight.py @@ -0,0 +1,40 @@ +from . 
import nosql + +nosql_client = nosql.nosql.get_instance() +nosql_table_name = "flights" + + +def handler(event): + + expected_result = event["expected_result"] + if expected_result["result"] == "failure" and expected_result["reason"] == "flight": + raise RuntimeError("Failed to book a flight!") + + # Reuse the trip ID created by the hotel reservation step + trip_id = event["trip_id"] + flight_id = event["request-id"] + + # Simulate return from a service + flight_price = "1000" + flight_connections = ["WAW"] + flight_duration = "4h30m" + + nosql_client.insert( + nosql_table_name, + ("trip_id", trip_id), + ("flight_id", flight_id), + { + **{key: event[key] for key in event.keys() if key.startswith("flight_")}, + "price": flight_price, + "connections": flight_connections, + "duration": flight_duration, + "status": "pending", + }, + ) + + return { + "trip_id": trip_id, + "flight_id": flight_id, + **{key: event[key] for key in ["booking_id", "rental_id"]}, + "expected_result": expected_result, + } diff --git a/benchmarks/600.workflows/6200.trip-booking/python/reserve_hotel.py b/benchmarks/600.workflows/6200.trip-booking/python/reserve_hotel.py new file mode 100644 index 000000000..623d1a8b0 --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/reserve_hotel.py @@ -0,0 +1,35 @@ +import uuid + +from . 
import nosql + +nosql_client = nosql.nosql.get_instance() +nosql_table_name = "hotel_booking" + + +def handler(event): + + expected_result = event["expected_result"] + if expected_result["result"] == "failure" and expected_result["reason"] == "hotel": + raise RuntimeError("Failed to book the hotel!") + + # We start with the hotel + trip_id = str(uuid.uuid4().hex) + hotel_booking_id = event["request-id"] + + # Simulate return from a service + hotel_price = "130" + hotel_name = "BestEver Hotel" + + nosql_client.insert( + nosql_table_name, + ("trip_id", trip_id), + ("booking_id", hotel_booking_id), + { + **{key: event[key] for key in event.keys() if key.startswith("hotel_")}, + "hotel_price": hotel_price, + "hotel_name": hotel_name, + "status": "pending", + }, + ) + + return {"trip_id": trip_id, "booking_id": hotel_booking_id, **event} diff --git a/benchmarks/600.workflows/6200.trip-booking/python/reserve_rental.py b/benchmarks/600.workflows/6200.trip-booking/python/reserve_rental.py new file mode 100644 index 000000000..8cf0b11fc --- /dev/null +++ b/benchmarks/600.workflows/6200.trip-booking/python/reserve_rental.py @@ -0,0 +1,33 @@ +from . 
import nosql + +nosql_client = nosql.nosql.get_instance() +nosql_table_name = "car_rentals" + + +def handler(event): + + expected_result = event["expected_result"] + if expected_result["result"] == "failure" and expected_result["reason"] == "rental": + raise RuntimeError("Failed to rent a car!") + + # Reuse the trip ID created by the hotel reservation step + trip_id = event["trip_id"] + rental_id = event["request-id"] + + # Simulate return from a service + car_price = "125" + car_name = "Fiat 126P" + + nosql_client.insert( + nosql_table_name, + ("trip_id", trip_id), + ("rental_id", rental_id), + { + **{key: event[key] for key in event.keys() if key.startswith("rental_")}, + "rental_price": car_price, + "rental_name": car_name, + "status": "pending", + }, + ) + + return {"trip_id": trip_id, "rental_id": rental_id, **event} diff --git a/benchmarks/600.workflows/630.parallel-sleep/config.json b/benchmarks/600.workflows/630.parallel-sleep/config.json new file mode 100644 index 000000000..d6d184e8a --- /dev/null +++ b/benchmarks/600.workflows/630.parallel-sleep/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 256, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.workflows/630.parallel-sleep/definition.json b/benchmarks/600.workflows/630.parallel-sleep/definition.json new file mode 100644 index 000000000..61c56083c --- /dev/null +++ b/benchmarks/600.workflows/630.parallel-sleep/definition.json @@ -0,0 +1,21 @@ +{ + "root": "generate", + "states": { + "generate": { + "type": "task", + "func_name": "generate", + "next": "process-state" + }, + "process-state": { + "type": "map", + "root": "process", + "array": "buffer", + "states": { + "process": { + "type": "task", + "func_name": "process" + } + } + } + } +} diff --git a/benchmarks/600.workflows/630.parallel-sleep/input.py b/benchmarks/600.workflows/630.parallel-sleep/input.py new file mode 100644 index 000000000..092981d7a --- /dev/null +++ b/benchmarks/600.workflows/630.parallel-sleep/input.py @@ -0,0 +1,34 @@ + 
#threads-duration +size_generators = { + 'test' : (2, 2), + 'small': (16, 20), + 'large': (50, 2), + '2-1': (2, 1), + '4-1': (4, 1), + '8-1': (8, 1), + '16-1': (16, 1), + '2-5': (2, 5), + '4-5': (4, 5), + '8-5': (8, 5), + '16-5': (16, 5), + '2-10': (2, 10), + '4-10': (4, 10), + '8-10': (8, 10), + '16-10': (16, 10), + '2-15': (2, 15), + '4-15': (4, 15), + '8-15': (8, 15), + '16-15': (16, 15), + '2-20': (2, 20), + '4-20': (4, 20), + '8-20': (8, 20), + '16-20': (16, 20), + '50-1': (50, 1) +} + +def buckets_count(): + return (0, 0) + +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): + count, sleep = size_generators[size] + return { 'count': count, 'sleep': sleep } diff --git a/benchmarks/600.workflows/630.parallel-sleep/python/generate.py b/benchmarks/600.workflows/630.parallel-sleep/python/generate.py new file mode 100644 index 000000000..c291c6754 --- /dev/null +++ b/benchmarks/600.workflows/630.parallel-sleep/python/generate.py @@ -0,0 +1,12 @@ +def handler(event): + count = int(event["count"]) + sleep = int(event["sleep"]) + + sleep_list = [] + for i in range(0, count): + sleep_list.append({'sleep':sleep}) + + + return { + "buffer": sleep_list + } diff --git a/benchmarks/600.workflows/630.parallel-sleep/python/process.py b/benchmarks/600.workflows/630.parallel-sleep/python/process.py new file mode 100644 index 000000000..9e2f1ab05 --- /dev/null +++ b/benchmarks/600.workflows/630.parallel-sleep/python/process.py @@ -0,0 +1,7 @@ +import time + +def handler(event): + time.sleep(event['sleep']) + + + return "ok" diff --git a/benchmarks/600.workflows/631.parallel-download/config.json b/benchmarks/600.workflows/631.parallel-download/config.json new file mode 100644 index 000000000..4314d183a --- /dev/null +++ b/benchmarks/600.workflows/631.parallel-download/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": ["storage"] +} diff --git 
a/benchmarks/600.workflows/631.parallel-download/definition.json b/benchmarks/600.workflows/631.parallel-download/definition.json new file mode 100644 index 000000000..c12e59553 --- /dev/null +++ b/benchmarks/600.workflows/631.parallel-download/definition.json @@ -0,0 +1,21 @@ +{ + "root": "generate", + "states": { + "generate": { + "type": "task", + "func_name": "generate", + "next": "process1" + }, + "process1": { + "type": "map", + "root": "process", + "array": "buffer", + "states": { + "process": { + "type": "task", + "func_name": "process" + } + } + } + } +} diff --git a/benchmarks/600.workflows/631.parallel-download/input.py b/benchmarks/600.workflows/631.parallel-download/input.py new file mode 100644 index 000000000..fd9d6d7b5 --- /dev/null +++ b/benchmarks/600.workflows/631.parallel-download/input.py @@ -0,0 +1,48 @@ +import os +from random import shuffle + +size_generators = { + 'test' : (5, 10), + 'small': (20, 2**10), + 'large': (50, 2**10), + '2e10': (20, 2**10), + '2e28': (20, 2**28), + '2e15': (20, 2**15), + '2e20': (20, 2**20), + '2e25': (20, 2**25), + '2e26': (20, 2**26), + '2e27': (20, 2**27) +} + + +def buckets_count(): + return (1, 0) + + +def generate(size): + elems = list(range(size)) + shuffle(elems) + + length = 0 + for i in elems: + data = str(i % 255) + length += len(data) + if length > size: + break + yield data + + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + count, size_bytes = size_generators[size] + + data_name = f"data-{size_bytes}.txt" + data_path = os.path.join(data_dir, data_name) + + if not os.path.exists(data_path): + with open(data_path, "w") as f: + f.writelines(k for k in generate(size_bytes)) + + upload_func(0, data_name, data_path) + # os.remove(data_path) + + return { 'count': count, "bucket": benchmarks_bucket, "blob": input_buckets[0] + '/' + data_name} diff --git a/benchmarks/600.workflows/631.parallel-download/python/generate.py 
b/benchmarks/600.workflows/631.parallel-download/python/generate.py new file mode 100644 index 000000000..fa20cd018 --- /dev/null +++ b/benchmarks/600.workflows/631.parallel-download/python/generate.py @@ -0,0 +1,8 @@ +def handler(event): + count = int(event["count"]) + del event["count"] + + + return { + "buffer": count * [event] + } diff --git a/benchmarks/600.workflows/631.parallel-download/python/process.py b/benchmarks/600.workflows/631.parallel-download/python/process.py new file mode 100644 index 000000000..e4d56fe20 --- /dev/null +++ b/benchmarks/600.workflows/631.parallel-download/python/process.py @@ -0,0 +1,11 @@ +from . import storage + +def handler(event): + bucket = event["bucket"] + blob = event["blob"] + + client = storage.storage.get_instance() + buffer = client.download_stream(bucket, blob) + + + return "ok" diff --git a/benchmarks/600.workflows/640.selfish-detour/config.json b/benchmarks/600.workflows/640.selfish-detour/config.json new file mode 100644 index 000000000..8ff6eec59 --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 128, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.workflows/640.selfish-detour/definition.json b/benchmarks/600.workflows/640.selfish-detour/definition.json new file mode 100644 index 000000000..c05a0ee6f --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/definition.json @@ -0,0 +1,9 @@ +{ + "root": "measure", + "states": { + "measure": { + "type": "task", + "func_name": "measure" + } + } +} \ No newline at end of file diff --git a/benchmarks/600.workflows/640.selfish-detour/input.py b/benchmarks/600.workflows/640.selfish-detour/input.py new file mode 100644 index 000000000..69d06fcd5 --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/input.py @@ -0,0 +1,12 @@ +size_generators = { + 'test' : 100, + 'small': 5000, + 'large': 10000, +} + +def buckets_count(): + return (0, 0) + +def 
generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + num_samples = size_generators[size] + return { 'num_samples': num_samples } diff --git a/benchmarks/600.workflows/640.selfish-detour/package.sh b/benchmarks/600.workflows/640.selfish-detour/package.sh new file mode 100644 index 000000000..c1145e436 --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/package.sh @@ -0,0 +1,11 @@ +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +CUR_DIR=$(pwd) +cd ${SCRIPT_DIR} + +for C_FILE in $(ls *.c) +do + cc -fPIC -shared -o ${C_FILE%%.*}.so ${C_FILE} + rm ${C_FILE} +done + +cd ${CUR_DIR} diff --git a/benchmarks/600.workflows/640.selfish-detour/python/measure.py b/benchmarks/600.workflows/640.selfish-detour/python/measure.py new file mode 100644 index 000000000..7a0900c8f --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/python/measure.py @@ -0,0 +1,37 @@ +import os +import json +from ctypes import * + +def handler(event): + num_samples = event["num_samples"] + + so_file = "selfish-detour.so" + dir = os.path.dirname(os.path.abspath(__file__)) + path = os.path.join(dir, so_file) + if not os.path.exists(path): + path = os.path.join(dir, os.pardir, so_file) + + lib = cdll.LoadLibrary(path) + lib.get_ticks_per_second.restype = c_double + lib.selfish_detour.argtypes = [c_int, c_int, POINTER(c_ulonglong)] + + tps = lib.get_ticks_per_second() + assert(tps > 0) + + res = (c_ulonglong*num_samples)() + ptr = cast(res, POINTER(c_ulonglong)) + lib.selfish_detour(num_samples, 900, ptr) + + res = list(res) + assert(all(x<=y for x, y in zip(res[2:], res[3:]))) + + payload = json.dumps({ + "min_diff": res[0], + "num_iterations": res[1], + "timestamps": res[2:], + "tps": tps + }) + os.environ["SEBS_FUNCTION_RESULT"] = payload + + return "ok" + diff --git a/benchmarks/600.workflows/640.selfish-detour/selfish-detour.c b/benchmarks/600.workflows/640.selfish-detour/selfish-detour.c new 
file mode 100644 index 000000000..d4559d417 --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/selfish-detour.c @@ -0,0 +1,138 @@ +// THIS IS THE SELFISH DETOUR EXAMPLE FROM NETGAUGE https://spcl.inf.ethz.ch/Research/Performance/Netgauge/OS_Noise/ + +#include +#include +#include + +#define UINT64_T uint64_t +#define UINT32_T uint32_t + +typedef struct { + UINT32_T l; + UINT32_T h; +} x86_64_timeval_t; + +#define HRT_TIMESTAMP_T x86_64_timeval_t + +/* TODO: Do we need a while loop here? aka Is rdtsc atomic? - check in the documentation */ +#define HRT_GET_TIMESTAMP(t1) __asm__ __volatile__ ("rdtsc" : "=a" (t1.l), "=d" (t1.h)); + +#define HRT_GET_ELAPSED_TICKS(t1, t2, numptr) *numptr = (((( UINT64_T ) t2.h) << 32) | t2.l) - \ + (((( UINT64_T ) t1.h) << 32) | t1.l); + +#define HRT_GET_TIME(t1, time) time = (((( UINT64_T ) t1.h) << 32) | t1.l) + +double get_ticks_per_second() { + #define NUM_TESTS 10 + + HRT_TIMESTAMP_T t1, t2; + uint64_t res[NUM_TESTS]; + uint64_t min=0; + int count; + + for (count=0; count res[count]) min = res[count]; + } + + return ((double) min); +} + +void selfish_detour(int num_runs, int threshold, uint64_t *results) { + int cnt=0, num_not_smaller = 0; + HRT_TIMESTAMP_T current, prev, start; + uint64_t sample = 0; + uint64_t elapsed, thr, min=(uint64_t)~0; + int i; + + // we will do a "calibration run" of the detour benchmark to + // get a reasonable value for the minimal detour time + // just perform the benchmark and record the minimal detour time until + // this minimal detour time does not get smaller for NOT_SMALLER (currently 100) + // consecutive runs + + #define NOT_SMALLER 100 + #define INNER_TRIES 50 + + thr = min*(threshold/100.0); + while (num_not_smaller < NOT_SMALLER) { + cnt = 0; + + HRT_GET_TIMESTAMP(start); + HRT_GET_TIMESTAMP(current); + + // this is exactly the same loop as below for measurement + while (cnt < INNER_TRIES) { + prev = current; + HRT_GET_TIMESTAMP(current); + + sample++; + + 
HRT_GET_ELAPSED_TICKS(prev, current, &elapsed); + // != instead of < in the benchmark loop in order to make the + // notsmaller principle useful + if ( elapsed != thr ) { + HRT_GET_ELAPSED_TICKS(start, prev, &results[cnt++]); + HRT_GET_ELAPSED_TICKS(start, current, &results[cnt++]); + } + } + + // find minimum in results array - this is outside the + // calibration/measurement loop! + { + if(min == 0) { + printf("The initialization reached 0 clock cycles - the clock accuracy seems too low (setting min=1 and exiting calibration)\n"); + min = 1; + break; + } + int smaller=0; + for(i = 0; i < INNER_TRIES; i+=2) { + if(results[i+1]-results[i] < min) { + min = results[i+1]-results[i]; + smaller=1; + //printf("[%i] min: %lu\n", r, min); + } + } + if (!smaller) num_not_smaller++; + else num_not_smaller = 0; + } + } + + // now we perform the actual benchmark: Read a time-stamp-counter in a tight + // loop ignore the results if the timestamps are close to each other, as we can assume + // that nobody interrupted us. 
If the difference of the timestamps exceeds a certain + // threshold, we assume that we have been "hit" by a "noise event" and record the + // time difference for later analysis + + cnt = 2; + sample = 0; + + HRT_GET_TIMESTAMP(start); + HRT_GET_TIMESTAMP(current); + + // perform this outside measurement loop in order to save + // time/increase measurement frequency + thr = min*(threshold/100.0); + while (cnt < num_runs) { + prev = current; + HRT_GET_TIMESTAMP(current); + + sample++; + + HRT_GET_ELAPSED_TICKS(prev, current, &elapsed); + if ( elapsed > thr ) { + HRT_GET_ELAPSED_TICKS(start, prev, &results[cnt++]); + HRT_GET_ELAPSED_TICKS(start, current, &results[cnt++]); + } + } + + results[0] = min; + results[1] = sample; +} \ No newline at end of file diff --git a/benchmarks/600.workflows/650.vid/LICENSE b/benchmarks/600.workflows/650.vid/LICENSE new file mode 100644 index 000000000..eb52badbf --- /dev/null +++ b/benchmarks/600.workflows/650.vid/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 EASE Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/benchmarks/600.workflows/650.vid/README.md b/benchmarks/600.workflows/650.vid/README.md new file mode 100644 index 000000000..88e93c258 --- /dev/null +++ b/benchmarks/600.workflows/650.vid/README.md @@ -0,0 +1,3 @@ +The implementation of the Video benchmark is based on the implementation here: https://github.com/vhive-serverless/vSwarm/tree/main/benchmarks/video-analytics, with the license provided. + +We use the [ExCamera static binaries](https://github.com/excamera/excamera-static-bins). diff --git a/benchmarks/600.workflows/650.vid/config.json b/benchmarks/600.workflows/650.vid/config.json new file mode 100644 index 000000000..7c750b980 --- /dev/null +++ b/benchmarks/600.workflows/650.vid/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 300, + "memory": 2048, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/600.workflows/650.vid/definition.json b/benchmarks/600.workflows/650.vid/definition.json new file mode 100644 index 000000000..94b66ce6b --- /dev/null +++ b/benchmarks/600.workflows/650.vid/definition.json @@ -0,0 +1,26 @@ +{ + "root": "decode", + "states": { + "decode": { + "type": "task", + "func_name": "decode", + "next": "analyse-map" + }, + "analyse-map": { + "type": "map", + "root": "analyse", + "array": "frames", + "next": "summarize", + "states": { + "analyse": { + "type": "task", + "func_name": "analyse" + } + } + }, + "summarize": { + "type": "task", + "func_name": "summarize" + } + } +} diff --git a/benchmarks/600.workflows/650.vid/input.py b/benchmarks/600.workflows/650.vid/input.py new file mode 100644 index 000000000..c1515f901 --- /dev/null +++ b/benchmarks/600.workflows/650.vid/input.py @@ -0,0 +1,30 @@ +import os + +size_generators = { + "test" 
: (3, 10, "video_test.mp4"), + "small": (10, 5, "video_small.mp4"), + "large": (1000, 3, "video_large.mp4"), +} + + +def buckets_count(): + return (1, 1) + + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + n_frames, batch_size, video_name = size_generators[size] + files = ["frozen_inference_graph.pb", "faster_rcnn_resnet50_coco_2018_01_28.pbtxt", video_name] + for name in files: + path = os.path.join(data_dir, name) + upload_func(0, name, path) + + return { + "video": video_name, + "n_frames": n_frames, + "batch_size": batch_size, + "frames_bucket": output_buckets[0], + "benchmark_bucket": benchmarks_bucket, + "input_bucket": input_buckets[0], + "model_weights": files[0], + "model_config": files[1] + } diff --git a/benchmarks/600.workflows/650.vid/python/analyse.py b/benchmarks/600.workflows/650.vid/python/analyse.py new file mode 100644 index 000000000..1b8f31664 --- /dev/null +++ b/benchmarks/600.workflows/650.vid/python/analyse.py @@ -0,0 +1,80 @@ +import os +import io +import json +import sys +from . 
import storage + +import cv2 + +client = storage.storage.get_instance() + +labels = ["person", "bicycle", "car", "motorcycle", +"airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", +"stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", +"sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", +"umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", +"snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", +"surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", +"spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", +"pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", +"toilet", "tv", "laptop", "mouse", "remote", "keyboard", +"cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", +"book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" ] + + +def load_model(bucket, weights_blob, config_blob, dest_dir): + weights_path = os.path.join(dest_dir, "model.weights") + client.download(bucket, weights_blob, weights_path) + + config_path = os.path.join(dest_dir, "model.config") + client.download(bucket, config_blob, config_path) + + net = cv2.dnn.readNetFromTensorflow(weights_path, config_path) + return net + + +def load_frames(benchmark_bucket, bucket, blobs, dest_dir): + for blob in blobs: + stripped_blob = blob.replace(bucket + '/', '') + path = os.path.join(dest_dir, stripped_blob) + client.download(benchmark_bucket, blob, path) + yield cv2.imread(path) + + +def detect(net, img): + rows = img.shape[0] + cols = img.shape[1] + img = cv2.dnn.blobFromImage(img, size=(300, 300), swapRB=True, crop=False) + net.setInput(img) + out = net.forward() + + preds = [] + for detection in out[0,0,:,:]: + score = float(detection[2]) + if score > 0.5: + class_id = int(detection[1]) + preds.append({ + "class": labels[class_id], + "score": score + }) + + return preds + + +def 
handler(event): + tmp_dir = "/tmp" + + benchmark_bucket = event["benchmark_bucket"] + + frames = list(load_frames(benchmark_bucket, event["frames_bucket"], event["frames"], tmp_dir)) + net = load_model(benchmark_bucket, event["model_bucket"] + '/' + event["model_weights"], event["model_bucket"] + '/' + event["model_config"], tmp_dir) + + preds = [detect(net, frame) for frame in frames] + + frames_names = event["frames"] + frames_names = [x.split(".")[0] for x in event["frames"]] + + preds = {f"{frames_names[idx]}": dets for idx, dets in enumerate(preds)} + + return preds + diff --git a/benchmarks/600.workflows/650.vid/python/decode.py b/benchmarks/600.workflows/650.vid/python/decode.py new file mode 100644 index 000000000..d27b67c3c --- /dev/null +++ b/benchmarks/600.workflows/650.vid/python/decode.py @@ -0,0 +1,67 @@ +import os +import uuid +from . import storage + +import cv2 + +client = storage.storage.get_instance() + + +def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + + +def load_video(benchmark_bucket, bucket, blob, dest_dir): + path = os.path.join(dest_dir, blob) + client.download(benchmark_bucket, bucket + '/' + blob, path) + return path + + +def decode_video(path, n_frames, dest_dir): + vidcap = cv2.VideoCapture(path) + success, img = vidcap.read() + img_paths = [] + while success and len(img_paths) < n_frames: + img_path = os.path.join(dest_dir, f"frame{len(img_paths)}.jpg") + img_paths.append(img_path) + cv2.imwrite(img_path, img) + success, img = vidcap.read() + + return img_paths + + +def upload_imgs(benchmark_bucket, bucket, paths): + client = storage.storage.get_instance() + + for path in paths: + name = os.path.basename(path) + yield client.upload(benchmark_bucket, bucket + '/' + name, path) + + +def handler(event): + vid_blob = event["video"] + n_frames = event["n_frames"] + batch_size = event["batch_size"] + frames_bucket = event["frames_bucket"] + input_bucket = event["input_bucket"] + benchmark_bucket = 
event["benchmark_bucket"] + + tmp_dir = os.path.join("/tmp", str(uuid.uuid4())) + os.makedirs(tmp_dir, exist_ok=True) + + vid_path = load_video(benchmark_bucket, input_bucket, vid_blob, tmp_dir) + img_paths = decode_video(vid_path, n_frames, tmp_dir) + paths = list(upload_imgs(benchmark_bucket, frames_bucket, img_paths)) + frames = list(chunks(paths, batch_size)) + + return { + "frames": [{ + "frames_bucket": frames_bucket, + "frames": fs, + "benchmark_bucket": benchmark_bucket, + "model_bucket": input_bucket, + "model_config": event["model_config"], + "model_weights": event["model_weights"] + } for fs in frames] + } diff --git a/benchmarks/600.workflows/650.vid/python/requirements.txt b/benchmarks/600.workflows/650.vid/python/requirements.txt new file mode 100644 index 000000000..6ab6d0d57 --- /dev/null +++ b/benchmarks/600.workflows/650.vid/python/requirements.txt @@ -0,0 +1 @@ +opencv-python-headless diff --git a/benchmarks/600.workflows/650.vid/python/summarize.py b/benchmarks/600.workflows/650.vid/python/summarize.py new file mode 100644 index 000000000..8d290f3f9 --- /dev/null +++ b/benchmarks/600.workflows/650.vid/python/summarize.py @@ -0,0 +1,18 @@ +import os +import io +import uuid +import json +import sys +from . 
import storage + + +def handler(event): + frames = event["frames"] + + logs = {} + for xs in frames: + for key,value in xs.items(): + logs[key] = value + + return logs + diff --git a/benchmarks/600.workflows/660.map-reduce/config.json b/benchmarks/600.workflows/660.map-reduce/config.json new file mode 100644 index 000000000..d06e01d56 --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 256, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/600.workflows/660.map-reduce/definition.json b/benchmarks/600.workflows/660.map-reduce/definition.json new file mode 100644 index 000000000..de02b775c --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/definition.json @@ -0,0 +1,38 @@ +{ + "root": "split", + "states": { + "split": { + "type": "task", + "func_name": "split", + "next": "map-state" + }, + "map-state": { + "type": "map", + "root": "map", + "array": "list", + "next": "shuffle", + "states": { + "map": { + "type": "task", + "func_name": "map" + } + } + }, + "shuffle": { + "type": "task", + "func_name": "shuffle", + "next": "reduce-state" + }, + "reduce-state": { + "type": "map", + "root": "reduce", + "array": "list", + "states": { + "reduce": { + "type": "task", + "func_name": "reduce" + } + } + } + } +} diff --git a/benchmarks/600.workflows/660.map-reduce/input.py b/benchmarks/600.workflows/660.map-reduce/input.py new file mode 100644 index 000000000..36b2bcc8f --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/input.py @@ -0,0 +1,35 @@ +import os +import random + +size_generators = { + "test" : (50, 3), + "small": (1000, 3), + "large": (100000, 3) +} + + +def buckets_count(): + return (1, 1) + + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + mult, n_mappers = size_generators[size] + words = ["cat", "dog", "bird", "horse", "pig"] + lst = mult * words + random.shuffle(lst) + + list_path = 
os.path.join(data_dir, "words") + list_name = "words" + with open(list_path, "w") as f: + f.writelines(w+"\n" for w in lst) + + upload_func(0, list_name, list_path) + #os.remove(list_path) + + return { + "benchmark_bucket": benchmarks_bucket, + "words_bucket": input_buckets[0], + "words": list_name, + "n_mappers": n_mappers, + "output_bucket": output_buckets[0] + } diff --git a/benchmarks/600.workflows/660.map-reduce/python/map.py b/benchmarks/600.workflows/660.map-reduce/python/map.py new file mode 100644 index 000000000..0ba79ae73 --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/python/map.py @@ -0,0 +1,36 @@ +import os +import io +from . import storage + + +def count_words(lst): + index = dict() + for word in lst: + if len(word) == 0: + continue + + val = index.get(word, 0) + index[word] = val + 1 + + return index + +def handler(event): + benchmark_bucket = event["benchmark_bucket"] + bucket = event["bucket"] + blob = event["blob"] + prefix = event["prefix"] + + client = storage.storage.get_instance() + my_buffer = client.download_stream(benchmark_bucket, bucket + '/' + blob) + words = bytes(my_buffer).decode("utf-8").split("\n") + + index = count_words(words) + for word, count in index.items(): + data = io.BytesIO() + data.write(str(count).encode("utf-8")) + data.seek(0) + + #client.upload_stream(benchmark_bucket, os.path.join(bucket, prefix, word, blob), data) + client.upload_stream(benchmark_bucket, os.path.join(prefix, word, blob), data) + + return event diff --git a/benchmarks/600.workflows/660.map-reduce/python/reduce.py b/benchmarks/600.workflows/660.map-reduce/python/reduce.py new file mode 100644 index 000000000..15fe6d707 --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/python/reduce.py @@ -0,0 +1,23 @@ +import os +import io +import json +from . import storage + + +def handler(event): + bucket = event["bucket"] + path = event["dir"] + + client = storage.storage.get_instance() + count = 0 + #each blob is one word. 
+ #for blob in client.list_directory(bucket, path): + for blob in client.list_directory(bucket, path): + my_buffer = client.download_stream(bucket, blob) + count += int(bytes(my_buffer).decode("utf-8")) + #count += int(my_buffer.getvalue().decode("utf-8")) + + return { + "word": os.path.basename(path), + "count": count + } diff --git a/benchmarks/600.workflows/660.map-reduce/python/shuffle.py b/benchmarks/600.workflows/660.map-reduce/python/shuffle.py new file mode 100644 index 000000000..44568f27d --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/python/shuffle.py @@ -0,0 +1,27 @@ +import os +import json +from . import storage + + +def handler(event): + lst = event["list"] + benchmark_bucket = lst[0]["benchmark_bucket"] + bucket = lst[0]["bucket"] + prefix = lst[0]["prefix"] + + client = storage.storage.get_instance() + dirs = client.list_directory(benchmark_bucket, prefix) + dirs = [p.split(os.sep)[1] for p in dirs] + dirs = list(set(dirs)) + lst = [{ + "bucket": benchmark_bucket, + #"dir": os.path.join(bucket, prefix, path) + #TODO add word here. + "dir": os.path.join(prefix, path) + #"dir": os.path.join(bucket, prefix) + } for path in dirs] + + + return { + "list": lst + } diff --git a/benchmarks/600.workflows/660.map-reduce/python/split.py b/benchmarks/600.workflows/660.map-reduce/python/split.py new file mode 100644 index 000000000..941ffdfff --- /dev/null +++ b/benchmarks/600.workflows/660.map-reduce/python/split.py @@ -0,0 +1,53 @@ +import os +import io +import uuid +from . 
import storage + +def chunks(lst, n): + m = int(len(lst) / n) + for i in range(n-1): + yield lst[i*m:i*m+m] + tail = lst[(n-1)*m:] + if len(tail) > 0: + yield tail + + +def handler(event): + benchmark_bucket = event["benchmark_bucket"] + words_bucket = event["words_bucket"] + words_blob = event["words"] + words_path = os.path.join("/tmp", "words.txt") + + client = storage.storage.get_instance() + client.download(benchmark_bucket, words_bucket + '/' + words_blob, words_path) + with open(words_path, "r") as f: + list = f.read().split("\n") + os.remove(words_path) + + n_mappers = event["n_mappers"] + output_bucket = event["output_bucket"] + map_lists = chunks(list, n_mappers) + blobs = [] + + + for chunk in map_lists: + name = str(uuid.uuid4())[:8] + data = io.BytesIO() + data.writelines((val+"\n").encode("utf-8") for val in chunk) + data.seek(0) + + name = client.upload_stream(benchmark_bucket, output_bucket + '/' + name, data) + stripped_name = name.replace(output_bucket + '/', '') + blobs.append(stripped_name) + + prefix = str(uuid.uuid4())[:8] + lst = [{ + "benchmark_bucket": benchmark_bucket, + "bucket": output_bucket, + "blob": b, + "prefix": prefix + } for b in blobs] + + return { + "list": lst + } diff --git a/benchmarks/600.workflows/670.auth/config.json b/benchmarks/600.workflows/670.auth/config.json new file mode 100644 index 000000000..e6a65cb35 --- /dev/null +++ b/benchmarks/600.workflows/670.auth/config.json @@ -0,0 +1,5 @@ +{ + "timeout": 120, + "memory": 256, + "languages": ["python"] +} diff --git a/benchmarks/600.workflows/670.auth/definition.json b/benchmarks/600.workflows/670.auth/definition.json new file mode 100644 index 000000000..a6511630b --- /dev/null +++ b/benchmarks/600.workflows/670.auth/definition.json @@ -0,0 +1,9 @@ +{ + "root": "auth", + "states": { + "auth": { + "type": "task", + "func_name": "auth" + } + } +} \ No newline at end of file diff --git a/benchmarks/600.workflows/670.auth/input.py 
b/benchmarks/600.workflows/670.auth/input.py new file mode 100644 index 000000000..d81d24e45 --- /dev/null +++ b/benchmarks/600.workflows/670.auth/input.py @@ -0,0 +1,21 @@ +import random + +size_generators = { + "test" : 10, + "small": 100, + "large": 1000 +} + + +def buckets_count(): + return (0, 0) + + +def generate_input(data_dir, size, input_buckets, output_buckets, upload_func): + mult = size_generators[size] + msg = "Who let the dogs out?\n" * mult + + return { + "message": msg, + "token": "allow" + } \ No newline at end of file diff --git a/benchmarks/600.workflows/670.auth/python/auth.py b/benchmarks/600.workflows/670.auth/python/auth.py new file mode 100644 index 000000000..c7b77649c --- /dev/null +++ b/benchmarks/600.workflows/670.auth/python/auth.py @@ -0,0 +1,39 @@ +import random +import string +import pyaes +import base64 + +KEY = "6368616e676520746869732070617373".encode("utf-8") + + +def AESModeCTR(plaintext): + counter = pyaes.Counter(initial_value=0) + aes = pyaes.AESModeOfOperationCTR(KEY, counter=counter) + ciphertext = aes.encrypt(plaintext) + return ciphertext + + +def AESModeCBC(plaintext): + # random initialization vector of 16 bytes + blocks_size = 16 + iv = "InitializationVe" + pad = 16 - len(plaintext)% blocks_size + plaintext = str("0" * pad) + plaintext + aes = pyaes.AESModeOfOperationCBC(KEY, iv=iv) + ciphertext = aes.encrypt(plaintext) + + return ciphertext.decode("utf-8") + + +def handler(event): + message = event["message"] + token = event["token"] + + res = "unauthorized" + if token == "allow": + res = AESModeCTR(message) + res = base64.b64encode(res).decode("ascii") + + return { + "response": res + } diff --git a/benchmarks/600.workflows/670.auth/python/requirements.txt b/benchmarks/600.workflows/670.auth/python/requirements.txt new file mode 100644 index 000000000..4f17c9ccd --- /dev/null +++ b/benchmarks/600.workflows/670.auth/python/requirements.txt @@ -0,0 +1 @@ +pyaes==1.6.1 \ No newline at end of file diff --git 
a/benchmarks/600.workflows/680.excamera/LICENSE b/benchmarks/600.workflows/680.excamera/LICENSE new file mode 100644 index 000000000..eb52badbf --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 EASE Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/benchmarks/600.workflows/680.excamera/README.md b/benchmarks/600.workflows/680.excamera/README.md new file mode 100644 index 000000000..ef2a26cb6 --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/README.md @@ -0,0 +1,3 @@ +The implementation of the ExCamera benchmarks is based on an adaptation of the vSwarm benchmark: https://github.com/vhive-serverless/vSwarm/tree/main/benchmarks/gg/benchmarks/excamera with the license provided. 
+ +To encode the video, the static binaries of the ExCamera project are used: https://github.com/excamera/excamera-static-bins diff --git a/benchmarks/600.workflows/680.excamera/config.json b/benchmarks/600.workflows/680.excamera/config.json new file mode 100644 index 000000000..5118297aa --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 540, + "memory": 256, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/600.workflows/680.excamera/definition.json b/benchmarks/600.workflows/680.excamera/definition.json new file mode 100644 index 000000000..0a494b85b --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/definition.json @@ -0,0 +1,39 @@ +{ + "root": "split", + "states": { + "split": { + "type": "task", + "func_name": "split", + "next": "encode-state" + }, + "encode-state": { + "type": "map", + "root": "encode", + "array": "segments", + "next": "reencode-state", + "states": { + "encode": { + "type": "task", + "func_name": "encode" + } + } + }, + "reencode-state": { + "type": "map", + "root": "reencode", + "array": "segments", + "next": "rebase-state", + "states": { + "reencode": { + "type": "task", + "func_name": "reencode" + } + } + }, + "rebase-state": { + "type": "loop", + "func_name": "rebase", + "array": "segments" + } + } +} diff --git a/benchmarks/600.workflows/680.excamera/input.py b/benchmarks/600.workflows/680.excamera/input.py new file mode 100644 index 000000000..687a2eabc --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/input.py @@ -0,0 +1,44 @@ +import random +import os + +size_generators = { + "test" : (18, 6), + "small": (30, 6), + "large": (60, 6) +} + + +def buckets_count(): + return (1, 1) + + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + num_frames, batch_size = size_generators[size] + + for bin in os.listdir(data_dir): + path = os.path.join(data_dir, bin) + if os.path.isfile(path): + 
upload_func(0, bin, path) + + vid_dir = os.path.join(data_dir, "vid") + vid_segs = sorted(os.listdir(vid_dir)) + new_vid_segs = [] + + for i in range(num_frames): + seg = vid_segs[i % len(vid_segs)] + name = "{:08.0f}.y4m".format(i) + path = os.path.join(vid_dir, seg) + + new_vid_segs.append(name) + upload_func(0, name, path) + + assert(len(new_vid_segs) == num_frames) + + return { + "segments": new_vid_segs, + "benchmark_bucket": benchmarks_bucket, + "input_bucket": input_buckets[0], + "output_bucket": output_buckets[0], + "batch_size": batch_size, + "quality": 1 + } diff --git a/benchmarks/600.workflows/680.excamera/python/encode.py b/benchmarks/600.workflows/680.excamera/python/encode.py new file mode 100644 index 000000000..44a84c5ec --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/python/encode.py @@ -0,0 +1,88 @@ +import os +import uuid +import subprocess +from . import storage +import logging +import shutil + +VPXENC = "/tmp/vpxenc --ivf --codec=vp8 --good --cpu-used=0 --end-usage=cq --min-q=0 --max-q=63 --cq-level={quality} --buf-initial-sz=10000 --buf-optimal-sz=20000 --buf-sz=40000 --undershoot-pct=100 --passes=2 --auto-alt-ref=1 --threads=1 --token-parts=0 --tune=ssim --target-bitrate=4294967295 -o {output}.ivf {input}.y4m" +TERMINATE_CHUNK = "/tmp/xc-terminate-chunk {input}.ivf {output}.ivf" +XC_DUMP_0 = "/tmp/xc-dump {input}.ivf {output}.state" + +client = storage.storage.get_instance() + +def download_bin(benchmark_bucket, bucket, name, dest_dir): + path = os.path.join(dest_dir, name) + if not os.path.exists(path): + client.download(benchmark_bucket, bucket + '/' + name, path) + subprocess.check_output(f"chmod +x {path}", stderr=subprocess.STDOUT, shell=True) + + +def upload_files(benchmark_bucket, bucket, paths, prefix): + for path in paths: + file = os.path.basename(path) + file = prefix + file + #print("Uploading", file, "to", path) + client.upload(benchmark_bucket, bucket + '/' + file, path, unique_name=False) + + +def run(cmd): + try: + 
return subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError as e: + logger = logging.getLogger() + logger.error(f"Error when executing command: {cmd}\n{e.output.decode('utf-8')}") + raise e + + +def encode(segs, data_dir, quality): + files = [] + + for idx, name in enumerate(segs): + input_path = os.path.join(data_dir, name) + output_path = os.path.join(data_dir, f"{name}-vpxenc") + cmd = VPXENC.format(quality=quality, input=input_path, output=output_path) + run(cmd) + + input_path = output_path + output = name if idx == 0 else f"{name}-0" + output_path = os.path.join(data_dir, output) + cmd = TERMINATE_CHUNK.format(input=input_path, output=output_path) + run(cmd) + files.append(output_path+".ivf") + + input_path = output_path + output_path = os.path.join(data_dir, f"{name}-0") + cmd = XC_DUMP_0.format(input=input_path, output=output_path) + run(cmd) + files.append(output_path+".state") + + return files + + +def handler(event): + input_bucket = event["input_bucket"] + output_bucket = event["output_bucket"] + benchmark_bucket = event["benchmark_bucket"] + segs = event["segments"] + quality = event["quality"] + prefix = event["prefix"] + + tmp_dir = "/tmp" + download_bin(benchmark_bucket, input_bucket, "vpxenc", tmp_dir) + download_bin(benchmark_bucket, input_bucket, "xc-terminate-chunk", tmp_dir) + download_bin(benchmark_bucket, input_bucket, "xc-dump", tmp_dir) + + data_dir = os.path.join(tmp_dir, str(uuid.uuid4())) + os.makedirs(data_dir, exist_ok=True) + for seg in segs: + path = os.path.join(data_dir, seg) + client.download(benchmark_bucket, input_bucket + '/' + seg, path) + + segs = [os.path.splitext(seg)[0] for seg in segs] + output_paths = encode(segs, data_dir, quality) + upload_files(benchmark_bucket, output_bucket, output_paths, prefix) + + shutil.rmtree(data_dir) + + return event diff --git a/benchmarks/600.workflows/680.excamera/python/rebase.py b/benchmarks/600.workflows/680.excamera/python/rebase.py 
new file mode 100644 index 000000000..809774305 --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/python/rebase.py @@ -0,0 +1,115 @@ +import os +import uuid +import subprocess +from . import storage +import logging +import shutil + +XC_ENC_REBASE = "/tmp/xc-enc -W -w 0.75 -i y4m -o {output}.ivf -r -I {source_state}.state -p {input_pred}.ivf -S {pred_state}.state {extra} {input}.y4m" + +client = storage.storage.get_instance() + +def download_bin(benchmark_bucket, bucket, name, dest_dir): + path = os.path.join(dest_dir, name) + if not os.path.exists(path): + client.download(benchmark_bucket, bucket + '/' + name, path) + subprocess.check_output(f"chmod +x {path}", stderr=subprocess.STDOUT, shell=True) + + +def upload_files(benchmark_bucket, bucket, paths, prefix): + for path in paths: + file = os.path.basename(path) + file = prefix + file + #print("Uploading", file, "to", path) + client.upload(benchmark_bucket, bucket + '/' + file, path, unique_name=False) + + +def run(cmd): + try: + return subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError as e: + logger = logging.getLogger() + logger.error(f"Error when executing command: {cmd}\n{e.output.decode('utf-8')}") + raise e + + +def prev_seg_name(seg): + idx = int(seg)-1 + assert(idx >= 0) + return "{:08d}".format(idx) + + +def rebase(segs, data_dir, dry_run=False): + input_paths = [] + output_paths = [] + + for idx in range(2, len(segs)): + name = segs[idx] + input_path = os.path.join(data_dir, name) + prev_input_path = os.path.join(data_dir, prev_seg_name(name)) + source_state_path = f"{prev_input_path}-1" + output_state_path = f"{input_path}-1.state" + extra = f"-O {output_state_path}" if idx != len(segs)-1 else "" + input_pred_path = f"{input_path}-1" + pred_state_path = f"{prev_input_path}-0" + + cmd = XC_ENC_REBASE.format( + output=input_path, + input=input_path, + source_state=source_state_path, + extra=extra, + input_pred=input_pred_path, + 
pred_state=pred_state_path) + if not dry_run: + run(cmd) + + + input_paths.append(input_path+".y4m") + input_paths.append(source_state_path+".state") + input_paths.append(input_pred_path+".ivf") + input_paths.append(pred_state_path+".state") + + output_paths.append(input_path+".ivf") + if idx != len(segs)-1: + output_paths.append(output_state_path) + + return input_paths, output_paths + + +def handler(event): + input_bucket = event["input_bucket"] + output_bucket = event["output_bucket"] + benchmark_bucket = event["benchmark_bucket"] + segs = event["segments"] + segs = [os.path.splitext(seg)[0] for seg in segs] + prefix = event["prefix"] + + tmp_dir = "/tmp" + download_bin(benchmark_bucket, input_bucket, "xc-enc", tmp_dir) + + data_dir = os.path.join(tmp_dir, str(uuid.uuid4())) + os.makedirs(data_dir, exist_ok=True) + + input_paths, _ = rebase(segs, data_dir, dry_run=True) + + for path in input_paths: + file = os.path.basename(path) + + try: + if ".y4m" in file: + client.download(benchmark_bucket, input_bucket + '/' + file, path) + else: + file = prefix + file + client.download(benchmark_bucket, output_bucket + '/' + file, path) + except: + # -1.state is generated by rebase itself + if not "-1.state" in file: + raise + + _, output_paths = rebase(segs, data_dir) + upload_files(benchmark_bucket, output_bucket, output_paths, prefix) + + + shutil.rmtree(data_dir) + + return event diff --git a/benchmarks/600.workflows/680.excamera/python/reencode.py b/benchmarks/600.workflows/680.excamera/python/reencode.py new file mode 100644 index 000000000..ee9b6576a --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/python/reencode.py @@ -0,0 +1,103 @@ +import os +import uuid +import subprocess +from . 
import storage +import logging +import shutil + +XC_ENC_FIRST_FRAME = "/tmp/xc-enc -W -w 0.75 -i y4m -o {output}.ivf -r -I {source_state}.state -p {input_pred}.ivf {extra} {input}.y4m" + +client = storage.storage.get_instance() + +def download_bin(benchmark_bucket, bucket, name, dest_dir): + path = os.path.join(dest_dir, name) + if not os.path.exists(path): + client.download(benchmark_bucket, bucket + '/' + name, path) + subprocess.check_output(f"chmod +x {path}", stderr=subprocess.STDOUT, shell=True) + + +def upload_files(benchmark_bucket, bucket, paths, prefix): + for path in paths: + file = os.path.basename(path) + file = prefix + file + client.upload(benchmark_bucket, bucket + '/' + file, path, unique_name=False) + + +def run(cmd): + try: + return subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError as e: + logger = logging.getLogger() + logger.error(f"Error when executing command: {cmd}\n{e.output.decode('utf-8')}") + raise e + + +def prev_seg_name(seg): + idx = int(seg)-1 + assert(idx >= 0) + return "{:08d}".format(idx) + + +def reencode_first_frame(segs, data_dir, dry_run=False): + input_paths = [] + output_paths = [] + for idx in range(1, len(segs)): + name = segs[idx] + input_path = os.path.join(data_dir, name) + output_path = input_path if idx == 1 else f"{input_path}-1" + source_state_path = os.path.join(data_dir, prev_seg_name(name))+"-0" + output_state_path = f"{input_path}-1.state" + extra = f"-O {output_state_path}" if idx == 1 else "" + input_pred_path = f"{input_path}-0" + + cmd = XC_ENC_FIRST_FRAME.format( + input=input_path, + output=output_path, + source_state=source_state_path, + extra=extra, + input_pred=input_pred_path) + if not dry_run: + run(cmd) + + input_paths.append(input_path+".y4m") + input_paths.append(source_state_path+".state") + input_paths.append(input_pred_path+".ivf") + + output_paths.append(output_path+".ivf") + if idx == 1: + output_paths.append(output_state_path) + + return 
input_paths, output_paths + + +def handler(event): + input_bucket = event["input_bucket"] + output_bucket = event["output_bucket"] + benchmark_bucket = event["benchmark_bucket"] + segs = event["segments"] + segs = [os.path.splitext(seg)[0] for seg in segs] + prefix = event["prefix"] + + tmp_dir = "/tmp" + download_bin(benchmark_bucket, input_bucket, "xc-enc", tmp_dir) + + data_dir = os.path.join(tmp_dir, str(uuid.uuid4())) + os.makedirs(data_dir, exist_ok=True) + input_paths, _ = reencode_first_frame(segs, data_dir, dry_run=True) + for path in input_paths: + file = os.path.basename(path) + + if ".y4m" in file: + client.download(benchmark_bucket, input_bucket + '/' + file, path) + else: + file = prefix + file + client.download(benchmark_bucket, output_bucket + '/' + file, path) + + + + _, output_paths = reencode_first_frame(segs, data_dir) + upload_files(benchmark_bucket, output_bucket, output_paths, prefix) + + shutil.rmtree(data_dir) + + return event diff --git a/benchmarks/600.workflows/680.excamera/python/requirements.txt b/benchmarks/600.workflows/680.excamera/python/requirements.txt new file mode 100644 index 000000000..5396f9cce --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/python/requirements.txt @@ -0,0 +1 @@ +gitpython \ No newline at end of file diff --git a/benchmarks/600.workflows/680.excamera/python/split.py b/benchmarks/600.workflows/680.excamera/python/split.py new file mode 100644 index 000000000..5ecfad0b6 --- /dev/null +++ b/benchmarks/600.workflows/680.excamera/python/split.py @@ -0,0 +1,26 @@ +import uuid + +def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + + +def handler(event): + segs = chunks(event["segments"], event["batch_size"]) + input_bucket = event["input_bucket"] + output_bucket = event["output_bucket"] + benchmark_bucket = event["benchmark_bucket"] + quality = event["quality"] + + return { + "segments": [ + { + "prefix": str(uuid.uuid4().int & (1<<64)-1)[:8], + "segments": ss, + "quality": 
quality, + "input_bucket": input_bucket, + "output_bucket": output_bucket, + "benchmark_bucket": benchmark_bucket + } for idx, ss in enumerate(segs) + ] + } diff --git a/benchmarks/600.workflows/690.ml/config.json b/benchmarks/600.workflows/690.ml/config.json new file mode 100644 index 000000000..f0dd50968 --- /dev/null +++ b/benchmarks/600.workflows/690.ml/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 1024, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/600.workflows/690.ml/definition.json b/benchmarks/600.workflows/690.ml/definition.json new file mode 100644 index 000000000..3e427fcd9 --- /dev/null +++ b/benchmarks/600.workflows/690.ml/definition.json @@ -0,0 +1,21 @@ +{ + "root": "generate", + "states": { + "generate": { + "type": "task", + "func_name": "generate", + "next": "train-state" + }, + "train-state": { + "type": "map", + "root": "train", + "array": "schedules", + "states": { + "train": { + "type": "task", + "func_name": "train" + } + } + } + } +} diff --git a/benchmarks/600.workflows/690.ml/input.py b/benchmarks/600.workflows/690.ml/input.py new file mode 100644 index 000000000..d3f930bc7 --- /dev/null +++ b/benchmarks/600.workflows/690.ml/input.py @@ -0,0 +1,25 @@ +size_generators = { + "test" : (1, 100, 5), + "small": (2, 500, 1024), + "large": (3, 1000, 1024), +} + +classifiers = [ + {"name": "SVC", "kernel": "linear", "C": 0.025}, + {"name": "RandomForestClassifier", "max_depth": 5, "n_estimators": 10}, + {"name": "RandomForestClassifier", "max_depth": 5, "n_estimators": 15}, + {"name": "AdaBoostClassifier"} +] + +def buckets_count(): + return (0, 1) + +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): + n_classifiers, n_samples, n_features = size_generators[size] + return { + "classifiers": classifiers[:n_classifiers], + "benchmark_bucket" : benchmarks_bucket, + "dataset_bucket": output_buckets[0], + "n_samples": n_samples, + "n_features": 
n_features + } diff --git a/benchmarks/600.workflows/690.ml/python/generate.py b/benchmarks/600.workflows/690.ml/python/generate.py new file mode 100644 index 000000000..03fea03db --- /dev/null +++ b/benchmarks/600.workflows/690.ml/python/generate.py @@ -0,0 +1,54 @@ +import os +import uuid +from . import storage + +import sklearn.datasets as datasets +import numpy as np + + +def generate(n_samples, n_features): + X, y = datasets.make_classification( + n_samples, + n_features, + n_redundant=0, + n_clusters_per_class=2, + weights=[0.9, 0.1], + flip_y=0.1, + random_state=123 + ) + + return X, y + + +def upload_dataset(benchmark_bucket, bucket, X, y): + dataset_dir = os.path.join("/tmp", str(uuid.uuid4())) + os.makedirs(dataset_dir, exist_ok=True) + + features_path = os.path.join(dataset_dir, "features.npy") + labels_path = os.path.join(dataset_dir, "labels.npy") + np.save(features_path, X) + np.save(labels_path, y) + + client = storage.storage.get_instance() + features = client.upload(benchmark_bucket, bucket + '/' + "features.npy", features_path) + features = features.replace(bucket + '/', '') + labels = client.upload(benchmark_bucket, bucket + '/' + "labels.npy", labels_path) + labels = labels.replace(bucket + '/', '') + + return features, labels + + +def handler(event): + classifiers = event["classifiers"] + bucket = event["dataset_bucket"] + benchmark_bucket = event["benchmark_bucket"] + n_samples = int(event["n_samples"]) + n_features = int(event["n_features"]) + + X, y = generate(n_samples, n_features) + X_key, y_key = upload_dataset(benchmark_bucket, bucket, X, y) + + schedules = [{**c, "features": X_key, "labels": y_key, "bucket": bucket, "benchmark_bucket": benchmark_bucket} for c in classifiers] + return { + "schedules": schedules + } diff --git a/benchmarks/600.workflows/690.ml/python/package.sh b/benchmarks/600.workflows/690.ml/python/package.sh new file mode 100644 index 000000000..1133cbac2 --- /dev/null +++ 
b/benchmarks/600.workflows/690.ml/python/package.sh @@ -0,0 +1,25 @@ +# Stripping package code is based on https://github.com/ryfeus/lambda-packs repo + +PACKAGE_DIR=$1 +echo "Original size $(du -sh $1 | cut -f1)" + +CUR_DIR=$(pwd) +cd $1 +# cleaning libs +rm -rf external +find . -type d -name "tests" -exec rm -rf {} + +find . -type d -name "test" -exec rm -rf {} + +find . -type d -name "bin" -exec rm -rf {} + + +# cleaning +find -name "*.so" -not -path "*/PIL/*" | xargs strip +find -name "*.so.*" -not -path "*/PIL/*" | xargs strip + +rm -r pip > /dev/null +rm -r pip-* > /dev/null +rm -r wheel > /dev/null +rm -r wheel-* > /dev/null +rm easy_install.py > /dev/null +find . -name \*.pyc -delete +cd ${CUR_DIR} +echo "Stripped size $(du -sh $1 | cut -f1)" diff --git a/benchmarks/600.workflows/690.ml/python/requirements.txt b/benchmarks/600.workflows/690.ml/python/requirements.txt new file mode 100644 index 000000000..49e2777d9 --- /dev/null +++ b/benchmarks/600.workflows/690.ml/python/requirements.txt @@ -0,0 +1,2 @@ +scipy==1.10.0 +scikit-learn diff --git a/benchmarks/600.workflows/690.ml/python/train.py b/benchmarks/600.workflows/690.ml/python/train.py new file mode 100644 index 000000000..d886a3072 --- /dev/null +++ b/benchmarks/600.workflows/690.ml/python/train.py @@ -0,0 +1,72 @@ +import os +import uuid +import sys +from . 
import storage + +from sklearn.model_selection import train_test_split +from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.preprocessing import StandardScaler +import numpy as np + +def str_to_cls(cls_name): + #print(cls_name) + return globals()[cls_name] + +def load_dataset(benchmark_bucket, bucket, features, labels): + dataset_dir = os.path.join("/tmp", str(uuid.uuid4())) + os.makedirs(dataset_dir, exist_ok=True) + + features_path = os.path.join(dataset_dir, "features.npy") + labels_path = os.path.join(dataset_dir, "labels.npy") + + + client = storage.storage.get_instance() + client.download(benchmark_bucket, bucket + '/' + features, features_path) + client.download(benchmark_bucket, bucket + '/' + labels, labels_path) + + X = np.load(features_path) + y = np.load(labels_path) + + return X, y + + +def preprocess(X, y): + X = StandardScaler().fit_transform(X) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.4, random_state=123 + ) + + return X_train, X_test, y_train, y_test + + +def train(clf, X, y): + clf.fit(X, y) + + +def val(clf, X, y): + return clf.score(X, y) + + +def handler(schedule): + name = schedule.pop("name") + X_key = schedule.pop("features") + y_key = schedule.pop("labels") + bucket = schedule.pop("bucket") + benchmark_bucket = schedule.pop("benchmark_bucket") + request_id = schedule.pop("request-id") + + clf = str_to_cls(name)(**schedule) + + X, y = load_dataset(benchmark_bucket, bucket, X_key, y_key) + X_train, X_test, y_train, y_test = preprocess(X, y) + + train(clf, X_train, y_train) + score = val(clf, X_test, y_test) + + return { + "name": name, + "score": score + } + From 94adcde4cd6004511cc055089f24ba29718ac901 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 15:05:12 +0200 Subject: [PATCH 191/230] [workflows] Update configuration and enhance AWS workflow handlers --- benchmarks/600.workflows/670.auth/config.json | 3 ++- 
.../wrappers/aws/python/handler_workflow.py | 3 +++ benchmarks/wrappers/aws/python/storage.py | 6 +++++ sebs/aws/generator.py | 7 ++++++ sebs/aws/triggers.py | 23 +++++++++++-------- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/benchmarks/600.workflows/670.auth/config.json b/benchmarks/600.workflows/670.auth/config.json index e6a65cb35..d6d184e8a 100644 --- a/benchmarks/600.workflows/670.auth/config.json +++ b/benchmarks/600.workflows/670.auth/config.json @@ -1,5 +1,6 @@ { "timeout": 120, "memory": 256, - "languages": ["python"] + "languages": ["python"], + "modules": [] } diff --git a/benchmarks/wrappers/aws/python/handler_workflow.py b/benchmarks/wrappers/aws/python/handler_workflow.py index 18a892989..43d41a1e3 100644 --- a/benchmarks/wrappers/aws/python/handler_workflow.py +++ b/benchmarks/wrappers/aws/python/handler_workflow.py @@ -44,6 +44,9 @@ def handler(event, context): func_payload = event request_id = req_id + if isinstance(func_payload, dict): + func_payload["request-id"] = req_id + workflow_name, func_name = context.function_name.split("___") function = importlib.import_module(f"function.{func_name}") res = function.handler(func_payload) diff --git a/benchmarks/wrappers/aws/python/storage.py b/benchmarks/wrappers/aws/python/storage.py index 401947df6..d11ec2c7e 100644 --- a/benchmarks/wrappers/aws/python/storage.py +++ b/benchmarks/wrappers/aws/python/storage.py @@ -30,6 +30,12 @@ def upload(self, bucket, file, filepath): def download(self, bucket, file, filepath): self.client.download_file(bucket, file, filepath) + def list_directory(self, bucket, prefix): + objects = self.client.list_objects_v2(Bucket=bucket, Prefix=prefix) + if 'Contents' not in objects: + return [] + return [obj['Key'] for obj in objects['Contents']] + def download_directory(self, bucket, prefix, path): objects = self.client.list_objects_v2(Bucket=bucket, Prefix=prefix) # 'Contents' key is only present when objects are found diff --git a/sebs/aws/generator.py 
b/sebs/aws/generator.py index f0c54637a..81ae093db 100644 --- a/sebs/aws/generator.py +++ b/sebs/aws/generator.py @@ -36,6 +36,11 @@ def encode_task(self, state: Task) -> Union[dict, List[dict]]: else: payload["End"] = True + if state.failure: + payload["Catch"] = [ + {"ErrorEquals": ["States.ALL"], "Next": state.failure} + ] + return payload def encode_switch(self, state: Switch) -> Union[dict, List[dict]]: @@ -87,6 +92,8 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: }, } + payload["ResultPath"] = "$." + state.array + if state.next: payload["Next"] = state.next else: diff --git a/sebs/aws/triggers.py b/sebs/aws/triggers.py index 5f11f1f4f..2d8b26f33 100644 --- a/sebs/aws/triggers.py +++ b/sebs/aws/triggers.py @@ -218,10 +218,6 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: client = self._deployment_client.get_sfn_client() begin = datetime.datetime.now() ret = client.start_execution(stateMachineArn=self.name, input=json.dumps(sfn_input)) - end = datetime.datetime.now() - - aws_result = ExecutionResult.from_times(begin, end) - aws_result.request_id = request_id execution_arn = ret["executionArn"] execution_finished = False @@ -232,11 +228,20 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: if not execution_finished: time.sleep(1) - elif status == "FAILED": - self.logging.error(f"Invocation of {self.name} failed") - self.logging.error(f"Input: {payload}") - aws_result.stats.failure = True - return aws_result + + end = datetime.datetime.now() + aws_result = ExecutionResult.from_times(begin, end) + aws_result.request_id = request_id + + if status == "FAILED": + self.logging.error(f"Invocation of {self.name} failed") + self.logging.error(f"Input: {payload}") + aws_result.stats.failure = True + return aws_result + + if "output" in execution: + output = json.loads(execution["output"]) + aws_result.output = output return aws_result From d3dd732876eccf6359b6f6847e8ea2b9bd6d6856 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 
2026 15:05:33 +0200 Subject: [PATCH 192/230] fix: Ensure newline at end of file in requirements.txt for auth and excamera workflows --- benchmarks/600.workflows/670.auth/python/requirements.txt | 2 +- benchmarks/600.workflows/680.excamera/python/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/600.workflows/670.auth/python/requirements.txt b/benchmarks/600.workflows/670.auth/python/requirements.txt index 4f17c9ccd..68abeee58 100644 --- a/benchmarks/600.workflows/670.auth/python/requirements.txt +++ b/benchmarks/600.workflows/670.auth/python/requirements.txt @@ -1 +1 @@ -pyaes==1.6.1 \ No newline at end of file +pyaes==1.6.1 diff --git a/benchmarks/600.workflows/680.excamera/python/requirements.txt b/benchmarks/600.workflows/680.excamera/python/requirements.txt index 5396f9cce..59348f98e 100644 --- a/benchmarks/600.workflows/680.excamera/python/requirements.txt +++ b/benchmarks/600.workflows/680.excamera/python/requirements.txt @@ -1 +1 @@ -gitpython \ No newline at end of file +gitpython From af03f5374dea6e573d7e5395447dbf34499b9189 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 15:21:50 +0200 Subject: [PATCH 193/230] [storage] Add list_directory method to various storage wrappers --- benchmarks/600.workflows/670.auth/input.py | 2 +- benchmarks/wrappers/azure/python/storage.py | 5 +++++ .../wrappers/cloudflare/python/container/storage.py | 10 ++++++++++ benchmarks/wrappers/cloudflare/python/storage.py | 5 +++++ benchmarks/wrappers/gcp/python/storage.py | 4 ++++ benchmarks/wrappers/local/python/storage.py | 4 ++++ benchmarks/wrappers/openwhisk/python/storage.py | 4 ++++ 7 files changed, 33 insertions(+), 1 deletion(-) diff --git a/benchmarks/600.workflows/670.auth/input.py b/benchmarks/600.workflows/670.auth/input.py index d81d24e45..a0807d12b 100644 --- a/benchmarks/600.workflows/670.auth/input.py +++ b/benchmarks/600.workflows/670.auth/input.py @@ -11,7 +11,7 @@ def buckets_count(): return (0, 0) -def 
generate_input(data_dir, size, input_buckets, output_buckets, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): mult = size_generators[size] msg = "Who let the dogs out?\n" * mult diff --git a/benchmarks/wrappers/azure/python/storage.py b/benchmarks/wrappers/azure/python/storage.py index 49aeb5aa2..345be43c0 100644 --- a/benchmarks/wrappers/azure/python/storage.py +++ b/benchmarks/wrappers/azure/python/storage.py @@ -30,6 +30,11 @@ def download(self, container, file, filepath): with open(filepath, 'wb') as download_file: download_file.write( self.download_stream(container, file) ) + def list_directory(self, container, prefix): + client = self.client.get_container_client(container=container) + objects = client.list_blobs(name_starts_with=prefix) + return [obj.name for obj in objects] + def download_directory(self, container, prefix, path): client = self.client.get_container_client(container=container) objects = client.list_blobs(name_starts_with=prefix) diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index 8c9a32fcb..dda903f06 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -212,6 +212,16 @@ def download(self, bucket, key, filepath): with open(filepath, 'wb') as f: f.write(data) + def list_directory(self, bucket, prefix): + """List all object keys with a given prefix.""" + if not storage.worker_url: + raise RuntimeError("Worker URL not set - cannot access R2") + params = urllib.parse.urlencode({'bucket': bucket, 'prefix': prefix}) + list_url = f"{storage.worker_url}/r2/list?{params}" + with urllib.request.urlopen(list_url) as response: + result = json.loads(response.read().decode('utf-8')) + return [obj['key'] for obj in result.get('objects', [])] + def download_directory(self, bucket, prefix, local_path): """ 
Download all files with a given prefix to a local directory. diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index cabdb7184..30b836bea 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -58,6 +58,11 @@ def download(self, bucket, key, filepath): f.write(data) return + def list_directory(self, bucket, prefix): + bobj = self.get_bucket(bucket) + list_res = run_sync(bobj.list(to_js({"prefix": prefix}))) + return [obj.key for obj in list_res.objects] + def download_directory(self, bucket, prefix, out_path): bobj = self.get_bucket(bucket) list_res = run_sync(bobj.list(to_js({"prefix": prefix}))) diff --git a/benchmarks/wrappers/gcp/python/storage.py b/benchmarks/wrappers/gcp/python/storage.py index 9ea541e0c..dfe9563a0 100644 --- a/benchmarks/wrappers/gcp/python/storage.py +++ b/benchmarks/wrappers/gcp/python/storage.py @@ -34,6 +34,10 @@ def download(self, bucket, file, filepath): blob = bucket_instance.blob(file) blob.download_to_filename(filepath) + def list_directory(self, bucket, prefix): + objects = self.client.bucket(bucket).list_blobs(prefix=prefix) + return [obj.name for obj in objects] + def download_directory(self, bucket, prefix, path): objects = self.client.bucket(bucket).list_blobs(prefix=prefix) for obj in objects: diff --git a/benchmarks/wrappers/local/python/storage.py b/benchmarks/wrappers/local/python/storage.py index f8bb7c0c0..2307462b7 100644 --- a/benchmarks/wrappers/local/python/storage.py +++ b/benchmarks/wrappers/local/python/storage.py @@ -37,6 +37,10 @@ def upload(self, bucket, file, filepath): def download(self, bucket, file, filepath): self.client.fget_object(bucket, file, filepath) + def list_directory(self, bucket, prefix): + objects = self.client.list_objects(bucket, prefix, recursive=True) + return [obj.object_name for obj in objects] + def download_directory(self, bucket, prefix, path): objects = 
self.client.list_objects_v2(bucket, prefix, recursive=True) for obj in objects: diff --git a/benchmarks/wrappers/openwhisk/python/storage.py b/benchmarks/wrappers/openwhisk/python/storage.py index 26ea56612..53c071e64 100644 --- a/benchmarks/wrappers/openwhisk/python/storage.py +++ b/benchmarks/wrappers/openwhisk/python/storage.py @@ -57,6 +57,10 @@ def upload(self, bucket, file, filepath): def download(self, bucket, file, filepath): self.client.fget_object(bucket, file, filepath) + def list_directory(self, bucket, prefix): + objects = self.client.list_objects(bucket, prefix, recursive=True) + return [obj.object_name for obj in objects] + def download_directory(self, bucket, prefix, path): objects = self.client.list_objects(bucket, prefix, recursive=True) for obj in objects: From 9a72eef599339d97be7bbe3ada12e86c8b04a8e3 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 15:22:47 +0200 Subject: [PATCH 194/230] fix: Move workflow retrieval to correct position in workflow function --- sebs/cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sebs/cli.py b/sebs/cli.py index b84728bdf..76512901c 100755 --- a/sebs/cli.py +++ b/sebs/cli.py @@ -404,17 +404,17 @@ def workflow(benchmark, benchmark_input_size, repetitions, trigger, workflow_nam logging_filename=logging_filename, ) - wf = deployment_client.get_workflow( - benchmark_obj, - workflow_name if workflow_name else deployment_client.default_function_name(benchmark_obj), - ) - input_config = benchmark_obj.prepare_input( deployment_client.system_resources, size=benchmark_input_size, replace_existing=experiment_config.update_storage, ) + wf = deployment_client.get_workflow( + benchmark_obj, + workflow_name if workflow_name else deployment_client.default_function_name(benchmark_obj), + ) + redis_host = getattr(deployment_client.config, "redis_host", None) redis = None if redis_host: From 340baf522cfc817a8fd1a32259fa0a92c37d6e5d Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 
May 2026 16:11:41 +0200 Subject: [PATCH 195/230] fix: Enhance benchmark data loading and ensure newline in package outputs --- sebs/benchmark.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 7cebf36e6..0ea165aa0 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -739,8 +739,8 @@ def __init__( # Try to ensure benchmarks-data exists ensure_benchmarks_data(self.logging) - # Load input module - self._benchmark_data_path = find_benchmark(self._benchmark, "benchmarks-data") + # Load input module — fall back to output dir for benchmarks without data files + self._benchmark_data_path = find_benchmark(self._benchmark, "benchmarks-data") or self._output_dir self._benchmark_input_module = load_benchmark_input(self._benchmark_path) # Check if input has been processed @@ -1037,6 +1037,11 @@ def add_deployment_files(self, output_dir: str, is_workflow: bool = False) -> No elif os.path.exists(handler_workflow_path): os.remove(handler_workflow_path) + if is_workflow: + definition_src = os.path.join(self._benchmark_path, "definition.json") + if os.path.exists(definition_src): + shutil.copy2(definition_src, os.path.join(output_dir, "definition.json")) + def add_deployment_package_python(self, output_dir: str) -> None: """Add Python deployment packages to requirements file. 
@@ -1059,7 +1064,7 @@ def add_deployment_package_python(self, output_dir: str) -> None: self._deployment_name, self.language_name ) for package in packages: - out.write(package) + out.write(f"\n{package}") module_packages = self._system_config.deployment_module_packages( self._deployment_name, self.language_name @@ -1067,7 +1072,7 @@ def add_deployment_package_python(self, output_dir: str) -> None: for bench_module in self._benchmark_config.modules: if bench_module.value in module_packages: for package in module_packages[bench_module.value]: - out.write(package) + out.write(f"\n{package}") def add_deployment_package_nodejs(self, output_dir: str) -> None: """Add Node.js deployment packages to package.json. From 0ce3240a8568eceee00c15535fcb6b387a6e4a35 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 25 May 2026 16:18:57 +0200 Subject: [PATCH 196/230] fix: Ensure directory creation for data path in generate_input function --- benchmarks/600.workflows/631.parallel-download/input.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/600.workflows/631.parallel-download/input.py b/benchmarks/600.workflows/631.parallel-download/input.py index fd9d6d7b5..2a8206519 100644 --- a/benchmarks/600.workflows/631.parallel-download/input.py +++ b/benchmarks/600.workflows/631.parallel-download/input.py @@ -39,6 +39,7 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck data_path = os.path.join(data_dir, data_name) if not os.path.exists(data_path): + os.makedirs(data_dir, exist_ok=True) with open(data_path, "w") as f: f.writelines(k for k in generate(size_bytes)) From 3a0c9943e625f5543a9553f36e0d56ee1a550108 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 25 May 2026 16:31:06 +0200 Subject: [PATCH 197/230] feat: Add workflow benchmarks and regression testing framework for workflows --- sebs/regression.py | 200 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) diff --git a/sebs/regression.py 
b/sebs/regression.py index 42bb2c10b..5487e9532 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -68,6 +68,22 @@ "503.graph-bfs", ] +# Workflow benchmarks available for regression testing +benchmarks_workflows = [ + "620.func-invo", + "630.parallel-sleep", + "631.parallel-download", + "640.selfish-detour", + "650.vid", + "660.map-reduce", + "670.auth", + "680.excamera", + "690.ml", + "6100.1000-genome", + "6101.1000-genome-individuals", + "6200.trip-booking", +] + # AWS-specific configurations architectures_aws = ["x64", "arm64"] deployments_aws = ["package", "container"] @@ -333,6 +349,179 @@ def test(self): return type.__new__(mcs, name, bases, dict) +class WorkflowTestSequenceMeta(type): + """Metaclass for dynamically generating workflow regression test cases. + + Similar to TestSequenceMeta but uses get_workflow instead of get_function, + and workflows have their trigger built-in (WorkflowLibraryTrigger). + """ + + def __init__(cls, name, bases, attrs, benchmarks, architectures, deployments, deployment_name): + type.__init__(cls, name, bases, attrs) + cls.deployment_name = deployment_name + + def __new__(mcs, name, bases, dict, benchmarks, architectures, deployments, deployment_name): + def gen_test(benchmark_name, architecture, deployment_type): + def test(self): + log_name = f"Regression-WF-{deployment_name}-{benchmark_name}-{deployment_type}" + logger = logging.getLogger(log_name) + logger.setLevel(logging.INFO) + logging_wrapper = ColoredWrapper(log_name, logger) + if LOGGING_REDACTED: + logger.addFilter(LOGGING_REDACTOR) + logging_wrapper.set_filter(LOGGING_REDACTOR) + + self.experiment_config["architecture"] = architecture + self.experiment_config["system_variant"] = deployment_type + + deployment_client = self.get_deployment( + benchmark_name, architecture, deployment_type + ) + deployment_client.disable_rich_output() + + logging_wrapper.info( + f"Begin workflow regression test of {benchmark_name} on " + f"{deployment_client.name()}. 
" + f"Architecture {architecture}, deployment type: {deployment_type}." + ) + + experiment_config = self.client.get_experiment_config(self.experiment_config) + benchmark = self.client.get_benchmark( + benchmark_name, deployment_client, experiment_config + ) + + input_config = benchmark.prepare_input( + deployment_client.system_resources, + size=benchmark_input_size, + replace_existing=experiment_config.update_storage, + ) + + wf = deployment_client.get_workflow( + benchmark, deployment_client.default_function_name(benchmark) + ) + + trigger_type = Trigger.TriggerType.LIBRARY + triggers = wf.triggers(trigger_type) + if len(triggers) == 0: + trigger = deployment_client.create_trigger(wf, trigger_type) + sleep(5) + else: + trigger = triggers[0] + + failure = False + try: + ret = trigger.sync_invoke(input_config) + if ret.stats.failure: + failure = True + logging_wrapper.error(f"{benchmark_name} workflow execution failed") + else: + logging_wrapper.info(f"{benchmark_name} workflow execution succeeded") + except RuntimeError: + failure = True + logging_wrapper.error(f"{benchmark_name} workflow invocation raised exception") + + json_filename = ( + f"regression_wf_{deployment_name}_{benchmark_name}" + f"_{architecture}_{deployment_type}.json" + ) + with open(os.path.join(self.client.output_dir, json_filename), "w") as f: + json.dump({"output": ret.output if not failure else {}}, f, indent=2) + + deployment_client.shutdown() + + if failure: + raise RuntimeError(f"Workflow test of {benchmark_name} failed!") + + return test + + for benchmark in benchmarks: + for architecture in architectures: + for deployment_type in deployments: + test_name = f"test_{deployment_name}_wf_{benchmark}" + test_name += f"_{architecture}_{deployment_type}" + test_method = gen_test(benchmark, architecture, deployment_type) + test_method.test_architecture = architecture + test_method.test_deployment_type = deployment_type + test_method.test_benchmark = benchmark + dict[test_name] = test_method + + 
dict["lock"] = threading.Lock() + dict["cfg"] = None + return type.__new__(mcs, name, bases, dict) + + +class AWSTestSequenceWorkflows( + unittest.TestCase, + metaclass=WorkflowTestSequenceMeta, + benchmarks=benchmarks_workflows, + architectures=["x64"], + deployments=["package"], + deployment_name="aws", +): + def get_deployment(self, benchmark_name, architecture, deployment_type): + deployment_name = "aws" + assert cloud_config, "Cloud configuration is required" + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + configure_regression_deployment(config_copy, deployment_name, deployment_type) + + f = f"regression_wf_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with AWSTestSequenceWorkflows.lock: + deployment_client.initialize(resource_prefix=RESOURCE_PREFIX, quiet=LOGGING_REDACTED) + if LOGGING_REDACTED: + LOGGING_REDACTOR.set_resource_id( + deployment_client.config.resources.resources_id, + deployment_client.config.credentials.account_id, + ) + LoggingBase.set_filtering_resource_id( + deployment_client.config.resources.resources_id, + deployment_client.config.credentials.account_id, + ) + return deployment_client + + +class GCPTestSequenceWorkflows( + unittest.TestCase, + metaclass=WorkflowTestSequenceMeta, + benchmarks=benchmarks_workflows, + architectures=["x64"], + deployments=["package"], + deployment_name="gcp", +): + def get_deployment(self, benchmark_name, architecture, deployment_type): + deployment_name = "gcp" + assert cloud_config, "Cloud configuration is required" + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + configure_regression_deployment(config_copy, deployment_name, deployment_type) + + f = 
f"regression_wf_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with GCPTestSequenceWorkflows.lock: + deployment_client.initialize(resource_prefix=RESOURCE_PREFIX, quiet=LOGGING_REDACTED) + if LOGGING_REDACTED: + LOGGING_REDACTOR.set_resource_id( + deployment_client.config.resources.resources_id, + deployment_client.config.credentials.project_name, + ) + LoggingBase.set_filtering_resource_id( + deployment_client.config.resources.resources_id, + deployment_client.config.credentials.project_name, + ) + return deployment_client + + class AWSTestSequencePython( unittest.TestCase, metaclass=TestSequenceMeta, @@ -1600,6 +1789,17 @@ def regression_suite( ) ) + # Add workflow tests (only for Python, workflows are Python-only) + if language == "python": + if "aws" in providers: + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase(AWSTestSequenceWorkflows) + ) + if "gcp" in providers: + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase(GCPTestSequenceWorkflows) + ) + # Prepare the list of tests to run tests: List[unittest.TestCase] = [] # mypy is confused here about the type From 51c0cd5afff26e2e61810971883013ab24f3ce8b Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 25 May 2026 17:08:54 +0200 Subject: [PATCH 198/230] fix: Update requirements to specify versions for numpy and scikit-learn --- benchmarks/600.workflows/690.ml/python/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/600.workflows/690.ml/python/requirements.txt b/benchmarks/600.workflows/690.ml/python/requirements.txt index 49e2777d9..1ff24f2df 100644 --- a/benchmarks/600.workflows/690.ml/python/requirements.txt +++ b/benchmarks/600.workflows/690.ml/python/requirements.txt @@ -1,2 +1,3 @@ +numpy<2 scipy==1.10.0 -scikit-learn +scikit-learn==1.5.2 From 
68271f2f7df3fead6bc57e9b8628ffa0573395c4 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 25 May 2026 17:11:21 +0200 Subject: [PATCH 199/230] fix: Add missing newline for numpy version specification in requirements.txt --- benchmarks/600.workflows/650.vid/python/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/600.workflows/650.vid/python/requirements.txt b/benchmarks/600.workflows/650.vid/python/requirements.txt index 6ab6d0d57..84cf1d278 100644 --- a/benchmarks/600.workflows/650.vid/python/requirements.txt +++ b/benchmarks/600.workflows/650.vid/python/requirements.txt @@ -1 +1,2 @@ +numpy<2 opencv-python-headless From 28e8ac9f9409294416da0c9c23ef5cdc9fc43b43 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 1 Jun 2026 11:19:55 +0200 Subject: [PATCH 200/230] feat: Replace fsm.py symlink with full implementation Co-Authored-By: Marcin Copik Co-Authored-By: Laurin Brandner --- benchmarks/wrappers/azure/python/fsm.py | 219 +++++++++++++++++++++++- 1 file changed, 218 insertions(+), 1 deletion(-) diff --git a/benchmarks/wrappers/azure/python/fsm.py b/benchmarks/wrappers/azure/python/fsm.py index 30c579a5c..039457cd4 100644 --- a/benchmarks/wrappers/azure/python/fsm.py +++ b/benchmarks/wrappers/azure/python/fsm.py @@ -1 +1,218 @@ -../../../../sebs/faas/fsm.py \ No newline at end of file +from abc import ABC +from abc import abstractmethod +from typing import Optional, List, Callable, Union, Dict, Type, Tuple +import json + + +class State(ABC): + def __init__(self, name: str): + self.name = name + + @staticmethod + def deserialize(name: str, payload: dict) -> "State": + cls = _STATE_TYPES[payload["type"]] + return cls.deserialize(name, payload) + + +class Task(State): + def __init__(self, name: str, func_name: str, next: Optional[str], failure: Optional[str]): + self.name = name + self.func_name = func_name + self.next = next + self.failure = failure + + @classmethod + def deserialize(cls, name: str, payload: dict) -> 
"Task": + return cls( + name=name, + func_name=payload["func_name"], + next=payload.get("next"), + failure=payload.get("failure"), + ) + + +class Switch(State): + class Case: + def __init__(self, var: str, op: str, val: str, next: str): + self.var = var + self.op = op + self.val = val + self.next = next + + @staticmethod + def deserialize(payload: dict) -> "Switch.Case": + return Switch.Case(**payload) + + def __init__(self, name: str, cases: List[Case], default: Optional[str]): + self.name = name + self.cases = cases + self.default = default + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Switch": + cases = [Switch.Case.deserialize(c) for c in payload["cases"]] + + return cls(name=name, cases=cases, default=payload["default"]) + + +class Parallel(State): + def __init__(self, name: str, funcs: List, next: Optional[str]): + self.name = name + self.funcs = funcs + self.next = next + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Parallel": + return cls(name=name, funcs=payload.get("parallel_functions"), next=payload.get("next")) + + +class Map(State): + def __init__( + self, + name: str, + funcs: List, + array: str, + root: str, + next: Optional[str], + common_params: Optional[str], + ): + self.name = name + self.funcs = funcs + self.array = array + self.root = root + self.next = next + self.common_params = common_params + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Map": + return cls( + name=name, + funcs=payload["states"], + array=payload["array"], + root=payload["root"], + next=payload.get("next"), + common_params=payload.get("common_params"), + ) + + +class Repeat(State): + def __init__(self, name: str, func_name: str, count: int, next: Optional[str]): + self.name = name + self.func_name = func_name + self.count = count + self.next = next + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Repeat": + return cls( + name=name, + func_name=payload["func_name"], + 
count=payload["count"], + next=payload.get("next"), + ) + + +class Loop(State): + def __init__(self, name: str, func_name: str, array: str, next: Optional[str]): + self.name = name + self.func_name = func_name + self.array = array + self.next = next + + @classmethod + def deserialize(cls, name: str, payload: dict) -> "Loop": + return cls( + name=name, + func_name=payload["func_name"], + array=payload["array"], + next=payload.get("next"), + ) + + +_STATE_TYPES: Dict[str, Type[State]] = { + "task": Task, + "switch": Switch, + "map": Map, + "repeat": Repeat, + "loop": Loop, + "parallel": Parallel, +} + + +class Generator(ABC): + def __init__(self, export_func: Callable[[dict], str] = json.dumps): + self._export_func = export_func + + def parse(self, path: str): + with open(path) as f: + definition = json.load(f) + + self.states = {n: State.deserialize(n, s) for n, s in definition["states"].items()} + self.root = self.states[definition["root"]] + + def generate(self) -> str: + states = list(self.states.values()) + payloads = [] + for s in states: + obj = self.encode_state(s) + if isinstance(obj, dict): + payloads.append(obj) + elif isinstance(obj, list): + payloads += obj + else: + raise ValueError("Unknown encoded state returned.") + + definition = self.postprocess(payloads) + + return self._export_func(definition) + + def postprocess(self, payloads: List[dict]) -> dict: + return payloads + + def encode_state(self, state: State) -> Union[dict, List[dict]]: + if isinstance(state, Task): + return self.encode_task(state) + elif isinstance(state, Switch): + return self.encode_switch(state) + elif isinstance(state, Map): + return self.encode_map(state) + elif isinstance(state, Repeat): + return self.encode_repeat(state) + elif isinstance(state, Loop): + return self.encode_loop(state) + elif isinstance(state, Parallel): + return self.encode_parallel(state) + else: + raise ValueError(f"Unknown state of type {type(state)}.") + + @abstractmethod + def encode_task(self, state: 
Task) -> Union[dict, List[dict]]: + pass + + @abstractmethod + def encode_switch(self, state: Switch) -> Union[dict, List[dict]]: + pass + + @abstractmethod + def encode_map(self, state: Map) -> Union[dict, List[dict]]: + pass + + @abstractmethod + def encode_parallel(self, state: Parallel) -> Union[dict, List[dict]]: + pass + + def encode_repeat(self, state: Repeat) -> Union[dict, List[dict]]: + tasks = [] + for i in range(state.count): + name = state.name if i == 0 else f"{state.name}_{i}" + next = state.next if i == state.count - 1 else f"{state.name}_{i+1}" + task = Task(name, state.func_name, next, None) + + res = self.encode_task(task) + tasks += res if isinstance(res, list) else [res] + + return tasks + + @abstractmethod + def encode_loop(self, state: Loop) -> Union[dict, List[dict]]: + pass From 9a6d2a863aa2a7f1a535a740056b5f0731bb71b5 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 1 Jun 2026 12:05:41 +0200 Subject: [PATCH 201/230] feat: Add Azure workflow benchmarks and testing suite --- sebs/regression.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/sebs/regression.py b/sebs/regression.py index 5487e9532..f0b780bc8 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -522,6 +522,72 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): return deployment_client +class AzureTestSequenceWorkflows( + unittest.TestCase, + metaclass=WorkflowTestSequenceMeta, + benchmarks=benchmarks_workflows, + architectures=architectures_azure, + deployments=deployments_azure, + deployment_name="azure", +): + """Test suite for workflow benchmarks on Azure Durable Functions.""" + + def get_deployment(self, benchmark_name, architecture, deployment_type): + """Get an Azure deployment client for workflow testing. 
+ + Args: + benchmark_name: Name of the workflow benchmark to deploy + architecture: Architecture to deploy on (x64) + deployment_type: Deployment type (package) + + Returns: + An initialized Azure deployment client + """ + deployment_name = "azure" + assert cloud_config, "Cloud configuration is required" + + with AzureTestSequenceWorkflows.lock: + if not AzureTestSequenceWorkflows.cfg: + AzureTestSequenceWorkflows.cfg = self.client.get_deployment_config( + cloud_config["deployment"], + logging_filename=os.path.join( + self.client.output_dir, + f"regression_wf_{deployment_name}_{benchmark_name}_{architecture}.log", + ), + ) + + needs_login = False + if not hasattr(AzureTestSequenceWorkflows, "cli"): + from sebs.azure.cli import AzureCLI + + AzureTestSequenceWorkflows.cli = AzureCLI( + self.client.config, self.client.docker_client + ) + needs_login = True + + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["system_variant"] = deployment_type + + f = f"regression_wf_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + deployment_config=AzureTestSequenceWorkflows.cfg, + ) + + deployment_client.system_resources.initialize_cli( + cli=AzureTestSequenceWorkflows.cli, login=needs_login + ) + deployment_client.initialize(resource_prefix=RESOURCE_PREFIX, quiet=LOGGING_REDACTED) + if LOGGING_REDACTED: + LOGGING_REDACTOR.set_resource_id(deployment_client.config.resources.resources_id) + LoggingBase.set_filtering_resource_id( + deployment_client.config.resources.resources_id + ) + return deployment_client + + class AWSTestSequencePython( unittest.TestCase, metaclass=TestSequenceMeta, @@ -1799,6 +1865,10 @@ def regression_suite( suite.addTest( unittest.defaultTestLoader.loadTestsFromTestCase(GCPTestSequenceWorkflows) ) + if "azure" in providers: + 
suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase(AzureTestSequenceWorkflows) + ) # Prepare the list of tests to run tests: List[unittest.TestCase] = [] @@ -1879,6 +1949,8 @@ def regression_suite( AzureTestSequencePython.cli.shutdown() if hasattr(AzureTestSequenceJava, "cli"): AzureTestSequenceJava.cli.shutdown() + if hasattr(AzureTestSequenceWorkflows, "cli"): + AzureTestSequenceWorkflows.cli.shutdown() # Return True if any test failed return not result.all_correct From 33968e9a37921af3b457f4c6a57e6663cf4cda92 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 1 Jun 2026 12:06:19 +0200 Subject: [PATCH 202/230] feat: Add selfish detour benchmark implementation in C and corresponding packaging script --- .../640.selfish-detour/package.sh | 11 --------- .../640.selfish-detour/python/package.sh | 24 +++++++++++++++++++ .../{ => python}/selfish-detour.c | 0 3 files changed, 24 insertions(+), 11 deletions(-) delete mode 100644 benchmarks/600.workflows/640.selfish-detour/package.sh create mode 100644 benchmarks/600.workflows/640.selfish-detour/python/package.sh rename benchmarks/600.workflows/640.selfish-detour/{ => python}/selfish-detour.c (100%) diff --git a/benchmarks/600.workflows/640.selfish-detour/package.sh b/benchmarks/600.workflows/640.selfish-detour/package.sh deleted file mode 100644 index c1145e436..000000000 --- a/benchmarks/600.workflows/640.selfish-detour/package.sh +++ /dev/null @@ -1,11 +0,0 @@ -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -CUR_DIR=$(pwd) -cd ${SCRIPT_DIR} - -for C_FILE in $(ls *.c) -do - cc -fPIC -shared -o ${C_FILE%%.*}.so ${C_FILE} - rm ${C_FILE} -done - -cd ${CUR_DIR} diff --git a/benchmarks/600.workflows/640.selfish-detour/python/package.sh b/benchmarks/600.workflows/640.selfish-detour/python/package.sh new file mode 100644 index 000000000..b001665b7 --- /dev/null +++ b/benchmarks/600.workflows/640.selfish-detour/python/package.sh @@ -0,0 +1,24 @@ +SCRIPT_DIR="$( cd 
"$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +CUR_DIR=$(pwd) +cd ${SCRIPT_DIR} + +for C_FILE in $(ls *.c 2>/dev/null) +do + SO_FILE="${C_FILE%%.*}.so" + if command -v cc &>/dev/null; then + if cc -fPIC -shared -o ${SO_FILE} ${C_FILE}; then + rm ${C_FILE} + else + echo "ERROR: Failed to compile ${C_FILE}" >&2 + exit 1 + fi + elif [ -f "${SO_FILE}" ]; then + # Pre-compiled .so is present; remove source to avoid confusion + rm ${C_FILE} + else + echo "ERROR: No C compiler found and no pre-compiled ${SO_FILE} available" >&2 + exit 1 + fi +done + +cd ${CUR_DIR} diff --git a/benchmarks/600.workflows/640.selfish-detour/selfish-detour.c b/benchmarks/600.workflows/640.selfish-detour/python/selfish-detour.c similarity index 100% rename from benchmarks/600.workflows/640.selfish-detour/selfish-detour.c rename to benchmarks/600.workflows/640.selfish-detour/python/selfish-detour.c From edfc59e142091a928deed912893e28ec253c398d Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Mon, 1 Jun 2026 12:07:47 +0200 Subject: [PATCH 203/230] feat: Update benchmark file handling to include additional file types and remove package script check --- sebs/benchmark.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 0ea165aa0..8a5d3c0aa 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -887,7 +887,7 @@ def copy_code(self, output_dir: str) -> None: output_dir: Destination directory for copied files """ FILES = { - Language.PYTHON: ["*.py", "requirements.txt*"], + Language.PYTHON: ["*.py", "*.c", "*.so", "requirements.txt*", "package.sh"], Language.NODEJS: ["*.js", "package.json"], Language.JAVA: [], Language.CPP: ["*.cpp", "*.hpp", "dependencies.json"], @@ -1380,15 +1380,6 @@ def ensure_image(name: str) -> None: # Create set of mounted volumes volumes = {os.path.abspath(output_dir): {"bind": "/mnt/function", "mode": "rw"}} - package_script = os.path.abspath( - 
os.path.join(self._benchmark_path, self.language_name, "package.sh") - ) - # does this benchmark has package.sh script? - if os.path.exists(package_script): - volumes[package_script] = { - "bind": "/mnt/function/package.sh", - "mode": "ro", - } # run Docker container to install packages PACKAGE_FILES = { From 1ebea93af07efd50202e51f290889addeba75fbf Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 21:49:18 +0200 Subject: [PATCH 204/230] refactor: Simplify function import and Redis connection handling in handler_workflow.py --- .../wrappers/azure/python/handler_workflow.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/benchmarks/wrappers/azure/python/handler_workflow.py b/benchmarks/wrappers/azure/python/handler_workflow.py index 143b5287f..bb6f9eeb0 100644 --- a/benchmarks/wrappers/azure/python/handler_workflow.py +++ b/benchmarks/wrappers/azure/python/handler_workflow.py @@ -36,11 +36,7 @@ def main(event, context: func.Context): # this only works on benchmarks where payload is dict event["payload"]["request-id"] = context.invocation_id - module_name = f"{func_name}.{func_name}" - module_path = f"{func_name}/{func_name}.py" - spec = importlib.util.spec_from_file_location(module_name, module_path) - function = importlib.util.module_from_spec(spec) - spec.loader.exec_module(function) + function = importlib.import_module(f".{func_name}", package=__package__) res = function.handler(event["payload"]) @@ -70,14 +66,17 @@ def main(event, context: func.Context): payload = json.dumps(payload) - redis = Redis(host={{REDIS_HOST}}, - port=6379, - decode_responses=True, - socket_connect_timeout=10, - password={{REDIS_PASSWORD}}) - - req_id = event["request_id"] - key = os.path.join(workflow_name, func_name, req_id, str(uuid.uuid4())[0:8]) - redis.set(key, payload) + redis_host = {{REDIS_HOST}} + redis_password = {{REDIS_PASSWORD}} + if redis_host: + redis = Redis(host=redis_host, + port=6379, + decode_responses=True, + 
socket_connect_timeout=10, + password=redis_password) + + req_id = event["request_id"] + key = os.path.join(workflow_name, func_name, req_id, str(uuid.uuid4())[0:8]) + redis.set(key, payload) return res From 0dd17a3d1a4f55b292deefcbfc207a8f65d43b6f Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 22:47:34 +0200 Subject: [PATCH 205/230] refactor: Update import handling and streamline request ID assignment in Azure workflow files --- .../wrappers/azure/python/handler_workflow.py | 9 ++--- .../wrappers/azure/python/main_workflow.py | 35 ++++++------------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/benchmarks/wrappers/azure/python/handler_workflow.py b/benchmarks/wrappers/azure/python/handler_workflow.py index bb6f9eeb0..0bb2d6d70 100644 --- a/benchmarks/wrappers/azure/python/handler_workflow.py +++ b/benchmarks/wrappers/azure/python/handler_workflow.py @@ -3,6 +3,7 @@ import os import uuid import importlib +import importlib.util import logging @@ -31,12 +32,12 @@ def main(event, context: func.Context): workflow_name = os.getenv("APPSETTING_WEBSITE_SITE_NAME") func_name = os.path.basename(os.path.dirname(__file__)) - # FIXME: sort out workflow and function request id - #event["request-id"] = context.invocation_id - # this only works on benchmarks where payload is dict event["payload"]["request-id"] = context.invocation_id - function = importlib.import_module(f".{func_name}", package=__package__) + module_path = os.path.join(os.path.dirname(__file__), f"{func_name}.py") + spec = importlib.util.spec_from_file_location(func_name, module_path) + function = importlib.util.module_from_spec(spec) + spec.loader.exec_module(function) res = function.handler(event["payload"]) diff --git a/benchmarks/wrappers/azure/python/main_workflow.py b/benchmarks/wrappers/azure/python/main_workflow.py index 0c4e55a0c..64868a919 100644 --- a/benchmarks/wrappers/azure/python/main_workflow.py +++ b/benchmarks/wrappers/azure/python/main_workflow.py @@ -6,7 +6,6 @@ 
import azure.functions as func import azure.durable_functions as df -import logging def probe_cold_start(): is_cold = False @@ -24,41 +23,29 @@ def probe_cold_start(): async def main(req: func.HttpRequest, starter: str, context: func.Context) -> func.HttpResponse: - event = req.get_json() - req_id = event["request_id"] - logging.info("complete event: ") - logging.info(event) - logging.info("req_id in main: ") - logging.info(req_id) - - if 'connection_string' in event: - logging.info("setting connection string.") - os.environ['STORAGE_CONNECTION_STRING'] = event['connection_string'] + req_id = context.invocation_id + event = {"payload": req.get_json(), "request_id": req_id} begin = datetime.datetime.now() client = df.DurableOrchestrationClient(starter) instance_id = await client.start_new("run_workflow", None, event) - res = client.create_check_status_response(req, instance_id) - #res = await client.wait_for_completion_or_create_check_status_response(req, instance_id, 1000000) + res = await client.wait_for_completion_or_create_check_status_response( + req, instance_id, timeout_in_milliseconds=600000 + ) end = datetime.datetime.now() is_cold, container_id = probe_cold_start() - #status = await client.get_status(instance_id) - #code = 500 if str(status.runtime_status) == "Failed" else 200 - - #try: - # result = json.loads(res.get_body()) - #except json.decoder.JSONDecodeError: - # result = res.get_body().decode() - body = json.loads(res.get_body()) - logging.info("body") - logging.info(body) + try: + result = json.loads(res.get_body()) + except json.decoder.JSONDecodeError: + result = res.get_body().decode() + body = { - **body, + "result": result, "begin": begin.strftime("%s.%f"), "end": end.strftime("%s.%f"), "is_cold": is_cold, From 9b8b3e4d0da10b93a38b0c816fb1e28e83efd007 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 22:49:58 +0200 Subject: [PATCH 206/230] refactor: Improve Redis connection handling and key generation in workflow handler --- 
.../wrappers/azure/python/run_workflow.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/benchmarks/wrappers/azure/python/run_workflow.py b/benchmarks/wrappers/azure/python/run_workflow.py index d5e111408..868909e4a 100644 --- a/benchmarks/wrappers/azure/python/run_workflow.py +++ b/benchmarks/wrappers/azure/python/run_workflow.py @@ -269,14 +269,17 @@ def handler(context: df.DurableOrchestrationContext): payload = json.dumps(payload) - redis = Redis(host={{REDIS_HOST}}, - port=6379, - decode_responses=True, - socket_connect_timeout=10, - password={{REDIS_PASSWORD}}) - - key = os.path.join(workflow_name, func_name, request_id, str(uuid.uuid4())[0:8]) - redis.set(key, payload) + redis_host = {{REDIS_HOST}} + redis_password = {{REDIS_PASSWORD}} + if redis_host: + redis = Redis(host=redis_host, + port=6379, + decode_responses=True, + socket_connect_timeout=10, + password=redis_password) + + key = os.path.join(workflow_name, func_name, request_id, str(uuid.uuid4())[0:8]) + redis.set(key, payload) return res From 7e7ce4685c5df770170d634efabe05e008d79804 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 22:52:37 +0200 Subject: [PATCH 207/230] feat: Add initialization for NoSQL and storage instances based on environment variables --- benchmarks/wrappers/azure/python/handler_workflow.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/benchmarks/wrappers/azure/python/handler_workflow.py b/benchmarks/wrappers/azure/python/handler_workflow.py index 0bb2d6d70..5de2d74ab 100644 --- a/benchmarks/wrappers/azure/python/handler_workflow.py +++ b/benchmarks/wrappers/azure/python/handler_workflow.py @@ -10,6 +10,18 @@ import azure.functions as func from redis import Redis +if 'NOSQL_STORAGE_DATABASE' in os.environ: + from . 
import nosql + nosql.nosql.get_instance( + os.environ['NOSQL_STORAGE_DATABASE'], + os.environ['NOSQL_STORAGE_URL'], + os.environ['NOSQL_STORAGE_CREDS'] + ) + +if 'STORAGE_CONNECTION_STRING' in os.environ: + from . import storage + storage.storage.get_instance(os.environ['STORAGE_CONNECTION_STRING']) + def probe_cold_start(): is_cold = False fname = os.path.join("/tmp", "cold_run") From 0f872f8c16f1e1eb7532ff0155c6a98b52078495 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 23:08:50 +0200 Subject: [PATCH 208/230] refactor: Enhance upload functionality to support optional unique naming for files --- benchmarks/wrappers/azure/python/storage.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/benchmarks/wrappers/azure/python/storage.py b/benchmarks/wrappers/azure/python/storage.py index 345be43c0..e22a2bf7c 100644 --- a/benchmarks/wrappers/azure/python/storage.py +++ b/benchmarks/wrappers/azure/python/storage.py @@ -22,14 +22,14 @@ def unique_name(name): random=str(uuid.uuid4()).split('-')[0] ) - def upload(self, container, file, filepath): + def upload(self, container, file, filepath, unique_name=True): with open(filepath, 'rb') as data: - return self.upload_stream(container, file, data) + return self.upload_stream(container, file, data, unique_name=unique_name) def download(self, container, file, filepath): with open(filepath, 'wb') as download_file: download_file.write( self.download_stream(container, file) ) - + def list_directory(self, container, prefix): client = self.client.get_container_client(container=container) objects = client.list_blobs(name_starts_with=prefix) @@ -43,14 +43,11 @@ def download_directory(self, container, prefix, path): path_to_file = os.path.dirname(file_name) os.makedirs(os.path.join(path, path_to_file), exist_ok=True) self.download(container, file_name, os.path.join(path, file_name)) - - def upload_stream(self, container, file, data): - key_name = storage.unique_name(file) - client = 
self.client.get_blob_client( - container=container, - blob=key_name - ) - client.upload_blob(data) + + def upload_stream(self, container, file, data, unique_name=True): + key_name = storage.unique_name(file) if unique_name else file + client = self.client.get_blob_client(container=container, blob=key_name) + client.upload_blob(data, overwrite=not unique_name) return key_name def download_stream(self, container, file): From eb7da3cf79bc159ff6f1b61562cefd4b9ff40ebe Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 23:10:30 +0200 Subject: [PATCH 209/230] refactor: Update deployment configuration to include additional handler files and package dependencies --- configs/systems.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/configs/systems.json b/configs/systems.json index 147576b38..cb1f3808b 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -212,10 +212,19 @@ "deployment": { "files": [ "handler.py", + "handler_workflow.py", + "main_workflow.py", + "run_workflow.py", + "run_subworkflow.py", + "fsm.py", "storage.py", "nosql.py" ], - "packages": [], + "packages": [ + "azure-functions", + "azure-functions-durable==1.1.6", + "redis" + ], "module_packages": { "storage": [ "azure-storage-blob" From bcccf45d2d1e54fba2670453be8bc840bae10f45 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 23:28:03 +0200 Subject: [PATCH 210/230] feat: Implement Azure Durable Functions workflow packaging and deployment --- sebs/azure/azure.py | 363 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 341 insertions(+), 22 deletions(-) diff --git a/sebs/azure/azure.py b/sebs/azure/azure.py index 6fbb7906e..635833395 100644 --- a/sebs/azure/azure.py +++ b/sebs/azure/azure.py @@ -32,6 +32,7 @@ """ import datetime +import glob import json import random import re @@ -46,7 +47,7 @@ from sebs.azure.blob_storage import BlobStorage from sebs.azure.cli import AzureCLI from sebs.azure.cosmosdb import CosmosDB -from 
sebs.azure.function import AzureFunction +from sebs.azure.function import AzureFunction, AzureWorkflow from sebs.azure.config import AzureConfig, AzureResources from sebs.azure.system_resources import AzureSystemResources from sebs.azure.triggers import AzureTrigger, HTTPTrigger @@ -55,8 +56,8 @@ from sebs.cache import Cache from sebs.config import SeBSConfig from sebs.experiments.config import SystemVariant -from sebs.utils import LoggingHandlers, execute -from sebs.faas.function import Function, FunctionConfig, ExecutionResult +from sebs.utils import LoggingHandlers, execute, replace_string_in_file +from sebs.faas.function import Function, FunctionConfig, ExecutionResult, Workflow from sebs.faas.system import System from sebs.faas.config import Resources from sebs.sebs_types import Language @@ -123,6 +124,15 @@ def function_type() -> Type[Function]: """ return AzureFunction + @staticmethod + def workflow_type() -> Type[Workflow]: + """Get the workflow type for Azure. + + Returns: + AzureWorkflow class type. + """ + return AzureWorkflow + @property def cli_instance(self) -> AzureCLI: """Get Azure CLI instance. @@ -225,11 +235,17 @@ def package_code( """Package function code for Azure Functions deployment. Creates the proper directory structure and configuration files - required for Azure Functions deployment. The structure includes: + required for Azure Functions deployment. For regular functions: - handler/ directory with source files and Azure wrappers - function.json with trigger and binding configuration - - host.json with runtime configuration - - requirements.txt or package.json with dependencies + + For workflows (Durable Functions): + - main/ directory with HTTP trigger + durableClient binding + - run_workflow/ directory with orchestration trigger + - run_subworkflow/ directory with orchestration trigger + - One directory per activity function with activityTrigger binding + + Both include host.json with runtime configuration. 
Args: directory: Directory containing the function code @@ -238,14 +254,22 @@ def package_code( architecture: Target architecture (currently unused) benchmark: Name of the benchmark is_cached: Whether the package is from cache - system_variant: Selected deployment variant Returns: - Tuple of (directory_path, code_size_bytes, container_uri) + Tuple of (directory_path, code_size_bytes) """ + is_workflow = os.path.exists(os.path.join(directory, "definition.json")) + + if is_workflow: + return self._package_code_workflow(directory, language, benchmark) + else: + return self._package_code_function(directory, language, benchmark) + + def _package_code_function( + self, directory: str, language: Language, benchmark: str + ) -> Tuple[str, float]: + """Package a regular (non-workflow) function for Azure.""" - # In previous step we ran a Docker container which installed packages - # Python packages are in .python_packages because this is expected by Azure EXEC_FILES = { Language.PYTHON: "handler.py", Language.NODEJS: "handler.js", @@ -256,6 +280,12 @@ def package_code( Language.NODEJS: ["package.json", "node_modules"], Language.JAVA: ["lib", "src", "pom.xml", "target", ".mvn", "mvnw", "mvnw.cmd"], } + WORKFLOW_FILES = [ + "main_workflow.py", + "run_workflow.py", + "run_subworkflow.py", + "fsm.py", + ] package_config = CONFIG_FILES[language] handler_dir = os.path.join(directory, "handler") @@ -265,18 +295,23 @@ def package_code( if language == Language.JAVA: lib_dir = os.path.join(directory, "lib") os.makedirs(lib_dir, exist_ok=True) - # Move function.jar to lib directory if os.path.exists(os.path.join(directory, "function.jar")): shutil.move( os.path.join(directory, "function.jar"), os.path.join(lib_dir, "function.jar") ) - # move all files to 'handler' except package config + # move all files to 'handler' except package config and workflow files for f in os.listdir(directory): - if f not in package_config: + if f not in package_config and f not in WORKFLOW_FILES: 
source_file = os.path.join(directory, f) shutil.move(source_file, handler_dir) + # Remove workflow files that shouldn't be deployed + for wf_file in WORKFLOW_FILES: + wf_path = os.path.join(directory, wf_file) + if os.path.exists(wf_path): + os.remove(wf_path) + # For Java, clean up build artifacts that we don't want to deploy if language == Language.JAVA: for artifact in ["src", "pom.xml", "target", ".mvn", "mvnw", "mvnw.cmd"]: @@ -288,11 +323,7 @@ def package_code( os.remove(artifact_path) # generate function.json - # TODO: extension to other triggers than HTTP if language == Language.JAVA: - # Java Azure Functions - For annotation-based functions, function.json - # should include scriptFile and entryPoint - # The @FunctionName annotation determines the function name default_function_json = { "scriptFile": "../lib/function.jar", "entryPoint": "org.serverlessbench.Handler.handleRequest", @@ -338,6 +369,186 @@ def package_code( execute("zip -qu -r9 {}.zip * .".format(benchmark), shell=True, cwd=directory) return directory, code_size + def _package_code_workflow( + self, directory: str, language: Language, benchmark: str + ) -> Tuple[str, float]: + """Package a Durable Functions workflow for Azure. 
+ + Creates the directory structure expected by Azure Durable Functions: + - main/ — HTTP trigger entry point (durableClient binding) + - run_workflow/ — orchestrator function + - run_subworkflow/ — sub-orchestrator for parallel map tasks + - {activity_name}/ — one directory per activity function + """ + FILES = {"python": "*.py", "nodejs": "*.js"} + CONFIG_FILES = { + "python": ["requirements.txt", ".python_packages"], + "nodejs": ["package.json", "node_modules"], + } + WRAPPER_FILES = { + "python": ["handler.py", "storage.py", "nosql.py", "fsm.py"], + "nodejs": ["handler.js", "storage.js"], + } + file_type = FILES[language] + package_config = CONFIG_FILES[language] + wrapper_files = WRAPPER_FILES[language] + + # Rename main_workflow.py to main.py + main_path = os.path.join(directory, "main_workflow.py") + os.rename(main_path, os.path.join(directory, "main.py")) + + # Copy definition.json into the package + # It's loaded at runtime by the orchestrator + definition_src = None + for parent in [directory]: + candidate = os.path.join(parent, "definition.json") + if os.path.exists(candidate): + definition_src = candidate + break + if definition_src is None: + raise ValueError(f"No workflow definition found in {directory}") + + # Bindings for different function types + main_bindings = [ + { + "name": "req", + "type": "httpTrigger", + "direction": "in", + "authLevel": "anonymous", + "methods": ["get", "post"], + }, + {"name": "starter", "type": "durableClient", "direction": "in"}, + {"name": "$return", "type": "http", "direction": "out"}, + ] + activity_bindings = [ + {"name": "event", "type": "activityTrigger", "direction": "in"}, + ] + orchestrator_bindings = [ + {"name": "context", "type": "orchestrationTrigger", "direction": "in"} + ] + + bindings = { + "main": main_bindings, + "run_workflow": orchestrator_bindings, + "run_subworkflow": orchestrator_bindings, + } + + # Move each .py file into its own directory (Azure Functions convention) + func_dirs = [] + for 
file_path in glob.glob(os.path.join(directory, file_type)): + file = os.path.basename(file_path) + + if file in package_config or file in wrapper_files: + continue + + name, ext = os.path.splitext(file) + func_dir = os.path.join(directory, name) + func_dirs.append(func_dir) + + os.makedirs(func_dir) + shutil.move(os.path.join(directory, file), os.path.join(func_dir, file)) + + # Generate function.json for each function directory + script_file = file if name in bindings else "handler.py" + payload = { + "bindings": bindings.get(name, activity_bindings), + "scriptFile": script_file, + "disabled": False, + } + json.dump( + payload, + open(os.path.join(func_dir, "function.json"), "w"), + indent=2, + ) + + # Copy wrapper files to each activity function directory + for wrapper_file in wrapper_files: + src_path = os.path.join(directory, wrapper_file) + if not os.path.exists(src_path): + continue + for func_dir in func_dirs: + dst_path = os.path.join(func_dir, wrapper_file) + shutil.copyfile(src_path, dst_path) + os.remove(src_path) + + # Create __init__.py in each function directory so relative imports work + for func_dir in func_dirs: + init_path = os.path.join(func_dir, "__init__.py") + if not os.path.exists(init_path): + open(init_path, "w").close() + + # Substitute Redis placeholders in handler and orchestrator files + redis_host = self.config.redis_host + redis_password = self.config.redis_password + redis_host_val = f'"{redis_host}"' if redis_host else "None" + redis_password_val = f'"{redis_password}"' if redis_password else "None" + + for func_dir in func_dirs: + handler_path = os.path.join(func_dir, WRAPPER_FILES[language][0]) + if os.path.exists(handler_path): + replace_string_in_file(handler_path, "{{REDIS_HOST}}", redis_host_val) + replace_string_in_file(handler_path, "{{REDIS_PASSWORD}}", redis_password_val) + + run_workflow_path = os.path.join(directory, "run_workflow", "run_workflow.py") + if os.path.exists(run_workflow_path): + 
replace_string_in_file(run_workflow_path, "{{REDIS_HOST}}", redis_host_val) + replace_string_in_file(run_workflow_path, "{{REDIS_PASSWORD}}", redis_password_val) + + # generate host.json + host_json = { + "version": "2.0", + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[2.*, 3.0.0)", + }, + } + json.dump(host_json, open(os.path.join(directory, "host.json"), "w"), indent=2) + + code_size = Benchmark.directory_size(directory) + execute("zip -qu -r9 {}.zip * .".format(benchmark), shell=True, cwd=directory) + return directory, code_size + + def _wait_for_function_ready(self, url: str, timeout: int = 300, interval: int = 10) -> None: + """Poll the function URL until it returns a non-empty HTTP response after publish. + + Azure Functions can take up to several minutes to become available + after a fresh publish. This method polls until the app responds. + + Args: + url: The function HTTP trigger URL to probe + timeout: Maximum seconds to wait (default 300) + interval: Seconds between probe attempts (default 10) + """ + import pycurl + from io import BytesIO + + self.logging.info(f"Waiting for function app to be ready at {url}...") + deadline = time.time() + timeout + probe_payload = json.dumps({"request_id": "warmup", "payload": {}}) + + while time.time() < deadline: + c = pycurl.Curl() + c.setopt(pycurl.URL, url) + c.setopt(pycurl.POST, 1) + c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/json"]) + c.setopt(pycurl.POSTFIELDS, probe_payload) + c.setopt(pycurl.TIMEOUT, 30) + data = BytesIO() + c.setopt(pycurl.WRITEFUNCTION, data.write) + try: + c.perform() + if len(data.getvalue()) > 0: + self.logging.info("Function app is ready.") + return + except Exception: + pass + finally: + c.close() + self.logging.info(f"Function app not ready yet, retrying in {interval}s...") + time.sleep(interval) + + self.logging.warning(f"Function app did not become ready within {timeout}s, proceeding anyway.") + def _execute_cli_with_retry( 
self, cmd: str, @@ -538,6 +749,8 @@ def update_function( container_dest = self._mount_function_code(code_package) function_url = self.publish_function(function, code_package, container_dest, True) + self._wait_for_function_ready(function_url) + # Avoid duplication of HTTP trigger found_trigger = False for trigger in function.triggers_all(): @@ -878,6 +1091,105 @@ def cached_function(self, function: Function) -> None: azure_trigger.logging_handlers = self.logging_handlers azure_trigger.data_storage_account = data_storage_account + def create_workflow(self, code_package: Benchmark, workflow_name: str) -> AzureWorkflow: + """Create a new Azure Durable Functions workflow. + + Deploys the workflow as a single Function App containing the + orchestrator, sub-orchestrator, and all activity functions. + + Args: + code_package: Benchmark code package with workflow definition + workflow_name: Name for the workflow Function App + + Returns: + AzureWorkflow instance representing the deployed workflow. 
+ """ + language = code_package.language_name + language_runtime = self._normalize_runtime_version(language, code_package.language_version) + language_runtime = str(language_runtime) + resource_group = self.config.resources.resource_group(self.cli_instance) + region = self.config.region + function_cfg = FunctionConfig.from_benchmark(code_package) + + config = { + "resource_group": resource_group, + "func_name": workflow_name, + "region": region, + "runtime": self.AZURE_RUNTIMES[language], + "runtime_version": language_runtime, + } + + # Check if function app already exists + function_storage_account: Optional[AzureResources.Storage] = None + try: + ret = self.cli_instance.execute( + ( + " az functionapp config appsettings list " + " --resource-group {resource_group} " + " --name {func_name} " + ).format(**config) + ) + for setting in json.loads(ret.decode()): + if setting["name"] == "AzureWebJobsStorage": + connection_string = setting["value"] + elems = [z for y in connection_string.split(";") for z in y.split("=")] + account_name = elems[elems.index("AccountName") + 1] + function_storage_account = AzureResources.Storage.from_cache( + account_name, connection_string + ) + assert function_storage_account is not None + self.logging.info("Azure: Selected existing workflow app {}".format(workflow_name)) + except RuntimeError: + function_storage_account = self.config.resources.add_storage_account(self.cli_instance) + config["storage_account"] = function_storage_account.account_name + while True: + try: + self.cli_instance.execute( + ( + " az functionapp create --resource-group {resource_group} " + " --os-type Linux --consumption-plan-location {region} " + " --runtime {runtime} --runtime-version {runtime_version} " + " --name {func_name} --storage-account {storage_account}" + " --functions-version 4 " + ).format(**config) + ) + self.logging.info("Azure: Created workflow app {}".format(workflow_name)) + break + except RuntimeError as e: + if "another operation is in 
progress" in str(e): + self.logging.info( + f"Repeat {workflow_name} creation, another operation in progress" + ) + else: + raise e from None + + workflow = AzureWorkflow( + name=workflow_name, + benchmark=code_package.benchmark, + code_hash=code_package.hash, + function_storage=function_storage_account, + cfg=function_cfg, + ) + + self.update_function(workflow, code_package, code_package.system_variant, None) + + self.cache_client.add_function( + deployment_name=self.name(), + language_name=language, + code_package=code_package, + function=workflow, + ) + return workflow + + def update_workflow(self, workflow: Workflow, code_package: Benchmark) -> None: + """Update an existing Azure Durable Functions workflow. + + Args: + workflow: Workflow instance to update + code_package: New benchmark code package + """ + self.update_function(workflow, code_package, code_package.system_variant, None) + def download_metrics( self, function_name: str, @@ -1000,21 +1312,28 @@ def enforce_cold_start(self, functions: List[Function], code_package: Benchmark) self.cold_start_counter += 1 for func in functions: self._enforce_cold_start(func, code_package) - import time - time.sleep(20) def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) -> Trigger: """Create trigger for Azure Function. - Currently not implemented as HTTP triggers are automatically - created for each function during deployment. + HTTP triggers are automatically created during deployment. + For workflows, LIBRARY trigger requests are satisfied by returning + the existing HTTP trigger, since Azure Durable Functions uses HTTP. Args: function: Function to create trigger for trigger_type: Type of trigger to create + Returns: + The HTTP trigger for this function. + Raises: - NotImplementedError: Trigger creation is not supported. + NotImplementedError: If no HTTP trigger exists on the function. 
""" + from sebs.azure.triggers import HTTPTrigger + + http_triggers = function.triggers(Trigger.TriggerType.HTTP) + if http_triggers: + return http_triggers[0] raise NotImplementedError() From 56d534e08fed36cfc924d6de8eff53c8dc40c7fd Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 23:29:18 +0200 Subject: [PATCH 211/230] feat: Add Redis configuration properties for workflow measurements --- sebs/azure/config.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sebs/azure/config.py b/sebs/azure/config.py index 2c5fb5e60..cd5c19e71 100644 --- a/sebs/azure/config.py +++ b/sebs/azure/config.py @@ -345,6 +345,21 @@ def __init__( self._storage_accounts = storage_accounts or [] self._data_storage_account = data_storage_account self._cosmosdb_account = cosmosdb_account + self._redis: Optional[Dict] = None + + @property + def redis_host(self) -> Optional[str]: + """Get Redis host for workflow measurements.""" + if self._redis: + return self._redis.get("host") + return None + + @property + def redis_password(self) -> Optional[str]: + """Get Redis password for workflow measurements.""" + if self._redis: + return self._redis.get("password") + return None def set_region(self, region: str) -> None: """Set the Azure region for resource allocation. @@ -714,6 +729,8 @@ def initialize(res: Resources, dct: dict) -> None: if "cosmosdb_account" in dct: ret._cosmosdb_account = CosmosDBAccount.deserialize(dct["cosmosdb_account"]) + ret._redis = dct.get("redis") + def serialize(self) -> dict: """Serialize resources to dictionary. 
@@ -804,6 +821,16 @@ def resources(self) -> AzureResources: """ return self._resources + @property + def redis_host(self) -> Optional[str]: + """Get Redis host for workflow measurements.""" + return self._resources.redis_host + + @property + def redis_password(self) -> Optional[str]: + """Get Redis password for workflow measurements.""" + return self._resources.redis_password + @staticmethod def initialize(cfg: Config, dct: dict) -> None: """Initialize configuration from dictionary data. From a2cb740b44df0c77b929cbd053117dd43e1dc259 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 23:30:00 +0200 Subject: [PATCH 212/230] fix: Update resource group initialization to handle missing key gracefully --- sebs/azure/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sebs/azure/config.py b/sebs/azure/config.py index cd5c19e71..54284c275 100644 --- a/sebs/azure/config.py +++ b/sebs/azure/config.py @@ -713,7 +713,7 @@ def initialize(res: Resources, dct: dict) -> None: ret = cast(AzureResources, res) super(AzureResources, AzureResources).initialize(ret, dct) - ret._resource_group = dct["resource_group"] + ret._resource_group = dct.get("resource_group") if "storage_accounts" in dct: ret._storage_accounts = [ AzureResources.Storage.deserialize(x) for x in dct["storage_accounts"] From 5e33a6b8e32d4c7add683cd4deb564fb8b0423e7 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 3 Jun 2026 23:30:20 +0200 Subject: [PATCH 213/230] feat: Add AzureWorkflow class for Durable Functions implementation --- sebs/azure/function.py | 52 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/sebs/azure/function.py b/sebs/azure/function.py index feaf9b922..bb2011dc6 100644 --- a/sebs/azure/function.py +++ b/sebs/azure/function.py @@ -6,7 +6,7 @@ """ from sebs.azure.config import AzureResources -from sebs.faas.function import Function, FunctionConfig +from sebs.faas.function import Function, FunctionConfig, Workflow 
class AzureFunction(Function): @@ -78,3 +78,53 @@ def deserialize(cached_config: dict) -> Function: assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) ret.add_trigger(trigger_type.deserialize(trigger)) return ret + + +class AzureWorkflow(Workflow): + """Azure Durable Functions workflow implementation.""" + + def __init__( + self, + name: str, + benchmark: str, + code_hash: str, + function_storage: AzureResources.Storage, + cfg: FunctionConfig, + ) -> None: + """Initialize Azure Workflow. + + Args: + name: Name of the Azure Function App hosting the workflow + benchmark: Name of the benchmark this workflow implements + code_hash: Hash of the workflow code for caching + function_storage: Azure Storage account for function code + cfg: Function configuration with memory, timeout, etc. + """ + super().__init__(benchmark, name, code_hash, cfg) + self.function_storage = function_storage + + def serialize(self) -> dict: + """Serialize workflow to dictionary.""" + return { + **super().serialize(), + "function_storage": self.function_storage.serialize(), + } + + @staticmethod + def deserialize(cached_config: dict) -> "AzureWorkflow": + """Deserialize workflow from cached configuration.""" + from sebs.azure.triggers import HTTPTrigger + + cfg = FunctionConfig.deserialize(cached_config["config"]) + ret = AzureWorkflow( + cached_config["name"], + cached_config["benchmark"], + cached_config["hash"], + AzureResources.Storage.deserialize(cached_config["function_storage"]), + cfg, + ) + for trigger in cached_config["triggers"]: + trigger_type = {"HTTP": HTTPTrigger}.get(trigger["type"]) + assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) + ret.add_trigger(trigger_type.deserialize(trigger)) + return ret From 53a73233648946fbab22f6a0454f02a2822c4638 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Sun, 7 Jun 2026 13:22:56 +0200 Subject: [PATCH 214/230] feat: Update requirements and enhance AWS storage functionality with new 
download method --- .../6100.1000-genome/python/requirements.txt | 3 ++- .../python/requirements.txt | 3 ++- .../600.workflows/650.vid/python/requirements.txt | 2 +- .../600.workflows/690.ml/python/requirements.txt | 2 +- benchmarks/wrappers/aws/python/storage.py | 10 ++++++++-- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt b/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt index c357805d6..9be498baa 100644 --- a/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt +++ b/benchmarks/600.workflows/6100.1000-genome/python/requirements.txt @@ -1,3 +1,4 @@ #numpy==1.17 -numpy==1.25 #1.16 works on Azure, but not AWS +numpy==1.25 #1.25 triggers contourpy source build via matplotlib on the build image +contourpy==1.2.1 matplotlib diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt b/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt index 5453e2d48..3907a2758 100644 --- a/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/python/requirements.txt @@ -1,3 +1,4 @@ #numpy==1.17 -numpy==1.18 #1.16 works on Azure, but not AWS +numpy==1.25 #1.16 works on Azure, but not AWS +contourpy==1.2.1 matplotlib diff --git a/benchmarks/600.workflows/650.vid/python/requirements.txt b/benchmarks/600.workflows/650.vid/python/requirements.txt index 84cf1d278..03ac2d798 100644 --- a/benchmarks/600.workflows/650.vid/python/requirements.txt +++ b/benchmarks/600.workflows/650.vid/python/requirements.txt @@ -1,2 +1,2 @@ -numpy<2 +numpy==1.26.4 opencv-python-headless diff --git a/benchmarks/600.workflows/690.ml/python/requirements.txt b/benchmarks/600.workflows/690.ml/python/requirements.txt index 1ff24f2df..589cdb5fa 100644 --- a/benchmarks/600.workflows/690.ml/python/requirements.txt +++ 
b/benchmarks/600.workflows/690.ml/python/requirements.txt @@ -1,3 +1,3 @@ -numpy<2 +numpy==1.26.4 scipy==1.10.0 scikit-learn==1.5.2 diff --git a/benchmarks/wrappers/aws/python/storage.py b/benchmarks/wrappers/aws/python/storage.py index d11ec2c7e..e2915a2f2 100644 --- a/benchmarks/wrappers/aws/python/storage.py +++ b/benchmarks/wrappers/aws/python/storage.py @@ -22,14 +22,20 @@ def unique_name(name): random=str(uuid.uuid4()).split('-')[0] ) - def upload(self, bucket, file, filepath): - key_name = storage.unique_name(file) + def upload(self, bucket, file, filepath, unique_name=True): + key_name = storage.unique_name(file) if unique_name else file self.client.upload_file(filepath, bucket, key_name) return key_name def download(self, bucket, file, filepath): self.client.download_file(bucket, file, filepath) + def download_within_range(self, bucket, file, start_bytes, end_bytes): + response = self.client.get_object( + Bucket=bucket, Key=file, Range=f"bytes={start_bytes}-{end_bytes}" + ) + return response["Body"].read().decode("utf-8") + def list_directory(self, bucket, prefix): objects = self.client.list_objects_v2(Bucket=bucket, Prefix=prefix) if 'Contents' not in objects: From 17eec1e3b3ff02d7ecb868bf0c5b42142cc6953a Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Sun, 7 Jun 2026 15:06:19 +0200 Subject: [PATCH 215/230] feat: Update workflow definitions and enhance state management with common parameters --- .../definition.json | 2 +- .../640.selfish-detour/config.json | 4 +- .../wrappers/aws/python/handler_workflow.py | 7 +-- sebs/aws/generator.py | 43 +++++++++++++------ sebs/aws/triggers.py | 2 +- sebs/faas/fsm.py | 31 ++++++++++--- 6 files changed, 61 insertions(+), 28 deletions(-) diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json b/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json index 1f5852d22..d89586cc7 100644 --- a/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json +++ 
b/benchmarks/600.workflows/6101.1000-genome-individuals/definition.json @@ -5,7 +5,7 @@ "type": "map", "root": "individuals", "array": "blob", - "common_params": "bucket,columns,columns_bucket,populations,sifting_input,individuals_file", + "common_params": "benchmark_bucket,bucket,columns,columns_bucket,populations,sifting_input,individuals_file", "states": { "individuals": { "type": "task", diff --git a/benchmarks/600.workflows/640.selfish-detour/config.json b/benchmarks/600.workflows/640.selfish-detour/config.json index 8ff6eec59..e57b5e8d1 100644 --- a/benchmarks/600.workflows/640.selfish-detour/config.json +++ b/benchmarks/600.workflows/640.selfish-detour/config.json @@ -1,6 +1,6 @@ { - "timeout": 120, - "memory": 128, + "timeout": 300, + "memory": 1024, "languages": ["python"], "modules": [] } diff --git a/benchmarks/wrappers/aws/python/handler_workflow.py b/benchmarks/wrappers/aws/python/handler_workflow.py index 43d41a1e3..60cb2cdfb 100644 --- a/benchmarks/wrappers/aws/python/handler_workflow.py +++ b/benchmarks/wrappers/aws/python/handler_workflow.py @@ -34,11 +34,8 @@ def handler(event, context): req_id = context.aws_request_id - if isinstance(event, dict) and "payload" in event: - func_payload = event["payload"] - request_id = event.get("request_id", req_id) - elif isinstance(event, dict): - request_id = event.pop("__request_id", req_id) + if isinstance(event, dict): + request_id = event.pop("__sebs_request_id", event.pop("__request_id", req_id)) func_payload = event else: func_payload = event diff --git a/sebs/aws/generator.py b/sebs/aws/generator.py index 81ae093db..28c5f1c5e 100644 --- a/sebs/aws/generator.py +++ b/sebs/aws/generator.py @@ -2,7 +2,7 @@ import numbers import uuid -from sebs.faas.fsm import Generator, State, Task, Switch, Map, Repeat, Loop +from sebs.faas.fsm import Generator, Task, Switch, Map, Repeat, Loop class SFNGenerator(Generator): @@ -16,6 +16,7 @@ def _nameless(p: dict) -> dict: return p state_payloads = {p["Name"]: _nameless(p) 
for p in payloads} + definition = { "Comment": "SeBS auto-generated benchmark", "StartAt": self.root.name, @@ -92,6 +93,12 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: }, } + if state.common_params: + item_selector: Dict[str, str] = {"array_element.$": "$$.Map.Item.Value"} + for p in state.common_params: + item_selector[f"{p}.$"] = f"$.{p}" + payload["ItemSelector"] = item_selector + payload["ResultPath"] = "$." + state.array if state.next: @@ -102,22 +109,30 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: return payload def encode_parallel(self, state) -> Union[dict, List[dict]]: + from sebs.faas.fsm import State as FsmState + + branches = [] + for branch in state.branches: + sub_states = {n: FsmState.deserialize(n, s) for n, s in branch.states.items()} + branch_states = {} + for sub_state in sub_states.values(): + obj = self.encode_state(sub_state) + objs = [obj] if isinstance(obj, dict) else obj + for o in objs: + name = o["Name"] + branch_states[name] = {k: v for k, v in o.items() if k != "Name"} + branches.append({"StartAt": branch.root, "States": branch_states}) + payload: Dict[str, Any] = { "Name": state.name, "Type": "Parallel", - "Branches": [ - { - "StartAt": f"func_{i}", - "States": { - f"func_{i}": { - "Type": "Task", - "Resource": self._func_arns[fn], - "End": True, - } - }, - } - for i, fn in enumerate(state.funcs) - ], + "Branches": branches, + # Convert the Parallel output array into a dict keyed by branch root name + # so downstream states can reference results by name (e.g. $.sifting). 
+ "ResultSelector": { + f"{b.root}.$": f"$[{i}]" for i, b in enumerate(state.branches) + }, + "ResultPath": "$", } if state.next: diff --git a/sebs/aws/triggers.py b/sebs/aws/triggers.py index 2d8b26f33..b4b0e059a 100644 --- a/sebs/aws/triggers.py +++ b/sebs/aws/triggers.py @@ -213,7 +213,7 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: self.logging.debug(f"Invoke workflow {self.name}") request_id = str(uuid.uuid4())[0:8] - sfn_input = {"payload": payload, "request_id": request_id} + sfn_input = {**payload, "__sebs_request_id": request_id} client = self._deployment_client.get_sfn_client() begin = datetime.datetime.now() diff --git a/sebs/faas/fsm.py b/sebs/faas/fsm.py index 039457cd4..47ea5eb76 100644 --- a/sebs/faas/fsm.py +++ b/sebs/faas/fsm.py @@ -55,15 +55,31 @@ def deserialize(cls, name: str, payload: dict) -> "Switch": return cls(name=name, cases=cases, default=payload["default"]) +class Branch: + """A named sub-workflow branch used inside a Parallel state.""" + + def __init__(self, root: str, states: Dict[str, dict]): + self.root = root + self.states = states + + @staticmethod + def deserialize(payload) -> "Branch": + if isinstance(payload, str): + # Legacy: bare function name — treat as a single-task sub-workflow. 
+ return Branch(root=payload, states={payload: {"type": "task", "func_name": payload}}) + return Branch(root=payload["root"], states=payload["states"]) + + class Parallel(State): - def __init__(self, name: str, funcs: List, next: Optional[str]): + def __init__(self, name: str, branches: List["Branch"], next: Optional[str]): self.name = name - self.funcs = funcs + self.branches = branches self.next = next @classmethod def deserialize(cls, name: str, payload: dict) -> "Parallel": - return cls(name=name, funcs=payload.get("parallel_functions"), next=payload.get("next")) + branches = [Branch.deserialize(f) for f in payload.get("parallel_functions", [])] + return cls(name=name, branches=branches, next=payload.get("next")) class Map(State): @@ -74,7 +90,7 @@ def __init__( array: str, root: str, next: Optional[str], - common_params: Optional[str], + common_params: Optional[List[str]], ): self.name = name self.funcs = funcs @@ -85,13 +101,18 @@ def __init__( @classmethod def deserialize(cls, name: str, payload: dict) -> "Map": + raw = payload.get("common_params") + if isinstance(raw, str): + common_params = [p.strip() for p in raw.split(",") if p.strip()] + else: + common_params = raw or None return cls( name=name, funcs=payload["states"], array=payload["array"], root=payload["root"], next=payload.get("next"), - common_params=payload.get("common_params"), + common_params=common_params, ) From 272a0449a9e3bbc2d56f9a638a1a470f7db98fcd Mon Sep 17 00:00:00 2001 From: laurin Date: Sun, 7 Jun 2026 19:24:44 +0200 Subject: [PATCH 216/230] feat: Enhance GCP workflow support with new functions and timeout configuration --- .../6101.1000-genome-individuals/config.json | 2 +- .../wrappers/gcp/python/handler_workflow.py | 82 +++++-- benchmarks/wrappers/gcp/python/storage.py | 9 +- configs/systems.json | 1 + sebs/gcp/gcp.py | 230 +++++++++++++++++- sebs/gcp/generator.py | 225 +++++++++++++++-- sebs/gcp/triggers.py | 5 +- 7 files changed, 492 insertions(+), 62 deletions(-) diff --git 
a/benchmarks/600.workflows/6101.1000-genome-individuals/config.json b/benchmarks/600.workflows/6101.1000-genome-individuals/config.json index e14b3b052..aff11b0e8 100644 --- a/benchmarks/600.workflows/6101.1000-genome-individuals/config.json +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/config.json @@ -1,5 +1,5 @@ { - "timeout": 540, + "timeout": 1800, "memory": 2048, "languages": ["python"], "modules": ["storage"] diff --git a/benchmarks/wrappers/gcp/python/handler_workflow.py b/benchmarks/wrappers/gcp/python/handler_workflow.py index 083ef0a53..6cf88eb3b 100644 --- a/benchmarks/wrappers/gcp/python/handler_workflow.py +++ b/benchmarks/wrappers/gcp/python/handler_workflow.py @@ -1,6 +1,5 @@ import datetime -import io import json import os import sys @@ -17,7 +16,6 @@ os.environ['NOSQL_STORAGE_DATABASE'] ) -from redis import Redis def probe_cold_start(): is_cold = False @@ -38,49 +36,79 @@ def handler(req): start = datetime.datetime.now().timestamp() os.environ["STORAGE_UPLOAD_BYTES"] = "0" os.environ["STORAGE_DOWNLOAD_BYTES"] = "0" - provider_request_id = req.headers.get("Function-Execution-Id") + provider_request_id = ( + req.headers.get("X-Cloud-Trace-Context") or req.headers.get("Function-Execution-Id") + ) + + event = req.get_json(force=True) + + if isinstance(event, dict) and "payload" in event: + func_payload = event["payload"] + request_id = event.get("request_id", provider_request_id) + elif isinstance(event, dict): + request_id = event.pop("__request_id", provider_request_id) + func_payload = event + else: + func_payload = event + request_id = provider_request_id + + if isinstance(func_payload, dict): + func_payload['request-id'] = provider_request_id + + full_function_name = os.getenv("MY_FUNCTION_NAME", "") + if "--" in full_function_name: + workflow_name, func_name = full_function_name.rsplit("--", 1) + elif "___" in full_function_name: + workflow_name, func_name = full_function_name.split("___", 1) + else: + workflow_name = 
full_function_name + func_name = full_function_name - event = req.get_json() - event["payload"]['request-id'] = provider_request_id - full_function_name = os.getenv("MY_FUNCTION_NAME") - workflow_name, func_name = full_function_name.split("___") function = importlib.import_module(f"function.{func_name}") - res = function.handler(event["payload"]) + res = function.handler(func_payload) end = datetime.datetime.now().timestamp() is_cold, container_id = probe_cold_start() - payload = { + measurement = { "func": func_name, "start": start, "end": end, "is_cold": is_cold, "container_id": container_id, - "provider.request_id": provider_request_id + "provider.request_id": provider_request_id, } func_res = os.getenv("SEBS_FUNCTION_RESULT") if func_res: - payload["result"] = json.loads(func_res) + measurement["result"] = json.loads(func_res) bytes_upload = os.getenv("STORAGE_UPLOAD_BYTES", 0) if bytes_upload: - payload["blob.upload"] = int(bytes_upload) + measurement["blob.upload"] = int(bytes_upload) bytes_download = os.getenv("STORAGE_DOWNLOAD_BYTES", 0) if bytes_download: - payload["blob.download"] = int(bytes_download) - - payload = json.dumps(payload) - - redis = Redis(host={{REDIS_HOST}}, - port=6379, - decode_responses=True, - socket_connect_timeout=10, - password={{REDIS_PASSWORD}}) - - req_id = event["request_id"] - key = os.path.join(workflow_name, func_name, req_id, str(uuid.uuid4())[0:8]) - redis.set(key, payload) - - return res + measurement["blob.download"] = int(bytes_download) + + try: + redis_host = os.getenv("REDIS_HOST", "") + redis_password = os.getenv("REDIS_PASSWORD", "") + if redis_host and redis_password: + from redis import Redis + redis_client = Redis( + host=redis_host, + port=6379, + decode_responses=True, + socket_connect_timeout=10, + password=redis_password, + ) + key = os.path.join(workflow_name, func_name, request_id, str(uuid.uuid4())[0:8]) + redis_client.set(key, json.dumps(measurement)) + except Exception: + pass + + if isinstance(res, 
dict): + res["__request_id"] = request_id + + return json.dumps(res), 200, {'Content-Type': 'application/json'} diff --git a/benchmarks/wrappers/gcp/python/storage.py b/benchmarks/wrappers/gcp/python/storage.py index dfe9563a0..23a40a34a 100644 --- a/benchmarks/wrappers/gcp/python/storage.py +++ b/benchmarks/wrappers/gcp/python/storage.py @@ -22,8 +22,8 @@ def unique_name(name): random=str(uuid.uuid4()).split('-')[0] ) - def upload(self, bucket, file, filepath): - key_name = storage.unique_name(file) + def upload(self, bucket, file, filepath, unique_name=True): + key_name = storage.unique_name(file) if unique_name else file bucket_instance = self.client.bucket(bucket) blob = bucket_instance.blob(key_name) blob.upload_from_filename(filepath) @@ -34,6 +34,11 @@ def download(self, bucket, file, filepath): blob = bucket_instance.blob(file) blob.download_to_filename(filepath) + def download_within_range(self, bucket, file, start_bytes, end_bytes): + blob = self.client.bucket(bucket).blob(file) + data = blob.download_as_bytes(start=start_bytes, end=end_bytes) + return data.decode("utf-8") + def list_directory(self, bucket, prefix): objects = self.client.bucket(bucket).list_blobs(prefix=prefix) return [obj.name for obj in objects] diff --git a/configs/systems.json b/configs/systems.json index cb1f3808b..c77443e6f 100644 --- a/configs/systems.json +++ b/configs/systems.json @@ -311,6 +311,7 @@ "deployment": { "files": [ "handler.py", + "handler_workflow.py", "storage.py", "nosql.py", "setup.py" diff --git a/sebs/gcp/gcp.py b/sebs/gcp/gcp.py index a85656284..f49875313 100644 --- a/sebs/gcp/gcp.py +++ b/sebs/gcp/gcp.py @@ -26,6 +26,7 @@ """ import docker +import json import os import logging import random @@ -2213,6 +2214,17 @@ def function_type() -> "Type[Function]": """ return GCPFunction + @staticmethod + def workflow_type() -> "Type[Function]": + """Get the workflow class type for this platform. 
+ + Returns: + GCPWorkflow class type + """ + from sebs.gcp.workflow import GCPWorkflow + + return GCPWorkflow + def initialize( self, config: Dict[str, str] = {}, @@ -2551,6 +2563,7 @@ def create_function( func_name: str, system_variant: SystemVariant, container_uri: str | None, + extra_envs: Dict | None = None, ) -> GCPFunction: """Create a new GCP Cloud Function or update existing one. @@ -2596,6 +2609,7 @@ def create_function( envs = { **self._generate_function_envs(code_package), **strategy.generate_runtime_envs(), + **(extra_envs or {}), } # Get code bucket for non-container deployments @@ -2644,7 +2658,7 @@ def create_function( ) strategy.allow_public_access(project_name, location, func_name) - self.update_function(function, code_package, system_variant, container_uri) + self.update_function(function, code_package, system_variant, container_uri, extra_envs) # Add LibraryTrigger to a new function # Not supported on containers @@ -2675,10 +2689,18 @@ def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) Raises: RuntimeError: If trigger type is not supported """ - from sebs.gcp.triggers import HTTPTrigger + from sebs.gcp.triggers import HTTPTrigger, WorkflowLibraryTrigger from sebs.gcp.function import GCPFunction + from sebs.gcp.workflow import GCPWorkflow - if trigger_type == Trigger.TriggerType.HTTP: + if isinstance(function, GCPWorkflow): + if trigger_type == Trigger.TriggerType.LIBRARY: + trigger = WorkflowLibraryTrigger(function.name, self) + else: + raise RuntimeError( + f"Trigger type {trigger_type} not supported for workflows. Use LIBRARY." 
+ ) + elif trigger_type == Trigger.TriggerType.HTTP: gcp_function = cast(GCPFunction, function) self.logging.info(f"Function {function.name} - waiting for deployment...") @@ -2697,6 +2719,187 @@ def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) self.cache_client.update_function(function) return trigger + def create_workflow(self, code_package: Benchmark, workflow_name: str) -> "Function": + """Create a new GCP Workflow that orchestrates Cloud Functions. + + Deploys individual functions for each code file in the benchmark, + generates a GCP Workflows definition from the benchmark's FSM definition, + and creates the workflow via the GCP Workflows API. + + Args: + code_package: Benchmark package with workflow code and definition + workflow_name: Name for the GCP Workflow + + Returns: + GCPWorkflow instance representing the deployed workflow + """ + import yaml + from google.cloud.workflows_v1 import WorkflowsClient, Workflow as GCPWorkflowProto + from sebs.gcp.workflow import GCPWorkflow + from sebs.gcp.generator import GCPGenerator + from sebs.gcp.triggers import WorkflowLibraryTrigger, HTTPTrigger + + definition_path = os.path.join(code_package.benchmark_path, "definition.json") + if not os.path.exists(definition_path): + raise ValueError(f"No workflow definition found for {workflow_name}") + + code_files = list(code_package.get_code_files(include_config=False)) + func_names = [os.path.splitext(os.path.basename(p))[0] for p in code_files] + funcs = [ + self.create_function( + code_package, + workflow_name + "--" + fn, + code_package.system_variant, + None, + extra_envs={"MY_FUNCTION_NAME": workflow_name + "--" + fn}, + ) + for fn in func_names + ] + + # Create HTTP triggers for each function so the workflow can call them + func_triggers: Dict[str, str] = {} + for fn, func in zip(func_names, funcs): + if len(func.triggers(Trigger.TriggerType.HTTP)) == 0: + self.create_trigger(func, Trigger.TriggerType.HTTP) + http_trigger = 
cast(HTTPTrigger, func.triggers(Trigger.TriggerType.HTTP)[0]) + func_triggers[fn] = http_trigger.url + + gen = GCPGenerator(workflow_name, func_triggers, code_package.benchmark_config.timeout) + gen.parse(definition_path) + definition = gen.generate() + + # Deploy the workflow via GCP Workflows API + project_name = self.config.project_name + location = self.config.region + parent = f"projects/{project_name}/locations/{location}" + + workflows_client = WorkflowsClient() + workflow_proto = GCPWorkflowProto(source_contents=yaml.dump(json.loads(definition), width=99999)) + + try: + operation = workflows_client.create_workflow( + parent=parent, workflow=workflow_proto, workflow_id=workflow_name + ) + self.logging.info(f"Creating workflow {workflow_name}") + operation.result() + except Exception as e: + if "already exists" in str(e).lower(): + self.logging.info(f"Workflow {workflow_name} already exists, updating.") + workflow_proto.name = f"{parent}/workflows/{workflow_name}" + operation = workflows_client.update_workflow(workflow=workflow_proto) + operation.result() + else: + raise + + # Deploy map sub-workflows if any + for map_id, map_definition in gen.generate_maps(): + map_proto = GCPWorkflowProto(source_contents=yaml.dump(json.loads(map_definition), width=99999)) + try: + operation = workflows_client.create_workflow( + parent=parent, workflow=map_proto, workflow_id=map_id + ) + self.logging.info(f"Creating map sub-workflow {map_id}") + operation.result() + except Exception as e: + if "already exists" in str(e).lower(): + map_proto.name = f"{parent}/workflows/{map_id}" + operation = workflows_client.update_workflow(workflow=map_proto) + operation.result() + else: + raise + + storage_client = self._system_resources.get_storage() + workflow = GCPWorkflow( + workflow_name, + funcs, + code_package.benchmark, + code_package.hash, + FunctionConfig.from_benchmark(code_package), + storage_client.get_bucket(Resources.StorageBucketType.DEPLOYMENT), + ) + + trigger = 
WorkflowLibraryTrigger(workflow_name, self) + trigger.logging_handlers = self.logging_handlers + workflow.add_trigger(trigger) + return workflow + + def update_workflow(self, workflow: "Function", code_package: Benchmark) -> None: + """Update an existing GCP Workflow with new function code and definition. + + Args: + workflow: Existing GCPWorkflow instance to update + code_package: New benchmark package with updated code + """ + import yaml + from google.cloud.workflows_v1 import WorkflowsClient, Workflow as GCPWorkflowProto + from sebs.gcp.workflow import GCPWorkflow + from sebs.gcp.generator import GCPGenerator + from sebs.gcp.triggers import HTTPTrigger + + wf = cast(GCPWorkflow, workflow) + + definition_path = os.path.join(code_package.benchmark_path, "definition.json") + if not os.path.exists(definition_path): + raise ValueError(f"No workflow definition found for {wf.name}") + + code_files = list(code_package.get_code_files(include_config=False)) + func_names = [os.path.splitext(os.path.basename(p))[0] for p in code_files] + funcs = [ + self.create_function( + code_package, + wf.name + "--" + fn, + code_package.system_variant, + None, + extra_envs={"MY_FUNCTION_NAME": wf.name + "--" + fn}, + ) + for fn in func_names + ] + wf.functions = funcs + + func_triggers: Dict[str, str] = {} + for fn, func in zip(func_names, funcs): + if len(func.triggers(Trigger.TriggerType.HTTP)) == 0: + self.create_trigger(func, Trigger.TriggerType.HTTP) + http_trigger = cast(HTTPTrigger, func.triggers(Trigger.TriggerType.HTTP)[0]) + func_triggers[fn] = http_trigger.url + + gen = GCPGenerator(wf.name, func_triggers, code_package.benchmark_config.timeout) + gen.parse(definition_path) + definition = gen.generate() + + project_name = self.config.project_name + location = self.config.region + parent = f"projects/{project_name}/locations/{location}" + + workflows_client = WorkflowsClient() + workflow_proto = GCPWorkflowProto( + name=f"{parent}/workflows/{wf.name}", + 
source_contents=yaml.dump(json.loads(definition), width=99999), + ) + operation = workflows_client.update_workflow(workflow=workflow_proto) + self.logging.info(f"Updating workflow {wf.name}") + operation.result() + + for map_id, map_definition in gen.generate_maps(): + map_proto = GCPWorkflowProto( + name=f"{parent}/workflows/{map_id}", + source_contents=yaml.dump(json.loads(map_definition), width=99999), + ) + try: + operation = workflows_client.update_workflow(workflow=map_proto) + operation.result() + except Exception as e: + if "not found" in str(e).lower(): + map_proto_new = GCPWorkflowProto( + source_contents=yaml.dump(json.loads(map_definition), width=99999) + ) + operation = workflows_client.create_workflow( + parent=parent, workflow=map_proto_new, workflow_id=map_id + ) + operation.result() + else: + raise + def cached_function(self, function: Function) -> None: """Configure a cached function instance for use. @@ -2708,15 +2911,19 @@ def cached_function(self, function: Function) -> None: """ from sebs.faas.function import Trigger - from sebs.gcp.triggers import LibraryTrigger - - func = cast(GCPFunction, function) + from sebs.gcp.triggers import LibraryTrigger, WorkflowLibraryTrigger + from sebs.gcp.workflow import GCPWorkflow for trigger in function.triggers(Trigger.TriggerType.LIBRARY): - gcp_trigger = cast(LibraryTrigger, trigger) - gcp_trigger.deployment_type = func.deployment_type - gcp_trigger.logging_handlers = self.logging_handlers - gcp_trigger.deployment_client = self + if isinstance(trigger, WorkflowLibraryTrigger) or isinstance(function, GCPWorkflow): + trigger.logging_handlers = self.logging_handlers + trigger._deployment_client = self + else: + func = cast(GCPFunction, function) + gcp_trigger = cast(LibraryTrigger, trigger) + gcp_trigger.deployment_type = func.deployment_type + gcp_trigger.logging_handlers = self.logging_handlers + gcp_trigger.deployment_client = self def update_function( self, @@ -2724,6 +2931,7 @@ def update_function( 
code_package: Benchmark, system_variant: SystemVariant, container_uri: str | None, + extra_envs: Dict | None = None, ) -> None: """Update an existing Cloud Function with new code and configuration. @@ -2736,6 +2944,7 @@ def update_function( code_package: New benchmark package with updated code system_variant: Selected deployment variant container_uri: Container image URI (unused) + extra_envs: Additional environment variables to set Raises: NotImplementedError: If the deployment variant is unsupported @@ -2751,6 +2960,7 @@ def update_function( envs = { **self._generate_function_envs(code_package), **strategy.generate_runtime_envs(), + **(extra_envs or {}), } # Update code using strategy diff --git a/sebs/gcp/generator.py b/sebs/gcp/generator.py index 223c0c4b9..3589f519a 100644 --- a/sebs/gcp/generator.py +++ b/sebs/gcp/generator.py @@ -1,15 +1,30 @@ import uuid -from typing import Dict, Union, List +from typing import Dict, Union, List, Optional, Set -from sebs.faas.fsm import Generator, State, Task, Switch, Map, Repeat, Loop +from sebs.faas.fsm import Generator, State, Task, Switch, Map, Parallel, Repeat, Loop class GCPGenerator(Generator): - def __init__(self, workflow_name: str, func_triggers: Dict[str, str]): + def __init__( + self, + workflow_name: str, + func_triggers: Dict[str, str], + func_timeout: int = 1800, + ): + """Initialize GCP Workflows YAML generator. + + Args: + workflow_name: Name of the workflow being generated. + func_triggers: Map from function name to HTTP trigger URL. + func_timeout: Timeout in seconds for http.post calls (default 1800). 
+ """ super().__init__() self._workflow_name = workflow_name self._func_triggers = func_triggers - self._map_funcs: Dict[str, str] = dict() + self._func_timeout = func_timeout + # Maps workflow_id -> (url, common_params_list_or_None) + self._map_funcs: Dict[str, tuple] = dict() + self._ordered_states: List[State] = [] def postprocess(self, payloads: List[dict]) -> dict: payloads.append({"final": {"return": ["${res}"]}}) @@ -18,19 +33,108 @@ def postprocess(self, payloads: List[dict]) -> dict: return definition + def _topological_order(self) -> List[State]: + """Return states in BFS order starting from root, visiting all reachable states.""" + visited: Set[str] = set() + ordered: List[State] = [] + queue: List[str] = [self.root.name] + + while queue: + name = queue.pop(0) + if name in visited or name not in self.states: + continue + visited.add(name) + state = self.states[name] + ordered.append(state) + # Enqueue successors + if isinstance(state, Task): + if state.next: + queue.append(state.next) + if state.failure: + queue.append(state.failure) + elif isinstance(state, Switch): + for case in state.cases: + queue.append(case.next) + if state.default: + queue.append(state.default) + + # Also add any states not reachable from root (shouldn't happen in well-formed FSMs) + for name, state in self.states.items(): + if name not in visited: + ordered.append(state) + + return ordered + + def generate(self) -> str: + self._ordered_states = self._topological_order() + terminal_names = self._find_terminal_state_names() + + payloads: List[dict] = [] + for s in self._ordered_states: + obj = self.encode_state(s) + if isinstance(obj, dict): + encoded_name = list(obj.keys())[0] + payloads.append(obj) + # Add explicit jump to final for terminal states that aren't last + if s.name in terminal_names and self._ordered_states[-1].name != s.name: + payloads.append({"goto_final_" + s.name: {"next": "final"}}) + elif isinstance(obj, list): + payloads += obj + # After the last step for this 
state, add jump to final if terminal + if s.name in terminal_names and self._ordered_states[-1].name != s.name: + payloads.append({"goto_final_" + s.name: {"next": "final"}}) + else: + raise ValueError("Unknown encoded state returned.") + + definition = self.postprocess(payloads) + return self._export_func(definition) + + def _find_terminal_state_names(self) -> Set[str]: + """Find states that have no next pointer (end of a path).""" + terminals: Set[str] = set() + for name, state in self.states.items(): + if isinstance(state, Task) and not state.next: + terminals.add(name) + return terminals + def encode_task(self, state: Task) -> Union[dict, List[dict]]: url = self._func_triggers[state.func_name] - return [ - { + if state.failure: + call_step: dict = { state.name: { - "call": "http.post", - "args": {"url": url, "body": "${res}"}, - "result": "res", + "try": { + "call": "http.post", + "args": {"url": url, "body": "${res}", "timeout": self._func_timeout}, + "result": "res", + }, + "except": { + "as": "e", + "steps": [ + {"jump_" + state.name: {"next": state.failure}}, + ], + }, } - }, - {"assign_res_" + state.name: {"assign": [{"res": "${res.body}"}]}}, - ] + } + assign_step = {"assign_res_" + state.name: {"assign": [{"res": "${res.body}"}]}} + steps: list = [call_step, assign_step] + if state.next: + steps.append({"next_" + state.name: {"next": state.next}}) + return steps + else: + plain_steps: list = [ + { + state.name: { + "call": "http.post", + "args": {"url": url, "body": "${res}", "timeout": self._func_timeout}, + "result": "res", + } + }, + {"assign_res_" + state.name: {"assign": [{"res": "${res.body}"}]}}, + ] + if state.next: + plain_steps.append({"next_" + state.name: {"next": state.next}}) + return plain_steps def encode_switch(self, state: Switch) -> Union[dict, List[dict]]: return { @@ -45,14 +149,95 @@ def _encode_case(self, case: Switch.Case) -> dict: return {"condition": "${" + cond + "}", "next": case.next} def encode_map(self, state: Map) -> 
Union[dict, List[dict]]: + if isinstance(state.funcs, dict): + first_state = next(iter(state.funcs.values())) + func_name = first_state["func_name"] + else: + func_name = state.funcs[0] + id = self._workflow_name + "_" + "map" + str(uuid.uuid4())[0:8] - self._map_funcs[id] = self._func_triggers[state.func_name] + self._map_funcs[id] = (self._func_triggers[func_name], state.common_params) + + if state.common_params: + # Build enriched array: [{array_element: elem, ...common_params}, ...] + # GCP Workflows assign uses YAML dict syntax (not expression ${}) for maps. + enrich_id = "enrich_" + state.name + enriched_var = "enriched_" + state.name.replace("-", "_") + temp_var = "tmp_" + state.name.replace("-", "_") + + # Build the temp dict using YAML dict syntax in assign + temp_dict: dict = {"array_element": "${elem}"} + for p in state.common_params: + temp_dict[p] = "${res." + p + "}" + + inner_steps = [ + { + "build_" + enrich_id: { + "assign": [{temp_var: temp_dict}] + } + }, + { + "append_" + enrich_id: { + "assign": [ + {enriched_var: "${list.concat(" + enriched_var + ", " + temp_var + ")}"} + ] + } + }, + ] + enrich_steps = [ + {"init_" + enrich_id: {"assign": [{enriched_var: []}]}}, + { + "loop_" + enrich_id: { + "for": { + "value": "elem", + "in": "${res." + state.array + "}", + "steps": inner_steps, + } + } + }, + ] + call_step = { + state.name: { + "call": "experimental.executions.map", + "args": {"workflow_id": id, "arguments": "${" + enriched_var + "}"}, + "result": "res", + } + } + return_steps = [*enrich_steps, call_step] + else: + call_step = { + state.name: { + "call": "experimental.executions.map", + "args": {"workflow_id": id, "arguments": "${res." 
+ state.array + "}"}, + "result": "res", + } + } + return_steps = [call_step] + # Wrap the list result back into a dict so downstream tasks can access it by key + assign_step = { + "assign_res_" + state.name: { + "assign": [{"res": {state.array: "${res}"}}] + } + } + steps = return_steps + [assign_step] + if state.next: + steps.append({"next_" + state.name: {"next": state.next}}) + return steps + + def encode_parallel(self, state: Parallel) -> Union[dict, List[dict]]: + branches = [] + for fn in state.funcs: + url = self._func_triggers[fn] + branches.append({ + "call": "http.post", + "args": {"url": url, "body": "${res}"}, + "result": "res", + }) return { state.name: { - "call": "experimental.executions.map", - "args": {"workflow_id": id, "arguments": "${res." + state.array + "}"}, - "result": "res", + "parallel": {"branches": [{"steps": [{"invoke": b}]} for b in branches]}, + "next": state.next, } } @@ -64,12 +249,12 @@ def encode_loop(self, state: Loop) -> Union[dict, List[dict]]: "for": { "value": "val", "index": "idx", - "in": "${"+state.array+"}", + "in": "${res."+state.array+"}", "steps": [ { "body": { "call": "http.post", - "args": {"url": url, "body": "${val}"} + "args": {"url": url, "body": "${val}", "timeout": self._func_timeout} } } ] @@ -78,7 +263,7 @@ def encode_loop(self, state: Loop) -> Union[dict, List[dict]]: } def generate_maps(self): - for workflow_id, url in self._map_funcs.items(): + for workflow_id, (url, common_params) in self._map_funcs.items(): yield ( workflow_id, self._export_func( @@ -89,7 +274,7 @@ def generate_maps(self): { "map": { "call": "http.post", - "args": {"url": url, "body": "${elem}"}, + "args": {"url": url, "body": "${elem}", "timeout": self._func_timeout}, "result": "elem", } }, diff --git a/sebs/gcp/triggers.py b/sebs/gcp/triggers.py index 04ec1c472..ea1e16eeb 100644 --- a/sebs/gcp/triggers.py +++ b/sebs/gcp/triggers.py @@ -270,8 +270,8 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: self.logging.info(f"Invoke 
workflow {self.name}") config = self._deployment_client.config - full_workflow_name = GCP.get_full_workflow_name( - config.project_name, config.region, self.name + full_workflow_name = ( + f"projects/{config.project_name}/locations/{config.region}/workflows/{self.name}" ) execution_client = ExecutionsClient() @@ -293,6 +293,7 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: elif execution.state == Execution.State.FAILED: self.logging.error(f"Invocation of {self.name} failed") self.logging.error(f"Input: {payload}") + self.logging.error(f"Error: {execution.error}") gcp_result.stats.failure = True return gcp_result From 36740aae2840d4cc30f162d3a351ff87cffc1dbb Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Sun, 7 Jun 2026 20:30:59 +0200 Subject: [PATCH 217/230] feat: Add download_within_range method to storage classes for partial downloads --- benchmarks/wrappers/azure/python/storage.py | 4 ++++ benchmarks/wrappers/local/python/storage.py | 8 ++++++-- benchmarks/wrappers/openwhisk/python/storage.py | 8 ++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/benchmarks/wrappers/azure/python/storage.py b/benchmarks/wrappers/azure/python/storage.py index e22a2bf7c..299e0b9dd 100644 --- a/benchmarks/wrappers/azure/python/storage.py +++ b/benchmarks/wrappers/azure/python/storage.py @@ -53,6 +53,10 @@ def upload_stream(self, container, file, data, unique_name=True): def download_stream(self, container, file): client = self.client.get_blob_client(container=container, blob=file) return client.download_blob().readall() + + def download_within_range(self, container, file, start_bytes, end_bytes): + client = self.client.get_blob_client(container=container, blob=file) + return client.download_blob(offset=start_bytes, length=end_bytes - start_bytes).readall().decode("utf-8") @staticmethod def get_instance(connection_string: Optional[str] = None): diff --git a/benchmarks/wrappers/local/python/storage.py 
b/benchmarks/wrappers/local/python/storage.py index 2307462b7..52764bd1a 100644 --- a/benchmarks/wrappers/local/python/storage.py +++ b/benchmarks/wrappers/local/python/storage.py @@ -29,14 +29,18 @@ def unique_name(name): random=str(uuid.uuid4()).split('-')[0] ) - def upload(self, bucket, file, filepath): - key_name = storage.unique_name(file) + def upload(self, bucket, file, filepath, unique_name=True): + key_name = storage.unique_name(file) if unique_name else file self.client.fput_object(bucket, key_name, filepath) return key_name def download(self, bucket, file, filepath): self.client.fget_object(bucket, file, filepath) + def download_within_range(self, bucket, file, start_bytes, end_bytes): + data = self.client.get_object(bucket, file, offset=start_bytes, length=end_bytes - start_bytes + 1) + return data.read().decode("utf-8") + def list_directory(self, bucket, prefix): objects = self.client.list_objects(bucket, prefix, recursive=True) return [obj.object_name for obj in objects] diff --git a/benchmarks/wrappers/openwhisk/python/storage.py b/benchmarks/wrappers/openwhisk/python/storage.py index 53c071e64..7986dcd6b 100644 --- a/benchmarks/wrappers/openwhisk/python/storage.py +++ b/benchmarks/wrappers/openwhisk/python/storage.py @@ -49,14 +49,18 @@ def unique_name(name): ) - def upload(self, bucket, file, filepath): - key_name = storage.unique_name(file) + def upload(self, bucket, file, filepath, unique_name=True): + key_name = storage.unique_name(file) if unique_name else file self.client.fput_object(bucket, key_name, filepath) return key_name def download(self, bucket, file, filepath): self.client.fget_object(bucket, file, filepath) + def download_within_range(self, bucket, file, start_bytes, end_bytes): + data = self.client.get_object(bucket, file, offset=start_bytes, length=end_bytes - start_bytes + 1) + return data.read().decode("utf-8") + def list_directory(self, bucket, prefix): objects = self.client.list_objects(bucket, prefix, recursive=True) return 
[obj.object_name for obj in objects] From 14a9670d84ee84765d9f6290b9e2348e413cd2eb Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Jun 2026 16:24:36 +0200 Subject: [PATCH 218/230] feat: Enhance GCP workflow handling with YAML serialization and execution timeout --- sebs/gcp/gcp.py | 42 +++++++++++++++++++++++++++++++++++++----- sebs/gcp/triggers.py | 9 ++++++++- sebs/regression.py | 2 +- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/sebs/gcp/gcp.py b/sebs/gcp/gcp.py index f49875313..c8e58d7f8 100644 --- a/sebs/gcp/gcp.py +++ b/sebs/gcp/gcp.py @@ -2719,6 +2719,38 @@ def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) self.cache_client.update_function(function) return trigger + @staticmethod + def _workflow_yaml(definition: str) -> str: + """Serialise a JSON workflow definition to GCP-compatible YAML. + + PyYAML's default sequence representation places list-item dashes at the + same indentation level as the parent key, which the GCP Workflows parser + rejects when those lists appear inside ``parallel.branches[*].steps``. + This helper uses a custom Dumper that indents sequences properly. + + Args: + definition: JSON string produced by GCPGenerator. + + Returns: + YAML string accepted by the GCP Workflows API. + """ + import json as _json + import yaml as _yaml + + class _IndentDumper(_yaml.Dumper): + """YAML Dumper that indents block sequences under their parent key.""" + + def increase_indent(self, flow=False, indentless=False): + """Override to always indent sequence items.""" + return super().increase_indent(flow=flow, indentless=False) + + return _yaml.dump( + _json.loads(definition), + Dumper=_IndentDumper, + width=99999, + default_flow_style=False, + ) + def create_workflow(self, code_package: Benchmark, workflow_name: str) -> "Function": """Create a new GCP Workflow that orchestrates Cloud Functions. 
@@ -2774,7 +2806,7 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> "Funct parent = f"projects/{project_name}/locations/{location}" workflows_client = WorkflowsClient() - workflow_proto = GCPWorkflowProto(source_contents=yaml.dump(json.loads(definition), width=99999)) + workflow_proto = GCPWorkflowProto(source_contents=self._workflow_yaml(definition)) try: operation = workflows_client.create_workflow( @@ -2793,7 +2825,7 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> "Funct # Deploy map sub-workflows if any for map_id, map_definition in gen.generate_maps(): - map_proto = GCPWorkflowProto(source_contents=yaml.dump(json.loads(map_definition), width=99999)) + map_proto = GCPWorkflowProto(source_contents=self._workflow_yaml(map_definition)) try: operation = workflows_client.create_workflow( parent=parent, workflow=map_proto, workflow_id=map_id @@ -2874,7 +2906,7 @@ def update_workflow(self, workflow: "Function", code_package: Benchmark) -> None workflows_client = WorkflowsClient() workflow_proto = GCPWorkflowProto( name=f"{parent}/workflows/{wf.name}", - source_contents=yaml.dump(json.loads(definition), width=99999), + source_contents=self._workflow_yaml(definition), ) operation = workflows_client.update_workflow(workflow=workflow_proto) self.logging.info(f"Updating workflow {wf.name}") @@ -2883,7 +2915,7 @@ def update_workflow(self, workflow: "Function", code_package: Benchmark) -> None for map_id, map_definition in gen.generate_maps(): map_proto = GCPWorkflowProto( name=f"{parent}/workflows/{map_id}", - source_contents=yaml.dump(json.loads(map_definition), width=99999), + source_contents=self._workflow_yaml(map_definition), ) try: operation = workflows_client.update_workflow(workflow=map_proto) @@ -2891,7 +2923,7 @@ def update_workflow(self, workflow: "Function", code_package: Benchmark) -> None except Exception as e: if "not found" in str(e).lower(): map_proto_new = GCPWorkflowProto( - 
source_contents=yaml.dump(json.loads(map_definition), width=99999) + source_contents=self._workflow_yaml(map_definition) ) operation = workflows_client.create_workflow( parent=parent, workflow=map_proto_new, workflow_id=map_id diff --git a/sebs/gcp/triggers.py b/sebs/gcp/triggers.py index ea1e16eeb..0c90c6a16 100644 --- a/sebs/gcp/triggers.py +++ b/sebs/gcp/triggers.py @@ -285,7 +285,14 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: execution_finished = False while not execution_finished: - execution = execution_client.get_execution(request={"name": res.name}) + try: + execution = execution_client.get_execution( + request={"name": res.name}, + timeout=30, + ) + except Exception: + time.sleep(10) + continue execution_finished = execution.state != Execution.State.ACTIVE if not execution_finished: diff --git a/sebs/regression.py b/sebs/regression.py index f0b780bc8..f83f782f2 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -491,7 +491,7 @@ class GCPTestSequenceWorkflows( metaclass=WorkflowTestSequenceMeta, benchmarks=benchmarks_workflows, architectures=["x64"], - deployments=["package"], + deployments=["function-gen2"], deployment_name="gcp", ): def get_deployment(self, benchmark_name, architecture, deployment_type): From d1e618ee0877ddba75b1d6c2af316dddef3649b7 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Jun 2026 16:38:37 +0200 Subject: [PATCH 219/230] feat: Enhance GCP workflow encoding with improved handling for Map and Parallel states --- sebs/gcp/generator.py | 172 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 142 insertions(+), 30 deletions(-) diff --git a/sebs/gcp/generator.py b/sebs/gcp/generator.py index 3589f519a..8149704e0 100644 --- a/sebs/gcp/generator.py +++ b/sebs/gcp/generator.py @@ -1,7 +1,7 @@ import uuid -from typing import Dict, Union, List, Optional, Set +from typing import Dict, Union, List, Optional, Set, Tuple -from sebs.faas.fsm import Generator, State, Task, Switch, Map, Parallel, Repeat, Loop 
+from sebs.faas.fsm import Generator, State, Task, Switch, Map, Parallel, Repeat, Loop, Branch class GCPGenerator(Generator): @@ -57,6 +57,9 @@ def _topological_order(self) -> List[State]: queue.append(case.next) if state.default: queue.append(state.default) + elif isinstance(state, (Map, Parallel, Loop, Repeat)): + if state.next: + queue.append(state.next) # Also add any states not reachable from root (shouldn't happen in well-formed FSMs) for name, state in self.states.items(): @@ -93,7 +96,8 @@ def _find_terminal_state_names(self) -> Set[str]: """Find states that have no next pointer (end of a path).""" terminals: Set[str] = set() for name, state in self.states.items(): - if isinstance(state, Task) and not state.next: + has_next = getattr(state, "next", None) + if not has_next: terminals.add(name) return terminals @@ -148,7 +152,19 @@ def _encode_case(self, case: Switch.Case) -> dict: cond = "res." + case.var + " " + case.op + " " + str(case.val) return {"condition": "${" + cond + "}", "next": case.next} - def encode_map(self, state: Map) -> Union[dict, List[dict]]: + def encode_map(self, state: Map, res_var: str = "res") -> Union[dict, List[dict]]: + """Encode a Map state as GCP Workflows steps. + + Args: + state: Map state to encode. + res_var: Variable name that holds the current result dict. Defaults + to ``"res"`` for top-level maps; pass a branch-specific variable + when encoding maps inside parallel branches to avoid cross-branch + interference via the shared ``res`` variable. + + Returns: + List of step dicts for the map. + """ if isinstance(state.funcs, dict): first_state = next(iter(state.funcs.values())) func_name = first_state["func_name"] @@ -158,24 +174,22 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: id = self._workflow_name + "_" + "map" + str(uuid.uuid4())[0:8] self._map_funcs[id] = (self._func_triggers[func_name], state.common_params) + # Write map output to a separate var so the original dict is preserved. 
+ # Use dot-path assignment (res_var.array = map_res_NAME) to update only + # the array key, keeping all other context fields intact. + map_res_var = "map_res_" + state.name.replace("-", "_") + if state.common_params: - # Build enriched array: [{array_element: elem, ...common_params}, ...] - # GCP Workflows assign uses YAML dict syntax (not expression ${}) for maps. enrich_id = "enrich_" + state.name enriched_var = "enriched_" + state.name.replace("-", "_") temp_var = "tmp_" + state.name.replace("-", "_") - # Build the temp dict using YAML dict syntax in assign temp_dict: dict = {"array_element": "${elem}"} for p in state.common_params: - temp_dict[p] = "${res." + p + "}" + temp_dict[p] = "${" + res_var + "." + p + "}" inner_steps = [ - { - "build_" + enrich_id: { - "assign": [{temp_var: temp_dict}] - } - }, + {"build_" + enrich_id: {"assign": [{temp_var: temp_dict}]}}, { "append_" + enrich_id: { "assign": [ @@ -190,7 +204,7 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: "loop_" + enrich_id: { "for": { "value": "elem", - "in": "${res." + state.array + "}", + "in": "${" + res_var + "." + state.array + "}", "steps": inner_steps, } } @@ -200,7 +214,7 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: state.name: { "call": "experimental.executions.map", "args": {"workflow_id": id, "arguments": "${" + enriched_var + "}"}, - "result": "res", + "result": map_res_var, } } return_steps = [*enrich_steps, call_step] @@ -208,15 +222,15 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: call_step = { state.name: { "call": "experimental.executions.map", - "args": {"workflow_id": id, "arguments": "${res." + state.array + "}"}, - "result": "res", + "args": {"workflow_id": id, "arguments": "${" + res_var + "." + state.array + "}"}, + "result": map_res_var, } } return_steps = [call_step] - # Wrap the list result back into a dict so downstream tasks can access it by key + # Update only the array key; all other context fields are preserved. 
assign_step = { "assign_res_" + state.name: { - "assign": [{"res": {state.array: "${res}"}}] + "assign": [{res_var + "." + state.array: "${" + map_res_var + "}"}] } } steps = return_steps + [assign_step] @@ -224,22 +238,120 @@ def encode_map(self, state: Map) -> Union[dict, List[dict]]: steps.append({"next_" + state.name: {"next": state.next}}) return steps + def _encode_branch(self, branch: Branch, shared_var: str) -> Tuple[List[dict], List[str]]: + """Encode a single Parallel branch as a list of GCP Workflow steps. + + Each branch reads from ``shared_var``, processes its states in order, + and writes its result back to ``shared_var``. Because GCP Workflows + parallel branches share the global variable namespace we write results + into a branch-specific variable and later merge them. + + Args: + branch: Branch definition containing sub-states. + shared_var: Variable name to read input from. + + Returns: + Tuple of (steps, extra_shared_vars). ``extra_shared_vars`` lists + any intermediate variables written inside the branch (e.g. + ``map_res_*``) that must appear in the parallel step's ``shared`` + list. 
+ """ + from sebs.faas.fsm import State as FSMState + + steps: List[dict] = [] + extra_shared: List[str] = [] + # Resolve BFS order within the branch's own state dict + b_states = {n: FSMState.deserialize(n, s) for n, s in branch.states.items()} + visited: Set[str] = set() + queue = [branch.root] + ordered = [] + while queue: + n = queue.pop(0) + if n in visited or n not in b_states: + continue + visited.add(n) + s = b_states[n] + ordered.append(s) + nxt = getattr(s, "next", None) + if nxt: + queue.append(nxt) + if isinstance(s, Task) and s.failure: + queue.append(s.failure) + + for s in ordered: + if isinstance(s, Task): + url = self._func_triggers[s.func_name] + steps.append({s.name: {"call": "http.post", "args": {"url": url, "body": "${" + shared_var + "}", "timeout": self._func_timeout}, "result": shared_var}}) + steps.append({"assign_res_" + s.name: {"assign": [{shared_var: "${" + shared_var + ".body}"}]}}) + elif isinstance(s, Map): + # Pass shared_var directly so encode_map reads/writes that variable + # instead of the global "res". This avoids cross-branch interference + # when multiple parallel branches each contain a Map step. + steps += self.encode_map(s, res_var=shared_var) # type: ignore[arg-type] + # map_res_ is written inside this branch — must be shared. + map_res_var = "map_res_" + s.name.replace("-", "_") + extra_shared.append(map_res_var) + return steps, extra_shared + def encode_parallel(self, state: Parallel) -> Union[dict, List[dict]]: - branches = [] - for fn in state.funcs: - url = self._func_triggers[fn] - branches.append({ - "call": "http.post", - "args": {"url": url, "body": "${res}"}, - "result": "res", - }) + """Encode a Parallel state as a GCP Workflows parallel block. - return { + Each branch runs concurrently with its own local copy of ``res`` + (since ``res`` is NOT in ``shared``). Results are stored in + branch-specific shared variables and merged into ``res`` afterwards. + + Args: + state: Parallel state to encode. 
+ + Returns: + List of step dicts for the parallel block and result merge. + """ + shared_vars = [] + extra_shared_all: List[str] = [] + gcp_branches = [] + for i, branch in enumerate(state.branches): + # Use a per-branch local variable as the working variable throughout + # the branch so that no branch writes to the outer "res". + # The final value is stored in this shared var after the branch ends. + var = "branch_res_" + state.name.replace("-", "_") + "_" + str(i) + shared_vars.append(var) + # Encode branch using var as both input (initialised to ${res}) and + # working accumulator — _encode_branch takes the var name to use. + branch_steps, extra_shared = self._encode_branch(branch, var) + extra_shared_all.extend(extra_shared) + # Seed the per-branch variable from the outer res before starting. + seed_step = {"seed_" + var: {"assign": [{var: "${res}"}]}} + branch_name = "branch_" + state.name.replace("-", "_") + "_" + str(i) + gcp_branches.append({branch_name: {"steps": [seed_step] + branch_steps}}) + + # GCP Workflows requires shared variables to be initialized in the outer + # scope before the parallel step references them. + all_shared_vars = shared_vars + extra_shared_all + init_assigns = [{v: None} for v in all_shared_vars] + init_step = {"init_" + state.name: {"assign": init_assigns}} + + parallel_step = { state.name: { - "parallel": {"branches": [{"steps": [{"invoke": b}]} for b in branches]}, - "next": state.next, + "parallel": { + # Only branch_res_* and map_res_* vars are shared; "res" is NOT + # listed so each branch gets its own local copy — no cross-branch + # interference when two branches both contain Map steps. + "shared": all_shared_vars, + "branches": gcp_branches, + } + } + } + # Merge: build a single dict keyed by branch root name using YAML dict syntax. 
+ merged_dict = {branch.root: "${" + var + "}" for var, branch in zip(shared_vars, state.branches)} + merge_step = { + "merge_" + state.name: { + "assign": [{"res": merged_dict}] } } + steps: List[dict] = [init_step, parallel_step, merge_step] + if state.next: + steps.append({"next_" + state.name: {"next": state.next}}) + return steps def encode_loop(self, state: Loop) -> Union[dict, List[dict]]: url = self._func_triggers[state.func_name] From 5e1fe49d205f4b04b730d944a2486635e5f652b6 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 8 Jun 2026 17:03:25 +0200 Subject: [PATCH 220/230] feat: Update timeout configuration in workflow to 1800 seconds --- benchmarks/600.workflows/6100.1000-genome/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/600.workflows/6100.1000-genome/config.json b/benchmarks/600.workflows/6100.1000-genome/config.json index e14b3b052..aff11b0e8 100644 --- a/benchmarks/600.workflows/6100.1000-genome/config.json +++ b/benchmarks/600.workflows/6100.1000-genome/config.json @@ -1,5 +1,5 @@ { - "timeout": 540, + "timeout": 1800, "memory": 2048, "languages": ["python"], "modules": ["storage"] From e1044a59a02c54907f807a7b77f5f6020959c603 Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 10 Jun 2026 12:14:35 +0200 Subject: [PATCH 221/230] feat: Add validation functions for output across multiple workflows --- benchmarks/600.workflows/610.gen/input.py | 68 ++++++++++++++++- .../600.workflows/6100.1000-genome/input.py | 71 +++++++++++++++++- .../6101.1000-genome-individuals/input.py | 44 +++++++++++ .../600.workflows/620.func-invo/input.py | 27 +++++++ .../600.workflows/6200.trip-booking/input.py | 52 +++++++++++++ .../600.workflows/630.parallel-sleep/input.py | 25 +++++++ .../631.parallel-download/input.py | 29 +++++++- .../600.workflows/640.selfish-detour/input.py | 11 +++ benchmarks/600.workflows/650.vid/input.py | 62 ++++++++++++++++ .../600.workflows/660.map-reduce/input.py | 66 +++++++++++++++++ 
benchmarks/600.workflows/670.auth/input.py | 47 +++++++++++- .../600.workflows/680.excamera/input.py | 74 +++++++++++++++++++ benchmarks/600.workflows/690.ml/input.py | 55 ++++++++++++++ sebs/benchmark.py | 19 +++-- sebs/regression.py | 21 +++++- 15 files changed, 655 insertions(+), 16 deletions(-) diff --git a/benchmarks/600.workflows/610.gen/input.py b/benchmarks/600.workflows/610.gen/input.py index 1300f159d..000bc1362 100644 --- a/benchmarks/600.workflows/610.gen/input.py +++ b/benchmarks/600.workflows/610.gen/input.py @@ -1,5 +1,71 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + + def buckets_count(): return (0, 0) + def generate_input(data_dir, size, bucket, input_buckets, output_buckets, upload_func, nosql_func): - return dict() \ No newline at end of file + return dict() + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage=None) -> str | None: + if output is None: + return "Output is None" + + if "done" not in output or output["done"] is not True: + return "Expected 'done' key to be True" + + if "astros" not in output: + return "Missing 'astros' key in output" + + # Output structure: output["astros"] is a dict with nested output["astros"]["astros"]["people"] + astros_outer = output["astros"] + if isinstance(astros_outer, dict): + inner = astros_outer.get("astros", {}) + people = inner.get("people", []) if isinstance(inner, dict) else [] + elif isinstance(astros_outer, list): + people = astros_outer + else: + return f"'astros' has unexpected type: {type(astros_outer).__name__}" + + if not isinstance(people, list): + return f"Expected people to be a list, got {type(people).__name__}" + + # The API response includes number and message at the inner astros level + if isinstance(astros_outer, dict): + inner = astros_outer.get("astros", {}) + if isinstance(inner, dict): + api_message = inner.get("message") + if api_message != "success": + return f"API 'message' field is 
'{api_message}', expected 'success'" + api_number = inner.get("number") + if api_number is not None and api_number != len(people): + return f"API 'number' field is {api_number} but people list has {len(people)} entries" + + for i, person in enumerate(people): + if not isinstance(person, dict): + return f"Element {i} is not a dict" + if "name" not in person: + return f"Element {i} missing 'name' field" + if "name_rev" not in person: + return f"Element {i} missing 'name_rev' field" + + name = person["name"] + name_rev = person["name_rev"] + + if not isinstance(name, str) or not name.strip(): + return f"Element {i} 'name' must be a non-empty string" + if not isinstance(name_rev, str) or not name_rev.strip(): + return f"Element {i} 'name_rev' must be a non-empty string" + + # name_rev splits on first space only: "First Last" -> "Last First" + parts = name.split(" ", 1) + expected_rev = " ".join(reversed(parts)) + if name_rev != expected_rev: + return f"Element {i} 'name_rev' is '{name_rev}', expected '{expected_rev}'" + + if "craft" not in person or not isinstance(person["craft"], str) or not person["craft"]: + return f"Element {i} missing or empty 'craft' field" + + return None diff --git a/benchmarks/600.workflows/6100.1000-genome/input.py b/benchmarks/600.workflows/6100.1000-genome/input.py index def8d0195..dfebfe575 100644 --- a/benchmarks/600.workflows/6100.1000-genome/input.py +++ b/benchmarks/600.workflows/6100.1000-genome/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. 
+ import os import re import uuid @@ -36,14 +38,14 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck #print("start: ", start, "end: ", end, "range_per_job: ", range_per_job, "num_individuals_jobs: ", num_individuals_jobs) #data = list(filter(regex.match, content[int(start):int(end)])) data = content[int(start):int(end)] - #name with start and end lines is not needed as all individuals jobs can just read their entire file. + #name with start and end lines is not needed as all individuals jobs can just read their entire file. name = str(uuid.uuid4())[:8] - + upload_data = io.BytesIO() upload_data.writelines((val).encode("utf-8") for val in data) upload_data.seek(0) #name = client.upload_stream(output_bucket, name, upload_data) - #TODO keep track of start + stop bytes and return them. + #TODO keep track of start + stop bytes and return them. nbytes = upload_data.getbuffer().nbytes output = { @@ -64,3 +66,66 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck "populations": files[3:9], "sifting_input": files[1], } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + if output is None: + return "Output is None" + + if not isinstance(output, dict): + return f"Expected output to be a dict, got {type(output).__name__}" + + # Real output structure: + # { + # "mutation_overlap": {"sifting": {"populations": [{"output_mutation_overlap": "..."}...]}}, + # "frequency": {"sifting": {"populations": [{"output_frequency": "..."}...]}} + # } + for branch in ("mutation_overlap", "frequency"): + if branch not in output: + return f"Output missing '{branch}' key, got keys: {list(output.keys())}" + sifting = output[branch].get("sifting") + if not isinstance(sifting, dict): + return f"output['{branch}']['sifting'] is not a dict" + pops = sifting.get("populations") + if not isinstance(pops, list) or len(pops) == 0: + return 
f"output['{branch}']['sifting']['populations'] is not a non-empty list" + + input_populations = input_config.get("populations", []) + expected_key = {"mutation_overlap": "output_mutation_overlap", "frequency": "output_frequency"} + # Output filename patterns: chr21-{POP}.tar.*gz for mutation_overlap, chr21-{POP}-freq.tar.*gz for frequency + filename_patterns = {"mutation_overlap": "chr21-{pop}", "frequency": "chr21-{pop}-freq"} + + for branch, key in expected_key.items(): + pops = output[branch]["sifting"]["populations"] + + # Population count should match input + if input_populations and len(pops) != len(input_populations): + return ( + f"output['{branch}']['sifting']['populations'] has {len(pops)} entries, " + f"expected {len(input_populations)} (one per input population)" + ) + + for i, p in enumerate(pops): + if not isinstance(p, dict): + return f"output['{branch}']['sifting']['populations'][{i}] is not a dict" + if key not in p: + return f"output['{branch}']['sifting']['populations'][{i}] missing '{key}'" + + filename = p[key] + if not isinstance(filename, str) or not filename: + return f"output['{branch}']['sifting']['populations'][{i}]['{key}'] is not a non-empty string" + + if not filename.endswith(".gz"): + return f"output['{branch}']['sifting']['populations'][{i}]['{key}'] should end with .gz, got '{filename}'" + + # Filename should contain the population name + if input_populations: + pop_name = input_populations[i] if i < len(input_populations) else None + pattern_prefix = filename_patterns[branch].format(pop=pop_name) if pop_name else None + if pattern_prefix and pattern_prefix.lower() not in filename.lower(): + return ( + f"output['{branch}']['sifting']['populations'][{i}]['{key}'] = '{filename}' " + f"does not contain expected population pattern '{pattern_prefix}'" + ) + + return None diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/input.py b/benchmarks/600.workflows/6101.1000-genome-individuals/input.py index c30c5bdcc..1dec2c4b6 
100644 --- a/benchmarks/600.workflows/6101.1000-genome-individuals/input.py +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + import os import re import uuid @@ -63,3 +65,45 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck "populations": files[3:9], "sifting_input": files[1], } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage=None) -> str | None: + if output is None: + return "Output is None" + + # Output structure: {"blob": [{"individuals_output": "*.tar.gz", ...}], ...} + if isinstance(output, dict): + if "blob" not in output: + return f"Expected 'blob' key in output, got keys: {list(output.keys())}" + items = output["blob"] + elif isinstance(output, list): + items = output + else: + return f"Expected output to be a dict or list, got {type(output).__name__}" + + if not isinstance(items, list): + return f"Expected 'blob' to be a list, got {type(items).__name__}" + + expected_length = len(input_config["blob"]) + if len(items) != expected_length: + return f"Output length {len(items)} does not match expected number of blobs {expected_length}" + + for i, element in enumerate(items): + if not isinstance(element, dict): + return f"Element {i} is not a dict" + if "individuals_output" not in element: + return f"Element {i} is missing 'individuals_output' key" + + value = element["individuals_output"] + if not isinstance(value, str) or not value: + return f"Element {i} has invalid 'individuals_output' value: expected a non-empty string" + + if not value.endswith(".tar.gz") and not value.endswith(".gz"): + return f"Element {i} has invalid 'individuals_output' filename: expected to end with '.gz', got '{value}'" + + # Filename should follow chr21n-... 
pattern (from handler: individuals_file = "ALL.chr21.1250.vcf" → chr21n prefix) + individuals_file = input_config.get("individuals_file", "") + if individuals_file and "chr21" in individuals_file and "chr21n" not in value: + return f"Element {i} 'individuals_output' '{value}' does not contain expected 'chr21n' prefix" + + return None diff --git a/benchmarks/600.workflows/620.func-invo/input.py b/benchmarks/600.workflows/620.func-invo/input.py index afefd5d9a..7a7aa0ed6 100644 --- a/benchmarks/600.workflows/620.func-invo/input.py +++ b/benchmarks/600.workflows/620.func-invo/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + size_generators = { 'test' : 10, 'small' : 2**5, @@ -14,3 +16,28 @@ def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): return { 'size': size_generators[size] } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + if output is None: + return "Output is None" + + if 'len' not in output: + return "Missing key 'len' in output" + + if not isinstance(output['len'], str): + return f"Expected 'len' to be a string, got {type(output['len']).__name__}" + + expected_size = input_config['size'] + actual_size = len(output['len']) + if actual_size != expected_size: + return f"Expected string length {expected_size}, got {actual_size}" + + # The string is built by concatenating str(i % 255) for shuffled i in range(size). + # Every character must therefore be a decimal digit. 
+ s = output['len'] + if not s.isdigit(): + non_digit = next(c for c in s if not c.isdigit()) + return f"Output string contains non-digit character {non_digit!r}; expected only digits 0-9" + + return None diff --git a/benchmarks/600.workflows/6200.trip-booking/input.py b/benchmarks/600.workflows/6200.trip-booking/input.py index 4c261f755..0b8f07cf7 100644 --- a/benchmarks/600.workflows/6200.trip-booking/input.py +++ b/benchmarks/600.workflows/6200.trip-booking/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + def allocate_nosql() -> dict: @@ -48,3 +50,53 @@ def generate_input( trip_details["expected_result"] = size_results[size] return trip_details + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage=None) -> str | None: + + if output is None: + return "Output is None" + + if not isinstance(output, dict): + return f"Expected output to be a dict, got {type(output).__name__}" + + if "trip_id" not in output: + return "Output is missing 'trip_id' field" + + if not isinstance(output["trip_id"], str) or not output["trip_id"]: + return "Output 'trip_id' must be a non-empty string" + + if "status" not in output: + return "Output is missing 'status' field" + + if not isinstance(output["status"], str): + return f"Output 'status' must be a string, got {type(output['status']).__name__}" + + expected_result = input_config.get("expected_result", {}) + expected_outcome = expected_result.get("result") + expected_reason = expected_result.get("reason") + + valid_statuses = {"success", "failure"} + if output["status"] not in valid_statuses: + return f"Output 'status' must be one of {valid_statuses}, got '{output['status']}'" + + # trip_id is a UUID stored without dashes (32 hex chars) + import re + if not re.match(r'^[0-9a-f]{32}$', output["trip_id"], re.IGNORECASE): + return f"Output 'trip_id' is not a 32-char hex UUID: '{output['trip_id']}'" + + if expected_outcome == "success": 
+ if output["status"] != "success": + return f"Expected status 'success', got '{output['status']}'" + + elif expected_outcome == "failure": + if expected_reason == "hotel": + # Hotel failure raises RuntimeError immediately, so the workflow fails + # with an exception. If validate_output is called, the framework caught + # it gracefully - accept any status in this case. + pass + elif expected_reason in ["confirm", "rental", "flight"]: + if output["status"] != "failure": + return f"Expected status 'failure' (reason: {expected_reason}), got '{output['status']}'" + + return None diff --git a/benchmarks/600.workflows/630.parallel-sleep/input.py b/benchmarks/600.workflows/630.parallel-sleep/input.py index 092981d7a..ef9fc42a0 100644 --- a/benchmarks/600.workflows/630.parallel-sleep/input.py +++ b/benchmarks/600.workflows/630.parallel-sleep/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + #threads-duration size_generators = { 'test' : (2, 2), @@ -32,3 +34,26 @@ def buckets_count(): def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): count, sleep = size_generators[size] return { 'count': count, 'sleep': sleep } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + if output is None: + return "Output is None" + + expected_count = input_config['count'] + + if not isinstance(output, dict) or "buffer" not in output: + return f"Expected output dict with 'buffer' key, got: {output!r}" + + results = output["buffer"] + if not isinstance(results, list): + return f"Expected 'buffer' to be a list, got {type(results).__name__}" + + if len(results) != expected_count: + return f"Expected {expected_count} results, got {len(results)}" + + for i, item in enumerate(results): + if item != "ok": + return f"Expected element {i} to be 'ok', got {item!r}" + + return None diff --git 
a/benchmarks/600.workflows/631.parallel-download/input.py b/benchmarks/600.workflows/631.parallel-download/input.py index 2a8206519..e427949b9 100644 --- a/benchmarks/600.workflows/631.parallel-download/input.py +++ b/benchmarks/600.workflows/631.parallel-download/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + import os from random import shuffle @@ -11,7 +13,7 @@ '2e20': (20, 2**20), '2e25': (20, 2**25), '2e26': (20, 2**26), - '2e27': (20, 2**27) + '2e27': (20, 2**27) } @@ -47,3 +49,28 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck # os.remove(data_path) return { 'count': count, "bucket": benchmarks_bucket, "blob": input_buckets[0] + '/' + data_name} + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + expected_count = input_config.get("count") + if expected_count is None: + return "Input config missing 'count' field" + + if output is None: + return "Output is None" + + if not isinstance(output, dict) or "buffer" not in output: + return f"Expected output dict with 'buffer' key, got: {output!r}" + + results = output["buffer"] + if not isinstance(results, list): + return f"Expected 'buffer' to be a list, got {type(results).__name__}" + + if len(results) != expected_count: + return f"Expected {expected_count} results, got {len(results)}" + + for i, result in enumerate(results): + if result != "ok": + return f"Result at index {i} is {result!r}, expected 'ok'" + + return None diff --git a/benchmarks/600.workflows/640.selfish-detour/input.py b/benchmarks/600.workflows/640.selfish-detour/input.py index 69d06fcd5..b1ec20540 100644 --- a/benchmarks/600.workflows/640.selfish-detour/input.py +++ b/benchmarks/600.workflows/640.selfish-detour/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. 
+ size_generators = { 'test' : 100, 'small': 5000, @@ -10,3 +12,12 @@ def buckets_count(): def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): num_samples = size_generators[size] return { 'num_samples': num_samples } + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + if output is None: + return "Output is None" + + if output != "ok": + return f"Expected output to be exactly 'ok', got: {output!r}" + + return None diff --git a/benchmarks/600.workflows/650.vid/input.py b/benchmarks/600.workflows/650.vid/input.py index c1515f901..080342ed6 100644 --- a/benchmarks/600.workflows/650.vid/input.py +++ b/benchmarks/600.workflows/650.vid/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + import os size_generators = { @@ -28,3 +30,63 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck "model_weights": files[0], "model_config": files[1] } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage=None) -> str | None: + if output is None: + return "Output is None" + + if not isinstance(output, dict): + return f"Expected output to be a dict, got {type(output).__name__}" + + # __request_id is injected by the workflow engine, not a frame entry + frame_entries = {k: v for k, v in output.items() if k != "__request_id"} + if len(frame_entries) == 0: + return "Output dict is empty, expected at least one frame entry" + + for frame_name, detections in frame_entries.items(): + if not isinstance(detections, list): + return ( + f"Expected detections for frame '{frame_name}' to be a list, " + f"got {type(detections).__name__}" + ) + + for i, detection in enumerate(detections): + if not isinstance(detection, dict): + return ( + f"Detection {i} for frame '{frame_name}' is not a dict, " + f"got {type(detection).__name__}" + ) + + if 
"class" not in detection: + return f"Detection {i} for frame '{frame_name}' is missing 'class' key" + + if not isinstance(detection["class"], str): + return ( + f"Detection {i} for frame '{frame_name}' has non-string 'class': " + f"{type(detection['class']).__name__}" + ) + + if "score" not in detection: + return f"Detection {i} for frame '{frame_name}' is missing 'score' key" + + if not isinstance(detection["score"], (int, float)): + return ( + f"Detection {i} for frame '{frame_name}' has non-numeric 'score': " + f"{type(detection['score']).__name__}" + ) + + if not (0.0 <= detection["score"] <= 1.0): + return ( + f"Detection {i} for frame '{frame_name}' has score out of range " + f"[0, 1]: {detection['score']}" + ) + + # Handler filters detections at score > 0.5; any detection in output must pass this threshold + if detection["score"] <= 0.5: + return ( + f"Detection {i} for frame '{frame_name}' has score {detection['score']:.4f} " + f"<= 0.5 (handler threshold)" + ) + + return None diff --git a/benchmarks/600.workflows/660.map-reduce/input.py b/benchmarks/600.workflows/660.map-reduce/input.py index 36b2bcc8f..7348cd05b 100644 --- a/benchmarks/600.workflows/660.map-reduce/input.py +++ b/benchmarks/600.workflows/660.map-reduce/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. 
+ import os import random @@ -33,3 +35,67 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck "n_mappers": n_mappers, "output_bucket": output_buckets[0] } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage=None) -> str | None: + expected_words = {"cat", "dog", "bird", "horse", "pig"} + + if output is None: + return "Output is None" + + # Output is wrapped: {"list": [...], "__request_id": "..."} + if isinstance(output, dict): + if "list" in output: + output = output["list"] + else: + return f"Expected output dict to have 'list' key, got keys: {list(output.keys())}" + + if not isinstance(output, list): + return f"Expected output to be a list, got {type(output).__name__}" + + seen_words = set() + for i, entry in enumerate(output): + if not isinstance(entry, dict): + return f"Entry {i} is not a dict: {type(entry).__name__}" + + if "word" not in entry: + return f"Entry {i} is missing 'word' key" + if "count" not in entry: + return f"Entry {i} is missing 'count' key" + + word = entry["word"] + count = entry["count"] + + if not isinstance(word, str): + return f"Entry {i} 'word' is not a string: {type(word).__name__}" + if not isinstance(count, int): + return f"Entry {i} 'count' is not an int: {type(count).__name__}" + + if word not in expected_words: + return f"Entry {i} has unexpected word: '{word}'" + + if count <= 0: + return f"Entry {i} has non-positive count: {count}" + + seen_words.add(word) + + if seen_words != expected_words: + missing = expected_words - seen_words + return f"Missing words in output: {missing}" + + word_counts = {entry["word"]: entry["count"] for entry in output if entry["word"] in expected_words} + + # All words appear the same number of times (input is mult * ["cat","dog","bird","horse","pig"]) + counts = list(word_counts.values()) + if len(set(counts)) != 1: + return f"Word counts are not equal — expected uniform distribution, got: {word_counts}" + + per_word = 
counts[0] + total = per_word * len(expected_words) + + # Total must be mult * 5 words; verify it matches the n_mappers-based expectation + # n_mappers is in input_config; total words = mult * 5, which must be divisible by 5 + if total % len(expected_words) != 0: + return f"Total word count {total} is not divisible by {len(expected_words)}" + + return None diff --git a/benchmarks/600.workflows/670.auth/input.py b/benchmarks/600.workflows/670.auth/input.py index a0807d12b..2dba2ea27 100644 --- a/benchmarks/600.workflows/670.auth/input.py +++ b/benchmarks/600.workflows/670.auth/input.py @@ -1,3 +1,6 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + +import base64 import random size_generators = { @@ -18,4 +21,46 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck return { "message": msg, "token": "allow" - } \ No newline at end of file + } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + if output is None: + return "Output is None" + + if "response" not in output: + return "Missing 'response' key in output" + + response = output["response"] + + if response == "unauthorized": + return "Response is 'unauthorized', expected encrypted message" + + try: + decoded = base64.b64decode(response) + except Exception as e: + return f"Response is not a valid base64 string: {e}" + + message = input_config["message"] + expected_length = len(message) + if len(decoded) != expected_length: + return ( + f"Decoded ciphertext length ({len(decoded)}) does not match " + f"input message length ({expected_length})" + ) + + # Decrypt and verify the ciphertext matches the original plaintext. + # The handler uses AES-CTR with a fixed key and counter starting at 0. 
+ import pyaes + KEY = "6368616e676520746869732070617373".encode("utf-8") + counter = pyaes.Counter(initial_value=0) + aes = pyaes.AESModeOfOperationCTR(KEY, counter=counter) + decrypted = aes.decrypt(decoded) + if isinstance(message, str): + message_bytes = message.encode("utf-8") + else: + message_bytes = message + if decrypted != message_bytes: + return "Decrypted ciphertext does not match original input message" + + return None diff --git a/benchmarks/600.workflows/680.excamera/input.py b/benchmarks/600.workflows/680.excamera/input.py index 687a2eabc..8d01ea743 100644 --- a/benchmarks/600.workflows/680.excamera/input.py +++ b/benchmarks/600.workflows/680.excamera/input.py @@ -1,3 +1,4 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. import random import os @@ -42,3 +43,76 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck "batch_size": batch_size, "quality": 1 } + + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage=None) -> str | None: + if output is None: + return "Output is None" + + # Output structure: {"segments": [list of batch dicts], "__request_id": "..."} + # Each batch dict has: {"segments": [...y4m names...], "benchmark_bucket": "...", "output_bucket": "...", ...} + if isinstance(output, dict): + if "segments" not in output: + return f"Expected 'segments' key in output, got keys: {list(output.keys())}" + items = output["segments"] + if not isinstance(items, list): + return f"Expected 'segments' to be a list, got {type(items).__name__}" + elif isinstance(output, list): + items = output + else: + return f"Expected output to be a dict or list, got {type(output).__name__}" + + if len(items) == 0: + return "Output 'segments' list is empty" + + for i, item in enumerate(items): + if not isinstance(item, dict): + return f"Segment batch {i} is not a dict, got {type(item).__name__}" + + if "benchmark_bucket" not in item: + return f"Segment batch {i} 
missing 'benchmark_bucket'" + if "output_bucket" not in item: + return f"Segment batch {i} missing 'output_bucket'" + if "segments" not in item: + return f"Segment batch {i} missing 'segments'" + + segs = item["segments"] + if not isinstance(segs, list) or len(segs) == 0: + return f"Segment batch {i} 'segments' must be a non-empty list" + + if "quality" not in item: + return f"Segment batch {i} missing 'quality'" + if item["quality"] != input_config.get("quality"): + return f"Segment batch {i} quality {item['quality']} != input quality {input_config.get('quality')}" + + for seg in segs: + if not isinstance(seg, str) or not seg.endswith(".y4m"): + return f"Segment batch {i} contains non-.y4m segment: {seg!r}" + + import math + + input_segs = input_config.get("segments", []) + expected_segs = len(input_segs) + + # Every input segment must appear exactly once across all output batches + if expected_segs > 0: + output_segs = [seg for item in items for seg in item["segments"]] + if len(output_segs) != expected_segs: + return f"Total segments in output ({len(output_segs)}) != input segments ({expected_segs})" + output_segs_set = set(output_segs) + input_segs_set = set(input_segs) + missing = input_segs_set - output_segs_set + if missing: + return f"Segments missing from output: {sorted(missing)}" + extra = output_segs_set - input_segs_set + if extra: + return f"Unexpected segments in output: {sorted(extra)}" + + # Batch count should be ceil(n_segments / batch_size) + batch_size = input_config.get("batch_size") + if batch_size and expected_segs > 0: + expected_batches = math.ceil(expected_segs / batch_size) + if len(items) != expected_batches: + return f"Expected {expected_batches} batches (ceil({expected_segs}/{batch_size})), got {len(items)}" + + return None diff --git a/benchmarks/600.workflows/690.ml/input.py b/benchmarks/600.workflows/690.ml/input.py index d3f930bc7..a8a29e5dd 100644 --- a/benchmarks/600.workflows/690.ml/input.py +++ 
b/benchmarks/600.workflows/690.ml/input.py @@ -1,3 +1,5 @@ +# Copyright 2020-2025 ETH Zurich and the SeBS authors. All rights reserved. + size_generators = { "test" : (1, 100, 5), "small": (2, 500, 1024), @@ -11,6 +13,8 @@ {"name": "AdaBoostClassifier"} ] +VALID_CLASSIFIER_NAMES = ["SVC", "RandomForestClassifier", "AdaBoostClassifier"] + def buckets_count(): return (0, 1) @@ -23,3 +27,54 @@ def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buck "n_samples": n_samples, "n_features": n_features } + +def validate_output(data_dir: str | None, input_config: dict, output: dict, language: str, storage = None) -> str | None: + if output is None: + return "Output is None" + + # Step Functions returns {"schedules": [...], "__request_id": "..."} + if isinstance(output, dict): + if "schedules" not in output: + return f"Expected 'schedules' key in output dict, got keys: {list(output.keys())}" + output = output["schedules"] + + if not isinstance(output, list): + return f"Expected output to be a list, got {type(output).__name__}" + + input_classifiers = input_config["classifiers"] + expected_count = len(input_classifiers) + if len(output) != expected_count: + return f"Expected {expected_count} results, got {len(output)}" + + # Build expected names in order from input + expected_names = [c["name"] for c in input_classifiers] + + for i, entry in enumerate(output): + if not isinstance(entry, dict): + return f"Entry {i} is not a dict, got {type(entry).__name__}" + + if "name" not in entry: + return f"Entry {i} is missing 'name' field" + if "score" not in entry: + return f"Entry {i} is missing 'score' field" + + name = entry["name"] + score = entry["score"] + + if not isinstance(name, str): + return f"Entry {i} 'name' is not a string, got {type(name).__name__}" + + if name not in VALID_CLASSIFIER_NAMES: + return f"Entry {i} has invalid classifier name '{name}', expected one of {VALID_CLASSIFIER_NAMES}" + + # Output classifier name must match the corresponding 
input classifier + if name != expected_names[i]: + return f"Entry {i} classifier name '{name}' does not match input classifier '{expected_names[i]}'" + + if not isinstance(score, (int, float)): + return f"Entry {i} 'score' is not a float, got {type(score).__name__}" + + if score < 0.0 or score > 1.0: + return f"Entry {i} 'score' is {score}, expected value between 0.0 and 1.0" + + return None diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 8a5d3c0aa..ee8c7673f 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -1400,17 +1400,20 @@ def ensure_image(name: str) -> None: path=os.path.abspath(output_dir) ) ) + build_env = { + "CONTAINER_UID": str(os.getuid()), + "CONTAINER_GID": str(os.getgid()), + "CONTAINER_USER": "docker_user", + "APP": self.benchmark, + "PLATFORM": self._deployment_name.upper(), + "TARGET_ARCHITECTURE": self._experiment_config._architecture, + } + if os.path.exists(os.path.join(output_dir, "package.sh")): + build_env["SCRIPT_FILE"] = "package.sh" container = self._docker_client.containers.run( "{}:{}".format(repo_name, image_name), volumes=volumes, - environment={ - "CONTAINER_UID": str(os.getuid()), - "CONTAINER_GID": str(os.getgid()), - "CONTAINER_USER": "docker_user", - "APP": self.benchmark, - "PLATFORM": self._deployment_name.upper(), - "TARGET_ARCHITECTURE": self._experiment_config._architecture, - }, + environment=build_env, remove=False, detach=True, ) diff --git a/sebs/regression.py b/sebs/regression.py index f83f782f2..57e8132e1 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -70,6 +70,7 @@ # Workflow benchmarks available for regression testing benchmarks_workflows = [ + "610.gen", "620.func-invo", "630.parallel-sleep", "631.parallel-download", @@ -415,7 +416,23 @@ def test(self): failure = True logging_wrapper.error(f"{benchmark_name} workflow execution failed") else: - logging_wrapper.info(f"{benchmark_name} workflow execution succeeded") + output = ret.output + storage = ( + 
deployment_client.system_resources.get_storage() + if benchmark.uses_storage + else None + ) + error = benchmark.validate_output(input_config, output, storage) + if error is not None: + failure = True + logging_wrapper.error( + f"{benchmark_name} workflow output validation failed," + f" reason: {error}" + ) + else: + logging_wrapper.info( + f"{benchmark_name} workflow execution succeeded" + ) except RuntimeError: failure = True logging_wrapper.error(f"{benchmark_name} workflow invocation raised exception") @@ -425,7 +442,7 @@ def test(self): f"_{architecture}_{deployment_type}.json" ) with open(os.path.join(self.client.output_dir, json_filename), "w") as f: - json.dump({"output": ret.output if not failure else {}}, f, indent=2) + json.dump({"output": ret.output}, f, indent=2) deployment_client.shutdown() From 7e7c643321c2ae15c0ff07050ec5aedaa18ec48d Mon Sep 17 00:00:00 2001 From: laurin Date: Wed, 10 Jun 2026 13:55:11 +0200 Subject: [PATCH 222/230] feat: Update postprocess method to return a single string for final result and enhance async_invoke to parse execution result --- sebs/gcp/generator.py | 2 +- sebs/gcp/triggers.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sebs/gcp/generator.py b/sebs/gcp/generator.py index 8149704e0..26943eee7 100644 --- a/sebs/gcp/generator.py +++ b/sebs/gcp/generator.py @@ -27,7 +27,7 @@ def __init__( self._ordered_states: List[State] = [] def postprocess(self, payloads: List[dict]) -> dict: - payloads.append({"final": {"return": ["${res}"]}}) + payloads.append({"final": {"return": "${res}"}}) definition = {"main": {"params": ["res"], "steps": payloads}} diff --git a/sebs/gcp/triggers.py b/sebs/gcp/triggers.py index 0c90c6a16..8db6c064a 100644 --- a/sebs/gcp/triggers.py +++ b/sebs/gcp/triggers.py @@ -304,6 +304,9 @@ def sync_invoke(self, payload: dict) -> ExecutionResult: gcp_result.stats.failure = True return gcp_result + if execution.result: + gcp_result.output = json.loads(execution.result) + return 
gcp_result def async_invoke(self, payload: dict): From 94d53a533485e818ef8d56e678bed0233e429fe8 Mon Sep 17 00:00:00 2001 From: laurin Date: Mon, 15 Jun 2026 11:30:32 +0200 Subject: [PATCH 223/230] feat: Add Cloudflare Workflows support and related components - Enhance CloudflareContainersDeployment to support additional benchmarks. - Introduce CloudflareWorkflowGenerator for generating TypeScript code from FSM definitions. - Create a new wrangler-workflow.toml template for Cloudflare Workflows. - Implement WorkflowLibraryTrigger for invoking Cloudflare Workflows via HTTP. - Add CloudflareWorkflow class to represent deployed workflows with dispatcher and orchestrator. - Create CloudflareTestSequenceWorkflows for testing workflow benchmarks on Cloudflare. --- .../cloudflare/nodejs/container/worker.js | 40 +- .../cloudflare/python/container/handler.py | 28 +- .../cloudflare/python/container/storage.py | 37 +- .../wrappers/cloudflare/python/handler.py | 32 +- .../wrappers/cloudflare/python/storage.py | 18 +- sebs/cloudflare/cloudflare.py | 286 +++++++++- sebs/cloudflare/containers.py | 4 + sebs/cloudflare/generator.py | 491 ++++++++++++++++++ .../templates/wrangler-workflow.toml | 17 + sebs/cloudflare/triggers.py | 219 +++++++- sebs/cloudflare/workflow.py | 81 +++ sebs/regression.py | 57 +- 12 files changed, 1267 insertions(+), 43 deletions(-) create mode 100644 sebs/cloudflare/generator.py create mode 100644 sebs/cloudflare/templates/wrangler-workflow.toml create mode 100644 sebs/cloudflare/workflow.py diff --git a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js index bd47ea538..87b3226df 100644 --- a/benchmarks/wrappers/cloudflare/nodejs/container/worker.js +++ b/benchmarks/wrappers/cloudflare/nodejs/container/worker.js @@ -331,8 +331,21 @@ async function handleR2Request(request, env) { } if (url.pathname === '/r2/download') { - const object = await env.R2.get(key); - + // Support optional 
byte-range via Range header (e.g. "bytes=0-1023") + const rangeHeader = request.headers.get('Range'); + let r2Options = undefined; + let rangeStart, rangeEnd; + if (rangeHeader) { + const match = rangeHeader.match(/^bytes=(\d+)-(\d+)$/); + if (match) { + rangeStart = parseInt(match[1], 10); + rangeEnd = parseInt(match[2], 10); + r2Options = { range: { offset: rangeStart, length: rangeEnd - rangeStart + 1 } }; + } + } + + const object = await env.R2.get(key, r2Options); + if (!object) { return new Response(JSON.stringify({ error: 'Object not found' @@ -341,15 +354,20 @@ async function handleR2Request(request, env) { headers: { 'Content-Type': 'application/json' } }); } - - // Return the object data - return new Response(object.body, { - headers: { - 'Content-Type': object.httpMetadata?.contentType || 'application/octet-stream', - 'Content-Length': object.size.toString() - } - }); - + + const status = rangeHeader ? 206 : 200; + const headers = { + 'Content-Type': object.httpMetadata?.contentType || 'application/octet-stream', + }; + if (rangeHeader && rangeStart !== undefined) { + const totalSize = object.size ?? (rangeEnd - rangeStart + 1); + headers['Content-Range'] = `bytes ${rangeStart}-${rangeEnd}/${totalSize}`; + headers['Content-Length'] = String(rangeEnd - rangeStart + 1); + } else { + headers['Content-Length'] = object.size?.toString() ?? 
''; + } + return new Response(object.body, { status, headers }); + } else if (url.pathname === '/r2/upload') { // Upload to R2 — stream request.body directly to avoid buffering large payloads in Worker memory console.log(`[worker.js /r2/upload] bucket=${bucket}, key=${key}`); diff --git a/benchmarks/wrappers/cloudflare/python/container/handler.py b/benchmarks/wrappers/cloudflare/python/container/handler.py index 8ae89e6c0..9a9732f41 100644 --- a/benchmarks/wrappers/cloudflare/python/container/handler.py +++ b/benchmarks/wrappers/cloudflare/python/container/handler.py @@ -50,8 +50,12 @@ def patched_urlopen(url, data=None, timeout=None, **kwargs): urllib.request.urlopen = patched_urlopen print("Monkey-patched urllib.request.urlopen to add User-Agent header") -# Import the benchmark handler function -from function.function import handler as benchmark_handler +# Import the default benchmark handler function. +# For workflow dispatch mode, individual function modules are imported dynamically. +try: + from function.function import handler as benchmark_handler +except ImportError: + benchmark_handler = None # Import storage and nosql if available try: @@ -121,14 +125,30 @@ def handle_request(self): except ValueError: event[key] = value + # Workflow dispatch mode: if the event contains a "function" key, + # route to that specific module instead of the default handler. 
+ if 'function' in event: + import importlib + func_name = event['function'] + func_input = event.get('input', {}) + if isinstance(func_input, dict): + func_input = {**func_input, 'request-id': req_id} + module = importlib.import_module(f"function.{func_name}") + func_result = module.handler(func_input) + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(func_result).encode('utf-8')) + return + # Add request metadata income_timestamp = datetime.datetime.now().timestamp() event['request-id'] = req_id event['income-timestamp'] = income_timestamp - + # Measure execution time begin = datetime.datetime.now().timestamp() - + # Call the benchmark function result = benchmark_handler(event) diff --git a/benchmarks/wrappers/cloudflare/python/container/storage.py b/benchmarks/wrappers/cloudflare/python/container/storage.py index dda903f06..bd6f6dafb 100644 --- a/benchmarks/wrappers/cloudflare/python/container/storage.py +++ b/benchmarks/wrappers/cloudflare/python/container/storage.py @@ -169,17 +169,16 @@ def download_stream(self, bucket: str, key: str) -> bytes: print(f"R2 download error: {e}") raise RuntimeError(f"Failed to download from R2: {e}") - def upload(self, bucket, key, filepath): - """Upload file from disk with unique key generation""" - # Generate unique key to avoid conflicts - unique_key = self.unique_name(key) + def upload(self, bucket, key, filepath, unique_name=True): + """Upload file from disk.""" + upload_key = self.unique_name(key) if unique_name else key with open(filepath, 'rb') as f: data = f.read() try: - self._upload_bytes(unique_key, data) + self._upload_bytes(upload_key, data) except Exception as e: raise RuntimeError(f"Failed to upload to R2: {e}") - return unique_key + return upload_key def _upload_with_key(self, bucket: str, key: str, data): """Upload data to R2 via worker proxy with exact key (internal method)""" @@ -211,7 +210,31 @@ def download(self, bucket, key, 
filepath): os.makedirs(os.path.dirname(filepath), exist_ok=True) with open(filepath, 'wb') as f: f.write(data) - + + def download_within_range(self, bucket: str, key: str, start_bytes: int, end_bytes: int) -> str: + """Download a byte range of an object from R2 via the worker proxy.""" + if not self.r2_enabled: + raise RuntimeError("R2 not configured") + + if not storage.worker_url: + raise RuntimeError("Worker URL not set - cannot access R2") + + params = urllib.parse.urlencode({'bucket': bucket, 'key': key}) + url = f"{storage.worker_url}/r2/download?{params}" + + req = urllib.request.Request(url) + req.add_header('Range', f'bytes={start_bytes}-{end_bytes}') + + try: + with urllib.request.urlopen(req) as response: + return response.read().decode('utf-8') + except urllib.error.HTTPError as e: + if e.code in (206, 200): + return e.read().decode('utf-8') + raise RuntimeError(f"Failed to download range from R2: {e}") + except Exception as e: + raise RuntimeError(f"Failed to download range from R2: {e}") + def list_directory(self, bucket, prefix): """List all object keys with a given prefix.""" if not storage.worker_url: diff --git a/benchmarks/wrappers/cloudflare/python/handler.py b/benchmarks/wrappers/cloudflare/python/handler.py index 65376c6df..45d343607 100644 --- a/benchmarks/wrappers/cloudflare/python/handler.py +++ b/benchmarks/wrappers/cloudflare/python/handler.py @@ -63,24 +63,34 @@ async def fetch2(self, request, env): except IndexError: event[param[0]] = None - ## note: time fixed in worker - income_timestamp = datetime.datetime.now().timestamp() - - event['request-id'] = req_id - event['income-timestamp'] = income_timestamp - - - from function import storage storage.storage.init_instance(self) - if hasattr(self.env, 'NOSQL_STORAGE_DATABASE'): from function import nosql nosql.nosql.init_instance(self) + # Workflow dispatch mode: route to a specific function module + if 'function' in event: + import importlib + func_name = event['function'] + func_input = 
event.get('input', {}) + if isinstance(func_input, dict): + func_input = {**func_input, 'request-id': req_id} + module = importlib.import_module(f"function.{func_name}") + func_result = module.handler(func_input) + return Response(json.dumps(func_result), + headers={"Content-Type": "application/json"}) + + ## note: time fixed in worker + income_timestamp = datetime.datetime.now().timestamp() + + event['request-id'] = req_id + event['income-timestamp'] = income_timestamp + + print("event:", event) @@ -98,7 +108,7 @@ async def fetch2(self, request, env): log_data['measurement'] = ret['measurement'] else: log_data['measurement'] = {} - + # Add memory usage to measurement (if resource module is available) if HAS_RESOURCE: memory_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0 @@ -106,7 +116,7 @@ async def fetch2(self, request, env): else: # Pyodide doesn't support resource module log_data['measurement']['memory_used_mb'] = 0.0 - + if 'logs' in event: log_data['time'] = 0 diff --git a/benchmarks/wrappers/cloudflare/python/storage.py b/benchmarks/wrappers/cloudflare/python/storage.py index 30b836bea..a4c734a54 100644 --- a/benchmarks/wrappers/cloudflare/python/storage.py +++ b/benchmarks/wrappers/cloudflare/python/storage.py @@ -39,12 +39,16 @@ def init_instance(entry: WorkerEntrypoint): storage.instance.entry_env = entry.env storage.instance.written_files = set() - def upload(self, bucket, key, filepath): + def upload(self, bucket, key, filepath, unique_name=True): if filepath in self.written_files: filepath = "/tmp" + os.path.abspath(filepath) with open(filepath, "rb") as f: - unique_key = self.upload_stream(bucket, key, f.read()) - return unique_key + data = f.read() + if unique_name: + upload_key = self.upload_stream(bucket, key, data) + else: + upload_key = run_sync(self._upload_exact(bucket, key, data)) + return upload_key def download(self, bucket, key, filepath): data = self.download_stream(bucket, key) @@ -73,6 +77,14 @@ def 
download_directory(self, bucket, prefix, out_path): self.download(bucket, file_name, os.path.join(out_path, file_name)) return + async def _upload_exact(self, bucket, key, data): + if hasattr(data, 'getvalue'): + data = data.getvalue() + data_js = to_js(data) if isinstance(data, bytes) else str(data) + bobj = self.get_bucket(bucket) + await bobj.put(key, data_js) + return key + def upload_stream(self, bucket, key, data): return run_sync(self.aupload_stream(bucket, key, data)) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index c8eef2287..04a9616af 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -110,7 +110,10 @@ class Cloudflare(System): # Benchmark IDs are matched against the numeric prefix of the benchmark name # (e.g. "110" matches "110.dynamic-html"). SUPPORTED_BENCHMARKS: Dict[Tuple[str, bool], Optional[List[str]]] = { - ("python", False): ["110", "120", "130", "210", "311", "501", "502", "503"], + ("python", False): [ + "110", "120", "130", "210", "311", "501", "502", "503", + "610", "620", "630", "631", # lightweight workflows (Pyodide-compatible) + ], ("nodejs", False): ["110", "120", "130", "311"], ("python", True): None, # all benchmarks supported ("nodejs", True): ["110", "120", "130", "210", "311"], @@ -1022,9 +1025,12 @@ def default_function_name(self, code_package: Benchmark, resources=None) -> str: Returns: Default function name """ - # Cloudflare Worker names must be lowercase and can contain hyphens + # Cloudflare Worker names must be lowercase and can contain hyphens. + # Abbreviate language names to keep names under the 54-char limit for workers.dev. 
+ lang_abbrev = {"python": "py", "nodejs": "js", "java": "java", "cpp": "cpp"} + lang = lang_abbrev.get(code_package.language_name, code_package.language_name) name = ( - f"{code_package.benchmark}-{code_package.language_name}-" + f"{code_package.benchmark}-{lang}" f"{code_package.language_version.replace('.', '')}" ).lower() if code_package.language_variant != "default": @@ -1189,6 +1195,280 @@ def download_metrics( avg_wall_ms = sum(wall_times) / len(wall_times) / 1000.0 self.logging.info(f"Average wall time: {avg_wall_ms:.2f} ms") + @staticmethod + def workflow_type() -> "Type[Function]": + """Return the Workflow subclass used by this platform.""" + from sebs.cloudflare.workflow import CloudflareWorkflow + + return CloudflareWorkflow + + def create_workflow(self, code_package: Benchmark, workflow_name: str) -> Function: + """Deploy a Cloudflare Workflow: dispatcher + orchestrator. + + 1. Deploys a dispatcher worker/container with all task functions. + 2. Generates a TypeScript orchestrator from definition.json. + 3. Deploys the orchestrator as a Cloudflare Workflow. + + Args: + code_package: Benchmark containing the workflow code. + workflow_name: Name for the workflow. + + Returns: + CloudflareWorkflow instance with trigger attached. + """ + import os + import tempfile + + from sebs.cloudflare.generator import CloudflareWorkflowGenerator + from sebs.cloudflare.triggers import WorkflowLibraryTrigger + from sebs.cloudflare.workflow import CloudflareWorkflow + + container_deployment = code_package.system_variant.is_container + workflow_name = self.format_function_name(workflow_name, container_deployment) + account_id = self.config.credentials.account_id + + if not account_id: + raise RuntimeError("Cloudflare account ID is required to create workflows") + + # --- Step 1: Deploy the dispatcher (single worker/container with all functions) --- + # Cloudflare workers.dev subdomains cap at 54 chars. 
+ # Cap the base name at 43 chars so that base + "-dispatcher" (11 chars) stays ≤ 54. + max_base_len = 43 + if len(workflow_name) > max_base_len: + workflow_name = workflow_name[:max_base_len].rstrip("-") + dispatcher_name = workflow_name + "-dispatcher" + self.logging.info(f"Deploying workflow dispatcher: {dispatcher_name}") + container_uri = code_package._container_uri if container_deployment else None + dispatcher = self.create_function( + code_package, + dispatcher_name, + code_package.system_variant, + container_uri, + ) + + # --- Step 2: Generate orchestrator TypeScript from definition.json --- + definition_path = os.path.join(code_package.benchmark_path, "definition.json") + if not os.path.exists(definition_path): + raise ValueError(f"No workflow definition found at {definition_path}") + + # Container workers can't be called via service bindings from inside Workflow steps. + # Use the dispatcher's workers.dev URL directly for container-backed dispatchers. + if container_deployment: + dispatcher_url = self._build_workers_dev_url(dispatcher_name, account_id) + gen = CloudflareWorkflowGenerator(dispatcher_url=dispatcher_url) + else: + gen = CloudflareWorkflowGenerator(dispatcher_binding="DISPATCHER") + gen.parse(definition_path) + ts_source = gen.generate() + + # --- Step 3: Package and deploy the orchestrator --- + orchestrator_name = workflow_name + orchestrator_dir = tempfile.mkdtemp(prefix="sebs-workflow-orchestrator-") + + # Write generated workflow TypeScript + ts_path = os.path.join(orchestrator_dir, "workflow.ts") + with open(ts_path, "w") as f: + f.write(ts_source) + + # Write minimal package.json + package_json = { + "name": orchestrator_name, + "type": "module", + "dependencies": {"@cloudflare/workers-types": "*"}, + } + with open(os.path.join(orchestrator_dir, "package.json"), "w") as f: + import json as json_mod + + json_mod.dump(package_json, f, indent=2) + + # Generate wrangler.toml — omit service binding for container dispatchers + 
self._generate_workflow_wrangler_toml( + orchestrator_name, + orchestrator_dir, + account_id, + dispatcher_name if not container_deployment else None, + ) + + # Deploy the orchestrator via wrangler + env = {} + if self.config.credentials.api_token: + env["CLOUDFLARE_API_TOKEN"] = self.config.credentials.api_token + elif self.config.credentials.email and self.config.credentials.api_key: + env["CLOUDFLARE_EMAIL"] = self.config.credentials.email + env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key + env["CLOUDFLARE_ACCOUNT_ID"] = account_id + + cli = self._workers_deployment._get_cli() + container_package_path = f"/tmp/workers/{orchestrator_name}" + cli.upload_package(orchestrator_dir, container_package_path) + + self.logging.info(f"Deploying workflow orchestrator: {orchestrator_name}") + cli.wrangler_deploy(container_package_path, env=env) + + # Build orchestrator URL and wait for readiness + orchestrator_url = self._build_workers_dev_url(orchestrator_name, account_id) + self._wait_for_worker_ready(orchestrator_name, orchestrator_url) + + # --- Step 4: Create workflow object and attach trigger --- + function_cfg = FunctionConfig.from_benchmark(code_package) + workflow = CloudflareWorkflow( + name=orchestrator_name, + functions=[dispatcher], + benchmark=code_package.benchmark, + code_package_hash=code_package.hash, + cfg=function_cfg, + account_id=account_id, + dispatcher_name=dispatcher_name, + orchestrator_url=orchestrator_url, + ) + + trigger = WorkflowLibraryTrigger(orchestrator_name, orchestrator_url) + trigger.logging_handlers = self.logging_handlers + workflow.add_trigger(trigger) + + self.logging.info(f"Workflow {orchestrator_name} deployed successfully") + return workflow + + def update_workflow(self, workflow: Function, code_package: Benchmark): + """Update an existing Cloudflare Workflow deployment. + + Re-deploys the dispatcher and regenerates/re-deploys the orchestrator. + + Args: + workflow: Existing CloudflareWorkflow instance. 
+ code_package: Updated benchmark code package. + """ + import os + import tempfile + + from sebs.cloudflare.generator import CloudflareWorkflowGenerator + from sebs.cloudflare.workflow import CloudflareWorkflow + + workflow = cast(CloudflareWorkflow, workflow) + account_id = workflow.account_id + + # Update the dispatcher + self.logging.info(f"Updating workflow dispatcher: {workflow.dispatcher_name}") + update_container_uri = ( + code_package._container_uri if code_package.system_variant.is_container else None + ) + dispatcher = self.create_function( + code_package, + workflow.dispatcher_name, + code_package.system_variant, + update_container_uri, + ) + workflow.functions = [dispatcher] + + # Regenerate and redeploy orchestrator + definition_path = os.path.join(code_package.benchmark_path, "definition.json") + if not os.path.exists(definition_path): + raise ValueError(f"No workflow definition found at {definition_path}") + + container_deployment = code_package.system_variant.is_container + if container_deployment: + dispatcher_url = self._build_workers_dev_url(workflow.dispatcher_name, account_id) + gen = CloudflareWorkflowGenerator(dispatcher_url=dispatcher_url) + else: + gen = CloudflareWorkflowGenerator(dispatcher_binding="DISPATCHER") + gen.parse(definition_path) + ts_source = gen.generate() + + orchestrator_dir = tempfile.mkdtemp(prefix="sebs-workflow-orchestrator-") + with open(os.path.join(orchestrator_dir, "workflow.ts"), "w") as f: + f.write(ts_source) + + package_json = { + "name": workflow.name, + "type": "module", + "dependencies": {"@cloudflare/workers-types": "*"}, + } + with open(os.path.join(orchestrator_dir, "package.json"), "w") as f: + import json as json_mod + + json_mod.dump(package_json, f, indent=2) + + self._generate_workflow_wrangler_toml( + workflow.name, + orchestrator_dir, + account_id, + workflow.dispatcher_name if not container_deployment else None, + ) + + env = {} + if self.config.credentials.api_token: + env["CLOUDFLARE_API_TOKEN"] 
= self.config.credentials.api_token + elif self.config.credentials.email and self.config.credentials.api_key: + env["CLOUDFLARE_EMAIL"] = self.config.credentials.email + env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key + env["CLOUDFLARE_ACCOUNT_ID"] = account_id + + cli = self._workers_deployment._get_cli() + container_package_path = f"/tmp/workers/{workflow.name}" + cli.upload_package(orchestrator_dir, container_package_path) + + self.logging.info(f"Redeploying workflow orchestrator: {workflow.name}") + cli.wrangler_deploy(container_package_path, env=env) + self._wait_for_worker_ready(workflow.name, workflow.orchestrator_url) + + self.logging.info(f"Workflow {workflow.name} updated successfully") + + def _generate_workflow_wrangler_toml( + self, + orchestrator_name: str, + package_dir: str, + account_id: str, + dispatcher_name: Optional[str], + ) -> str: + """Generate wrangler.toml for the workflow orchestrator from template. + + Args: + orchestrator_name: Name of the orchestrator worker. + package_dir: Directory to write the toml file. + account_id: Cloudflare account ID. + dispatcher_name: Name of the dispatcher worker (for service binding). + Pass None for container dispatchers — they are called via URL, not binding. + + Returns: + Path to the generated wrangler.toml. 
+ """ + try: + import tomllib + except ImportError: + import tomli as tomllib # type: ignore[no-redef] + try: + import tomli_w + except ImportError: + import toml as tomli_w # type: ignore[no-redef, import-untyped] + + from importlib.resources import files + + template_path = ( + files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-workflow.toml") + ) + with template_path.open("rb") as f: + config = tomllib.load(f) + + config["name"] = orchestrator_name + config["account_id"] = account_id + config["workflows"][0]["name"] = orchestrator_name + if dispatcher_name is not None: + config["services"][0]["service"] = dispatcher_name + else: + # Container dispatchers are called via URL; remove service binding + config.pop("services", None) + + toml_path = os.path.join(package_dir, "wrangler.toml") + try: + with open(toml_path, "wb") as f: + tomli_w.dump(config, f) + except TypeError: + with open(toml_path, "w") as f: + f.write(tomli_w.dumps(config)) + + self.logging.info(f"Generated workflow wrangler.toml at {toml_path}") + return toml_path + def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) -> Trigger: """ Create a trigger for a Cloudflare Worker. 
diff --git a/sebs/cloudflare/containers.py b/sebs/cloudflare/containers.py index adbd79ed4..b7b86843a 100644 --- a/sebs/cloudflare/containers.py +++ b/sebs/cloudflare/containers.py @@ -111,6 +111,10 @@ def generate_wrangler_toml( "411.image-recognition" in benchmark_name or "311.compression" in benchmark_name or "504.dna-visualisation" in benchmark_name + or "6100.1000-genome" in benchmark_name + or "6101.1000-genome-individuals" in benchmark_name + or "650.vid" in benchmark_name + or "680.excamera" in benchmark_name ): self.logging.warning("Using standard-4 instance type for high resource benchmark") config["containers"][0]["instance_type"] = "standard-4" diff --git a/sebs/cloudflare/generator.py b/sebs/cloudflare/generator.py new file mode 100644 index 000000000..8bbfe9ba4 --- /dev/null +++ b/sebs/cloudflare/generator.py @@ -0,0 +1,491 @@ +"""Cloudflare Workflows code generator. + +Translates SeBS FSM definitions (definition.json) into TypeScript source code +for a Cloudflare WorkflowEntrypoint class. The generated code uses a while/switch +state machine pattern where each FSM state maps to a switch case with step.do() calls. +""" + +from typing import Dict, List, Set, Union + +from sebs.faas.fsm import Generator, State, Task, Switch, Map, Parallel, Repeat, Loop + + +class CloudflareWorkflowGenerator(Generator): + """Generate TypeScript Workflow code from FSM definitions.""" + + def __init__( + self, + dispatcher_binding: str = "DISPATCHER", + dispatcher_url: str = "", + ): + """Initialize the Cloudflare Workflow generator. + + Args: + dispatcher_binding: Service binding name (used when dispatcher_url is empty). + dispatcher_url: Direct HTTP URL for the dispatcher (container workers can't be + called via service bindings from inside Workflow steps — use URL instead). 
+ """ + super().__init__() + self._dispatcher_binding = dispatcher_binding + self._dispatcher_url = dispatcher_url + + def generate(self) -> str: + """Generate the complete TypeScript workflow source file.""" + cases = [] + for state in self.states.values(): + case_code = self._encode_state_case(state) + cases.append(case_code) + + cases.append(' case "__end__":\n return state;') + + switch_body = "\n".join(cases) + + if self._dispatcher_url: + env_iface = " WORKFLOW: any;" + else: + env_iface = f" WORKFLOW: any;\n {self._dispatcher_binding}: Fetcher;" + + return f"""\ +import {{ WorkflowEntrypoint, WorkflowEvent, WorkflowStep }} from "cloudflare:workers"; + +interface Env {{ +{env_iface} +}} + +// Retry fetch on 502/503 or non-JSON responses (container cold-start / Durable Object reset). +// Any other non-2xx response is treated as a hard error and thrown immediately. +async function dispatchWithRetry(url: string, body: any, maxAttempts = 10): Promise {{ + for (let attempt = 1; attempt <= maxAttempts; attempt++) {{ + const r = await fetch(url, {{ + method: "POST", + headers: {{ "Content-Type": "application/json" }}, + body: JSON.stringify(body), + }}); + if (r.status === 503 || r.status === 502) {{ + const wait = Math.min(5000 * attempt, 30000); + await new Promise((res) => setTimeout(res, wait)); + continue; + }} + const text = await r.text(); + if (!r.ok) {{ + throw new Error(`Dispatcher returned HTTP ${{r.status}}: ${{text.slice(0, 200)}}`); + }} + try {{ + return JSON.parse(text); + }} catch (_) {{ + // Non-JSON response from container (e.g. proxy error during startup); retry. 
+ if (attempt < maxAttempts) {{ + const wait = Math.min(5000 * attempt, 30000); + await new Promise((res) => setTimeout(res, wait)); + continue; + }} + throw new Error(`Dispatcher returned non-JSON after ${{maxAttempts}} attempts: ${{text.slice(0, 200)}}`); + }} + }} + throw new Error(`Dispatcher unavailable after ${{maxAttempts}} attempts`); +}} + +export class BenchmarkWorkflow extends WorkflowEntrypoint {{ + async run(event: WorkflowEvent, step: WorkflowStep) {{ + let state = structuredClone(event.payload); + let current = "{self.root.name}"; + + while (true) {{ + switch (current) {{ +{switch_body} + default: + throw new Error(`Unknown state: ${{current}}`); + }} + }} + }} +}} + +export default {{ + async fetch(request: Request, env: Env): Promise {{ + const url = new URL(request.url); + if (request.method === "GET" && url.searchParams.has("id")) {{ + // Status poll: return current status without blocking. + const id = url.searchParams.get("id")!; + const instance = await env.WORKFLOW.get(id); + const status = await instance.status(); + return Response.json({{ + status: status.status, + output: (status as any).output ?? null, + error: (status as any).error ?? null, + }}); + }} + // Create a new workflow instance and return its ID immediately. 
+ const payload = await request.json(); + const instance = await env.WORKFLOW.create({{ params: payload }}); + return Response.json({{ id: instance.id }}, {{ status: 202 }}); + }}, +}}; +""" + + def _encode_state_case(self, state: State) -> str: + """Encode a single FSM state as a switch case block.""" + if isinstance(state, Task): + return self._encode_task_case(state) + elif isinstance(state, Switch): + return self._encode_switch_case(state) + elif isinstance(state, Map): + return self._encode_map_case(state) + elif isinstance(state, Parallel): + return self._encode_parallel_case(state) + elif isinstance(state, Repeat): + return self._encode_repeat_case(state) + elif isinstance(state, Loop): + return self._encode_loop_case(state) + else: + raise ValueError(f"Unknown state type: {type(state)}") + + def _encode_task_case(self, state: Task) -> str: + """Encode a Task state as a step.do() call.""" + next_state = f'"{state.next}"' if state.next else '"__end__"' + fetch_setup, fetch_result = self._make_fetch_call(state.func_name) + setup_line = f"\n {fetch_setup}" if fetch_setup else "" + var = state.name.replace("-", "_") + + is_terminal = next_state == '"__end__"' + if is_terminal: + merge_expr = f"{var}_result" + else: + merge_expr = ( + f'(typeof {var}_result === "object" && {var}_result !== null' + f' && !Array.isArray({var}_result))' + f"\n ? 
{{...state, ...{var}_result}} : {var}_result" + ) + + if state.failure: + return f"""\ + case "{state.name}": {{ + try {{ + const {var}_result = await step.do("{state.name}", async () => {{{setup_line} + return {fetch_result}; + }}); + state = {merge_expr}; + current = {next_state}; + }} catch (e) {{ + state = {{ ...state, _error: String(e) }}; + current = "{state.failure}"; + }} + break; + }}""" + else: + return f"""\ + case "{state.name}": {{ + const {var}_result = await step.do("{state.name}", async () => {{{setup_line} + return {fetch_result}; + }}); + state = {merge_expr}; + current = {next_state}; + break; + }}""" + + def _encode_switch_case(self, state: Switch) -> str: + """Encode a Switch state as if/else conditions.""" + conditions = [] + for case in state.cases: + var_path = self._js_var_path("state", case.var) + op = case.op + val = case.val if isinstance(case.val, (int, float)) else f'"{case.val}"' + conditions.append(f' if ({var_path} {op} {val}) {{ current = "{case.next}"; }}') + + default = state.default if state.default else "__end__" + else_clause = f' else {{ current = "{default}"; }}' + + body = "\n".join(conditions) + if len(conditions) > 1: + lines = [conditions[0]] + for c in conditions[1:]: + lines.append(" else " + c.strip()) + lines.append(else_clause) + body = "\n".join(lines) + else: + body = conditions[0] + "\n" + else_clause + + return f"""\ + case "{state.name}": {{ +{body} + break; + }}""" + + def _encode_map_case(self, state: Map) -> str: + """Encode a Map state as Promise.all with step.do() per item.""" + next_state = f'"{state.next}"' if state.next else '"__end__"' + + if isinstance(state.funcs, dict): + first_state = next(iter(state.funcs.values())) + func_name = first_state["func_name"] + else: + func_name = state.funcs[0] + + array_path = self._js_var_path("state", state.array) + + if state.common_params: + param_spread = ", ".join(f"{p}: state.{p}" for p in state.common_params) + input_expr = f"{{ array_element: item, 
{param_spread} }}" + else: + input_expr = "item" + + url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" + if self._dispatcher_url: + map_body = ( + f'return await dispatchWithRetry("{url}", ' + f'{{ function: "{func_name}", input: {input_expr} }});' + ) + else: + fetcher = f"this.env.{self._dispatcher_binding}.fetch" + map_body = ( + f'const r = await {fetcher}("{url}", {{\n' + f" method: \"POST\",\n" + f' headers: {{ "Content-Type": "application/json" }},\n' + f' body: JSON.stringify({{ function: "{func_name}", input: {input_expr} }}),\n' + f" }});\n" + f" return await r.json();" + ) + return f"""\ + case "{state.name}": {{ + const items_{state.name.replace("-", "_")} = {array_path}; + const results_{state.name.replace("-", "_")} = await Promise.all( + items_{state.name.replace("-", "_")}.map((item: any, i: number) => + step.do(`{state.name}_${{i}}`, async () => {{ + {map_body} + }}) + ) + ); + {array_path} = results_{state.name.replace("-", "_")}; + current = {next_state}; + break; + }}""" + + def _encode_parallel_case(self, state: Parallel) -> str: + """Encode a Parallel state as Promise.all across branches.""" + next_state = f'"{state.next}"' if state.next else '"__end__"' + + branch_thunks = [] + result_merge_parts = [] + + for i, branch in enumerate(state.branches): + sub_states = {n: State.deserialize(n, s) for n, s in branch.states.items()} + ordered = self._order_branch_states(branch.root, sub_states) + + if len(ordered) == 1 and isinstance(ordered[0], Task): + task = ordered[0] + fetch_setup, fetch_result = self._make_fetch_call(task.func_name) + setup_line = f"\n {fetch_setup}" if fetch_setup else "" + thunk = ( + f' step.do("{branch.root}", async () => {{{setup_line}\n' + f" return {fetch_result};\n" + f" }})" + ) + else: + steps_code = self._encode_branch_steps(ordered) + thunk = ( + f" (async () => {{\n" + f" let branchState = JSON.parse(JSON.stringify(state));\n" + f"{steps_code}\n" + f" return branchState;\n" + f" }})()" + 
) + + branch_thunks.append(thunk) + result_merge_parts.append( + f' "{branch.root}": parallelResults_{state.name.replace("-", "_")}[{i}]' + ) + + thunks_joined = ",\n".join(branch_thunks) + merge_joined = ",\n".join(result_merge_parts) + + return f"""\ + case "{state.name}": {{ + const parallelResults_{state.name.replace("-", "_")} = await Promise.all([ +{thunks_joined}, + ]); + state = {{ +{merge_joined}, + }}; + current = {next_state}; + break; + }}""" + + def _encode_repeat_case(self, state: Repeat) -> str: + """Encode a Repeat state as a counted for loop.""" + next_state = f'"{state.next}"' if state.next else '"__end__"' + fetch_setup, fetch_result = self._make_fetch_call(state.func_name) + setup_line = f"\n {fetch_setup}" if fetch_setup else "" + + return f"""\ + case "{state.name}": {{ + for (let i = 0; i < {state.count}; i++) {{ + state = await step.do(`{state.name}_${{i}}`, async () => {{{setup_line} + return {fetch_result}; + }}); + }} + current = {next_state}; + break; + }}""" + + def _encode_loop_case(self, state: Loop) -> str: + """Encode a Loop state as a sequential for loop over an array.""" + next_state = f'"{state.next}"' if state.next else '"__end__"' + array_path = self._js_var_path("state", state.array) + url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" + if self._dispatcher_url: + # Container dispatcher: use retry wrapper + fetch_call_template = ( + f'return await dispatchWithRetry("{url}", {{ function: "{state.func_name}", ' + f"input: {array_path}[i] }});" + ) + else: + fetcher = f"this.env.{self._dispatcher_binding}.fetch" + fetch_call_template = ( + f'const r = await {fetcher}("{url}", {{\n' + f' method: "POST",\n' + f' headers: {{ "Content-Type": "application/json" }},\n' + f' body: JSON.stringify({{ function: "{state.func_name}", ' + f"input: {array_path}[i] }}),\n" + f" }});\n" + f" return await r.json();" + ) + + return f"""\ + case "{state.name}": {{ + for (let i = 0; i < {array_path}.length; i++) {{ + 
{array_path}[i] = await step.do(`{state.name}_${{i}}`, async () => {{ + {fetch_call_template} + }}); + }} + current = {next_state}; + break; + }}""" + + def _encode_branch_steps(self, ordered_states: List[State]) -> str: + """Encode a sequence of states within a parallel branch.""" + lines = [] + for s in ordered_states: + if isinstance(s, Task): + fetch_setup, fetch_result = self._make_fetch_call(s.func_name) + setup_line = f"\n {fetch_setup}" if fetch_setup else "" + var = s.name.replace("-", "_") + lines.append( + f' const {var}_result = await step.do("{s.name}", async () => {{{setup_line}\n' + f" return {fetch_result};\n" + f" }});\n" + f" branchState = (typeof {var}_result === \"object\" && {var}_result !== null && !Array.isArray({var}_result))\n" + f" ? {{...branchState, ...{var}_result}} : {var}_result;" + ) + elif isinstance(s, Map): + if isinstance(s.funcs, dict): + first_state = next(iter(s.funcs.values())) + func_name = first_state["func_name"] + else: + func_name = s.funcs[0] + + array_path = self._js_var_path("branchState", s.array) + + if s.common_params: + param_spread = ", ".join(f"{p}: branchState.{p}" for p in s.common_params) + input_expr = f"{{ array_element: item, {param_spread} }}" + else: + input_expr = "item" + + url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" + if self._dispatcher_url: + branch_map_body = ( + f'return await dispatchWithRetry("{url}", ' + f'{{ function: "{func_name}", input: {input_expr} }});' + ) + else: + fetcher = f'this.env.{self._dispatcher_binding}.fetch' + branch_map_body = ( + f'const r = await {fetcher}("{url}", {{\n' + f' method: "POST",\n' + f' headers: {{ "Content-Type": "application/json" }},\n' + f' body: JSON.stringify({{ function: "{func_name}",' + f" input: {input_expr} }}),\n" + f" }});\n" + f" return await r.json();" + ) + lines.append( + f" {array_path} = await Promise.all(\n" + f" {array_path}.map((item: any, i: number) =>\n" + f" step.do(`{s.name}_${{i}}`, async () => {{\n" 
+ f" {branch_map_body}\n" + f" }})\n" + f" )\n" + f" );" + ) + return "\n".join(lines) + + def _order_branch_states(self, root: str, states: Dict[str, State]) -> List[State]: + """Return branch states in execution order (BFS from root).""" + ordered: List[State] = [] + visited: Set[str] = set() + queue = [root] + + while queue: + name = queue.pop(0) + if name in visited or name not in states: + continue + visited.add(name) + state = states[name] + ordered.append(state) + nxt = getattr(state, "next", None) + if nxt: + queue.append(nxt) + + return ordered + + def _make_fetch_call(self, func_name: str) -> tuple[str, str]: + """Generate a fetch call to the dispatcher (service binding or direct URL). + + Returns a 2-tuple: (setup_statement, result_expression). + setup_statement is JS code to run before the return, may be empty string. + result_expression is the JS expression whose value is the parsed JSON result. + """ + url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" + if self._dispatcher_url: + # Container dispatcher: use retry wrapper to handle cold-start 503s. + setup = "" + result = ( + f'await dispatchWithRetry("{url}", ' + f'{{ function: "{func_name}", input: state }})' + ) + else: + fetcher = f"this.env.{self._dispatcher_binding}.fetch" + setup = ( + f'const r = await {fetcher}("{url}", {{\n' + f' method: "POST",\n' + f' headers: {{ "Content-Type": "application/json" }},\n' + f' body: JSON.stringify({{ function: "{func_name}", input: state }}),\n' + f" }});" + ) + result = "await r.json()" + return setup, result + + @staticmethod + def _js_var_path(root: str, dotted_path: str) -> str: + """Convert a dotted path like 'astros.people' to JS access 'root.astros.people'.""" + parts = dotted_path.split(".") + return root + "." 
+ ".".join(parts) + + def encode_task(self, state: Task) -> Union[dict, List[dict]]: + """Not used — generation bypasses the standard encode pipeline.""" + raise NotImplementedError("Use generate() directly") + + def encode_switch(self, state: Switch) -> Union[dict, List[dict]]: + """Not used — generation bypasses the standard encode pipeline.""" + raise NotImplementedError("Use generate() directly") + + def encode_map(self, state: Map) -> Union[dict, List[dict]]: + """Not used — generation bypasses the standard encode pipeline.""" + raise NotImplementedError("Use generate() directly") + + def encode_parallel(self, state: Parallel) -> Union[dict, List[dict]]: + """Not used — generation bypasses the standard encode pipeline.""" + raise NotImplementedError("Use generate() directly") + + def encode_loop(self, state: Loop) -> Union[dict, List[dict]]: + """Not used — generation bypasses the standard encode pipeline.""" + raise NotImplementedError("Use generate() directly") diff --git a/sebs/cloudflare/templates/wrangler-workflow.toml b/sebs/cloudflare/templates/wrangler-workflow.toml new file mode 100644 index 000000000..20bd20049 --- /dev/null +++ b/sebs/cloudflare/templates/wrangler-workflow.toml @@ -0,0 +1,17 @@ +# Template for Cloudflare Workflow orchestrators +# This file is read and modified by the deployment system + +name = "PLACEHOLDER_WORKER_NAME" +main = "workflow.ts" +compatibility_date = "2025-06-01" +account_id = "PLACEHOLDER_ACCOUNT_ID" +compatibility_flags = ["nodejs_compat"] + +[[workflows]] +name = "PLACEHOLDER_WORKFLOW_NAME" +binding = "WORKFLOW" +class_name = "BenchmarkWorkflow" + +[[services]] +binding = "DISPATCHER" +service = "PLACEHOLDER_DISPATCHER_NAME" diff --git a/sebs/cloudflare/triggers.py b/sebs/cloudflare/triggers.py index 5b7a208cb..51ee60921 100644 --- a/sebs/cloudflare/triggers.py +++ b/sebs/cloudflare/triggers.py @@ -1,4 +1,4 @@ -"""HTTP trigger implementation for Cloudflare Workers.""" +"""Trigger implementations for Cloudflare 
Workers and Workflows.""" from typing import Optional import concurrent.futures @@ -234,3 +234,220 @@ def deserialize(obj: dict) -> "HTTPTrigger": """Reconstruct an HTTPTrigger from a serialized dict.""" trigger = HTTPTrigger(obj["worker_name"], obj.get("url")) return trigger + + +class WorkflowLibraryTrigger(Trigger): + """Trigger that invokes a Cloudflare Workflow via its orchestrator's HTTP endpoint. + + The orchestrator worker's fetch handler creates a workflow instance and polls + for completion internally, returning the final result as the HTTP response. + """ + + def __init__(self, workflow_name: str, orchestrator_url: str): + """Initialize the workflow trigger. + + Args: + workflow_name: Name of the Cloudflare Workflow. + orchestrator_url: HTTP URL of the orchestrator worker. + """ + super().__init__() + self.workflow_name = workflow_name + self._orchestrator_url = orchestrator_url + + @staticmethod + def typename() -> str: + """Return the canonical type name for this trigger class.""" + return "Cloudflare.WorkflowLibraryTrigger" + + @staticmethod + def trigger_type() -> Trigger.TriggerType: + """Return the trigger type enum value.""" + return Trigger.TriggerType.LIBRARY + + def _http_get(self, url: str) -> tuple: + """Perform a GET request and return (status_code, body_bytes).""" + import pycurl + + c = pycurl.Curl() + c.setopt( + pycurl.HTTPHEADER, + ["User-Agent: Mozilla/5.0 (compatible; SeBS/1.0; " + "+https://github.com/spcl/serverless-benchmarks)"], + ) + c.setopt(pycurl.URL, url) + data = BytesIO() + c.setopt(pycurl.WRITEFUNCTION, data.write) + c.setopt(pycurl.TIMEOUT, 30) + c.perform() + status_code = c.getinfo(pycurl.RESPONSE_CODE) + c.close() + return status_code, data.getvalue() + + def _http_post(self, url: str, body: str) -> tuple: + """Perform a POST request and return (status_code, body_bytes).""" + import pycurl + + c = pycurl.Curl() + c.setopt( + pycurl.HTTPHEADER, + [ + "Content-Type: application/json", + "User-Agent: Mozilla/5.0 
(compatible; SeBS/1.0; " + "+https://github.com/spcl/serverless-benchmarks)", + ], + ) + c.setopt(pycurl.POST, 1) + c.setopt(pycurl.URL, url) + data = BytesIO() + c.setopt(pycurl.WRITEFUNCTION, data.write) + c.setopt(pycurl.POSTFIELDS, body) + c.setopt(pycurl.TIMEOUT, 30) + c.perform() + status_code = c.getinfo(pycurl.RESPONSE_CODE) + c.close() + return status_code, data.getvalue() + + def _do_invoke(self, payload: dict) -> ExecutionResult: + """Create a workflow instance and poll until completion. + + 1. POST to orchestrator → receives {id} (202 Accepted). + 2. GET orchestrator?id= repeatedly until status is complete/errored. + """ + begin = datetime.now() + + # Step 1: create workflow instance + max_create_retries = 3 + instance_id = None + for attempt in range(max_create_retries + 1): + try: + status_code, raw = self._http_post(self._orchestrator_url, json.dumps(payload)) + except Exception as e: + if attempt < max_create_retries: + self.logging.warning(f"Workflow creation network error: {e} — retrying") + time.sleep(5) + continue + self.logging.error(f"Workflow creation network error after retries: {e}") + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + result.stats.failure = True + return result + try: + resp = json.loads(raw) + except json.JSONDecodeError: + text = raw.decode() + if "1042" in text and "error code" in text: + raise ContainerProvisioningError(f"Error 1042 creating workflow: {text[:200]}") + if attempt < max_create_retries: + time.sleep(5) + continue + self.logging.error( + f"Workflow creation non-JSON response: {text[:200]}" + ) + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + result.stats.failure = True + return result + + if status_code == 202 and "id" in resp: + instance_id = resp["id"] + break + if "1042" in str(resp) and "error code" in str(resp): + raise ContainerProvisioningError(f"Error 1042 creating workflow: {resp}") + if attempt < max_create_retries: + time.sleep(5) + continue + 
self.logging.error(f"Workflow creation failed (status={status_code}): {resp}") + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + result.stats.failure = True + return result + + if instance_id is None: + self.logging.error("Failed to obtain workflow instance ID") + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + result.stats.failure = True + return result + + # Step 2: poll for completion + poll_url = f"{self._orchestrator_url}?id={instance_id}" + poll_interval = 5 + max_poll_time = 7200 + elapsed = 0 + while elapsed < max_poll_time: + time.sleep(poll_interval) + elapsed += poll_interval + try: + status_code, raw = self._http_get(poll_url) + except Exception as e: + self.logging.warning( + f"Status poll network error (elapsed={elapsed}s): {e} — retrying" + ) + continue + try: + resp = json.loads(raw) + except json.JSONDecodeError: + text = raw.decode() + self.logging.warning(f"Status poll non-JSON (elapsed={elapsed}s): {text[:100]}") + continue + + wf_status = resp.get("status") + if wf_status == "complete": + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + result.output = resp.get("output") or {} + return result + if wf_status == "errored": + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + self.logging.error(f"Workflow {self.workflow_name} errored: {resp.get('error')}") + result.stats.failure = True + return result + # Still running (queued/running/paused) — keep polling + + end = datetime.now() + self.logging.error( + f"Workflow {self.workflow_name} did not complete within {max_poll_time}s" + ) + result = ExecutionResult.from_times(begin, end) + result.stats.failure = True + return result + + def sync_invoke(self, payload: dict) -> ExecutionResult: + """Invoke the workflow synchronously: create instance, poll until complete. + + Retries on error 1042 (CPU time limit on cold start) up to 3 times. 
+ """ + self.logging.debug(f"Invoke workflow {self.workflow_name} at {self._orchestrator_url}") + max_retries = 3 + retry_wait = 10 + for attempt in range(max_retries + 1): + try: + return self._do_invoke(payload) + except ContainerProvisioningError: + if attempt < max_retries: + self.logging.info( + f"Workflow cold start (error 1042), waiting {retry_wait}s " + f"before retry (attempt {attempt + 1}/{max_retries})..." + ) + time.sleep(retry_wait) + else: + raise + raise RuntimeError("Unreachable") + + def async_invoke(self, payload: dict): + """Async invocation is not implemented for workflows.""" + raise NotImplementedError("Async invocation is not implemented for workflows") + + def serialize(self) -> dict: + """Return a serializable dict for caching.""" + return { + "type": self.typename(), + "workflow_name": self.workflow_name, + "orchestrator_url": self._orchestrator_url, + } + + @staticmethod + def deserialize(obj: dict) -> "WorkflowLibraryTrigger": + """Reconstruct a WorkflowLibraryTrigger from a cached dict.""" + return WorkflowLibraryTrigger(obj["workflow_name"], obj["orchestrator_url"]) diff --git a/sebs/cloudflare/workflow.py b/sebs/cloudflare/workflow.py new file mode 100644 index 000000000..3015cba49 --- /dev/null +++ b/sebs/cloudflare/workflow.py @@ -0,0 +1,81 @@ +"""Cloudflare Workflow representation for SeBS.""" + +from typing import List + +from sebs.cloudflare.function import CloudflareWorker +from sebs.faas.function import FunctionConfig, Workflow + + +class CloudflareWorkflow(Workflow): + """Represents a deployed Cloudflare Workflow with its dispatcher and orchestrator.""" + + def __init__( + self, + name: str, + functions: List[CloudflareWorker], + benchmark: str, + code_package_hash: str, + cfg: FunctionConfig, + account_id: str, + dispatcher_name: str, + orchestrator_url: str, + ): + """Initialize a CloudflareWorkflow. + + Args: + name: Workflow name (also the orchestrator worker name). 
+ functions: List of dispatcher CloudflareWorker instances. + benchmark: Benchmark identifier. + code_package_hash: Hash of the deployed code package. + cfg: Function configuration (memory, timeout). + account_id: Cloudflare account ID. + dispatcher_name: Name of the dispatcher worker/container. + orchestrator_url: URL of the orchestrator worker. + """ + super().__init__(benchmark, name, code_package_hash, cfg) + self.functions = functions + self.account_id = account_id + self.dispatcher_name = dispatcher_name + self.orchestrator_url = orchestrator_url + + @staticmethod + def typename() -> str: + """Return the canonical type name for this workflow class.""" + return "Cloudflare.Workflow" + + def serialize(self) -> dict: + """Serialize workflow state for caching.""" + return { + **super().serialize(), + "functions": [f.serialize() for f in self.functions], + "account_id": self.account_id, + "dispatcher_name": self.dispatcher_name, + "orchestrator_url": self.orchestrator_url, + } + + @staticmethod + def deserialize(cached_config: dict) -> "CloudflareWorkflow": + """Reconstruct a CloudflareWorkflow from a cached configuration dict.""" + from sebs.cloudflare.triggers import HTTPTrigger, WorkflowLibraryTrigger + + funcs = [CloudflareWorker.deserialize(f) for f in cached_config["functions"]] + cfg = FunctionConfig.deserialize(cached_config["config"]) + + ret = CloudflareWorkflow( + cached_config["name"], + funcs, + cached_config["benchmark"], + cached_config["hash"], + cfg, + cached_config["account_id"], + cached_config["dispatcher_name"], + cached_config["orchestrator_url"], + ) + + for trigger in cached_config["triggers"]: + if trigger["type"] == WorkflowLibraryTrigger.typename(): + ret.add_trigger(WorkflowLibraryTrigger.deserialize(trigger)) + elif trigger["type"] == HTTPTrigger.typename(): + ret.add_trigger(HTTPTrigger.deserialize(trigger)) + + return ret diff --git a/sebs/regression.py b/sebs/regression.py index 57e8132e1..56f9a7e48 100644 --- a/sebs/regression.py 
+++ b/sebs/regression.py @@ -430,9 +430,7 @@ def test(self): f" reason: {error}" ) else: - logging_wrapper.info( - f"{benchmark_name} workflow execution succeeded" - ) + logging_wrapper.info(f"{benchmark_name} workflow execution succeeded") except RuntimeError: failure = True logging_wrapper.error(f"{benchmark_name} workflow invocation raised exception") @@ -1465,6 +1463,55 @@ def get_deployment(self, benchmark_name, architecture, deployment_type): return deployment_client +class CloudflareTestSequenceWorkflows( + unittest.TestCase, + metaclass=WorkflowTestSequenceMeta, + benchmarks=benchmarks_workflows, + architectures=architectures_cloudflare, + deployments=["workers", "container"], + deployment_name="cloudflare", +): + """Test suite for workflow benchmarks on Cloudflare Workflows. + + Runs container-only benchmarks with system_variant=container and the four + lightweight benchmarks (610, 620, 630, 631) with both workers and container + variants. The filter_out_benchmarks function skips workers-incompatible ones. + """ + + def get_deployment(self, benchmark_name, architecture, deployment_type): + """Return an initialized Cloudflare deployment client for workflow testing. 
+ + Args: + benchmark_name: Name of the workflow benchmark to deploy + architecture: Architecture to deploy on (x64) + deployment_type: Deployment type ("workers" for native Workers, "container") + + Returns: + An initialized Cloudflare deployment client + """ + deployment_name = "cloudflare" + assert cloud_config, "Cloud configuration is required" + + is_container = deployment_type == "container" + config_copy = copy.deepcopy(cloud_config) + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = is_container + config_copy["experiments"]["system_variant"] = deployment_type + + f = ( + f"regression_wf_{deployment_name}_{benchmark_name}" + f"_{architecture}_{deployment_type}.log" + ) + deployment_client = self.client.get_deployment( + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), + ) + + with CloudflareTestSequenceWorkflows.lock: + deployment_client.initialize(resource_prefix=RESOURCE_PREFIX, quiet=LOGGING_REDACTED) + return deployment_client + + class CloudflareTestSequencePythonWorkers( unittest.TestCase, metaclass=TestSequenceMeta, @@ -1886,6 +1933,10 @@ def regression_suite( suite.addTest( unittest.defaultTestLoader.loadTestsFromTestCase(AzureTestSequenceWorkflows) ) + if "cloudflare" in providers: + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase(CloudflareTestSequenceWorkflows) + ) # Prepare the list of tests to run tests: List[unittest.TestCase] = [] From 085ee505c5d23f9b803baf8d112f6c5bf391ab8b Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 16 Jun 2026 21:40:58 +0200 Subject: [PATCH 224/230] feat: Enhance Cloudflare Workflow generator with Map and Parallel state support - Refactor CloudflareWorkflowGenerator to support concurrent execution of Map and Parallel states. - Introduce ItemWorkflow for processing Map chunks and FanInCoordinator for aggregating results. - Update dispatch logic to handle chunked processing and error reporting. 
- Emit proxy handlers for NoSQL and R2 operations in generated code. - Modify wrangler-workflow.toml template to include new ItemWorkflow and Durable Object bindings. --- sebs/benchmark.py | 7 + sebs/cloudflare/cloudflare.py | 641 ++++++++-- sebs/cloudflare/config.py | 32 +- sebs/cloudflare/generator.py | 1087 +++++++++++++---- .../templates/wrangler-workflow.toml | 25 +- 5 files changed, 1394 insertions(+), 398 deletions(-) diff --git a/sebs/benchmark.py b/sebs/benchmark.py index ee8c7673f..2d245b7d0 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -559,6 +559,11 @@ def has_input_processed(self) -> bool: """ return self._input_processed + @property + def last_input_config(self) -> Optional[Dict[str, Any]]: + """Return the most recently prepared benchmark input, if available.""" + return self._last_input_config + @property def uses_storage(self) -> bool: """ @@ -745,6 +750,7 @@ def __init__( # Check if input has been processed self._input_processed: bool = False + self._last_input_config: Optional[Dict[str, Any]] = None self._uses_storage: bool = False self._uses_nosql: bool = False @@ -1794,6 +1800,7 @@ def prepare_input( nosql_storage.update_cache(self._benchmark) self._input_processed = True + self._last_input_config = input_config return input_config diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 04a9616af..514fe1699 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -1,9 +1,11 @@ """Cloudflare Workers platform implementation for SeBS.""" +import json +import math import os import uuid import time -from typing import cast, Dict, List, Optional, Tuple, Type +from typing import Any, cast, Dict, List, Optional, Set, Tuple, Type import docker import requests @@ -91,7 +93,8 @@ def push_to_registry( NOT a pushable URI and is not passed to any registry client. 
""" image_name = ( - f"{benchmark.replace('.', '-')}-{language_name}-" f"{language_version.replace('.', '')}" + f"{benchmark.replace('.', '-')}-{language_name}-" + f"{language_version.replace('.', '')}" ) return f"{image_name}:latest" @@ -111,8 +114,14 @@ class Cloudflare(System): # (e.g. "110" matches "110.dynamic-html"). SUPPORTED_BENCHMARKS: Dict[Tuple[str, bool], Optional[List[str]]] = { ("python", False): [ - "110", "120", "130", "210", "311", "501", "502", "503", - "610", "620", "630", "631", # lightweight workflows (Pyodide-compatible) + "110", + "120", + "130", + "210", + "311", + "501", + "502", + "503", ], ("nodejs", False): ["110", "120", "130", "311"], ("python", True): None, # all benchmarks supported @@ -162,12 +171,16 @@ def is_benchmark_supported( prefix = benchmark_name.split(".")[0] return prefix in allowed - def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) -> Function: + def get_function( + self, code_package: Benchmark, func_name: Optional[str] = None + ) -> Function: """Override to validate benchmark support and auto-select cloudflare variant.""" language = code_package.language_name container_deployment = code_package.system_variant.is_container benchmark_name = code_package.benchmark - if not self.is_benchmark_supported(benchmark_name, language, container_deployment): + if not self.is_benchmark_supported( + benchmark_name, language, container_deployment + ): deployment_type = "container" if container_deployment else "worker" raise RuntimeError( f"Benchmark '{benchmark_name}' is not supported for " @@ -182,8 +195,11 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) # (CLI --language-variant flag), which defaults to "default". Promoting # here ensures copy_code() applies the cloudflare/ source overlay and the # cache key reflects the correct variant. 
- if code_package.language_variant == "default" and code_package.benchmark_config.supports( - code_package.language, self.name() + if ( + code_package.language_variant == "default" + and code_package.benchmark_config.supports( + code_package.language, self.name() + ) ): code_package.select_variant(self.name()) @@ -209,7 +225,9 @@ def __init__( sebs_config, cache_client, docker_client, - CloudflareSystemResources(config, cache_client, docker_client, logger_handlers), + CloudflareSystemResources( + config, cache_client, docker_client, logger_handlers + ), ) self.logging_handlers = logger_handlers self._config = config @@ -227,7 +245,9 @@ def __init__( self.logging, sebs_config, docker_client, self.system_resources ) # Adapter so benchmark.build() can call container_client.build_base_image() - self._container_adapter = _CloudflareContainerAdapter(self._containers_deployment) + self._container_adapter = _CloudflareContainerAdapter( + self._containers_deployment + ) def initialize( self, @@ -246,7 +266,9 @@ def initialize( self._verify_credentials() self.initialize_resources(select_prefix=resource_prefix) - def initialize_resources(self, select_prefix: Optional[str] = None, quiet: bool = False): + def initialize_resources( + self, select_prefix: Optional[str] = None, quiet: bool = False + ): """ Initialize Cloudflare resources. 
@@ -276,7 +298,9 @@ def initialize_resources(self, select_prefix: Optional[str] = None, quiet: bool # Try to create R2 bucket, but don't fail if R2 is not enabled try: - self.system_resources.get_storage().get_bucket(Resources.StorageBucketType.BENCHMARKS) + self.system_resources.get_storage().get_bucket( + Resources.StorageBucketType.BENCHMARKS + ) self.logging.info("R2 storage initialized successfully") except Exception as e: self.logging.warning( @@ -323,13 +347,17 @@ def _verify_credentials(self): if len(self.config.credentials.api_token) > 8 else "***" ) - self.logging.info(f"Using API Token authentication (starts with: {token_preview})") + self.logging.info( + f"Using API Token authentication (starts with: {token_preview})" + ) else: self.logging.info( f"Using Email + API Key authentication (email: {self.config.credentials.email})" ) - response = requests.get(f"{self._api_base_url}/user/tokens/verify", headers=headers) + response = requests.get( + f"{self._api_base_url}/user/tokens/verify", headers=headers + ) if response.status_code != 200: raise RuntimeError( @@ -436,7 +464,9 @@ def _generate_wrangler_toml( Returns: Path to the generated wrangler.toml file """ - language_variant = code_package.language_variant if code_package else "cloudflare" + language_variant = ( + code_package.language_variant if code_package else "cloudflare" + ) handler = self._get_deployment_handler(container_deployment) return handler.generate_wrangler_toml( worker_name, @@ -560,7 +590,9 @@ def create_function( def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: """Get information about an existing worker.""" headers = self._get_auth_headers() - url = f"{self._api_base_url}/accounts/{account_id}/workers/scripts/{worker_name}" + url = ( + f"{self._api_base_url}/accounts/{account_id}/workers/scripts/{worker_name}" + ) response = requests.get(url, headers=headers) @@ -572,7 +604,9 @@ def _get_worker(self, worker_name: str, account_id: str) -> Optional[dict]: 
elif response.status_code == 404: return None else: - self.logging.warning(f"Unexpected response checking worker: {response.status_code}") + self.logging.warning( + f"Unexpected response checking worker: {response.status_code}" + ) return None def _create_or_update_worker( @@ -621,7 +655,9 @@ def _create_or_update_worker( # Dockerfile. Must happen before generating wrangler.toml so the registry # URI is written in from the start. if container_deployment and container_uri: - self.logging.info(f"Pushing container image {container_uri} to Cloudflare registry...") + self.logging.info( + f"Pushing container image {container_uri} to Cloudflare registry..." + ) container_uri = cli.containers_push(container_uri, env=env) self.logging.info(f"Image pushed to: {container_uri}") @@ -645,7 +681,9 @@ def _create_or_update_worker( cli.upload_package(package_dir, container_package_path) try: - self.logging.info(f"Deploying worker {worker_name} using Wrangler in container...") + self.logging.info( + f"Deploying worker {worker_name} using Wrangler in container..." + ) # pywrangler is used for all native Python workers (packages must be # synced via pyproject.toml before wrangler uploads the bundle). @@ -663,8 +701,8 @@ def _create_or_update_worker( worker_url = self._build_workers_dev_url(worker_name, account_id_val) if container_deployment: - container_name = self._containers_deployment._container_name_from_worker( - worker_name + container_name = ( + self._containers_deployment._container_name_from_worker(worker_name) ) # Cloudflare compares the newly pushed registry image against the # image currently running in the container worker. 
If the image digest @@ -694,7 +732,11 @@ def _create_or_update_worker( raise RuntimeError(error_msg) def _wait_for_worker_ready( - self, worker_name: str, worker_url: str, max_wait_seconds: int = 60, poll_interval: int = 5 + self, + worker_name: str, + worker_url: str, + max_wait_seconds: int = 60, + poll_interval: int = 5, ) -> None: """Poll a native worker until it responds, confirming edge propagation.""" self.logging.info( @@ -754,17 +796,11 @@ def _wait_for_container_rollout( images) can take up to 10 minutes. Do not lower max_wait_seconds aggressively. Phase 2 — Instance readiness: After the rollout finishes, Cloudflare must start - at least one container instance before it can accept requests. The top-level - `instances` field is the configured/desired count. Runtime state lives under - `health.instances`: `starting` = still booting, `healthy` = passed health check - and ready to serve, `active` = currently handling a request (always 0 until the - first invocation). We wait until `health.instances.healthy >= max_instances`. - Note: the top-level `instances` field equals `max_instances + 1` because - Cloudflare adds one extra Durable Object coordination instance that never - becomes healthy — `max_instances` is the correct readiness threshold. - This avoids the - first benchmark invocation hitting a "no Container instance available" error - from the Durable Object. + at least one container instance before it can accept requests. Runtime state + lives under `health.instances`: `starting` = still booting, `healthy` = passed + health check and ready to serve, `active` = currently handling a request. + `max_instances` is a ceiling, not a requirement for deployment readiness, so + waiting for every possible instance can stall high-fan-out workflow deploys. Args: container_name: Cloudflare container name (e.g. 
my-worker-containerworker) @@ -812,24 +848,6 @@ def _wait_for_container_rollout( ) rollout_complete = True - # Phase 2: wait for at least one healthy instance so the - # first benchmark invocation does not hit a cold Durable Object. - # The top-level `instances` field is the configured/desired count, - # not the runtime state. Actual readiness is in health.instances: - # healthy — passed health check, ready to serve - # starting — still booting (image pull + firecracker init) - # active — currently handling a request (0 until first invocation) - # The top-level `instances` field equals max_instances + 1 in practice: - # Cloudflare appears to count one extra Durable Object coordination - # instance that never appears as healthy. The `health.instances` - # sub-object tracks runtime state per instance (not formally documented - # by Cloudflare at time of writing, derived from observed API responses): - # healthy — passed health check, ready to serve requests - # starting — still booting (image pull + firecracker init) - # active — currently handling a request (0 until first invocation) - # Use max_instances as the readiness threshold since that is the - # configured number of workload instances. - max_instances = data.get("max_instances", 0) health_instances = data.get("health", {}).get("instances", {}) healthy = health_instances.get("healthy", 0) starting = health_instances.get("starting", 0) @@ -841,8 +859,8 @@ def _wait_for_container_rollout( ) return self.logging.info( - f"Container {container_name} awaiting all instances to become healthy " - f"(healthy={healthy}/{max_instances}, starting={starting}, " + f"Container {container_name} waiting for a healthy instance " + f"(healthy={healthy}, starting={starting}, " f"{elapsed}s elapsed)" ) else: @@ -858,6 +876,250 @@ def _wait_for_container_rollout( f"Container {container_name} did not become ready after {max_wait_seconds}s." 
) + @staticmethod + def _workflow_container_name(worker_name: str) -> str: + """Return the Cloudflare container name for the generated workflow dispatcher.""" + return f"{worker_name}-dispatchercontainer" + + def _workflow_max_instances( + self, code_package: Benchmark, definition_path: str + ) -> int: + """Return the exact DispatcherContainer ceiling for a workflow input.""" + benchmark_name = code_package.benchmark + prepared_input = code_package.last_input_config or {} + estimated = self._estimate_workflow_parallelism( + definition_path, benchmark_name, prepared_input + ) + if estimated is None: + fallback = max(1, self.config.max_instances) + self.logging.warning( + f"Cloudflare workflow {benchmark_name} has dynamic fan-out that " + f"cannot be known before execution; using configured " + f"max_instances={fallback}." + ) + return fallback + + self.logging.info( + f"Cloudflare workflow {benchmark_name} max_instances={estimated} " + "from prepared benchmark input." + ) + return max(1, estimated) + + def _estimate_workflow_parallelism( + self, + definition_path: str, + benchmark_name: str, + prepared_input: Dict[str, Any], + ) -> Optional[int]: + """Estimate maximum concurrent dispatcher containers for a workflow.""" + with open(definition_path) as f: + definition = json.load(f) + + states = definition.get("states", {}) + root = definition.get("root") + if not root: + return 1 + + def state_max( + state_defs: Dict[str, Any], + state_name: Optional[str], + visiting: Set[Tuple[int, str]], + ) -> Optional[int]: + if not state_name or state_name == "__end__": + return 1 + if state_name not in state_defs: + return 1 + + visit_key = (id(state_defs), state_name) + if visit_key in visiting: + return 1 + + visiting = set(visiting) + visiting.add(visit_key) + state = state_defs[state_name] + state_type = state.get("type") + + if state_type == "switch": + candidates: List[Optional[int]] = [] + for case in state.get("cases", []): + candidates.append(state_max(state_defs, 
case.get("next"), visiting)) + candidates.append(state_max(state_defs, state.get("default"), visiting)) + if any(value is None for value in candidates): + return None + return max(value or 1 for value in candidates) + + if state_type == "map": + array_length = self._workflow_array_length( + benchmark_name, state_name, state, prepared_input + ) + if array_length is None: + return None + + chunks = max(1, math.ceil(array_length / self.config.chunk_size)) + branch_max = state_max( + state.get("states", {}), state.get("root"), visiting + ) + if branch_max is None: + return None + current_max = chunks * branch_max + elif state_type == "parallel": + branch_values: List[Optional[int]] = [] + for branch in state.get("parallel_functions", []): + branch_values.append( + state_max( + branch.get("states", {}), + branch.get("root"), + visiting, + ) + ) + if any(value is None for value in branch_values): + return None + current_max = sum(value or 1 for value in branch_values) + else: + current_max = 1 + + next_max = state_max(state_defs, state.get("next"), visiting) + if next_max is None: + return None + return max(current_max, next_max) + + return state_max(states, root, set()) + + def _workflow_array_length( + self, + benchmark_name: str, + state_name: str, + state: Dict[str, Any], + prepared_input: Dict[str, Any], + ) -> Optional[int]: + """Return a Map state's array length from input or benchmark semantics.""" + array_path = state.get("array") + if not array_path: + return 0 + + if benchmark_name in {"630.parallel-sleep", "631.parallel-download"}: + if array_path == "buffer": + return self._workflow_int_value(prepared_input, "count") + + if benchmark_name in { + "6100.1000-genome", + "6101.1000-genome-individuals", + }: + if array_path == "blob": + return self._workflow_list_length(prepared_input, "blob") + if array_path == "sifting.populations": + return self._workflow_list_length(prepared_input, "populations") + + if benchmark_name == "650.vid" and array_path == "frames": 
+ return self._workflow_chunked_length( + prepared_input, "n_frames", "batch_size" + ) + + if benchmark_name == "680.excamera" and array_path == "segments": + segments = self._workflow_list_length(prepared_input, "segments") + batch_size = self._workflow_int_value(prepared_input, "batch_size") + if segments is None or batch_size is None: + return None + return math.ceil(segments / max(1, batch_size)) + + if benchmark_name == "690.ml" and array_path == "schedules": + return self._workflow_list_length(prepared_input, "classifiers") + + if benchmark_name == "660.map-reduce" and array_path == "list": + if state_name == "map-state": + return self._workflow_int_value(prepared_input, "n_mappers") + if state_name == "reduce-state": + return 5 + + value = self._workflow_value_at_path(prepared_input, array_path) + if isinstance(value, list): + return len(value) + + return None + + @staticmethod + def _workflow_value_at_path(data: Dict[str, Any], path: str) -> Any: + """Return a dotted-path value from a dictionary, or None.""" + value: Any = data + for part in path.split("."): + if not isinstance(value, dict) or part not in value: + return None + value = value[part] + return value + + @staticmethod + def _workflow_int_value(data: Dict[str, Any], key: str) -> Optional[int]: + """Return a positive integer value from prepared input.""" + value = data.get(key) + if value is None: + return None + try: + return max(0, int(value)) + except (TypeError, ValueError): + return None + + @staticmethod + def _workflow_list_length(data: Dict[str, Any], key: str) -> Optional[int]: + """Return the length of a list value from prepared input.""" + value = data.get(key) + if isinstance(value, list): + return len(value) + return None + + def _workflow_chunked_length( + self, data: Dict[str, Any], item_key: str, batch_key: str + ) -> Optional[int]: + """Return ceil(item count / batch size) for task-produced batches.""" + item_count = self._workflow_int_value(data, item_key) + batch_size = 
self._workflow_int_value(data, batch_key) + if item_count is None or batch_size is None: + return None + return math.ceil(item_count / max(1, batch_size)) + + @staticmethod + def _workflow_instance_type(code_package: Benchmark) -> str: + """Choose a Cloudflare Container instance type from benchmark memory.""" + if code_package.benchmark in { + "6100.1000-genome", + "6101.1000-genome-individuals", + }: + return "standard-4" + + memory = code_package.benchmark_config.memory + if memory <= 256: + return "lite" + if memory <= 1024: + return "basic" + if memory <= 2048: + return "standard-2" + if memory <= 4096: + return "standard-3" + if memory <= 8192: + return "standard-4" + return "standard-4" + + def _deploy_workflow_orchestrator( + self, cli, package_path: str, env: Dict[str, str], orchestrator_name: str + ) -> str: + """Deploy the workflow orchestrator, recreating stale container apps if needed.""" + try: + return cli.wrangler_deploy(package_path, env=env) + except RuntimeError as exc: + message = str(exc) + if "APPLICATION_NOT_FOUND" not in message: + raise + self.logging.warning( + f"Wrangler reported APPLICATION_NOT_FOUND while deploying " + f"{orchestrator_name}; deleting stale Worker state and retrying once." + ) + try: + cli.wrangler_delete(orchestrator_name, env=env) + except RuntimeError as delete_exc: + self.logging.warning( + f"Failed to delete stale Worker {orchestrator_name}: {delete_exc}" + ) + return cli.wrangler_deploy(package_path, env=env) + def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: """Fetch the workers.dev subdomain for the given account. 
@@ -898,7 +1160,9 @@ def _get_workers_dev_subdomain(self, account_id: str) -> Optional[str]: self.logging.warning(f"Error fetching workers.dev subdomain: {e}") return None - def _build_workers_dev_url(self, worker_name: str, account_id: Optional[str]) -> str: + def _build_workers_dev_url( + self, worker_name: str, account_id: Optional[str] + ) -> str: """Build a best-effort public URL for a worker. Prefer using the account's readable workers.dev subdomain when available @@ -932,7 +1196,8 @@ def cached_function(self, function: Function): account_id = worker.account_id or self.config.credentials.account_id if account_id and not self._get_worker(worker.name, account_id): self.logging.info( - f"Cached worker {worker.name} no longer exists on Cloudflare " "— will redeploy." + f"Cached worker {worker.name} no longer exists on Cloudflare " + "— will redeploy." ) function.code_package_hash = "" @@ -989,7 +1254,9 @@ def update_function( # Update configuration if needed (no-op for containers: no runtime memory changes) self.update_function_configuration(worker, code_package) - def update_function_configuration(self, cached_function: Function, benchmark: Benchmark): + def update_function_configuration( + self, cached_function: Function, benchmark: Benchmark + ): """ Update the configuration of a Cloudflare Worker. 
@@ -1113,7 +1380,8 @@ def download_metrics( return self.logging.info( - f"Extracting metrics from {len(requests)} invocations " f"of worker {function_name}" + f"Extracting metrics from {len(requests)} invocations " + f"of worker {function_name}" ) # Aggregate statistics from all requests @@ -1153,7 +1421,9 @@ def download_metrics( # GB-seconds calculation: (128MB / 1024MB/GB) * (cpu_time_us / 1000000 us/s) cpu_time_seconds = result.provider_times.execution / 1_000_000.0 gb_seconds = (128.0 / 1024.0) * cpu_time_seconds - result.billing.gb_seconds = int(gb_seconds * 1_000_000) # micro GB-seconds + result.billing.gb_seconds = int( + gb_seconds * 1_000_000 + ) # micro GB-seconds # Calculate statistics metrics["cloudflare"] = { @@ -1171,13 +1441,17 @@ def download_metrics( metrics["cloudflare"]["cpu_time_measurements"] = len(cpu_times) if wall_times: - metrics["cloudflare"]["avg_wall_time_us"] = sum(wall_times) // len(wall_times) + metrics["cloudflare"]["avg_wall_time_us"] = sum(wall_times) // len( + wall_times + ) metrics["cloudflare"]["min_wall_time_us"] = min(wall_times) metrics["cloudflare"]["max_wall_time_us"] = max(wall_times) metrics["cloudflare"]["wall_time_measurements"] = len(wall_times) if memory_values: - metrics["cloudflare"]["avg_memory_mb"] = sum(memory_values) / len(memory_values) + metrics["cloudflare"]["avg_memory_mb"] = sum(memory_values) / len( + memory_values + ) metrics["cloudflare"]["min_memory_mb"] = min(memory_values) metrics["cloudflare"]["max_memory_mb"] = max(memory_values) metrics["cloudflare"]["memory_measurements"] = len(memory_values) @@ -1230,34 +1504,60 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> Functi if not account_id: raise RuntimeError("Cloudflare account ID is required to create workflows") - # --- Step 1: Deploy the dispatcher (single worker/container with all functions) --- + if not container_deployment: + raise RuntimeError( + "Cloudflare workflow fan-out requires container deployment. 
" + "Select the cloudflare container system variant for workflow benchmarks." + ) + # Cloudflare workers.dev subdomains cap at 54 chars. - # Cap the base name at 43 chars so that base + "-dispatcher" (11 chars) stays ≤ 54. + # Cap the base name at 43 chars so that derived names stay ≤ 54. max_base_len = 43 if len(workflow_name) > max_base_len: workflow_name = workflow_name[:max_base_len].rstrip("-") dispatcher_name = workflow_name + "-dispatcher" - self.logging.info(f"Deploying workflow dispatcher: {dispatcher_name}") container_uri = code_package._container_uri if container_deployment else None - dispatcher = self.create_function( - code_package, - dispatcher_name, - code_package.system_variant, - container_uri, - ) + if not container_uri: + raise RuntimeError( + f"Container image URI is missing for workflow {code_package.benchmark}. " + "The container build step may not have completed successfully." + ) + + # Set up Wrangler credentials before pushing the dispatcher image and + # deploying the orchestrator Worker. 
+ env = {} + if self.config.credentials.api_token: + env["CLOUDFLARE_API_TOKEN"] = self.config.credentials.api_token + elif self.config.credentials.email and self.config.credentials.api_key: + env["CLOUDFLARE_EMAIL"] = self.config.credentials.email + env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key + env["CLOUDFLARE_ACCOUNT_ID"] = account_id + + cli = self._workers_deployment._get_cli() + self.logging.info(f"Pushing workflow dispatcher image {container_uri}...") + dispatcher_image = cli.containers_push(container_uri, env=env) + self.logging.info(f"Workflow dispatcher image pushed to: {dispatcher_image}") # --- Step 2: Generate orchestrator TypeScript from definition.json --- definition_path = os.path.join(code_package.benchmark_path, "definition.json") if not os.path.exists(definition_path): raise ValueError(f"No workflow definition found at {definition_path}") - # Container workers can't be called via service bindings from inside Workflow steps. - # Use the dispatcher's workers.dev URL directly for container-backed dispatchers. - if container_deployment: - dispatcher_url = self._build_workers_dev_url(dispatcher_name, account_id) - gen = CloudflareWorkflowGenerator(dispatcher_url=dispatcher_url) - else: - gen = CloudflareWorkflowGenerator(dispatcher_binding="DISPATCHER") + workflow_max_instances = self._workflow_max_instances( + code_package, definition_path + ) + workflow_instance_type = self._workflow_instance_type(code_package) + self.logging.info( + f"Cloudflare workflow {code_package.benchmark} instance_type=" + f"{workflow_instance_type} for configured memory " + f"{code_package.benchmark_config.memory} MB." 
+ ) + + gen = CloudflareWorkflowGenerator( + chunk_size=self.config.chunk_size, + max_instances=workflow_max_instances, + dispatch_timeout_seconds=code_package.benchmark_config.timeout + 120, + ) gen.parse(definition_path) ts_source = gen.generate() @@ -1274,46 +1574,49 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> Functi package_json = { "name": orchestrator_name, "type": "module", - "dependencies": {"@cloudflare/workers-types": "*"}, + "dependencies": { + "@cloudflare/containers": "*", + "@cloudflare/workers-types": "*", + }, } with open(os.path.join(orchestrator_dir, "package.json"), "w") as f: import json as json_mod json_mod.dump(package_json, f, indent=2) - # Generate wrangler.toml — omit service binding for container dispatchers + orchestrator_url = self._build_workers_dev_url(orchestrator_name, account_id) self._generate_workflow_wrangler_toml( orchestrator_name, orchestrator_dir, account_id, - dispatcher_name if not container_deployment else None, + dispatcher_image, + workflow_max_instances, + workflow_instance_type, + orchestrator_url, + code_package, ) - # Deploy the orchestrator via wrangler - env = {} - if self.config.credentials.api_token: - env["CLOUDFLARE_API_TOKEN"] = self.config.credentials.api_token - elif self.config.credentials.email and self.config.credentials.api_key: - env["CLOUDFLARE_EMAIL"] = self.config.credentials.email - env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key - env["CLOUDFLARE_ACCOUNT_ID"] = account_id - - cli = self._workers_deployment._get_cli() container_package_path = f"/tmp/workers/{orchestrator_name}" cli.upload_package(orchestrator_dir, container_package_path) self.logging.info(f"Deploying workflow orchestrator: {orchestrator_name}") - cli.wrangler_deploy(container_package_path, env=env) + output = self._deploy_workflow_orchestrator( + cli, container_package_path, env, orchestrator_name + ) + if "no changes" not in output.lower(): + self._wait_for_container_rollout( + 
self._workflow_container_name(orchestrator_name), + account_id, + ) # Build orchestrator URL and wait for readiness - orchestrator_url = self._build_workers_dev_url(orchestrator_name, account_id) self._wait_for_worker_ready(orchestrator_name, orchestrator_url) # --- Step 4: Create workflow object and attach trigger --- function_cfg = FunctionConfig.from_benchmark(code_package) workflow = CloudflareWorkflow( name=orchestrator_name, - functions=[dispatcher], + functions=[], benchmark=code_package.benchmark, code_package_hash=code_package.hash, cfg=function_cfg, @@ -1332,7 +1635,7 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> Functi def update_workflow(self, workflow: Function, code_package: Benchmark): """Update an existing Cloudflare Workflow deployment. - Re-deploys the dispatcher and regenerates/re-deploys the orchestrator. + Pushes the dispatcher image and regenerates/re-deploys the orchestrator. Args: workflow: Existing CloudflareWorkflow instance. @@ -1346,31 +1649,54 @@ def update_workflow(self, workflow: Function, code_package: Benchmark): workflow = cast(CloudflareWorkflow, workflow) account_id = workflow.account_id + container_deployment = code_package.system_variant.is_container + if not container_deployment: + raise RuntimeError( + "Cloudflare workflow fan-out requires container deployment. " + "Select the cloudflare container system variant for workflow benchmarks." + ) - # Update the dispatcher - self.logging.info(f"Updating workflow dispatcher: {workflow.dispatcher_name}") - update_container_uri = ( - code_package._container_uri if code_package.system_variant.is_container else None - ) - dispatcher = self.create_function( - code_package, - workflow.dispatcher_name, - code_package.system_variant, - update_container_uri, - ) - workflow.functions = [dispatcher] + container_uri = code_package._container_uri + if not container_uri: + raise RuntimeError( + f"Container image URI is missing for workflow {code_package.benchmark}. 
" + "The container build step may not have completed successfully." + ) + + env = {} + if self.config.credentials.api_token: + env["CLOUDFLARE_API_TOKEN"] = self.config.credentials.api_token + elif self.config.credentials.email and self.config.credentials.api_key: + env["CLOUDFLARE_EMAIL"] = self.config.credentials.email + env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key + env["CLOUDFLARE_ACCOUNT_ID"] = account_id + + cli = self._workers_deployment._get_cli() + self.logging.info(f"Pushing workflow dispatcher image {container_uri}...") + dispatcher_image = cli.containers_push(container_uri, env=env) + self.logging.info(f"Workflow dispatcher image pushed to: {dispatcher_image}") + workflow.functions = [] # Regenerate and redeploy orchestrator definition_path = os.path.join(code_package.benchmark_path, "definition.json") if not os.path.exists(definition_path): raise ValueError(f"No workflow definition found at {definition_path}") - container_deployment = code_package.system_variant.is_container - if container_deployment: - dispatcher_url = self._build_workers_dev_url(workflow.dispatcher_name, account_id) - gen = CloudflareWorkflowGenerator(dispatcher_url=dispatcher_url) - else: - gen = CloudflareWorkflowGenerator(dispatcher_binding="DISPATCHER") + workflow_max_instances = self._workflow_max_instances( + code_package, definition_path + ) + workflow_instance_type = self._workflow_instance_type(code_package) + self.logging.info( + f"Cloudflare workflow {code_package.benchmark} instance_type=" + f"{workflow_instance_type} for configured memory " + f"{code_package.benchmark_config.memory} MB." 
+ ) + + gen = CloudflareWorkflowGenerator( + chunk_size=self.config.chunk_size, + max_instances=workflow_max_instances, + dispatch_timeout_seconds=code_package.benchmark_config.timeout + 120, + ) gen.parse(definition_path) ts_source = gen.generate() @@ -1381,34 +1707,40 @@ def update_workflow(self, workflow: Function, code_package: Benchmark): package_json = { "name": workflow.name, "type": "module", - "dependencies": {"@cloudflare/workers-types": "*"}, + "dependencies": { + "@cloudflare/containers": "*", + "@cloudflare/workers-types": "*", + }, } with open(os.path.join(orchestrator_dir, "package.json"), "w") as f: import json as json_mod json_mod.dump(package_json, f, indent=2) + orchestrator_url = self._build_workers_dev_url(workflow.name, account_id) self._generate_workflow_wrangler_toml( workflow.name, orchestrator_dir, account_id, - workflow.dispatcher_name if not container_deployment else None, + dispatcher_image, + workflow_max_instances, + workflow_instance_type, + orchestrator_url, + code_package, ) - env = {} - if self.config.credentials.api_token: - env["CLOUDFLARE_API_TOKEN"] = self.config.credentials.api_token - elif self.config.credentials.email and self.config.credentials.api_key: - env["CLOUDFLARE_EMAIL"] = self.config.credentials.email - env["CLOUDFLARE_API_KEY"] = self.config.credentials.api_key - env["CLOUDFLARE_ACCOUNT_ID"] = account_id - - cli = self._workers_deployment._get_cli() container_package_path = f"/tmp/workers/{workflow.name}" cli.upload_package(orchestrator_dir, container_package_path) self.logging.info(f"Redeploying workflow orchestrator: {workflow.name}") - cli.wrangler_deploy(container_package_path, env=env) + output = self._deploy_workflow_orchestrator( + cli, container_package_path, env, workflow.name + ) + if "no changes" not in output.lower(): + self._wait_for_container_rollout( + self._workflow_container_name(workflow.name), + account_id, + ) self._wait_for_worker_ready(workflow.name, workflow.orchestrator_url) 
self.logging.info(f"Workflow {workflow.name} updated successfully") @@ -1418,7 +1750,11 @@ def _generate_workflow_wrangler_toml( orchestrator_name: str, package_dir: str, account_id: str, - dispatcher_name: Optional[str], + dispatcher_image: str, + max_instances: int, + instance_type: str, + worker_url: str, + code_package: Optional[Benchmark] = None, ) -> str: """Generate wrangler.toml for the workflow orchestrator from template. @@ -1426,8 +1762,11 @@ def _generate_workflow_wrangler_toml( orchestrator_name: Name of the orchestrator worker. package_dir: Directory to write the toml file. account_id: Cloudflare account ID. - dispatcher_name: Name of the dispatcher worker (for service binding). - Pass None for container dispatchers — they are called via URL, not binding. + dispatcher_image: Cloudflare registry image for DispatcherContainer. + max_instances: Maximum DispatcherContainer instances. + instance_type: Cloudflare Container instance type. + worker_url: Public orchestrator URL used by containers for R2/KV proxy calls. + code_package: Optional benchmark package for storage and nosql bindings. Returns: Path to the generated wrangler.toml. 
@@ -1444,7 +1783,9 @@ def _generate_workflow_wrangler_toml( from importlib.resources import files template_path = ( - files("sebs.cloudflare").joinpath("templates").joinpath("wrangler-workflow.toml") + files("sebs.cloudflare") + .joinpath("templates") + .joinpath("wrangler-workflow.toml") ) with template_path.open("rb") as f: config = tomllib.load(f) @@ -1452,11 +1793,43 @@ def _generate_workflow_wrangler_toml( config["name"] = orchestrator_name config["account_id"] = account_id config["workflows"][0]["name"] = orchestrator_name - if dispatcher_name is not None: - config["services"][0]["service"] = dispatcher_name - else: - # Container dispatchers are called via URL; remove service binding - config.pop("services", None) + config["workflows"][1]["name"] = f"{orchestrator_name}-item" + config["containers"][0]["image"] = dispatcher_image + config["containers"][0]["max_instances"] = max_instances + config["containers"][0]["instance_type"] = instance_type + config["vars"] = {"WORKER_URL": worker_url} + if code_package: + config["vars"]["BENCHMARK_NAME"] = code_package.benchmark + + if code_package and code_package.uses_nosql: + nosql_storage = self.system_resources.get_nosql_storage() + if nosql_storage.retrieve_cache(code_package.benchmark): + nosql_tables = nosql_storage.get_tables(code_package.benchmark) + if nosql_tables: + config["kv_namespaces"] = config.get("kv_namespaces", []) + for table_name, namespace_id in nosql_tables.items(): + config["kv_namespaces"].append( + { + "binding": table_name, + "id": namespace_id, + } + ) + config["vars"]["NOSQL_STORAGE_DATABASE"] = "kvstore" + + if code_package and code_package.uses_storage: + from sebs.faas.config import Resources + + storage = self.system_resources.get_storage() + bucket_name = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + if not bucket_name: + raise RuntimeError( + "R2 bucket binding not configured: benchmarks bucket name is empty. 
" + "Workflow benchmarks requiring file access will not work properly." + ) + config["r2_buckets"] = [{"binding": "R2", "bucket_name": bucket_name}] + self.logging.info( + f"R2 bucket '{bucket_name}' will be bound to workflow as 'R2'" + ) toml_path = os.path.join(package_dir, "wrangler.toml") try: @@ -1469,7 +1842,9 @@ def _generate_workflow_wrangler_toml( self.logging.info(f"Generated workflow wrangler.toml at {toml_path}") return toml_path - def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) -> Trigger: + def create_trigger( + self, function: Function, trigger_type: Trigger.TriggerType + ) -> Trigger: """ Create a trigger for a Cloudflare Worker. diff --git a/sebs/cloudflare/config.py b/sebs/cloudflare/config.py index d18c8a56b..819652364 100644 --- a/sebs/cloudflare/config.py +++ b/sebs/cloudflare/config.py @@ -98,7 +98,9 @@ def initialize(dct: dict) -> "CloudflareCredentials": ) @staticmethod - def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Credentials: + def deserialize( + config: dict, cache: Cache, handlers: LoggingHandlers + ) -> Credentials: """Load credentials from config dict, falling back to environment variables.""" cached_config = cache.get_config("cloudflare") ret: CloudflareCredentials @@ -133,7 +135,11 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden "or CLOUDFLARE_EMAIL, CLOUDFLARE_API_KEY, and CLOUDFLARE_ACCOUNT_ID" ) - if account_id is not None and ret.account_id is not None and account_id != ret.account_id: + if ( + account_id is not None + and ret.account_id is not None + and account_id != ret.account_id + ): ret.logging.error( f"The account id {ret.account_id} from provided credentials is different " f"from the account id {account_id} found in the cache! Please change " @@ -244,12 +250,15 @@ class CloudflareConfig(Config): Configuration for Cloudflare Workers platform. 
""" - def __init__(self, credentials: CloudflareCredentials, resources: CloudflareResources): + def __init__( + self, credentials: CloudflareCredentials, resources: CloudflareResources + ): """Initialize configuration with the given credentials and resources.""" super().__init__(name="cloudflare") self._credentials = credentials self._resources = resources - self._max_instances: int = 10 + self._max_instances: int = 20 + self._chunk_size: int = 1 @staticmethod def typename() -> str: @@ -271,6 +280,11 @@ def max_instances(self) -> int: """Maximum number of container instances for container deployments.""" return self._max_instances + @property + def chunk_size(self) -> int: + """Number of Map items assigned to one child workflow instance.""" + return self._chunk_size + @staticmethod def initialize(cfg: Config, dct: dict): """Apply region and other fields from a config dictionary to an existing instance.""" @@ -279,16 +293,20 @@ def initialize(cfg: Config, dct: dict): config._region = dct.get("region", "global") if "max_instances" in dct: config._max_instances = int(dct["max_instances"]) + if "chunk_size" in dct: + config._chunk_size = max(1, int(dct["chunk_size"])) @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config: """Build a CloudflareConfig from user config and cache, resolving credentials.""" cached_config = cache.get_config("cloudflare") credentials = cast( - CloudflareCredentials, CloudflareCredentials.deserialize(config, cache, handlers) + CloudflareCredentials, + CloudflareCredentials.deserialize(config, cache, handlers), ) resources = cast( - CloudflareResources, CloudflareResources.deserialize(config, cache, handlers) + CloudflareResources, + CloudflareResources.deserialize(config, cache, handlers), ) config_obj = CloudflareConfig(credentials, resources) config_obj.logging_handlers = handlers @@ -315,6 +333,8 @@ def serialize(self) -> dict: out = { "name": "cloudflare", "region": self._region, + "max_instances": 
self._max_instances, + "chunk_size": self._chunk_size, "credentials": self._credentials.serialize(), "resources": self._resources.serialize(), } diff --git a/sebs/cloudflare/generator.py b/sebs/cloudflare/generator.py index 8bbfe9ba4..8a7ef748d 100644 --- a/sebs/cloudflare/generator.py +++ b/sebs/cloudflare/generator.py @@ -1,11 +1,13 @@ """Cloudflare Workflows code generator. Translates SeBS FSM definitions (definition.json) into TypeScript source code -for a Cloudflare WorkflowEntrypoint class. The generated code uses a while/switch -state machine pattern where each FSM state maps to a switch case with step.do() calls. +for Cloudflare Workflows. Map and Parallel states fan out to child workflow +instances so Cloudflare can execute work concurrently across instances. """ -from typing import Dict, List, Set, Union +import json +import re +from typing import Dict, List, Union from sebs.faas.fsm import Generator, State, Task, Switch, Map, Parallel, Repeat, Loop @@ -15,55 +17,192 @@ class CloudflareWorkflowGenerator(Generator): def __init__( self, - dispatcher_binding: str = "DISPATCHER", - dispatcher_url: str = "", + chunk_size: int = 1, + max_instances: int = 1, + dispatch_timeout_seconds: int = 300, ): """Initialize the Cloudflare Workflow generator. Args: - dispatcher_binding: Service binding name (used when dispatcher_url is empty). - dispatcher_url: Direct HTTP URL for the dispatcher (container workers can't be - called via service bindings from inside Workflow steps — use URL instead). + chunk_size: Number of Map items assigned to one child ItemWorkflow. + max_instances: Container ceiling configured in wrangler.toml. + dispatch_timeout_seconds: Per-container dispatch timeout. 
""" super().__init__() - self._dispatcher_binding = dispatcher_binding - self._dispatcher_url = dispatcher_url + self._chunk_size = max(1, int(chunk_size)) + self._max_instances = max(1, int(max_instances)) + self._dispatch_timeout_ms = max(300_000, int(dispatch_timeout_seconds) * 1000) def generate(self) -> str: """Generate the complete TypeScript workflow source file.""" cases = [] - for state in self.states.values(): + for state in self._all_generated_states().values(): case_code = self._encode_state_case(state) cases.append(case_code) - cases.append(' case "__end__":\n return state;') + cases.append( + """\ + case "__end__": { + if (_fanin) { + const { parentId, stateName, branchIdx, total, branchRoot } = _fanin; + await reportFanIn(this.env, { + parentId, + stateName, + idx: branchIdx, + total, + mode: "object", + key: branchRoot, + result: state, + }); + } + return state; + }""" + ) switch_body = "\n".join(cases) - if self._dispatcher_url: - env_iface = " WORKFLOW: any;" - else: - env_iface = f" WORKFLOW: any;\n {self._dispatcher_binding}: Fetcher;" - return f"""\ +/* + * Required wrangler bindings: + * - WORKFLOW: Workflow binding for BenchmarkWorkflow + * - ITEM_WORKFLOW: Workflow binding for ItemWorkflow + * - FANIN: Durable Object namespace for FanInCoordinator + * - DISPATCHER: Durable Object namespace for DispatcherContainer + * - [[containers]] class_name = "DispatcherContainer", max_instances = {self._max_instances} + */ +import {{ Container }} from "@cloudflare/containers"; import {{ WorkflowEntrypoint, WorkflowEvent, WorkflowStep }} from "cloudflare:workers"; interface Env {{ -{env_iface} + WORKFLOW: Workflow; + ITEM_WORKFLOW: Workflow; + FANIN: DurableObjectNamespace; + DISPATCHER: DurableObjectNamespace; + WORKER_URL: string; + R2?: R2Bucket; + [key: string]: any; +}} + +function getDurableObjectByName( + namespace: DurableObjectNamespace, + name: string, +): DurableObjectStub {{ + return namespace.get(namespace.idFromName(name)); +}} + +function 
sleep(ms: number): Promise {{ + return new Promise((resolve) => setTimeout(resolve, ms)); +}} + +function textSizeBytes(value: unknown): number {{ + return new TextEncoder().encode(JSON.stringify(value)).length; +}} + +function errorMessage(error: unknown): string {{ + return error instanceof Error ? `${{error.name}}: ${{error.message}}` : String(error); +}} + +async function reportFanIn( + env: Env, + report: {{ + parentId: string; + stateName: string; + idx: number; + total: number; + mode: "array" | "object"; + key: string | null; + result: any; + error?: string; + }}, +): Promise {{ + const fanin = getDurableObjectByName(env.FANIN, `${{report.parentId}}-${{report.stateName}}`); + await fanin.fetch("http://fanin/report", {{ + method: "POST", + headers: {{ "Content-Type": "application/json" }}, + body: JSON.stringify(report), + }}); }} -// Retry fetch on 502/503 or non-JSON responses (container cold-start / Durable Object reset). +async function fetchWithTimeout( + stub: DurableObjectStub, + url: string, + init: RequestInit, + timeoutMs: number, +): Promise {{ + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + try {{ + return await stub.fetch(url, {{ ...init, signal: controller.signal }}); + }} finally {{ + clearTimeout(timeoutId); + }} +}} + +function isRetryableFetchError(error: unknown): boolean {{ + const message = error instanceof Error ? error.message : String(error); + return ( + message.includes("fetch failed") || + message.includes("Network connection lost") || + message.includes("internal error connecting to the port") + ); +}} + +function isFetchTimeoutError(error: unknown): boolean {{ + const message = error instanceof Error ? error.message : String(error); + return ( + message.includes("AbortError") || + message.includes("The operation was aborted") || + message.includes("timed out") + ); +}} + +// Retry fetch on 502/503, timeout, or non-JSON responses. 
// Any other non-2xx response is treated as a hard error and thrown immediately. -async function dispatchWithRetry(url: string, body: any, maxAttempts = 10): Promise {{ +async function dispatchWithRetry( + namespace: DurableObjectNamespace, + containerId: string, + workerUrl: string, + body: any, + maxAttempts = 10, + timeoutMs = {self._dispatch_timeout_ms}, +): Promise {{ + const stub = getDurableObjectByName(namespace, containerId); + console.log( + `[workflow-dispatch] containerId=${{containerId}} function=${{body?.function ?? "unknown"}}` + ); for (let attempt = 1; attempt <= maxAttempts; attempt++) {{ - const r = await fetch(url, {{ - method: "POST", - headers: {{ "Content-Type": "application/json" }}, - body: JSON.stringify(body), - }}); + let r: Response; + try {{ + r = await fetchWithTimeout( + stub, + "http://dispatcher/", + {{ + method: "POST", + headers: {{ + "Content-Type": "application/json", + "X-Worker-URL": workerUrl, + "X-Dispatcher-Container-ID": containerId, + }}, + body: JSON.stringify(body), + }}, + timeoutMs, + ); + }} catch (error) {{ + if (isFetchTimeoutError(error)) {{ + throw new Error( + `Dispatcher call timed out after ${{timeoutMs}}ms for ${{body?.function ?? "unknown"}} ` + + `on containerId=${{containerId}}. Not retrying because the container may still be running.` + ); + }} + if (attempt < maxAttempts && isRetryableFetchError(error)) {{ + await sleep(Math.min(5000 * attempt, 30000)); + continue; + }} + throw error; + }} + if (r.status === 503 || r.status === 502) {{ - const wait = Math.min(5000 * attempt, 30000); - await new Promise((res) => setTimeout(res, wait)); + await sleep(Math.min(5000 * attempt, 30000)); continue; }} const text = await r.text(); @@ -73,29 +212,95 @@ def generate(self) -> str: try {{ return JSON.parse(text); }} catch (_) {{ - // Non-JSON response from container (e.g. proxy error during startup); retry. 
if (attempt < maxAttempts) {{ - const wait = Math.min(5000 * attempt, 30000); - await new Promise((res) => setTimeout(res, wait)); + await sleep(Math.min(5000 * attempt, 30000)); continue; }} - throw new Error(`Dispatcher returned non-JSON after ${{maxAttempts}} attempts: ${{text.slice(0, 200)}}`); + throw new Error( + `Dispatcher returned non-JSON after ${{maxAttempts}} attempts: ${{text.slice(0, 200)}}` + ); }} }} throw new Error(`Dispatcher unavailable after ${{maxAttempts}} attempts`); }} +function isDuplicateWorkflowError(error: unknown): boolean {{ + const message = error instanceof Error ? error.message : String(error); + return ( + message.includes("already exists") || + message.includes("duplicate") || + message.includes("conflict") || + message.includes("409") + ); +}} + +function isRateLimitError(error: unknown): boolean {{ + const message = error instanceof Error ? error.message : String(error); + return message.includes("429") || message.includes("rate limit"); +}} + +async function createWorkflowWithRetry( + workflow: Workflow, + id: string, + params: any, + maxAttempts = 10, +): Promise {{ + for (let attempt = 1; attempt <= maxAttempts; attempt++) {{ + try {{ + await workflow.create({{ id, params }}); + return; + }} catch (error) {{ + if (isDuplicateWorkflowError(error)) {{ + return; + }} + if (attempt < maxAttempts && isRateLimitError(error)) {{ + await sleep(Math.min(5000 * attempt, 30000)); + continue; + }} + throw error; + }} + }} +}} + export class BenchmarkWorkflow extends WorkflowEntrypoint {{ async run(event: WorkflowEvent, step: WorkflowStep) {{ - let state = structuredClone(event.payload); - let current = "{self.root.name}"; + let state: any = structuredClone(event.payload ?? {{}}); + const {{ _start, _fanin }} = state as any; + delete (state as any)._start; + delete (state as any)._fanin; + let current = _start ?? {json.dumps(self.root.name)}; + const dispatchContainerId = _fanin + ? 
`${{_fanin.parentId}}-${{_fanin.stateName}}-branch-${{_fanin.branchIdx}}` + : event.instanceId; - while (true) {{ - switch (current) {{ + try {{ + while (true) {{ + switch (current) {{ {switch_body} - default: - throw new Error(`Unknown state: ${{current}}`); + default: + throw new Error(`Unknown state: ${{current}}`); + }} }} + }} catch (error) {{ + if (_fanin) {{ + const {{ parentId, stateName, branchIdx, total, branchRoot }} = _fanin; + const message = errorMessage(error); + console.log( + `[workflow-branch-error] parentId=${{parentId}} state=${{stateName}} ` + + `branchIdx=${{branchIdx}} root=${{branchRoot}} error=${{message}}` + ); + await reportFanIn(this.env, {{ + parentId, + stateName, + idx: branchIdx, + total, + mode: "object", + key: branchRoot, + result: null, + error: message, + }}); + }} + throw error; }} }} }} @@ -103,8 +308,13 @@ def generate(self) -> str: export default {{ async fetch(request: Request, env: Env): Promise {{ const url = new URL(request.url); + if (url.pathname.startsWith("/nosql/")) {{ + return await handleNoSQLRequest(request, env); + }} + if (url.pathname.startsWith("/r2/")) {{ + return await handleR2Request(request, env); + }} if (request.method === "GET" && url.searchParams.has("id")) {{ - // Status poll: return current status without blocking. const id = url.searchParams.get("id")!; const instance = await env.WORKFLOW.get(id); const status = await instance.status(); @@ -114,14 +324,40 @@ def generate(self) -> str: error: (status as any).error ?? null, }}); }} - // Create a new workflow instance and return its ID immediately. 
const payload = await request.json(); const instance = await env.WORKFLOW.create({{ params: payload }}); return Response.json({{ id: instance.id }}, {{ status: 202 }}); }}, }}; + +{self._emit_proxy_handlers()} + +{self._emit_item_workflow()} + +{self._emit_fanin_coordinator()} + +{self._emit_dispatcher_container()} """ + def _all_generated_states(self) -> Dict[str, State]: + """Return top-level and Parallel branch states in generation order.""" + states: Dict[str, State] = {} + + def add_state(state: State) -> None: + if state.name not in states: + states[state.name] = state + if isinstance(state, Parallel): + for branch in state.branches: + branch_states = { + n: State.deserialize(n, s) for n, s in branch.states.items() + } + for branch_state in branch_states.values(): + add_state(branch_state) + + for state in self.states.values(): + add_state(state) + return states + def _encode_state_case(self, state: State) -> str: """Encode a single FSM state as a switch case block.""" if isinstance(state, Task): @@ -142,9 +378,7 @@ def _encode_state_case(self, state: State) -> str: def _encode_task_case(self, state: Task) -> str: """Encode a Task state as a step.do() call.""" next_state = f'"{state.next}"' if state.next else '"__end__"' - fetch_setup, fetch_result = self._make_fetch_call(state.func_name) - setup_line = f"\n {fetch_setup}" if fetch_setup else "" - var = state.name.replace("-", "_") + var = self._js_identifier(state.name) is_terminal = next_state == '"__end__"' if is_terminal: @@ -152,7 +386,7 @@ def _encode_task_case(self, state: Task) -> str: else: merge_expr = ( f'(typeof {var}_result === "object" && {var}_result !== null' - f' && !Array.isArray({var}_result))' + f" && !Array.isArray({var}_result))" f"\n ? 
{{...state, ...{var}_result}} : {var}_result" ) @@ -160,8 +394,16 @@ def _encode_task_case(self, state: Task) -> str: return f"""\ case "{state.name}": {{ try {{ - const {var}_result = await step.do("{state.name}", async () => {{{setup_line} - return {fetch_result}; + const {var}_result = await step.do("{state.name}", async () => {{ + return await dispatchWithRetry( + this.env.DISPATCHER, + dispatchContainerId, + this.env.WORKER_URL, + {{ + function: {json.dumps(state.func_name)}, + input: state, + }}, + ); }}); state = {merge_expr}; current = {next_state}; @@ -174,8 +416,16 @@ def _encode_task_case(self, state: Task) -> str: else: return f"""\ case "{state.name}": {{ - const {var}_result = await step.do("{state.name}", async () => {{{setup_line} - return {fetch_result}; + const {var}_result = await step.do("{state.name}", async () => {{ + return await dispatchWithRetry( + this.env.DISPATCHER, + dispatchContainerId, + this.env.WORKER_URL, + {{ + function: {json.dumps(state.func_name)}, + input: state, + }}, + ); }}); state = {merge_expr}; current = {next_state}; @@ -188,17 +438,20 @@ def _encode_switch_case(self, state: Switch) -> str: for case in state.cases: var_path = self._js_var_path("state", case.var) op = case.op - val = case.val if isinstance(case.val, (int, float)) else f'"{case.val}"' - conditions.append(f' if ({var_path} {op} {val}) {{ current = "{case.next}"; }}') + val = ( + case.val if isinstance(case.val, (int, float)) else json.dumps(case.val) + ) + conditions.append( + f' if ({var_path} {op} {val}) {{ current = "{case.next}"; }}' + ) default = state.default if state.default else "__end__" else_clause = f' else {{ current = "{default}"; }}' - body = "\n".join(conditions) if len(conditions) > 1: lines = [conditions[0]] - for c in conditions[1:]: - lines.append(" else " + c.strip()) + for condition in conditions[1:]: + lines.append(" else " + condition.strip()) lines.append(else_clause) body = "\n".join(lines) else: @@ -211,115 +464,134 @@ def 
_encode_switch_case(self, state: Switch) -> str: }}""" def _encode_map_case(self, state: Map) -> str: - """Encode a Map state as Promise.all with step.do() per item.""" + """Encode a Map state as ItemWorkflow fan-out with Durable Object fan-in.""" next_state = f'"{state.next}"' if state.next else '"__end__"' - - if isinstance(state.funcs, dict): - first_state = next(iter(state.funcs.values())) - func_name = first_state["func_name"] - else: - func_name = state.funcs[0] - + var = self._js_identifier(state.name) array_path = self._js_var_path("state", state.array) + input_expr = self._map_item_input_expr(state, "state") + func_name = self._map_func_name(state) - if state.common_params: - param_spread = ", ".join(f"{p}: state.{p}" for p in state.common_params) - input_expr = f"{{ array_element: item, {param_spread} }}" - else: - input_expr = "item" - - url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" - if self._dispatcher_url: - map_body = ( - f'return await dispatchWithRetry("{url}", ' - f'{{ function: "{func_name}", input: {input_expr} }});' - ) - else: - fetcher = f"this.env.{self._dispatcher_binding}.fetch" - map_body = ( - f'const r = await {fetcher}("{url}", {{\n' - f" method: \"POST\",\n" - f' headers: {{ "Content-Type": "application/json" }},\n' - f' body: JSON.stringify({{ function: "{func_name}", input: {input_expr} }}),\n' - f" }});\n" - f" return await r.json();" - ) return f"""\ case "{state.name}": {{ - const items_{state.name.replace("-", "_")} = {array_path}; - const results_{state.name.replace("-", "_")} = await Promise.all( - items_{state.name.replace("-", "_")}.map((item: any, i: number) => - step.do(`{state.name}_${{i}}`, async () => {{ - {map_body} + const parentId_{var} = event.instanceId; + const mapInputs_{var} = {array_path}.map((item: any) => {input_expr}); + const totalChunks_{var} = await step.do("{state.name}_spawn", async () => {{ + const total = Math.ceil(mapInputs_{var}.length / {self._chunk_size}); + 
console.log( + `[workflow-map-spawn] parentId=${{parentId_{var}}} state={state.name} ` + + `items=${{mapInputs_{var}.length}} chunks=${{total}} chunkSize={self._chunk_size}` + ); + await Promise.all( + Array.from({{ length: total }}, async (_unused: unknown, chunkIdx: number) => {{ + const start = chunkIdx * {self._chunk_size}; + const childId = `${{parentId_{var}}}-{state.name}-${{chunkIdx}}`; + console.log( + `[workflow-map-child] parentId=${{parentId_{var}}} state={state.name} ` + + `chunkIdx=${{chunkIdx}} childId=${{childId}}` + ); + await createWorkflowWithRetry(this.env.ITEM_WORKFLOW, childId, {{ + items: mapInputs_{var}.slice(start, start + {self._chunk_size}), + parentId: parentId_{var}, + stateName: "{state.name}", + chunkIdx, + total, + func: {json.dumps(func_name)}, + }}); }}) - ) - ); - {array_path} = results_{state.name.replace("-", "_")}; + ); + return total; + }}); + if (totalChunks_{var} === 0) {{ + {array_path} = []; + }} else {{ + const done_{var} = await step.waitForEvent("{state.name}_done", {{ + type: `{state.name}-complete-${{parentId_{var}}}`, + timeout: "2 hours", + }}); + const payload_{var} = (done_{var} as any).payload; + if (payload_{var}.error) {{ + throw new Error(`Map state {state.name} failed: ${{payload_{var}.error}}`); + }} + {array_path} = payload_{var}.results; + }} current = {next_state}; break; }}""" def _encode_parallel_case(self, state: Parallel) -> str: - """Encode a Parallel state as Promise.all across branches.""" + """Encode a Parallel state as BenchmarkWorkflow child-instance fan-out.""" next_state = f'"{state.next}"' if state.next else '"__end__"' - - branch_thunks = [] - result_merge_parts = [] - - for i, branch in enumerate(state.branches): - sub_states = {n: State.deserialize(n, s) for n, s in branch.states.items()} - ordered = self._order_branch_states(branch.root, sub_states) - - if len(ordered) == 1 and isinstance(ordered[0], Task): - task = ordered[0] - fetch_setup, fetch_result = 
self._make_fetch_call(task.func_name) - setup_line = f"\n {fetch_setup}" if fetch_setup else "" - thunk = ( - f' step.do("{branch.root}", async () => {{{setup_line}\n' - f" return {fetch_result};\n" - f" }})" - ) - else: - steps_code = self._encode_branch_steps(ordered) - thunk = ( - f" (async () => {{\n" - f" let branchState = JSON.parse(JSON.stringify(state));\n" - f"{steps_code}\n" - f" return branchState;\n" - f" }})()" - ) - - branch_thunks.append(thunk) - result_merge_parts.append( - f' "{branch.root}": parallelResults_{state.name.replace("-", "_")}[{i}]' + var = self._js_identifier(state.name) + total = len(state.branches) + spawn_lines = [] + for idx, branch in enumerate(state.branches): + spawn_lines.append( + f"""\ + (async () => {{ + const childId = `${{parentId_{var}}}-{state.name}-branch-{idx}`; + console.log( + `[workflow-parallel-child] parentId=${{parentId_{var}}} ` + + `state={state.name} branchIdx={idx} root={branch.root} childId=${{childId}}` + ); + await createWorkflowWithRetry( + this.env.WORKFLOW, + childId, + {{ + ...state, + _start: {json.dumps(branch.root)}, + _fanin: {{ + parentId: parentId_{var}, + stateName: "{state.name}", + branchIdx: {idx}, + total: {total}, + branchRoot: {json.dumps(branch.root)}, + }}, + }}, + ); + }})()""" ) - - thunks_joined = ",\n".join(branch_thunks) - merge_joined = ",\n".join(result_merge_parts) + spawn_body = ",\n".join(spawn_lines) return f"""\ case "{state.name}": {{ - const parallelResults_{state.name.replace("-", "_")} = await Promise.all([ -{thunks_joined}, - ]); - state = {{ -{merge_joined}, - }}; + const parentId_{var} = event.instanceId; + await step.do("{state.name}_spawn", async () => {{ + console.log( + `[workflow-parallel-spawn] parentId=${{parentId_{var}}} state={state.name} branches={total}` + ); + await Promise.all([ +{spawn_body}, + ]); + }}); + const done_{var} = await step.waitForEvent("{state.name}_done", {{ + type: `{state.name}-complete-${{parentId_{var}}}`, + timeout: "2 hours", + }}); + 
const payload_{var} = (done_{var} as any).payload; + if (payload_{var}.error) {{ + throw new Error(`Parallel state {state.name} failed: ${{payload_{var}.error}}`); + }} + state = {{ ...state, ...payload_{var}.results }}; current = {next_state}; break; }}""" def _encode_repeat_case(self, state: Repeat) -> str: - """Encode a Repeat state as a counted for loop.""" + """Encode a Repeat state as a counted sequential loop.""" next_state = f'"{state.next}"' if state.next else '"__end__"' - fetch_setup, fetch_result = self._make_fetch_call(state.func_name) - setup_line = f"\n {fetch_setup}" if fetch_setup else "" - return f"""\ case "{state.name}": {{ for (let i = 0; i < {state.count}; i++) {{ - state = await step.do(`{state.name}_${{i}}`, async () => {{{setup_line} - return {fetch_result}; + state = await step.do(`{state.name}_${{i}}`, async () => {{ + return await dispatchWithRetry( + this.env.DISPATCHER, + dispatchContainerId, + this.env.WORKER_URL, + {{ + function: {json.dumps(state.func_name)}, + input: state, + }}, + ); }}); }} current = {next_state}; @@ -330,139 +602,434 @@ def _encode_loop_case(self, state: Loop) -> str: """Encode a Loop state as a sequential for loop over an array.""" next_state = f'"{state.next}"' if state.next else '"__end__"' array_path = self._js_var_path("state", state.array) - url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" - if self._dispatcher_url: - # Container dispatcher: use retry wrapper - fetch_call_template = ( - f'return await dispatchWithRetry("{url}", {{ function: "{state.func_name}", ' - f"input: {array_path}[i] }});" - ) - else: - fetcher = f"this.env.{self._dispatcher_binding}.fetch" - fetch_call_template = ( - f'const r = await {fetcher}("{url}", {{\n' - f' method: "POST",\n' - f' headers: {{ "Content-Type": "application/json" }},\n' - f' body: JSON.stringify({{ function: "{state.func_name}", ' - f"input: {array_path}[i] }}),\n" - f" }});\n" - f" return await r.json();" - ) return f"""\ case 
"{state.name}": {{ for (let i = 0; i < {array_path}.length; i++) {{ {array_path}[i] = await step.do(`{state.name}_${{i}}`, async () => {{ - {fetch_call_template} + return await dispatchWithRetry( + this.env.DISPATCHER, + dispatchContainerId, + this.env.WORKER_URL, + {{ + function: {json.dumps(state.func_name)}, + input: {array_path}[i], + }}, + ); }}); }} current = {next_state}; break; }}""" - def _encode_branch_steps(self, ordered_states: List[State]) -> str: - """Encode a sequence of states within a parallel branch.""" - lines = [] - for s in ordered_states: - if isinstance(s, Task): - fetch_setup, fetch_result = self._make_fetch_call(s.func_name) - setup_line = f"\n {fetch_setup}" if fetch_setup else "" - var = s.name.replace("-", "_") - lines.append( - f' const {var}_result = await step.do("{s.name}", async () => {{{setup_line}\n' - f" return {fetch_result};\n" - f" }});\n" - f" branchState = (typeof {var}_result === \"object\" && {var}_result !== null && !Array.isArray({var}_result))\n" - f" ? 
{{...branchState, ...{var}_result}} : {var}_result;" - ) - elif isinstance(s, Map): - if isinstance(s.funcs, dict): - first_state = next(iter(s.funcs.values())) - func_name = first_state["func_name"] - else: - func_name = s.funcs[0] - - array_path = self._js_var_path("branchState", s.array) - - if s.common_params: - param_spread = ", ".join(f"{p}: branchState.{p}" for p in s.common_params) - input_expr = f"{{ array_element: item, {param_spread} }}" - else: - input_expr = "item" - - url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" - if self._dispatcher_url: - branch_map_body = ( - f'return await dispatchWithRetry("{url}", ' - f'{{ function: "{func_name}", input: {input_expr} }});' - ) - else: - fetcher = f'this.env.{self._dispatcher_binding}.fetch' - branch_map_body = ( - f'const r = await {fetcher}("{url}", {{\n' - f' method: "POST",\n' - f' headers: {{ "Content-Type": "application/json" }},\n' - f' body: JSON.stringify({{ function: "{func_name}",' - f" input: {input_expr} }}),\n" - f" }});\n" - f" return await r.json();" - ) - lines.append( - f" {array_path} = await Promise.all(\n" - f" {array_path}.map((item: any, i: number) =>\n" - f" step.do(`{s.name}_${{i}}`, async () => {{\n" - f" {branch_map_body}\n" - f" }})\n" - f" )\n" - f" );" - ) - return "\n".join(lines) - - def _order_branch_states(self, root: str, states: Dict[str, State]) -> List[State]: - """Return branch states in execution order (BFS from root).""" - ordered: List[State] = [] - visited: Set[str] = set() - queue = [root] - - while queue: - name = queue.pop(0) - if name in visited or name not in states: - continue - visited.add(name) - state = states[name] - ordered.append(state) - nxt = getattr(state, "next", None) - if nxt: - queue.append(nxt) - - return ordered - - def _make_fetch_call(self, func_name: str) -> tuple[str, str]: - """Generate a fetch call to the dispatcher (service binding or direct URL). - - Returns a 2-tuple: (setup_statement, result_expression). 
- setup_statement is JS code to run before the return, may be empty string. - result_expression is the JS expression whose value is the parsed JSON result. - """ - url = self._dispatcher_url if self._dispatcher_url else "http://dispatcher/" - if self._dispatcher_url: - # Container dispatcher: use retry wrapper to handle cold-start 503s. - setup = "" - result = ( - f'await dispatchWithRetry("{url}", ' - f'{{ function: "{func_name}", input: state }})' + def _emit_item_workflow(self) -> str: + """Emit the child workflow that runs one Map chunk.""" + return """\ +export class ItemWorkflow extends WorkflowEntrypoint { + async run(event: WorkflowEvent, step: WorkflowStep) { + const { items, parentId, stateName, chunkIdx, total, func } = event.payload; + console.log( + `[workflow-item] parentId=${parentId} state=${stateName} ` + + `chunkIdx=${chunkIdx} total=${total} func=${func} items=${items.length}` + ); + try { + const results = await step.do(`${stateName}_${chunkIdx}`, async () => { + const containerId = `${parentId}-${stateName}-${chunkIdx}`; + if (items.length === 1) { + const result = await dispatchWithRetry( + this.env.DISPATCHER, + containerId, + this.env.WORKER_URL, + { + function: func, + input: items[0], + }, + ); + return [result]; + } + return await Promise.all( + items.map((item: any) => + dispatchWithRetry( + this.env.DISPATCHER, + containerId, + this.env.WORKER_URL, + { + function: func, + input: item, + }, ) - else: - fetcher = f"this.env.{self._dispatcher_binding}.fetch" - setup = ( - f'const r = await {fetcher}("{url}", {{\n' - f' method: "POST",\n' - f' headers: {{ "Content-Type": "application/json" }},\n' - f' body: JSON.stringify({{ function: "{func_name}", input: state }}),\n' - f" }});" + ) + ); + }); + + await reportFanIn(this.env, { + parentId, + stateName, + idx: chunkIdx, + total, + mode: "array", + key: null, + result: results, + }); + return results; + } catch (error) { + const message = errorMessage(error); + console.log( + 
`[workflow-item-error] parentId=${parentId} state=${stateName} ` + + `chunkIdx=${chunkIdx} func=${func} error=${message}` + ); + await reportFanIn(this.env, { + parentId, + stateName, + idx: chunkIdx, + total, + mode: "array", + key: null, + result: null, + error: message, + }); + throw error; + } + } +}""" + + def _emit_proxy_handlers(self) -> str: + """Emit R2 and KV proxy handlers used by containerized benchmark code.""" + return """\ +async function handleNoSQLRequest(request: Request, env: Env): Promise { + try { + const url = new URL(request.url); + const operation = url.pathname.split("/").pop(); + const params = await request.json() as any; + const { table_name, primary_key, secondary_key, data } = params; + const table = env[table_name]; + if (!table || typeof table.get !== "function" || typeof table.put !== "function") { + return Response.json( + { error: `KV namespace binding '${table_name}' not found` }, + { status: 500 }, + ); + } + + const indexKey = `__sebs_idx__${primary_key[1]}`; + const readIndex = async (): Promise => { + const raw = await table.get(indexKey); + if (!raw) { + return []; + } + try { + const parsed = JSON.parse(raw); + return Array.isArray(parsed) ? 
parsed : []; + } catch { + return []; + } + }; + const writeIndex = async (values: string[]) => { + await table.put(indexKey, JSON.stringify(values)); + }; + + const compositeKey = `${primary_key[1]}#${secondary_key?.[1]}`; + let result: any; + switch (operation) { + case "insert": { + const keyData = { + ...data, + [primary_key[0]]: primary_key[1], + [secondary_key[0]]: secondary_key[1], + }; + await table.put(compositeKey, JSON.stringify(keyData)); + const index = await readIndex(); + if (!index.includes(secondary_key[1])) { + index.push(secondary_key[1]); + await writeIndex(index); + } + result = { success: true }; + break; + } + case "update": { + const existingRaw = await table.get(compositeKey); + let existing = {}; + if (existingRaw) { + try { + existing = JSON.parse(existingRaw); + } catch { + existing = {}; + } + } + const merged = { + ...existing, + ...data, + [primary_key[0]]: primary_key[1], + [secondary_key[0]]: secondary_key[1], + }; + await table.put(compositeKey, JSON.stringify(merged)); + const index = await readIndex(); + if (!index.includes(secondary_key[1])) { + index.push(secondary_key[1]); + await writeIndex(index); + } + result = { success: true }; + break; + } + case "get": { + const raw = await table.get(compositeKey); + if (raw === null) { + result = { data: null }; + } else { + try { + result = { data: JSON.parse(raw) }; + } catch { + result = { data: raw }; + } + } + break; + } + case "query": { + const prefix = `${primary_key[1]}#`; + let secondaryKeys = await readIndex(); + if (secondaryKeys.length === 0) { + const list = await table.list({ prefix }); + secondaryKeys = (list.keys || []).map((k: any) => + k.name.split("#").slice(1).join("#") + ); + } + const items = []; + for (const secondaryValue of secondaryKeys) { + const raw = await table.get(`${primary_key[1]}#${secondaryValue}`); + if (raw === null) { + continue; + } + try { + items.push(JSON.parse(raw)); + } catch { + items.push(raw); + } + } + result = { items }; + break; + } + 
case "delete": { + await table.delete(compositeKey); + const index = await readIndex(); + const next = index.filter((value) => value !== secondary_key[1]); + if (next.length !== index.length) { + await writeIndex(next); + } + result = { success: true }; + break; + } + default: + return Response.json({ error: "Unknown NoSQL operation" }, { status: 404 }); + } + return Response.json(result || {}); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return Response.json({ error: message }, { status: 500 }); + } +} + +async function handleR2Request(request: Request, env: Env): Promise { + try { + const url = new URL(request.url); + const key = url.searchParams.get("key"); + if (!env.R2) { + return Response.json({ error: "R2 binding not configured" }, { status: 500 }); + } + + if (url.pathname === "/r2/list") { + const prefix = url.searchParams.get("prefix") || ""; + const list = await env.R2.list({ prefix }); + return Response.json({ objects: list.objects || [] }); + } + + if (url.pathname === "/r2/multipart-init") { + if (!key) { + return Response.json({ error: "Missing key parameter" }, { status: 400 }); + } + const multipart = await env.R2.createMultipartUpload(key); + return Response.json({ key: multipart.key, uploadId: multipart.uploadId }); + } + + if (url.pathname === "/r2/multipart-part") { + if (!key) { + return Response.json({ error: "Missing key parameter" }, { status: 400 }); + } + const uploadId = url.searchParams.get("uploadId"); + const partNumber = Number(url.searchParams.get("partNumber")); + const multipart = env.R2.resumeMultipartUpload(key, uploadId!); + const part = await multipart.uploadPart(partNumber, request.body!); + return Response.json({ partNumber: part.partNumber, etag: part.etag }); + } + + if (url.pathname === "/r2/multipart-complete") { + if (!key) { + return Response.json({ error: "Missing key parameter" }, { status: 400 }); + } + const uploadId = url.searchParams.get("uploadId"); + const { 
parts } = await request.json() as any; + const multipart = env.R2.resumeMultipartUpload(key, uploadId!); + await multipart.complete(parts); + return Response.json({ key }); + } + + if (!key) { + return Response.json({ error: "Missing key parameter" }, { status: 400 }); + } + + if (url.pathname === "/r2/download") { + const rangeHeader = request.headers.get("Range"); + let options: any = undefined; + let rangeStart: number | undefined; + let rangeEnd: number | undefined; + if (rangeHeader) { + const match = rangeHeader.match(/^bytes=(\\d+)-(\\d+)$/); + if (match) { + rangeStart = Number(match[1]); + rangeEnd = Number(match[2]); + options = { range: { offset: rangeStart, length: rangeEnd - rangeStart + 1 } }; + } + } + const object = (await env.R2.get(key, options)) as R2ObjectBody | null; + if (!object) { + return Response.json({ error: "Object not found" }, { status: 404 }); + } + const headers = new Headers(); + headers.set("Content-Type", object.httpMetadata?.contentType || "application/octet-stream"); + if (rangeHeader && rangeStart !== undefined && rangeEnd !== undefined) { + headers.set("Content-Range", `bytes ${rangeStart}-${rangeEnd}/${object.size}`); + headers.set("Content-Length", String(rangeEnd - rangeStart + 1)); + return new Response(object.body, { status: 206, headers }); + } + headers.set("Content-Length", String(object.size ?? "")); + return new Response(object.body, { headers }); + } + + if (url.pathname === "/r2/upload") { + await env.R2.put(key, request.body!); + return Response.json({ key }); + } + + return Response.json({ error: "Unknown R2 operation" }, { status: 404 }); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return Response.json({ error: message }, { status: 500 }); + } +}""" + + def _emit_fanin_coordinator(self) -> str: + """Emit the Durable Object that coordinates Map and Parallel fan-in.""" + return """\ +export class FanInCoordinator { + state: DurableObjectState; + env: Env; + + constructor(state: DurableObjectState, env: Env) { + this.state = state; + this.env = env; + } + + async fetch(request: Request): Promise { + if (request.method !== "POST") { + return new Response("Method not allowed", { status: 405 }); + } + + const report = await request.json() as { + parentId: string; + stateName: string; + idx: number; + total: number; + mode: "array" | "object"; + key: string | null; + result: any; + error?: string; + }; + const seenKey = `seen:${report.idx}`; + const alreadySeen = await this.state.storage.get(seenKey); + if (alreadySeen !== undefined) { + return Response.json({ ok: true, duplicate: true }); + } + + await this.state.storage.put(seenKey, true); + await this.state.storage.put(`result:${report.idx}`, { + key: report.key, + result: report.result, + error: report.error, + }); + + const entries = await this.state.storage.list<{ + key: string | null; + result: any; + error?: string; + }>({ prefix: "result:" }); + let assembledBytes = 0; + for (const entry of entries.values()) { + assembledBytes += textSizeBytes(entry.result); + assembledBytes += textSizeBytes(entry.error ?? ""); + } + if (assembledBytes > 900 * 1024) { + throw new Error( + "Fan-in payload exceeds 900 KiB - R2 reference path not yet implemented. " + + "Reduce fan-out width or result size." 
+ ); + } + + if (entries.size === report.total) { + const ordered = [...entries.entries()].sort(([a], [b]) => { + const ai = Number(a.slice("result:".length)); + const bi = Number(b.slice("result:".length)); + return ai - bi; + }); + const failed = ordered.find(([_idx, entry]) => entry.error); + let results: any; + let error: string | undefined; + if (failed) { + const [failedIdx, entry] = failed; + error = `${report.stateName}[${failedIdx.slice("result:".length)}]: ${entry.error}`; + } else if (report.mode === "array") { + results = []; + for (const [_idx, entry] of ordered) { + results.push(...entry.result); + } + } else { + results = {}; + for (const [_idx, entry] of ordered) { + results[entry.key!] = entry.result; + } + } + const instance = await this.env.WORKFLOW.get(report.parentId); + await instance.sendEvent({ + type: `${report.stateName}-complete-${report.parentId}`, + payload: error ? { error } : { results }, + }); + } + + return Response.json({ ok: true }); + } +}""" + + def _emit_dispatcher_container(self) -> str: + """Emit the container class used by the dispatcher Durable Object namespace.""" + return """\ +export class DispatcherContainer extends Container { + defaultPort = 8080; + sleepAfter = "5s"; +}""" + + def _map_func_name(self, state: Map) -> str: + """Return the task function name used by a Map state.""" + if isinstance(state.funcs, dict): + first_state = next(iter(state.funcs.values())) + return first_state["func_name"] + return state.funcs[0] + + def _map_item_input_expr(self, state: Map, root: str) -> str: + """Return the JavaScript expression used as each Map dispatch input.""" + if state.common_params: + param_spread = ", ".join( + f"{json.dumps(p)}: {self._js_var_path(root, p)}" + for p in state.common_params ) - result = "await r.json()" - return setup, result + return f"({{ array_element: item, {param_spread} }})" + return "item" @staticmethod def _js_var_path(root: str, dotted_path: str) -> str: @@ -470,6 +1037,14 @@ def 
_js_var_path(root: str, dotted_path: str) -> str: parts = dotted_path.split(".") return root + "." + ".".join(parts) + @staticmethod + def _js_identifier(name: str) -> str: + """Convert an FSM state name into a JavaScript-safe identifier fragment.""" + identifier = re.sub(r"\W", "_", name) + if identifier and identifier[0].isdigit(): + identifier = f"_{identifier}" + return identifier or "state" + def encode_task(self, state: Task) -> Union[dict, List[dict]]: """Not used — generation bypasses the standard encode pipeline.""" raise NotImplementedError("Use generate() directly") diff --git a/sebs/cloudflare/templates/wrangler-workflow.toml b/sebs/cloudflare/templates/wrangler-workflow.toml index 20bd20049..a9b010b60 100644 --- a/sebs/cloudflare/templates/wrangler-workflow.toml +++ b/sebs/cloudflare/templates/wrangler-workflow.toml @@ -12,6 +12,25 @@ name = "PLACEHOLDER_WORKFLOW_NAME" binding = "WORKFLOW" class_name = "BenchmarkWorkflow" -[[services]] -binding = "DISPATCHER" -service = "PLACEHOLDER_DISPATCHER_NAME" +[[workflows]] +name = "PLACEHOLDER_ITEM_WORKFLOW_NAME" +binding = "ITEM_WORKFLOW" +class_name = "ItemWorkflow" + +[[durable_objects.bindings]] +name = "FANIN" +class_name = "FanInCoordinator" + +[[durable_objects.bindings]] +name = "DISPATCHER" +class_name = "DispatcherContainer" + +[[containers]] +class_name = "DispatcherContainer" +image = "PLACEHOLDER_CONTAINER_IMAGE" +max_instances = 1 +instance_type = "lite" + +[[migrations]] +tag = "v1" +new_sqlite_classes = ["FanInCoordinator", "DispatcherContainer"] From 765125743d59ee76f505ea2cfb2ed4cc939297a7 Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 16 Jun 2026 21:49:40 +0200 Subject: [PATCH 225/230] feat: Add max_instances support for container health checks in Cloudflare class --- sebs/cloudflare/cloudflare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index 514fe1699..c85ff40e5 100644 --- a/sebs/cloudflare/cloudflare.py +++ 
b/sebs/cloudflare/cloudflare.py @@ -851,6 +851,7 @@ def _wait_for_container_rollout( health_instances = data.get("health", {}).get("instances", {}) healthy = health_instances.get("healthy", 0) starting = health_instances.get("starting", 0) + max_instances = data.get("max_instances", self.config.max_instances) self.logging.debug(f"Container {container_name} health: {health_instances}") if max_instances > 0 and healthy >= max_instances: self.logging.info( From 00ee34fa4ecb91bbc8ad287fffe7a21bd158d405 Mon Sep 17 00:00:00 2001 From: laurin Date: Tue, 16 Jun 2026 21:59:52 +0200 Subject: [PATCH 226/230] feat: Update health check logging and condition for container readiness --- sebs/cloudflare/cloudflare.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sebs/cloudflare/cloudflare.py b/sebs/cloudflare/cloudflare.py index c85ff40e5..69cc0e730 100644 --- a/sebs/cloudflare/cloudflare.py +++ b/sebs/cloudflare/cloudflare.py @@ -851,12 +851,13 @@ def _wait_for_container_rollout( health_instances = data.get("health", {}).get("instances", {}) healthy = health_instances.get("healthy", 0) starting = health_instances.get("starting", 0) - max_instances = data.get("max_instances", self.config.max_instances) - self.logging.debug(f"Container {container_name} health: {health_instances}") - if max_instances > 0 and healthy >= max_instances: + self.logging.debug( + f"Container {container_name} health: {health_instances}" + ) + if healthy > 0: self.logging.info( f"Container {container_name} is ready " - f"({healthy}/{max_instances} instances healthy)." + f"({healthy} instance(s) healthy)." 
) return self.logging.info( From 4766315dc898d7ad93c0db4af134e7983aa516e7 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 18 Jun 2026 10:51:37 +0200 Subject: [PATCH 227/230] Fix Azure workflow regression execution Add an Azure workflow HTTP trigger that exposes Durable Functions as SeBS library-style workflow invocations by starting the orchestration, polling the status URL, and returning the completed workflow output. Fix Azure workflow packaging/runtime behavior: - preserve benchmark-relative imports by importing activity modules as packages - add wrapper-level User-Agent handling for outbound requests - query the correct workflow HTTP entrypoint after publish - avoid duplicate HTTP trigger handling - quote Azure app settings and restart apps after env updates - support cached workflow trigger deserialization Fix workflow result handling for nested Map states inside Parallel branches so genome workflow outputs preserve the expected branch payload shape. Run high-CPU genome workflows on an EP3 Linux Premium plan with distinct app names, and limit genome activity concurrency in host.json to avoid shared /tmp collisions in benchmark code. Fix Azure blob range reads to use inclusive end-byte semantics, matching the benchmark expectations, and improve regression failure diagnostics so invocation exceptions are written to result JSON instead of being masked by harness errors. 
--- benchmarks/wrappers/azure/python/fsm.py | 7 + .../wrappers/azure/python/handler_workflow.py | 44 ++++- .../wrappers/azure/python/main_workflow.py | 2 +- .../wrappers/azure/python/run_subworkflow.py | 18 +- .../wrappers/azure/python/run_workflow.py | 18 +- benchmarks/wrappers/azure/python/storage.py | 7 +- sebs/azure/azure.py | 167 ++++++++++++++++-- sebs/azure/function.py | 8 +- sebs/azure/triggers.py | 135 ++++++++++++++ 9 files changed, 376 insertions(+), 30 deletions(-) diff --git a/benchmarks/wrappers/azure/python/fsm.py b/benchmarks/wrappers/azure/python/fsm.py index 039457cd4..5d4039ec0 100644 --- a/benchmarks/wrappers/azure/python/fsm.py +++ b/benchmarks/wrappers/azure/python/fsm.py @@ -83,6 +83,13 @@ def __init__( self.next = next self.common_params = common_params + @property + def func_name(self) -> str: + state = self.funcs[self.root] + while state["type"] == "map": + state = state["states"][state["root"]] + return state["func_name"] + @classmethod def deserialize(cls, name: str, payload: dict) -> "Map": return cls( diff --git a/benchmarks/wrappers/azure/python/handler_workflow.py b/benchmarks/wrappers/azure/python/handler_workflow.py index 5de2d74ab..45ca811d0 100644 --- a/benchmarks/wrappers/azure/python/handler_workflow.py +++ b/benchmarks/wrappers/azure/python/handler_workflow.py @@ -1,15 +1,43 @@ import datetime import json import os +import sys import uuid import importlib -import importlib.util import logging import azure.functions as func from redis import Redis +SEBS_USER_AGENT = "SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2" + + +def patch_requests_user_agent(): + try: + import requests + except ImportError: + return + + original_request = requests.api.request + if getattr(original_request, "_sebs_user_agent_patched", False): + return + + def patched_request(method, url, **kwargs): + headers = dict(kwargs.get("headers") or {}) + header_names = {key.lower() for key in headers} + if "user-agent" not in 
header_names: + headers["User-Agent"] = SEBS_USER_AGENT + kwargs["headers"] = headers + return original_request(method, url, **kwargs) + + patched_request._sebs_user_agent_patched = True + requests.api.request = patched_request + requests.request = patched_request + + +patch_requests_user_agent() + if 'NOSQL_STORAGE_DATABASE' in os.environ: from . import nosql nosql.nosql.get_instance( @@ -17,10 +45,12 @@ os.environ['NOSQL_STORAGE_URL'], os.environ['NOSQL_STORAGE_CREDS'] ) + sys.modules["nosql"] = nosql if 'STORAGE_CONNECTION_STRING' in os.environ: from . import storage storage.storage.get_instance(os.environ['STORAGE_CONNECTION_STRING']) + sys.modules["storage"] = storage def probe_cold_start(): is_cold = False @@ -46,10 +76,14 @@ def main(event, context: func.Context): event["payload"]["request-id"] = context.invocation_id - module_path = os.path.join(os.path.dirname(__file__), f"{func_name}.py") - spec = importlib.util.spec_from_file_location(func_name, module_path) - function = importlib.util.module_from_spec(spec) - spec.loader.exec_module(function) + current_dir = os.path.dirname(__file__) + if current_dir not in sys.path: + sys.path.insert(0, current_dir) + parent_dir = os.path.dirname(current_dir) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + package = __package__ or func_name + function = importlib.import_module(f"{package}.{func_name}") res = function.handler(event["payload"]) diff --git a/benchmarks/wrappers/azure/python/main_workflow.py b/benchmarks/wrappers/azure/python/main_workflow.py index 64868a919..392f18847 100644 --- a/benchmarks/wrappers/azure/python/main_workflow.py +++ b/benchmarks/wrappers/azure/python/main_workflow.py @@ -32,7 +32,7 @@ async def main(req: func.HttpRequest, starter: str, context: func.Context) -> fu instance_id = await client.start_new("run_workflow", None, event) res = await client.wait_for_completion_or_create_check_status_response( - req, instance_id, timeout_in_milliseconds=600000 + req, 
instance_id, timeout_in_milliseconds=1000 ) end = datetime.datetime.now() diff --git a/benchmarks/wrappers/azure/python/run_subworkflow.py b/benchmarks/wrappers/azure/python/run_subworkflow.py index c2730714e..4fd28c577 100644 --- a/benchmarks/wrappers/azure/python/run_subworkflow.py +++ b/benchmarks/wrappers/azure/python/run_subworkflow.py @@ -5,6 +5,7 @@ import operator import logging import datetime +import copy import azure.durable_functions as df from redis import Redis @@ -152,7 +153,6 @@ def handler(context: df.DurableOrchestrationContext): elif isinstance(first_state, Map): array = get_var(res, first_state.array) - tasks = [] if first_state.next: #call suborchestrator. @@ -197,6 +197,7 @@ def handler(context: df.DurableOrchestrationContext): duration += (now() - ts) map_res = yield context.task_all(parallel_tasks) ts = now() + base_res = res res = {} for state in first_states: @@ -205,10 +206,21 @@ def handler(context: df.DurableOrchestrationContext): output = [] for index in indices: output.append(map_res[index]) - res[state.func_name] = output + if isinstance(state, Map): + branch_res = copy.deepcopy(base_res) + set_var(branch_res, output, state.array) + res[state.func_name] = branch_res + else: + res[state.func_name] = output else: #task state - res[state.func_name] = map_res[indices[0]] + output = map_res[indices[0]] + if isinstance(state, Map): + branch_res = copy.deepcopy(base_res) + set_var(branch_res, output, state.array) + res[state.func_name] = branch_res + else: + res[state.func_name] = output current = states.get(current.next, None) diff --git a/benchmarks/wrappers/azure/python/run_workflow.py b/benchmarks/wrappers/azure/python/run_workflow.py index 868909e4a..eb4b95464 100644 --- a/benchmarks/wrappers/azure/python/run_workflow.py +++ b/benchmarks/wrappers/azure/python/run_workflow.py @@ -5,6 +5,7 @@ import operator import logging import datetime +import copy import azure.durable_functions as df from redis import Redis @@ -191,7 +192,6 @@ def 
handler(context: df.DurableOrchestrationContext): elif isinstance(first_state, Map): array = get_var(res, first_state.array) - tasks = [] if first_state.next: #call suborchestrator. @@ -239,6 +239,7 @@ def handler(context: df.DurableOrchestrationContext): duration += (now() - ts) map_res = yield context.task_all(parallel_tasks) ts = now() + base_res = res res = {} for state in first_states: @@ -248,10 +249,21 @@ def handler(context: df.DurableOrchestrationContext): output = [] for index in indices: output.append(map_res[index]) - res[state.func_name] = output + if isinstance(state, Map): + branch_res = copy.deepcopy(base_res) + set_var(branch_res, output, state.array) + res[state.func_name] = branch_res + else: + res[state.func_name] = output else: #task state - res[state.func_name] = map_res[indices[0]] + output = map_res[indices[0]] + if isinstance(state, Map): + branch_res = copy.deepcopy(base_res) + set_var(branch_res, output, state.array) + res[state.func_name] = branch_res + else: + res[state.func_name] = output current = states.get(current.next, None) diff --git a/benchmarks/wrappers/azure/python/storage.py b/benchmarks/wrappers/azure/python/storage.py index 299e0b9dd..60b98fe37 100644 --- a/benchmarks/wrappers/azure/python/storage.py +++ b/benchmarks/wrappers/azure/python/storage.py @@ -47,6 +47,8 @@ def download_directory(self, container, prefix, path): def upload_stream(self, container, file, data, unique_name=True): key_name = storage.unique_name(file) if unique_name else file client = self.client.get_blob_client(container=container, blob=key_name) + if hasattr(data, "seek"): + data.seek(0) client.upload_blob(data, overwrite=not unique_name) return key_name @@ -56,7 +58,10 @@ def download_stream(self, container, file): def download_within_range(self, container, file, start_bytes, end_bytes): client = self.client.get_blob_client(container=container, blob=file) - return client.download_blob(offset=start_bytes, length=end_bytes - 
start_bytes).readall().decode("utf-8") + return client.download_blob( + offset=start_bytes, + length=end_bytes - start_bytes + 1, + ).readall().decode("utf-8") @staticmethod def get_instance(connection_string: Optional[str] = None): diff --git a/sebs/azure/azure.py b/sebs/azure/azure.py index 635833395..cb1205ab3 100644 --- a/sebs/azure/azure.py +++ b/sebs/azure/azure.py @@ -37,8 +37,10 @@ import random import re import os +import shlex import shutil import time +import threading import uuid from typing import cast, Dict, List, Optional, Set, Tuple, Type # noqa @@ -85,6 +87,14 @@ class Azure(System): # runtime mapping AZURE_RUNTIMES = {"python": "python", "nodejs": "node", "java": "java"} + HIGH_CPU_WORKFLOW_BENCHMARKS = { + "6100.1000-genome", + "6101.1000-genome-individuals", + } + HIGH_CPU_WORKFLOW_PLAN_SKU = "EP3" + HIGH_CPU_WORKFLOW_NAME_SUFFIX = "ep3linux" + FUNCTION_APP_NAME_LIMIT = 60 + _workflow_plan_lock = threading.Lock() @staticmethod def name() -> str: @@ -446,7 +456,8 @@ def _package_code_workflow( func_dirs.append(func_dir) os.makedirs(func_dir) - shutil.move(os.path.join(directory, file), os.path.join(func_dir, file)) + target_file = os.path.join(func_dir, file) + shutil.move(os.path.join(directory, file), target_file) # Generate function.json for each function directory script_file = file if name in bindings else "handler.py" @@ -502,6 +513,12 @@ def _package_code_workflow( "version": "[2.*, 3.0.0)", }, } + if self._requires_high_cpu_workflow_plan(benchmark): + host_json["extensions"] = { + "durableTask": { + "maxConcurrentActivityFunctions": 1, + } + } json.dump(host_json, open(os.path.join(directory, "host.json"), "w"), indent=2) code_size = Benchmark.directory_size(directory) @@ -693,8 +710,9 @@ def publish_function( self.logging.info("Querying function details to retrieve URL") resource_group = self.config.resources.resource_group(self.cli_instance) + entrypoint = "main" if isinstance(function, AzureWorkflow) else "handler" query_cmd = ( - "az 
functionapp function show --function-name handler " + f"az functionapp function show --function-name {entrypoint} " f"--name {function.name} --resource-group {resource_group}" ) @@ -758,7 +776,6 @@ def update_function( if isinstance(trigger, HTTPTrigger): found_trigger = True trigger.url = function_url - break if not found_trigger: trigger = HTTPTrigger( @@ -846,7 +863,7 @@ def update_envs( try: env_string = "" for k, v in envs.items(): - env_string += f" {k}={v}" + env_string += f" {shlex.quote(f'{k}={v}')}" self.logging.info(f"Exporting environment variables for function {function.name}") self.cli_instance.execute( @@ -854,13 +871,14 @@ def update_envs( f" --resource-group {resource_group} " f" --settings {env_string} " ) - - # if we don't do that, next invocation might still see old values - # Disabled since we swapped the order - we first update envs, then we publish. - # self.logging.info( - # "Sleeping for 10 seconds - Azure needs more time to propagate changes. " - # "Otherwise, functions might not see new variables and fail unexpectedly." 
- # ) + self.logging.info( + f"Restarting function {function.name} to apply environment variables" + ) + self.cli_instance.execute( + f"az functionapp restart --name {function.name} " + f" --resource-group {resource_group} " + ) + time.sleep(10) except RuntimeError as e: self.logging.error("Failed to set environment variable!") @@ -960,6 +978,21 @@ def default_function_name( .replace(".", "-") .replace("_", "-") ) + if self._requires_high_cpu_workflow_plan(code_package.benchmark): + func_name = f"{func_name}-{self.HIGH_CPU_WORKFLOW_NAME_SUFFIX}" + if len(func_name) > self.FUNCTION_APP_NAME_LIMIT: + benchmark_id = code_package.benchmark.split(".")[0] + func_name = ( + "sebs-{}-{}-{}-{}-{}".format( + self.config.resources.resources_id, + benchmark_id, + code_package.language_name, + code_package.language_version, + self.HIGH_CPU_WORKFLOW_NAME_SUFFIX, + ) + .replace(".", "-") + .replace("_", "-") + ) return func_name def create_function( @@ -1091,6 +1124,76 @@ def cached_function(self, function: Function) -> None: azure_trigger.logging_handlers = self.logging_handlers azure_trigger.data_storage_account = data_storage_account + def _requires_high_cpu_workflow_plan(self, benchmark: str) -> bool: + return benchmark in self.HIGH_CPU_WORKFLOW_BENCHMARKS + + def _high_cpu_workflow_plan_name(self) -> str: + sku = self.HIGH_CPU_WORKFLOW_PLAN_SKU.lower() + return f"sebs-{self.config.resources.resources_id}-workflow-{sku}" + + def _ensure_high_cpu_workflow_plan(self, resource_group: str, region: str) -> str: + plan_name = self._high_cpu_workflow_plan_name() + + with self._workflow_plan_lock: + try: + self.cli_instance.execute( + ( + "az functionapp plan show " + f"--resource-group {resource_group} --name {plan_name}" + ) + ) + return plan_name + except RuntimeError: + pass + + self.logging.info( + f"Creating Azure Functions Premium plan {plan_name} " + f"({self.HIGH_CPU_WORKFLOW_PLAN_SKU}) for high-CPU workflows" + ) + try: + self.cli_instance.execute( + ( + "az functionapp 
plan create " + f"--resource-group {resource_group} " + f"--name {plan_name} " + f"--location {region} " + f"--sku {self.HIGH_CPU_WORKFLOW_PLAN_SKU} " + "--is-linux " + "--number-of-workers 1" + ) + ) + except RuntimeError as e: + if "already exists" not in str(e).lower(): + raise e from None + + return plan_name + + def _ensure_function_app_plan( + self, function_name: str, resource_group: str, plan_name: str + ) -> None: + ret = self.cli_instance.execute( + ( + "az functionapp show " + f"--resource-group {resource_group} " + f"--name {function_name}" + ) + ) + app = json.loads(ret.decode("utf-8")) + current_plan_id = ( + app.get("serverFarmId") + or app.get("appServicePlanId") + or app.get("properties", {}).get("serverFarmId") + or "" + ).lower() + if current_plan_id.endswith(f"/serverfarms/{plan_name.lower()}"): + return + + raise RuntimeError( + f"Workflow app {function_name} is on plan {current_plan_id}, " + f"expected {plan_name}. Azure does not support migrating Linux " + "Consumption function apps to Premium; redeploy with a new app name." + ) + def create_workflow(self, code_package: Benchmark, workflow_name: str) -> AzureWorkflow: """Create a new Azure Durable Functions workflow. 
@@ -1118,6 +1221,14 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> AzureW "runtime": self.AZURE_RUNTIMES[language], "runtime_version": language_runtime, } + high_cpu_plan_name: Optional[str] = None + if self._requires_high_cpu_workflow_plan(code_package.benchmark): + high_cpu_plan_name = self._ensure_high_cpu_workflow_plan(resource_group, region) + config["plan_args"] = f"--plan {high_cpu_plan_name}" + config["os_args"] = "" + else: + config["plan_args"] = "--consumption-plan-location {region}".format(**config) + config["os_args"] = "--os-type Linux" # Check if function app already exists function_storage_account: Optional[AzureResources.Storage] = None @@ -1129,6 +1240,10 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> AzureW " --name {func_name} " ).format(**config) ) + except RuntimeError: + ret = None + + if ret is not None: for setting in json.loads(ret.decode()): if setting["name"] == "AzureWebJobsStorage": connection_string = setting["value"] @@ -1139,7 +1254,10 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> AzureW ) assert function_storage_account is not None self.logging.info("Azure: Selected existing workflow app {}".format(workflow_name)) - except RuntimeError: + if high_cpu_plan_name: + self._ensure_function_app_plan(workflow_name, resource_group, high_cpu_plan_name) + + if function_storage_account is None: function_storage_account = self.config.resources.add_storage_account(self.cli_instance) config["storage_account"] = function_storage_account.account_name while True: @@ -1147,7 +1265,7 @@ def create_workflow(self, code_package: Benchmark, workflow_name: str) -> AzureW self.cli_instance.execute( ( " az functionapp create --resource-group {resource_group} " - " --os-type Linux --consumption-plan-location {region} " + " {os_args} {plan_args} " " --runtime {runtime} --runtime-version {runtime_version} " " --name {func_name} --storage-account {storage_account}" " 
--functions-version 4 " @@ -1188,6 +1306,10 @@ def update_workflow(self, workflow: Workflow, code_package: Benchmark) -> None: workflow: Workflow instance to update code_package: New benchmark code package """ + if self._requires_high_cpu_workflow_plan(code_package.benchmark): + resource_group = self.config.resources.resource_group(self.cli_instance) + plan_name = self._ensure_high_cpu_workflow_plan(resource_group, self.config.region) + self._ensure_function_app_plan(workflow.name, resource_group, plan_name) self.update_function(workflow, code_package, code_package.system_variant, None) def download_metrics( @@ -1331,9 +1453,24 @@ def create_trigger(self, function: Function, trigger_type: Trigger.TriggerType) Raises: NotImplementedError: If no HTTP trigger exists on the function. """ - from sebs.azure.triggers import HTTPTrigger + from sebs.azure.function import AzureWorkflow + from sebs.azure.triggers import HTTPTrigger, WorkflowHTTPTrigger http_triggers = function.triggers(Trigger.TriggerType.HTTP) - if http_triggers: + if trigger_type == Trigger.TriggerType.LIBRARY and isinstance(function, AzureWorkflow): + library_triggers = function.triggers(Trigger.TriggerType.LIBRARY) + if library_triggers: + return library_triggers[0] + if http_triggers: + trigger = WorkflowHTTPTrigger( + http_triggers[0].url, + self.config.resources.data_storage_account(self.cli_instance), + ) + trigger.logging_handlers = self.logging_handlers + function.add_trigger(trigger) + self.cache_client.update_function(function) + return trigger + + if trigger_type == Trigger.TriggerType.HTTP and http_triggers: return http_triggers[0] raise NotImplementedError() diff --git a/sebs/azure/function.py b/sebs/azure/function.py index bb2011dc6..449bc79f2 100644 --- a/sebs/azure/function.py +++ b/sebs/azure/function.py @@ -113,7 +113,7 @@ def serialize(self) -> dict: @staticmethod def deserialize(cached_config: dict) -> "AzureWorkflow": """Deserialize workflow from cached configuration.""" - from 
sebs.azure.triggers import HTTPTrigger + from sebs.azure.triggers import HTTPTrigger, WorkflowHTTPTrigger cfg = FunctionConfig.deserialize(cached_config["config"]) ret = AzureWorkflow( @@ -124,7 +124,11 @@ def deserialize(cached_config: dict) -> "AzureWorkflow": cfg, ) for trigger in cached_config["triggers"]: - trigger_type = {"HTTP": HTTPTrigger}.get(trigger["type"]) + trigger_type = { + "HTTP": HTTPTrigger, + "WorkflowHTTP": WorkflowHTTPTrigger, + WorkflowHTTPTrigger.typename(): WorkflowHTTPTrigger, + }.get(trigger["type"]) assert trigger_type, "Unknown trigger type {}".format(trigger["type"]) ret.add_trigger(trigger_type.deserialize(trigger)) return ret diff --git a/sebs/azure/triggers.py b/sebs/azure/triggers.py index 85b4bafe2..6991afe33 100644 --- a/sebs/azure/triggers.py +++ b/sebs/azure/triggers.py @@ -23,6 +23,10 @@ """ import concurrent.futures +import json +import time +from datetime import datetime +from io import BytesIO from typing import Any, Dict, Optional # noqa from sebs.azure.config import AzureResources @@ -152,3 +156,134 @@ def deserialize(obj: dict) -> Trigger: HTTPTrigger instance with restored configuration. """ return HTTPTrigger(obj["url"]) + + +class WorkflowHTTPTrigger(HTTPTrigger): + """HTTP-backed trigger for Azure Durable Function workflows. + + Azure starts Durable Functions workflows through an HTTP endpoint, but SeBS + treats workflows as library triggers and validates the workflow result + directly. This trigger keeps the Azure HTTP transport while exposing the + same result shape as AWS Step Functions and Cloudflare Workflows. 
+ """ + + @staticmethod + def typename() -> str: + """Return the canonical type name for this trigger class.""" + return "Azure.WorkflowHTTPTrigger" + + @staticmethod + def trigger_type() -> Trigger.TriggerType: + """Get the trigger type.""" + return Trigger.TriggerType.LIBRARY + + def sync_invoke(self, payload: dict) -> ExecutionResult: + """Synchronously invoke an Azure Durable Function workflow.""" + begin = datetime.now() + status_code, raw, conn_time, receive_time = self._http_post_json( + self.url, payload, timeout=300 + ) + envelope = self._parse_json_response(raw, self.url) + + if status_code not in [200, 202]: + self.logging.error(f"Invocation on URL {self.url} failed!") + self.logging.error(f"Output: {envelope}") + raise RuntimeError(f"Failed invocation of function! Output: {envelope}") + + workflow_result = envelope.get("result") + if isinstance(workflow_result, dict) and "statusQueryGetUri" in workflow_result: + workflow_result = self._poll_workflow_status( + workflow_result["statusQueryGetUri"], begin + ) + + end = datetime.now() + result = ExecutionResult.from_times(begin, end) + result.times.http_startup = conn_time + result.times.http_first_byte_return = receive_time + result.request_id = envelope.get("request_id", "") + + parsed_output = dict(envelope) + parsed_output["result"] = workflow_result + parsed_output["end"] = f"{end.timestamp():.6f}" + result.parse_benchmark_output(parsed_output) + result.output = workflow_result + return result + + def _http_post_json(self, url: str, payload: dict, timeout: int) -> tuple[int, bytes, float, float]: + """POST JSON and return status, body, connection time, and first-byte time.""" + import pycurl + + c = pycurl.Curl() + c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/json"]) + c.setopt(pycurl.POST, 1) + c.setopt(pycurl.URL, url) + c.setopt(pycurl.POSTFIELDS, json.dumps(payload)) + c.setopt(pycurl.TIMEOUT, timeout) + data = BytesIO() + c.setopt(pycurl.WRITEFUNCTION, data.write) + c.perform() + 
status_code = c.getinfo(pycurl.RESPONSE_CODE) + conn_time = c.getinfo(pycurl.PRETRANSFER_TIME) + receive_time = c.getinfo(pycurl.STARTTRANSFER_TIME) + c.close() + return status_code, data.getvalue(), conn_time, receive_time + + def _http_get(self, url: str, timeout: int) -> tuple[int, bytes]: + """GET a URL and return status and body.""" + import pycurl + + c = pycurl.Curl() + c.setopt(pycurl.URL, url) + c.setopt(pycurl.TIMEOUT, timeout) + data = BytesIO() + c.setopt(pycurl.WRITEFUNCTION, data.write) + c.perform() + status_code = c.getinfo(pycurl.RESPONSE_CODE) + c.close() + return status_code, data.getvalue() + + def _parse_json_response(self, raw: bytes, url: str) -> dict: + """Parse a JSON response or raise a useful invocation error.""" + try: + return json.loads(raw) + except json.JSONDecodeError: + text = raw.decode(errors="replace") + self.logging.error(f"Invocation on URL {url} failed!") + self.logging.error(f"Output: {text if text else 'No output provided!'}") + raise RuntimeError(f"Failed invocation of function! 
Output: {text}") from None + + def _poll_workflow_status(self, status_url: str, begin: datetime) -> Any: + """Poll Azure Durable Functions status until the workflow completes.""" + max_poll_time = 7200 + poll_interval = 5 + + while (datetime.now() - begin).total_seconds() < max_poll_time: + status_code, raw = self._http_get(status_url, timeout=60) + status = self._parse_json_response(raw, status_url) + + if status_code not in [200, 202]: + self.logging.warning( + f"Workflow status poll failed with status={status_code}: {status}" + ) + time.sleep(poll_interval) + continue + + runtime_status = status.get("runtimeStatus") + if runtime_status == "Completed": + return status.get("output") + if runtime_status in ["Failed", "Terminated", "Canceled"]: + self.logging.error(f"Workflow execution failed: {status}") + raise RuntimeError(f"Workflow execution failed: {status}") + + time.sleep(poll_interval) + + raise RuntimeError(f"Workflow did not complete within {max_poll_time}s") + + def serialize(self) -> dict: + """Serialize trigger to dictionary.""" + return {"type": self.typename(), "url": self.url} + + @staticmethod + def deserialize(obj: dict) -> Trigger: + """Deserialize trigger from dictionary.""" + return WorkflowHTTPTrigger(obj["url"]) From 915e228c6cacf9f6eb86fc848e23ce4f746f8720 Mon Sep 17 00:00:00 2001 From: laurin Date: Thu, 18 Jun 2026 10:54:41 +0200 Subject: [PATCH 228/230] feat: Update Cloudflare configuration with max_instances and chunk_size settings --- configs/example.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/example.json b/configs/example.json index 4d8fe7410..7ccc19c07 100644 --- a/configs/example.json +++ b/configs/example.json @@ -104,7 +104,8 @@ } }, "cloudflare": { - "max_instances": 1 + "max_instances": 20, + "chunk_size": 1 }, "openwhisk": { "shutdownStorage": false, From bc40b40ed8399c2df994a7580186db3c25914b84 Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Thu, 18 Jun 2026 13:31:24 +0200 
Subject: [PATCH 229/230] feat: Add google-cloud-workflows dependency to GCP requirements --- pyproject.toml | 1 + requirements.gcp.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 377f19b53..19ded6c57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ dependencies = [ "google-cloud-logging>=2.0.0", "google-cloud-datastore", "google-cloud-build>=3.35.0", + "google-cloud-workflows", # benchmark verification - process image "Pillow", diff --git a/requirements.gcp.txt b/requirements.gcp.txt index f1e136777..f9d712b11 100644 --- a/requirements.gcp.txt +++ b/requirements.gcp.txt @@ -6,4 +6,5 @@ google-api-python-client-stubs google-cloud-logging>=2.0.0 google-cloud-datastore google-cloud-build>=3.35.0 +google-cloud-workflows From fa0c4e6975c54c701351745372efd6c1af86821e Mon Sep 17 00:00:00 2001 From: =Laurin <=laurin@stomp.li> Date: Thu, 18 Jun 2026 15:25:43 +0200 Subject: [PATCH 230/230] fix: Revert timeout configuration in workflow to 900 seconds. 
Set benchmarks-data to the appropriate branch "workflows" --- benchmarks-data | 2 +- benchmarks/600.workflows/6100.1000-genome/config.json | 2 +- .../600.workflows/6101.1000-genome-individuals/config.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks-data b/benchmarks-data index 30ca2f5c5..54ff4f3d9 160000 --- a/benchmarks-data +++ b/benchmarks-data @@ -1 +1 @@ -Subproject commit 30ca2f5c533c3f441deb5e05fc03a39fe65f9948 +Subproject commit 54ff4f3d9b7894e040bcdfdbcc8fe2f9f1168c1d diff --git a/benchmarks/600.workflows/6100.1000-genome/config.json b/benchmarks/600.workflows/6100.1000-genome/config.json index aff11b0e8..5bb1699c5 100644 --- a/benchmarks/600.workflows/6100.1000-genome/config.json +++ b/benchmarks/600.workflows/6100.1000-genome/config.json @@ -1,5 +1,5 @@ { - "timeout": 1800, + "timeout": 900, "memory": 2048, "languages": ["python"], "modules": ["storage"] diff --git a/benchmarks/600.workflows/6101.1000-genome-individuals/config.json b/benchmarks/600.workflows/6101.1000-genome-individuals/config.json index aff11b0e8..5bb1699c5 100644 --- a/benchmarks/600.workflows/6101.1000-genome-individuals/config.json +++ b/benchmarks/600.workflows/6101.1000-genome-individuals/config.json @@ -1,5 +1,5 @@ { - "timeout": 1800, + "timeout": 900, "memory": 2048, "languages": ["python"], "modules": ["storage"]