Skip to content

Commit ff075a9

Browse files
fix: add backpressure, lower memory default to 64MB, fix except syntax
- Replace immediate 503 rejection with 30s backpressure wait using asyncio.Condition (prevents ES snapshot failures)
- Lower default memory limit from 128MB to 64MB everywhere
- Fix Python 2 except syntax (except X, Y → except (X, Y)) in 4 files
- Fix envsubst replacing runtime bash vars in esrally-job.yaml
1 parent 2132dc9 commit ff075a9

File tree

7 files changed

+47
-28
lines changed

7 files changed

+47
-28
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ Master Key → KEK (derived via SHA-256)
120120
| `redis-ha.enabled` | `true` | Deploy embedded Redis HA |
121121
| `gateway.enabled` | `false` | Create gateway service |
122122
| `ingress.enabled` | `false` | Enable ingress |
123-
| `performance.memoryLimitMb` | `128` | Memory budget for streaming concurrency |
123+
| `performance.memoryLimitMb` | `64` | Memory budget for streaming concurrency |
124124

125125
See [chart/README.md](chart/README.md) for all options.
126126

chart/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ helm install s3proxy oci://ghcr.io/serversidehannes/s3proxy-python/charts/s3prox
2121
| `s3.region` | `us-east-1` | AWS region |
2222
| `server.port` | `4433` | Proxy listen port |
2323
| `server.noTls` | `true` | Disable TLS (in-cluster only) |
24-
| `performance.memoryLimitMb` | `128` | Memory budget for streaming |
24+
| `performance.memoryLimitMb` | `64` | Memory budget for streaming |
2525
| `logLevel` | `DEBUG` | Log level |
2626
| `secrets.encryptKey` | `""` | AES-256 encryption key |
2727
| `secrets.awsAccessKeyId` | `""` | AWS access key |

chart/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ server:
1616
noTls: true
1717

1818
performance:
19-
memoryLimitMb: 128
19+
memoryLimitMb: 64
2020

2121
externalRedis:
2222
url: ""

e2e/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ services:
416416
--set secrets.awsAccessKeyId="minioadmin" \
417417
--set secrets.awsSecretAccessKey="minioadmin" \
418418
--set logLevel="DEBUG" \
419-
--set performance.memoryLimitMb=128 \
419+
--set performance.memoryLimitMb=64 \
420420
--set gateway.enabled=true \
421421
--set ingress.enabled=true \
422422
--set 'ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-body-size=256m' \

e2e/elasticsearch/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ kubectl wait --namespace "$NAMESPACE" \
9292

9393
# Start esrally job NOW - it will do apt-get + pip install while ES is still starting
9494
log_info "Starting esrally loader job..."
95-
envsubst < "${SCRIPT_DIR}/templates/esrally-job.yaml" | kubectl apply -n "$NAMESPACE" -f -
95+
envsubst '$CLUSTER_NAME' < "${SCRIPT_DIR}/templates/esrally-job.yaml" | kubectl apply -n "$NAMESPACE" -f -
9696

9797
# Follow esrally logs in background
9898
kubectl wait --namespace "$NAMESPACE" --for=condition=Ready pod -l job-name=geonames-loader --timeout=120s 2>/dev/null || true

s3proxy/concurrency.py

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,22 @@ def _create_malloc_release() -> Callable[[], int] | None:
4242
_malloc_release = _create_malloc_release()
4343

4444

45+
BACKPRESSURE_TIMEOUT = 30 # seconds to wait before rejecting
46+
47+
4548
class ConcurrencyLimiter:
46-
"""Memory-based concurrency limiter.
49+
"""Memory-based concurrency limiter with backpressure.
4750
48-
Tracks reserved memory across concurrent requests and rejects new requests
49-
when the configured limit would be exceeded.
51+
Tracks reserved memory across concurrent requests. When the limit would be
52+
exceeded, waits for memory to free up instead of rejecting immediately.
5053
"""
5154

5255
def __init__(self, limit_mb: int = 128) -> None:
5356
self._limit_mb = limit_mb
5457
self._limit_bytes = limit_mb * 1024 * 1024
5558
self._active_bytes = 0
5659
self._lock = asyncio.Lock()
60+
self._condition = asyncio.Condition(self._lock)
5761
MEMORY_LIMIT_BYTES.set(self._limit_bytes)
5862

5963
@property
@@ -76,39 +80,54 @@ def set_memory_limit(self, limit_mb: int) -> None:
7680
MEMORY_LIMIT_BYTES.set(self._limit_bytes)
7781

7882
async def try_acquire(self, bytes_needed: int) -> int:
79-
"""Reserve memory. Returns bytes reserved. Raises S3Error.slow_down if exhausted."""
83+
"""Reserve memory, waiting up to BACKPRESSURE_TIMEOUT if at capacity."""
8084
if self._limit_bytes <= 0:
8185
return 0
8286

8387
to_reserve = max(MIN_RESERVATION, min(bytes_needed, self._limit_bytes))
8488

85-
async with self._lock:
86-
if self._active_bytes + to_reserve > self._limit_bytes:
87-
active_mb = self._active_bytes / 1024 / 1024
88-
request_mb = to_reserve / 1024 / 1024
89-
limit_mb = self._limit_bytes / 1024 / 1024
90-
logger.warning(
91-
"MEMORY_REJECTED",
92-
active_mb=round(active_mb, 2),
93-
requested_mb=round(request_mb, 2),
94-
limit_mb=round(limit_mb, 2),
95-
)
96-
MEMORY_REJECTIONS.inc()
97-
raise S3Error.slow_down(
98-
f"Memory limit: {active_mb:.0f}MB + {request_mb:.0f}MB > {limit_mb:.0f}MB"
89+
async with self._condition:
90+
deadline = asyncio.get_event_loop().time() + BACKPRESSURE_TIMEOUT
91+
while self._active_bytes + to_reserve > self._limit_bytes:
92+
remaining = deadline - asyncio.get_event_loop().time()
93+
if remaining <= 0:
94+
active_mb = self._active_bytes / 1024 / 1024
95+
request_mb = to_reserve / 1024 / 1024
96+
limit_mb = self._limit_bytes / 1024 / 1024
97+
logger.warning(
98+
"MEMORY_REJECTED",
99+
active_mb=round(active_mb, 2),
100+
requested_mb=round(request_mb, 2),
101+
limit_mb=round(limit_mb, 2),
102+
waited_sec=BACKPRESSURE_TIMEOUT,
103+
)
104+
MEMORY_REJECTIONS.inc()
105+
raise S3Error.slow_down(
106+
f"Memory limit: {active_mb:.0f}MB + {request_mb:.0f}MB > {limit_mb:.0f}MB"
107+
)
108+
logger.info(
109+
"MEMORY_BACKPRESSURE",
110+
active_mb=round(self._active_bytes / 1024 / 1024, 2),
111+
requested_mb=round(to_reserve / 1024 / 1024, 2),
112+
limit_mb=round(self._limit_bytes / 1024 / 1024, 2),
113+
remaining_sec=round(remaining, 1),
99114
)
115+
with contextlib.suppress(TimeoutError):
116+
await asyncio.wait_for(self._condition.wait(), timeout=remaining)
117+
100118
self._active_bytes += to_reserve
101119
MEMORY_RESERVED_BYTES.set(self._active_bytes)
102120
return to_reserve
103121

104122
async def release(self, bytes_reserved: int) -> None:
105-
"""Release reserved memory and trigger OS memory release."""
123+
"""Release reserved memory and wake waiting requests."""
106124
if self._limit_bytes <= 0 or bytes_reserved <= 0:
107125
return
108126

109-
async with self._lock:
127+
async with self._condition:
110128
self._active_bytes = max(0, self._active_bytes - bytes_reserved)
111129
MEMORY_RESERVED_BYTES.set(self._active_bytes)
130+
self._condition.notify_all()
112131

113132
# Run garbage collection and release memory to OS
114133
gc.collect(0)
@@ -124,7 +143,7 @@ async def release(self, bytes_reserved: int) -> None:
124143

125144

126145
# Default instance used by module-level functions
127-
_default = ConcurrencyLimiter(limit_mb=int(os.environ.get("S3PROXY_MEMORY_LIMIT_MB", "128")))
146+
_default = ConcurrencyLimiter(limit_mb=int(os.environ.get("S3PROXY_MEMORY_LIMIT_MB", "64")))
128147

129148

130149
def estimate_memory_footprint(method: str, content_length: int) -> int:

s3proxy/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ class Settings(BaseSettings):
2929
# This is the ONLY setting needed for OOM protection.
3030
# Use nginx proxy-body-size at ingress to reject oversized requests before they reach Python.
3131
memory_limit_mb: int = Field(
32-
default=128,
32+
default=64,
3333
description="Memory budget for concurrent requests in MB. 0=unlimited. "
3434
"Small files use content_length*2, large files use 8MB (streaming). "
35-
"Excess requests get 503.",
35+
"Excess requests wait up to 30s (backpressure), then get 503.",
3636
)
3737

3838
# Redis settings (for distributed state in HA deployments)

0 commit comments

Comments
 (0)