Skip to content

Commit cbf1734

Browse files
authored
Merge pull request #111 from vcon-dev/laughing-kepler
Add S3 storage option to diet link with presigned URLs
2 parents 04ff18d + 8780947 commit cbf1734

3 files changed

Lines changed: 414 additions & 10 deletions

File tree

server/links/diet/README.md

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ The Diet link is a specialized plugin that helps reduce the size and content of
55
## Features
66

77
- Selective removal of dialog body content
8-
- Optional media redirection to external storage
8+
- Optional media redirection to external storage (HTTP endpoint or S3)
9+
- S3 storage with presigned URL generation for secure access
910
- Removal of analysis data
1011
- Filtering of attachments by MIME type
1112
- Removal of system prompts to prevent LLM instruction injection
@@ -20,6 +21,13 @@ default_options = {
2021
"remove_analysis": False, # Remove all analysis data
2122
"remove_attachment_types": [], # List of attachment types to remove (e.g., ["image/jpeg", "audio/mp3"])
2223
"remove_system_prompts": False, # Remove system_prompt keys to prevent LLM instruction insertion
24+
# S3 storage options for dialog bodies
25+
"s3_bucket": "", # S3 bucket name for storing dialog bodies
26+
"s3_path": "", # Optional path prefix within the bucket
27+
"aws_access_key_id": "", # AWS access key ID
28+
"aws_secret_access_key": "", # AWS secret access key
29+
"aws_region": "us-east-1", # AWS region (default: us-east-1)
30+
"presigned_url_expiration": None, # Presigned URL expiration in seconds (None = default 1 hour)
2331
}
2432
```
2533

@@ -31,6 +39,15 @@ default_options = {
3139
- `remove_attachment_types`: List of MIME types to remove from attachments
3240
- `remove_system_prompts`: Whether to remove system_prompt keys to prevent LLM instruction injection
3341

42+
### S3 Storage Options
43+
44+
- `s3_bucket`: The S3 bucket name where dialog bodies will be stored
45+
- `s3_path`: Optional path prefix within the bucket (e.g., "dialogs/processed")
46+
- `aws_access_key_id`: AWS access key ID for authentication
47+
- `aws_secret_access_key`: AWS secret access key for authentication
48+
- `aws_region`: AWS region where the bucket is located (default: "us-east-1")
49+
- `presigned_url_expiration`: Expiration time in seconds for presigned URLs (optional, defaults to 3600 seconds / 1 hour)
50+
3451
## Usage
3552

3653
The link processes vCons by:
@@ -42,9 +59,39 @@ The link processes vCons by:
4259
- Removing system prompts if specified
4360
3. Storing the modified vCon back in Redis
4461

45-
## Media Redirection
62+
## Media Storage Options
63+
64+
The diet link supports two methods for storing dialog bodies externally:
65+
66+
### S3 Storage (Recommended)
67+
68+
When `s3_bucket` is configured, the link will:
69+
1. Upload dialog body content to the specified S3 bucket
70+
2. Generate a presigned URL for secure access
71+
3. Replace the body content with the presigned URL
72+
4. Set the body_type to "url"
73+
5. If the upload fails, the error is logged and the body content is removed (replaced with an empty string); the original content is not kept in the vCon
74+
75+
**S3 takes precedence over HTTP endpoint** - if both `s3_bucket` and `post_media_to_url` are configured, S3 will be used.
76+
77+
Example S3 configuration:
78+
```python
79+
{
80+
"remove_dialog_body": True,
81+
"s3_bucket": "my-vcon-storage",
82+
"s3_path": "dialogs/archived",
83+
"aws_access_key_id": "AKIAXXXXXXXX",
84+
"aws_secret_access_key": "xxxxxxxxxxxxx",
85+
"aws_region": "us-west-2",
86+
"presigned_url_expiration": 86400, # 24 hours
87+
}
88+
```
89+
90+
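The S3 key structure is: `{s3_path}/{vcon_uuid}/{dialog_id}_{unique_id}.txt`. The `{s3_path}/` prefix is omitted when `s3_path` is empty, the `{dialog_id}_` part is omitted when the dialog has no `id`, and `{unique_id}` is a random UUID generated for each upload.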
91+
92+
### HTTP Endpoint Storage
4693

47-
When `post_media_to_url` is configured, the link will:
94+
When `post_media_to_url` is configured (and `s3_bucket` is not), the link will:
4895
1. Post the media content to the specified URL
4996
2. Replace the body content with the URL to the stored content
5097
3. Set the body_type to "url"
@@ -59,12 +106,16 @@ When `post_media_to_url` is configured, the link will:
59106
## Dependencies
60107

61108
- Redis for vCon storage
62-
- Requests library for media redirection
109+
- Requests library for HTTP media redirection
110+
- boto3 library for S3 storage
63111
- Custom utilities:
64112
- logging_utils
65113

66114
## Requirements
67115

68116
- Redis connection must be configured
69117
- Appropriate permissions for vCon access and storage
70-
- If using media redirection, a valid endpoint URL must be provided
118+
- If using HTTP media redirection, a valid endpoint URL must be provided
119+
- If using S3 storage:
120+
- Valid AWS credentials with write access to the specified bucket
121+
- The bucket must exist and be accessible

server/links/diet/__init__.py

Lines changed: 126 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,34 @@
22
from lib.logging_utils import init_logger
33
import json
44
import requests
5+
import uuid
6+
import boto3
7+
from botocore.exceptions import ClientError
58
from typing import Dict, List, Any, Optional
69

710
logger = init_logger(__name__)
811
logger.info("MDO THIS SHOULD PRINT")
912

13+
_REDACTED = "[REDACTED]"
14+
15+
16+
def _redact_option_value(key: str, value: Any) -> Any:
17+
"""
18+
Redact sensitive option values before logging.
19+
20+
This prevents leaking secrets (for example AWS credentials) into logs.
21+
"""
22+
key_l = (key or "").lower()
23+
if (
24+
key_l == "aws_secret_access_key"
25+
or "secret" in key_l
26+
or "password" in key_l
27+
or "token" in key_l
28+
or key_l.endswith("_secret")
29+
):
30+
return _REDACTED
31+
return value
32+
1033

1134
# Default options that control which elements to remove
1235
default_options = {
@@ -15,16 +38,101 @@
1538
"remove_analysis": False, # Remove all analysis data
1639
"remove_attachment_types": [], # List of attachment types to remove (e.g., ["image/jpeg", "audio/mp3"])
1740
"remove_system_prompts": False, # Remove system_prompt keys to prevent LLM instruction insertion
41+
# S3 storage options for dialog bodies
42+
"s3_bucket": "", # S3 bucket name for storing dialog bodies
43+
"s3_path": "", # Optional path prefix within the bucket
44+
"aws_access_key_id": "", # AWS access key ID
45+
"aws_secret_access_key": "", # AWS secret access key
46+
"aws_region": "us-east-1", # AWS region (default: us-east-1)
47+
"presigned_url_expiration": None, # Presigned URL expiration in seconds (None = no expiration/default 1 hour)
1848
}
1949

50+
51+
def _get_s3_client(options: Dict[str, Any]):
    """
    Create and return a boto3 S3 client configured from the diet options.

    Args:
        options: Link options. Reads ``aws_access_key_id``,
            ``aws_secret_access_key`` and ``aws_region``.

    Returns:
        The client object returned by ``boto3.client("s3", ...)``.
    """
    # The credential options default to empty strings. boto3 treats any
    # non-None value as an explicit credential, so unset values are mapped
    # to None to let boto3 resolve credentials through its default chain
    # (environment variables, shared config, instance/role credentials).
    return boto3.client(
        "s3",
        aws_access_key_id=options.get("aws_access_key_id") or None,
        aws_secret_access_key=options.get("aws_secret_access_key") or None,
        # An empty region falls back to the documented default.
        region_name=options.get("aws_region") or "us-east-1",
    )
59+
60+
61+
def _upload_to_s3_and_get_presigned_url(
    content: str,
    vcon_uuid: str,
    dialog_id: str,
    options: Dict[str, Any]
) -> Optional[str]:
    """
    Upload dialog body content to S3 and return a presigned URL.

    The object key is ``[<s3_path>/]<vcon_uuid>/[<dialog_id>_]<uuid4>.txt``.
    A fresh UUID is generated on every call, so each upload gets its own key.

    Args:
        content: The dialog body content to upload. ``str`` values are
            encoded as UTF-8; any other value is passed to S3 unchanged.
        vcon_uuid: The vCon UUID (used as a directory level in the key)
        dialog_id: The dialog ID (omitted from the key when empty)
        options: Configuration options including S3 credentials and bucket
            info. Reads ``s3_bucket``, ``s3_path`` and
            ``presigned_url_expiration`` (seconds; ``None`` means 3600).

    Returns:
        Presigned URL to access the uploaded content, or None if upload
        fails. Failures are logged and never raised to the caller.
    """
    try:
        s3 = _get_s3_client(options)

        # Generate a unique file name for this dialog body
        unique_id = str(uuid.uuid4())
        filename = f"{dialog_id}_{unique_id}.txt" if dialog_id else f"{unique_id}.txt"

        # Add vcon_uuid as a directory level
        key = f"{vcon_uuid}/{filename}"

        # Add optional path prefix. Surrounding slashes are stripped so that
        # values such as "dialogs/" or "/dialogs" do not create empty key
        # segments ("dialogs//..." or a leading "/").
        prefix = str(options.get("s3_path") or "").strip("/")
        if prefix:
            key = f"{prefix}/{key}"

        bucket = options["s3_bucket"]

        # Upload the content
        s3.put_object(
            Bucket=bucket,
            Key=key,
            Body=content.encode("utf-8") if isinstance(content, str) else content,
            ContentType="text/plain",
        )

        logger.info("Successfully uploaded dialog body to s3://%s/%s", bucket, key)

        # Generate presigned URL
        expiration = options.get("presigned_url_expiration")
        if expiration is None:
            # Default to 1 hour (3600 seconds) if not specified
            expiration = 3600

        presigned_url = s3.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket, "Key": key},
            ExpiresIn=expiration,
        )

        # Only the expiration is logged; the URL itself grants access.
        logger.info("Generated presigned URL with expiration %ss", expiration)
        return presigned_url

    except ClientError as e:
        # logger.exception keeps the traceback for diagnosing AWS failures.
        logger.exception("S3 client error uploading dialog body: %s", e)
        return None
    except Exception as e:
        # Best-effort by design: the caller treats None as "upload failed".
        logger.exception("Exception uploading dialog body to S3: %s", e)
        return None
126+
127+
20128
def run(vcon_uuid, link_name, opts=default_options):
21129
logger.info("Starting diet::run")
22130

23131
# Merge provided options with defaults
24132
options = {**default_options, **opts}
25133

26134
for key, value in options.items():
27-
logger.info(f"diet::{key}: {value}")
135+
logger.info("diet::%s: %s", key, _redact_option_value(key, value))
28136

29137
# Load vCon from Redis using JSON.GET
30138
vcon = redis.json().get(f"vcon:{vcon_uuid}")
@@ -41,12 +149,27 @@ def run(vcon_uuid, link_name, opts=default_options):
41149
logger.info("diet::got dialog")
42150
if options["remove_dialog_body"] and "body" in dialog:
43151
logger.info("diet::remove_dialog_body AND body")
44-
if options["post_media_to_url"] and dialog.get("body"):
152+
dialog_body = dialog.get("body")
153+
dialog_id = dialog.get("id", "")
154+
155+
# Check if S3 storage is configured
156+
if options.get("s3_bucket") and dialog_body:
157+
logger.info("diet::uploading to S3")
158+
presigned_url = _upload_to_s3_and_get_presigned_url(
159+
dialog_body, vcon_uuid, dialog_id, options
160+
)
161+
if presigned_url:
162+
dialog["body"] = presigned_url
163+
dialog["body_type"] = "url"
164+
else:
165+
logger.error("Failed to upload to S3, removing body")
166+
dialog["body"] = ""
167+
elif options["post_media_to_url"] and dialog_body:
45168
try:
46169
# Post the body content to the specified URL
47170
response = requests.post(
48171
options["post_media_to_url"],
49-
json={"content": dialog["body"], "vcon_uuid": vcon_uuid, "dialog_id": dialog.get("id", "")}
172+
json={"content": dialog_body, "vcon_uuid": vcon_uuid, "dialog_id": dialog_id}
50173
)
51174
if response.status_code == 200:
52175
# Replace body with the URL to the stored content

0 commit comments

Comments
 (0)