Skip to content

Commit 6ae5b80

Browse files
committed
feat: add automatic S3 upload for HTML reports
1 parent afa292f commit 6ae5b80

3 files changed

Lines changed: 80 additions & 1 deletion

File tree

src/ps_helper/extensions/metrics_extension.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import math
55
import datetime
66
from collections import defaultdict
7+
from ..scripts.generate_report import generate_html_report
8+
from ..scripts.utils import upload_html_to_s3
79

810
from scrapy import signals
911
from pydantic import ValidationError
@@ -202,3 +204,30 @@ def spider_closed(self, spider, reason):
202204

203205
spider.logger.info(f"Saved metrics: {file_path}")
204206
print(json.dumps(metrics, indent=2, ensure_ascii=False))
207+
208+
if os.getenv('PRODUCTION') == 'True':
209+
try:
210+
_, html_content = generate_html_report(file_path)
211+
212+
url = self._upload_report_to_s3(html_content, spider)
213+
spider.logger.info(f"Report uploaded to S3: {url}")
214+
215+
except Exception as e:
216+
spider.logger.error(f"Failed to generate/upload HTML report: {e}")
217+
218+
def _upload_report_to_s3(self, html_content, spider):
    """Upload an HTML report to S3 directly from memory.

    Args:
        html_content: Rendered HTML report as a string.
        spider: Scrapy spider instance; its ``name`` is embedded in the
            S3 object key.

    Returns:
        str: URL of the uploaded report (a presigned URL, since the
        object is uploaded privately with ``publico=False``).

    Raises:
        ValueError: If the ``S3_BUCKET_NAME`` environment variable is
            not set.
    """
    bucket_name = os.getenv('S3_BUCKET_NAME')
    if not bucket_name:
        # Fail fast with a clear message instead of passing Bucket=None
        # into boto3, which would surface as an opaque client error.
        raise ValueError("S3_BUCKET_NAME environment variable is not set")

    # NOTE(review): naive local time — confirm whether UTC is expected
    # for report key timestamps.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    key = f"scrapy-reports/{spider.name}/{timestamp}-report.html"

    return upload_html_to_s3(
        html_str=html_content,
        bucket=bucket_name,
        key=key,
        publico=False,              # keep the report private
        expira_seg=3 * 24 * 3600,   # presigned URL valid for 3 days
    )

src/ps_helper/scripts/generate_report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -926,4 +926,4 @@ def _generate_retry_reasons_html(data):
926926
with open(output_path, "w", encoding="utf-8") as f:
927927
f.write(html_template)
928928

929-
return output_path
929+
return output_path, html_template

src/ps_helper/scripts/utils.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import uuid
2+
import boto3
3+
4+
5+
def upload_html_to_s3(
    html_str: str,
    bucket: str,
    key=None,
    publico: bool = False,
    expira_seg: int = 30 * 24 * 3600,
):
    """Upload an HTML document to S3 from memory and return its URL.

    Args:
        html_str: HTML content as a string.
        bucket: Target S3 bucket name.
        key: S3 object key. Optional; when omitted, a UUID-based key
            under ``reports/`` is generated.
        publico: If True, upload with a ``public-read`` ACL and return
            the public URL; otherwise return a presigned URL.
        expira_seg: Expiration of the presigned URL, in seconds, when
            the object is private (default 30 days).

    Returns:
        str: Public or presigned URL of the uploaded object.
    """
    s3 = boto3.client("s3")
    key = key or f"reports/{uuid.uuid4()}.html"

    put_args = {
        "Bucket": bucket,
        "Key": key,
        "Body": html_str.encode("utf-8"),
        "ContentType": "text/html; charset=utf-8",
    }
    if publico:
        # NOTE(review): public-read ACLs only work if the bucket allows
        # ACLs (i.e. Block Public Access is not enabled) — confirm.
        put_args["ACL"] = "public-read"

    s3.put_object(**put_args)

    if publico:
        return f"https://{bucket}.s3.amazonaws.com/{key}"
    return s3.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": key},
        ExpiresIn=expira_seg,
    )

0 commit comments

Comments
 (0)