Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion web_programming/get_imdbtop.py.DISABLED
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
from urllib.parse import urlparse

import bs4
import requests

ALLOWED_HOSTS = {"www.imdb.com"}


def _validate_url(url: str) -> str:
"""Validate a URL against an allowlist of trusted hosts to prevent SSRF attacks.

Args:
url: The URL to validate.

Returns:
The original URL if valid.

Raises:
ValueError: If the URL scheme is not HTTPS or the host is not in the allowlist.
"""
parsed = urlparse(url)
if parsed.scheme != "https" or parsed.hostname not in ALLOWED_HOSTS:
raise ValueError(
f"URL '{url}' is not allowed. Only HTTPS requests to "
f"{ALLOWED_HOSTS} are permitted."
)
return url


def get_movie_data_from_soup(soup: bs4.element.ResultSet) -> dict[str, str]:
return {
Expand Down Expand Up @@ -35,7 +60,9 @@ def get_imdb_top_movies(num_movies: int = 5) -> tuple:
"https://www.imdb.com/search/title?title_type="
f"feature&sort=num_votes,desc&count={num_movies}"
)
source = bs4.BeautifulSoup(requests.get(base_url).content, "html.parser")
source = bs4.BeautifulSoup(
requests.get(_validate_url(base_url)).content, "html.parser"
)
return tuple(
get_movie_data_from_soup(movie)
for movie in source.find_all("div", class_="lister-item mode-advanced")
Expand Down
Loading