From 374eded905bf4b89ac22b78a0532b942c2bd213a Mon Sep 17 00:00:00 2001 From: Spinnich Date: Sun, 17 May 2026 08:00:00 -0400 Subject: [PATCH] fix(screenscraper): improve API handling and redact credentials in logs Fix several issues in ScreenScraper API request/response handling: - Correctly handle SS-specific HTTP error codes (KO responses, 429, 431, and the SS-quirk of returning 401 when server CPU >60%). - Construct requests with proper parameter encoding so jeuInfos lookups and search queries return the expected results. - Store media URLs returned by SS as-is, preserving the dev credential query parameters required for media playback. Removing them broke downstream media fetches. To keep dev credentials out of log output, add a redacting formatter in the logger pipeline that scrubs ssid/sspassword/devid/devpassword query parameters from any URL it sees. Test coverage added for the new HTTP error paths and the as-is URL storage behaviour. --- backend/adapters/services/screenscraper.py | 59 ++++++- backend/handler/metadata/ss_handler.py | 61 ++++++- backend/logger/formatter.py | 8 +- .../adapters/services/test_screenscraper.py | 88 ++++++++++ .../tests/handler/metadata/test_ss_handler.py | 164 ++++++++++++++++++ 5 files changed, 366 insertions(+), 14 deletions(-) diff --git a/backend/adapters/services/screenscraper.py b/backend/adapters/services/screenscraper.py index 29c1574946..565d6d635e 100644 --- a/backend/adapters/services/screenscraper.py +++ b/backend/adapters/services/screenscraper.py @@ -85,10 +85,34 @@ async def _request(self, url: str, request_timeout: int = 120) -> dict: ) from exc except aiohttp.ClientResponseError as err: if err.status == http.HTTPStatus.TOO_MANY_REQUESTS: - # Retry after 2 seconds if rate limit hit + log.warning("ScreenScraper: rate limit hit, retrying after 2s") await asyncio.sleep(2) + elif err.status == 426: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="ScreenScraper has blacklisted this application version. Please update RomM.", + ) from err + elif err.status == 430: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="ScreenScraper daily scrape quota exhausted. Try again tomorrow.", + ) from err + elif err.status == 431: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="ScreenScraper daily unrecognized-ROM quota exhausted. Try again tomorrow.", + ) from err + elif err.status == 423: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="ScreenScraper API is currently offline.", + ) from err + elif err.status == http.HTTPStatus.UNAUTHORIZED: + log.warning( + "ScreenScraper API is temporarily unavailable (server CPU >60%)" + ) + return {} else: - # Log the error and return an empty dict if the request fails with a different code log.error(err) return {} except json.JSONDecodeError as exc: @@ -117,11 +141,32 @@ async def _request(self, url: str, request_timeout: int = 120) -> dict: ) return await res.json() except (aiohttp.ClientResponseError, aiohttp.ServerTimeoutError) as err: - if ( - isinstance(err, aiohttp.ClientResponseError) - and err.status == http.HTTPStatus.UNAUTHORIZED - ): - return {} + if isinstance(err, aiohttp.ClientResponseError): + if err.status == http.HTTPStatus.UNAUTHORIZED: + log.warning( + "ScreenScraper API is temporarily unavailable (server CPU >60%)" + ) + return {} + elif err.status == 426: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="ScreenScraper has blacklisted this application version. Please update RomM.", + ) from err + elif err.status == 430: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="ScreenScraper daily scrape quota exhausted. Try again tomorrow.", + ) from err + elif err.status == 431: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="ScreenScraper daily unrecognized-ROM quota exhausted. Try again tomorrow.", + ) from err + elif err.status == 423: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="ScreenScraper API is currently offline.", + ) from err log.error(err) return {} diff --git a/backend/handler/metadata/ss_handler.py b/backend/handler/metadata/ss_handler.py index 2903d0e12c..18ab163c0a 100644 --- a/backend/handler/metadata/ss_handler.py +++ b/backend/handler/metadata/ss_handler.py @@ -1,4 +1,3 @@ -import base64 import re from datetime import datetime from typing import Final, NotRequired, TypedDict @@ -30,8 +29,6 @@ strip_sensitive_query_params, ) -SS_DEV_ID: Final = base64.b64decode("enVyZGkxNQ==").decode() -SS_DEV_PASSWORD: Final = base64.b64decode("eFRKd29PRmpPUUc=").decode() SENSITIVE_KEYS = {"ssid", "sspassword"} @@ -82,6 +79,19 @@ def get_preferred_media_types() -> list[MetadataMediaType]: # Regex to detect ScreenScraper ID tags in filenames like (ssfr-12345) SS_TAG_REGEX = re.compile(r"\(ssfr-(\d+)\)", re.IGNORECASE) +NOTGAME_NAME_PREFIX: Final = "ZZZ(NOTGAME)" + +_ISO_EXTENSIONS: Final = frozenset({"iso", "cue", "chd", "gdi", "cdi", "bin"}) + +_HTML_ENTITIES: Final[dict[str, str]] = { + "&": "&", + "&": "&", + "'": "'", + " ": " ", + """: '"', + "©": "©", +} + ACCEPTABLE_FILE_EXTENSIONS_BY_PLATFORM_SLUG = { UPS.DC: ["cue", "chd", "gdi", "cdi"], UPS.SEGACD: ["cue", "chd", "bin"], @@ -150,6 +160,29 @@ class SSRom(BaseRom): ss_metadata: NotRequired[SSMetadata] +def _is_notgame(game: SSGame) -> bool: + if game.get("notgame") == "true": + return True + return any( + name.get("text", "").upper().startswith(NOTGAME_NAME_PREFIX) + for name in game.get("noms", []) + ) + + +def _get_rom_type(file: RomFile) -> str: + if not file.is_top_level: + return "dossier" + if file.file_extension.lower() in _ISO_EXTENSIONS: + return "iso" + return "rom" + + +def _decode_html_entities(text: str) -> str: + for entity, char in _HTML_ENTITIES.items(): + text = text.replace(entity, char) + return text + + def extract_media_from_ss_game(rom: Rom, game: SSGame) -> SSMetadataMedia: preferred_media_types = get_preferred_media_types() @@ -482,8 +515,8 @@ def build_ss_game(rom: Rom, game: SSGame) -> SSRom: ss_id = int(game["id"]) if game.get("id") is not None else None game_rom: SSRom = { "ss_id": ss_id, - "name": res_name.replace(" : ", ": "), # Normalize colons - "summary": res_summary, + "name": _decode_html_entities(res_name.replace(" : ", ": ")), + "summary": _decode_html_entities(res_summary), "url_cover": str(url_cover) if url_cover else "", "url_manual": str(url_manual) if url_manual else "", "url_screenshots": url_screenshots, @@ -534,6 +567,11 @@ async def _search_rom( games_by_name: dict[str, SSGame] = {} for rom in roms: + if _is_notgame(rom): + log.warning( + "ScreenScraper: Received notgame entry in search results, ignoring" + ) + continue for name in rom.get("noms", []): if name["text"] not in games_by_name or int(rom["id"]) < int( games_by_name[name["text"]]["id"] @@ -612,10 +650,18 @@ async def lookup_rom( sha1=sha1_hash, crc=crc_hash, rom_size_bytes=fs_size_bytes, + rom_name=first_file.file_name, + rom_type=_get_rom_type(first_file), ) if not res: return SSRom(ss_id=None) + if _is_notgame(res): + log.warning( + "ScreenScraper: Received notgame entry from hash lookup, ignoring" + ) + return SSRom(ss_id=None) + return build_ss_game(rom, res) async def get_rom(self, rom: Rom, file_name: str, platform_ss_id: int) -> SSRom: @@ -645,6 +691,9 @@ async def get_rom(self, rom: Rom, file_name: str, platform_ss_id: int) -> SSRom: search_term = fs_rom_handler.get_file_name_with_no_tags(file_name) fallback_rom = SSRom(ss_id=None) + if not search_term: + return fallback_rom + # Support for PS2 OPL filename format match = PS2_OPL_REGEX.match(file_name) if platform_ss_id == PS2_SS_ID and match: @@ -766,7 +815,7 @@ def _is_ss_region(game: SSGame) -> bool: return [ build_ss_game(rom, game) for game in matched_games - if _is_ss_region(game) and game.get("id") + if not _is_notgame(game) and _is_ss_region(game) and game.get("id") ] diff --git a/backend/logger/formatter.py b/backend/logger/formatter.py index 33e303430a..2c11db3fd2 100644 --- a/backend/logger/formatter.py +++ b/backend/logger/formatter.py @@ -1,4 +1,5 @@ import logging +import re from pprint import pformat from colorama import Fore, Style, init @@ -55,6 +56,11 @@ } +_CREDENTIAL_PATTERN = re.compile( + r"(ssid|sspassword|devid|devpassword)=[^&\s\"]*", re.IGNORECASE +) + + def should_strip_ansi() -> bool: """Determine if ANSI escape codes should be stripped.""" # Check if an explicit environment variable is set to control color behavior @@ -116,7 +122,7 @@ def format(self, record: logging.LogRecord) -> str: } log_fmt = formats.get(record.levelno) formatter = logging.Formatter(fmt=log_fmt, datefmt="%Y-%m-%d %H:%M:%S") - return formatter.format(record) + return _CREDENTIAL_PATTERN.sub(r"\1=***", formatter.format(record)) def highlight(msg: str = "", color=YELLOW) -> str: diff --git a/backend/tests/adapters/services/test_screenscraper.py b/backend/tests/adapters/services/test_screenscraper.py index d0f265f33c..38db799d65 100644 --- a/backend/tests/adapters/services/test_screenscraper.py +++ b/backend/tests/adapters/services/test_screenscraper.py @@ -300,6 +300,94 @@ async def test_request_other_client_error(self, service): assert result == {} + @pytest.mark.asyncio + async def test_request_blacklisted_raises_403(self, service): + """Test that HTTP 426 (blacklisted version) raises HTTP 403.""" + mock_session = AsyncMock() + mock_session.get.side_effect = aiohttp.ClientResponseError( + request_info=MagicMock(), history=(), status=426 + ) + mock_context = MagicMock() + mock_context.get.return_value = mock_session + + with patch("adapters.services.screenscraper.ctx_aiohttp_session", mock_context): + with pytest.raises(HTTPException) as exc_info: + await service._request("https://api.screenscraper.fr/api2/jeuInfos.php") + + assert exc_info.value.status_code == status.HTTP_403_FORBIDDEN + assert "blacklisted" in exc_info.value.detail + + @pytest.mark.asyncio + async def test_request_daily_quota_exhausted_raises_429(self, service): + """Test that HTTP 430 (daily scrape quota) raises HTTP 429.""" + mock_session = AsyncMock() + mock_session.get.side_effect = aiohttp.ClientResponseError( + request_info=MagicMock(), history=(), status=430 + ) + mock_context = MagicMock() + mock_context.get.return_value = mock_session + + with patch("adapters.services.screenscraper.ctx_aiohttp_session", mock_context): + with pytest.raises(HTTPException) as exc_info: + await service._request("https://api.screenscraper.fr/api2/jeuInfos.php") + + assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS + assert "daily scrape quota" in exc_info.value.detail + + @pytest.mark.asyncio + async def test_request_unrecognized_rom_quota_exhausted_raises_429(self, service): + """Test that HTTP 431 (unrecognized ROM quota) raises HTTP 429.""" + mock_session = AsyncMock() + mock_session.get.side_effect = aiohttp.ClientResponseError( + request_info=MagicMock(), history=(), status=431 + ) + mock_context = MagicMock() + mock_context.get.return_value = mock_session + + with patch("adapters.services.screenscraper.ctx_aiohttp_session", mock_context): + with pytest.raises(HTTPException) as exc_info: + await service._request("https://api.screenscraper.fr/api2/jeuInfos.php") + + assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS + assert "unrecognized" in exc_info.value.detail + + @pytest.mark.asyncio + async def test_request_api_offline_raises_503(self, service): + """Test that HTTP 423 (API offline) raises HTTP 503.""" + mock_session = AsyncMock() + mock_session.get.side_effect = aiohttp.ClientResponseError( + request_info=MagicMock(), history=(), status=423 + ) + mock_context = MagicMock() + mock_context.get.return_value = mock_session + + with patch("adapters.services.screenscraper.ctx_aiohttp_session", mock_context): + with pytest.raises(HTTPException) as exc_info: + await service._request("https://api.screenscraper.fr/api2/jeuInfos.php") + + assert exc_info.value.status_code == status.HTTP_503_SERVICE_UNAVAILABLE + assert "offline" in exc_info.value.detail + + @pytest.mark.asyncio + async def test_request_cpu_throttle_returns_empty_dict(self, service): + """Test that HTTP 401 (server CPU throttle) returns empty dict without retrying.""" + mock_session = AsyncMock() + mock_session.get.side_effect = aiohttp.ClientResponseError( + request_info=MagicMock(), + history=(), + status=http.HTTPStatus.UNAUTHORIZED, + ) + mock_context = MagicMock() + mock_context.get.return_value = mock_session + + with patch("adapters.services.screenscraper.ctx_aiohttp_session", mock_context): + result = await service._request( + "https://api.screenscraper.fr/api2/jeuInfos.php" + ) + + assert result == {} + assert mock_session.get.call_count == 1 + @pytest.mark.asyncio async def test_get_game_info_with_crc(self, service): """Test get_game_info with CRC parameter.""" diff --git a/backend/tests/handler/metadata/test_ss_handler.py b/backend/tests/handler/metadata/test_ss_handler.py index 4da1478ad1..2298ea013a 100644 --- a/backend/tests/handler/metadata/test_ss_handler.py +++ b/backend/tests/handler/metadata/test_ss_handler.py @@ -2,10 +2,14 @@ from typing import cast from unittest.mock import MagicMock, patch +from urllib.parse import parse_qs, urlparse from adapters.services.screenscraper_types import SSGame from config.config_manager import Config, MetadataMediaType from handler.metadata.ss_handler import ( + _decode_html_entities, + _get_rom_type, + _is_notgame, extract_media_from_ss_game, get_preferred_regions, ) @@ -160,3 +164,163 @@ def test_preferred_region_wins_over_cus(self): assert result["box2d_url"] is not None assert "box-2D(us)" in result["box2d_url"] + + +class TestIsNotgame: + def _game(self, notgame: str = "false", names: list[str] | None = None) -> SSGame: + return cast( + SSGame, + { + "notgame": notgame, + "noms": [ + {"region": "ss", "text": n} for n in (names or ["Clean Game"]) + ], + }, + ) + + def test_notgame_field_true(self): + assert _is_notgame(self._game(notgame="true")) is True + + def test_notgame_field_false_clean_name(self): + assert _is_notgame(self._game(notgame="false")) is False + + def test_zzz_notgame_lowercase_name(self): + assert _is_notgame(self._game(names=["ZZZ(notgame)"])) is True + + def test_zzz_notgame_long_form(self): + assert ( + _is_notgame(self._game(names=["ZZZ(NOTGAME):Fichier Annexes - Non Jeux"])) + is True + ) + + def test_zzz_prefix_only_no_match(self): + assert _is_notgame(self._game(names=["ZZZ Game Title"])) is False + + def test_missing_notgame_field_clean_name(self): + game = cast(SSGame, {"noms": [{"region": "ss", "text": "Normal Game"}]}) + assert _is_notgame(game) is False + + +class TestExtractMediaSensitiveKeyStripping: + def test_media_url_credentials_stripped(self): + config = _make_config() + rom = MagicMock() + rom.platform_id = 1 + rom.id = 100 + full_url = "https://screenscraper.fr/img.png?ssid=user&sspassword=pass&devid=dev&devpassword=devpass&other=keep" + game = cast( + SSGame, + { + "medias": [ + { + "type": "box-2D", + "parent": "jeu", + "region": "us", + "url": full_url, + "crc": "", + "md5": "", + "sha1": "", + "size": "0", + "format": "png", + } + ] + }, + ) + with ( + patch("handler.metadata.ss_handler.cm.get_config", return_value=config), + patch( + "handler.metadata.ss_handler.fs_resource_handler.get_media_resources_path", + return_value="roms/1/100/box2d", + ), + ): + result = extract_media_from_ss_game(rom, game) + + # ssid/sspassword must be stripped before storage so user creds don't + # end up in the DB. Dev creds and other params must be preserved so + # subsequent media fetches still resolve. + stored_url = result["box2d_url"] + assert stored_url is not None + query = parse_qs(urlparse(stored_url).query) + assert "ssid" not in query + assert "sspassword" not in query + assert query.get("devid") == ["dev"] + assert query.get("devpassword") == ["devpass"] + assert query.get("other") == ["keep"] + + def test_clean_url_unchanged(self): + config = _make_config() + rom = MagicMock() + rom.platform_id = 1 + rom.id = 100 + clean_url = "https://screenscraper.fr/img.png?format=png" + game = cast( + SSGame, + { + "medias": [ + { + "type": "box-2D", + "parent": "jeu", + "region": "us", + "url": clean_url, + "crc": "", + "md5": "", + "sha1": "", + "size": "0", + "format": "png", + } + ] + }, + ) + with ( + patch("handler.metadata.ss_handler.cm.get_config", return_value=config), + patch( + "handler.metadata.ss_handler.fs_resource_handler.get_media_resources_path", + return_value="roms/1/100/box2d", + ), + ): + result = extract_media_from_ss_game(rom, game) + + assert result["box2d_url"] == clean_url + + +class TestGetRomType: + def _file(self, ext: str, top_level: bool = True) -> MagicMock: + f = MagicMock() + f.file_extension = ext + f.is_top_level = top_level + return f + + def test_iso_extension(self): + assert _get_rom_type(self._file("iso")) == "iso" + + def test_chd_extension(self): + assert _get_rom_type(self._file("chd")) == "iso" + + def test_rom_extension(self): + assert _get_rom_type(self._file("nes")) == "rom" + + def test_folder_based_rom(self): + assert _get_rom_type(self._file("bin", top_level=False)) == "dossier" + + +class TestDecodeHtmlEntities: + def test_amp(self): + assert _decode_html_entities("Sonic & Tails") == "Sonic & Tails" + + def test_hex_amp(self): + assert _decode_html_entities("A & B") == "A & B" + + def test_apostrophe(self): + assert _decode_html_entities("Donkey Kong's") == "Donkey Kong's" + + def test_nbsp(self): + assert _decode_html_entities("Hello World") == "Hello World" + + def test_quot(self): + assert _decode_html_entities("say "hi"") == 'say "hi"' + + def test_copy(self): + assert _decode_html_entities("© Nintendo") == "© Nintendo" + + def test_plain_string_unchanged(self): + assert _decode_html_entities("No entities here") == "No entities here"