def process_history(history):
    """Normalize a revision history into a new list sorted oldest-first.

    Args:
        history: Either a dict whose values are revision dicts, or a list of
            revision dicts. Any other value (including ``None``) is treated
            as "no history".

    Returns:
        A new list of revision dicts. Each ``"date"`` string matching
        ``"%d %b %Y %H:%M"`` is replaced in place on its revision dict by the
        parsed ``datetime``. Revisions whose date is missing or could not be
        parsed keep their original value and sort before all dated
        revisions. Entries that are not dicts are dropped.
    """
    if isinstance(history, dict):
        candidates = history.values()
    elif isinstance(history, list):
        candidates = history
    else:
        return []

    # Entries that are not dicts cannot carry a "date"; leaving them in would
    # make the sort key below fail, so they are filtered out up front.
    revisions = [entry for entry in candidates if isinstance(entry, dict)]

    for revision in revisions:
        raw_date = revision.get("date")
        if isinstance(raw_date, str):
            try:
                revision["date"] = datetime.strptime(raw_date, "%d %b %Y %H:%M")
            except ValueError:
                # Best effort: keep the original string if it does not match
                # the expected format.
                pass

    def _sort_key(revision):
        # Only real datetimes take part in ordering. Leftover strings or
        # missing dates fall back to datetime.min so that comparing a str
        # against a datetime can never raise TypeError.
        date = revision.get("date")
        return date if isinstance(date, datetime) else datetime.min

    # sorted() returns a new list, so a caller-supplied list is not reordered.
    return sorted(revisions, key=_sort_key)
- tale["history"] = process_history(tale["history"]) + tale["history"] = process_history(tale.get("history")) if len(tale["history"]) > 0: tale["created_at"] = tale["history"][0]["date"] diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py index 8528b3a..e29d562 100644 --- a/scp_crawler/spiders/scp.py +++ b/scp_crawler/spiders/scp.py @@ -1,3 +1,4 @@ +import json import re import sys from pprint import pprint @@ -28,13 +29,67 @@ def parse_history(self, response, item, history_page=1): self.logger.info(f"Reviewing Page {item['page_id']} history") page_id = item["page_id"] - changes = item["history"] if "history" in item else {} + changes = item.get("history", {}) + item["history"] = changes # Ensure history key always exists + try: + response_text = getattr(response, "text", "") or "" + if not response_text.strip(): + self.logger.error( + "Empty response when fetching history for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + history = response.json() - soup = BeautifulSoup(history["body"], "lxml") + if not isinstance(history, dict) or "body" not in history: + self.logger.error( + "Missing 'body' in history lookup for %s (status=%s, page=%s). 
Keys=%s", + item.get("url"), + getattr(response, "status", None), + history_page, + list(history.keys()) if isinstance(history, dict) else type(history), + ) + return self.get_page_source_request(page_id, item) + + body = history.get("body") + if not body: + self.logger.error( + "Empty 'body' in history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + + soup = BeautifulSoup(body, "lxml") + if soup.table is None: + self.logger.error( + "Missing in history HTML for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) rows = soup.table.find_all("tr") - except: - self.logger.exception(f"Unable to parse history lookup. {item['url']}") + + except (json.JSONDecodeError, ValueError): + self.logger.error( + "JSON decode error in history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + except Exception: + self.logger.exception( + "Unable to parse history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) return self.get_page_source_request(page_id, item) for row in rows: try: @@ -62,11 +117,13 @@ def parse_history(self, response, item, history_page=1): self.logger.exception("Could not process row.") self.logger.error(row) - item["history"] = changes - # The "0" change is the first revision, and the last one that shows up. - # If we have it then we're done. - if "0" in changes: - return self.get_page_source_request(page_id, item) + # Update item history after processing all rows + item["history"] = changes + + # The "0" change is the first revision, and the last one that shows up. + # If we have it then we're done. 
+ if "0" in changes: + return self.get_page_source_request(page_id, item) next_page = history_page + 1 if next_page > MAX_HISTORY_PAGES: