From 774a7e81479ba55ae0a49309cdfa16d61993184b Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 12:11:39 +0100 Subject: [PATCH 01/12] chore(.gitignore): add .DS_Store to ignore list --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2778f5e..6f06558 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ venv.bak/ *.json +.DS_Store \ No newline at end of file From aeb4640816782a191074b5332fdf641779bca98a Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 12:21:23 +0100 Subject: [PATCH 02/12] feat(spiders): add SCP Supplement spider and item - Introduce ScpSupplement class for item representation - Implement ScpSupplementSpider to crawl supplement pages - Update makefile to include supplement in data targets --- makefile | 9 +++++-- scp_crawler/items.py | 4 ++++ scp_crawler/spiders/scp.py | 49 +++++++++++++++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/makefile b/makefile index 0b49c2f..c777a7a 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ PYTHON_VENV = source .venv/bin/activate && install: .venv -data: scp scp_int goi +data: scp scp_int goi supplement fresh: clean data @@ -14,7 +14,7 @@ clean: python -m venv .venv $(PYTHON_VENV) python -m pip install . 
-crawl: scp scp_int goi +crawl: scp scp_int goi supplement scp: scp_crawl scp_postprocess @@ -37,6 +37,11 @@ goi: data/goi.json data/goi.json: .venv $(PYTHON_VENV) python -m scrapy crawl goi -o data/goi.json +supplement: data/scp_supplement.json + +data/scp_supplement.json: .venv + $(PYTHON_VENV) python -m scrapy crawl scp_supplement -o data/scp_supplement.json + scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales data/processed/goi: .venv diff --git a/scp_crawler/items.py b/scp_crawler/items.py index d4aa1c2..a199653 100644 --- a/scp_crawler/items.py +++ b/scp_crawler/items.py @@ -33,6 +33,10 @@ class ScpGoi(WikiPage): pass +class ScpSupplement(WikiPage): + pass + + class ScpTitle(scrapy.Item): title = scrapy.Field() scp = scrapy.Field() diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py index 8528b3a..58fee06 100644 --- a/scp_crawler/spiders/scp.py +++ b/scp_crawler/spiders/scp.py @@ -8,7 +8,7 @@ from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Rule -from ..items import ScpGoi, ScpHub, ScpItem, ScpTale, ScpTitle +from ..items import ScpGoi, ScpHub, ScpItem, ScpSupplement, ScpTale, ScpTitle DOMAIN = "scp-wiki.wikidot.com" INT_DOMAIN = "scp-int.wikidot.com" @@ -531,6 +531,53 @@ def parse_tale(self, response, original_link=None): return self.get_history_request(item["page_id"], 1, item) +class ScpSupplementSpider(CrawlSpider, WikiMixin): + name = "scp_supplement" + + start_urls = [ + f"http://{DOMAIN}/system:page-tags/tag/supplement", + ] + + allowed_domains = [DOMAIN] + + domain = DOMAIN + + rules = ( + Rule(LinkExtractor(allow=[re.escape("system:page-tags/tag/supplement")])), + Rule(LinkExtractor(allow=[r".*"]), callback="parse_supplement"), + ) + + def parse_supplement(self, response, original_link=None): + self.logger.debug("Reviewing Potential SCP Supplement page: %s", response.url) + content = self.get_content(response) + tags = self.get_tags(response) + + if not 
content or not tags: + return None + + redirect = self.follow_splash_redirects(response, tags, self.parse_supplement) + if redirect: + return redirect + + if "supplement" not in tags: + return None + + self.logger.info("Processing SCP Supplement page: %s", response.url) + content_soup = BeautifulSoup(content, "lxml") + + item = ScpSupplement() + item["title"] = self.get_title(response) + item["url"] = response.url + item["domain"] = self.domain + item["link"] = original_link if original_link else self.get_simple_link(response.url) + item["tags"] = tags + item["page_id"] = self.get_page_id(response) + item["rating"] = get_rating(response) + item["raw_content"] = str(clean_content_soup(content_soup)) + item["references"] = self.get_content_references(response) + return self.get_history_request(item["page_id"], 1, item) + + def get_rating(response): try: return int(response.css(".rate-points .number::text").get()) From b1883628a61fe4a3acdd2aebaaa99a0797e4ed44 Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 13:00:49 +0100 Subject: [PATCH 03/12] feat(postprocessing): add supplement processing command - Implement run_postproc_supplement to process SCP supplement data - Create necessary directories and handle data extraction - Store processed supplements in JSON format for further use --- makefile | 17 +++++++++---- scp_crawler/postprocessing.py | 48 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/makefile b/makefile index c777a7a..5041eb9 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ PYTHON_VENV = source .venv/bin/activate && install: .venv -data: scp scp_int goi supplement +data: scp scp_int goi fresh: clean data @@ -14,11 +14,11 @@ clean: python -m venv .venv $(PYTHON_VENV) python -m pip install . 
-crawl: scp scp_int goi supplement +crawl: scp scp_int goi scp: scp_crawl scp_postprocess -scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json +scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json data/scp_supplement.json data/scp_titles.json: .venv $(PYTHON_VENV) python -m scrapy crawl scp_titles -o data/scp_titles.json @@ -37,12 +37,19 @@ goi: data/goi.json data/goi.json: .venv $(PYTHON_VENV) python -m scrapy crawl goi -o data/goi.json -supplement: data/scp_supplement.json +supplement: supplement_crawl supplement_postprocess + +supplement_crawl: data/scp_supplement.json data/scp_supplement.json: .venv $(PYTHON_VENV) python -m scrapy crawl scp_supplement -o data/scp_supplement.json -scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales +supplement_postprocess: supplement_crawl data/processed/supplement + +data/processed/supplement: .venv + $(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-supplement + +scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales data/processed/supplement data/processed/goi: .venv $(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-goi diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py index ec3ee0d..7d284d8 100644 --- a/scp_crawler/postprocessing.py +++ b/scp_crawler/postprocessing.py @@ -273,5 +273,53 @@ def run_postproc_goi(): to_file(tales, processed_path / "index.json") +@cli.command() +def run_postproc_supplement(): + + processed_path = Path(cwd + "/data/processed/supplement") + os.makedirs(processed_path, exist_ok=True) + + print("Processing Supplement list.") + + supplement_list = from_file(cwd + "/data/scp_supplement.json") + supplements = {} + for supplement in tqdm(supplement_list, smoothing=0): + + supplement["images"] = get_images(supplement["raw_content"]) + supplement["hubs"] = 
get_hubs(supplement["link"]) + supplement["raw_source"] = get_wiki_source(supplement["page_id"], supplement["domain"]) + + # Convert history dict to list and sort by date. + supplement["history"] = process_history(supplement["history"]) + + if len(supplement["history"]) > 0: + supplement["created_at"] = supplement["history"][0]["date"] + supplement["creator"] = supplement["history"][0]["author"] + else: + supplement["created_at"] = "unknown" + supplement["creator"] = "unknown" + + supplement["link"] = supplement["url"].replace("https://scp-wiki.wikidot.com/", "") + + # Extract parent SCP from title or link + scp_match = re.search(r"scp-\d+", supplement["link"], re.IGNORECASE) + supplement["parent_scp"] = scp_match.group(0).upper() if scp_match else None + + # Extract parent tale series from link + tale_match = re.match(r"([a-z\-]+)-\d+$", supplement["link"]) + supplement["parent_tale"] = tale_match.group(1) if tale_match else None + + supplements[supplement["link"]] = supplement + + to_file(supplements, processed_path / f"content_supplement.json") + + for supplement_id in supplements: + del supplements[supplement_id]["raw_content"] + del supplements[supplement_id]["raw_source"] + supplements[supplement_id]["content_file"] = f"content_supplement.json" + + to_file(supplements, processed_path / "index.json") + + if __name__ == "__main__": cli() From 146435dd8f6da11cef7cbfce93443eb1cd945c8b Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 13:01:13 +0100 Subject: [PATCH 04/12] docs(README): update supplement crawl instructions and content structure - Added instructions for crawling pages tagged as 'supplement' - Updated content structure to include multiple content types - Clarified post-processing details for supplements --- README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b35932f..6506fc7 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,15 @@ To crawl the International Hub 
for SCP Items and save to a custom location: scrapy crawl scp_int -o scp_international_items.json ``` +To crawl pages tagged as `supplement` and save to a custom location: + +```bash +scrapy crawl scp_supplement -o scp_supplement.json +``` + ## Raw Content Structure -There are two types of content downloaded- SCP Items and SCP Tales. +There are multiple types of content downloaded (Items, Tales, GOI formats, and Supplements). All content (both SCP Items and Tales) contain the following: @@ -66,6 +72,7 @@ The crawler generates a series of json files containing an array of objects repr | scp_titles.json | Main | Title | scp | | scp_hubs.json | Main | Hub | scp | | scp_tales.json | Main | Tale | scp | +| scp_supplement.json | Main | Supplement | scp | | scp_int.json | International | Item | scp_int | | scp_int_titles.json | International | Title | scp_int | | scp_int_tales.json | International | Tale | scp_int | @@ -76,7 +83,9 @@ To regenerate all files run `make fresh`. ## Post Processed Data -The postproc system takes the Titles, Hubs, Items, and Tales and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there. +The postproc system takes Titles, Hubs, Items, Tales, GOI, and Supplements and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there. + +Supplements are written to `data/processed/supplement/` and include additional fields like `parent_scp` and `parent_tale`. ## Content Licensing From 84d709f0d7babac086fb2741f4f8b843fe2e0118 Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 15:24:24 +0100 Subject: [PATCH 05/12] fix(spiders): refine tale parsing rules to exclude unwanted links - Updated the LinkExtractor in ScpTaleSpider to deny links matching specific patterns, improving the relevance of parsed tales. 
--- scp_crawler/spiders/scp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py index 58fee06..9f4933a 100644 --- a/scp_crawler/spiders/scp.py +++ b/scp_crawler/spiders/scp.py @@ -343,7 +343,7 @@ class ScpTaleSpider(CrawlSpider, WikiMixin): rules = ( Rule(LinkExtractor(allow=[re.escape("tales-by-title"), re.escape("system:page-tags/tag/tale")])), - Rule(LinkExtractor(allow=[r".*"]), callback="parse_tale"), + Rule(LinkExtractor(deny=[r"system:.*", r".*:.*", re.escape("tag-search")]), callback="parse_tale"), ) def parse_tale(self, response, original_link=None): From 3a2194055516dc8a8a6e7ffc465e3c6d4fc7ac9a Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 15:24:45 +0100 Subject: [PATCH 06/12] feat(ci): add GitHub Actions workflow for SCP crawling - for reference --- .github/workflows/scp-items.yml | 92 +++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 .github/workflows/scp-items.yml diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml new file mode 100644 index 0000000..4c1fea1 --- /dev/null +++ b/.github/workflows/scp-items.yml @@ -0,0 +1,92 @@ +name: Crawl SCP Wiki + +on: +# workflow_dispatch: +# schedule: +# - cron: "0 0 * * *" + push: + branches: + - main + paths: + - .github/workflows/scp-items.yml + +permissions: + contents: write + +jobs: + update-main-scp: + runs-on: ubuntu-latest + steps: + - name: "Clone this Repository" + uses: actions/checkout@v6 + with: + path: scp-api + + - name: "Clone the Crawler" + uses: actions/checkout@v6 + with: + repository: heroheman/scp_crawler + ref: "main" + path: scp-crawler + + - name: "Setup Python" + uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - name: "Install Crawler" + working-directory: ./scp-crawler + run: make install + + - name: "Crawl Titles" + working-directory: ./scp-crawler + run: make data/scp_titles.json + + - name: "Crawl Hubs" + 
working-directory: ./scp-crawler + run: make data/scp_hubs.json + + - name: "Crawl Items" + working-directory: ./scp-crawler + run: make data/scp_items.json + + - name: "Process Items" + working-directory: ./scp-crawler + run: make data/processed/items + + - name: "Crawl Tales" + working-directory: ./scp-crawler + run: make data/scp_tales.json + + - name: "Process Tales" + working-directory: ./scp-crawler + run: make data/processed/tales + + - name: "Crawl GOI" + working-directory: ./scp-crawler + run: make data/goi.json + + - name: "Process GOI" + working-directory: ./scp-crawler + run: make data/processed/goi + + - name: "Crawl Supplements" + working-directory: ./scp-crawler + run: make data/scp_supplement.json + + - name: "Process Supplements" + working-directory: ./scp-crawler + run: make data/processed/supplement + + - name: "Move Files into API" + run: cp -Rf ./scp-crawler/data/processed/* ./scp-api/docs/data/scp/ + + - name: "Push" + shell: bash + run: > + cd scp-api; + ./bin/push.sh; + + env: + GIT_USER: "SCP Bot" + GIT_EMAIL: "scp@tedivm.com" From 166a50bd6ad98504b51ce1be603669f55a3c3480 Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 17:37:21 +0100 Subject: [PATCH 07/12] chore(workflow): adjusted workflow after review - This change prevents automatic pushing to the SCP API during CI - it should run on pull requests to allow for manual review first - remove clone step as it's unnecessary - resolves #5 --- .github/workflows/scp-items.yml | 43 ++++++++++++--------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml index 4c1fea1..bf59e9e 100644 --- a/.github/workflows/scp-items.yml +++ b/.github/workflows/scp-items.yml @@ -9,6 +9,9 @@ on: - main paths: - .github/workflows/scp-items.yml + pull_request: + branches: + - main permissions: contents: write @@ -17,17 +20,14 @@ jobs: update-main-scp: runs-on: ubuntu-latest steps: - - name: "Clone this 
Repository" + - name: "Checkout Crawler" uses: actions/checkout@v6 - with: - path: scp-api - - name: "Clone the Crawler" + - name: "Clone API Repository" uses: actions/checkout@v6 with: - repository: heroheman/scp_crawler - ref: "main" - path: scp-crawler + repository: heroheman/scp-api + path: scp-api - name: "Setup Python" uses: actions/setup-python@v6 @@ -35,58 +35,47 @@ jobs: python-version: '3.13' - name: "Install Crawler" - working-directory: ./scp-crawler run: make install - name: "Crawl Titles" - working-directory: ./scp-crawler run: make data/scp_titles.json - name: "Crawl Hubs" - working-directory: ./scp-crawler run: make data/scp_hubs.json - name: "Crawl Items" - working-directory: ./scp-crawler run: make data/scp_items.json - name: "Process Items" - working-directory: ./scp-crawler run: make data/processed/items - name: "Crawl Tales" - working-directory: ./scp-crawler run: make data/scp_tales.json - name: "Process Tales" - working-directory: ./scp-crawler run: make data/processed/tales - name: "Crawl GOI" - working-directory: ./scp-crawler run: make data/goi.json - name: "Process GOI" - working-directory: ./scp-crawler run: make data/processed/goi - name: "Crawl Supplements" - working-directory: ./scp-crawler run: make data/scp_supplement.json - name: "Process Supplements" - working-directory: ./scp-crawler run: make data/processed/supplement - name: "Move Files into API" - run: cp -Rf ./scp-crawler/data/processed/* ./scp-api/docs/data/scp/ + run: cp -Rf ./data/processed/* ./scp-api/docs/data/scp/ - - name: "Push" - shell: bash - run: > - cd scp-api; - ./bin/push.sh; + # - name: "Push" + # shell: bash + # run: > + # cd scp-api; + # ./bin/push.sh; - env: - GIT_USER: "SCP Bot" - GIT_EMAIL: "scp@tedivm.com" + # env: + # GIT_USER: "SCP Bot" + # GIT_EMAIL: "scp@tedivm.com" From 49aec691f052cb238fcab929ab8f3977f4968044 Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Mon, 15 Dec 2025 17:47:51 +0100 Subject: [PATCH 08/12] fix(spiders): update tale 
parsing rules to allow all links - this fixes unintended removal of LinkExtractor rule --- scp_crawler/spiders/scp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py index 9f4933a..58fee06 100644 --- a/scp_crawler/spiders/scp.py +++ b/scp_crawler/spiders/scp.py @@ -343,7 +343,7 @@ class ScpTaleSpider(CrawlSpider, WikiMixin): rules = ( Rule(LinkExtractor(allow=[re.escape("tales-by-title"), re.escape("system:page-tags/tag/tale")])), - Rule(LinkExtractor(deny=[r"system:.*", r".*:.*", re.escape("tag-search")]), callback="parse_tale"), + Rule(LinkExtractor(allow=[r".*"]), callback="parse_tale"), ) def parse_tale(self, response, original_link=None): From d68338f96d342e3282e40cb6d694fdfd46555e5b Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Wed, 17 Dec 2025 11:24:08 +0100 Subject: [PATCH 09/12] fix(spiders): enhance error handling in history lookup - Add checks for empty responses and missing 'body' in JSON - Log errors for various failure scenarios to improve debugging - Ensure robust parsing of history HTML to prevent crashes --- scp_crawler/spiders/scp.py | 60 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py index 58fee06..c38c443 100644 --- a/scp_crawler/spiders/scp.py +++ b/scp_crawler/spiders/scp.py @@ -1,3 +1,4 @@ +import json import re import sys from pprint import pprint @@ -29,12 +30,65 @@ def parse_history(self, response, item, history_page=1): page_id = item["page_id"] changes = item["history"] if "history" in item else {} + try: + response_text = getattr(response, "text", "") or "" + if not response_text.strip(): + self.logger.error( + "Empty response when fetching history for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + history = response.json() - soup = 
BeautifulSoup(history["body"], "lxml") + if not isinstance(history, dict) or "body" not in history: + self.logger.error( + "Missing 'body' in history lookup for %s (status=%s, page=%s). Keys=%s", + item.get("url"), + getattr(response, "status", None), + history_page, + list(history.keys()) if isinstance(history, dict) else type(history), + ) + return self.get_page_source_request(page_id, item) + + body = history.get("body") + if not body: + self.logger.error( + "Empty 'body' in history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + + soup = BeautifulSoup(body, "lxml") + if soup.table is None: + self.logger.error( + "Missing in history HTML for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) rows = soup.table.find_all("tr") - except: - self.logger.exception(f"Unable to parse history lookup. 
{item['url']}") + + except (json.JSONDecodeError, ValueError): + self.logger.error( + "JSON decode error in history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + except Exception: + self.logger.exception( + "Unable to parse history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) return self.get_page_source_request(page_id, item) for row in rows: try: From cca296cc3393c487ee51d3572e0ff455c775a9a2 Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Wed, 17 Dec 2025 11:26:14 +0100 Subject: [PATCH 10/12] chore(workflow): enable workflow_dispatch for SCP crawling - Allows manual triggering of the workflow - Improves flexibility for testing and updates --- .github/workflows/scp-items.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml index bf59e9e..a20822f 100644 --- a/.github/workflows/scp-items.yml +++ b/.github/workflows/scp-items.yml @@ -1,17 +1,19 @@ name: Crawl SCP Wiki on: -# workflow_dispatch: + workflow_dispatch: # schedule: # - cron: "0 0 * * *" - push: + # push: + # branches: + # - main + # paths: + # - .github/workflows/scp-items.yml + pull_request: branches: - main paths: - .github/workflows/scp-items.yml - pull_request: - branches: - - main permissions: contents: write From 4604780632fdc09839a3703f6c8c711cd60c1384 Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Wed, 17 Dec 2025 13:41:35 +0100 Subject: [PATCH 11/12] fix(history): improve history processing and error handling - Handle empty history cases by returning an empty list - Support both dict and list formats for history input - Safely parse date strings with error handling - Sort revisions by date, ensuring robustness against missing values --- scp_crawler/postprocessing.py | 30 ++++++++++++++++++++++++------ 
1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py index 7d284d8..7c2cd90 100644 --- a/scp_crawler/postprocessing.py +++ b/scp_crawler/postprocessing.py @@ -45,11 +45,29 @@ def get_images(html): def process_history(history): - history = [v for v in history.values()] - for revision in history: - revision["date"] = datetime.strptime(revision["date"], "%d %b %Y %H:%M") - history.sort(key=lambda x: x["date"]) - return history + if not history: + return [] + + if isinstance(history, dict): + revisions = list(history.values()) + elif isinstance(history, list): + revisions = history + else: + return [] + + for revision in revisions: + if not isinstance(revision, dict): + continue + revision_date = revision.get("date") + if isinstance(revision_date, str): + try: + revision["date"] = datetime.strptime(revision_date, "%d %b %Y %H:%M") + except Exception: + # Keep original value if parsing fails. + pass + + revisions.sort(key=lambda x: x.get("date") or datetime.min) + return revisions def get_wiki_source(page_id, domain, attempts=5): @@ -200,7 +218,7 @@ def run_postproc_tales(): tale["raw_source"] = get_wiki_source(tale["page_id"], tale["domain"]) # Convert history dict to list and sort by date. 
- tale["history"] = process_history(tale["history"]) + tale["history"] = process_history(tale.get("history")) if len(tale["history"]) > 0: tale["created_at"] = tale["history"][0]["date"] From 45d680b972886082f5a99db0b755da577506462f Mon Sep 17 00:00:00 2001 From: Florenz Heldermann Date: Wed, 17 Dec 2025 13:58:37 +0100 Subject: [PATCH 12/12] fix(history): ensure history key always exists in items - Use `get` method to safely access history in hubs and items - Prevent potential KeyError by ensuring history key is present --- scp_crawler/postprocessing.py | 4 ++-- scp_crawler/spiders/scp.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py index 7c2cd90..399ae67 100644 --- a/scp_crawler/postprocessing.py +++ b/scp_crawler/postprocessing.py @@ -112,7 +112,7 @@ def get_wiki_source(page_id, domain, attempts=5): hub_list, ): # Convert history dict to list and sort by date. - hub["history"] = process_history(hub["history"]) + hub["history"] = process_history(hub.get("history")) if len(hub["history"]) > 0: hub["created_at"] = hub["history"][0]["date"] @@ -161,7 +161,7 @@ def run_postproc_items(): item["hubs"] = get_hubs(item["link"]) # Convert history dict to list and sort by date. 
- item["history"] = process_history(item["history"]) + item["history"] = process_history(item.get("history")) if len(item["history"]) > 0: item["created_at"] = item["history"][0]["date"] diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py index c38c443..745b5a1 100644 --- a/scp_crawler/spiders/scp.py +++ b/scp_crawler/spiders/scp.py @@ -29,7 +29,8 @@ def parse_history(self, response, item, history_page=1): self.logger.info(f"Reviewing Page {item['page_id']} history") page_id = item["page_id"] - changes = item["history"] if "history" in item else {} + changes = item.get("history", {}) + item["history"] = changes # Ensure history key always exists try: response_text = getattr(response, "text", "") or "" @@ -116,11 +117,13 @@ def parse_history(self, response, item, history_page=1): self.logger.exception("Could not process row.") self.logger.error(row) - item["history"] = changes - # The "0" change is the first revision, and the last one that shows up. - # If we have it then we're done. - if "0" in changes: - return self.get_page_source_request(page_id, item) + # Update item history after processing all rows + item["history"] = changes + + # The "0" change is the first revision, and the last one that shows up. + # If we have it then we're done. + if "0" in changes: + return self.get_page_source_request(page_id, item) next_page = history_page + 1 if next_page > MAX_HISTORY_PAGES: