From 774a7e81479ba55ae0a49309cdfa16d61993184b Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 12:11:39 +0100
Subject: [PATCH 01/12] chore(.gitignore): add .DS_Store to ignore list

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2778f5e..6f06558 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,4 @@ venv.bak/
 
 
 *.json
+.DS_Store
\ No newline at end of file

From aeb4640816782a191074b5332fdf641779bca98a Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 12:21:23 +0100
Subject: [PATCH 02/12] feat(spiders): add SCP Supplement spider and item

- Introduce ScpSupplement class for item representation
- Implement ScpSupplementSpider to crawl supplement pages
- Update makefile to include supplement in data targets
---
 makefile                   |  9 +++++--
 scp_crawler/items.py       |  4 ++++
 scp_crawler/spiders/scp.py | 49 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/makefile b/makefile
index 0b49c2f..c777a7a 100644
--- a/makefile
+++ b/makefile
@@ -3,7 +3,7 @@ PYTHON_VENV = source .venv/bin/activate &&
 
 install: .venv
 
-data: scp scp_int goi
+data: scp scp_int goi supplement
 
 fresh: clean data
 
@@ -14,7 +14,7 @@ clean:
 	python -m venv .venv
 	$(PYTHON_VENV) python -m pip install .
 
-crawl: scp scp_int goi
+crawl: scp scp_int goi supplement
 
 scp: scp_crawl scp_postprocess
 
@@ -37,6 +37,11 @@ goi: data/goi.json
 data/goi.json: .venv
 	$(PYTHON_VENV) python -m scrapy crawl goi -o data/goi.json
 
+supplement: data/scp_supplement.json
+
+data/scp_supplement.json: .venv
+	$(PYTHON_VENV) python -m scrapy crawl scp_supplement -o data/scp_supplement.json
+
 scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales
 
 data/processed/goi: .venv
diff --git a/scp_crawler/items.py b/scp_crawler/items.py
index d4aa1c2..a199653 100644
--- a/scp_crawler/items.py
+++ b/scp_crawler/items.py
@@ -33,6 +33,10 @@ class ScpGoi(WikiPage):
     pass
 
 
+class ScpSupplement(WikiPage):
+    pass
+
+
 class ScpTitle(scrapy.Item):
     title = scrapy.Field()
     scp = scrapy.Field()
diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py
index 8528b3a..58fee06 100644
--- a/scp_crawler/spiders/scp.py
+++ b/scp_crawler/spiders/scp.py
@@ -8,7 +8,7 @@
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 
-from ..items import ScpGoi, ScpHub, ScpItem, ScpTale, ScpTitle
+from ..items import ScpGoi, ScpHub, ScpItem, ScpSupplement, ScpTale, ScpTitle
 
 DOMAIN = "scp-wiki.wikidot.com"
 INT_DOMAIN = "scp-int.wikidot.com"
@@ -531,6 +531,53 @@ def parse_tale(self, response, original_link=None):
         return self.get_history_request(item["page_id"], 1, item)
 
 
+class ScpSupplementSpider(CrawlSpider, WikiMixin):
+    name = "scp_supplement"
+
+    start_urls = [
+        f"http://{DOMAIN}/system:page-tags/tag/supplement",
+    ]
+
+    allowed_domains = [DOMAIN]
+
+    domain = DOMAIN
+    # Tag-listing links are followed without a callback; all other links go to parse_supplement.
+    rules = (
+        Rule(LinkExtractor(allow=[re.escape("system:page-tags/tag/supplement")])),
+        Rule(LinkExtractor(allow=[r".*"]), callback="parse_supplement"),
+    )
+
+    def parse_supplement(self, response, original_link=None):
+        self.logger.debug("Reviewing Potential SCP Supplement page: %s", response.url)
+        content = self.get_content(response)
+        tags = self.get_tags(response)
+        # Nothing to classify without both page content and tags.
+        if not content or not tags:
+            return None
+
+        redirect = self.follow_splash_redirects(response, tags, self.parse_supplement)
+        if redirect:
+            return redirect
+        # The second rule matches every link, so drop pages that lack the "supplement" tag.
+        if "supplement" not in tags:
+            return None
+
+        self.logger.info("Processing SCP Supplement page: %s", response.url)
+        content_soup = BeautifulSoup(content, "lxml")
+
+        item = ScpSupplement()
+        item["title"] = self.get_title(response)
+        item["url"] = response.url
+        item["domain"] = self.domain
+        item["link"] = original_link if original_link else self.get_simple_link(response.url)
+        item["tags"] = tags
+        item["page_id"] = self.get_page_id(response)
+        item["rating"] = get_rating(response)
+        item["raw_content"] = str(clean_content_soup(content_soup))
+        item["references"] = self.get_content_references(response)
+        return self.get_history_request(item["page_id"], 1, item)
+
+
 def get_rating(response):
     try:
         return int(response.css(".rate-points .number::text").get())

From b1883628a61fe4a3acdd2aebaaa99a0797e4ed44 Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 13:00:49 +0100
Subject: [PATCH 03/12] feat(postprocessing): add supplement processing command

- Implement run_postproc_supplement to process SCP supplement data
- Create necessary directories and handle data extraction
- Store processed supplements in JSON format for further use
---
 makefile                      | 17 +++++++++----
 scp_crawler/postprocessing.py | 48 +++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/makefile b/makefile
index c777a7a..5041eb9 100644
--- a/makefile
+++ b/makefile
@@ -3,7 +3,7 @@ PYTHON_VENV = source .venv/bin/activate &&
 
 install: .venv
 
-data: scp scp_int goi supplement
+data: scp scp_int goi
 
 fresh: clean data
 
@@ -14,11 +14,11 @@ clean:
 	python -m venv .venv
 	$(PYTHON_VENV) python -m pip install .
 
-crawl: scp scp_int goi supplement
+crawl: scp scp_int goi
 
 scp: scp_crawl scp_postprocess
 
-scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json
+scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json data/scp_supplement.json
 
 data/scp_titles.json: .venv
 	$(PYTHON_VENV) python -m scrapy crawl scp_titles -o data/scp_titles.json
@@ -37,12 +37,19 @@ goi: data/goi.json
 data/goi.json: .venv
 	$(PYTHON_VENV) python -m scrapy crawl goi -o data/goi.json
 
-supplement: data/scp_supplement.json
+supplement: supplement_crawl supplement_postprocess
+
+supplement_crawl: data/scp_supplement.json
 
 data/scp_supplement.json: .venv
 	$(PYTHON_VENV) python -m scrapy crawl scp_supplement -o data/scp_supplement.json
 
-scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales
+supplement_postprocess: supplement_crawl data/processed/supplement
+
+data/processed/supplement: .venv
+	$(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-supplement
+
+scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales data/processed/supplement
 
 data/processed/goi: .venv
 	$(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-goi
diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py
index ec3ee0d..7d284d8 100644
--- a/scp_crawler/postprocessing.py
+++ b/scp_crawler/postprocessing.py
@@ -273,5 +273,53 @@ def run_postproc_goi():
     to_file(tales, processed_path / "index.json")
 
 
+@cli.command()
+def run_postproc_supplement():
+    """Post-process data/scp_supplement.json into data/processed/supplement (content file plus index)."""
+    processed_path = Path(cwd + "/data/processed/supplement")
+    os.makedirs(processed_path, exist_ok=True)
+
+    print("Processing Supplement list.")
+
+    supplement_list = from_file(cwd + "/data/scp_supplement.json")
+    supplements = {}
+    for supplement in tqdm(supplement_list, smoothing=0):
+        # The hub lookup uses the crawler-provided link; "link" is rewritten from the URL further down.
+        supplement["images"] = get_images(supplement["raw_content"])
+        supplement["hubs"] = get_hubs(supplement["link"])
+        supplement["raw_source"] = get_wiki_source(supplement["page_id"], supplement["domain"])
+
+        # Convert history dict to list and sort by date; a missing history is treated as empty.
+        supplement["history"] = process_history(supplement.get("history") or {})
+
+        if supplement["history"]:
+            supplement["created_at"] = supplement["history"][0]["date"]
+            supplement["creator"] = supplement["history"][0]["author"]
+        else:
+            supplement["created_at"] = "unknown"
+            supplement["creator"] = "unknown"
+
+        supplement["link"] = supplement["url"].replace("https://scp-wiki.wikidot.com/", "")
+
+        # Extract parent SCP (e.g. "SCP-173") from the link
+        scp_match = re.search(r"scp-\d+", supplement["link"], re.IGNORECASE)
+        supplement["parent_scp"] = scp_match.group(0).upper() if scp_match else None
+
+        # Extract parent tale series from link
+        tale_match = re.match(r"([a-z\-]+)-\d+$", supplement["link"])
+        supplement["parent_tale"] = tale_match.group(1) if tale_match else None
+
+        supplements[supplement["link"]] = supplement
+
+    to_file(supplements, processed_path / "content_supplement.json")
+
+    for supplement_id in supplements:
+        del supplements[supplement_id]["raw_content"]
+        del supplements[supplement_id]["raw_source"]
+        supplements[supplement_id]["content_file"] = "content_supplement.json"
+
+    to_file(supplements, processed_path / "index.json")
+
+
 if __name__ == "__main__":
     cli()

From 146435dd8f6da11cef7cbfce93443eb1cd945c8b Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 13:01:13 +0100
Subject: [PATCH 04/12] docs(README): update supplement crawl instructions and
 content structure

- Added instructions for crawling pages tagged as 'supplement'
- Updated content structure to include multiple content types
- Clarified post-processing details for supplements
---
 README.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b35932f..6506fc7 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,15 @@ To crawl the International Hub for SCP Items and save to a custom location:
 scrapy crawl scp_int -o scp_international_items.json
 ```
 
+To crawl pages tagged as `supplement` and save to a custom location:
+
+```bash
+scrapy crawl scp_supplement -o scp_supplement.json
+```
+
 ## Raw Content Structure
 
-There are two types of content downloaded- SCP Items and SCP Tales.
+There are multiple types of content downloaded (Items, Tales, GOI formats, and Supplements).
 
 All content (both SCP Items and Tales) contain the following:
 
@@ -66,6 +72,7 @@ The crawler generates a series of json files containing an array of objects repr
 | scp_titles.json     | Main          | Title | scp     |
 | scp_hubs.json       | Main          | Hub   | scp     |
 | scp_tales.json      | Main          | Tale  | scp     |
+| scp_supplement.json | Main          | Supplement | scp |
 | scp_int.json        | International | Item  | scp_int |
 | scp_int_titles.json | International | Title | scp_int |
 | scp_int_tales.json  | International | Tale  | scp_int |
@@ -76,7 +83,9 @@ To regenerate all files run `make fresh`.
 
 ## Post Processed Data
 
-The postproc system takes the Titles, Hubs, Items, and Tales and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there.
+The postproc system takes Titles, Hubs, Items, Tales, GOI, and Supplements and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there.
+
+Supplements are written to `data/processed/supplement/` and include additional fields like `parent_scp` and `parent_tale`.
 
 
 ## Content Licensing

From 84d709f0d7babac086fb2741f4f8b843fe2e0118 Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 15:24:24 +0100
Subject: [PATCH 05/12] fix(spiders): refine tale parsing rules to exclude
 unwanted links

- Updated the LinkExtractor in ScpTaleSpider to deny links
  matching specific patterns, improving the relevance of parsed tales.
---
 scp_crawler/spiders/scp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py
index 58fee06..9f4933a 100644
--- a/scp_crawler/spiders/scp.py
+++ b/scp_crawler/spiders/scp.py
@@ -343,7 +343,7 @@ class ScpTaleSpider(CrawlSpider, WikiMixin):
 
     rules = (
         Rule(LinkExtractor(allow=[re.escape("tales-by-title"), re.escape("system:page-tags/tag/tale")])),
-        Rule(LinkExtractor(allow=[r".*"]), callback="parse_tale"),
+        Rule(LinkExtractor(deny=[r"system:.*", r".*:.*", re.escape("tag-search")]), callback="parse_tale"),
     )
 
     def parse_tale(self, response, original_link=None):

From 3a2194055516dc8a8a6e7ffc465e3c6d4fc7ac9a Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 15:24:45 +0100
Subject: [PATCH 06/12] feat(ci): add GitHub Actions workflow for SCP crawling

- for reference
---
 .github/workflows/scp-items.yml | 92 +++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 .github/workflows/scp-items.yml

diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml
new file mode 100644
index 0000000..4c1fea1
--- /dev/null
+++ b/.github/workflows/scp-items.yml
@@ -0,0 +1,92 @@
+name: Crawl SCP Wiki
+
+on:
+#   workflow_dispatch:
+#   schedule:
+#     - cron: "0 0 * * *"
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/scp-items.yml
+
+permissions:
+  contents: write
+
+jobs:
+  update-main-scp:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Clone this Repository"
+        uses: actions/checkout@v6
+        with:
+          path: scp-api
+
+      - name: "Clone the Crawler"
+        uses: actions/checkout@v6
+        with:
+          repository: heroheman/scp_crawler
+          ref: "main"
+          path: scp-crawler
+
+      - name: "Setup Python"
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - name: "Install Crawler"
+        working-directory: ./scp-crawler
+        run: make install
+
+      - name: "Crawl Titles"
+        working-directory: ./scp-crawler
+        run: make data/scp_titles.json
+
+      - name: "Crawl Hubs"
+        working-directory: ./scp-crawler
+        run: make data/scp_hubs.json
+
+      - name: "Crawl Items"
+        working-directory: ./scp-crawler
+        run: make data/scp_items.json
+
+      - name: "Process Items"
+        working-directory: ./scp-crawler
+        run: make data/processed/items
+
+      - name: "Crawl Tales"
+        working-directory: ./scp-crawler
+        run: make data/scp_tales.json
+
+      - name: "Process Tales"
+        working-directory: ./scp-crawler
+        run: make data/processed/tales
+
+      - name: "Crawl GOI"
+        working-directory: ./scp-crawler
+        run: make data/goi.json
+
+      - name: "Process GOI"
+        working-directory: ./scp-crawler
+        run: make data/processed/goi
+
+      - name: "Crawl Supplements"
+        working-directory: ./scp-crawler
+        run: make data/scp_supplement.json
+
+      - name: "Process Supplements"
+        working-directory: ./scp-crawler
+        run: make data/processed/supplement
+
+      - name: "Move Files into API"
+        run: cp -Rf ./scp-crawler/data/processed/* ./scp-api/docs/data/scp/
+
+      - name: "Push"
+        shell: bash
+        run: >
+          cd scp-api;
+          ./bin/push.sh;
+
+        env:
+          GIT_USER: "SCP Bot"
+          GIT_EMAIL: "scp@tedivm.com"

From 166a50bd6ad98504b51ce1be603669f55a3c3480 Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 17:37:21 +0100
Subject: [PATCH 07/12] chore(workflow): adjusted workflow after review

- Disable the automatic push to the SCP API during CI (the push step is commented out)
- Also run the workflow on pull requests so results can be reviewed manually first
- Check out the crawler directly instead of cloning it in a separate step
- resolves #5
---
 .github/workflows/scp-items.yml | 43 ++++++++++++---------------------
 1 file changed, 16 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml
index 4c1fea1..bf59e9e 100644
--- a/.github/workflows/scp-items.yml
+++ b/.github/workflows/scp-items.yml
@@ -9,6 +9,9 @@ on:
       - main
     paths:
       - .github/workflows/scp-items.yml
+  pull_request:
+    branches:
+      - main
 
 permissions:
   contents: write
@@ -17,17 +20,14 @@ jobs:
   update-main-scp:
     runs-on: ubuntu-latest
     steps:
-      - name: "Clone this Repository"
+      - name: "Checkout Crawler"
         uses: actions/checkout@v6
-        with:
-          path: scp-api
 
-      - name: "Clone the Crawler"
+      - name: "Clone API Repository"
         uses: actions/checkout@v6
         with:
-          repository: heroheman/scp_crawler
-          ref: "main"
-          path: scp-crawler
+          repository: heroheman/scp-api
+          path: scp-api
 
       - name: "Setup Python"
         uses: actions/setup-python@v6
@@ -35,58 +35,47 @@ jobs:
           python-version: '3.13'
 
       - name: "Install Crawler"
-        working-directory: ./scp-crawler
         run: make install
 
       - name: "Crawl Titles"
-        working-directory: ./scp-crawler
         run: make data/scp_titles.json
 
       - name: "Crawl Hubs"
-        working-directory: ./scp-crawler
         run: make data/scp_hubs.json
 
       - name: "Crawl Items"
-        working-directory: ./scp-crawler
         run: make data/scp_items.json
 
       - name: "Process Items"
-        working-directory: ./scp-crawler
         run: make data/processed/items
 
       - name: "Crawl Tales"
-        working-directory: ./scp-crawler
         run: make data/scp_tales.json
 
       - name: "Process Tales"
-        working-directory: ./scp-crawler
         run: make data/processed/tales
 
       - name: "Crawl GOI"
-        working-directory: ./scp-crawler
         run: make data/goi.json
 
       - name: "Process GOI"
-        working-directory: ./scp-crawler
         run: make data/processed/goi
 
       - name: "Crawl Supplements"
-        working-directory: ./scp-crawler
         run: make data/scp_supplement.json
 
       - name: "Process Supplements"
-        working-directory: ./scp-crawler
         run: make data/processed/supplement
 
       - name: "Move Files into API"
-        run: cp -Rf ./scp-crawler/data/processed/* ./scp-api/docs/data/scp/
+        run: cp -Rf ./data/processed/* ./scp-api/docs/data/scp/
 
-      - name: "Push"
-        shell: bash
-        run: >
-          cd scp-api;
-          ./bin/push.sh;
+      # - name: "Push"
+      #   shell: bash
+      #   run: >
+      #     cd scp-api;
+      #     ./bin/push.sh;
 
-        env:
-          GIT_USER: "SCP Bot"
-          GIT_EMAIL: "scp@tedivm.com"
+      #   env:
+      #     GIT_USER: "SCP Bot"
+      #     GIT_EMAIL: "scp@tedivm.com"

From 49aec691f052cb238fcab929ab8f3977f4968044 Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Mon, 15 Dec 2025 17:47:51 +0100
Subject: [PATCH 08/12] fix(spiders): update tale parsing rules to allow all
 links

- This fixes the unintended removal of the LinkExtractor rule
---
 scp_crawler/spiders/scp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py
index 9f4933a..58fee06 100644
--- a/scp_crawler/spiders/scp.py
+++ b/scp_crawler/spiders/scp.py
@@ -343,7 +343,7 @@ class ScpTaleSpider(CrawlSpider, WikiMixin):
 
     rules = (
         Rule(LinkExtractor(allow=[re.escape("tales-by-title"), re.escape("system:page-tags/tag/tale")])),
-        Rule(LinkExtractor(deny=[r"system:.*", r".*:.*", re.escape("tag-search")]), callback="parse_tale"),
+        Rule(LinkExtractor(allow=[r".*"]), callback="parse_tale"),
     )
 
     def parse_tale(self, response, original_link=None):

From d68338f96d342e3282e40cb6d694fdfd46555e5b Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Wed, 17 Dec 2025 11:24:08 +0100
Subject: [PATCH 09/12] fix(spiders): enhance error handling in history lookup

- Add checks for empty responses and missing 'body' in JSON
- Log errors for various failure scenarios to improve debugging
- Ensure robust parsing of history HTML to prevent crashes
---
 scp_crawler/spiders/scp.py | 60 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py
index 58fee06..c38c443 100644
--- a/scp_crawler/spiders/scp.py
+++ b/scp_crawler/spiders/scp.py
@@ -1,3 +1,4 @@
+import json
 import re
 import sys
 from pprint import pprint
@@ -29,12 +30,65 @@ def parse_history(self, response, item, history_page=1):
 
         page_id = item["page_id"]
         changes = item["history"] if "history" in item else {}
+
         try:
+            response_text = getattr(response, "text", "") or ""
+            if not response_text.strip():
+                self.logger.error(
+                    "Empty response when fetching history for %s (status=%s, page=%s)",
+                    item.get("url"),
+                    getattr(response, "status", None),
+                    history_page,
+                )
+                return self.get_page_source_request(page_id, item)
+
             history = response.json()
-            soup = BeautifulSoup(history["body"], "lxml")
+            if not isinstance(history, dict) or "body" not in history:
+                self.logger.error(
+                    "Missing 'body' in history lookup for %s (status=%s, page=%s). Keys=%s",
+                    item.get("url"),
+                    getattr(response, "status", None),
+                    history_page,
+                    list(history.keys()) if isinstance(history, dict) else type(history),
+                )
+                return self.get_page_source_request(page_id, item)
+
+            body = history.get("body")
+            if not body:
+                self.logger.error(
+                    "Empty 'body' in history lookup for %s (status=%s, page=%s)",
+                    item.get("url"),
+                    getattr(response, "status", None),
+                    history_page,
+                )
+                return self.get_page_source_request(page_id, item)
+
+            soup = BeautifulSoup(body, "lxml")
+            if soup.table is None:
+                self.logger.error(
+                    "Missing <table> in history HTML for %s (status=%s, page=%s)",
+                    item.get("url"),
+                    getattr(response, "status", None),
+                    history_page,
+                )
+                return self.get_page_source_request(page_id, item)
             rows = soup.table.find_all("tr")
-        except:
-            self.logger.exception(f"Unable to parse history lookup. {item['url']}")
+
+        except (json.JSONDecodeError, ValueError):
+            self.logger.error(
+                "JSON decode error in history lookup for %s (status=%s, page=%s)",
+                item.get("url"),
+                getattr(response, "status", None),
+                history_page,
+            )
+            return self.get_page_source_request(page_id, item)
+        except Exception:
+            self.logger.exception(
+                "Unable to parse history lookup for %s (status=%s, page=%s)",
+                item.get("url"),
+                getattr(response, "status", None),
+                history_page,
+            )
             return self.get_page_source_request(page_id, item)
         for row in rows:
             try:

From cca296cc3393c487ee51d3572e0ff455c775a9a2 Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Wed, 17 Dec 2025 11:26:14 +0100
Subject: [PATCH 10/12] chore(workflow): enable workflow_dispatch for SCP
 crawling

- Allows manual triggering of the workflow
- Improves flexibility for testing and updates
---
 .github/workflows/scp-items.yml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml
index bf59e9e..a20822f 100644
--- a/.github/workflows/scp-items.yml
+++ b/.github/workflows/scp-items.yml
@@ -1,17 +1,19 @@
 name: Crawl SCP Wiki
 
 on:
-#   workflow_dispatch:
+  workflow_dispatch:
 #   schedule:
 #     - cron: "0 0 * * *"
-  push:
+  # push:
+    # branches:
+    #   - main
+    # paths:
+    #   - .github/workflows/scp-items.yml
+  pull_request:
     branches:
       - main
     paths:
       - .github/workflows/scp-items.yml
-  pull_request:
-    branches:
-      - main
 
 permissions:
   contents: write

From 4604780632fdc09839a3703f6c8c711cd60c1384 Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Wed, 17 Dec 2025 13:41:35 +0100
Subject: [PATCH 11/12] fix(history): improve history processing and error
 handling

- Handle empty history cases by returning an empty list
- Support both dict and list formats for history input
- Safely parse date strings with error handling
- Sort revisions by date, ensuring robustness against missing values
---
 scp_crawler/postprocessing.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py
index 7d284d8..7c2cd90 100644
--- a/scp_crawler/postprocessing.py
+++ b/scp_crawler/postprocessing.py
@@ -45,11 +45,29 @@ def get_images(html):
 
 
 def process_history(history):
-    history = [v for v in history.values()]
-    for revision in history:
-        revision["date"] = datetime.strptime(revision["date"], "%d %b %Y %H:%M")
-    history.sort(key=lambda x: x["date"])
-    return history
+    """Return the revisions in *history* (dict or list) as a list sorted by date, oldest first."""
+    if isinstance(history, dict):
+        revisions = list(history.values())
+    elif isinstance(history, list):
+        revisions = history
+    else:
+        # None or an unsupported type: there is nothing to process.
+        return []
+
+    # Drop malformed (non-dict) entries; the sort key below would raise AttributeError on them.
+    revisions = [revision for revision in revisions if isinstance(revision, dict)]
+    for revision in revisions:
+        revision_date = revision.get("date")
+        if isinstance(revision_date, str):
+            try:
+                revision["date"] = datetime.strptime(revision_date, "%d %b %Y %H:%M")
+            except ValueError:
+                # Keep original value if parsing fails.
+                pass
+
+    # Missing or unparsed dates sort first; comparing them with datetimes would raise TypeError.
+    revisions.sort(key=lambda r: r["date"] if isinstance(r.get("date"), datetime) else datetime.min)
+    return revisions
 
 
 def get_wiki_source(page_id, domain, attempts=5):
@@ -200,7 +218,7 @@ def run_postproc_tales():
         tale["raw_source"] = get_wiki_source(tale["page_id"], tale["domain"])
 
         # Convert history dict to list and sort by date.
-        tale["history"] = process_history(tale["history"])
+        tale["history"] = process_history(tale.get("history"))
 
         if len(tale["history"]) > 0:
             tale["created_at"] = tale["history"][0]["date"]

From 45d680b972886082f5a99db0b755da577506462f Mon Sep 17 00:00:00 2001
From: Florenz Heldermann <f.heldermann@reply.de>
Date: Wed, 17 Dec 2025 13:58:37 +0100
Subject: [PATCH 12/12] fix(history): ensure history key always exists in items

- Use `get` method to safely access history in hubs and items
- Prevent potential KeyError by ensuring history key is present
---
 scp_crawler/postprocessing.py |  4 ++--
 scp_crawler/spiders/scp.py    | 15 +++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py
index 7c2cd90..399ae67 100644
--- a/scp_crawler/postprocessing.py
+++ b/scp_crawler/postprocessing.py
@@ -112,7 +112,7 @@ def get_wiki_source(page_id, domain, attempts=5):
     hub_list,
 ):
     # Convert history dict to list and sort by date.
-    hub["history"] = process_history(hub["history"])
+    hub["history"] = process_history(hub.get("history"))
 
     if len(hub["history"]) > 0:
         hub["created_at"] = hub["history"][0]["date"]
@@ -161,7 +161,7 @@ def run_postproc_items():
         item["hubs"] = get_hubs(item["link"])
 
         # Convert history dict to list and sort by date.
-        item["history"] = process_history(item["history"])
+        item["history"] = process_history(item.get("history"))
 
         if len(item["history"]) > 0:
             item["created_at"] = item["history"][0]["date"]
diff --git a/scp_crawler/spiders/scp.py b/scp_crawler/spiders/scp.py
index c38c443..745b5a1 100644
--- a/scp_crawler/spiders/scp.py
+++ b/scp_crawler/spiders/scp.py
@@ -29,7 +29,8 @@ def parse_history(self, response, item, history_page=1):
         self.logger.info(f"Reviewing Page {item['page_id']} history")
 
         page_id = item["page_id"]
-        changes = item["history"] if "history" in item else {}
+        changes = item.get("history", {})
+        item["history"] = changes  # Ensure history key always exists
 
         try:
             response_text = getattr(response, "text", "") or ""
@@ -116,11 +117,13 @@ def parse_history(self, response, item, history_page=1):
                 self.logger.exception("Could not process row.")
                 self.logger.error(row)
 
-            item["history"] = changes
-            # The "0" change is the first revision, and the last one that shows up.
-            # If we have it then we're done.
-            if "0" in changes:
-                return self.get_page_source_request(page_id, item)
+        # Update item history after processing all rows
+        item["history"] = changes
+        
+        # The "0" change is the first revision, and the last one that shows up.
+        # If we have it then we're done.
+        if "0" in changes:
+            return self.get_page_source_request(page_id, item)
 
         next_page = history_page + 1
         if next_page > MAX_HISTORY_PAGES: