scp-data · heroheman · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025
diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml
@@ -0,0 +1,83 @@
+name: Crawl SCP Wiki
+
+on:
+  workflow_dispatch:
+#   schedule:
+#     - cron: "0 0 * * *"
+  # push:
+    # branches:
+    #   - main
+    # paths:
+    #   - .github/workflows/scp-items.yml
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .github/workflows/scp-items.yml
+
+permissions:
+  contents: write
+
+jobs:
+  update-main-scp:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout Crawler"
+        uses: actions/checkout@v6
+
+      - name: "Clone API Repository"
+        uses: actions/checkout@v6
+        with:
+          repository: heroheman/scp-api
+          path: scp-api
+
+      - name: "Setup Python"
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - name: "Install Crawler"
+        run: make install
+
+      - name: "Crawl Titles"
+        run: make data/scp_titles.json
+
+      - name: "Crawl Hubs"
+        run: make data/scp_hubs.json
+
+      - name: "Crawl Items"
+        run: make data/scp_items.json
+
+      - name: "Process Items"
+        run: make data/processed/items
+
+      - name: "Crawl Tales"
+        run: make data/scp_tales.json
+
+      - name: "Process Tales"
+        run: make data/processed/tales
+
+      - name: "Crawl GOI"
+        run: make data/goi.json
+
+      - name: "Process GOI"
+        run: make data/processed/goi
+
+      - name: "Crawl Supplements"
+        run: make data/scp_supplement.json
+
+      - name: "Process Supplements"
+        run: make data/processed/supplement
+
+      - name: "Move Files into API"  # mkdir first: cp with a glob source fails if the target dir is missing
+        run: mkdir -p ./scp-api/docs/data/scp && cp -Rf ./data/processed/* ./scp-api/docs/data/scp/
+
+      # - name: "Push"
+      #   shell: bash
+      #   run: >
+      #     cd scp-api;
+      #     ./bin/push.sh;
+
+      #   env:
+      #     GIT_USER: "SCP Bot"
+      #     GIT_EMAIL: "scp@tedivm.com"
diff --git a/.gitignore b/.gitignore
@@ -105,3 +105,4 @@ venv.bak/
 
 
 *.json
+.DS_Store
diff --git a/README.md b/README.md
@@ -32,9 +32,15 @@ To crawl the International Hub for SCP Items and save to a custom location:
 scrapy crawl scp_int -o scp_international_items.json
 ```
 
+To crawl pages tagged as `supplement` and save to a custom location:
+
+```bash
+scrapy crawl scp_supplement -o scp_supplement.json
+```
+
 ## Raw Content Structure
 
-There are two types of content downloaded- SCP Items and SCP Tales.
+There are multiple types of content downloaded (Items, Tales, GOI formats, and Supplements).
 
 All content (both SCP Items and Tales) contain the following:
 
@@ -66,6 +72,7 @@ The crawler generates a series of json files containing an array of objects repr
 | scp_titles.json     | Main          | Title | scp     |
 | scp_hubs.json       | Main          | Hub   | scp     |
 | scp_tales.json      | Main          | Tale  | scp     |
+| scp_supplement.json | Main          | Supplement | scp |
 | scp_int.json        | International | Item  | scp_int |
 | scp_int_titles.json | International | Title | scp_int |
 | scp_int_tales.json  | International | Tale  | scp_int |
@@ -76,7 +83,9 @@ To regenerate all files run `make fresh`.
 
 ## Post Processed Data
 
-The postproc system takes the Titles, Hubs, Items, and Tales and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there.
+The postproc system takes Titles, Hubs, Items, Tales, GOI, and Supplements and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there.
+
+Supplements are written to `data/processed/supplement/` and include additional fields like `parent_scp` and `parent_tale`.
 
 
 ## Content Licensing

diff --git a/makefile b/makefile
@@ -18,7 +18,7 @@ crawl: scp scp_int goi
 
 scp: scp_crawl scp_postprocess
 
-scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json
+scp_crawl: data/scp_titles.json data/scp_hubs.json data/scp_items.json data/scp_tales.json data/goi.json data/scp_supplement.json
 
 data/scp_titles.json: .venv
 	$(PYTHON_VENV) python -m scrapy crawl scp_titles -o data/scp_titles.json
@@ -37,7 +37,19 @@ goi: data/goi.json
 data/goi.json: .venv
 	$(PYTHON_VENV) python -m scrapy crawl goi -o data/goi.json
 
-scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales
+supplement: supplement_crawl supplement_postprocess
+
+supplement_crawl: data/scp_supplement.json
+
+data/scp_supplement.json: .venv
+	$(PYTHON_VENV) python -m scrapy crawl scp_supplement -o data/scp_supplement.json
+
+supplement_postprocess: supplement_crawl data/processed/supplement
+
+data/processed/supplement: .venv
+	$(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-supplement
+
+scp_postprocess: scp_crawl data/processed/goi data/processed/items data/processed/tales data/processed/supplement
 
 data/processed/goi: .venv
 	$(PYTHON_VENV) python -m scp_crawler.postprocessing run-postproc-goi

diff --git a/scp_crawler/items.py b/scp_crawler/items.py
@@ -33,6 +33,10 @@ class ScpGoi(WikiPage):
     pass
 
 
+class ScpSupplement(WikiPage):  # Adds no fields or behavior beyond WikiPage.
+    pass
+
+
 class ScpTitle(scrapy.Item):
     title = scrapy.Field()
     scp = scrapy.Field()

diff --git a/scp_crawler/postprocessing.py b/scp_crawler/postprocessing.py
@@ -45,11 +45,29 @@ def get_images(html):
 
 
 def process_history(history):
-    history = [v for v in history.values()]
-    for revision in history:
-        revision["date"] = datetime.strptime(revision["date"], "%d %b %Y %H:%M")
-    history.sort(key=lambda x: x["date"])
-    return history
+    """Return the revisions of ``history`` as a list sorted oldest-first.
+
+    Accepts a dict (its values are used) or a list; anything else, or an
+    empty value, yields ``[]``. ``"date"`` strings are parsed with
+    ``%d %b %Y %H:%M``; entries that are not dicts are dropped, and entries
+    whose date is missing or unparseable sort first.
+    """
+    if isinstance(history, dict):
+        history = list(history.values())
+    if not isinstance(history, list):
+        return []
+
+    revisions = [rev for rev in history if isinstance(rev, dict)]
+    for revision in revisions:
+        try:
+            revision["date"] = datetime.strptime(revision["date"], "%d %b %Y %H:%M")
+        except (KeyError, TypeError, ValueError):
+            # Best effort: keep whatever value (if any) was there.
+            pass
+
+    # Only real datetimes are comparable; everything else sorts as datetime.min.
+    revisions.sort(key=lambda r: r["date"] if isinstance(r.get("date"), datetime) else datetime.min)
+    return revisions
 
 
 def get_wiki_source(page_id, domain, attempts=5):
@@ -94,7 +112,7 @@ def get_wiki_source(page_id, domain, attempts=5):
     hub_list,
 ):
     # Convert history dict to list and sort by date.
-    hub["history"] = process_history(hub["history"])
+    hub["history"] = process_history(hub.get("history"))
 
     if len(hub["history"]) > 0:
         hub["created_at"] = hub["history"][0]["date"]
@@ -143,7 +161,7 @@ def run_postproc_items():
         item["hubs"] = get_hubs(item["link"])
 
         # Convert history dict to list and sort by date.
-        item["history"] = process_history(item["history"])
+        item["history"] = process_history(item.get("history"))
 
         if len(item["history"]) > 0:
             item["created_at"] = item["history"][0]["date"]
@@ -200,7 +218,7 @@ def run_postproc_tales():
         tale["raw_source"] = get_wiki_source(tale["page_id"], tale["domain"])
 
         # Convert history dict to list and sort by date.
-        tale["history"] = process_history(tale["history"])
+        tale["history"] = process_history(tale.get("history"))
 
         if len(tale["history"]) > 0:
             tale["created_at"] = tale["history"][0]["date"]
@@ -273,5 +291,53 @@ def run_postproc_goi():
     to_file(tales, processed_path / "index.json")
 
 
+@cli.command()
+def run_postproc_supplement():
+    """Post-process data/scp_supplement.json into data/processed/supplement/."""
+    processed_path = Path(cwd + "/data/processed/supplement")
+    os.makedirs(processed_path, exist_ok=True)
+
+    print("Processing Supplement list.")
+
+    supplement_list = from_file(cwd + "/data/scp_supplement.json")
+    supplements = {}
+    for supplement in tqdm(supplement_list, smoothing=0):
+
+        supplement["images"] = get_images(supplement["raw_content"])
+        supplement["hubs"] = get_hubs(supplement["link"])
+        supplement["raw_source"] = get_wiki_source(supplement["page_id"], supplement["domain"])
+
+        # Convert history to a date-sorted list; a record without history yields [].
+        supplement["history"] = process_history(supplement.get("history"))
+
+        if len(supplement["history"]) > 0:
+            supplement["created_at"] = supplement["history"][0]["date"]
+            supplement["creator"] = supplement["history"][0]["author"]
+        else:
+            supplement["created_at"] = "unknown"
+            supplement["creator"] = "unknown"
+
+        supplement["link"] = supplement["url"].replace("https://scp-wiki.wikidot.com/", "")
+
+        # Extract parent SCP (first "scp-<digits>" occurrence) from the link.
+        scp_match = re.search(r"scp-\d+", supplement["link"], re.IGNORECASE)
+        supplement["parent_scp"] = scp_match.group(0).upper() if scp_match else None
+
+        # Prefix of links shaped "<name>-<digits>". NOTE(review): "scp-173" also matches, giving "scp".
+        tale_match = re.match(r"([a-z\-]+)-\d+$", supplement["link"])
+        supplement["parent_tale"] = tale_match.group(1) if tale_match else None
+
+        supplements[supplement["link"]] = supplement
+
+    to_file(supplements, processed_path / "content_supplement.json")
+
+    for supplement_id in supplements:
+        del supplements[supplement_id]["raw_content"]
+        del supplements[supplement_id]["raw_source"]
+        supplements[supplement_id]["content_file"] = "content_supplement.json"
+
+    to_file(supplements, processed_path / "index.json")
+
+
 if __name__ == "__main__":
     cli()