diff --git a/.github/workflows/scp-items.yml b/.github/workflows/scp-items.yml new file mode 100644 index 0000000..a20822f --- /dev/null +++ b/.github/workflows/scp-items.yml @@ -0,0 +1,83 @@ +name: Crawl SCP Wiki + +on: + workflow_dispatch: +# schedule: +# - cron: "0 0 * * *" + # push: + # branches: + # - main + # paths: + # - .github/workflows/scp-items.yml + pull_request: + branches: + - main + paths: + - .github/workflows/scp-items.yml + +permissions: + contents: write + +jobs: + update-main-scp: + runs-on: ubuntu-latest + steps: + - name: "Checkout Crawler" + uses: actions/checkout@v6 + + - name: "Clone API Repository" + uses: actions/checkout@v6 + with: + repository: heroheman/scp-api + path: scp-api + + - name: "Setup Python" + uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - name: "Install Crawler" + run: make install + + - name: "Crawl Titles" + run: make data/scp_titles.json + + - name: "Crawl Hubs" + run: make data/scp_hubs.json + + - name: "Crawl Items" + run: make data/scp_items.json + + - name: "Process Items" + run: make data/processed/items + + - name: "Crawl Tales" + run: make data/scp_tales.json + + - name: "Process Tales" + run: make data/processed/tales + + - name: "Crawl GOI" + run: make data/goi.json + + - name: "Process GOI" + run: make data/processed/goi + + - name: "Crawl Supplements" + run: make data/scp_supplement.json + + - name: "Process Supplements" + run: make data/processed/supplement + + - name: "Move Files into API" + run: cp -Rf ./data/processed/* ./scp-api/docs/data/scp/ + + # - name: "Push" + # shell: bash + # run: > + # cd scp-api; + # ./bin/push.sh; + + # env: + # GIT_USER: "SCP Bot" + # GIT_EMAIL: "scp@tedivm.com" diff --git a/.gitignore b/.gitignore index 2778f5e..6f06558 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ venv.bak/ *.json +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index b35932f..6506fc7 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,15 @@ To 
crawl the International Hub for SCP Items and save to a custom location: scrapy crawl scp_int -o scp_international_items.json ``` +To crawl pages tagged as `supplement` and save to a custom location: + +```bash +scrapy crawl scp_supplement -o scp_supplement.json +``` + ## Raw Content Structure -There are two types of content downloaded- SCP Items and SCP Tales. +There are multiple types of content downloaded (Items, Tales, GOI formats, and Supplements). All content (both SCP Items and Tales) contain the following: @@ -66,6 +72,7 @@ The crawler generates a series of json files containing an array of objects repr | scp_titles.json | Main | Title | scp | | scp_hubs.json | Main | Hub | scp | | scp_tales.json | Main | Tale | scp | +| scp_supplement.json | Main | Supplement | scp | | scp_int.json | International | Item | scp_int | | scp_int_titles.json | International | Title | scp_int | | scp_int_tales.json | International | Tale | scp_int | @@ -76,7 +83,9 @@ To regenerate all files run `make fresh`. ## Post Processed Data -The postproc system takes the Titles, Hubs, Items, and Tales and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there. +The postproc system takes Titles, Hubs, Items, Tales, GOI, and Supplements and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there. + +Supplements are written to `data/processed/supplement/` and include additional fields like `parent_scp` and `parent_tale`. 
def process_history(history):
    """Normalize a page's revision history into a list sorted oldest-first.

    Args:
        history: Either a dict whose values are revision dicts, a list of
            revision dicts, or a falsy / unsupported value.

    Returns:
        A new list of the revision dicts, sorted ascending by their "date".
        Date strings in the form "%d %b %Y %H:%M" are replaced in place by
        ``datetime`` objects; strings that do not parse are left untouched.
        Entries that are not dicts are dropped. An empty list is returned
        when ``history`` is falsy or is neither a dict nor a list.
    """
    if not history:
        return []

    if isinstance(history, dict):
        candidates = history.values()
    elif isinstance(history, list):
        candidates = history
    else:
        return []

    # Drop non-dict entries up front: leaving them in the list would make the
    # sort key (and every caller that indexes revisions by key) blow up.
    revisions = [revision for revision in candidates if isinstance(revision, dict)]

    for revision in revisions:
        revision_date = revision.get("date")
        if isinstance(revision_date, str):
            try:
                revision["date"] = datetime.strptime(revision_date, "%d %b %Y %H:%M")
            except ValueError:
                # Keep original value if parsing fails.
                pass

    def _sort_key(revision):
        # Missing or unparsed dates sort first; returning datetime.min keeps
        # the key type homogeneous so str/datetime are never compared.
        revision_date = revision.get("date")
        return revision_date if isinstance(revision_date, datetime) else datetime.min

    revisions.sort(key=_sort_key)
    return revisions
@cli.command()
def run_postproc_supplement():
    """Post-process crawled supplement pages into ``data/processed/supplement``.

    Reads ``data/scp_supplement.json`` and, for each record, attaches images,
    hubs and wiki source, normalizes the revision history, and derives
    ``created_at``, ``creator``, ``link``, ``parent_scp`` and ``parent_tale``.

    Two files are written to the processed directory:
      * ``content_supplement.json`` - full records keyed by link.
      * ``index.json`` - the same records without ``raw_content`` and
        ``raw_source``, each with a ``content_file`` pointer.
    """
    processed_path = Path(cwd + "/data/processed/supplement")
    os.makedirs(processed_path, exist_ok=True)

    print("Processing Supplement list.")

    content_file = "content_supplement.json"
    # Compiled once; the patterns do not change between records.
    parent_scp_pattern = re.compile(r"scp-\d+", re.IGNORECASE)
    parent_tale_pattern = re.compile(r"([a-z\-]+)-\d+$")

    supplement_list = from_file(cwd + "/data/scp_supplement.json")
    supplements = {}
    for supplement in tqdm(supplement_list, smoothing=0):

        supplement["images"] = get_images(supplement["raw_content"])
        supplement["hubs"] = get_hubs(supplement["link"])
        supplement["raw_source"] = get_wiki_source(supplement["page_id"], supplement["domain"])

        # Convert history dict to list and sort by date. Read with .get() like
        # the hub/item/tale post-processors so a record without the key yields
        # an empty history instead of a KeyError.
        supplement["history"] = process_history(supplement.get("history"))

        if supplement["history"]:
            first_revision = supplement["history"][0]
            supplement["created_at"] = first_revision["date"]
            supplement["creator"] = first_revision["author"]
        else:
            supplement["created_at"] = "unknown"
            supplement["creator"] = "unknown"

        supplement["link"] = supplement["url"].replace("https://scp-wiki.wikidot.com/", "")

        # Extract parent SCP from the link (first "scp-<digits>" anywhere in it).
        scp_match = parent_scp_pattern.search(supplement["link"])
        supplement["parent_scp"] = scp_match.group(0).upper() if scp_match else None

        # Extract parent tale series from link ("<series>-<digits>").
        # NOTE(review): this also matches plain item slugs such as "scp-173"
        # (yielding "scp") - confirm that is intended.
        tale_match = parent_tale_pattern.match(supplement["link"])
        supplement["parent_tale"] = tale_match.group(1) if tale_match else None

        supplements[supplement["link"]] = supplement

    to_file(supplements, processed_path / content_file)

    # The index carries metadata only; the bulky raw fields stay in the
    # content file referenced by "content_file".
    for supplement in supplements.values():
        supplement.pop("raw_content", None)
        supplement.pop("raw_source", None)
        supplement["content_file"] = content_file

    to_file(supplements, processed_path / "index.json")
Keys=%s", + item.get("url"), + getattr(response, "status", None), + history_page, + list(history.keys()) if isinstance(history, dict) else type(history), + ) + return self.get_page_source_request(page_id, item) + + body = history.get("body") + if not body: + self.logger.error( + "Empty 'body' in history lookup for %s (status=%s, page=%s)", + item.get("url"), + getattr(response, "status", None), + history_page, + ) + return self.get_page_source_request(page_id, item) + + soup = BeautifulSoup(body, "lxml") + if soup.table is None: + self.logger.error( + "Missing