diff --git a/.gitignore b/.gitignore index 2778f5e..6f06558 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ venv.bak/ *.json +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index b35932f..6506fc7 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,15 @@ To crawl the International Hub for SCP Items and save to a custom location: scrapy crawl scp_int -o scp_international_items.json ``` +To crawl pages tagged as `supplement` and save to a custom location: + +```bash +scrapy crawl scp_supplement -o scp_supplement.json +``` + ## Raw Content Structure -There are two types of content downloaded- SCP Items and SCP Tales. +There are multiple types of content downloaded (Items, Tales, GOI formats, and Supplements). All content (both SCP Items and Tales) contain the following: @@ -66,6 +72,7 @@ The crawler generates a series of json files containing an array of objects repr | scp_titles.json | Main | Title | scp | | scp_hubs.json | Main | Hub | scp | | scp_tales.json | Main | Tale | scp | +| scp_supplement.json | Main | Supplement | scp | | scp_int.json | International | Item | scp_int | | scp_int_titles.json | International | Title | scp_int | | scp_int_tales.json | International | Tale | scp_int | @@ -76,7 +83,9 @@ To regenerate all files run `make fresh`. ## Post Processed Data -The postproc system takes the Titles, Hubs, Items, and Tales and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there. +The postproc system takes Titles, Hubs, Items, Tales, GOI, and Supplements and uses them to generate a comprehensive set of objects. It combines and cross references data and expands on the data already there. + +Supplements are written to `data/processed/supplement/` and include additional fields like `parent_scp` and `parent_tale`. 
# Base URL of the main wiki under either scheme; stripping it leaves the page slug.
_SUPPLEMENT_URL_PREFIX = re.compile(r"^https?://scp-wiki\.wikidot\.com/")
# An SCP designation such as "scp-173" anywhere in the slug.
_PARENT_SCP_PATTERN = re.compile(r"scp-\d+", re.IGNORECASE)
# A slug of the form "<series-name>-<number>", anchored at both ends.
_PARENT_TALE_PATTERN = re.compile(r"([a-z\-]+)-\d+$")


def _supplement_parents(link):
    """Return ``(parent_scp, parent_tale)`` derived from a supplement's link slug.

    ``parent_scp`` is the first ``scp-<digits>`` occurrence in the slug,
    upper-cased, or ``None`` when there is none. ``parent_tale`` is the
    non-numeric prefix of a ``<prefix>-<digits>`` slug, or ``None``.
    """
    scp_match = _PARENT_SCP_PATTERN.search(link)
    parent_scp = scp_match.group(0).upper() if scp_match else None

    tale_match = _PARENT_TALE_PATTERN.match(link)
    parent_tale = tale_match.group(1) if tale_match else None
    # A bare SCP slug ("scp-173") also fits the "<prefix>-<digits>" shape; its
    # prefix is not a tale series, so do not report "scp" as one.
    if parent_tale == "scp":
        parent_tale = None

    return parent_scp, parent_tale


@cli.command()
def run_postproc_supplement():
    """Post-process crawled supplement pages into ``data/processed/supplement``.

    Reads ``data/scp_supplement.json`` (relative to ``cwd``), enriches every
    entry with ``images``, ``hubs``, ``raw_source``, a processed ``history``,
    ``created_at``/``creator`` (taken from the first history entry, or
    ``"unknown"`` when the history is empty), a normalized ``link`` and the
    ``parent_scp``/``parent_tale`` fields, then writes two files:

    * ``content_supplement.json`` - the full entries keyed by link.
    * ``index.json`` - the same entries without ``raw_content`` and
      ``raw_source``, each pointing at its content file via ``content_file``.
    """
    processed_path = Path(cwd + "/data/processed/supplement")
    os.makedirs(processed_path, exist_ok=True)

    print("Processing Supplement list.")

    supplement_list = from_file(cwd + "/data/scp_supplement.json")
    supplements = {}
    for supplement in tqdm(supplement_list, smoothing=0):

        supplement["images"] = get_images(supplement["raw_content"])
        # Hubs are looked up with the link as crawled, before it is
        # re-derived from the URL below.
        supplement["hubs"] = get_hubs(supplement["link"])
        supplement["raw_source"] = get_wiki_source(supplement["page_id"], supplement["domain"])

        # Convert history dict to list and sort by date.
        supplement["history"] = process_history(supplement["history"])

        if supplement["history"]:
            first_revision = supplement["history"][0]
            supplement["created_at"] = first_revision["date"]
            supplement["creator"] = first_revision["author"]
        else:
            supplement["created_at"] = "unknown"
            supplement["creator"] = "unknown"

        # Strip the wiki base URL regardless of scheme so http:// responses
        # yield the same slug as https:// ones.
        supplement["link"] = _SUPPLEMENT_URL_PREFIX.sub("", supplement["url"])

        supplement["parent_scp"], supplement["parent_tale"] = _supplement_parents(
            supplement["link"]
        )

        supplements[supplement["link"]] = supplement

    to_file(supplements, processed_path / "content_supplement.json")

    # The index reuses the same entry dicts, so the bulky fields are removed
    # only after the full content file has been written.
    for supplement in supplements.values():
        del supplement["raw_content"]
        del supplement["raw_source"]
        supplement["content_file"] = "content_supplement.json"

    to_file(supplements, processed_path / "index.json")
class ScpSupplementSpider(CrawlSpider, WikiMixin):
    """Crawl spider that collects wiki pages tagged ``supplement``.

    Crawling starts at the tag listing page for ``supplement``. Links back to
    that listing are followed without a callback; every other extracted link
    is handed to :meth:`parse_supplement`, which discards pages that do not
    carry the tag.
    """

    name = "scp_supplement"

    domain = DOMAIN
    allowed_domains = [DOMAIN]

    start_urls = [
        f"http://{DOMAIN}/system:page-tags/tag/supplement",
    ]

    rules = (
        Rule(LinkExtractor(allow=[re.escape("system:page-tags/tag/supplement")])),
        Rule(LinkExtractor(allow=[r".*"]), callback="parse_supplement"),
    )

    def parse_supplement(self, response, original_link=None):
        """Build an ``ScpSupplement`` item from ``response`` if it is a supplement page.

        Returns ``None`` when the page has no content or tags, or when it is
        not tagged ``supplement``. Returns whatever ``follow_splash_redirects``
        produces when that is truthy. Otherwise returns the history request
        created by ``get_history_request`` for the populated item.

        ``original_link``, when given, is stored as the item's ``link`` in
        place of the simple link derived from ``response.url``.
        """
        self.logger.debug("Reviewing Potential SCP Supplement page: %s", response.url)
        page_content = self.get_content(response)
        page_tags = self.get_tags(response)

        if not (page_content and page_tags):
            return None

        # NOTE(review): follow_splash_redirects is defined outside this block;
        # presumably it returns a follow-up request for splash pages - confirm.
        splash_redirect = self.follow_splash_redirects(
            response, page_tags, self.parse_supplement
        )
        if splash_redirect:
            return splash_redirect

        if "supplement" not in page_tags:
            return None

        self.logger.info("Processing SCP Supplement page: %s", response.url)
        soup = BeautifulSoup(page_content, "lxml")

        # Keyword order mirrors the field order the item is populated in.
        item = ScpSupplement(
            title=self.get_title(response),
            url=response.url,
            domain=self.domain,
            link=original_link or self.get_simple_link(response.url),
            tags=page_tags,
            page_id=self.get_page_id(response),
            rating=get_rating(response),
            raw_content=str(clean_content_soup(soup)),
            references=self.get_content_references(response),
        )
        return self.get_history_request(item["page_id"], 1, item)