diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 60170c65..e3951af7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -default_stages: [commit] +default_stages: [pre-commit] fail_fast: false exclude: | (?x)( diff --git a/camply/config/__init__.py b/camply/config/__init__.py index 39b843f1..4d5d6799 100644 --- a/camply/config/__init__.py +++ b/camply/config/__init__.py @@ -5,6 +5,7 @@ from .api_config import ( STANDARD_HEADERS, RecreationBookingConfig, + ReserveAmericaConfig, RIDBConfig, YellowstoneConfig, ) @@ -25,6 +26,7 @@ __all__ = [ "RecreationBookingConfig", "RIDBConfig", + "ReserveAmericaConfig", "STANDARD_HEADERS", "CampsiteContainerFields", "DataColumns", diff --git a/camply/config/api_config.py b/camply/config/api_config.py index f64156f8..4588398d 100644 --- a/camply/config/api_config.py +++ b/camply/config/api_config.py @@ -97,6 +97,14 @@ class RecreationBookingConfig(APIConfig): RATE_LIMITING = (1.01, 1.51) +class ReserveAmericaConfig(APIConfig): + """ + Reserve America API Configuration + """ + + # TODO: Add cookies or other authentication to configuration + + class UseDirectConfig(APIConfig): """ Reserve California API Configuration diff --git a/camply/providers/__init__.py b/camply/providers/__init__.py index a4a96fc4..a5885de8 100644 --- a/camply/providers/__init__.py +++ b/camply/providers/__init__.py @@ -13,6 +13,7 @@ RecreationDotGovTicket, RecreationDotGovTimedEntry, ) +from .reserve_america.reserveamerica_provider import ReserveAmerica from .usedirect.variations import ( AlabamaStateParks, ArizonaStateParks, @@ -49,6 +50,7 @@ AlabamaStateParks, FairfaxCountyParks, MinnesotaStateParks, + ReserveAmerica, ] __all__ = [ @@ -73,4 +75,5 @@ "AlabamaStateParks", "FairfaxCountyParks", "MinnesotaStateParks", + "ReserveAmerica", ] diff --git a/camply/providers/recreation_dot_gov/recdotgov_provider.py b/camply/providers/recreation_dot_gov/recdotgov_provider.py index 0fbee5df..120469a2 100644 --- 
a/camply/providers/recreation_dot_gov/recdotgov_provider.py +++ b/camply/providers/recreation_dot_gov/recdotgov_provider.py @@ -45,7 +45,7 @@ class RecreationDotGovBase(BaseProvider, ABC): def __init__(self, api_key: Optional[str] = None): """ - Initialize with Search Dates + Initialize API key and headers for the Recreation.gov API """ super().__init__() if api_key is None: diff --git a/camply/providers/reserve_america/__init__.py b/camply/providers/reserve_america/__init__.py new file mode 100644 index 00000000..9e309661 --- /dev/null +++ b/camply/providers/reserve_america/__init__.py @@ -0,0 +1,3 @@ +""" +ReserveAmerica __init__ +""" diff --git a/camply/providers/reserve_america/reserve_america_scraper/__init__.py b/camply/providers/reserve_america/reserve_america_scraper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/camply/providers/reserve_america/reserve_america_scraper/items.py b/camply/providers/reserve_america/reserve_america_scraper/items.py new file mode 100644 index 00000000..b9a0cd17 --- /dev/null +++ b/camply/providers/reserve_america/reserve_america_scraper/items.py @@ -0,0 +1,13 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class CampgroundAvailabilityItem(scrapy.Item): + parkId = scrapy.Field() + site = scrapy.Field() + date = scrapy.Field() + availability = scrapy.Field() diff --git a/camply/providers/reserve_america/reserve_america_scraper/middlewares.py b/camply/providers/reserve_america/reserve_america_scraper/middlewares.py new file mode 100644 index 00000000..a772762b --- /dev/null +++ b/camply/providers/reserve_america/reserve_america_scraper/middlewares.py @@ -0,0 +1,191 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +import logging +import time +from datetime import datetime, timedelta + +# useful for 
handling different item types with a single interface +from scrapy import signals +from scrapy.http import HtmlResponse +from selenium import webdriver +from selenium.webdriver.chrome.service import Service as ChromeService +from webdriver_manager.chrome import ChromeDriverManager + +from camply.containers.data_containers import AvailableCampsite + +logger = logging.getLogger(__name__) + +logging.getLogger("selenium.webdriver.remote.remote_connection").setLevel(logging.INFO) + + +class ReserveAmericaScraperSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it does not have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class CamplyReserveAmericaSpiderMiddleware: + """ + Spider middleware to convert scraped items (raw availability) into + `AvailableCampsite` objects. Accumulates them so they can be retrieved + by the spider or provider at the end of the crawl. + """ + + def __init__(self): + # Store processed campsite objects here + self.available_campsites = [] + + @classmethod + def from_crawler(cls, crawler): + """ + Create the middleware and connect signals. + """ + middleware = cls() + # Connect signals + crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(middleware.item_scraped, signal=signals.item_scraped) + crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed) + return middleware + + def spider_opened(self, spider): + logger.debug(f"Spider opened: {spider.name}") + + def item_scraped(self, item, spider): + """ + Convert the scraped item (if it's available) into an `AvailableCampsite`, + then store it in `self.available_campsites`. + """ + availability = item.get("availability", "").lower() + + # Only process items that are actually "Available" + if availability == "a": + # For example, parse the date from the item + date_str = item["date"] # e.g. 
'2025-04-01' + booking_date = datetime.strptime(date_str, "%Y-%m-%d") + # Create your `AvailableCampsite` object + campsite = AvailableCampsite( + campsite_id=item["site"], + booking_date=booking_date, + booking_end_date=booking_date + timedelta(days=1), + booking_nights=1, + campsite_site_name=str(item["site"]), + campsite_loop_name="Placeholder Loop Name", + # TODO: Replace with actual loop name + campsite_occupancy=[1, 1], + # TODO: Replace with actual occupancy + availability_status=item["availability"], + recreation_area="Placeholder Recreation Area", + # TODO: Replace with actual Recreation Area name + recreation_area_id=item["parkId"], + facility_name="Placeholder Facility Name", + # TODO: Replace with actual Facility name + facility_id=item["parkId"], + booking_url="placeholder.url", + # TODO: Replace with actual booking URL + ) + self.available_campsites.append(campsite) + + def spider_closed(self, spider, reason): + """ + When the spider finishes, optionally store the list of available_campsites + back onto the spider so the provider can retrieve them. + """ + spider.logger.info(f"Spider closed: {spider.name}, reason: {reason}") + spider.available_campsites = self.available_campsites + + +class HumanInTheDownloaderMiddleware: + def __init__(self): + self.driver = self.create_headful_driver() + # TODO: Only use the headful if a captcha is detected. 
+ # [ ] Save session (cookies) that can be passed between headless and headful drivers + # [ ] Write headless driver + # [ ] Detect if captcha is present and switch to headful driver + # [ ] Switch back to headless driver after captcha is solved + # [ ] Save session (cookies) to configuration file + + def create_headful_driver(self): + chrome_options = webdriver.ChromeOptions() + chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) + chrome_options.add_experimental_option("useAutomationExtension", False) + service = ChromeService(ChromeDriverManager().install()) + return webdriver.Chrome(service=service, options=chrome_options) + + def process_request(self, request, spider): + # Process only requests marked with meta['selenium']. + if not request.meta.get("selenium"): + return None + + self.driver.get(request.url) + time.sleep(3) # Wait for JavaScript elements to load. + body = self.driver.page_source + + # Check for a captcha. Adjust this check as needed. + if "captcha" in body.lower(): + spider.logger.info("Captcha detected!") + spider.logger.info( + "Please solve the captcha in the browser window, then press Enter to continue..." + ) + input("Press Enter after solving the captcha...") + body = self.driver.page_source + + # Attach the Selenium driver to the request meta so it's available later. + request.meta["driver"] = self.driver + + return HtmlResponse( + url=self.driver.current_url, body=body, encoding="utf-8", request=request + ) + + def process_response(self, request, response, spider): + # Return the response unmodified. 
+ return response + + def process_exception(self, request, exception, spider): + spider.logger.error(f"Exception in HumanInTheMiddleware: {exception}") + + def spider_closed(self, spider): + self.driver.quit() diff --git a/camply/providers/reserve_america/reserve_america_scraper/pipelines.py b/camply/providers/reserve_america/reserve_america_scraper/pipelines.py new file mode 100644 index 00000000..6b385e47 --- /dev/null +++ b/camply/providers/reserve_america/reserve_america_scraper/pipelines.py @@ -0,0 +1,33 @@ +from datetime import datetime + +import pandas as pd + + +class DataFramePipeline: + def __init__(self): + self.items = [] + + def process_item(self, item, spider): + self.items.append(dict(item)) + return item + + def close_spider(self, spider): + # Create a DataFrame + df = pd.DataFrame(self.items) + + # Ensure there is only one unique parkId + if "parkId" in df.columns: + unique_park_ids = df["parkId"].unique() + if len(unique_park_ids) != 1: + spider.logger.error("Multiple or no unique parkId values found.") + return + park_id = unique_park_ids[0] + else: + spider.logger.error("parkId column not found in data.") + return + + # Export to Parquet with parkId in filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + fpath = f"../data/availability_{park_id}_{timestamp}.parquet" + df.to_parquet(fpath, index=False) + spider.logger.info(f"Saved dataframe to {fpath}") diff --git a/camply/providers/reserve_america/reserve_america_scraper/settings.py b/camply/providers/reserve_america/reserve_america_scraper/settings.py new file mode 100644 index 00000000..660074da --- /dev/null +++ b/camply/providers/reserve_america/reserve_america_scraper/settings.py @@ -0,0 +1,95 @@ +# Scrapy settings for reserve_america_scraper project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "reserve_america_scraper" + +SPIDER_MODULES = ["reserve_america_scraper.spiders"] +NEWSPIDER_MODULE = "reserve_america_scraper.spiders" + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0" +) + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 1 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +DOWNLOAD_DELAY = 1 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +SPIDER_MIDDLEWARES = { + "camply.providers.reserve_america.reserve_america_scraper.middlewares.CamplyReserveAmericaSpiderMiddleware": 500, + "reserve_america_scraper.middlewares.ReserveAmericaScraperSpiderMiddleware": 600, +} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { + "reserve_america_scraper.middlewares.HumanInTheDownloaderMiddleware": 543, +} + +# 
Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "reserve_america_scraper.pipelines.DataFramePipeline": 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" diff --git a/camply/providers/reserve_america/reserve_america_scraper/spiders/__init__.py b/camply/providers/reserve_america/reserve_america_scraper/spiders/__init__.py new file mode 100644 index 00000000..ebd689ac --- /dev/null +++ b/camply/providers/reserve_america/reserve_america_scraper/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/camply/providers/reserve_america/reserve_america_scraper/spiders/campground_spider.py b/camply/providers/reserve_america/reserve_america_scraper/spiders/campground_spider.py new file mode 100644 index 00000000..ddb4439a --- /dev/null +++ b/camply/providers/reserve_america/reserve_america_scraper/spiders/campground_spider.py @@ -0,0 +1,326 @@ +import time +from datetime import datetime, timedelta +from urllib.parse import parse_qs, urlparse + +import scrapy +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.common.by import By + +from camply.providers.reserve_america.reserve_america_scraper.items import ( + CampgroundAvailabilityItem, +) + + +class CampgroundSpider(scrapy.Spider): + name = "campground" + allowed_domains = [ + "massdcrcamping.reserveamerica.com", + "go.aspiraconnect.com", + ] + + def __init__( + self, start_date=None, end_date=None, park_id="32608", *args, **kwargs + ): + super().__init__(*args, **kwargs) + + # Default start_date is today, default end_date is 6 weeks from start_date. + if start_date is None: + start_date = datetime.today().strftime("%m/%d/%Y") + self.start_date_str = start_date + self.start_date = datetime.strptime(start_date, "%m/%d/%Y").date() + + if end_date is None: + end_date = (self.start_date + timedelta(weeks=6)).strftime("%m/%d/%Y") + self.end_date_str = end_date + self.end_date = datetime.strptime(end_date, "%m/%d/%Y").date() + + self.park_id = park_id + + def start_requests(self): + # Update the URL to include the start_date instead of "null" + start_urls = [ + f"https://massdcrcamping.reserveamerica.com/campsiteCalendar.do?page=calendar&contractCode=MA&parkId={self.park_id}&calarvdate={self.start_date_str}&sitepage=true&startIdx=0", + # TODO: Implement generic reserveamerica.com. + # Note, calendar grid is slightly different for generic reserveamerica.com pages. 
+ ] + for url in start_urls: + yield scrapy.Request(url=url, callback=self.parse, meta={"selenium": True}) + + def parse(self, response): + # Process the current date window's data. + yield from self.parse_page(response) + # Scrape all pages in the current date window by navigating appropriately. + yield from self.scrape_all_pages_in_date_window(response) + # Finally, handle the "Next 2 Weeks" date pagination. + yield from self.paginate_date_window(response) + + def parse_page(self, response): + """ + Extract header information (month/year and day numbers) and process each campsite row, + yielding an item for each site's availability. + """ + daterange_div = response.css("#daterangediv") + if not daterange_div: + self.logger.warning("No daterangediv found in response") + with open("no_daterangediv_response.html", "wb") as f: + f.write("No daterangediv found in response".encode()) + f.write(response.body) + yield scrapy.Request(response.url, meta={"selenium": True}) + return + + # Extract month and year + month, year = self._extract_month_year(daterange_div) + + # Extract dates + dates = self._extract_dates(daterange_div, month, year) + + # Process campsite rows + yield from self._process_campsite_rows(daterange_div, dates, response) + + def _extract_month_year(self, daterange_div): + """ + Extract the month and year from the daterange_div. + """ + month_text = daterange_div.css( + "div.empty.top.rght .td.weeknav.month span::text" + ).get() + if month_text: + try: + dt = datetime.strptime(month_text.strip(), "%b %Y") + return dt.month, dt.year + except Exception as e: + self.logger.error(f"Error parsing month/year from '{month_text}': {e}") + return datetime.now().month, datetime.now().year + + def _extract_dates(self, daterange_div, month, year): + """ + Extract header cells for the day numbers and return a list of dates. 
+ """ + header_cells = daterange_div.css("#calendar .thead div.th.calendar") + dates = [] + for cell in header_cells: + day_text = cell.css("div.date::text").get() + if day_text: + try: + day = int(day_text.strip()) + date_obj = datetime(year, month, day).date() + dates.append(date_obj) + except Exception as e: + self.logger.error(f"Error parsing day '{day_text}': {e}") + dates.append(None) + else: + dates.append(None) + self.logger.info(f"Extracted dates: {dates}") + return dates + + def _process_campsite_rows(self, daterange_div, dates, response): + """ + Process each campsite row and yield items for each site's availability. + """ + rows = daterange_div.css("#calendar > div.br") + self.logger.info(f"Found {len(rows)} site rows") + qs = parse_qs(urlparse(response.url).query) + parkId = qs.get("parkId", [None])[0] + + for row in rows: + site = row.css("div.td.sn .siteListLabel a::text").get() + site = site.strip() if site else "" + availability_cells = row.css("div.td.status") + if len(availability_cells) != len(dates): + self.logger.warning( + f"Row with site {site}: number of availability cells ({len(availability_cells)}) does not match number of dates ({len(dates)})" + ) + for i, cell in enumerate(availability_cells): + if i < len(dates) and dates[i]: + avail = cell.css("::text").get() + avail = avail.strip() if avail else "" + item = CampgroundAvailabilityItem() + item["parkId"] = parkId + item["site"] = site + item["date"] = dates[i].isoformat() + item["availability"] = avail + yield item + + def scrape_all_pages_in_date_window(self, response): + """ + Check for the presence of next and previous pagination buttons. + - If next exists and previous does not, assume the page is the first page and navigate forward. + - If previous exists and next does not, assume the page is the last page and navigate backward. + - If both exist, raise an error. 
+ """ + daterange_div = response.css("#daterangediv") + next_page_href = daterange_div.css( + "span.pagenav a#resultNext_dr_top::attr(href)" + ).get() + prev_page_href = daterange_div.css( + "span.pagenav a#resultPrevious_dr_top::attr(href)" + ).get() + + if next_page_href and not prev_page_href: + self.logger.info("Detected first page of date window; navigating forward.") + yield from self.scrape_pages_forward(response) + elif prev_page_href and not next_page_href: + self.logger.info("Detected last page of date window; navigating backward.") + yield from self.scrape_pages_backward(response) + elif next_page_href and prev_page_href: + self.logger.error( + "Both next and previous pagination buttons exist. Ambiguous starting page." + ) + raise ValueError( + "Both next and previous pagination buttons exist. Cannot determine starting page." + ) + else: + self.logger.info("No pagination buttons found; single page date window.") + + def scrape_pages_forward(self, response): + """ + If on the first page, repeatedly click the next-page button to scrape forward + until no next-page button is found. + """ + driver = response.request.meta.get("driver") + if not driver: + self.logger.error("No Selenium driver found in response meta!") + return + + while True: + try: + next_button = driver.find_element( + By.CSS_SELECTOR, "a#resultNext_dr_top" + ) + if not next_button.is_displayed() or not next_button.is_enabled(): + self.logger.info("Next button not clickable; reached last page.") + break + except Exception: + self.logger.info("No next page button found; reached last page.") + break + + self.logger.info("Clicking next page button via Selenium.") + try: + actions = ActionChains(driver) + actions.move_to_element(next_button).click().perform() + time.sleep(3) # Consider using WebDriverWait for a more robust wait. 
+ new_page = driver.page_source + new_url = driver.current_url + new_response = scrapy.http.HtmlResponse( + url=new_url, + body=new_page, + encoding="utf-8", + request=response.request, + ) + yield from self.parse_page(new_response) + # Check if there's another next page. + daterange_div = new_response.css("#daterangediv") + next_page_href = daterange_div.css( + "span.pagenav a#resultNext_dr_top::attr(href)" + ).get() + if not next_page_href: + self.logger.info("No further next page found; reached last page.") + break + except Exception as e: + self.logger.error(f"Error clicking next page: {e}") + break + + def scrape_pages_backward(self, response): + """ + If on the last page, repeatedly click the previous-page button to scrape backward + until no previous-page button is found (i.e. reached the first page). + """ + driver = response.request.meta.get("driver") + if not driver: + self.logger.error("No Selenium driver found in response meta!") + return + + while True: + try: + prev_button = driver.find_element( + By.CSS_SELECTOR, "a#resultPrevious_dr_top" + ) + if not prev_button.is_displayed() or not prev_button.is_enabled(): + self.logger.info( + "Previous button not clickable; reached first page." + ) + break + except Exception: + self.logger.info("No previous page button found; reached first page.") + break + + self.logger.info("Clicking previous page button via Selenium.") + try: + actions = ActionChains(driver) + actions.move_to_element(prev_button).click().perform() + time.sleep(3) + new_page = driver.page_source + new_url = driver.current_url + new_response = scrapy.http.HtmlResponse( + url=new_url, + body=new_page, + encoding="utf-8", + request=response.request, + ) + yield from self.parse_page(new_response) + # Check if there's another previous page. 
+ daterange_div = new_response.css("#daterangediv") + prev_page_href = daterange_div.css( + "span.pagenav a#resultPrevious_dr_top::attr(href)" + ).get() + if not prev_page_href: + self.logger.info( + "No further previous page found; reached first page." + ) + break + except Exception as e: + self.logger.error(f"Error clicking previous page: {e}") + break + + def paginate_date_window(self, response): + """ + Handle date pagination ("Next 2 Weeks") using Selenium. + Click the Next 2 Weeks button if it exists and if the target date is within the allowed range. + """ + daterange_div = response.css("#daterangediv") + next_week_href = daterange_div.css( + "div.empty.top.rght a#nextWeek::attr(href)" + ).get() + if next_week_href: + parsed_url = urlparse(next_week_href) + params = parse_qs(parsed_url.query) + calarvdate_str = params.get("calarvdate", [None])[0] + if calarvdate_str: + try: + next_window_date = datetime.strptime( + calarvdate_str, "%m/%d/%Y" + ).date() + if next_window_date > self.end_date: + self.logger.info( + "Reached beyond end_date; stopping date pagination." 
+ ) + return + except Exception as e: + self.logger.error( + f"Error parsing calarvdate '{calarvdate_str}': {e}" + ) + try: + driver = response.request.meta.get("driver") + if not driver: + self.logger.error("No Selenium driver found in response meta!") + return + next_week_button = driver.find_element( + By.CSS_SELECTOR, "div.empty.top.rght a#nextWeek" + ) + self.logger.info("Clicking Next 2 Weeks button via Selenium.") + actions = ActionChains(driver) + actions.move_to_element(next_week_button).click().perform() + time.sleep(3) + new_page = driver.page_source + new_url = driver.current_url + new_response = scrapy.http.HtmlResponse( + url=new_url, + body=new_page, + encoding="utf-8", + request=response.request, + ) + yield from self.parse(new_response) + except Exception as e: + self.logger.error(f"Error during date pagination: {e}") + else: + self.logger.info("No further date pagination found.") diff --git a/camply/providers/reserve_america/reserveamerica_provider.py b/camply/providers/reserve_america/reserveamerica_provider.py new file mode 100644 index 00000000..09a83f86 --- /dev/null +++ b/camply/providers/reserve_america/reserveamerica_provider.py @@ -0,0 +1,129 @@ +""" +Reserve America Web Searching Utilities +""" + +import logging +from abc import ABC +from datetime import date, datetime +from typing import List, Optional, Union + +from scrapy.crawler import CrawlerProcess + +# TODO: Update ReserveAmericaConfig as needed +# TODO: Add and create other classes as needed +from camply.containers import CampgroundFacility +from camply.containers.data_containers import AvailableCampsite +from camply.providers.base_provider import BaseProvider +from camply.providers.reserve_america.reserve_america_scraper.spiders.campground_spider import ( + CampgroundSpider, +) +from camply.utils.general_utils import make_list + +logger = logging.getLogger(__name__) + +scrapy_base_settings = { + "BOT_NAME": "reserve_america_scraper", + "ROBOTSTXT_OBEY": False, + 
"CONCURRENT_REQUESTS": 1, + "DOWNLOAD_DELAY": 1, + "DOWNLOADER_MIDDLEWARES": { + "camply.providers.reserve_america.reserve_america_scraper.middlewares.HumanInTheDownloaderMiddleware": 543, + }, +} + + +class ReserveAmerica(BaseProvider, ABC): + """ + Python Class for Reserve America Web Searching + """ + + def __init__(self): + """ + Initialize the ReserveAmericaBase class. + """ + super().__init__() + # TODO: Initialize session/cookies + + # TODO return availability, see search_reserveamerica.py for more details + + def find_campgrounds( + self, + park_ids: Optional[List[str]] = None, + ) -> List[CampgroundFacility]: + """ + Get Campground metadata from ReserveAmerica + + Parameters + ---------- + park_ids: Optional[List[str]] + ReserveAmerica Park ID or List of IDs + + Returns + ------- + List[CampgroundFacility] + List of CampgroundFacility objects + """ + + found_campgrounds: List[CampgroundFacility] = [] + logger.debug(f"Finding campgrounds for park IDs: {park_ids}") + for park_id in make_list(park_ids): + campground = CampgroundFacility( + facility_name="Placeholder Facility Name", + # TODO: Replace with actual name + facility_id=park_id, + recreation_area="Placeholder Recreation Area", + recreation_area_id=park_id, + # TODO: Replace with actual recreation area ID + ) + found_campgrounds.append(campground) + logger.debug( + f"Found {len(found_campgrounds)} campgrounds for park IDs: {park_ids}" + ) + return found_campgrounds + + def get_campsites( + self, + park_id: int, + start_date: Union[datetime, date], + end_date: Union[datetime, date], + **spider_args, + ) -> List[AvailableCampsite]: + """ + Run the CampgroundSpider to scrape campground data, + returning a list of `AvailableCampsite`. 
+ """ + + start_date_str = start_date.strftime("%m/%d/%Y") + end_date_str = end_date.strftime("%m/%d/%Y") + + scrapy_settings = scrapy_base_settings.copy() + scrapy_settings.update( + [ + ( + "SPIDER_MIDDLEWARES", + { + "camply.providers.reserve_america.reserve_america_scraper.middlewares.CamplyReserveAmericaSpiderMiddleware": 543, + }, + ), + ] + ) + + # Create a CrawlerProcess using your project settings + process = CrawlerProcess(settings=scrapy_settings) + + # Run the spider + crawler = process.create_crawler(CampgroundSpider) + process.crawl( + crawler, + park_id=park_id, + start_date=start_date_str, + end_date=end_date_str, + **spider_args, + ) + + process.start() # Blocks until crawl is finished + + # Retrieve the accumulated results from the spider (populated by your new middleware) + scraped_campsites = getattr(crawler.spider, "available_campsites", []) + + return scraped_campsites diff --git a/camply/providers/reserve_america/scrapy.cfg b/camply/providers/reserve_america/scrapy.cfg new file mode 100644 index 00000000..75957fbb --- /dev/null +++ b/camply/providers/reserve_america/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = reserve_america_scraper.settings + +[deploy] +#url = http://localhost:6800/ +project = reserve_america_scraper diff --git a/camply/search/__init__.py b/camply/search/__init__.py index 2b8b6d3d..823c0935 100644 --- a/camply/search/__init__.py +++ b/camply/search/__init__.py @@ -13,6 +13,7 @@ SearchRecreationDotGovTicket, SearchRecreationDotGovTimedEntry, ) +from camply.search.search_reserveamerica import SearchReserveAmerica from camply.search.search_usedirect import ( SearchAlabamaStateParks, SearchArizonaStateParks, @@ -32,6 +33,7 @@ # Register Providers Here with their Search class __search_providers__: List[Type[BaseCampingSearch]] = [ SearchRecreationDotGov, + 
"""
ReserveAmerica search utilities
"""

import logging
from typing import Any, Dict, List, Optional, Union

from camply.containers import (
    AvailableCampsite,
    SearchWindow,
)
from camply.exceptions import SearchError
from camply.providers import ReserveAmerica
from camply.search.base_search import BaseCampingSearch
from camply.utils import logging_utils
from camply.utils.general_utils import make_list
from camply.utils.logging_utils import format_log_string

logger = logging.getLogger(__name__)


class SearchReserveAmerica(BaseCampingSearch):
    """
    Searches on ReserveAmerica.com for Campsites
    """

    provider_class = ReserveAmerica
    list_campsites_supported: bool = False

    def __init__(
        self,
        search_window: Union[SearchWindow, List[SearchWindow]],
        campgrounds: Optional[Union[List[int], int]] = None,
        campsites: Optional[Union[List[int], int]] = None,
        weekends_only: bool = False,
        nights: int = 1,
        offline_search_path: Optional[str] = None,
        **kwargs,
    ) -> None:
        """
        Initialize with Search Parameters

        Parameters
        ----------
        search_window: Union[SearchWindow, List[SearchWindow]]
            Search Window tuple containing start date and End Date
        campgrounds: Optional[Union[List[int], int]]
            Park ID or List of Park IDs (required - see Raises)
        campsites: Optional[Union[List[int], int]]
            Site ID or List of Site IDs used to filter results
        weekends_only: bool
            Whether to only search for Camping availabilities on the weekends
            (Friday / Saturday nights)
            # TODO: Implement weekends_only
        nights: int
            minimum number of consecutive nights to search per campsite,
            defaults to 1
            # TODO: Implement number of nights
        offline_search_path: Optional[str]
            Reserved for offline search support. Currently accepted but not
            forwarded to the base class.
            # TODO: Wire up offline search support

        Raises
        ------
        ValueError
            When no campground IDs are provided

        Returns
        -------
        None
        """
        super().__init__(
            search_window=search_window,
            weekends_only=weekends_only,
            nights=nights,
            # offline_search=offline_search,
            # offline_search_path=offline_search_path,
            **kwargs,
        )
        # Explicit validation instead of `assert` - assertions are stripped
        # when Python runs with the -O flag.
        if campgrounds in [[], None]:
            raise ValueError(
                f"You must provide a Campground ID to {self.provider_class.__name__}"
            )

        self._campground_ids: List[int] = make_list(campgrounds)
        self.campgrounds = self.campsite_finder.find_campgrounds(
            park_ids=self._campground_ids,
        )
        self.campsites = make_list(campsites)
        # TODO: Validate campsites requested are within campgrounds requested

    def get_all_campsites(self, **kwargs: Dict[str, Any]) -> List[AvailableCampsite]:
        """
        Retrieve All Available Campsites from ReserveAmerica

        Parameters
        ----------
        kwargs: Dict[str, Any]
            Additional keyword arguments forwarded to the provider's
            `get_campsites` scraper call.

        Returns
        -------
        List[AvailableCampsite]

        Raises
        ------
        SearchError
            When no campgrounds were resolved to search against
        """
        if len(self.campgrounds) == 0:
            error_message = "No campgrounds found to search"
            logger.error(error_message)
            raise SearchError(error_message)

        logger.info("Searching across %d campgrounds", len(self.campgrounds))

        for campground in self.campgrounds:
            log_str = format_log_string(campground)
            logger.info(" %s", log_str)

        campsites_found: List[AvailableCampsite] = []

        # Search every campground across every requested window
        for search_window in self.search_window:
            for campground in self.campgrounds:
                start_date = search_window.start_date
                end_date = search_window.end_date

                logger.info(
                    f"Searching {campground.facility_name} "
                    f"({campground.facility_id}) for availability: from "
                    f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
                )

                campsites = self.campsite_finder.get_campsites(
                    park_id=campground.facility_id,
                    start_date=start_date,
                    end_date=end_date,
                    **kwargs,
                )

                logger.info(
                    f"\t{logging_utils.get_emoji(campsites)}\t"
                    f"{len(campsites)} campsites found for {campground.facility_name} "
                    f"from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
                )

                # When specific site IDs were requested, keep only those
                if self.campsites not in [None, []]:
                    campsites = [
                        campsite_obj
                        for campsite_obj in campsites
                        if int(campsite_obj.campsite_id) in self.campsites
                    ]
                campsites_found += campsites

        # De-duplicate overlapping dates and consolidate consecutive nights
        campsite_df = self.campsites_to_df(campsites=campsites_found)
        campsite_df_validated = self._filter_date_overlap(campsites=campsite_df)
        compiled_campsite_df = self._consolidate_campsites(
            campsite_df=campsite_df_validated, nights=self.nights
        )
        compiled_campsites = self.df_to_campsites(campsite_df=compiled_campsite_df)

        return compiled_campsites

    def list_campsite_units(self):
        """
        List Campsite Units

        Returns
        -------
        List[AvailableCampsite]
        """
        logger.info("Listing campsite units")

        # TODO: Implement the logic to list campsite units from ReserveAmerica
        return super().list_campsite_units()
warn_redundant_casts = true warn_unused_ignores = true +[tool.pylint.MESSAGES_CONTROL] +disable = ["logging-fstring-interpolation"] + [tool.pytest.ini_options] filterwarnings = [ "error", diff --git a/requirements.txt b/requirements.txt index b69a7818..d03b18e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,10 @@ # - requests~=2.31.0 # - rich-click~=1.6.1 # - rich~=13.3.2 +# - scrapy~=2.12.0 +# - selenium~=4.30.0 # - tenacity~=8.2.2 +# - webdriver-manager~=4.0.2 # - apprise~=1.3.0 # - trogon~=0.4.0 # - twilio~=7.17.0 @@ -20,10 +23,23 @@ apprise==1.3.0 # via hatch.envs.default +attrs==25.3.0 + # via + # outcome + # service-identity + # trio + # twisted +automat==24.8.1 + # via twisted certifi==2023.11.17 # via # apprise # requests + # selenium +cffi==1.17.1 + # via + # cryptography + # trio charset-normalizer==3.3.2 # via requests click==8.1.7 @@ -32,12 +48,53 @@ click==8.1.7 # apprise # rich-click # trogon +colorama==0.4.6 + # via click +constantly==23.10.4 + # via twisted +cryptography==44.0.2 + # via + # pyopenssl + # scrapy + # service-identity +cssselect==1.3.0 + # via + # parsel + # scrapy +defusedxml==0.7.1 + # via scrapy fake-useragent==1.4.0 # via hatch.envs.default +filelock==3.18.0 + # via tldextract +h11==0.14.0 + # via wsproto +hyperlink==21.0.0 + # via twisted idna==3.6 - # via requests + # via + # hyperlink + # requests + # tldextract + # trio +incremental==24.7.2 + # via twisted +itemadapter==0.11.0 + # via + # itemloaders + # scrapy +itemloaders==1.3.2 + # via scrapy +jmespath==1.0.1 + # via + # itemloaders + # parsel linkify-it-py==2.0.2 # via markdown-it-py +lxml==5.3.2 + # via + # parsel + # scrapy markdown==3.5.1 # via apprise markdown-it-py==2.2.0 @@ -53,18 +110,49 @@ numpy==1.26.3 # via pandas oauthlib==3.2.2 # via requests-oauthlib +outcome==1.3.0.post0 + # via + # trio + # trio-websocket +packaging==24.2 + # via + # parsel + # scrapy + # webdriver-manager pandas==2.1.4 # via hatch.envs.default +parsel==1.10.0 + # via + # 
itemloaders + # scrapy +protego==0.4.0 + # via scrapy +pyasn1==0.6.1 + # via + # pyasn1-modules + # service-identity +pyasn1-modules==0.4.2 + # via service-identity +pycparser==2.22 + # via cffi pydantic==1.10.13 # via hatch.envs.default +pydispatcher==2.0.7 + # via scrapy pygments==2.17.2 # via rich pyjwt==2.8.0 # via twilio +pyopenssl==25.0.0 + # via scrapy +pysocks==1.7.1 + # via urllib3 python-dateutil==2.8.2 # via pandas python-dotenv==1.0.0 - # via hatch.envs.default + # via + # hatch.envs.default + # webdriver-manager pytz==2023.3.post1 # via # hatch.envs.default @@ -74,14 +162,21 @@ pyyaml==6.0.1 # via # hatch.envs.default # apprise +queuelib==1.8.0 + # via scrapy ratelimit==2.2.1 # via hatch.envs.default requests==2.31.0 # via # hatch.envs.default # apprise + # requests-file # requests-oauthlib + # tldextract # twilio + # webdriver-manager +requests-file==2.1.0 + # via tldextract requests-oauthlib==1.3.1 # via apprise rich==13.3.5 @@ -91,23 +186,65 @@ rich==13.3.5 # textual rich-click==1.6.1 # via hatch.envs.default +scrapy==2.12.0 + # via hatch.envs.default +selenium==4.30.0 + # via hatch.envs.default +service-identity==24.2.0 + # via scrapy six==1.16.0 # via python-dateutil +sniffio==1.3.1 + # via trio +sortedcontainers==2.4.0 + # via trio tenacity==8.2.3 # via hatch.envs.default textual==0.47.1 # via trogon +tldextract==5.1.3 + # via scrapy +trio==0.29.0 + # via + # selenium + # trio-websocket +trio-websocket==0.12.2 + # via selenium trogon==0.4.0 # via hatch.envs.default twilio==7.17.0 # via hatch.envs.default +twisted==24.11.0 + # via scrapy typing-extensions==4.9.0 # via # pydantic + # pyopenssl + # selenium # textual + # twisted tzdata==2023.4 # via pandas uc-micro-py==1.0.2 # via linkify-it-py urllib3==2.1.0 - # via requests + # via + # requests + # selenium +w3lib==2.3.1 + # via + # parsel + # scrapy +webdriver-manager==4.0.2 + # via hatch.envs.default +websocket-client==1.8.0 + # via selenium +wsproto==1.2.0 + # via trio-websocket 
+zope-interface==7.2 + # via + # scrapy + # twisted + +# The following packages are considered to be unsafe in a requirements file: +# setuptools