diff --git a/legistar/events.py b/legistar/events.py
index 839a1e1..d337044 100644
--- a/legistar/events.py
+++ b/legistar/events.py
@@ -1,4 +1,3 @@
-from abc import ABCMeta, abstractmethod
 import time
 import datetime
 from collections import deque
@@ -73,7 +72,7 @@ def events(self, follow_links=True, since=None):
         # we might revisit the same event. So, we keep track of
         # the last few events we've visited in order to
         # make sure we are not revisiting
-        scraped_events = deque([], maxlen=10)
+        scraped_events = deque([], maxlen=100)
 
         current_year = self.now().year
 
@@ -93,7 +92,7 @@ def events(self, follow_links=True, since=None):
         for year in range(current_year + 1, since_year, -1):
             no_events_in_year = True
 
-            for page in self.eventPages(year):
+            for idx, page in enumerate(self.eventPages(year)):
                 events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0]
                 for event, _, _ in self.parseDataTable(events_table):
                     ical_url = event['iCalendar']['url']
@@ -171,7 +170,7 @@ def _get_ecomment_link(self, link):
         return self.ecomment_dict.get(event_id, None)
 
 
-class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta):
+class LegistarAPIEventScraper(LegistarAPIScraper):
     webscraper_class = LegistarEventsScraper
 
     WEB_RETRY_EVENTS = 3
@@ -197,10 +196,6 @@ def _init_webscraper(self):
 
         return webscraper
 
-    @abstractmethod
-    def _get_web_event(self, api_event):
-        pass
-
     def api_events(self, since_datetime=None):
         # scrape from oldest to newest. This makes resuming big
         # scraping jobs easier because upon a scrape failure we can
@@ -315,25 +310,6 @@ def minutes(self, event):
                     self._suppress_item_matter(item, minutes_url)
                     yield item
 
-    def _suppress_item_matter(self, item, agenda_url):
-        '''
-        Agenda items in Legistar do not always display links to
-        associated matter files even if the same agenda item
-        in the API references a Matter File. The agenda items
-        we scrape should honor the suppression on the Legistar
-        agendas.
-
-        This is also practical because matter files that are hidden
-        in the Legistar Agenda do not seem to available for scraping
-        on Legistar or through the API
-
-        Since we are not completely sure that the same suppression
-        logic should be used for all Legislative Bodies, this method
-        is currently just a hook for being overridden in particular
-        scrapers. As of now, at least LA Metro uses this hook.
-        '''
-        pass
-
     def rollcalls(self, event):
         for item in self.agenda(event):
             if item['EventItemRollCallFlag']:
@@ -354,6 +330,25 @@ def addDocs(self, e, events, doc_type):
             except ValueError:
                 pass
 
+    def _suppress_item_matter(self, item, agenda_url):
+        '''
+        Agenda items in Legistar do not always display links to
+        associated matter files, even if the same agenda item
+        in the API references a Matter File. The agenda items
+        we scrape should honor the suppression on the Legistar
+        agendas.
+
+        This is also practical because matter files that are hidden
+        in the Legistar agenda do not seem to be available for scraping
+        on Legistar or through the API.
+
+        Since we are not completely sure that the same suppression
+        logic should be used for all legislative bodies, this method
+        is currently just a hook for overriding in particular
+        scrapers. As of now, at least LA Metro uses this hook.
+        '''
+        pass
+
     def _event_status(self, event):
         '''Events can have a status of tentative, confirmed, cancelled,
         or passed (http://docs.opencivicdata.org/en/latest/data/event.html). By
@@ -367,11 +362,20 @@ def _event_status(self, event):
         status = 'confirmed'
 
         return status
-
-
-class LegistarAPIEventScraper(LegistarAPIEventScraperBase):
+
+    def _not_in_web_interface(self, event):
+        '''Occasionally, an event will appear in the API before it appears in
+        the web interface. This method checks attributes of the API event that
+        tell us whether the given event is one of those cases, returning True
+        if so, and False otherwise. Available for override in jurisdictional
+        scrapers.
+        '''
+        return False
 
     def _get_web_event(self, api_event):
+        if self._not_in_web_interface(api_event):
+            return None
+
         return self.web_detail(api_event)
 
     def web_detail(self, event):
@@ -405,12 +409,12 @@ def web_detail(self, event):
         return event_page_details
 
 
-class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase):
-    '''
-    There are some inSite sites that have information that only appears
-    event listing page, like NYC's 'Meeting Topic.' This scraper visits
-    the listing page and attempts to zip API and web events together
-    '''
+class WebCalendarFallbackMixin:
+    """
+    Sometimes, events are visible on the web calendar before their detail
+    link is accessible. Use this mixin to check the web calendar for
+    events whose detail link cannot be accessed.
+    """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -422,20 +426,35 @@ def __init__(self, *args, **kwargs):
         # Instantiate dictionary where events from generator are stored as they
         # are scraped.
         self._scraped_events = {}
-    
+
     def _get_web_event(self, api_event):
         if self._not_in_web_interface(api_event):
             return None
-        else:
-            # None if entire web calendar scraped but API event not found
-            return self.web_results(api_event)
+
+        web_event = None
+
+        if not self._detail_page_not_available(api_event):
+            web_event = super()._get_web_event(api_event)
+
+        if web_event is None:
+            web_event = self.web_results(api_event)
+
+        return web_event
+
+    def _detail_page_not_available(self, api_event):
+        """
+        Sometimes, we can know from the API event that a detail link is
+        not available or will be invalid, so we can skip trying to access
+        it. Available for override in jurisdictional scrapers.
+        """
+        return False
 
     def web_results(self, event):
         api_key = (event['EventBodyName'].strip(), event['start'])
 
-        # Check the cache of events we've already scraped from the web interface
-        # for the API event at hand.
+        # Check the cache of events we've already scraped from the web
+        # interface for the API event at hand.
         if api_key in self._scraped_events:
             return self._scraped_events[api_key]
@@ -453,29 +472,21 @@ def _scrapeWebCalendar(self):
         chronological order.
         '''
         for event, _ in self._webscraper.events(follow_links=False):
-            event_key = self._event_key(event, self._webscraper)
+            event_key = self._event_key(event)
             yield event_key, event
 
-    def _event_key(self, event, web_scraper):
+    def _event_key(self, event):
         '''Since Legistar InSite contains more information about events
         than are available in the API, we need to scrape both. Then, we
         have to line them up. This method makes a key that should
         uniquely identify every event and will allow us to link events
         from the two data sources.
         '''
-        response = web_scraper.get(event['iCalendar']['url'], verify=False)
-        event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt
+        response = self._webscraper.get(event['iCalendar']['url'], verify=False)
+        event_time = self._webscraper.ical(response.text).subcomponents[0]['DTSTART'].dt
         event_time = pytz.timezone(self.TIMEZONE).localize(event_time)
 
         key = (event['Name']['label'], event_time)
 
         return key
-
-    def _not_in_web_interface(self, event):
-        '''Occasionally, an event will appear in the API, but not in the web
-        interface. This method checks attributes of the API event that tell us
-        whether the given event is one of those cases, returning True if so, and
-        False otherwise. Available for override in jurisdictional scrapers.
-        '''
-        return False
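
Usage sketch (not part of the patch): one way a jurisdictional scraper
might compose the new mixin with the base scraper. The class name, the
TIMEZONE value, and both override bodies are hypothetical illustrations,
not code from this diff.

    from legistar.events import LegistarAPIEventScraper, WebCalendarFallbackMixin


    class ExampleEventScraper(WebCalendarFallbackMixin, LegistarAPIEventScraper):
        # The mixin must precede the base class: its _get_web_event() runs
        # first, reaches the detail-page lookup via super(), and falls back
        # to the web calendar only when that lookup returns None.
        TIMEZONE = 'US/Eastern'  # placeholder value

        def _not_in_web_interface(self, event):
            # Hypothetical override: treat API events without an
            # EventInSiteURL as API-only, so no web scrape is attempted.
            return not event.get('EventInSiteURL')

        def _detail_page_not_available(self, api_event):
            # Hypothetical override: return True for events whose detail
            # links are known to be dead, skipping straight to the
            # web calendar fallback.
            return False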
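How the fallback lines events up, with made-up values: web_results() keys
the API side on (EventBodyName, start), while _event_key() keys the web
side on the calendar row's Name label plus the localized iCal DTSTART, so
equal keys link the two records.

    import datetime

    import pytz

    tz = pytz.timezone('US/Eastern')  # stands in for self.TIMEZONE
    start = tz.localize(datetime.datetime(2024, 5, 1, 10, 0))

    # (event['EventBodyName'].strip(), event['start']) from the API event
    api_key = ('City Council', start)
    # (event['Name']['label'], event_time) from _event_key()
    web_key = ('City Council', start)

    assert api_key == web_key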
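The _suppress_item_matter() hook stays a deliberate no-op in the base
class. A sketch of the kind of override its docstring invites; the status
check is an assumed condition for illustration, not the LA Metro
implementation:

    from legistar.events import LegistarAPIEventScraper


    class SuppressingEventScraper(LegistarAPIEventScraper):
        def _suppress_item_matter(self, item, agenda_url):
            # Drop the matter reference the API attached to this agenda
            # item whenever the Legistar agenda at agenda_url would hide
            # it; per the docstring, hidden matters are not scrapable
            # anyway. The condition below is assumed for illustration.
            if item.get('EventItemMatterStatus') == 'Draft':
                item['EventItemMatterFile'] = None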