From 79af30f8fb87f10304a79f2296e97bf87d902c54 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 11:03:48 -0500 Subject: [PATCH 1/5] Convert sibling API event scraper classes into one class and a mixin --- legistar/events.py | 105 ++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 839a1e1..316874f 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -1,4 +1,3 @@ -from abc import ABCMeta, abstractmethod import time import datetime from collections import deque @@ -171,7 +170,7 @@ def _get_ecomment_link(self, link): return self.ecomment_dict.get(event_id, None) -class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta): +class LegistarAPIEventScraper(LegistarAPIScraper): webscraper_class = LegistarEventsScraper WEB_RETRY_EVENTS = 3 @@ -197,10 +196,6 @@ def _init_webscraper(self): return webscraper - @abstractmethod - def _get_web_event(self, api_event): - pass - def api_events(self, since_datetime=None): # scrape from oldest to newest. This makes resuming big # scraping jobs easier because upon a scrape failure we can @@ -315,25 +310,6 @@ def minutes(self, event): self._suppress_item_matter(item, minutes_url) yield item - def _suppress_item_matter(self, item, agenda_url): - ''' - Agenda items in Legistar do not always display links to - associated matter files even if the same agenda item - in the API references a Matter File. The agenda items - we scrape should honor the suppression on the Legistar - agendas. - - This is also practical because matter files that are hidden - in the Legistar Agenda do not seem to available for scraping - on Legistar or through the API - - Since we are not completely sure that the same suppression - logic should be used for all Legislative Bodies, this method - is currently just a hook for being overridden in particular - scrapers. As of now, at least LA Metro uses this hook. 
- ''' - pass - def rollcalls(self, event): for item in self.agenda(event): if item['EventItemRollCallFlag']: @@ -354,6 +330,25 @@ def addDocs(self, e, events, doc_type): except ValueError: pass + def _suppress_item_matter(self, item, agenda_url): + ''' + Agenda items in Legistar do not always display links to + associated matter files even if the same agenda item + in the API references a Matter File. The agenda items + we scrape should honor the suppression on the Legistar + agendas. + + This is also practical because matter files that are hidden + in the Legistar Agenda do not seem to be available for scraping + on Legistar or through the API + + Since we are not completely sure that the same suppression + logic should be used for all Legislative Bodies, this method + is currently just a hook for being overridden in particular + scrapers. As of now, at least LA Metro uses this hook. + ''' + pass + def _event_status(self, event): '''Events can have a status of tentative, confirmed, cancelled, or passed (http://docs.opencivicdata.org/en/latest/data/event.html). By @@ -367,11 +362,20 @@ def _event_status(self, event): status = 'confirmed' return status - - -class LegistarAPIEventScraper(LegistarAPIEventScraperBase): + + def _not_in_web_interface(self, event): '''Occasionally, an event will appear in the API before it appears in the web interface. This method checks attributes of the API event that tell us whether the given event is one of those cases, returning True if so, and False otherwise. Available for override in jurisdictional scrapers. 
+ ''' + return False def _get_web_event(self, api_event): + if self._not_in_web_interface(api_event): + return None + return self.web_detail(api_event) def web_detail(self, event): @@ -405,12 +409,13 @@ def web_detail(self, event): return event_page_details -class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase): - ''' - There are some inSite sites that have information that only appears - event listing page, like NYC's 'Meeting Topic.' This scraper visits - the listing page and attempts to zip API and web events together - ''' +class WebCalendarMixin: + """ + Sometimes, it's desirable to retrieve information from the web calendar, + in addition to the API. This mixin extends the base functionality to get + event information from both the detail page, if accessible, and the web + calendar listing. + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -422,13 +427,20 @@ def __init__(self, *args, **kwargs): # Instantiate dictionary where events from generator are stored as they # are scraped. self._scraped_events = {} - + def _get_web_event(self, api_event): if self._not_in_web_interface(api_event): - return None + event_detail = {} else: - # None if entire web calendar scraped but API event not found - return self.web_results(api_event) + # None if detail link does not exist or cannot be found. + event_detail = super()._get_web_event(api_event) or {} + + # Sometimes events can appear on the calendar before their detail links + # become available. None if entire web calendar scraped but event not + # found. + event_listing = self.web_results(api_event) or {} + + return (event_listing | event_detail) or None def web_results(self, event): api_key = (event['EventBodyName'].strip(), @@ -453,29 +465,22 @@ def _scrapeWebCalendar(self): chronological order. 
''' for event, _ in self._webscraper.events(follow_links=False): - event_key = self._event_key(event, self._webscraper) + event_key = self._event_key(event) + print(event_key, event) yield event_key, event - def _event_key(self, event, web_scraper): + def _event_key(self, event): '''Since Legistar InSite contains more information about events than are available in the API, we need to scrape both. Then, we have to line them up. This method makes a key that should be uniquely identify every event and will allow us to link events from the two data sources. ''' - response = web_scraper.get(event['iCalendar']['url'], verify=False) - event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt + response = self._webscraper.get(event['iCalendar']['url'], verify=False) + event_time = self._webscraper.ical(response.text).subcomponents[0]['DTSTART'].dt event_time = pytz.timezone(self.TIMEZONE).localize(event_time) key = (event['Name']['label'], event_time) return key - - def _not_in_web_interface(self, event): - '''Occasionally, an event will appear in the API, but not in the web - interface. This method checks attributes of the API event that tell us - whether the given event is one of those cases, returning True if so, and - False otherwise. Available for override in jurisdictional scrapers. 
- ''' - return False From ec5806d07c6aa27343e6e9f23727f3fee61a4b93 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 11:12:29 -0500 Subject: [PATCH 2/5] Remove print statement --- legistar/events.py | 1 - 1 file changed, 1 deletion(-) diff --git a/legistar/events.py b/legistar/events.py index 316874f..37ca00e 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -466,7 +466,6 @@ def _scrapeWebCalendar(self): ''' for event, _ in self._webscraper.events(follow_links=False): event_key = self._event_key(event) - print(event_key, event) yield event_key, event def _event_key(self, event): From dbf139c6933cefeb2d0db643a4a0ef12dc9e49d8 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 11:54:34 -0500 Subject: [PATCH 3/5] Allow event API params to be overridden --- legistar/events.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 37ca00e..728d6e6 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -196,7 +196,7 @@ def _init_webscraper(self): return webscraper - def api_events(self, since_datetime=None): + def api_events(self, since_datetime=None, override_params=None): # scrape from oldest to newest. This makes resuming big # scraping jobs easier because upon a scrape failure we can # import everything scraped and then scrape everything newer @@ -226,6 +226,8 @@ def api_events(self, since_datetime=None): for field in update_fields) params['$filter'] = since_filter + + params.update(override_params or {}) events_url = self.BASE_URL + '/events/' @@ -413,8 +415,8 @@ class WebCalendarMixin: """ Sometimes, it's desirable to retrieve information from the web calendar, in addition to the API. This mixin extends the base functionality to get - event information from both the detail page, if accessible, and the web - calendar listing. + event information from both the detail page linked to in the API and the + web calendar listing. 
""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -430,14 +432,12 @@ def __init__(self, *args, **kwargs): def _get_web_event(self, api_event): if self._not_in_web_interface(api_event): - event_detail = {} - else: - # None if detail link does not exist or cannot be found. - event_detail = super()._get_web_event(api_event) or {} + return None + + # None if detail link does not exist or cannot be found. + event_detail = super()._get_web_event(api_event) or {} - # Sometimes events can appear on the calendar before their detail links - # become available. None if entire web calendar scraped but event not - # found. + # None if entire web calendar scraped but event not found. event_listing = self.web_results(api_event) or {} return (event_listing | event_detail) or None From ef2571950c658a1021b989711460ff9497eac5c0 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 12:26:03 -0500 Subject: [PATCH 4/5] Nevermind on the params --- legistar/events.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 728d6e6..cbe67e7 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -196,7 +196,7 @@ def _init_webscraper(self): return webscraper - def api_events(self, since_datetime=None, override_params=None): + def api_events(self, since_datetime=None): # scrape from oldest to newest. 
This makes resuming big # scraping jobs easier because upon a scrape failure we can # import everything scraped and then scrape everything newer @@ -226,8 +226,6 @@ for field in update_fields) params['$filter'] = since_filter - - params.update(override_params or {}) events_url = self.BASE_URL + '/events/' From 874fd422442e0bce0fe8e870ea9457a233c05705 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 10 Jul 2025 15:04:59 -0500 Subject: [PATCH 5/5] Revise mixin --- legistar/events.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index cbe67e7..d337044 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -72,7 +72,7 @@ def events(self, follow_links=True, since=None): # we might revisit the same event. So, we keep track of # the last few events we've visited in order to # make sure we are not revisiting - scraped_events = deque([], maxlen=10) + scraped_events = deque([], maxlen=100) current_year = self.now().year @@ -92,7 +92,7 @@ for year in range(current_year + 1, since_year, -1): no_events_in_year = True - for page in self.eventPages(year): + for idx, page in enumerate(self.eventPages(year)): events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0] for event, _, _ in self.parseDataTable(events_table): ical_url = event['iCalendar']['url'] @@ -409,12 +409,11 @@ def web_detail(self, event): return event_page_details -class WebCalendarMixin: +class WebCalendarFallbackMixin: """ - Sometimes, it's desirable to retrieve information from the web calendar, - in addition to the API. This mixin extends the base functionality to get - event information from both the detail page linked to in the API and the - web calendar listing. 
+ Sometimes, events are visible on the web calendar before their detail + link is accessible. Use this mixin to check the web calendar + for events if their detail link cannot be accessed. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -432,20 +431,30 @@ def _get_web_event(self, api_event): if self._not_in_web_interface(api_event): return None - # None if detail link does not exist or cannot be found. - event_detail = super()._get_web_event(api_event) or {} + web_event = None - # None if entire web calendar scraped but event not found. - event_listing = self.web_results(api_event) or {} + if not self._detail_page_not_available(api_event): + web_event = super()._get_web_event(api_event) + + if web_event is None: + web_event = self.web_results(api_event) - return (event_listing | event_detail) or None + return web_event + + def _detail_page_not_available(self, api_event): + """ + Sometimes, we can know from the API event that a detail link is + not available or will be invalid, so we can skip trying to access + it. Available for override in jurisdictional scrapers. + """ + return False def web_results(self, event): api_key = (event['EventBodyName'].strip(), event['start']) - # Check the cache of events we've already scraped from the web interface - # for the API event at hand. + # Check the cache of events we've already scraped from the web + # interface for the API event at hand. if api_key in self._scraped_events: return self._scraped_events[api_key]