From 79af30f8fb87f10304a79f2296e97bf87d902c54 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 11:03:48 -0500 Subject: [PATCH 1/5] Convert sibling API event scraper classes into one class and a mixin --- legistar/events.py | 105 ++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 839a1e1..316874f 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -1,4 +1,3 @@ -from abc import ABCMeta, abstractmethod import time import datetime from collections import deque @@ -171,7 +170,7 @@ def _get_ecomment_link(self, link): return self.ecomment_dict.get(event_id, None) -class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta): +class LegistarAPIEventScraper(LegistarAPIScraper): webscraper_class = LegistarEventsScraper WEB_RETRY_EVENTS = 3 @@ -197,10 +196,6 @@ def _init_webscraper(self): return webscraper - @abstractmethod - def _get_web_event(self, api_event): - pass - def api_events(self, since_datetime=None): # scrape from oldest to newest. This makes resuming big # scraping jobs easier because upon a scrape failure we can @@ -315,25 +310,6 @@ def minutes(self, event): self._suppress_item_matter(item, minutes_url) yield item - def _suppress_item_matter(self, item, agenda_url): - ''' - Agenda items in Legistar do not always display links to - associated matter files even if the same agenda item - in the API references a Matter File. The agenda items - we scrape should honor the suppression on the Legistar - agendas. - - This is also practical because matter files that are hidden - in the Legistar Agenda do not seem to available for scraping - on Legistar or through the API - - Since we are not completely sure that the same suppression - logic should be used for all Legislative Bodies, this method - is currently just a hook for being overridden in particular - scrapers. As of now, at least LA Metro uses this hook. 
- ''' - pass - def rollcalls(self, event): for item in self.agenda(event): if item['EventItemRollCallFlag']: @@ -354,6 +330,25 @@ def addDocs(self, e, events, doc_type): except ValueError: pass + def _suppress_item_matter(self, item, agenda_url): + ''' + Agenda items in Legistar do not always display links to + associated matter files even if the same agenda item + in the API references a Matter File. The agenda items + we scrape should honor the suppression on the Legistar + agendas. + + This is also practical because matter files that are hidden + in the Legistar Agenda do not seem to be available for scraping + on Legistar or through the API + + Since we are not completely sure that the same suppression + logic should be used for all Legislative Bodies, this method + is currently just a hook for being overridden in particular + scrapers. As of now, at least LA Metro uses this hook. + ''' + pass + def _event_status(self, event): '''Events can have a status of tentative, confirmed, cancelled, or passed (http://docs.opencivicdata.org/en/latest/data/event.html). By @@ -367,11 +362,20 @@ def _event_status(self, event): status = 'confirmed' return status - - -class LegistarAPIEventScraper(LegistarAPIEventScraperBase): + + def _not_in_web_interface(self, event): '''Occasionally, an event will appear in the API before it appears in the web interface. This method checks attributes of the API event that tell us whether the given event is one of those cases, returning True if so, and False otherwise. Available for override in jurisdictional scrapers. 
+ ''' + return False def _get_web_event(self, api_event): + if self._not_in_web_interface(api_event): + return None + return self.web_detail(api_event) def web_detail(self, event): @@ -405,12 +409,13 @@ def web_detail(self, event): return event_page_details -class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase): - ''' - There are some inSite sites that have information that only appears - event listing page, like NYC's 'Meeting Topic.' This scraper visits - the listing page and attempts to zip API and web events together - ''' +class WebCalendarMixin: + """ + Sometimes, it's desirable to retrieve information from the web calendar, + in addition to the API. This mixin extends the base functionality to get + event information from both the detail page, if accessible, and the web + calendar listing. + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -422,13 +427,20 @@ def __init__(self, *args, **kwargs): # Instantiate dictionary where events from generator are stored as they # are scraped. self._scraped_events = {} - + def _get_web_event(self, api_event): if self._not_in_web_interface(api_event): - return None + event_detail = {} else: - # None if entire web calendar scraped but API event not found - return self.web_results(api_event) + # None if detail link does not exist or cannot be found. + event_detail = super()._get_web_event(api_event) or {} + + # Sometimes events can appear on the calendar before their detail links + # become available. None if entire web calendar scraped but event not + # found. + event_listing = self.web_results(api_event) or {} + + return (event_listing | event_detail) or None def web_results(self, event): api_key = (event['EventBodyName'].strip(), @@ -453,29 +465,22 @@ def _scrapeWebCalendar(self): chronological order. 
''' for event, _ in self._webscraper.events(follow_links=False): - event_key = self._event_key(event, self._webscraper) + event_key = self._event_key(event) + print(event_key, event) yield event_key, event - def _event_key(self, event, web_scraper): + def _event_key(self, event): '''Since Legistar InSite contains more information about events than are available in the API, we need to scrape both. Then, we have to line them up. This method makes a key that should be uniquely identify every event and will allow us to link events from the two data sources. ''' - response = web_scraper.get(event['iCalendar']['url'], verify=False) - event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt + response = self._webscraper.get(event['iCalendar']['url'], verify=False) + event_time = self._webscraper.ical(response.text).subcomponents[0]['DTSTART'].dt event_time = pytz.timezone(self.TIMEZONE).localize(event_time) key = (event['Name']['label'], event_time) return key - - def _not_in_web_interface(self, event): - '''Occasionally, an event will appear in the API, but not in the web - interface. This method checks attributes of the API event that tell us - whether the given event is one of those cases, returning True if so, and - False otherwise. Available for override in jurisdictional scrapers. 
- ''' - return False From ec5806d07c6aa27343e6e9f23727f3fee61a4b93 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 11:12:29 -0500 Subject: [PATCH 2/5] Remove print statement --- legistar/events.py | 1 - 1 file changed, 1 deletion(-) diff --git a/legistar/events.py b/legistar/events.py index 316874f..37ca00e 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -466,7 +466,6 @@ def _scrapeWebCalendar(self): ''' for event, _ in self._webscraper.events(follow_links=False): event_key = self._event_key(event) - print(event_key, event) yield event_key, event def _event_key(self, event): From dbf139c6933cefeb2d0db643a4a0ef12dc9e49d8 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 11:54:34 -0500 Subject: [PATCH 3/5] Allow event API params to be overridden --- legistar/events.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 37ca00e..728d6e6 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -196,7 +196,7 @@ def _init_webscraper(self): return webscraper - def api_events(self, since_datetime=None): + def api_events(self, since_datetime=None, override_params=None): # scrape from oldest to newest. This makes resuming big # scraping jobs easier because upon a scrape failure we can # import everything scraped and then scrape everything newer @@ -226,6 +226,8 @@ def api_events(self, since_datetime=None): for field in update_fields) params['$filter'] = since_filter + + params.update(override_params or {}) events_url = self.BASE_URL + '/events/' @@ -413,8 +415,8 @@ class WebCalendarMixin: """ Sometimes, it's desirable to retrieve information from the web calendar, in addition to the API. This mixin extends the base functionality to get - event information from both the detail page, if accessible, and the web - calendar listing. + event information from both the detail page linked to in the API and the + web calendar listing. 
""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -430,14 +432,12 @@ def __init__(self, *args, **kwargs): def _get_web_event(self, api_event): if self._not_in_web_interface(api_event): - event_detail = {} - else: - # None if detail link does not exist or cannot be found. - event_detail = super()._get_web_event(api_event) or {} + return None + + # None if detail link does not exist or cannot be found. + event_detail = super()._get_web_event(api_event) or {} - # Sometimes events can appear on the calendar before their detail links - # become available. None if entire web calendar scraped but event not - # found. + # None if entire web calendar scraped but event not found. event_listing = self.web_results(api_event) or {} return (event_listing | event_detail) or None From ef2571950c658a1021b989711460ff9497eac5c0 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Wed, 9 Jul 2025 12:26:03 -0500 Subject: [PATCH 4/5] Nevermind on the params --- legistar/events.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index 728d6e6..cbe67e7 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -196,7 +196,7 @@ def _init_webscraper(self): return webscraper - def api_events(self, since_datetime=None, override_params=None): + def api_events(self, since_datetime=None): # scrape from oldest to newest. 
This makes resuming big # scraping jobs easier because upon a scrape failure we can # import everything scraped and then scrape everything newer @@ -226,8 +226,6 @@ for field in update_fields) params['$filter'] = since_filter - - params.update(override_params or {}) events_url = self.BASE_URL + '/events/' From 874fd422442e0bce0fe8e870ea9457a233c05705 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Thu, 10 Jul 2025 15:04:59 -0500 Subject: [PATCH 5/5] Revise mixin --- legistar/events.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/legistar/events.py b/legistar/events.py index cbe67e7..d337044 100644 --- a/legistar/events.py +++ b/legistar/events.py @@ -72,7 +72,7 @@ def events(self, follow_links=True, since=None): # we might revisit the same event. So, we keep track of # the last few events we've visited in order to # make sure we are not revisiting - scraped_events = deque([], maxlen=10) + scraped_events = deque([], maxlen=100) current_year = self.now().year @@ -92,7 +92,7 @@ for year in range(current_year + 1, since_year, -1): no_events_in_year = True - for page in self.eventPages(year): + for idx, page in enumerate(self.eventPages(year)): events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0] for event, _, _ in self.parseDataTable(events_table): ical_url = event['iCalendar']['url'] @@ -409,12 +409,11 @@ def web_detail(self, event): return event_page_details -class WebCalendarMixin: +class WebCalendarFallbackMixin: """ - Sometimes, it's desirable to retrieve information from the web calendar, - in addition to the API. This mixin extends the base functionality to get - event information from both the detail page linked to in the API and the - web calendar listing. 
+ Sometimes, events are visible on the web calendar before their detail + link is accessible. Use this mixin to check the web calendar + for events if their detail link cannot be accessed. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -432,20 +431,30 @@ def _get_web_event(self, api_event): if self._not_in_web_interface(api_event): return None - # None if detail link does not exist or cannot be found. - event_detail = super()._get_web_event(api_event) or {} + web_event = None - # None if entire web calendar scraped but event not found. - event_listing = self.web_results(api_event) or {} + if not self._detail_page_not_available(api_event): + web_event = super()._get_web_event(api_event) + + if web_event is None: + web_event = self.web_results(api_event) - return (event_listing | event_detail) or None + return web_event + + def _detail_page_not_available(self, api_event): + """ + Sometimes, we can know from the API event that a detail link is + not available or will be invalid, so we can skip trying to access + it. Available for override in jurisdictional scrapers. + """ + return False def web_results(self, event): api_key = (event['EventBodyName'].strip(), event['start']) - # Check the cache of events we've already scraped from the web interface - # for the API event at hand. + # Check the cache of events we've already scraped from the web + # interface for the API event at hand. if api_key in self._scraped_events: return self._scraped_events[api_key]