From ab38ebe9a4299e77ecc86f19b6f4b9f2ee4d7f44 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 11:47:25 -0500 Subject: [PATCH 1/8] Reorganize and modernize package --- .github/workflows/pythonapp.yml | 130 +++++++++----- legistar/__init__.py | 1 - legistar/people.py | 87 ---------- pyproject.toml | 95 ++++++++++ requirements.txt | 3 - setup.cfg | 3 - setup.py | 33 ---- src/legistar/__init__.py | 1 + src/legistar/api/__init__.py | 0 src/legistar/api/base.py | 99 +++++++++++ {legistar => src/legistar/api}/bills.py | 211 +---------------------- {legistar => src/legistar/api}/events.py | 174 +------------------ src/legistar/api/people.py | 43 +++++ src/legistar/ui/__init__.py | 0 {legistar => src/legistar/ui}/base.py | 117 ++----------- src/legistar/ui/bills.py | 210 ++++++++++++++++++++++ src/legistar/ui/events.py | 169 ++++++++++++++++++ src/legistar/ui/people.py | 45 +++++ tests/conftest.py | 4 +- tests/refresh_fixtures.py | 10 +- tests/test_parsing.py | 12 +- 21 files changed, 783 insertions(+), 664 deletions(-) delete mode 100644 legistar/__init__.py delete mode 100644 legistar/people.py create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 setup.py create mode 100644 src/legistar/__init__.py create mode 100644 src/legistar/api/__init__.py create mode 100644 src/legistar/api/base.py rename {legistar => src/legistar/api}/bills.py (60%) rename {legistar => src/legistar/api}/events.py (64%) create mode 100644 src/legistar/api/people.py create mode 100644 src/legistar/ui/__init__.py rename {legistar => src/legistar/ui}/base.py (70%) create mode 100644 src/legistar/ui/bills.py create mode 100644 src/legistar/ui/events.py create mode 100644 src/legistar/ui/people.py diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 483e33b..eb60bfb 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -1,53 +1,99 @@ -name: tests +name: CI -on: [push] +on: + push: + branches: [master, main] + pull_request: + branches: [master, main] jobs: - test: + get-python-versions: + runs-on: ubuntu-latest + outputs: + python-versions: ${{ steps.get-versions.outputs.python-versions }} + steps: + - uses: actions/checkout@v4 + - name: Extract Python versions from pyproject.toml + id: get-versions + run: | + # Extract Python versions from classifiers in pyproject.toml + # This looks for lines like "Programming Language :: Python :: 3.8" + python_versions=$(grep -o 'Programming Language :: Python :: 3\.[0-9]\+' pyproject.toml | grep -o '3\.[0-9]\+' | sort -V | jq -R -s -c 'split("\n")[:-1]') + echo "python-versions=$python_versions" >> $GITHUB_OUTPUT + echo "Detected Python versions: $python_versions" + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 src/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics + # run full linting + flake8 src/ tests/ + + test: + needs: [get-python-versions, lint] runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ${{ fromJson(needs.get-python-versions.outputs.python-versions) }} steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - 
with: - python-version: 3.x - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install . - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pip install -r requirements.txt - pytest - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - publish: - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') - needs: test + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[test] + - name: Test with pytest + run: pytest + build: + needs: [test] runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Upload build artifacts + uses: actions/upload-artifact@v3 + with: + name: dist + path: dist/ + publish: + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') + needs: [build] + runs-on: ubuntu-latest + permissions: + id-token: write steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - name: Build package for publishing - run: python -m build - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + - name: Download build artifacts + uses: actions/download-artifact@v3 + with: + name: dist + path: dist/ + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/legistar/__init__.py b/legistar/__init__.py deleted file mode 100644 index 1559bab..0000000 --- a/legistar/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.0.1' # pragma: no cover diff --git a/legistar/people.py b/legistar/people.py deleted file mode 100644 index c7e4f48..0000000 --- a/legistar/people.py +++ /dev/null @@ -1,87 +0,0 @@ -from .base import LegistarScraper, LegistarAPIScraper - - -class LegistarPersonScraper(LegistarScraper): - MEMBERLIST = None - ALL_MEMBERS = None - - def councilMembers(self, extra_args=None, follow_links=True): - payload = {} - if extra_args: - payload.update(extra_args) - page = self.lxmlize(self.MEMBERLIST, payload) - payload.update(self.sessionSecrets(page)) - - if self.ALL_MEMBERS: - payload['__EVENTTARGET'] = "ctl00$ContentPlaceHolder1$menuPeople" - payload['__EVENTARGUMENT'] = self.ALL_MEMBERS - - for page in self.pages(self.MEMBERLIST, payload): - table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridPeople_ctl00']")[0] - - for councilman, headers, row in self.parseDataTable(table): - if follow_links and type(councilman['Person Name']) == dict: - - detail_url = councilman['Person Name']['url'] - councilman_details = self.lxmlize(detail_url) - detail_div = councilman_details.xpath( - 
".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']")[0] - - councilman.update(self.parseDetails(detail_div)) - - img = councilman_details.xpath( - "//img[@id='ctl00_ContentPlaceHolder1_imgPhoto']") - if img: - councilman['Photo'] = img[0].get('src') - - committee_table = councilman_details.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridDepartments_ctl00']")[0] - committees = self.parseDataTable(committee_table) - - yield councilman, committees - - else: - yield councilman - - -class LegistarAPIPersonScraper(LegistarAPIScraper): - date_format = '%Y-%m-%dT%H:%M:%S' - - def body_types(self): - body_types_url = self.BASE_URL + '/bodytypes/' - response = self.get(body_types_url) - - types = {body_type['BodyTypeName']: body_type['BodyTypeId'] - for body_type in response.json()} - - return types - - def bodies(self): - bodies_url = self.BASE_URL + '/bodies/' - - for body in self.pages(bodies_url, item_key="BodyId"): - yield body - - def body_offices(self, body): - body_id = body['BodyId'] - - offices_url = (self.BASE_URL + - '/bodies/{}/OfficeRecords'.format(body_id)) - - for office in self.pages(offices_url, item_key="OfficeRecordId"): - yield office - - def toDate(self, text): - return self.toTime(text).date() - - def person_sources_from_office(self, office): - person_api_url = (self.BASE_URL + - '/persons/{OfficeRecordPersonId}'.format(**office)) - - response = self.get(person_api_url) - - route = '/PersonDetail.aspx?ID={PersonId}&GUID={PersonGuid}' - person_web_url = self.WEB_URL + route.format(**response.json()) - - return person_api_url, person_web_url diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f724041 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,95 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "legistar" +dynamic = ["version"] +description = "Mixin classes for legistar scrapers" +readme = "README.md" +license = {text = "BSD"} +authors = [ + {name = "Forest Gregg", email = "fgregg@datamade.us"} +] +maintainers = [ + {name = "Forest Gregg", email = "fgregg@datamade.us"} +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "requests", + "lxml", + "pytz", + "icalendar", + "scrapelib", + "esprima" +] +requires-python = ">=3.8" + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-mock", + "requests-mock", + "flake8" +] +test = [ + "pytest", + "pytest-mock", + "requests-mock" +] + +[project.urls] +Homepage = "http://github.com/opencivicdata/python-legistar-scraper/" +Repository = "http://github.com/opencivicdata/python-legistar-scraper/" +Issues = "http://github.com/opencivicdata/python-legistar-scraper/issues" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.dynamic] +version = {attr = "legistar.__version__"} + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v -p no:warnings" +norecursedirs = 
[".env"] + +[tool.flake8] +max-line-length = 88 +max-complexity = 10 +exclude = [ + ".git", + "__pycache__", + "build", + "dist", + ".env", + ".venv", + "venv", + "*.egg-info" +] +ignore = [ + "E203", # whitespace before ':' (conflicts with black) + "W503", # line break before binary operator (PEP 8 recommends this) +] +per-file-ignores = [ + "__init__.py:F401", # imported but unused (common in __init__.py files) +] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ab92662..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytest -pytest-mock -requests-mock diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index f60784d..0000000 --- a/setup.cfg +++ /dev/null @@ -1,3 +0,0 @@ -[tool:pytest] -addopts = -v -p no:warnings -norecursedirs=.env diff --git a/setup.py b/setup.py deleted file mode 100644 index 547f29e..0000000 --- a/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup -from legistar import __version__ - -long_description = '' - -setup(name='legistar', - version=__version__, - packages=['legistar'], - author='Forest Gregg', - author_email='fgregg@datamade.us', - license='BSD', - url='http://github.com/opencivicdata/python-legistar-scraper/', - description='Mixin classes for legistar scrapers', - long_description=long_description, - platforms=['any'], - install_requires=['requests', - 'lxml', - 'pytz', - 'icalendar', - 'scrapelib', - 'esprima' - ], - classifiers=["Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Topic :: Software Development :: Libraries :: Python Modules", - ], - ) diff --git a/src/legistar/__init__.py b/src/legistar/__init__.py new file mode 100644 index 0000000..2bd87a9 --- /dev/null +++ b/src/legistar/__init__.py @@ -0,0 +1 @@ +__version__ = '0.0.1' # pragma: no cover \ No newline at end of file diff --git a/src/legistar/api/__init__.py b/src/legistar/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/legistar/api/base.py b/src/legistar/api/base.py new file mode 100644 index 0000000..5110e96 --- /dev/null +++ b/src/legistar/api/base.py @@ -0,0 +1,99 @@ +import datetime +from collections import deque +import requests +import logging + +import scrapelib +import pytz + + +class LegistarAPIScraper(scrapelib.Scraper): + date_format = '%Y-%m-%dT%H:%M:%S' + time_string_format = '%I:%M %p' + utc_timestamp_format = '%Y-%m-%dT%H:%M:%S.%f' + + def __init__(self, *args, **kwargs): + super(LegistarAPIScraper, self).__init__(*args, **kwargs) + self.logger = logging.getLogger("legistar") + self.warning = self.logger.warning + + def to_time(self, text): + time = datetime.datetime.strptime(text, self.date_format) + time = pytz.timezone(self.TIMEZONE).localize(time) + return time + + def to_utc_timestamp(self, text): + try: + time = datetime.datetime.strptime(text, self.utc_timestamp_format) + except ValueError as e: + if 'does not match format' in str(e): + time = datetime.datetime.strptime(text, self.date_format) + else: + raise + time = pytz.timezone('UTC').localize(time) + return time + + def search(self, route, item_key, search_conditions): + """ + Base function for searching the Legistar API. + + Arguments: + + route -- The path to search, i.e. 
/matters/, /events/, etc + item_key -- The unique id field for the items that you are searching. + This is necessary for proper pagination. examples + might be MatterId or EventId + search_conditions -- a string in the OData format for the + your search conditions http://www.odata.org/documentation/odata-version-3-0/url-conventions/#url5.1.2 + + It would be nice if we could provide a + friendly search API. Something like https://github.com/tuomur/python-odata + + + Examples: + # Search for bills introduced after Jan. 1, 2017 + search('/matters/', 'MatterId', "MatterIntroDate gt datetime'2017-01-01'") + """ + + search_url = self.BASE_URL + route + + params = {'$filter': search_conditions} + + try: + yield from self.pages(search_url, + params=params, + item_key=item_key) + except requests.HTTPError as e: + if e.response.status_code == 400: + raise ValueError(e.response.json()['Message']) + if not self.accept_response(e.response): + raise + + def pages(self, url, params=None, item_key=None): + if params is None: + params = {} + + seen = deque([], maxlen=1000) + + page_num = 0 + response = None + while page_num == 0 or len(response.json()) == 1000: + params['$skip'] = page_num * 1000 + response = self.get(url, params=params) + response.raise_for_status() + + for item in response.json(): + if item[item_key] not in seen: + yield item + seen.append(item[item_key]) + + page_num += 1 + + def accept_response(self, response, **kwargs): + """ + This overrides a method that controls whether + the scraper should retry on an error. We don't + want to retry if the API returns a 400, except for + 410, which means the record no longer exists. + """ + return response.status_code < 401 or response.status_code == 410 diff --git a/legistar/bills.py b/src/legistar/api/bills.py similarity index 60% rename from legistar/bills.py rename to src/legistar/api/bills.py index 550c2f3..3c070a4 100644 --- a/legistar/bills.py +++ b/src/legistar/api/bills.py @@ -1,217 +1,10 @@ -from .base import LegistarScraper, LegistarAPIScraper -from lxml.etree import tostring -from collections import deque from functools import partialmethod from urllib.parse import urljoin + import requests import scrapelib - -class LegistarBillScraper(LegistarScraper): - def legislation(self, search_text='', created_after=None, - created_before=None): - - # If legislation is added to the the legistar system while we - # are scraping, it will shift the list of legislation down and - # we might revisit the same legislation. So, we keep track of - # the last few pieces of legislation we've visited in order to - # make sure we are not revisiting - scraped_leg = deque([], maxlen=10) - - for page in self.searchLegislation(search_text, created_after, - created_before): - for legislation_summary in self.parseSearchResults(page): - if not legislation_summary['url'] in scraped_leg: - yield legislation_summary - scraped_leg.append(legislation_summary['url']) - - def searchLegislation(self, search_text='', created_after=None, - created_before=None): - """ - Submit a search query on the legislation search page, and return a list - of summary results. - """ - - page = self.lxmlize(self.LEGISLATION_URL) - - page = self._advancedSearch(page) - - payload = {} - - # Enter the search parameters TODO: Each of the possible form - # fields should be represented as keyword arguments to this - # function. The default query string should be for the the - # default 'Legislative text' field. 
- payload['ctl00$ContentPlaceHolder1$txtText'] = search_text - - if created_after and created_before: - payload.update(dateWithin(created_after, created_before)) - - elif created_before: - payload.update(dateBound(created_before)) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '<' - - elif created_after: - payload.update(dateBound(created_after)) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '>' - - # Return up to one million search results - payload['ctl00_ContentPlaceHolder1_lstMax_ClientState'] = '{"value":"1000000"}' - payload['ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState'] = '{"value":"All"}' - payload['ctl00$ContentPlaceHolder1$btnSearch'] = 'Search Legislation' - - payload.update(self.sessionSecrets(page)) - - return self.pages(self.LEGISLATION_URL, payload) - - def parseSearchResults(self, page): - """Take a page of search results and return a sequence of data - of tuples about the legislation, of the form - - ('Document ID', 'Document URL', 'Type', 'Status', 'Introduction Date' - 'Passed Date', 'Main Sponsor', 'Title') - """ - table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] - for legislation, headers, row in self.parseDataTable(table): - # Do legislation search-specific stuff - # ------------------------------------ - # First column should be the ID of the record. - id_key = headers[0] - try: - legislation_id = legislation[id_key]['label'] - except TypeError: - continue - legislation_url = legislation[id_key]['url'].split( - self.BASE_URL)[-1] - legislation[id_key] = legislation_id - legislation['url'] = self.BASE_URL + \ - legislation_url.split('&Options')[0] + '&FullText=1' - - yield legislation - - def _advancedSearch(self, page): - search_switcher = page.xpath( - "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0] - - if 'simple search' in search_switcher.value.lower(): - return page - else: - payload = self.sessionSecrets(page) - payload[search_switcher.name] = search_switcher.value - - page = self.lxmlize(self.LEGISLATION_URL, payload) - - if 'simple search' not in page.xpath("//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0].value.lower(): - raise ValueError('Not on the advanced search page') - - return page - - def details(self, detail_url, div_id): - detail_page = self.lxmlize(detail_url) - - detail_div = detail_page.xpath(".//div[@id='%s']" % div_id)[0] - - return self.parseDetails(detail_div) - - def legDetails(self, detail_url): - div_id = 'ctl00_ContentPlaceHolder1_pageDetails' - return self.details(detail_url, div_id) - - def actionDetails(self, detail_url): - div_id = 'ctl00_ContentPlaceHolder1_pageTop1' - return self.details(detail_url, div_id) - - def history(self, detail_url): - detail_page = self.lxmlize(detail_url) - - try: - history_table = detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']")[0] - except IndexError: - print(detail_url) - raise - - history = [row[0] for row in self.parseDataTable(history_table)] - - try: - history = sorted(history, key=self._actionSortKey) - except (TypeError, ValueError): - pass - - for action in history: - yield action - - def _actionSortKey(self, action): - action_date = self.toDate(action['Date']) - action_url = action['Action\xa0Details']['url'] - - return (action_date, action_url) - - def text(self, detail_url): - detail_page = self.lxmlize(detail_url) - - text_div = detail_page.xpath( - "//div[@id='ctl00_ContentPlaceHolder1_divText']") - - if len(text_div): - return tostring(text_div[0], 
pretty_print=True).decode() - else: - return None - - def extractVotes(self, action_detail_url): - action_detail_page = self.lxmlize(action_detail_url) - try: - vote_table = action_detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0] - except IndexError: - self.warning("No votes found in table") - return None, [] - votes = list(self.parseDataTable(vote_table)) - vote_list = [] - for vote, _, _ in votes: - raw_option = vote['Vote'].lower() - vote_list.append((self.VOTE_OPTIONS.get(raw_option, raw_option), - vote['Person Name']['label'])) - - action_detail_div = action_detail_page.xpath( - ".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']")[0] - action_details = self.parseDetails(action_detail_div) - result = action_details['Result'].lower() - - return result, vote_list - - -def dateWithin(created_after, created_before): - payload = dateBound(created_after) - - payload['ctl00$ContentPlaceHolder1$txtFileCreated2'] =\ - '{d.year}-{d.month:02}-{d.day:02}'.format(d=created_before) - payload['ctl00$ContentPlaceHolder1$txtFileCreated2$dateInput'] =\ - '{d.month}/{d.day}/{d.year}'.format(d=created_before) - - payload['ctl00_ContentPlaceHolder1_txtFileCreated2_dateInput_ClientState'] =\ - '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 - d=created_before) - - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = 'between' - - return payload - - -def dateBound(creation_date): - payload = {} - - payload['ctl00$ContentPlaceHolder1$txtFileCreated1'] =\ - '{d.year}-{d.month:02}-{d.day:02}'.format(d=creation_date) - payload['ctl00$ContentPlaceHolder1$txtFileCreated1$dateInput'] =\ - '{d.month}/{d.day}/{d.year}'.format(d=creation_date) - - payload['ctl00_ContentPlaceHolder1_txtFileCreated1_dateInput_ClientState'] =\ - '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 - d=creation_date) - - return payload +from .base import LegistarAPIScraper class LegistarAPIBillScraper(LegistarAPIScraper): diff --git a/legistar/events.py b/src/legistar/api/events.py similarity index 64% rename from legistar/events.py rename to src/legistar/api/events.py index 839a1e1..3d76189 100644 --- a/legistar/events.py +++ b/src/legistar/api/events.py @@ -1,174 +1,12 @@ from abc import ABCMeta, abstractmethod -import time import datetime -from collections import deque -import esprima +import time import pytz -import icalendar import scrapelib -from .base import LegistarScraper, LegistarAPIScraper - - -class LegistarEventsScraper(LegistarScraper): - ECOMMENT_JS_URLS = ( - 'https://metro.granicusideas.com/meetings.js', - 'https://metro.granicusideas.com/meetings.js?scope=past' - ) - - def __init__(self, *args, event_info_key='Meeting Details', **kwargs): - super().__init__(*args, **kwargs) - self.event_info_key = event_info_key - - - @property - def ecomment_dict(self): - """ - Parse event IDs and eComment links from JavaScript file with lines like: - activateEcomment('750', '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', 
'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'); - """ - if getattr(self, '_ecomment_dict', None) is None: - ecomment_dict = {} - - # Define a callback to apply to each node, e.g., - # https://esprima.readthedocs.io/en/latest/syntactic-analysis.html#example-console-calls-removal - def is_activateEcomment(node, metadata): - if node.callee and node.callee.name == 'activateEcomment': - event_id, _, comment_url = node.arguments - ecomment_dict[event_id.value] = comment_url.value - - for url in self.ECOMMENT_JS_URLS: - response = self.get(url) - esprima.parse(response.text, delegate=is_activateEcomment) - - self._ecomment_dict = ecomment_dict - - return self._ecomment_dict - - def eventPages(self, since): - - page = self.lxmlize(self.EVENTSPAGE) - for page in self.eventSearch(page, since): - yield page - - def should_cache_response(self, response): - # Never cache the top level events page, because that may result in - # expired .NET state values. - return (super().should_cache_response(response) and - response.url != self.EVENTSPAGE) - - def eventSearch(self, page, since): - payload = self.sessionSecrets(page) - - payload['ctl00_ContentPlaceHolder1_lstYears_ClientState'] = '{"value":"%s"}' % since - - payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$lstYears' - - return self.pages(self.EVENTSPAGE, payload) - - def events(self, follow_links=True, since=None): - # If an event is added to the the legistar system while we - # are scraping, it will shift the list of events down and - # we might revisit the same event. So, we keep track of - # the last few events we've visited in order to - # make sure we are not revisiting - scraped_events = deque([], maxlen=10) - - current_year = self.now().year - - if since: - if since > current_year: - raise ValueError( - 'Value of :since cannot exceed {}'.format(current_year)) - else: - since_year = since - 1 - - else: - since_year = 0 - - # Anticipate events will be scheduled for the following year to avoid - # missing upcoming events during scrapes near the end of the current - # year. - for year in range(current_year + 1, since_year, -1): - no_events_in_year = True - - for page in self.eventPages(year): - events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0] - for event, _, _ in self.parseDataTable(events_table): - ical_url = event['iCalendar']['url'] - if ical_url in scraped_events: - continue - else: - scraped_events.append(ical_url) - - if follow_links and type(event[self.event_info_key]) == dict: - agenda = self.agenda(event[self.event_info_key]['url']) - else: - agenda = None - - yield event, agenda - no_events_in_year = False - - # We scrape events in reverse chronological order, starting one year - # in the future. Stop scraping if there are no events in a given - # year, unless that year is in the future, because whether events - # have been scheduled in the future is not a reliable indication of - # whether any happened in the previous year. 
- if no_events_in_year and year <= current_year: - break - - def agenda(self, detail_url): - page = self.lxmlize(detail_url) - - payload = self.sessionSecrets(page) - - payload.update({"__EVENTARGUMENT": "3:1", - "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain"}) - - for page in self.pages(detail_url, payload): - agenda_table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] - agenda = self.parseDataTable(agenda_table) - yield from agenda - - def addDocs(self, e, events, doc_type): - try: - if events[doc_type] != 'Not\xa0available': - e.add_document(note=events[doc_type]['label'], - url=events[doc_type]['url'], - media_type="application/pdf") - except ValueError: - pass - - def extractRollCall(self, action_detail_url): - action_detail_page = self.lxmlize(action_detail_url) - try: - rollcall_table = action_detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']")[0] - except IndexError: - self.warning("No rollcall found in table") - return [] - roll_call = list(self.parseDataTable(rollcall_table)) - call_list = [] - for call, _, _ in roll_call: - option = call['Attendance'] - call_list.append((option, - call['Person Name']['label'])) - - return call_list - - def ical(self, ical_text): - value = icalendar.Calendar.from_ical(ical_text) - return value - - def _parse_detail(self, key, field_1, field_2): - if key == 'eComment': - return self._get_ecomment_link(field_2) or field_2.text_content().strip() - - def _get_ecomment_link(self, link): - event_id = link.attrib['data-event-id'] - return self.ecomment_dict.get(event_id, None) +from .base import LegistarAPIScraper +from ..ui.events import LegistarEventsScraper class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta): @@ -261,7 +99,7 @@ def event(self, api_event): ) return - start = self.toTime(api_event["EventDate"]) + start = self.to_time(api_event["EventDate"]) api_event["start"] = start.replace( hour=start_time.tm_hour, minute=start_time.tm_min ) @@ -345,7 +183,7 @@ def rollcalls(self, event): for item in response.json(): yield item - def addDocs(self, e, events, doc_type): + def add_docs(self, e, events, doc_type): try: if events[doc_type] != 'Not\xa0available': e.add_document(note=events[doc_type]['label'], @@ -399,7 +237,7 @@ def web_detail(self, event): div_id = 'ctl00_ContentPlaceHolder1_pageTop1' detail_div = event_page.xpath(".//div[@id='%s']" % div_id)[0] - event_page_details = self._webscraper.parseDetails(detail_div) + event_page_details = self._webscraper.parse_details(detail_div) event_page_details['Meeting Details'] = {'url': insite_url} return event_page_details diff --git a/src/legistar/api/people.py b/src/legistar/api/people.py new file mode 100644 index 0000000..36a41f6 --- /dev/null +++ b/src/legistar/api/people.py @@ -0,0 +1,43 @@ +from .base import LegistarAPIScraper + + +class LegistarAPIPersonScraper(LegistarAPIScraper): + date_format = '%Y-%m-%dT%H:%M:%S' + + def body_types(self): + body_types_url = self.BASE_URL + '/bodytypes/' + response = self.get(body_types_url) + + types = {body_type['BodyTypeName']: body_type['BodyTypeId'] + for body_type in response.json()} + + return types + + def bodies(self): + bodies_url = self.BASE_URL + '/bodies/' + + for body in self.pages(bodies_url, item_key="BodyId"): + yield body + + def body_offices(self, body): + body_id = body['BodyId'] + + offices_url = (self.BASE_URL + + '/bodies/{}/OfficeRecords'.format(body_id)) + + for office in self.pages(offices_url, item_key="OfficeRecordId"): + yield office + + 
def to_date(self, text): + return self.to_time(text).date() + + def person_sources_from_office(self, office): + person_api_url = (self.BASE_URL + + '/persons/{OfficeRecordPersonId}'.format(**office)) + + response = self.get(person_api_url) + + route = '/PersonDetail.aspx?ID={PersonId}&GUID={PersonGuid}' + person_web_url = self.WEB_URL + route.format(**response.json()) + + return person_api_url, person_web_url diff --git a/src/legistar/ui/__init__.py b/src/legistar/ui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/legistar/base.py b/src/legistar/ui/base.py similarity index 70% rename from legistar/base.py rename to src/legistar/ui/base.py index 5e6327c..64fd23e 100644 --- a/legistar/base.py +++ b/src/legistar/ui/base.py @@ -1,11 +1,10 @@ import datetime import itertools import traceback -from collections import defaultdict, deque +from collections import defaultdict import re import requests import json -import logging import scrapelib import lxml.html @@ -62,7 +61,7 @@ def _range_error(self, response, payload): # contain the correct payload data. This comes as a # result of not updating the payload via sessionSecrets: # so, we do that here. - payload.update(self.sessionSecrets(page)) + payload.update(self.session_secrets(page)) raise scrapelib.HTTPError(response) @@ -106,7 +105,7 @@ def pages(self, url, payload=None): if payload is None: payload = {} - payload.update(self.sessionSecrets(page)) + payload.update(self.session_secrets(page)) event_target = next_page[0].attrib['href'].split("'")[1] @@ -119,7 +118,7 @@ def pages(self, url, payload=None): next_page = page.xpath( "//a[@class='rgCurrentPage']/following-sibling::a[1]") - def parseDetails(self, detail_div): + def parse_details(self, detail_div): """ Parse the data in the top section of a detail page. """ @@ -130,7 +129,7 @@ def parseDetails(self, detail_div): details = {} - for field_key, field in itertools.groupby(fields, fieldKey): + for field_key, field in itertools.groupby(fields, field_key): field = list(field) field_1, field_2 = field[0], field[-1] @@ -156,7 +155,7 @@ def parseDetails(self, detail_div): return details - def parseDataTable(self, table): + def parse_data_table(self, table): """ Legistar uses the same kind of data table in a number of places. 
This will return a list of dictionaries using the @@ -239,13 +238,13 @@ def _stringify(self, field): em.text = "--em--" + em.text + "--em--" return field.text_content().replace(' ', ' ').strip() - def toTime(self, text): + def to_time(self, text): time = datetime.datetime.strptime(text, self.date_format) time = pytz.timezone(self.TIMEZONE).localize(time) return time - def toDate(self, text): - return self.toTime(text).date().isoformat() + def to_date(self, text): + return self.to_time(text).date().isoformat() def now(self): return datetime.datetime.utcnow().replace(tzinfo=pytz.utc) @@ -254,7 +253,7 @@ def mdY2Ymd(self, text): month, day, year = text.split('/') return "%d-%02d-%02d" % (int(year), int(month), int(day)) - def sessionSecrets(self, page): + def session_secrets(self, page): payload = {} payload['__EVENTARGUMENT'] = None @@ -266,7 +265,7 @@ def sessionSecrets(self, page): except IndexError: pass - return(payload) + return payload def accept_response(self, response, **kwargs): if response.status_code == 410: @@ -274,101 +273,9 @@ def accept_response(self, response, **kwargs): return super().accept_response(response, **kwargs) -def fieldKey(x): +def field_key(x): field_id = x.attrib['id'] field = re.split(r'hyp|lbl|Label', field_id)[-1] field = field.split('Prompt')[0] field = field.rstrip('X21') return field - - -class LegistarAPIScraper(scrapelib.Scraper): - date_format = '%Y-%m-%dT%H:%M:%S' - time_string_format = '%I:%M %p' - utc_timestamp_format = '%Y-%m-%dT%H:%M:%S.%f' - - def __init__(self, *args, **kwargs): - super(LegistarAPIScraper, self).__init__(*args, **kwargs) - self.logger = logging.getLogger("legistar") - self.warning = self.logger.warning - - def toTime(self, text): - time = datetime.datetime.strptime(text, self.date_format) - time = pytz.timezone(self.TIMEZONE).localize(time) - return time - - def to_utc_timestamp(self, text): - try: - time = datetime.datetime.strptime(text, self.utc_timestamp_format) - except ValueError as e: - if 'does not match format' in str(e): - time = datetime.datetime.strptime(text, self.date_format) - else: - raise - time = pytz.timezone('UTC').localize(time) - return time - - def search(self, route, item_key, search_conditions): - """ - Base function for searching the Legistar API. - - Arguments: - - route -- The path to search, i.e. /matters/, /events/, etc - item_key -- The unique id field for the items that you are searching. - This is necessary for proper pagination. examples - might be MatterId or EventId - search_conditions -- a string in the OData format for the - your search conditions http://www.odata.org/documentation/odata-version-3-0/url-conventions/#url5.1.2 - - It would be nice if we could provide a - friendly search API. Something like https://github.com/tuomur/python-odata - - - Examples: - # Search for bills introduced after Jan. 
1, 2017 - search('/matters/', 'MatterId', "MatterIntroDate gt datetime'2017-01-01'") - """ - - search_url = self.BASE_URL + route - - params = {'$filter': search_conditions} - - try: - yield from self.pages(search_url, - params=params, - item_key=item_key) - except requests.HTTPError as e: - if e.response.status_code == 400: - raise ValueError(e.response.json()['Message']) - if not self.accept_response(e.response): - raise - - def pages(self, url, params=None, item_key=None): - if params is None: - params = {} - - seen = deque([], maxlen=1000) - - page_num = 0 - response = None - while page_num == 0 or len(response.json()) == 1000: - params['$skip'] = page_num * 1000 - response = self.get(url, params=params) - response.raise_for_status() - - for item in response.json(): - if item[item_key] not in seen: - yield item - seen.append(item[item_key]) - - page_num += 1 - - def accept_response(self, response, **kwargs): - """ - This overrides a method that controls whether - the scraper should retry on an error. We don't - want to retry if the API returns a 400, except for - 410, which means the record no longer exists. - """ - return response.status_code < 401 or response.status_code == 410 diff --git a/src/legistar/ui/bills.py b/src/legistar/ui/bills.py new file mode 100644 index 0000000..d41aa79 --- /dev/null +++ b/src/legistar/ui/bills.py @@ -0,0 +1,210 @@ +from .base import LegistarScraper +from lxml.etree import tostring +from collections import deque + + +class LegistarBillScraper(LegistarScraper): + def legislation(self, search_text='', created_after=None, + created_before=None): + + # If legislation is added to the the legistar system while we + # are scraping, it will shift the list of legislation down and + # we might revisit the same legislation. So, we keep track of + # the last few pieces of legislation we've visited in order to + # make sure we are not revisiting + scraped_leg = deque([], maxlen=10) + + for page in self.search_legislation(search_text, created_after, + created_before): + for legislation_summary in self.parse_search_results(page): + if not legislation_summary['url'] in scraped_leg: + yield legislation_summary + scraped_leg.append(legislation_summary['url']) + + def search_legislation(self, search_text='', created_after=None, + created_before=None): + """ + Submit a search query on the legislation search page, and return a list + of summary results. + """ + + page = self.lxmlize(self.LEGISLATION_URL) + + page = self._advancedSearch(page) + + payload = {} + + # Enter the search parameters TODO: Each of the possible form + # fields should be represented as keyword arguments to this + # function. The default query string should be for the the + # default 'Legislative text' field. 
+ payload['ctl00$ContentPlaceHolder1$txtText'] = search_text + + if created_after and created_before: + payload.update(date_within(created_after, created_before)) + + elif created_before: + payload.update(date_bound(created_before)) + payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '<' + + elif created_after: + payload.update(date_bound(created_after)) + payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '>' + + # Return up to one million search results + payload['ctl00_ContentPlaceHolder1_lstMax_ClientState'] = '{"value":"1000000"}' + payload['ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState'] = '{"value":"All"}' + payload['ctl00$ContentPlaceHolder1$btnSearch'] = 'Search Legislation' + + payload.update(self.session_secrets(page)) + + return self.pages(self.LEGISLATION_URL, payload) + + def parse_search_results(self, page): + """Take a page of search results and return a sequence of data + of tuples about the legislation, of the form + + ('Document ID', 'Document URL', 'Type', 'Status', 'Introduction Date' + 'Passed Date', 'Main Sponsor', 'Title') + """ + table = page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] + for legislation, headers, row in self.parse_data_table(table): + # Do legislation search-specific stuff + # ------------------------------------ + # First column should be the ID of the record. + id_key = headers[0] + try: + legislation_id = legislation[id_key]['label'] + except TypeError: + continue + legislation_url = legislation[id_key]['url'].split( + self.BASE_URL)[-1] + legislation[id_key] = legislation_id + legislation['url'] = self.BASE_URL + \ + legislation_url.split('&Options')[0] + '&FullText=1' + + yield legislation + + def _advancedSearch(self, page): + search_switcher = page.xpath( + "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0] + + if 'simple search' in search_switcher.value.lower(): + return page + else: + payload = self.session_secrets(page) + payload[search_switcher.name] = search_switcher.value + + page = self.lxmlize(self.LEGISLATION_URL, payload) + + if 'simple search' not in page.xpath("//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0].value.lower(): + raise ValueError('Not on the advanced search page') + + return page + + def details(self, detail_url, div_id): + detail_page = self.lxmlize(detail_url) + + detail_div = detail_page.xpath(".//div[@id='%s']" % div_id)[0] + + return self.parse_details(detail_div) + + def leg_details(self, detail_url): + div_id = 'ctl00_ContentPlaceHolder1_pageDetails' + return self.details(detail_url, div_id) + + def action_details(self, detail_url): + div_id = 'ctl00_ContentPlaceHolder1_pageTop1' + return self.details(detail_url, div_id) + + def history(self, detail_url): + detail_page = self.lxmlize(detail_url) + + try: + history_table = detail_page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']")[0] + except IndexError: + print(detail_url) + raise + + history = [row[0] for row in self.parse_data_table(history_table)] + + try: + history = sorted(history, key=self._action_sort_key) + except (TypeError, ValueError): + pass + + for action in history: + yield action + + def _action_sort_key(self, action): + action_date = self.to_date(action['Date']) + action_url = action['Action\xa0Details']['url'] + + return (action_date, action_url) + + def text(self, detail_url): + detail_page = self.lxmlize(detail_url) + + text_div = detail_page.xpath( + "//div[@id='ctl00_ContentPlaceHolder1_divText']") + + if len(text_div): + return tostring(text_div[0], 
pretty_print=True).decode() + else: + return None + + def extract_votes(self, action_detail_url): + action_detail_page = self.lxmlize(action_detail_url) + try: + vote_table = action_detail_page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0] + except IndexError: + self.warning("No votes found in table") + return None, [] + votes = list(self.parse_data_table(vote_table)) + vote_list = [] + for vote, _, _ in votes: + raw_option = vote['Vote'].lower() + vote_list.append((self.VOTE_OPTIONS.get(raw_option, raw_option), + vote['Person Name']['label'])) + + action_detail_div = action_detail_page.xpath( + ".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']")[0] + action_details = self.parse_details(action_detail_div) + result = action_details['Result'].lower() + + return result, vote_list + + +def date_within(created_after, created_before): + payload = date_bound(created_after) + + payload['ctl00$ContentPlaceHolder1$txtFileCreated2'] =\ + '{d.year}-{d.month:02}-{d.day:02}'.format(d=created_before) + payload['ctl00$ContentPlaceHolder1$txtFileCreated2$dateInput'] =\ + '{d.month}/{d.day}/{d.year}'.format(d=created_before) + + payload['ctl00_ContentPlaceHolder1_txtFileCreated2_dateInput_ClientState'] =\ + '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 + d=created_before) + + payload['ctl00$ContentPlaceHolder1$radFileCreated'] = 'between' + + return payload + + +def date_bound(creation_date): + payload = {} + + payload['ctl00$ContentPlaceHolder1$txtFileCreated1'] =\ + '{d.year}-{d.month:02}-{d.day:02}'.format(d=creation_date) + payload['ctl00$ContentPlaceHolder1$txtFileCreated1$dateInput'] =\ + '{d.month}/{d.day}/{d.year}'.format(d=creation_date) + + payload['ctl00_ContentPlaceHolder1_txtFileCreated1_dateInput_ClientState'] =\ + '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 + d=creation_date) + + return payload \ No newline at end of file diff --git a/src/legistar/ui/events.py b/src/legistar/ui/events.py new file mode 100644 index 0000000..34e5abb --- /dev/null +++ b/src/legistar/ui/events.py @@ -0,0 +1,169 @@ +import time +import datetime +from collections import deque +import esprima + +import pytz +import icalendar + +from .base import LegistarScraper + + +class LegistarEventsScraper(LegistarScraper): + ECOMMENT_JS_URLS = ( + 'https://metro.granicusideas.com/meetings.js', + 'https://metro.granicusideas.com/meetings.js?scope=past' + ) + + def __init__(self, *args, event_info_key='Meeting Details', **kwargs): + super().__init__(*args, **kwargs) + self.event_info_key = event_info_key + + + @property + def ecomment_dict(self): + """ + Parse event IDs and eComment links from JavaScript file with lines like: + activateEcomment('750', '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', 'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'); + """ + if getattr(self, '_ecomment_dict', None) is None: + ecomment_dict = {} + + # Define a callback to apply to each node, e.g., + # 
https://esprima.readthedocs.io/en/latest/syntactic-analysis.html#example-console-calls-removal + def is_activateEcomment(node, metadata): + if node.callee and node.callee.name == 'activateEcomment': + event_id, _, comment_url = node.arguments + ecomment_dict[event_id.value] = comment_url.value + + for url in self.ECOMMENT_JS_URLS: + response = self.get(url) + esprima.parse(response.text, delegate=is_activateEcomment) + + self._ecomment_dict = ecomment_dict + + return self._ecomment_dict + + def event_pages(self, since): + + page = self.lxmlize(self.EVENTSPAGE) + for page in self.event_search(page, since): + yield page + + def should_cache_response(self, response): + # Never cache the top level events page, because that may result in + # expired .NET state values. + return (super().should_cache_response(response) and + response.url != self.EVENTSPAGE) + + def event_search(self, page, since): + payload = self.session_secrets(page) + + payload['ctl00_ContentPlaceHolder1_lstYears_ClientState'] = '{"value":"%s"}' % since + + payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$lstYears' + + return self.pages(self.EVENTSPAGE, payload) + + def events(self, follow_links=True, since=None): + # If an event is added to the the legistar system while we + # are scraping, it will shift the list of events down and + # we might revisit the same event. So, we keep track of + # the last few events we've visited in order to + # make sure we are not revisiting + scraped_events = deque([], maxlen=10) + + current_year = self.now().year + + if since: + if since > current_year: + raise ValueError( + 'Value of :since cannot exceed {}'.format(current_year)) + else: + since_year = since - 1 + + else: + since_year = 0 + + # Anticipate events will be scheduled for the following year to avoid + # missing upcoming events during scrapes near the end of the current + # year. + for year in range(current_year + 1, since_year, -1): + no_events_in_year = True + + for page in self.event_pages(year): + events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0] + for event, _, _ in self.parse_data_table(events_table): + ical_url = event['iCalendar']['url'] + if ical_url in scraped_events: + continue + else: + scraped_events.append(ical_url) + + if follow_links and type(event[self.event_info_key]) == dict: + agenda = self.agenda(event[self.event_info_key]['url']) + else: + agenda = None + + yield event, agenda + no_events_in_year = False + + # We scrape events in reverse chronological order, starting one year + # in the future. Stop scraping if there are no events in a given + # year, unless that year is in the future, because whether events + # have been scheduled in the future is not a reliable indication of + # whether any happened in the previous year. 
+ if no_events_in_year and year <= current_year: + break + + def agenda(self, detail_url): + page = self.lxmlize(detail_url) + + payload = self.session_secrets(page) + + payload.update({"__EVENTARGUMENT": "3:1", + "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain"}) + + for page in self.pages(detail_url, payload): + agenda_table = page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] + agenda = self.parse_data_table(agenda_table) + yield from agenda + + def add_docs(self, e, events, doc_type): + try: + if events[doc_type] != 'Not\xa0available': + e.add_document(note=events[doc_type]['label'], + url=events[doc_type]['url'], + media_type="application/pdf") + except ValueError: + pass + + def extract_roll_call(self, action_detail_url): + action_detail_page = self.lxmlize(action_detail_url) + try: + rollcall_table = action_detail_page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']")[0] + except IndexError: + self.warning("No rollcall found in table") + return [] + roll_call = list(self.parse_data_table(rollcall_table)) + call_list = [] + for call, _, _ in roll_call: + option = call['Attendance'] + call_list.append((option, + call['Person Name']['label'])) + + return call_list + + def ical(self, ical_text): + value = icalendar.Calendar.from_ical(ical_text) + return value + + def _parse_detail(self, key, field_1, field_2): + if key == 'eComment': + return self._get_ecomment_link(field_2) or field_2.text_content().strip() + + def _get_ecomment_link(self, link): + event_id = link.attrib['data-event-id'] + return self.ecomment_dict.get(event_id, None) \ No newline at end of file diff --git a/src/legistar/ui/people.py b/src/legistar/ui/people.py new file mode 100644 index 0000000..be75bce --- /dev/null +++ b/src/legistar/ui/people.py @@ -0,0 +1,45 @@ +from .base import LegistarScraper + + +class LegistarPersonScraper(LegistarScraper): + MEMBERLIST = None + ALL_MEMBERS = None + + def council_members(self, extra_args=None, follow_links=True): + payload = {} + if extra_args: + payload.update(extra_args) + page = self.lxmlize(self.MEMBERLIST, payload) + payload.update(self.session_secrets(page)) + + if self.ALL_MEMBERS: + payload['__EVENTTARGET'] = "ctl00$ContentPlaceHolder1$menuPeople" + payload['__EVENTARGUMENT'] = self.ALL_MEMBERS + + for page in self.pages(self.MEMBERLIST, payload): + table = page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridPeople_ctl00']")[0] + + for councilman, headers, row in self.parse_data_table(table): + if follow_links and type(councilman['Person Name']) == dict: + + detail_url = councilman['Person Name']['url'] + councilman_details = self.lxmlize(detail_url) + detail_div = councilman_details.xpath( + ".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']")[0] + + councilman.update(self.parse_details(detail_div)) + + img = councilman_details.xpath( + "//img[@id='ctl00_ContentPlaceHolder1_imgPhoto']") + if img: + councilman['Photo'] = img[0].get('src') + + committee_table = councilman_details.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridDepartments_ctl00']")[0] + committees = self.parse_data_table(committee_table) + + yield councilman, committees + + else: + yield councilman \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 5be6406..292654f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,8 @@ import pytest -from legistar import base -from legistar.bills import LegistarAPIBillScraper +from src.legistar.api import base +from src.legistar.api.bills import 
LegistarAPIBillScraper @pytest.fixture(scope="module") diff --git a/tests/refresh_fixtures.py b/tests/refresh_fixtures.py index 173f952..23a702f 100644 --- a/tests/refresh_fixtures.py +++ b/tests/refresh_fixtures.py @@ -3,9 +3,9 @@ import lxml -from legistar.bills import LegistarBillScraper -from legistar.events import LegistarEventsScraper -from legistar.people import LegistarPersonScraper +from src.legistar.ui.bills import LegistarBillScraper +from src.legistar.ui.events import LegistarEventsScraper +from src.legistar.ui.people import LegistarPersonScraper def save_page(page, jurisdiction, outfile): @@ -20,7 +20,7 @@ def refresh_bills(jurisdiction): s = LegistarBillScraper() s.LEGISLATION_URL = 'https://{}.legistar.com/Legislation.aspx'.format(jurisdiction) - page = next(s.searchLegislation('bus')) + page = next(s.search_legislation('bus')) save_page(page, jurisdiction, 'bills.html') @@ -29,7 +29,7 @@ def refresh_events(jurisdiction): s = LegistarEventsScraper() s.EVENTSPAGE = 'https://{}.legistar.com/Calendar.aspx'.format(jurisdiction) - page = next(s.eventPages('2018-01-01')) + page = next(s.event_pages('2018-01-01')) save_page(page, jurisdiction, 'events.html') diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 167c0c9..e9df1e3 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -3,9 +3,9 @@ import lxml import pytest -from legistar.bills import LegistarBillScraper -from legistar.events import LegistarEventsScraper -from legistar.people import LegistarPersonScraper +from src.legistar.ui.bills import LegistarBillScraper +from src.legistar.ui.events import LegistarEventsScraper +from src.legistar.ui.people import LegistarPersonScraper @pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) @@ -17,7 +17,7 @@ def test_parse_bills(project_directory, jurisdiction): with open(bills_fixture, 'r') as f: page = lxml.html.fromstring(f.read()) - result = next(scraper.parseSearchResults(page)) + result = next(scraper.parse_search_results(page)) print(result) @@ -30,7 +30,7 @@ def test_parse_events(project_directory, mocker, jurisdiction): with open(events_fixture, 'r') as f: page = lxml.html.fromstring(f.read()) - mocker.patch.object(scraper, 'eventPages', return_value=page) + mocker.patch.object(scraper, 'event_pages', return_value=page) result, _ = next(scraper.events(follow_links=False)) print(result) @@ -45,5 +45,5 @@ def test_parse_people(project_directory, mocker, jurisdiction): with open(events_fixture, 'r') as f: page = lxml.html.fromstring(f.read()) mocker.patch.object(scraper, 'pages', return_value=page) - result = next(scraper.councilMembers(follow_links=False)) + result = next(scraper.council_members(follow_links=False)) print(result) From b64adc31dd5760946a5193b4129a3fc8364c049c Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 11:50:07 -0500 Subject: [PATCH 2/8] Define field_key before scoped usage --- src/legistar/ui/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/legistar/ui/base.py b/src/legistar/ui/base.py index 64fd23e..c5588f6 100644 --- a/src/legistar/ui/base.py +++ b/src/legistar/ui/base.py @@ -12,6 +12,14 @@ import pytz +def field_key(x): + field_id = x.attrib['id'] + field = re.split(r'hyp|lbl|Label', field_id)[-1] + field = field.split('Prompt')[0] + field = field.rstrip('X21') + return field + + class LegistarSession(requests.Session): def request(self, method, url, **kwargs): @@ -271,11 +279,3 @@ def accept_response(self, response, **kwargs): if 
response.status_code == 410: return True return super().accept_response(response, **kwargs) - - -def field_key(x): - field_id = x.attrib['id'] - field = re.split(r'hyp|lbl|Label', field_id)[-1] - field = field.split('Prompt')[0] - field = field.rstrip('X21') - return field From 152cbc7d317e14f36ee13a13a360e23bfc0e8452 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 11:51:46 -0500 Subject: [PATCH 3/8] Prefix field_key method --- src/legistar/ui/base.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/legistar/ui/base.py b/src/legistar/ui/base.py index c5588f6..1b4e139 100644 --- a/src/legistar/ui/base.py +++ b/src/legistar/ui/base.py @@ -12,14 +12,6 @@ import pytz -def field_key(x): - field_id = x.attrib['id'] - field = re.split(r'hyp|lbl|Label', field_id)[-1] - field = field.split('Prompt')[0] - field = field.rstrip('X21') - return field - - class LegistarSession(requests.Session): def request(self, method, url, **kwargs): @@ -137,7 +129,7 @@ def parse_details(self, detail_div): details = {} - for field_key, field in itertools.groupby(fields, field_key): + for field_key, field in itertools.groupby(fields, _field_key): field = list(field) field_1, field_2 = field[0], field[-1] @@ -279,3 +271,11 @@ def accept_response(self, response, **kwargs): if response.status_code == 410: return True return super().accept_response(response, **kwargs) + + +def _field_key(x): + field_id = x.attrib['id'] + field = re.split(r'hyp|lbl|Label', field_id)[-1] + field = field.split('Prompt')[0] + field = field.rstrip('X21') + return field From 82be2ece8abc1f971cb221fa9df2fcb57f3048da Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 11:56:30 -0500 Subject: [PATCH 4/8] Add flake8 pyproject plug-in --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f724041..0758a5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dev = [ "pytest", "pytest-mock", "requests-mock", - "flake8" + "flake8", + "flake8-pyproject" ] test = [ "pytest", From 26815d3d64ffe29e8284fc710d532d4ff08207d5 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 12:07:54 -0500 Subject: [PATCH 5/8] Address linting failures --- src/legistar/__init__.py | 2 +- src/legistar/api/base.py | 6 +++-- src/legistar/api/bills.py | 54 ++++++++++++++++++++++++-------------- src/legistar/api/events.py | 16 +++++------ src/legistar/api/people.py | 8 +++--- src/legistar/ui/base.py | 15 ++++++----- src/legistar/ui/bills.py | 13 +++++---- src/legistar/ui/events.py | 2 +- src/legistar/ui/people.py | 2 +- tests/refresh_fixtures.py | 5 +++- tests/test_parsing.py | 14 +++++++--- 11 files changed, 84 insertions(+), 53 deletions(-) diff --git a/src/legistar/__init__.py b/src/legistar/__init__.py index 2bd87a9..1559bab 100644 --- a/src/legistar/__init__.py +++ b/src/legistar/__init__.py @@ -1 +1 @@ -__version__ = '0.0.1' # pragma: no cover \ No newline at end of file +__version__ = '0.0.1' # pragma: no cover diff --git a/src/legistar/api/base.py b/src/legistar/api/base.py index 5110e96..15e8f7d 100644 --- a/src/legistar/api/base.py +++ b/src/legistar/api/base.py @@ -44,10 +44,12 @@ def search(self, route, item_key, search_conditions): This is necessary for proper pagination. 
examples might be MatterId or EventId search_conditions -- a string in the OData format for the - your search conditions http://www.odata.org/documentation/odata-version-3-0/url-conventions/#url5.1.2 + your search conditions + http://www.odata.org/documentation/odata-version-3-0/url-conventions/#url5.1.2 It would be nice if we could provide a - friendly search API. Something like https://github.com/tuomur/python-odata + friendly search API. Something like + https://github.com/tuomur/python-odata Examples: diff --git a/src/legistar/api/bills.py b/src/legistar/api/bills.py index 3c070a4..ae319b6 100644 --- a/src/legistar/api/bills.py +++ b/src/legistar/api/bills.py @@ -12,7 +12,8 @@ def __init__(self, *args, **kwargs): ''' Initialize the Bill scraper with a `scrape_restricted` property. Do not collect private bills (i.e., bills with 'MatterRestrictViewViaWeb' - set as True in the API), unless the scrapers have access to them, e.g., via a token. + set as True in the API), unless the scrapers have access to them, + e.g., via a token. ''' super().__init__(*args, **kwargs) @@ -34,8 +35,8 @@ def matters(self, since_datetime=None): 'MatterDate1', 'MatterDate2', # 'MatterEXDate1', # can't use all 17 search - # terms, this one always - # seems to be not set + # terms, this one always + # seems to be not set 'MatterEXDate2', 'MatterEXDate3', 'MatterEXDate4', @@ -147,12 +148,14 @@ def votes(self, history_id): def history(self, matter_id): actions = self.endpoint('/matters/{0}/histories', matter_id) for action in actions: - action['MatterHistoryActionName'] = action['MatterHistoryActionName'].strip() + action['MatterHistoryActionName'] = ( + action['MatterHistoryActionName'].strip() + ) actions = sorted((action for action in actions - if (action['MatterHistoryActionDate'] and - action['MatterHistoryActionName'] and - action['MatterHistoryActionBodyName'])), + if (action['MatterHistoryActionDate'] + and action['MatterHistoryActionName'] + and action['MatterHistoryActionBodyName'])), key=lambda action: action['MatterHistoryActionDate']) # sometimes there are exact duplicates of actions. while this @@ -172,11 +175,14 @@ def history(self, matter_id): uniq_actions.append(action) previous_key = current_key else: - self.warning('"{0} by {1}" appears more than once in {2}/matters/{3}/histories. Duplicate actions have been removed.'.format( - current_key[0], - current_key[1], - self.BASE_URL, - matter_id)) + self.warning( + '"{0} by {1}" appears more than once in ' + '{2}/matters/{3}/histories. Duplicate actions have been ' + 'removed.'.format( + current_key[0], + current_key[1], + self.BASE_URL, + matter_id)) return uniq_actions @@ -242,7 +248,8 @@ def _filter_relations(self, relations): def text(self, matter_id, latest_version_value=None): '''Historically, we have determined the latest version of a bill - by finding the version with the highest value (either numerical or alphabetical). + by finding the version with the highest value (either numerical + or alphabetical). However, the `MatterVersion` field on the matter detail page most accurately identifies the latest version of a bill. @@ -301,7 +308,11 @@ def legislation_detail_url(self, matter_id): # If the status code is anything but a 200 or 302, something is wrong. # Raise an HTTPError to interrupt the scrape. 
else: - self.error('{0} returned an unexpected status code: {1}'.format(gateway_url, response.status_code)) + self.error( + '{0} returned an unexpected status code: {1}'.format( + gateway_url, response.status_code + ) + ) response.status_code = 500 raise scrapelib.HTTPError(response) @@ -312,8 +323,13 @@ def _missing_votes(self, response): see `accept_response` for more discussion of why we are doing this. ''' - missing = (response.status_code == 500 and - response.json().get('InnerException', {}).get('ExceptionMessage', '') == "The cast to value type 'System.Int32' failed because the materialized value is null. Either the result type's generic parameter or the query must use a nullable type.") # noqa : 501 + missing = (response.status_code == 500 + and response.json().get('InnerException', {}).get( + 'ExceptionMessage', '') == ( + "The cast to value type 'System.Int32' failed because the " + "materialized value is null. Either the result type's " + "generic parameter or the query must use a nullable type." + )) return missing def accept_response(self, response, **kwargs): @@ -331,7 +347,7 @@ def accept_response(self, response, **kwargs): we short circuit scrapelib's retry mechanism for this particular error. ''' - accept = (super().accept_response(response) or - self._missing_votes(response) or - response.status_code <= 403) + accept = (super().accept_response(response) + or self._missing_votes(response) + or response.status_code <= 403) return accept diff --git a/src/legistar/api/events.py b/src/legistar/api/events.py index 3d76189..2860114 100644 --- a/src/legistar/api/events.py +++ b/src/legistar/api/events.py @@ -118,16 +118,16 @@ def event(self, api_event): ) def agenda(self, event): - agenda_url = (self.BASE_URL + - '/events/{}/eventitems'.format(event['EventId'])) + agenda_url = (self.BASE_URL + + '/events/{}/eventitems'.format(event['EventId'])) response = self.get(agenda_url) # If an event item does not have a value for # EventItemAgendaSequence, it is not on the agenda filtered_items = (item for item in response.json() - if (item['EventItemTitle'] and - item['EventItemAgendaSequence'])) + if (item['EventItemTitle'] + and item['EventItemAgendaSequence'])) sorted_items = sorted(filtered_items, key=lambda item: item['EventItemAgendaSequence']) @@ -136,16 +136,16 @@ def agenda(self, event): yield item def minutes(self, event): - minutes_url = (self.BASE_URL + - '/events/{}/eventitems'.format(event['EventId'])) + minutes_url = (self.BASE_URL + + '/events/{}/eventitems'.format(event['EventId'])) response = self.get(minutes_url) # If an event item does not have a value for # EventItemMinutesSequence, it is not in the minutes filtered_items = (item for item in response.json() - if (item['EventItemTitle'] and - item['EventItemMinutesSequence'])) + if (item['EventItemTitle'] + and item['EventItemMinutesSequence'])) sorted_items = sorted(filtered_items, key=lambda item: item['EventItemMinutesSequence']) diff --git a/src/legistar/api/people.py b/src/legistar/api/people.py index 36a41f6..5f3aa39 100644 --- a/src/legistar/api/people.py +++ b/src/legistar/api/people.py @@ -22,8 +22,8 @@ def bodies(self): def body_offices(self, body): body_id = body['BodyId'] - offices_url = (self.BASE_URL + - '/bodies/{}/OfficeRecords'.format(body_id)) + offices_url = (self.BASE_URL + + '/bodies/{}/OfficeRecords'.format(body_id)) for office in self.pages(offices_url, item_key="OfficeRecordId"): yield office @@ -32,8 +32,8 @@ def to_date(self, text): return self.to_time(text).date() def 
person_sources_from_office(self, office): - person_api_url = (self.BASE_URL + - '/persons/{OfficeRecordPersonId}'.format(**office)) + person_api_url = (self.BASE_URL + + '/persons/{OfficeRecordPersonId}'.format(**office)) response = self.get(person_api_url) diff --git a/src/legistar/ui/base.py b/src/legistar/ui/base.py index 1b4e139..0eca4df 100644 --- a/src/legistar/ui/base.py +++ b/src/legistar/ui/base.py @@ -67,8 +67,8 @@ def _range_error(self, response, payload): def _range_is_all(self, payload): range_var = 'ctl00_ContentPlaceHolder1_lstYears_ClientState' - all_range = (range_var in payload and - json.loads(payload[range_var])['value'] == 'All') + all_range = (range_var in payload + and json.loads(payload[range_var])['value'] == 'All') return all_range @@ -185,7 +185,8 @@ def parse_data_table(self, table): if field.find('.//a') is not None: address = self._get_link_address(field.find('.//a')) if address: - if key.strip() in ['', 'ics'] and 'View.ashx?M=IC' in address: + if (key.strip() in ['', 'ics'] + and 'View.ashx?M=IC' in address): key = 'iCalendar' value = {'url': address} else: @@ -210,10 +211,10 @@ def _get_link_address(self, link): url = None if 'onclick' in link.attrib: onclick = link.attrib['onclick'] - if (onclick is not None and - onclick.startswith(("radopen('", - "window.open", - "OpenTelerikWindow"))): + if (onclick is not None + and onclick.startswith(("radopen('", + "window.open", + "OpenTelerikWindow"))): onclick_path = onclick.split("'")[1] if not onclick_path.startswith("/"): onclick_path = "/" + onclick_path diff --git a/src/legistar/ui/bills.py b/src/legistar/ui/bills.py index d41aa79..ce53bf9 100644 --- a/src/legistar/ui/bills.py +++ b/src/legistar/ui/bills.py @@ -15,14 +15,14 @@ def legislation(self, search_text='', created_after=None, scraped_leg = deque([], maxlen=10) for page in self.search_legislation(search_text, created_after, - created_before): + created_before): for legislation_summary in self.parse_search_results(page): if not legislation_summary['url'] in scraped_leg: yield legislation_summary scraped_leg.append(legislation_summary['url']) def search_legislation(self, search_text='', created_after=None, - created_before=None): + created_before=None): """ Submit a search query on the legislation search page, and return a list of summary results. 
@@ -53,7 +53,8 @@ def search_legislation(self, search_text='', created_after=None, # Return up to one million search results payload['ctl00_ContentPlaceHolder1_lstMax_ClientState'] = '{"value":"1000000"}' - payload['ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState'] = '{"value":"All"}' + payload['ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState'] = ( + '{"value":"All"}') payload['ctl00$ContentPlaceHolder1$btnSearch'] = 'Search Legislation' payload.update(self.session_secrets(page)) @@ -98,7 +99,9 @@ def _advancedSearch(self, page): page = self.lxmlize(self.LEGISLATION_URL, payload) - if 'simple search' not in page.xpath("//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0].value.lower(): + search_button = page.xpath( + "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0] + if 'simple search' not in search_button.value.lower(): raise ValueError('Not on the advanced search page') return page @@ -207,4 +210,4 @@ def date_bound(creation_date): '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 d=creation_date) - return payload \ No newline at end of file + return payload diff --git a/src/legistar/ui/events.py b/src/legistar/ui/events.py index 34e5abb..3d12089 100644 --- a/src/legistar/ui/events.py +++ b/src/legistar/ui/events.py @@ -166,4 +166,4 @@ def _parse_detail(self, key, field_1, field_2): def _get_ecomment_link(self, link): event_id = link.attrib['data-event-id'] - return self.ecomment_dict.get(event_id, None) \ No newline at end of file + return self.ecomment_dict.get(event_id, None) diff --git a/src/legistar/ui/people.py b/src/legistar/ui/people.py index be75bce..baeb2c7 100644 --- a/src/legistar/ui/people.py +++ b/src/legistar/ui/people.py @@ -42,4 +42,4 @@ def council_members(self, extra_args=None, follow_links=True): yield councilman, committees else: - yield councilman \ No newline at end of file + yield councilman diff --git a/tests/refresh_fixtures.py b/tests/refresh_fixtures.py index 23a702f..a315e5d 100644 --- a/tests/refresh_fixtures.py +++ b/tests/refresh_fixtures.py @@ -12,7 +12,10 @@ def save_page(page, jurisdiction, outfile): test_directory = os.path.abspath(os.path.dirname(__file__)) project_directory = os.path.join(test_directory, '..') - with open(os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, outfile), 'wb') as f: + fixture_path = os.path.join( + project_directory, 'tests', 'fixtures', jurisdiction, outfile + ) + with open(fixture_path, 'wb') as f: f.write(lxml.html.tostring(page)) diff --git a/tests/test_parsing.py b/tests/test_parsing.py index e9df1e3..804885d 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -10,7 +10,9 @@ @pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) def test_parse_bills(project_directory, jurisdiction): - bills_fixture = os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, 'bills.html') + bills_fixture = os.path.join( + project_directory, 'tests', 'fixtures', jurisdiction, 'bills.html' + ) scraper = LegistarBillScraper() scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) @@ -23,7 +25,9 @@ def test_parse_bills(project_directory, jurisdiction): @pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) def test_parse_events(project_directory, mocker, jurisdiction): - events_fixture = 
os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, 'events.html') + events_fixture = os.path.join( + project_directory, 'tests', 'fixtures', jurisdiction, 'events.html' + ) scraper = LegistarEventsScraper() scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) @@ -37,12 +41,14 @@ def test_parse_events(project_directory, mocker, jurisdiction): @pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) def test_parse_people(project_directory, mocker, jurisdiction): - events_fixture = os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, 'people.html') + people_fixture = os.path.join( + project_directory, 'tests', 'fixtures', jurisdiction, 'people.html' + ) scraper = LegistarPersonScraper() scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) - with open(events_fixture, 'r') as f: + with open(people_fixture, 'r') as f: page = lxml.html.fromstring(f.read()) mocker.patch.object(scraper, 'pages', return_value=page) result = next(scraper.council_members(follow_links=False)) From 2bded462e0ad3918df251269869d8560d6e5e8a5 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 14:23:42 -0500 Subject: [PATCH 6/8] flake8/Black --- pyproject.toml | 3 +- src/legistar/__init__.py | 2 +- src/legistar/api/base.py | 20 ++-- src/legistar/api/bills.py | 237 +++++++++++++++++++------------------ src/legistar/api/events.py | 133 +++++++++++---------- src/legistar/api/people.py | 24 ++-- src/legistar/ui/base.py | 169 ++++++++++++++------------ src/legistar/ui/bills.py | 127 +++++++++++--------- src/legistar/ui/events.py | 66 ++++++----- src/legistar/ui/people.py | 22 ++-- tests/conftest.py | 26 ++-- tests/refresh_fixtures.py | 28 ++--- tests/test_bills.py | 42 ++++--- tests/test_parsing.py | 28 ++--- tests/test_search.py | 8 +- 15 files changed, 497 insertions(+), 438 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0758a5b..3c02f45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,8 @@ dev = [ "pytest-mock", "requests-mock", "flake8", - "flake8-pyproject" + "flake8-pyproject", + "black", ] test = [ "pytest", diff --git a/src/legistar/__init__.py b/src/legistar/__init__.py index 1559bab..1e09a73 100644 --- a/src/legistar/__init__.py +++ b/src/legistar/__init__.py @@ -1 +1 @@ -__version__ = '0.0.1' # pragma: no cover +__version__ = "0.0.1" # pragma: no cover diff --git a/src/legistar/api/base.py b/src/legistar/api/base.py index 15e8f7d..1292163 100644 --- a/src/legistar/api/base.py +++ b/src/legistar/api/base.py @@ -8,9 +8,9 @@ class LegistarAPIScraper(scrapelib.Scraper): - date_format = '%Y-%m-%dT%H:%M:%S' - time_string_format = '%I:%M %p' - utc_timestamp_format = '%Y-%m-%dT%H:%M:%S.%f' + date_format = "%Y-%m-%dT%H:%M:%S" + time_string_format = "%I:%M %p" + utc_timestamp_format = "%Y-%m-%dT%H:%M:%S.%f" def __init__(self, *args, **kwargs): super(LegistarAPIScraper, self).__init__(*args, **kwargs) @@ -26,11 +26,11 @@ def to_utc_timestamp(self, text): try: time = datetime.datetime.strptime(text, self.utc_timestamp_format) except ValueError as e: - if 'does not match format' in str(e): + if "does not match format" in str(e): time = datetime.datetime.strptime(text, self.date_format) else: raise - time = pytz.timezone('UTC').localize(time) + time = pytz.timezone("UTC").localize(time) return time def search(self, route, item_key, search_conditions): @@ -59,15 +59,13 @@ def search(self, route, item_key, search_conditions): search_url = self.BASE_URL + route - params = {'$filter': search_conditions} + params = {"$filter": 
search_conditions} try: - yield from self.pages(search_url, - params=params, - item_key=item_key) + yield from self.pages(search_url, params=params, item_key=item_key) except requests.HTTPError as e: if e.response.status_code == 400: - raise ValueError(e.response.json()['Message']) + raise ValueError(e.response.json()["Message"]) if not self.accept_response(e.response): raise @@ -80,7 +78,7 @@ def pages(self, url, params=None, item_key=None): page_num = 0 response = None while page_num == 0 or len(response.json()) == 1000: - params['$skip'] = page_num * 1000 + params["$skip"] = page_num * 1000 response = self.get(url, params=params) response.raise_for_status() diff --git a/src/legistar/api/bills.py b/src/legistar/api/bills.py index ae319b6..408b4c0 100644 --- a/src/legistar/api/bills.py +++ b/src/legistar/api/bills.py @@ -9,12 +9,12 @@ class LegistarAPIBillScraper(LegistarAPIScraper): def __init__(self, *args, **kwargs): - ''' + """ Initialize the Bill scraper with a `scrape_restricted` property. Do not collect private bills (i.e., bills with 'MatterRestrictViewViaWeb' set as True in the API), unless the scrapers have access to them, e.g., via a token. - ''' + """ super().__init__(*args, **kwargs) self.scrape_restricted = False @@ -24,63 +24,63 @@ def matters(self, since_datetime=None): # scraping jobs easier because upon a scrape failure we can # import everything scraped and then scrape everything newer # then the last bill we scraped - params = {'$orderby': 'MatterLastModifiedUtc'} + params = {"$orderby": "MatterLastModifiedUtc"} if since_datetime: since_iso = since_datetime.isoformat() - update_fields = ('MatterLastModifiedUtc', - 'MatterIntroDate', - 'MatterPassedDate', - 'MatterDate1', - 'MatterDate2', - # 'MatterEXDate1', # can't use all 17 search - # terms, this one always - # seems to be not set - 'MatterEXDate2', - 'MatterEXDate3', - 'MatterEXDate4', - 'MatterEXDate5', - 'MatterEXDate6', - 'MatterEXDate7', - 'MatterEXDate8', - 'MatterEXDate9', - 'MatterEXDate10', - 'MatterEnactmentDate', - 'MatterAgendaDate') + update_fields = ( + "MatterLastModifiedUtc", + "MatterIntroDate", + "MatterPassedDate", + "MatterDate1", + "MatterDate2", + # 'MatterEXDate1', # can't use all 17 search + # terms, this one always + # seems to be not set + "MatterEXDate2", + "MatterEXDate3", + "MatterEXDate4", + "MatterEXDate5", + "MatterEXDate6", + "MatterEXDate7", + "MatterEXDate8", + "MatterEXDate9", + "MatterEXDate10", + "MatterEnactmentDate", + "MatterAgendaDate", + ) since_fmt = "{field} gt datetime'{since_datetime}'" - since_filter =\ - ' or '.join(since_fmt.format(field=field, - since_datetime=since_iso) - for field in update_fields) + since_filter = " or ".join( + since_fmt.format(field=field, since_datetime=since_iso) + for field in update_fields + ) - params['$filter'] = since_filter + params["$filter"] = since_filter - matters_url = self.BASE_URL + '/matters' + matters_url = self.BASE_URL + "/matters" - for matter in self.pages(matters_url, - params=params, - item_key="MatterId"): + for matter in self.pages(matters_url, params=params, item_key="MatterId"): try: - legistar_url = self.legislation_detail_url(matter['MatterId']) + legistar_url = self.legislation_detail_url(matter["MatterId"]) except scrapelib.HTTPError as e: if e.response.status_code > 403: raise - url = matters_url + '/{}'.format(matter['MatterId']) - self.warning('Bill could not be found in web interface: {}'.format(url)) + url = matters_url + "/{}".format(matter["MatterId"]) + self.warning("Bill could not be found in web 
interface: {}".format(url)) if not self.scrape_restricted: continue else: - matter['legistar_url'] = legistar_url + matter["legistar_url"] = legistar_url yield matter def matter(self, matter_id): - matter = self.endpoint('/matters/{}', matter_id) + matter = self.endpoint("/matters/{}", matter_id) try: legistar_url = self.legislation_detail_url(matter_id) @@ -88,13 +88,13 @@ def matter(self, matter_id): if e.response.status_code > 403: raise - url = self.BASE_URL + '/matters/{}'.format(matter_id) - self.warning('Bill could not be found in web interface: {}'.format(url)) + url = self.BASE_URL + "/matters/{}".format(matter_id) + self.warning("Bill could not be found in web interface: {}".format(url)) if not self.scrape_restricted: return None else: - matter['legistar_url'] = legistar_url + matter["legistar_url"] = legistar_url return matter @@ -103,26 +103,24 @@ def endpoint(self, route, *args): response = self.get(url.format(*args)) return response.json() - code_sections = partialmethod(endpoint, 'matters/{0}/codesections') + code_sections = partialmethod(endpoint, "matters/{0}/codesections") def topics(self, *args, **kwargs): if args: - return self.endpoint('/matters/{0}/indexes', *args) + return self.endpoint("/matters/{0}/indexes", *args) else: - matter_indexes_url = self.BASE_URL + '/indexes' - return self.pages(matter_indexes_url, - params=kwargs, - item_key="IndexId") + matter_indexes_url = self.BASE_URL + "/indexes" + return self.pages(matter_indexes_url, params=kwargs, item_key="IndexId") def attachments(self, matter_id): - attachments = self.endpoint('/matters/{0}/attachments', matter_id) + attachments = self.endpoint("/matters/{0}/attachments", matter_id) unique_attachments = [] scraped_urls = set() # Handle matters with duplicate attachments. for attachment in attachments: - url = attachment['MatterAttachmentHyperlink'] + url = attachment["MatterAttachmentHyperlink"] if url not in scraped_urls: unique_attachments.append(attachment) scraped_urls.add(url) @@ -130,7 +128,7 @@ def attachments(self, matter_id): return unique_attachments def votes(self, history_id): - url = self.BASE_URL + '/eventitems/{0}/votes'.format(history_id) + url = self.BASE_URL + "/eventitems/{0}/votes".format(history_id) try: response = self.get(url) @@ -146,17 +144,24 @@ def votes(self, history_id): return response.json() def history(self, matter_id): - actions = self.endpoint('/matters/{0}/histories', matter_id) + actions = self.endpoint("/matters/{0}/histories", matter_id) for action in actions: - action['MatterHistoryActionName'] = ( - action['MatterHistoryActionName'].strip() - ) - - actions = sorted((action for action in actions - if (action['MatterHistoryActionDate'] - and action['MatterHistoryActionName'] - and action['MatterHistoryActionBodyName'])), - key=lambda action: action['MatterHistoryActionDate']) + action["MatterHistoryActionName"] = action[ + "MatterHistoryActionName" + ].strip() + + actions = sorted( + ( + action + for action in actions + if ( + action["MatterHistoryActionDate"] + and action["MatterHistoryActionName"] + and action["MatterHistoryActionBodyName"] + ) + ), + key=lambda action: action["MatterHistoryActionDate"], + ) # sometimes there are exact duplicates of actions. 
while this # is a a data entry problem that ideally the source system @@ -169,50 +174,53 @@ def history(self, matter_id): for action in actions: # these are the attributes that pupa uses for # checking for duplicate vote events - current_key = (action['MatterHistoryActionName'], - action['MatterHistoryActionBodyName']) + current_key = ( + action["MatterHistoryActionName"], + action["MatterHistoryActionBodyName"], + ) if current_key != previous_key: uniq_actions.append(action) previous_key = current_key else: self.warning( '"{0} by {1}" appears more than once in ' - '{2}/matters/{3}/histories. Duplicate actions have been ' - 'removed.'.format( - current_key[0], - current_key[1], - self.BASE_URL, - matter_id)) + "{2}/matters/{3}/histories. Duplicate actions have been " + "removed.".format( + current_key[0], current_key[1], self.BASE_URL, matter_id + ) + ) return uniq_actions def sponsors(self, matter_id): - spons = self.endpoint('/matters/{0}/sponsors', matter_id) + spons = self.endpoint("/matters/{0}/sponsors", matter_id) if spons: max_version = max( - (sponsor['MatterSponsorMatterVersion'] for sponsor in spons), - key=lambda version: self._version_rank(version) + (sponsor["MatterSponsorMatterVersion"] for sponsor in spons), + key=lambda version: self._version_rank(version), ) - spons = [sponsor for sponsor in spons - if sponsor['MatterSponsorMatterVersion'] == str(max_version)] + spons = [ + sponsor + for sponsor in spons + if sponsor["MatterSponsorMatterVersion"] == str(max_version) + ] - return sorted(spons, - key=lambda sponsor: sponsor["MatterSponsorSequence"]) + return sorted(spons, key=lambda sponsor: sponsor["MatterSponsorSequence"]) else: return [] def _version_rank(self, version): - ''' + """ In general, matter versions are numbers. This method provides an override opportunity for handling versions that are not numbers. - ''' + """ return int(version) def relations(self, matter_id): - relations = self.endpoint('/matters/{0}/relations', matter_id) + relations = self.endpoint("/matters/{0}/relations", matter_id) if relations: return self._filter_relations(relations) @@ -221,33 +229,30 @@ def relations(self, matter_id): return [] def _filter_relations(self, relations): - ''' + """ Sometimes, many versions of a bill are related. This method returns the most recent version of each relation. Override this method to apply a different filter or return the full array of relations. - ''' + """ # Sort relations such that the latest version of each matter # ID is returned first. sorted_relations = sorted( relations, - key=lambda x: ( - x['MatterRelationMatterId'], - x['MatterRelationFlag'] - ), - reverse=True + key=lambda x: (x["MatterRelationMatterId"], x["MatterRelationFlag"]), + reverse=True, ) seen_relations = set() for relation in sorted_relations: - relation_id = relation['MatterRelationMatterId'] + relation_id = relation["MatterRelationMatterId"] if relation_id not in seen_relations: yield relation seen_relations.add(relation_id) def text(self, matter_id, latest_version_value=None): - '''Historically, we have determined the latest version of a bill + """Historically, we have determined the latest version of a bill by finding the version with the highest value (either numerical or alphabetical). @@ -257,30 +262,29 @@ def text(self, matter_id, latest_version_value=None): Other municipalities may share this characteristic with Metro. 
Until we know more, the `text` function accepts `latest_version_value`, - i.e., matter['MatterVersion'], as an optional argument.''' + i.e., matter['MatterVersion'], as an optional argument.""" - version_route = '/matters/{0}/versions' - text_route = '/matters/{0}/texts/{1}' + version_route = "/matters/{0}/versions" + text_route = "/matters/{0}/texts/{1}" versions = self.endpoint(version_route, matter_id) if latest_version_value: latest_version = next( - version for version - in versions - if version['Value'] == latest_version_value) + version + for version in versions + if version["Value"] == latest_version_value + ) else: - latest_version = max( - versions, key=lambda x: self._version_rank(x['Value'])) + latest_version = max(versions, key=lambda x: self._version_rank(x["Value"])) - text_url = self.BASE_URL + \ - text_route.format(matter_id, latest_version['Key']) + text_url = self.BASE_URL + text_route.format(matter_id, latest_version["Key"]) response = self.get(text_url, stream=True) - if int(response.headers['Content-Length']) < 21052630: + if int(response.headers["Content-Length"]) < 21052630: return response.json() def legislation_detail_url(self, matter_id): - gateway_url = self.BASE_WEB_URL + '/gateway.aspx?m=l&id={0}'.format(matter_id) + gateway_url = self.BASE_WEB_URL + "/gateway.aspx?m=l&id={0}".format(matter_id) # We want to supress any session level params for this head request, # since they could lead to an additonal level of redirect. @@ -288,15 +292,12 @@ def legislation_detail_url(self, matter_id): # Per # http://docs.python-requests.org/en/master/user/advanced/, we # have to do this by setting session level params to None - response = self.head( - gateway_url, - params={k: None for k in self.params} - ) + response = self.head(gateway_url, params={k: None for k in self.params}) # If the gateway URL redirects, the matter is publicly viewable. Grab # its detail URL from the response headers. if response.status_code == 302: - legislation_detail_route = response.headers['Location'] + legislation_detail_route = response.headers["Location"] return urljoin(self.BASE_WEB_URL, legislation_detail_route) # If the gateway URL returns a 200, it has not redirected, i.e., the @@ -309,7 +310,7 @@ def legislation_detail_url(self, matter_id): # Raise an HTTPError to interrupt the scrape. else: self.error( - '{0} returned an unexpected status code: {1}'.format( + "{0} returned an unexpected status code: {1}".format( gateway_url, response.status_code ) ) @@ -317,23 +318,23 @@ def legislation_detail_url(self, matter_id): raise scrapelib.HTTPError(response) def _missing_votes(self, response): - ''' + """ Check to see if a response has the particular status code and error message that corresponds to inaccessible eventitem votes. see `accept_response` for more discussion of why we are doing this. - ''' - missing = (response.status_code == 500 - and response.json().get('InnerException', {}).get( - 'ExceptionMessage', '') == ( - "The cast to value type 'System.Int32' failed because the " - "materialized value is null. Either the result type's " - "generic parameter or the query must use a nullable type." - )) + """ + missing = response.status_code == 500 and response.json().get( + "InnerException", {} + ).get("ExceptionMessage", "") == ( + "The cast to value type 'System.Int32' failed because the " + "materialized value is null. Either the result type's " + "generic parameter or the query must use a nullable type." 
+ ) return missing def accept_response(self, response, **kwargs): - ''' + """ Sometimes there ought to be votes on an eventitem but when we visit the votes page, the API returns a 500 status code and a particular error message. @@ -346,8 +347,10 @@ def accept_response(self, response, **kwargs): cases, it would really slow down the scraping. To avoid that we short circuit scrapelib's retry mechanism for this particular error. - ''' - accept = (super().accept_response(response) - or self._missing_votes(response) - or response.status_code <= 403) + """ + accept = ( + super().accept_response(response) + or self._missing_votes(response) + or response.status_code <= 403 + ) return accept diff --git a/src/legistar/api/events.py b/src/legistar/api/events.py index 2860114..689a520 100644 --- a/src/legistar/api/events.py +++ b/src/legistar/api/events.py @@ -20,7 +20,8 @@ def __init__(self, *args, **kwargs): def _init_webscraper(self): webscraper = self.webscraper_class( requests_per_minute=self.requests_per_minute, - retry_attempts=self.WEB_RETRY_EVENTS) + retry_attempts=self.WEB_RETRY_EVENTS, + ) if self.cache_storage: webscraper.cache_storage = self.cache_storage @@ -31,7 +32,7 @@ def _init_webscraper(self): webscraper.EVENTSPAGE = self.EVENTSPAGE webscraper.BASE_URL = self.WEB_URL webscraper.TIMEZONE = self.TIMEZONE - webscraper.date_format = '%m/%d/%Y' + webscraper.date_format = "%m/%d/%Y" return webscraper @@ -44,7 +45,7 @@ def api_events(self, since_datetime=None): # scraping jobs easier because upon a scrape failure we can # import everything scraped and then scrape everything newer # then the last event we scraped - params = {'$orderby': 'EventLastModifiedUtc'} + params = {"$orderby": "EventLastModifiedUtc"} if since_datetime: # We include events three days before the given start date @@ -57,24 +58,24 @@ def api_events(self, since_datetime=None): # corresponding event modification. Query all update fields so later # changes are always caught by our scraper, particularly when # scraping narrower windows of time. 
- update_fields = ('EventDate', - 'EventLastModifiedUtc', - 'EventAgendaLastPublishedUTC', - 'EventMinutesLastPublishedUTC') + update_fields = ( + "EventDate", + "EventLastModifiedUtc", + "EventAgendaLastPublishedUTC", + "EventMinutesLastPublishedUTC", + ) since_fmt = "{field} gt datetime'{since_datetime}'" - since_filter =\ - ' or '.join(since_fmt.format(field=field, - since_datetime=since_iso) - for field in update_fields) + since_filter = " or ".join( + since_fmt.format(field=field, since_datetime=since_iso) + for field in update_fields + ) - params['$filter'] = since_filter + params["$filter"] = since_filter - events_url = self.BASE_URL + '/events/' + events_url = self.BASE_URL + "/events/" - yield from self.pages(events_url, - params=params, - item_key="EventId") + yield from self.pages(events_url, params=params, item_key="EventId") def events(self, since_datetime=None): for api_event in self.api_events(since_datetime=since_datetime): @@ -118,43 +119,47 @@ def event(self, api_event): ) def agenda(self, event): - agenda_url = (self.BASE_URL - + '/events/{}/eventitems'.format(event['EventId'])) + agenda_url = self.BASE_URL + "/events/{}/eventitems".format(event["EventId"]) response = self.get(agenda_url) # If an event item does not have a value for # EventItemAgendaSequence, it is not on the agenda - filtered_items = (item for item in response.json() - if (item['EventItemTitle'] - and item['EventItemAgendaSequence'])) - sorted_items = sorted(filtered_items, - key=lambda item: item['EventItemAgendaSequence']) + filtered_items = ( + item + for item in response.json() + if (item["EventItemTitle"] and item["EventItemAgendaSequence"]) + ) + sorted_items = sorted( + filtered_items, key=lambda item: item["EventItemAgendaSequence"] + ) for item in sorted_items: self._suppress_item_matter(item, agenda_url) yield item def minutes(self, event): - minutes_url = (self.BASE_URL - + '/events/{}/eventitems'.format(event['EventId'])) + minutes_url = self.BASE_URL + "/events/{}/eventitems".format(event["EventId"]) response = self.get(minutes_url) # If an event item does not have a value for # EventItemMinutesSequence, it is not in the minutes - filtered_items = (item for item in response.json() - if (item['EventItemTitle'] - and item['EventItemMinutesSequence'])) - sorted_items = sorted(filtered_items, - key=lambda item: item['EventItemMinutesSequence']) + filtered_items = ( + item + for item in response.json() + if (item["EventItemTitle"] and item["EventItemMinutesSequence"]) + ) + sorted_items = sorted( + filtered_items, key=lambda item: item["EventItemMinutesSequence"] + ) for item in sorted_items: self._suppress_item_matter(item, minutes_url) yield item def _suppress_item_matter(self, item, agenda_url): - ''' + """ Agenda items in Legistar do not always display links to associated matter files even if the same agenda item in the API references a Matter File. The agenda items @@ -169,14 +174,15 @@ def _suppress_item_matter(self, item, agenda_url): logic should be used for all Legislative Bodies, this method is currently just a hook for being overridden in particular scrapers. As of now, at least LA Metro uses this hook. 
- ''' + """ pass def rollcalls(self, event): for item in self.agenda(event): - if item['EventItemRollCallFlag']: - rollcall_url = self.BASE_URL + \ - '/eventitems/{}/rollcalls'.format(item['EventItemId']) + if item["EventItemRollCallFlag"]: + rollcall_url = self.BASE_URL + "/eventitems/{}/rollcalls".format( + item["EventItemId"] + ) response = self.get(rollcall_url) @@ -185,24 +191,26 @@ def rollcalls(self, event): def add_docs(self, e, events, doc_type): try: - if events[doc_type] != 'Not\xa0available': - e.add_document(note=events[doc_type]['label'], - url=events[doc_type]['url'], - media_type="application/pdf") + if events[doc_type] != "Not\xa0available": + e.add_document( + note=events[doc_type]["label"], + url=events[doc_type]["url"], + media_type="application/pdf", + ) except ValueError: pass def _event_status(self, event): - '''Events can have a status of tentative, confirmed, cancelled, or + """Events can have a status of tentative, confirmed, cancelled, or passed (http://docs.opencivicdata.org/en/latest/data/event.html). By default, set status to passed if the current date and time exceeds the event date and time, or confirmed otherwise. Available for override in jurisdictional scrapers. - ''' - if datetime.datetime.utcnow().replace(tzinfo=pytz.utc) > event['start']: - status = 'passed' + """ + if datetime.datetime.utcnow().replace(tzinfo=pytz.utc) > event["start"]: + status = "passed" else: - status = 'confirmed' + status = "confirmed" return status @@ -213,11 +221,11 @@ def _get_web_event(self, api_event): return self.web_detail(api_event) def web_detail(self, event): - ''' + """ Grabs the information for an event from the Legistar website and returns as a dictionary. - ''' - insite_url = event['EventInSiteURL'] + """ + insite_url = event["EventInSiteURL"] try: event_page = self._webscraper.lxmlize(insite_url) @@ -234,21 +242,22 @@ def web_detail(self, event): else: raise - div_id = 'ctl00_ContentPlaceHolder1_pageTop1' + div_id = "ctl00_ContentPlaceHolder1_pageTop1" detail_div = event_page.xpath(".//div[@id='%s']" % div_id)[0] event_page_details = self._webscraper.parse_details(detail_div) - event_page_details['Meeting Details'] = {'url': insite_url} + event_page_details["Meeting Details"] = {"url": insite_url} return event_page_details class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase): - ''' + """ There are some inSite sites that have information that only appears event listing page, like NYC's 'Meeting Topic.' This scraper visits the listing page and attempts to zip API and web events together - ''' + """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -269,8 +278,7 @@ def _get_web_event(self, api_event): return self.web_results(api_event) def web_results(self, event): - api_key = (event['EventBodyName'].strip(), - event['start']) + api_key = (event["EventBodyName"].strip(), event["start"]) # Check the cache of events we've already scraped from the web interface # for the API event at hand. @@ -287,33 +295,32 @@ def web_results(self, event): return event def _scrapeWebCalendar(self): - '''Generator yielding events from Legistar in roughly reverse + """Generator yielding events from Legistar in roughly reverse chronological order. 
- ''' + """ for event, _ in self._webscraper.events(follow_links=False): event_key = self._event_key(event, self._webscraper) yield event_key, event def _event_key(self, event, web_scraper): - '''Since Legistar InSite contains more information about events than + """Since Legistar InSite contains more information about events than are available in the API, we need to scrape both. Then, we have to line them up. This method makes a key that should be uniquely identify every event and will allow us to link events from the two data sources. - ''' - response = web_scraper.get(event['iCalendar']['url'], verify=False) - event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt + """ + response = web_scraper.get(event["iCalendar"]["url"], verify=False) + event_time = web_scraper.ical(response.text).subcomponents[0]["DTSTART"].dt event_time = pytz.timezone(self.TIMEZONE).localize(event_time) - key = (event['Name']['label'], - event_time) + key = (event["Name"]["label"], event_time) return key def _not_in_web_interface(self, event): - '''Occasionally, an event will appear in the API, but not in the web + """Occasionally, an event will appear in the API, but not in the web interface. This method checks attributes of the API event that tell us whether the given event is one of those cases, returning True if so, and False otherwise. Available for override in jurisdictional scrapers. - ''' + """ return False diff --git a/src/legistar/api/people.py b/src/legistar/api/people.py index 5f3aa39..c557ce9 100644 --- a/src/legistar/api/people.py +++ b/src/legistar/api/people.py @@ -2,28 +2,29 @@ class LegistarAPIPersonScraper(LegistarAPIScraper): - date_format = '%Y-%m-%dT%H:%M:%S' + date_format = "%Y-%m-%dT%H:%M:%S" def body_types(self): - body_types_url = self.BASE_URL + '/bodytypes/' + body_types_url = self.BASE_URL + "/bodytypes/" response = self.get(body_types_url) - types = {body_type['BodyTypeName']: body_type['BodyTypeId'] - for body_type in response.json()} + types = { + body_type["BodyTypeName"]: body_type["BodyTypeId"] + for body_type in response.json() + } return types def bodies(self): - bodies_url = self.BASE_URL + '/bodies/' + bodies_url = self.BASE_URL + "/bodies/" for body in self.pages(bodies_url, item_key="BodyId"): yield body def body_offices(self, body): - body_id = body['BodyId'] + body_id = body["BodyId"] - offices_url = (self.BASE_URL - + '/bodies/{}/OfficeRecords'.format(body_id)) + offices_url = self.BASE_URL + "/bodies/{}/OfficeRecords".format(body_id) for office in self.pages(offices_url, item_key="OfficeRecordId"): yield office @@ -32,12 +33,13 @@ def to_date(self, text): return self.to_time(text).date() def person_sources_from_office(self, office): - person_api_url = (self.BASE_URL - + '/persons/{OfficeRecordPersonId}'.format(**office)) + person_api_url = self.BASE_URL + "/persons/{OfficeRecordPersonId}".format( + **office + ) response = self.get(person_api_url) - route = '/PersonDetail.aspx?ID={PersonId}&GUID={PersonGuid}' + route = "/PersonDetail.aspx?ID={PersonId}&GUID={PersonGuid}" person_web_url = self.WEB_URL + route.format(**response.json()) return person_api_url, person_web_url diff --git a/src/legistar/ui/base.py b/src/legistar/ui/base.py index 0eca4df..9f8434d 100644 --- a/src/legistar/ui/base.py +++ b/src/legistar/ui/base.py @@ -16,23 +16,23 @@ class LegistarSession(requests.Session): def request(self, method, url, **kwargs): response = super(LegistarSession, self).request(method, url, **kwargs) - payload = kwargs.get('data') + payload = 
kwargs.get("data") self._check_errors(response, payload) return response def _check_errors(self, response, payload): - if response.url.endswith('Error.aspx'): + if response.url.endswith("Error.aspx"): response.status_code = 503 raise scrapelib.HTTPError(response) if not response.text: - if response.request.method.lower() in {'get', 'post'}: + if response.request.method.lower() in {"get", "post"}: response.status_code = 520 raise scrapelib.HTTPError(response) - if 'This record no longer exists. It might have been deleted.' in response.text: + if "This record no longer exists. It might have been deleted." in response.text: response.status_code = 410 raise scrapelib.HTTPError(response) @@ -40,18 +40,19 @@ def _check_errors(self, response, payload): self._range_error(response, payload) def _range_error(self, response, payload): - '''Legistar intermittently does not return the expected response when + """Legistar intermittently does not return the expected response when selecting a time range when searching for events. Right now we are only handling the 'All' range - ''' + """ if self._range_is_all(payload): - expected_range = 'All Years' + expected_range = "All Years" page = lxml.html.fromstring(response.text) - returned_range, = page.xpath( - "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']") + (returned_range,) = page.xpath( + "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']" + ) returned_range = returned_range.value @@ -66,22 +67,23 @@ def _range_error(self, response, payload): raise scrapelib.HTTPError(response) def _range_is_all(self, payload): - range_var = 'ctl00_ContentPlaceHolder1_lstYears_ClientState' - all_range = (range_var in payload - and json.loads(payload[range_var])['value'] == 'All') + range_var = "ctl00_ContentPlaceHolder1_lstYears_ClientState" + all_range = ( + range_var in payload and json.loads(payload[range_var])["value"] == "All" + ) return all_range class LegistarScraper(scrapelib.Scraper, LegistarSession): - date_format = '%m/%d/%Y' + date_format = "%m/%d/%Y" def __init__(self, *args, **kwargs): super(LegistarScraper, self).__init__(*args, **kwargs) def lxmlize(self, url, payload=None): - ''' + """ Gets page and returns as XML - ''' + """ if payload: response = self.post(url, payload, verify=False) else: @@ -96,10 +98,9 @@ def pages(self, url, payload=None): yield page - next_page = page.xpath( - "//a[@class='rgCurrentPage']/following-sibling::a[1]") - if payload and 'ctl00$ContentPlaceHolder1$btnSearch' in payload: - del payload['ctl00$ContentPlaceHolder1$btnSearch'] + next_page = page.xpath("//a[@class='rgCurrentPage']/following-sibling::a[1]") + if payload and "ctl00$ContentPlaceHolder1$btnSearch" in payload: + del payload["ctl00$ContentPlaceHolder1$btnSearch"] while len(next_page) > 0: if payload is None: @@ -107,24 +108,27 @@ def pages(self, url, payload=None): payload.update(self.session_secrets(page)) - event_target = next_page[0].attrib['href'].split("'")[1] + event_target = next_page[0].attrib["href"].split("'")[1] - payload['__EVENTTARGET'] = event_target + payload["__EVENTTARGET"] = event_target page = self.lxmlize(url, payload) yield page next_page = page.xpath( - "//a[@class='rgCurrentPage']/following-sibling::a[1]") + "//a[@class='rgCurrentPage']/following-sibling::a[1]" + ) def parse_details(self, detail_div): """ Parse the data in the top section of a detail page. 
""" - detail_query = ".//*[starts-with(@id, 'ctl00_ContentPlaceHolder1_lbl')"\ - " or starts-with(@id, 'ctl00_ContentPlaceHolder1_hyp')"\ - " or starts-with(@id, 'ctl00_ContentPlaceHolder1_Label')]" + detail_query = ( + ".//*[starts-with(@id, 'ctl00_ContentPlaceHolder1_lbl')" + " or starts-with(@id, 'ctl00_ContentPlaceHolder1_hyp')" + " or starts-with(@id, 'ctl00_ContentPlaceHolder1_Label')]" + ) fields = detail_div.xpath(detail_query) details = {} @@ -133,17 +137,23 @@ def parse_details(self, detail_div): field = list(field) field_1, field_2 = field[0], field[-1] - key = field_1.text_content().replace(':', '').strip() + key = field_1.text_content().replace(":", "").strip() - if field_2.find('.//a') is not None: + if field_2.find(".//a") is not None: value = [] - for link in field_2.xpath('.//a'): - value.append({'label': link.text_content().strip(), - 'url': self._get_link_address(link)}) - - elif 'href' in field_2.attrib: - value = {'label': field_2.text_content().strip(), - 'url': self._get_link_address(field_2)} + for link in field_2.xpath(".//a"): + value.append( + { + "label": link.text_content().strip(), + "url": self._get_link_address(link), + } + ) + + elif "href" in field_2.attrib: + value = { + "label": field_2.text_content().strip(), + "url": self._get_link_address(field_2), + } elif self._parse_detail(key, field_1, field_2): value = self._parse_detail(key, field_1, field_2) @@ -165,62 +175,65 @@ def parse_data_table(self, table): rows = table.xpath(".//tr[@class='rgRow' or @class='rgAltRow']") keys = [] + for header in headers: - text_content = header.text_content().replace(' ', ' ').strip() - inputs = header.xpath('.//input') + text_content = header.text_content().replace(" ", " ").strip() + inputs = header.xpath(".//input") if text_content: keys.append(text_content) elif len(inputs) > 0: - keys.append(header.xpath('.//input')[0].value) + keys.append(header.xpath(".//input")[0].value) else: - keys.append(header.xpath('.//img')[0].get('alt')) + keys.append(header.xpath(".//img")[0].get("alt")) for row in rows: + data, row = self._parse_table_row(row, keys) + yield dict(data), keys, row + + def _parse_table_row(self, row, keys): + for key, field in zip(keys, row.xpath("./td")): + data = defaultdict(lambda: None) + try: - data = defaultdict(lambda: None) - - for key, field in zip(keys, row.xpath("./td")): - text_content = self._stringify(field) - - if field.find('.//a') is not None: - address = self._get_link_address(field.find('.//a')) - if address: - if (key.strip() in ['', 'ics'] - and 'View.ashx?M=IC' in address): - key = 'iCalendar' - value = {'url': address} - else: - value = {'label': text_content, - 'url': address} + text_content = self._stringify(field) + + if field.find(".//a") is not None: + address = self._get_link_address(field.find(".//a")) + if address: + if key.strip() in ["", "ics"] and "View.ashx?M=IC" in address: + key = "iCalendar" + value = {"url": address} else: - value = text_content + value = {"label": text_content, "url": address} else: value = text_content - - data[key] = value - - yield dict(data), keys, row + else: + value = text_content except Exception as e: - print('Problem parsing row:') + print("Problem parsing row:") print(etree.tostring(row)) print(traceback.format_exc()) raise e + else: + data[key] = value + + return data, row + def _get_link_address(self, link): url = None - if 'onclick' in link.attrib: - onclick = link.attrib['onclick'] - if (onclick is not None - and onclick.startswith(("radopen('", - "window.open", - 
"OpenTelerikWindow"))): + if "onclick" in link.attrib: + onclick = link.attrib["onclick"] + if onclick is not None and onclick.startswith( + ("radopen('", "window.open", "OpenTelerikWindow") + ): onclick_path = onclick.split("'")[1] if not onclick_path.startswith("/"): onclick_path = "/" + onclick_path url = self.BASE_URL + onclick_path - elif 'href' in link.attrib: - url = link.attrib['href'] + elif "href" in link.attrib: + url = link.attrib["href"] return url @@ -237,7 +250,7 @@ def _stringify(self, field): for em in field.xpath("*//em"): if em.text: em.text = "--em--" + em.text + "--em--" - return field.text_content().replace(' ', ' ').strip() + return field.text_content().replace(" ", " ").strip() def to_time(self, text): time = datetime.datetime.strptime(text, self.date_format) @@ -251,18 +264,18 @@ def now(self): return datetime.datetime.utcnow().replace(tzinfo=pytz.utc) def mdY2Ymd(self, text): - month, day, year = text.split('/') + month, day, year = text.split("/") return "%d-%02d-%02d" % (int(year), int(month), int(day)) def session_secrets(self, page): payload = {} - payload['__EVENTARGUMENT'] = None - payload['__VIEWSTATE'] = page.xpath( - "//input[@name='__VIEWSTATE']/@value")[0] + payload["__EVENTARGUMENT"] = None + payload["__VIEWSTATE"] = page.xpath("//input[@name='__VIEWSTATE']/@value")[0] try: - payload['__EVENTVALIDATION'] = page.xpath( - "//input[@name='__EVENTVALIDATION']/@value")[0] + payload["__EVENTVALIDATION"] = page.xpath( + "//input[@name='__EVENTVALIDATION']/@value" + )[0] except IndexError: pass @@ -275,8 +288,8 @@ def accept_response(self, response, **kwargs): def _field_key(x): - field_id = x.attrib['id'] - field = re.split(r'hyp|lbl|Label', field_id)[-1] - field = field.split('Prompt')[0] - field = field.rstrip('X21') + field_id = x.attrib["id"] + field = re.split(r"hyp|lbl|Label", field_id)[-1] + field = field.split("Prompt")[0] + field = field.rstrip("X21") return field diff --git a/src/legistar/ui/bills.py b/src/legistar/ui/bills.py index ce53bf9..eefc0d9 100644 --- a/src/legistar/ui/bills.py +++ b/src/legistar/ui/bills.py @@ -4,8 +4,7 @@ class LegistarBillScraper(LegistarScraper): - def legislation(self, search_text='', created_after=None, - created_before=None): + def legislation(self, search_text="", created_after=None, created_before=None): # If legislation is added to the the legistar system while we # are scraping, it will shift the list of legislation down and @@ -14,15 +13,15 @@ def legislation(self, search_text='', created_after=None, # make sure we are not revisiting scraped_leg = deque([], maxlen=10) - for page in self.search_legislation(search_text, created_after, - created_before): + for page in self.search_legislation(search_text, created_after, created_before): for legislation_summary in self.parse_search_results(page): - if not legislation_summary['url'] in scraped_leg: + if not legislation_summary["url"] in scraped_leg: yield legislation_summary - scraped_leg.append(legislation_summary['url']) + scraped_leg.append(legislation_summary["url"]) - def search_legislation(self, search_text='', created_after=None, - created_before=None): + def search_legislation( + self, search_text="", created_after=None, created_before=None + ): """ Submit a search query on the legislation search page, and return a list of summary results. @@ -38,24 +37,25 @@ def search_legislation(self, search_text='', created_after=None, # fields should be represented as keyword arguments to this # function. 
The default query string should be for the the # default 'Legislative text' field. - payload['ctl00$ContentPlaceHolder1$txtText'] = search_text + payload["ctl00$ContentPlaceHolder1$txtText"] = search_text if created_after and created_before: payload.update(date_within(created_after, created_before)) elif created_before: payload.update(date_bound(created_before)) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '<' + payload["ctl00$ContentPlaceHolder1$radFileCreated"] = "<" elif created_after: payload.update(date_bound(created_after)) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '>' + payload["ctl00$ContentPlaceHolder1$radFileCreated"] = ">" # Return up to one million search results - payload['ctl00_ContentPlaceHolder1_lstMax_ClientState'] = '{"value":"1000000"}' - payload['ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState'] = ( - '{"value":"All"}') - payload['ctl00$ContentPlaceHolder1$btnSearch'] = 'Search Legislation' + payload["ctl00_ContentPlaceHolder1_lstMax_ClientState"] = '{"value":"1000000"}' + payload["ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState"] = ( + '{"value":"All"}' + ) + payload["ctl00$ContentPlaceHolder1$btnSearch"] = "Search Legislation" payload.update(self.session_secrets(page)) @@ -68,30 +68,30 @@ def parse_search_results(self, page): ('Document ID', 'Document URL', 'Type', 'Status', 'Introduction Date' 'Passed Date', 'Main Sponsor', 'Title') """ - table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] + table = page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] for legislation, headers, row in self.parse_data_table(table): # Do legislation search-specific stuff # ------------------------------------ # First column should be the ID of the record. id_key = headers[0] try: - legislation_id = legislation[id_key]['label'] + legislation_id = legislation[id_key]["label"] except TypeError: continue - legislation_url = legislation[id_key]['url'].split( - self.BASE_URL)[-1] + legislation_url = legislation[id_key]["url"].split(self.BASE_URL)[-1] legislation[id_key] = legislation_id - legislation['url'] = self.BASE_URL + \ - legislation_url.split('&Options')[0] + '&FullText=1' + legislation["url"] = ( + self.BASE_URL + legislation_url.split("&Options")[0] + "&FullText=1" + ) yield legislation def _advancedSearch(self, page): search_switcher = page.xpath( - "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0] + "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']" + )[0] - if 'simple search' in search_switcher.value.lower(): + if "simple search" in search_switcher.value.lower(): return page else: payload = self.session_secrets(page) @@ -100,9 +100,10 @@ def _advancedSearch(self, page): page = self.lxmlize(self.LEGISLATION_URL, payload) search_button = page.xpath( - "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0] - if 'simple search' not in search_button.value.lower(): - raise ValueError('Not on the advanced search page') + "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']" + )[0] + if "simple search" not in search_button.value.lower(): + raise ValueError("Not on the advanced search page") return page @@ -114,11 +115,11 @@ def details(self, detail_url, div_id): return self.parse_details(detail_div) def leg_details(self, detail_url): - div_id = 'ctl00_ContentPlaceHolder1_pageDetails' + div_id = "ctl00_ContentPlaceHolder1_pageDetails" return self.details(detail_url, div_id) def action_details(self, detail_url): - div_id = 'ctl00_ContentPlaceHolder1_pageTop1' + div_id = 
"ctl00_ContentPlaceHolder1_pageTop1" return self.details(detail_url, div_id) def history(self, detail_url): @@ -126,7 +127,8 @@ def history(self, detail_url): try: history_table = detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']")[0] + "//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']" + )[0] except IndexError: print(detail_url) raise @@ -142,16 +144,15 @@ def history(self, detail_url): yield action def _action_sort_key(self, action): - action_date = self.to_date(action['Date']) - action_url = action['Action\xa0Details']['url'] + action_date = self.to_date(action["Date"]) + action_url = action["Action\xa0Details"]["url"] return (action_date, action_url) def text(self, detail_url): detail_page = self.lxmlize(detail_url) - text_div = detail_page.xpath( - "//div[@id='ctl00_ContentPlaceHolder1_divText']") + text_div = detail_page.xpath("//div[@id='ctl00_ContentPlaceHolder1_divText']") if len(text_div): return tostring(text_div[0], pretty_print=True).decode() @@ -162,21 +163,27 @@ def extract_votes(self, action_detail_url): action_detail_page = self.lxmlize(action_detail_url) try: vote_table = action_detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0] + "//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']" + )[0] except IndexError: self.warning("No votes found in table") return None, [] votes = list(self.parse_data_table(vote_table)) vote_list = [] for vote, _, _ in votes: - raw_option = vote['Vote'].lower() - vote_list.append((self.VOTE_OPTIONS.get(raw_option, raw_option), - vote['Person Name']['label'])) + raw_option = vote["Vote"].lower() + vote_list.append( + ( + self.VOTE_OPTIONS.get(raw_option, raw_option), + vote["Person Name"]["label"], + ) + ) action_detail_div = action_detail_page.xpath( - ".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']")[0] + ".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']" + )[0] action_details = self.parse_details(action_detail_div) - result = action_details['Result'].lower() + result = action_details["Result"].lower() return result, vote_list @@ -184,16 +191,20 @@ def extract_votes(self, action_detail_url): def date_within(created_after, created_before): payload = date_bound(created_after) - payload['ctl00$ContentPlaceHolder1$txtFileCreated2'] =\ - '{d.year}-{d.month:02}-{d.day:02}'.format(d=created_before) - payload['ctl00$ContentPlaceHolder1$txtFileCreated2$dateInput'] =\ - '{d.month}/{d.day}/{d.year}'.format(d=created_before) + payload["ctl00$ContentPlaceHolder1$txtFileCreated2"] = ( + "{d.year}-{d.month:02}-{d.day:02}".format(d=created_before) + ) + payload["ctl00$ContentPlaceHolder1$txtFileCreated2$dateInput"] = ( + "{d.month}/{d.day}/{d.year}".format(d=created_before) + ) - payload['ctl00_ContentPlaceHolder1_txtFileCreated2_dateInput_ClientState'] =\ - '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 - d=created_before) + payload["ctl00_ContentPlaceHolder1_txtFileCreated2_dateInput_ClientState"] = ( + '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 + d=created_before 
+ ) + ) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = 'between' + payload["ctl00$ContentPlaceHolder1$radFileCreated"] = "between" return payload @@ -201,13 +212,17 @@ def date_within(created_after, created_before): def date_bound(creation_date): payload = {} - payload['ctl00$ContentPlaceHolder1$txtFileCreated1'] =\ - '{d.year}-{d.month:02}-{d.day:02}'.format(d=creation_date) - payload['ctl00$ContentPlaceHolder1$txtFileCreated1$dateInput'] =\ - '{d.month}/{d.day}/{d.year}'.format(d=creation_date) - - payload['ctl00_ContentPlaceHolder1_txtFileCreated1_dateInput_ClientState'] =\ - '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 - d=creation_date) + payload["ctl00$ContentPlaceHolder1$txtFileCreated1"] = ( + "{d.year}-{d.month:02}-{d.day:02}".format(d=creation_date) + ) + payload["ctl00$ContentPlaceHolder1$txtFileCreated1$dateInput"] = ( + "{d.month}/{d.day}/{d.year}".format(d=creation_date) + ) + + payload["ctl00_ContentPlaceHolder1_txtFileCreated1_dateInput_ClientState"] = ( + '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 + d=creation_date + ) + ) return payload diff --git a/src/legistar/ui/events.py b/src/legistar/ui/events.py index 3d12089..28697ce 100644 --- a/src/legistar/ui/events.py +++ b/src/legistar/ui/events.py @@ -11,28 +11,27 @@ class LegistarEventsScraper(LegistarScraper): ECOMMENT_JS_URLS = ( - 'https://metro.granicusideas.com/meetings.js', - 'https://metro.granicusideas.com/meetings.js?scope=past' + "https://metro.granicusideas.com/meetings.js", + "https://metro.granicusideas.com/meetings.js?scope=past", ) - def __init__(self, *args, event_info_key='Meeting Details', **kwargs): + def __init__(self, *args, event_info_key="Meeting Details", **kwargs): super().__init__(*args, **kwargs) self.event_info_key = event_info_key - @property def ecomment_dict(self): """ Parse event IDs and eComment links from JavaScript file with lines like: activateEcomment('750', '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', 'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'); """ - if getattr(self, '_ecomment_dict', None) is None: + if getattr(self, "_ecomment_dict", None) is None: ecomment_dict = {} # Define a callback to apply to each node, e.g., # https://esprima.readthedocs.io/en/latest/syntactic-analysis.html#example-console-calls-removal def is_activateEcomment(node, metadata): - if node.callee and node.callee.name == 'activateEcomment': + if node.callee and node.callee.name == "activateEcomment": event_id, _, comment_url = node.arguments ecomment_dict[event_id.value] = comment_url.value @@ -53,15 +52,18 @@ def event_pages(self, since): def should_cache_response(self, response): # Never cache the top level events page, because that may result in # expired .NET state values. 
- return (super().should_cache_response(response) and - response.url != self.EVENTSPAGE) + return ( + super().should_cache_response(response) and response.url != self.EVENTSPAGE + ) def event_search(self, page, since): payload = self.session_secrets(page) - payload['ctl00_ContentPlaceHolder1_lstYears_ClientState'] = '{"value":"%s"}' % since + payload["ctl00_ContentPlaceHolder1_lstYears_ClientState"] = ( + '{"value":"%s"}' % since + ) - payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$lstYears' + payload["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$lstYears" return self.pages(self.EVENTSPAGE, payload) @@ -78,7 +80,8 @@ def events(self, follow_links=True, since=None): if since: if since > current_year: raise ValueError( - 'Value of :since cannot exceed {}'.format(current_year)) + "Value of :since cannot exceed {}".format(current_year) + ) else: since_year = since - 1 @@ -92,16 +95,18 @@ def events(self, follow_links=True, since=None): no_events_in_year = True for page in self.event_pages(year): - events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0] + events_table = page.xpath( + "//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']" + )[0] for event, _, _ in self.parse_data_table(events_table): - ical_url = event['iCalendar']['url'] + ical_url = event["iCalendar"]["url"] if ical_url in scraped_events: continue else: scraped_events.append(ical_url) if follow_links and type(event[self.event_info_key]) == dict: - agenda = self.agenda(event[self.event_info_key]['url']) + agenda = self.agenda(event[self.event_info_key]["url"]) else: agenda = None @@ -121,21 +126,28 @@ def agenda(self, detail_url): payload = self.session_secrets(page) - payload.update({"__EVENTARGUMENT": "3:1", - "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain"}) + payload.update( + { + "__EVENTARGUMENT": "3:1", + "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain", + } + ) for page in self.pages(detail_url, payload): agenda_table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] + "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']" + )[0] agenda = self.parse_data_table(agenda_table) yield from agenda def add_docs(self, e, events, doc_type): try: - if events[doc_type] != 'Not\xa0available': - e.add_document(note=events[doc_type]['label'], - url=events[doc_type]['url'], - media_type="application/pdf") + if events[doc_type] != "Not\xa0available": + e.add_document( + note=events[doc_type]["label"], + url=events[doc_type]["url"], + media_type="application/pdf", + ) except ValueError: pass @@ -143,16 +155,16 @@ def extract_roll_call(self, action_detail_url): action_detail_page = self.lxmlize(action_detail_url) try: rollcall_table = action_detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']")[0] + "//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']" + )[0] except IndexError: self.warning("No rollcall found in table") return [] roll_call = list(self.parse_data_table(rollcall_table)) call_list = [] for call, _, _ in roll_call: - option = call['Attendance'] - call_list.append((option, - call['Person Name']['label'])) + option = call["Attendance"] + call_list.append((option, call["Person Name"]["label"])) return call_list @@ -161,9 +173,9 @@ def ical(self, ical_text): return value def _parse_detail(self, key, field_1, field_2): - if key == 'eComment': + if key == "eComment": return self._get_ecomment_link(field_2) or field_2.text_content().strip() def 
_get_ecomment_link(self, link): - event_id = link.attrib['data-event-id'] + event_id = link.attrib["data-event-id"] return self.ecomment_dict.get(event_id, None) diff --git a/src/legistar/ui/people.py b/src/legistar/ui/people.py index baeb2c7..32b237c 100644 --- a/src/legistar/ui/people.py +++ b/src/legistar/ui/people.py @@ -13,30 +13,34 @@ def council_members(self, extra_args=None, follow_links=True): payload.update(self.session_secrets(page)) if self.ALL_MEMBERS: - payload['__EVENTTARGET'] = "ctl00$ContentPlaceHolder1$menuPeople" - payload['__EVENTARGUMENT'] = self.ALL_MEMBERS + payload["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$menuPeople" + payload["__EVENTARGUMENT"] = self.ALL_MEMBERS for page in self.pages(self.MEMBERLIST, payload): table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridPeople_ctl00']")[0] + "//table[@id='ctl00_ContentPlaceHolder1_gridPeople_ctl00']" + )[0] for councilman, headers, row in self.parse_data_table(table): - if follow_links and type(councilman['Person Name']) == dict: + if follow_links and type(councilman["Person Name"]) == dict: - detail_url = councilman['Person Name']['url'] + detail_url = councilman["Person Name"]["url"] councilman_details = self.lxmlize(detail_url) detail_div = councilman_details.xpath( - ".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']")[0] + ".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']" + )[0] councilman.update(self.parse_details(detail_div)) img = councilman_details.xpath( - "//img[@id='ctl00_ContentPlaceHolder1_imgPhoto']") + "//img[@id='ctl00_ContentPlaceHolder1_imgPhoto']" + ) if img: - councilman['Photo'] = img[0].get('src') + councilman["Photo"] = img[0].get("src") committee_table = councilman_details.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridDepartments_ctl00']")[0] + "//table[@id='ctl00_ContentPlaceHolder1_gridDepartments_ctl00']" + )[0] committees = self.parse_data_table(committee_table) yield councilman, committees diff --git a/tests/conftest.py b/tests/conftest.py index 292654f..950819f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="module") def scraper(): scraper = base.LegistarAPIScraper() - scraper.BASE_URL = 'http://webapi.legistar.com/v1/chicago' + scraper.BASE_URL = "http://webapi.legistar.com/v1/chicago" scraper.retry_attempts = 0 scraper.requests_per_minute = 0 return scraper @@ -19,19 +19,19 @@ def scraper(): @pytest.fixture def project_directory(): test_directory = os.path.abspath(os.path.dirname(__file__)) - return os.path.join(test_directory, '..') + return os.path.join(test_directory, "..") @pytest.fixture def fixtures_directory(): test_directory = os.path.abspath(os.path.dirname(__file__)) - return os.path.join(test_directory, 'fixtures') + return os.path.join(test_directory, "fixtures") @pytest.fixture def metro_api_bill_scraper(): scraper = LegistarAPIBillScraper() - scraper.BASE_URL = 'https://webapi.legistar.com/v1/metro' + scraper.BASE_URL = "https://webapi.legistar.com/v1/metro" scraper.retry_attempts = 0 scraper.requests_per_minute = 0 return scraper @@ -40,7 +40,7 @@ def metro_api_bill_scraper(): @pytest.fixture def chicago_api_bill_scraper(): scraper = LegistarAPIBillScraper() - scraper.BASE_URL = 'https://webapi.legistar.com/v1/chicago' + scraper.BASE_URL = "https://webapi.legistar.com/v1/chicago" scraper.retry_attempts = 0 scraper.requests_per_minute = 0 return scraper @@ -48,31 +48,31 @@ def chicago_api_bill_scraper(): @pytest.fixture def matter_index(fixtures_directory): - fixture_file = 
os.path.join(fixtures_directory, 'metro', 'matter_index.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "metro", "matter_index.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture @pytest.fixture def all_indexes(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'metro', 'all_indexes.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "metro", "all_indexes.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture @pytest.fixture def dupe_event(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'chicago', 'dupe_event.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "chicago", "dupe_event.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture @pytest.fixture def no_dupe_event(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'chicago', 'no_dupe_event.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "chicago", "no_dupe_event.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture diff --git a/tests/refresh_fixtures.py b/tests/refresh_fixtures.py index a315e5d..0e4c065 100644 --- a/tests/refresh_fixtures.py +++ b/tests/refresh_fixtures.py @@ -10,48 +10,48 @@ def save_page(page, jurisdiction, outfile): test_directory = os.path.abspath(os.path.dirname(__file__)) - project_directory = os.path.join(test_directory, '..') + project_directory = os.path.join(test_directory, "..") fixture_path = os.path.join( - project_directory, 'tests', 'fixtures', jurisdiction, outfile + project_directory, "tests", "fixtures", jurisdiction, outfile ) - with open(fixture_path, 'wb') as f: + with open(fixture_path, "wb") as f: f.write(lxml.html.tostring(page)) def refresh_bills(jurisdiction): s = LegistarBillScraper() - s.LEGISLATION_URL = 'https://{}.legistar.com/Legislation.aspx'.format(jurisdiction) + s.LEGISLATION_URL = "https://{}.legistar.com/Legislation.aspx".format(jurisdiction) - page = next(s.search_legislation('bus')) + page = next(s.search_legislation("bus")) - save_page(page, jurisdiction, 'bills.html') + save_page(page, jurisdiction, "bills.html") def refresh_events(jurisdiction): s = LegistarEventsScraper() - s.EVENTSPAGE = 'https://{}.legistar.com/Calendar.aspx'.format(jurisdiction) + s.EVENTSPAGE = "https://{}.legistar.com/Calendar.aspx".format(jurisdiction) - page = next(s.event_pages('2018-01-01')) + page = next(s.event_pages("2018-01-01")) - save_page(page, jurisdiction, 'events.html') + save_page(page, jurisdiction, "events.html") def refresh_people(jurisdiction): s = LegistarPersonScraper() - MEMBERLIST = 'https://{}.legistar.com/People.aspx'.format(jurisdiction) + MEMBERLIST = "https://{}.legistar.com/People.aspx".format(jurisdiction) page = next(s.pages(MEMBERLIST)) - save_page(page, jurisdiction, 'people.html') + save_page(page, jurisdiction, "people.html") -if __name__ == '__main__': +if __name__ == "__main__": try: _, jurisdictions = sys.argv - jurisdictions = jurisdictions.split(',') + jurisdictions = jurisdictions.split(",") except ValueError: - jurisdictions = ('chicago', 'metro', 'nyc') + jurisdictions = ("chicago", "metro", "nyc") for j in jurisdictions: refresh_bills(j) diff --git a/tests/test_bills.py b/tests/test_bills.py index 82413cf..a9ab1eb 100644 --- a/tests/test_bills.py +++ b/tests/test_bills.py @@ -5,10 +5,10 @@ def 
test_topics(metro_api_bill_scraper, matter_index, all_indexes): with requests_mock.Mocker() as m: - matter_matcher = re.compile(r'/matters/5036/indexes') + matter_matcher = re.compile(r"/matters/5036/indexes") m.get(matter_matcher, json=matter_index, status_code=200) - all_matcher = re.compile(r'/metro/indexes') + all_matcher = re.compile(r"/metro/indexes") m.get(all_matcher, json=all_indexes, status_code=200) matter_topics = metro_api_bill_scraper.topics(5036) @@ -23,38 +23,42 @@ def test_topics(metro_api_bill_scraper, matter_index, all_indexes): def test_duplicate_events(chicago_api_bill_scraper, caplog, dupe_event): with requests_mock.Mocker() as m: - event_matcher = re.compile('/matters/38768/histories') + event_matcher = re.compile("/matters/38768/histories") m.get(event_matcher, json=dupe_event, status_code=200) - chicago_api_bill_scraper.history('38768') - assert 'appears more than once' in caplog.text + chicago_api_bill_scraper.history("38768") + assert "appears more than once" in caplog.text def test_no_duplicate(chicago_api_bill_scraper, caplog, no_dupe_event): with requests_mock.Mocker() as m: - event_matcher = re.compile('/matters/38769/histories') + event_matcher = re.compile("/matters/38769/histories") m.get(event_matcher, json=no_dupe_event, status_code=200) - chicago_api_bill_scraper.history('38769') - assert 'appears more than once' not in caplog.text + chicago_api_bill_scraper.history("38769") + assert "appears more than once" not in caplog.text def test_404_votes(chicago_api_bill_scraper): with requests_mock.Mocker() as m: - m.get(re.compile(r'.*'), status_code=404) - votes = chicago_api_bill_scraper.votes('408134') + m.get(re.compile(r".*"), status_code=404) + votes = chicago_api_bill_scraper.votes("408134") assert votes == [] def test_500_votes(chicago_api_bill_scraper): with requests_mock.Mocker() as m: - m.get(re.compile(r'.*'), - json={'InnerException': - {'ExceptionMessage': - "The cast to value type 'System.Int32' failed " - "because the materialized value is null. Either " - "the result type's generic parameter or the query " - "must use a nullable type."}}, - status_code=500) - votes = chicago_api_bill_scraper.votes('408134') + m.get( + re.compile(r".*"), + json={ + "InnerException": { + "ExceptionMessage": "The cast to value type 'System.Int32' failed " + "because the materialized value is null. Either " + "the result type's generic parameter or the query " + "must use a nullable type." 
+ } + }, + status_code=500, + ) + votes = chicago_api_bill_scraper.votes("408134") assert votes == [] diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 804885d..d00d82a 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -8,48 +8,48 @@ from src.legistar.ui.people import LegistarPersonScraper -@pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) +@pytest.mark.parametrize("jurisdiction", ["chicago", "metro", "nyc"]) def test_parse_bills(project_directory, jurisdiction): bills_fixture = os.path.join( - project_directory, 'tests', 'fixtures', jurisdiction, 'bills.html' + project_directory, "tests", "fixtures", jurisdiction, "bills.html" ) scraper = LegistarBillScraper() - scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) + scraper.BASE_URL = "{}.legistar.com".format(jurisdiction) - with open(bills_fixture, 'r') as f: + with open(bills_fixture, "r") as f: page = lxml.html.fromstring(f.read()) result = next(scraper.parse_search_results(page)) print(result) -@pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) +@pytest.mark.parametrize("jurisdiction", ["chicago", "metro", "nyc"]) def test_parse_events(project_directory, mocker, jurisdiction): events_fixture = os.path.join( - project_directory, 'tests', 'fixtures', jurisdiction, 'events.html' + project_directory, "tests", "fixtures", jurisdiction, "events.html" ) scraper = LegistarEventsScraper() - scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) + scraper.BASE_URL = "{}.legistar.com".format(jurisdiction) - with open(events_fixture, 'r') as f: + with open(events_fixture, "r") as f: page = lxml.html.fromstring(f.read()) - mocker.patch.object(scraper, 'event_pages', return_value=page) + mocker.patch.object(scraper, "event_pages", return_value=page) result, _ = next(scraper.events(follow_links=False)) print(result) -@pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) +@pytest.mark.parametrize("jurisdiction", ["chicago", "metro", "nyc"]) def test_parse_people(project_directory, mocker, jurisdiction): people_fixture = os.path.join( - project_directory, 'tests', 'fixtures', jurisdiction, 'people.html' + project_directory, "tests", "fixtures", jurisdiction, "people.html" ) scraper = LegistarPersonScraper() - scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) + scraper.BASE_URL = "{}.legistar.com".format(jurisdiction) - with open(people_fixture, 'r') as f: + with open(people_fixture, "r") as f: page = lxml.html.fromstring(f.read()) - mocker.patch.object(scraper, 'pages', return_value=page) + mocker.patch.object(scraper, "pages", return_value=page) result = next(scraper.council_members(follow_links=False)) print(result) diff --git a/tests/test_search.py b/tests/test_search.py index 4a4a3dd..7a89ab1 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -5,12 +5,12 @@ class TestAPISearch(object): def test_search_raises(self, scraper): with pytest.raises(ValueError): - results = scraper.search('/events/', 'EventId', - "MatterFile eq 'O2010-5046'") + results = scraper.search( + "/events/", "EventId", "MatterFile eq 'O2010-5046'" + ) list(results) def test_search(self, scraper): - results = scraper.search('/matters/', 'MatterId', - "MatterFile eq 'O2010-5046'") + results = scraper.search("/matters/", "MatterId", "MatterFile eq 'O2010-5046'") assert len(list(results)) == 1 From e4b1f4be809d371520040247a3adaff590347ce8 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 14:27:03 -0500 Subject: [PATCH 7/8] Final lap --- 
src/legistar/ui/events.py | 15 ++++++++------- src/legistar/ui/people.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/legistar/ui/events.py b/src/legistar/ui/events.py index 28697ce..16b6455 100644 --- a/src/legistar/ui/events.py +++ b/src/legistar/ui/events.py @@ -1,9 +1,6 @@ -import time -import datetime from collections import deque -import esprima -import pytz +import esprima import icalendar from .base import LegistarScraper @@ -23,7 +20,11 @@ def __init__(self, *args, event_info_key="Meeting Details", **kwargs): def ecomment_dict(self): """ Parse event IDs and eComment links from JavaScript file with lines like: - activateEcomment('750', '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', 'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'); + activateEcomment( + '750', + '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', + 'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'. # noqa + ); """ if getattr(self, "_ecomment_dict", None) is None: ecomment_dict = {} @@ -96,7 +97,7 @@ def events(self, follow_links=True, since=None): for page in self.event_pages(year): events_table = page.xpath( - "//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']" + "//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']". # noqa )[0] for event, _, _ in self.parse_data_table(events_table): ical_url = event["iCalendar"]["url"] @@ -105,7 +106,7 @@ def events(self, follow_links=True, since=None): else: scraped_events.append(ical_url) - if follow_links and type(event[self.event_info_key]) == dict: + if follow_links and isinstance(event[self.event_info_key], dict): agenda = self.agenda(event[self.event_info_key]["url"]) else: agenda = None diff --git a/src/legistar/ui/people.py b/src/legistar/ui/people.py index 32b237c..6017b79 100644 --- a/src/legistar/ui/people.py +++ b/src/legistar/ui/people.py @@ -22,7 +22,7 @@ def council_members(self, extra_args=None, follow_links=True): )[0] for councilman, headers, row in self.parse_data_table(table): - if follow_links and type(councilman["Person Name"]) == dict: + if follow_links and isinstance(councilman["Person Name"], dict): detail_url = councilman["Person Name"]["url"] councilman_details = self.lxmlize(detail_url) From 88649c5b87eabe759eb4ef91c416bc98957caec7 Mon Sep 17 00:00:00 2001 From: Hannah Cushman Garland Date: Tue, 30 Sep 2025 14:28:01 -0500 Subject: [PATCH 8/8] Strike stray period --- src/legistar/ui/events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legistar/ui/events.py b/src/legistar/ui/events.py index 16b6455..01eedf2 100644 --- a/src/legistar/ui/events.py +++ b/src/legistar/ui/events.py @@ -97,7 +97,7 @@ def events(self, follow_links=True, since=None): for page in self.event_pages(year): events_table = page.xpath( - "//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']". # noqa + "//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']" # noqa )[0] for event, _, _ in self.parse_data_table(events_table): ical_url = event["iCalendar"]["url"]
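
For reference, a minimal usage sketch of the relocated events scraper under the new src/ layout (not part of the patch series; the jurisdiction, Calendar.aspx URL, and year below are illustrative assumptions, mirroring how tests/refresh_fixtures.py and the test suite exercise the class):

    # Sketch only: assumes the package is installed from the new src/ layout,
    # so the import path is legistar.ui.events rather than the old legistar.events.
    # "chicago" and 2024 are placeholder values, not taken from the patch.
    from legistar.ui.events import LegistarEventsScraper

    scraper = LegistarEventsScraper()
    scraper.BASE_URL = "chicago.legistar.com"
    scraper.EVENTSPAGE = "https://chicago.legistar.com/Calendar.aspx"

    # events() yields (event, agenda) pairs; agenda is None when follow_links=False.
    for event, agenda in scraper.events(follow_links=False, since=2024):
        print(event["iCalendar"]["url"])
        break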