diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index 483e33b..eb60bfb 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -1,53 +1,99 @@
-name: tests
+name: CI
-on: [push]
+on:
+  push:
+    branches: [master, main]
+  pull_request:
+    branches: [master, main]
 jobs:
-  test:
+  get-python-versions:
+    runs-on: ubuntu-latest
+    outputs:
+      python-versions: ${{ steps.get-versions.outputs.python-versions }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Extract Python versions from pyproject.toml
+        id: get-versions
+        run: |
+          # Extract Python versions from classifiers in pyproject.toml
+          # This looks for lines like "Programming Language :: Python :: 3.8"
+          python_versions=$(grep -o 'Programming Language :: Python :: 3\.[0-9]\+' pyproject.toml | grep -o '3\.[0-9]\+' | sort -V | jq -R -s -c 'split("\n")[:-1]')
+          echo "python-versions=$python_versions" >> $GITHUB_OUTPUT
+          echo "Detected Python versions: $python_versions"
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[dev]
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 src/ tests/ --count --select=E9,F63,F7,F82 --show-source --statistics
+          # run full linting
+          flake8 src/ tests/
+
+  test:
+    needs: [get-python-versions, lint]
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ${{ fromJson(needs.get-python-versions.outputs.python-versions) }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.x
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install .
-    - name: Lint with flake8
-      run: |
-        pip install flake8
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pip install -r requirements.txt
-        pytest
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install build
-  publish:
-    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
-    needs: test
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[test]
+      - name: Test with pytest
+        run: pytest
+  build:
+    needs: [test]
     runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+      - name: Install build dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+      - name: Build package
+        run: python -m build
+      - name: Upload build artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist/
+  publish:
+    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+    needs: [build]
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
-    - name: Build package for publishing
-      run: python -m build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
-      with:
-        user: __token__
-        password: ${{ secrets.PYPI_API_TOKEN }}
+      - name: Download build artifacts
+        uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist/
+      - name: Publish package to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/legistar/__init__.py b/legistar/__init__.py
deleted file mode 100644
index 1559bab..0000000
--- a/legistar/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = '0.0.1' # pragma: no cover
diff --git a/legistar/base.py b/legistar/base.py
deleted file mode 100644
index 5e6327c..0000000
--- a/legistar/base.py
+++ /dev/null
@@ -1,374 +0,0 @@
-import datetime
-import itertools
-import traceback
-from collections import defaultdict, deque
-import re
-import requests
-import json
-import logging
-
-import scrapelib
-import lxml.html
-import lxml.etree as etree
-import pytz
-
-
-class LegistarSession(requests.Session):
-
-    def request(self, method, url, **kwargs):
-        response = super(LegistarSession, self).request(method, url, **kwargs)
-        payload = kwargs.get('data')
-
-        self._check_errors(response, payload)
-
-        return response
-
-    def _check_errors(self, response, payload):
-        if response.url.endswith('Error.aspx'):
-            response.status_code = 503
-            raise scrapelib.HTTPError(response)
-
-        if not response.text:
-            if response.request.method.lower() in {'get', 'post'}:
-                response.status_code = 520
-                raise scrapelib.HTTPError(response)
-
-        if 'This record no longer exists. It might have been deleted.' in response.text:
-            response.status_code = 410
-            raise scrapelib.HTTPError(response)
-
-        if payload:
-            self._range_error(response, payload)
-
-    def _range_error(self, response, payload):
-        '''Legistar intermittently does not return the expected response when
-        selecting a time range when searching for events.
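# A minimal, hypothetical sketch (names invented, not part of this diff) of
# the pattern LegistarSession implements above: coerce Legistar's "soft"
# failures into HTTP error statuses and raise, so the scrapelib scraper the
# session is mixed into can treat them like ordinary retryable errors.
import requests
import scrapelib


class SoftErrorSession(requests.Session):
    def request(self, method, url, **kwargs):
        response = super().request(method, url, **kwargs)
        if response.url.endswith('Error.aspx'):
            # Rewrite the status so the response registers as a server error.
            response.status_code = 503
            raise scrapelib.HTTPError(response)
        return response


class SoftErrorScraper(scrapelib.Scraper, SoftErrorSession):
    # Mirrors how LegistarScraper mixes scrapelib.Scraper with LegistarSession.
    pass


scraper = SoftErrorScraper(retry_attempts=3)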
Right now we - are only handling the 'All' range - ''' - - if self._range_is_all(payload): - - expected_range = 'All Years' - - page = lxml.html.fromstring(response.text) - returned_range, = page.xpath( - "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']") - - returned_range = returned_range.value - - if returned_range != expected_range: - response.status_code = 520 - # In the event of a retry, the new request does not - # contain the correct payload data. This comes as a - # result of not updating the payload via sessionSecrets: - # so, we do that here. - payload.update(self.sessionSecrets(page)) - - raise scrapelib.HTTPError(response) - - def _range_is_all(self, payload): - range_var = 'ctl00_ContentPlaceHolder1_lstYears_ClientState' - all_range = (range_var in payload and - json.loads(payload[range_var])['value'] == 'All') - return all_range - - -class LegistarScraper(scrapelib.Scraper, LegistarSession): - date_format = '%m/%d/%Y' - - def __init__(self, *args, **kwargs): - super(LegistarScraper, self).__init__(*args, **kwargs) - - def lxmlize(self, url, payload=None): - ''' - Gets page and returns as XML - ''' - if payload: - response = self.post(url, payload, verify=False) - else: - response = self.get(url, verify=False) - entry = response.text - page = lxml.html.fromstring(entry) - page.make_links_absolute(url) - return page - - def pages(self, url, payload=None): - page = self.lxmlize(url, payload) - - yield page - - next_page = page.xpath( - "//a[@class='rgCurrentPage']/following-sibling::a[1]") - if payload and 'ctl00$ContentPlaceHolder1$btnSearch' in payload: - del payload['ctl00$ContentPlaceHolder1$btnSearch'] - - while len(next_page) > 0: - if payload is None: - payload = {} - - payload.update(self.sessionSecrets(page)) - - event_target = next_page[0].attrib['href'].split("'")[1] - - payload['__EVENTTARGET'] = event_target - - page = self.lxmlize(url, payload) - - yield page - - next_page = page.xpath( - "//a[@class='rgCurrentPage']/following-sibling::a[1]") - - def parseDetails(self, detail_div): - """ - Parse the data in the top section of a detail page. - """ - detail_query = ".//*[starts-with(@id, 'ctl00_ContentPlaceHolder1_lbl')"\ - " or starts-with(@id, 'ctl00_ContentPlaceHolder1_hyp')"\ - " or starts-with(@id, 'ctl00_ContentPlaceHolder1_Label')]" - fields = detail_div.xpath(detail_query) - - details = {} - - for field_key, field in itertools.groupby(fields, fieldKey): - field = list(field) - field_1, field_2 = field[0], field[-1] - - key = field_1.text_content().replace(':', '').strip() - - if field_2.find('.//a') is not None: - value = [] - for link in field_2.xpath('.//a'): - value.append({'label': link.text_content().strip(), - 'url': self._get_link_address(link)}) - - elif 'href' in field_2.attrib: - value = {'label': field_2.text_content().strip(), - 'url': self._get_link_address(field_2)} - - elif self._parse_detail(key, field_1, field_2): - value = self._parse_detail(key, field_1, field_2) - - else: - value = field_2.text_content().strip() - - details[key] = value - - return details - - def parseDataTable(self, table): - """ - Legistar uses the same kind of data table in a number of - places. This will return a list of dictionaries using the - table headers as keys. 
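# An illustrative (hypothetical) way to consume parseDataTable, which yields
# (row_dict, keys, row_element) triples keyed by the table headers. The URL
# and table id below are placeholders, not values taken from this diff.
scraper = LegistarScraper(requests_per_minute=0)
scraper.BASE_URL = 'https://example.legistar.com'
page = scraper.lxmlize('https://example.legistar.com/Legislation.aspx')
table = page.xpath("//table[contains(@id, 'gridMain')]")[0]
for row_dict, keys, row_element in scraper.parseDataTable(table):
    # The first header is usually the record-number column.
    print(row_dict[keys[0]])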
- """ - headers = table.xpath(".//th[starts-with(@class, 'rgHeader')]") - rows = table.xpath(".//tr[@class='rgRow' or @class='rgAltRow']") - - keys = [] - for header in headers: - text_content = header.text_content().replace(' ', ' ').strip() - inputs = header.xpath('.//input') - if text_content: - keys.append(text_content) - elif len(inputs) > 0: - keys.append(header.xpath('.//input')[0].value) - else: - keys.append(header.xpath('.//img')[0].get('alt')) - - for row in rows: - try: - data = defaultdict(lambda: None) - - for key, field in zip(keys, row.xpath("./td")): - text_content = self._stringify(field) - - if field.find('.//a') is not None: - address = self._get_link_address(field.find('.//a')) - if address: - if key.strip() in ['', 'ics'] and 'View.ashx?M=IC' in address: - key = 'iCalendar' - value = {'url': address} - else: - value = {'label': text_content, - 'url': address} - else: - value = text_content - else: - value = text_content - - data[key] = value - - yield dict(data), keys, row - - except Exception as e: - print('Problem parsing row:') - print(etree.tostring(row)) - print(traceback.format_exc()) - raise e - - def _get_link_address(self, link): - url = None - if 'onclick' in link.attrib: - onclick = link.attrib['onclick'] - if (onclick is not None and - onclick.startswith(("radopen('", - "window.open", - "OpenTelerikWindow"))): - onclick_path = onclick.split("'")[1] - if not onclick_path.startswith("/"): - onclick_path = "/" + onclick_path - url = self.BASE_URL + onclick_path - elif 'href' in link.attrib: - url = link.attrib['href'] - - return url - - def _parse_detail(self, key, field_1, field_2): - """ - Perform custom parsing on a given key and field from a detail table. - Available for override on web scraper base classes. - """ - return None - - def _stringify(self, field): - for br in field.xpath("*//br"): - br.tail = "\n" + br.tail if br.tail else "\n" - for em in field.xpath("*//em"): - if em.text: - em.text = "--em--" + em.text + "--em--" - return field.text_content().replace(' ', ' ').strip() - - def toTime(self, text): - time = datetime.datetime.strptime(text, self.date_format) - time = pytz.timezone(self.TIMEZONE).localize(time) - return time - - def toDate(self, text): - return self.toTime(text).date().isoformat() - - def now(self): - return datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - - def mdY2Ymd(self, text): - month, day, year = text.split('/') - return "%d-%02d-%02d" % (int(year), int(month), int(day)) - - def sessionSecrets(self, page): - - payload = {} - payload['__EVENTARGUMENT'] = None - payload['__VIEWSTATE'] = page.xpath( - "//input[@name='__VIEWSTATE']/@value")[0] - try: - payload['__EVENTVALIDATION'] = page.xpath( - "//input[@name='__EVENTVALIDATION']/@value")[0] - except IndexError: - pass - - return(payload) - - def accept_response(self, response, **kwargs): - if response.status_code == 410: - return True - return super().accept_response(response, **kwargs) - - -def fieldKey(x): - field_id = x.attrib['id'] - field = re.split(r'hyp|lbl|Label', field_id)[-1] - field = field.split('Prompt')[0] - field = field.rstrip('X21') - return field - - -class LegistarAPIScraper(scrapelib.Scraper): - date_format = '%Y-%m-%dT%H:%M:%S' - time_string_format = '%I:%M %p' - utc_timestamp_format = '%Y-%m-%dT%H:%M:%S.%f' - - def __init__(self, *args, **kwargs): - super(LegistarAPIScraper, self).__init__(*args, **kwargs) - self.logger = logging.getLogger("legistar") - self.warning = self.logger.warning - - def toTime(self, text): - time = 
datetime.datetime.strptime(text, self.date_format) - time = pytz.timezone(self.TIMEZONE).localize(time) - return time - - def to_utc_timestamp(self, text): - try: - time = datetime.datetime.strptime(text, self.utc_timestamp_format) - except ValueError as e: - if 'does not match format' in str(e): - time = datetime.datetime.strptime(text, self.date_format) - else: - raise - time = pytz.timezone('UTC').localize(time) - return time - - def search(self, route, item_key, search_conditions): - """ - Base function for searching the Legistar API. - - Arguments: - - route -- The path to search, i.e. /matters/, /events/, etc - item_key -- The unique id field for the items that you are searching. - This is necessary for proper pagination. examples - might be MatterId or EventId - search_conditions -- a string in the OData format for the - your search conditions http://www.odata.org/documentation/odata-version-3-0/url-conventions/#url5.1.2 - - It would be nice if we could provide a - friendly search API. Something like https://github.com/tuomur/python-odata - - - Examples: - # Search for bills introduced after Jan. 1, 2017 - search('/matters/', 'MatterId', "MatterIntroDate gt datetime'2017-01-01'") - """ - - search_url = self.BASE_URL + route - - params = {'$filter': search_conditions} - - try: - yield from self.pages(search_url, - params=params, - item_key=item_key) - except requests.HTTPError as e: - if e.response.status_code == 400: - raise ValueError(e.response.json()['Message']) - if not self.accept_response(e.response): - raise - - def pages(self, url, params=None, item_key=None): - if params is None: - params = {} - - seen = deque([], maxlen=1000) - - page_num = 0 - response = None - while page_num == 0 or len(response.json()) == 1000: - params['$skip'] = page_num * 1000 - response = self.get(url, params=params) - response.raise_for_status() - - for item in response.json(): - if item[item_key] not in seen: - yield item - seen.append(item[item_key]) - - page_num += 1 - - def accept_response(self, response, **kwargs): - """ - This overrides a method that controls whether - the scraper should retry on an error. We don't - want to retry if the API returns a 400, except for - 410, which means the record no longer exists. - """ - return response.status_code < 401 or response.status_code == 410 diff --git a/legistar/bills.py b/legistar/bills.py deleted file mode 100644 index 550c2f3..0000000 --- a/legistar/bills.py +++ /dev/null @@ -1,544 +0,0 @@ -from .base import LegistarScraper, LegistarAPIScraper -from lxml.etree import tostring -from collections import deque -from functools import partialmethod -from urllib.parse import urljoin -import requests -import scrapelib - - -class LegistarBillScraper(LegistarScraper): - def legislation(self, search_text='', created_after=None, - created_before=None): - - # If legislation is added to the the legistar system while we - # are scraping, it will shift the list of legislation down and - # we might revisit the same legislation. 
So, we keep track of - # the last few pieces of legislation we've visited in order to - # make sure we are not revisiting - scraped_leg = deque([], maxlen=10) - - for page in self.searchLegislation(search_text, created_after, - created_before): - for legislation_summary in self.parseSearchResults(page): - if not legislation_summary['url'] in scraped_leg: - yield legislation_summary - scraped_leg.append(legislation_summary['url']) - - def searchLegislation(self, search_text='', created_after=None, - created_before=None): - """ - Submit a search query on the legislation search page, and return a list - of summary results. - """ - - page = self.lxmlize(self.LEGISLATION_URL) - - page = self._advancedSearch(page) - - payload = {} - - # Enter the search parameters TODO: Each of the possible form - # fields should be represented as keyword arguments to this - # function. The default query string should be for the the - # default 'Legislative text' field. - payload['ctl00$ContentPlaceHolder1$txtText'] = search_text - - if created_after and created_before: - payload.update(dateWithin(created_after, created_before)) - - elif created_before: - payload.update(dateBound(created_before)) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '<' - - elif created_after: - payload.update(dateBound(created_after)) - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = '>' - - # Return up to one million search results - payload['ctl00_ContentPlaceHolder1_lstMax_ClientState'] = '{"value":"1000000"}' - payload['ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState'] = '{"value":"All"}' - payload['ctl00$ContentPlaceHolder1$btnSearch'] = 'Search Legislation' - - payload.update(self.sessionSecrets(page)) - - return self.pages(self.LEGISLATION_URL, payload) - - def parseSearchResults(self, page): - """Take a page of search results and return a sequence of data - of tuples about the legislation, of the form - - ('Document ID', 'Document URL', 'Type', 'Status', 'Introduction Date' - 'Passed Date', 'Main Sponsor', 'Title') - """ - table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] - for legislation, headers, row in self.parseDataTable(table): - # Do legislation search-specific stuff - # ------------------------------------ - # First column should be the ID of the record. 
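# A hedged, hypothetical sketch of driving this web scraper's search. The
# subclass attributes and dates are placeholders; only the method signature
# comes from the code above.
import datetime


class ExampleBillScraper(LegistarBillScraper):
    BASE_URL = 'https://example.legistar.com'
    LEGISLATION_URL = 'https://example.legistar.com/Legislation.aspx'
    TIMEZONE = 'America/Chicago'


scraper = ExampleBillScraper(requests_per_minute=0)
for summary in scraper.legislation(created_after=datetime.date(2024, 1, 1)):
    # Each summary is a dict of the search-result columns plus a 'url' key
    # for the detail page.
    print(summary['url'])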
- id_key = headers[0] - try: - legislation_id = legislation[id_key]['label'] - except TypeError: - continue - legislation_url = legislation[id_key]['url'].split( - self.BASE_URL)[-1] - legislation[id_key] = legislation_id - legislation['url'] = self.BASE_URL + \ - legislation_url.split('&Options')[0] + '&FullText=1' - - yield legislation - - def _advancedSearch(self, page): - search_switcher = page.xpath( - "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0] - - if 'simple search' in search_switcher.value.lower(): - return page - else: - payload = self.sessionSecrets(page) - payload[search_switcher.name] = search_switcher.value - - page = self.lxmlize(self.LEGISLATION_URL, payload) - - if 'simple search' not in page.xpath("//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']")[0].value.lower(): - raise ValueError('Not on the advanced search page') - - return page - - def details(self, detail_url, div_id): - detail_page = self.lxmlize(detail_url) - - detail_div = detail_page.xpath(".//div[@id='%s']" % div_id)[0] - - return self.parseDetails(detail_div) - - def legDetails(self, detail_url): - div_id = 'ctl00_ContentPlaceHolder1_pageDetails' - return self.details(detail_url, div_id) - - def actionDetails(self, detail_url): - div_id = 'ctl00_ContentPlaceHolder1_pageTop1' - return self.details(detail_url, div_id) - - def history(self, detail_url): - detail_page = self.lxmlize(detail_url) - - try: - history_table = detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']")[0] - except IndexError: - print(detail_url) - raise - - history = [row[0] for row in self.parseDataTable(history_table)] - - try: - history = sorted(history, key=self._actionSortKey) - except (TypeError, ValueError): - pass - - for action in history: - yield action - - def _actionSortKey(self, action): - action_date = self.toDate(action['Date']) - action_url = action['Action\xa0Details']['url'] - - return (action_date, action_url) - - def text(self, detail_url): - detail_page = self.lxmlize(detail_url) - - text_div = detail_page.xpath( - "//div[@id='ctl00_ContentPlaceHolder1_divText']") - - if len(text_div): - return tostring(text_div[0], pretty_print=True).decode() - else: - return None - - def extractVotes(self, action_detail_url): - action_detail_page = self.lxmlize(action_detail_url) - try: - vote_table = action_detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0] - except IndexError: - self.warning("No votes found in table") - return None, [] - votes = list(self.parseDataTable(vote_table)) - vote_list = [] - for vote, _, _ in votes: - raw_option = vote['Vote'].lower() - vote_list.append((self.VOTE_OPTIONS.get(raw_option, raw_option), - vote['Person Name']['label'])) - - action_detail_div = action_detail_page.xpath( - ".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']")[0] - action_details = self.parseDetails(action_detail_div) - result = action_details['Result'].lower() - - return result, vote_list - - -def dateWithin(created_after, created_before): - payload = dateBound(created_after) - - payload['ctl00$ContentPlaceHolder1$txtFileCreated2'] =\ - '{d.year}-{d.month:02}-{d.day:02}'.format(d=created_before) - payload['ctl00$ContentPlaceHolder1$txtFileCreated2$dateInput'] =\ - '{d.month}/{d.day}/{d.year}'.format(d=created_before) - - payload['ctl00_ContentPlaceHolder1_txtFileCreated2_dateInput_ClientState'] =\ - '{{"enabled":true, 
"emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 - d=created_before) - - payload['ctl00$ContentPlaceHolder1$radFileCreated'] = 'between' - - return payload - - -def dateBound(creation_date): - payload = {} - - payload['ctl00$ContentPlaceHolder1$txtFileCreated1'] =\ - '{d.year}-{d.month:02}-{d.day:02}'.format(d=creation_date) - payload['ctl00$ContentPlaceHolder1$txtFileCreated1$dateInput'] =\ - '{d.month}/{d.day}/{d.year}'.format(d=creation_date) - - payload['ctl00_ContentPlaceHolder1_txtFileCreated1_dateInput_ClientState'] =\ - '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 - d=creation_date) - - return payload - - -class LegistarAPIBillScraper(LegistarAPIScraper): - def __init__(self, *args, **kwargs): - ''' - Initialize the Bill scraper with a `scrape_restricted` property. - Do not collect private bills (i.e., bills with 'MatterRestrictViewViaWeb' - set as True in the API), unless the scrapers have access to them, e.g., via a token. - ''' - super().__init__(*args, **kwargs) - - self.scrape_restricted = False - - def matters(self, since_datetime=None): - # scrape from oldest to newest. This makes resuming big - # scraping jobs easier because upon a scrape failure we can - # import everything scraped and then scrape everything newer - # then the last bill we scraped - params = {'$orderby': 'MatterLastModifiedUtc'} - - if since_datetime: - since_iso = since_datetime.isoformat() - - update_fields = ('MatterLastModifiedUtc', - 'MatterIntroDate', - 'MatterPassedDate', - 'MatterDate1', - 'MatterDate2', - # 'MatterEXDate1', # can't use all 17 search - # terms, this one always - # seems to be not set - 'MatterEXDate2', - 'MatterEXDate3', - 'MatterEXDate4', - 'MatterEXDate5', - 'MatterEXDate6', - 'MatterEXDate7', - 'MatterEXDate8', - 'MatterEXDate9', - 'MatterEXDate10', - 'MatterEnactmentDate', - 'MatterAgendaDate') - - since_fmt = "{field} gt datetime'{since_datetime}'" - since_filter =\ - ' or '.join(since_fmt.format(field=field, - since_datetime=since_iso) - for field in update_fields) - - params['$filter'] = since_filter - - matters_url = self.BASE_URL + '/matters' - - for matter in self.pages(matters_url, - params=params, - item_key="MatterId"): - try: - legistar_url = self.legislation_detail_url(matter['MatterId']) - - except scrapelib.HTTPError as e: - if e.response.status_code > 403: - raise - - url = matters_url + '/{}'.format(matter['MatterId']) - self.warning('Bill could not be found in web interface: {}'.format(url)) - if not self.scrape_restricted: - continue - - else: - matter['legistar_url'] = legistar_url - - yield matter - - def matter(self, matter_id): - matter = self.endpoint('/matters/{}', matter_id) - - try: - legistar_url = self.legislation_detail_url(matter_id) - except scrapelib.HTTPError as e: - if e.response.status_code > 403: - raise - - url = self.BASE_URL + '/matters/{}'.format(matter_id) - self.warning('Bill could not be found in web interface: {}'.format(url)) - if not self.scrape_restricted: - return None - - else: - matter['legistar_url'] = legistar_url - - return matter - - def 
endpoint(self, route, *args): - url = self.BASE_URL + route - response = self.get(url.format(*args)) - return response.json() - - code_sections = partialmethod(endpoint, 'matters/{0}/codesections') - - def topics(self, *args, **kwargs): - if args: - return self.endpoint('/matters/{0}/indexes', *args) - else: - matter_indexes_url = self.BASE_URL + '/indexes' - return self.pages(matter_indexes_url, - params=kwargs, - item_key="IndexId") - - def attachments(self, matter_id): - attachments = self.endpoint('/matters/{0}/attachments', matter_id) - - unique_attachments = [] - scraped_urls = set() - - # Handle matters with duplicate attachments. - for attachment in attachments: - url = attachment['MatterAttachmentHyperlink'] - if url not in scraped_urls: - unique_attachments.append(attachment) - scraped_urls.add(url) - - return unique_attachments - - def votes(self, history_id): - url = self.BASE_URL + '/eventitems/{0}/votes'.format(history_id) - - try: - response = self.get(url) - except requests.HTTPError as e: - if e.response.status_code == 404: - return [] - else: - raise - - if self._missing_votes(response): - return [] - else: - return response.json() - - def history(self, matter_id): - actions = self.endpoint('/matters/{0}/histories', matter_id) - for action in actions: - action['MatterHistoryActionName'] = action['MatterHistoryActionName'].strip() - - actions = sorted((action for action in actions - if (action['MatterHistoryActionDate'] and - action['MatterHistoryActionName'] and - action['MatterHistoryActionBodyName'])), - key=lambda action: action['MatterHistoryActionDate']) - - # sometimes there are exact duplicates of actions. while this - # is a a data entry problem that ideally the source system - # would fix, they ain't always the way the world works. - # - # so, remove adjacent duplicate items. - uniq_actions = [] - - previous_key = None - for action in actions: - # these are the attributes that pupa uses for - # checking for duplicate vote events - current_key = (action['MatterHistoryActionName'], - action['MatterHistoryActionBodyName']) - if current_key != previous_key: - uniq_actions.append(action) - previous_key = current_key - else: - self.warning('"{0} by {1}" appears more than once in {2}/matters/{3}/histories. Duplicate actions have been removed.'.format( - current_key[0], - current_key[1], - self.BASE_URL, - matter_id)) - - return uniq_actions - - def sponsors(self, matter_id): - spons = self.endpoint('/matters/{0}/sponsors', matter_id) - - if spons: - max_version = max( - (sponsor['MatterSponsorMatterVersion'] for sponsor in spons), - key=lambda version: self._version_rank(version) - ) - - spons = [sponsor for sponsor in spons - if sponsor['MatterSponsorMatterVersion'] == str(max_version)] - - return sorted(spons, - key=lambda sponsor: sponsor["MatterSponsorSequence"]) - - else: - return [] - - def _version_rank(self, version): - ''' - In general, matter versions are numbers. This method provides an - override opportunity for handling versions that are not numbers. - ''' - return int(version) - - def relations(self, matter_id): - relations = self.endpoint('/matters/{0}/relations', matter_id) - - if relations: - return self._filter_relations(relations) - - else: - return [] - - def _filter_relations(self, relations): - ''' - Sometimes, many versions of a bill are related. This method returns the - most recent version of each relation. Override this method to apply a - different filter or return the full array of relations. 
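# The endpoint()/partialmethod pattern above makes additional API routes cheap
# to expose. A sketch of how a subclass might add one (the attribute name is
# invented; the route itself appears elsewhere in this module):
from functools import partialmethod


class ExtendedBillScraper(LegistarAPIBillScraper):
    # scraper.histories(matter_id) behaves like
    # scraper.endpoint('/matters/{0}/histories', matter_id)
    histories = partialmethod(LegistarAPIBillScraper.endpoint,
                              '/matters/{0}/histories')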
- ''' - # Sort relations such that the latest version of each matter - # ID is returned first. - sorted_relations = sorted( - relations, - key=lambda x: ( - x['MatterRelationMatterId'], - x['MatterRelationFlag'] - ), - reverse=True - ) - - seen_relations = set() - - for relation in sorted_relations: - relation_id = relation['MatterRelationMatterId'] - - if relation_id not in seen_relations: - yield relation - seen_relations.add(relation_id) - - def text(self, matter_id, latest_version_value=None): - '''Historically, we have determined the latest version of a bill - by finding the version with the highest value (either numerical or alphabetical). - - However, the `MatterVersion` field on the matter detail page - most accurately identifies the latest version of a bill. - This proves to be true for Metro, in particular. - - Other municipalities may share this characteristic with Metro. - Until we know more, the `text` function accepts `latest_version_value`, - i.e., matter['MatterVersion'], as an optional argument.''' - - version_route = '/matters/{0}/versions' - text_route = '/matters/{0}/texts/{1}' - - versions = self.endpoint(version_route, matter_id) - - if latest_version_value: - latest_version = next( - version for version - in versions - if version['Value'] == latest_version_value) - else: - latest_version = max( - versions, key=lambda x: self._version_rank(x['Value'])) - - text_url = self.BASE_URL + \ - text_route.format(matter_id, latest_version['Key']) - response = self.get(text_url, stream=True) - if int(response.headers['Content-Length']) < 21052630: - return response.json() - - def legislation_detail_url(self, matter_id): - gateway_url = self.BASE_WEB_URL + '/gateway.aspx?m=l&id={0}'.format(matter_id) - - # We want to supress any session level params for this head request, - # since they could lead to an additonal level of redirect. - # - # Per - # http://docs.python-requests.org/en/master/user/advanced/, we - # have to do this by setting session level params to None - response = self.head( - gateway_url, - params={k: None for k in self.params} - ) - - # If the gateway URL redirects, the matter is publicly viewable. Grab - # its detail URL from the response headers. - if response.status_code == 302: - legislation_detail_route = response.headers['Location'] - return urljoin(self.BASE_WEB_URL, legislation_detail_route) - - # If the gateway URL returns a 200, it has not redirected, i.e., the - # matter is not publicly viewable. Return an unauthorized response. - elif response.status_code == 200: - response.status_code = 403 - raise scrapelib.HTTPError(response) - - # If the status code is anything but a 200 or 302, something is wrong. - # Raise an HTTPError to interrupt the scrape. - else: - self.error('{0} returned an unexpected status code: {1}'.format(gateway_url, response.status_code)) - response.status_code = 500 - raise scrapelib.HTTPError(response) - - def _missing_votes(self, response): - ''' - Check to see if a response has the particular status code and - error message that corresponds to inaccessible eventitem votes. - - see `accept_response` for more discussion of why we are doing this. - ''' - missing = (response.status_code == 500 and - response.json().get('InnerException', {}).get('ExceptionMessage', '') == "The cast to value type 'System.Int32' failed because the materialized value is null. 
Either the result type's generic parameter or the query must use a nullable type.") # noqa : 501 - return missing - - def accept_response(self, response, **kwargs): - ''' - Sometimes there ought to be votes on an eventitem but when we - visit the votes page, the API returns a 500 status code and a - particular error message. - - Typically, on 500 errors, we'll retry a few times because the - errors are often transient. In this particular case, the errors - are never transient. - - This happens frequently. If we retried on all those - cases, it would really slow down the scraping. To avoid that - we short circuit scrapelib's retry mechanism for this particular - error. - ''' - accept = (super().accept_response(response) or - self._missing_votes(response) or - response.status_code <= 403) - return accept diff --git a/legistar/events.py b/legistar/events.py deleted file mode 100644 index 839a1e1..0000000 --- a/legistar/events.py +++ /dev/null @@ -1,481 +0,0 @@ -from abc import ABCMeta, abstractmethod -import time -import datetime -from collections import deque -import esprima - -import pytz -import icalendar -import scrapelib - -from .base import LegistarScraper, LegistarAPIScraper - - -class LegistarEventsScraper(LegistarScraper): - ECOMMENT_JS_URLS = ( - 'https://metro.granicusideas.com/meetings.js', - 'https://metro.granicusideas.com/meetings.js?scope=past' - ) - - def __init__(self, *args, event_info_key='Meeting Details', **kwargs): - super().__init__(*args, **kwargs) - self.event_info_key = event_info_key - - - @property - def ecomment_dict(self): - """ - Parse event IDs and eComment links from JavaScript file with lines like: - activateEcomment('750', '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', 'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'); - """ - if getattr(self, '_ecomment_dict', None) is None: - ecomment_dict = {} - - # Define a callback to apply to each node, e.g., - # https://esprima.readthedocs.io/en/latest/syntactic-analysis.html#example-console-calls-removal - def is_activateEcomment(node, metadata): - if node.callee and node.callee.name == 'activateEcomment': - event_id, _, comment_url = node.arguments - ecomment_dict[event_id.value] = comment_url.value - - for url in self.ECOMMENT_JS_URLS: - response = self.get(url) - esprima.parse(response.text, delegate=is_activateEcomment) - - self._ecomment_dict = ecomment_dict - - return self._ecomment_dict - - def eventPages(self, since): - - page = self.lxmlize(self.EVENTSPAGE) - for page in self.eventSearch(page, since): - yield page - - def should_cache_response(self, response): - # Never cache the top level events page, because that may result in - # expired .NET state values. - return (super().should_cache_response(response) and - response.url != self.EVENTSPAGE) - - def eventSearch(self, page, since): - payload = self.sessionSecrets(page) - - payload['ctl00_ContentPlaceHolder1_lstYears_ClientState'] = '{"value":"%s"}' % since - - payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$lstYears' - - return self.pages(self.EVENTSPAGE, payload) - - def events(self, follow_links=True, since=None): - # If an event is added to the the legistar system while we - # are scraping, it will shift the list of events down and - # we might revisit the same event. 
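# A self-contained sketch of the esprima "delegate" technique that
# ecomment_dict uses above, run on an inline JavaScript snippet instead of
# the live meetings.js feeds (the values below are made up).
import esprima

js = """
activateEcomment('750', 'ABC-GUID', 'https://example.com/meetings/750');
activateEcomment('751', 'DEF-GUID', 'https://example.com/meetings/751');
"""

links = {}


def collect(node, metadata):
    # Record the first and third arguments (event id, comment URL) of every
    # activateEcomment(...) call.
    if node.callee and node.callee.name == 'activateEcomment':
        event_id, _, comment_url = node.arguments
        links[event_id.value] = comment_url.value


esprima.parse(js, delegate=collect)
# links == {'750': 'https://example.com/meetings/750',
#           '751': 'https://example.com/meetings/751'}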
So, we keep track of - # the last few events we've visited in order to - # make sure we are not revisiting - scraped_events = deque([], maxlen=10) - - current_year = self.now().year - - if since: - if since > current_year: - raise ValueError( - 'Value of :since cannot exceed {}'.format(current_year)) - else: - since_year = since - 1 - - else: - since_year = 0 - - # Anticipate events will be scheduled for the following year to avoid - # missing upcoming events during scrapes near the end of the current - # year. - for year in range(current_year + 1, since_year, -1): - no_events_in_year = True - - for page in self.eventPages(year): - events_table = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']")[0] - for event, _, _ in self.parseDataTable(events_table): - ical_url = event['iCalendar']['url'] - if ical_url in scraped_events: - continue - else: - scraped_events.append(ical_url) - - if follow_links and type(event[self.event_info_key]) == dict: - agenda = self.agenda(event[self.event_info_key]['url']) - else: - agenda = None - - yield event, agenda - no_events_in_year = False - - # We scrape events in reverse chronological order, starting one year - # in the future. Stop scraping if there are no events in a given - # year, unless that year is in the future, because whether events - # have been scheduled in the future is not a reliable indication of - # whether any happened in the previous year. - if no_events_in_year and year <= current_year: - break - - def agenda(self, detail_url): - page = self.lxmlize(detail_url) - - payload = self.sessionSecrets(page) - - payload.update({"__EVENTARGUMENT": "3:1", - "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain"}) - - for page in self.pages(detail_url, payload): - agenda_table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] - agenda = self.parseDataTable(agenda_table) - yield from agenda - - def addDocs(self, e, events, doc_type): - try: - if events[doc_type] != 'Not\xa0available': - e.add_document(note=events[doc_type]['label'], - url=events[doc_type]['url'], - media_type="application/pdf") - except ValueError: - pass - - def extractRollCall(self, action_detail_url): - action_detail_page = self.lxmlize(action_detail_url) - try: - rollcall_table = action_detail_page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']")[0] - except IndexError: - self.warning("No rollcall found in table") - return [] - roll_call = list(self.parseDataTable(rollcall_table)) - call_list = [] - for call, _, _ in roll_call: - option = call['Attendance'] - call_list.append((option, - call['Person Name']['label'])) - - return call_list - - def ical(self, ical_text): - value = icalendar.Calendar.from_ical(ical_text) - return value - - def _parse_detail(self, key, field_1, field_2): - if key == 'eComment': - return self._get_ecomment_link(field_2) or field_2.text_content().strip() - - def _get_ecomment_link(self, link): - event_id = link.attrib['data-event-id'] - return self.ecomment_dict.get(event_id, None) - - -class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta): - webscraper_class = LegistarEventsScraper - WEB_RETRY_EVENTS = 3 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._webscraper = self._init_webscraper() - - def _init_webscraper(self): - webscraper = self.webscraper_class( - requests_per_minute=self.requests_per_minute, - retry_attempts=self.WEB_RETRY_EVENTS) - - if self.cache_storage: - webscraper.cache_storage 
= self.cache_storage - - webscraper.cache_write_only = self.cache_write_only - - webscraper.BASE_URL = self.WEB_URL - webscraper.EVENTSPAGE = self.EVENTSPAGE - webscraper.BASE_URL = self.WEB_URL - webscraper.TIMEZONE = self.TIMEZONE - webscraper.date_format = '%m/%d/%Y' - - return webscraper - - @abstractmethod - def _get_web_event(self, api_event): - pass - - def api_events(self, since_datetime=None): - # scrape from oldest to newest. This makes resuming big - # scraping jobs easier because upon a scrape failure we can - # import everything scraped and then scrape everything newer - # then the last event we scraped - params = {'$orderby': 'EventLastModifiedUtc'} - - if since_datetime: - # We include events three days before the given start date - # to make sure we grab updated fields (e.g. audio recordings) - # that don't update the last modified timestamp. - backwards_window = datetime.timedelta(hours=72) - since_iso = (since_datetime - backwards_window).isoformat() - - # Minutes are often published after an event occurs – without a - # corresponding event modification. Query all update fields so later - # changes are always caught by our scraper, particularly when - # scraping narrower windows of time. - update_fields = ('EventDate', - 'EventLastModifiedUtc', - 'EventAgendaLastPublishedUTC', - 'EventMinutesLastPublishedUTC') - - since_fmt = "{field} gt datetime'{since_datetime}'" - since_filter =\ - ' or '.join(since_fmt.format(field=field, - since_datetime=since_iso) - for field in update_fields) - - params['$filter'] = since_filter - - events_url = self.BASE_URL + '/events/' - - yield from self.pages(events_url, - params=params, - item_key="EventId") - - def events(self, since_datetime=None): - for api_event in self.api_events(since_datetime=since_datetime): - if event := self.event(api_event): - yield event - - def event(self, api_event): - time_str = api_event["EventTime"] - if not time_str: # If we don't have an event time, skip it - return - try: - # Start times are entered manually. Sometimes, they don't - # conform to this format. Log events with invalid start times, - # but don't interrupt the scrape for them. 
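# An illustrative, standalone version of the parsing that follows --
# combining the API's separate EventDate and EventTime fields. The sample
# values and timezone are made up.
import datetime
import time

import pytz

date_format = '%Y-%m-%dT%H:%M:%S'    # e.g. "2024-03-16T00:00:00"
time_string_format = '%I:%M %p'      # e.g. "5:00 PM"

start_time = time.strptime('5:00 PM', time_string_format)
start_date = datetime.datetime.strptime('2024-03-16T00:00:00', date_format)
start = pytz.timezone('America/Los_Angeles').localize(
    start_date.replace(hour=start_time.tm_hour, minute=start_time.tm_min))
# start == 2024-03-16 17:00:00-07:00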
- start_time = time.strptime(time_str, self.time_string_format) - except ValueError: - event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"]) - self.logger.error( - 'API event has invalid start time "{0}": {1}'.format( - time_str, event_url - ) - ) - return - - start = self.toTime(api_event["EventDate"]) - api_event["start"] = start.replace( - hour=start_time.tm_hour, minute=start_time.tm_min - ) - - api_event["status"] = self._event_status(api_event) - - web_event = self._get_web_event(api_event) - - if web_event: - return api_event, web_event - - else: - event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"]) - self.warning( - "API event could not be found in web interface: {0}".format(event_url) - ) - - def agenda(self, event): - agenda_url = (self.BASE_URL + - '/events/{}/eventitems'.format(event['EventId'])) - - response = self.get(agenda_url) - - # If an event item does not have a value for - # EventItemAgendaSequence, it is not on the agenda - filtered_items = (item for item in response.json() - if (item['EventItemTitle'] and - item['EventItemAgendaSequence'])) - sorted_items = sorted(filtered_items, - key=lambda item: item['EventItemAgendaSequence']) - - for item in sorted_items: - self._suppress_item_matter(item, agenda_url) - yield item - - def minutes(self, event): - minutes_url = (self.BASE_URL + - '/events/{}/eventitems'.format(event['EventId'])) - - response = self.get(minutes_url) - - # If an event item does not have a value for - # EventItemMinutesSequence, it is not in the minutes - filtered_items = (item for item in response.json() - if (item['EventItemTitle'] and - item['EventItemMinutesSequence'])) - sorted_items = sorted(filtered_items, - key=lambda item: item['EventItemMinutesSequence']) - - for item in sorted_items: - self._suppress_item_matter(item, minutes_url) - yield item - - def _suppress_item_matter(self, item, agenda_url): - ''' - Agenda items in Legistar do not always display links to - associated matter files even if the same agenda item - in the API references a Matter File. The agenda items - we scrape should honor the suppression on the Legistar - agendas. - - This is also practical because matter files that are hidden - in the Legistar Agenda do not seem to available for scraping - on Legistar or through the API - - Since we are not completely sure that the same suppression - logic should be used for all Legislative Bodies, this method - is currently just a hook for being overridden in particular - scrapers. As of now, at least LA Metro uses this hook. - ''' - pass - - def rollcalls(self, event): - for item in self.agenda(event): - if item['EventItemRollCallFlag']: - rollcall_url = self.BASE_URL + \ - '/eventitems/{}/rollcalls'.format(item['EventItemId']) - - response = self.get(rollcall_url) - - for item in response.json(): - yield item - - def addDocs(self, e, events, doc_type): - try: - if events[doc_type] != 'Not\xa0available': - e.add_document(note=events[doc_type]['label'], - url=events[doc_type]['url'], - media_type="application/pdf") - except ValueError: - pass - - def _event_status(self, event): - '''Events can have a status of tentative, confirmed, cancelled, or - passed (http://docs.opencivicdata.org/en/latest/data/event.html). By - default, set status to passed if the current date and time exceeds the - event date and time, or confirmed otherwise. Available for override in - jurisdictional scrapers. 
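# A hypothetical jurisdictional override of _event_status, as the docstring
# above invites. The EventComment field and the cancellation heuristic are
# assumptions for illustration, not something this diff defines.
class ExampleEventScraper(LegistarAPIEventScraper):
    BASE_URL = 'https://webapi.legistar.com/v1/example'        # placeholder
    WEB_URL = 'https://example.legistar.com'                   # placeholder
    EVENTSPAGE = 'https://example.legistar.com/Calendar.aspx'  # placeholder
    TIMEZONE = 'America/Chicago'

    def _event_status(self, event):
        # Treat explicitly cancelled meetings as such; otherwise fall back
        # to the default passed/confirmed logic.
        if 'cancel' in (event.get('EventComment') or '').lower():
            return 'cancelled'
        return super()._event_status(event)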
- ''' - if datetime.datetime.utcnow().replace(tzinfo=pytz.utc) > event['start']: - status = 'passed' - else: - status = 'confirmed' - - return status - - -class LegistarAPIEventScraper(LegistarAPIEventScraperBase): - - def _get_web_event(self, api_event): - return self.web_detail(api_event) - - def web_detail(self, event): - ''' - Grabs the information for an event from the Legistar website - and returns as a dictionary. - ''' - insite_url = event['EventInSiteURL'] - - try: - event_page = self._webscraper.lxmlize(insite_url) - except scrapelib.HTTPError as e: - if e.response.status_code == 410: - return None - elif e.response.status_code == 503: - # Events with draft agendas sometimes have an EventInSiteURL - # that resolves to a 503 status code - self.logger.error( - f"Error while fetching event detail at {insite_url}: {e}" - ) - return None - else: - raise - - div_id = 'ctl00_ContentPlaceHolder1_pageTop1' - detail_div = event_page.xpath(".//div[@id='%s']" % div_id)[0] - - event_page_details = self._webscraper.parseDetails(detail_div) - event_page_details['Meeting Details'] = {'url': insite_url} - - return event_page_details - - -class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase): - ''' - There are some inSite sites that have information that only appears - event listing page, like NYC's 'Meeting Topic.' This scraper visits - the listing page and attempts to zip API and web events together - ''' - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # Set attribute equal to an instance of our generator yielding events - # scraped from the Legistar web interface. This allows us to pause - # and resume iteration as needed. - self._events = self._scrapeWebCalendar() - - # Instantiate dictionary where events from generator are stored as they - # are scraped. - self._scraped_events = {} - - def _get_web_event(self, api_event): - if self._not_in_web_interface(api_event): - return None - else: - # None if entire web calendar scraped but API event not found - return self.web_results(api_event) - - def web_results(self, event): - api_key = (event['EventBodyName'].strip(), - event['start']) - - # Check the cache of events we've already scraped from the web interface - # for the API event at hand. - if api_key in self._scraped_events: - return self._scraped_events[api_key] - - else: - # If API event not in web scrape cache, continue scraping the web - # interface. - for web_key, event in self._events: - self._scraped_events[web_key] = event - # When we find the API event, stop scraping. - if web_key == api_key: - return event - - def _scrapeWebCalendar(self): - '''Generator yielding events from Legistar in roughly reverse - chronological order. - ''' - for event, _ in self._webscraper.events(follow_links=False): - event_key = self._event_key(event, self._webscraper) - yield event_key, event - - def _event_key(self, event, web_scraper): - '''Since Legistar InSite contains more information about events than - are available in the API, we need to scrape both. Then, we have - to line them up. This method makes a key that should be - uniquely identify every event and will allow us to link - events from the two data sources. 
- ''' - response = web_scraper.get(event['iCalendar']['url'], verify=False) - event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt - event_time = pytz.timezone(self.TIMEZONE).localize(event_time) - - key = (event['Name']['label'], - event_time) - - return key - - def _not_in_web_interface(self, event): - '''Occasionally, an event will appear in the API, but not in the web - interface. This method checks attributes of the API event that tell us - whether the given event is one of those cases, returning True if so, and - False otherwise. Available for override in jurisdictional scrapers. - ''' - return False diff --git a/legistar/people.py b/legistar/people.py deleted file mode 100644 index c7e4f48..0000000 --- a/legistar/people.py +++ /dev/null @@ -1,87 +0,0 @@ -from .base import LegistarScraper, LegistarAPIScraper - - -class LegistarPersonScraper(LegistarScraper): - MEMBERLIST = None - ALL_MEMBERS = None - - def councilMembers(self, extra_args=None, follow_links=True): - payload = {} - if extra_args: - payload.update(extra_args) - page = self.lxmlize(self.MEMBERLIST, payload) - payload.update(self.sessionSecrets(page)) - - if self.ALL_MEMBERS: - payload['__EVENTTARGET'] = "ctl00$ContentPlaceHolder1$menuPeople" - payload['__EVENTARGUMENT'] = self.ALL_MEMBERS - - for page in self.pages(self.MEMBERLIST, payload): - table = page.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridPeople_ctl00']")[0] - - for councilman, headers, row in self.parseDataTable(table): - if follow_links and type(councilman['Person Name']) == dict: - - detail_url = councilman['Person Name']['url'] - councilman_details = self.lxmlize(detail_url) - detail_div = councilman_details.xpath( - ".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']")[0] - - councilman.update(self.parseDetails(detail_div)) - - img = councilman_details.xpath( - "//img[@id='ctl00_ContentPlaceHolder1_imgPhoto']") - if img: - councilman['Photo'] = img[0].get('src') - - committee_table = councilman_details.xpath( - "//table[@id='ctl00_ContentPlaceHolder1_gridDepartments_ctl00']")[0] - committees = self.parseDataTable(committee_table) - - yield councilman, committees - - else: - yield councilman - - -class LegistarAPIPersonScraper(LegistarAPIScraper): - date_format = '%Y-%m-%dT%H:%M:%S' - - def body_types(self): - body_types_url = self.BASE_URL + '/bodytypes/' - response = self.get(body_types_url) - - types = {body_type['BodyTypeName']: body_type['BodyTypeId'] - for body_type in response.json()} - - return types - - def bodies(self): - bodies_url = self.BASE_URL + '/bodies/' - - for body in self.pages(bodies_url, item_key="BodyId"): - yield body - - def body_offices(self, body): - body_id = body['BodyId'] - - offices_url = (self.BASE_URL + - '/bodies/{}/OfficeRecords'.format(body_id)) - - for office in self.pages(offices_url, item_key="OfficeRecordId"): - yield office - - def toDate(self, text): - return self.toTime(text).date() - - def person_sources_from_office(self, office): - person_api_url = (self.BASE_URL + - '/persons/{OfficeRecordPersonId}'.format(**office)) - - response = self.get(person_api_url) - - route = '/PersonDetail.aspx?ID={PersonId}&GUID={PersonGuid}' - person_web_url = self.WEB_URL + route.format(**response.json()) - - return person_api_url, person_web_url diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3c02f45 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,97 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] 
+name = "legistar" +dynamic = ["version"] +description = "Mixin classes for legistar scrapers" +readme = "README.md" +license = {text = "BSD"} +authors = [ + {name = "Forest Gregg", email = "fgregg@datamade.us"} +] +maintainers = [ + {name = "Forest Gregg", email = "fgregg@datamade.us"} +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "requests", + "lxml", + "pytz", + "icalendar", + "scrapelib", + "esprima" +] +requires-python = ">=3.8" + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-mock", + "requests-mock", + "flake8", + "flake8-pyproject", + "black", +] +test = [ + "pytest", + "pytest-mock", + "requests-mock" +] + +[project.urls] +Homepage = "http://github.com/opencivicdata/python-legistar-scraper/" +Repository = "http://github.com/opencivicdata/python-legistar-scraper/" +Issues = "http://github.com/opencivicdata/python-legistar-scraper/issues" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.dynamic] +version = {attr = "legistar.__version__"} + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v -p no:warnings" +norecursedirs = [".env"] + +[tool.flake8] +max-line-length = 88 +max-complexity = 10 +exclude = [ + ".git", + "__pycache__", + "build", + "dist", + ".env", + ".venv", + "venv", + "*.egg-info" +] +ignore = [ + "E203", # whitespace before ':' (conflicts with black) + "W503", # line break before binary operator (PEP 8 recommends this) +] +per-file-ignores = [ + "__init__.py:F401", # imported but unused (common in __init__.py files) +] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ab92662..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytest -pytest-mock -requests-mock diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index f60784d..0000000 --- a/setup.cfg +++ /dev/null @@ -1,3 +0,0 @@ -[tool:pytest] -addopts = -v -p no:warnings -norecursedirs=.env diff --git a/setup.py b/setup.py deleted file mode 100644 index 547f29e..0000000 --- a/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup -from legistar import __version__ - -long_description = '' - -setup(name='legistar', - version=__version__, - packages=['legistar'], - author='Forest Gregg', - author_email='fgregg@datamade.us', - license='BSD', - url='http://github.com/opencivicdata/python-legistar-scraper/', - description='Mixin classes for legistar scrapers', - long_description=long_description, - platforms=['any'], - install_requires=['requests', - 'lxml', - 'pytz', - 'icalendar', - 'scrapelib', - 'esprima' - ], - classifiers=["Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Topic :: Software 
Development :: Libraries :: Python Modules", - ], - ) diff --git a/src/legistar/__init__.py b/src/legistar/__init__.py new file mode 100644 index 0000000..1e09a73 --- /dev/null +++ b/src/legistar/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.1" # pragma: no cover diff --git a/src/legistar/api/__init__.py b/src/legistar/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/legistar/api/base.py b/src/legistar/api/base.py new file mode 100644 index 0000000..1292163 --- /dev/null +++ b/src/legistar/api/base.py @@ -0,0 +1,99 @@ +import datetime +from collections import deque +import requests +import logging + +import scrapelib +import pytz + + +class LegistarAPIScraper(scrapelib.Scraper): + date_format = "%Y-%m-%dT%H:%M:%S" + time_string_format = "%I:%M %p" + utc_timestamp_format = "%Y-%m-%dT%H:%M:%S.%f" + + def __init__(self, *args, **kwargs): + super(LegistarAPIScraper, self).__init__(*args, **kwargs) + self.logger = logging.getLogger("legistar") + self.warning = self.logger.warning + + def to_time(self, text): + time = datetime.datetime.strptime(text, self.date_format) + time = pytz.timezone(self.TIMEZONE).localize(time) + return time + + def to_utc_timestamp(self, text): + try: + time = datetime.datetime.strptime(text, self.utc_timestamp_format) + except ValueError as e: + if "does not match format" in str(e): + time = datetime.datetime.strptime(text, self.date_format) + else: + raise + time = pytz.timezone("UTC").localize(time) + return time + + def search(self, route, item_key, search_conditions): + """ + Base function for searching the Legistar API. + + Arguments: + + route -- The path to search, i.e. /matters/, /events/, etc + item_key -- The unique id field for the items that you are searching. + This is necessary for proper pagination. examples + might be MatterId or EventId + search_conditions -- a string in the OData format for the + your search conditions + http://www.odata.org/documentation/odata-version-3-0/url-conventions/#url5.1.2 + + It would be nice if we could provide a + friendly search API. Something like + https://github.com/tuomur/python-odata + + + Examples: + # Search for bills introduced after Jan. 1, 2017 + search('/matters/', 'MatterId', "MatterIntroDate gt datetime'2017-01-01'") + """ + + search_url = self.BASE_URL + route + + params = {"$filter": search_conditions} + + try: + yield from self.pages(search_url, params=params, item_key=item_key) + except requests.HTTPError as e: + if e.response.status_code == 400: + raise ValueError(e.response.json()["Message"]) + if not self.accept_response(e.response): + raise + + def pages(self, url, params=None, item_key=None): + if params is None: + params = {} + + seen = deque([], maxlen=1000) + + page_num = 0 + response = None + while page_num == 0 or len(response.json()) == 1000: + params["$skip"] = page_num * 1000 + response = self.get(url, params=params) + response.raise_for_status() + + for item in response.json(): + if item[item_key] not in seen: + yield item + seen.append(item[item_key]) + + page_num += 1 + + def accept_response(self, response, **kwargs): + """ + This overrides a method that controls whether + the scraper should retry on an error. We don't + want to retry if the API returns a 400, except for + 410, which means the record no longer exists. 
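# A hedged usage sketch of the new API base class, assuming a subclass that
# supplies BASE_URL and TIMEZONE (the values here are placeholders). The
# search call itself is the example given in the docstring above.
class ExampleAPIScraper(LegistarAPIScraper):
    BASE_URL = 'https://webapi.legistar.com/v1/example'
    TIMEZONE = 'America/Chicago'


scraper = ExampleAPIScraper(requests_per_minute=0)

# pages() fetches results 1,000 at a time via the OData $skip parameter and
# de-duplicates on the item key, so this iterates every matching matter.
for matter in scraper.search('/matters/', 'MatterId',
                             "MatterIntroDate gt datetime'2017-01-01'"):
    print(matter['MatterId'])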
+ """ + return response.status_code < 401 or response.status_code == 410 diff --git a/src/legistar/api/bills.py b/src/legistar/api/bills.py new file mode 100644 index 0000000..408b4c0 --- /dev/null +++ b/src/legistar/api/bills.py @@ -0,0 +1,356 @@ +from functools import partialmethod +from urllib.parse import urljoin + +import requests +import scrapelib + +from .base import LegistarAPIScraper + + +class LegistarAPIBillScraper(LegistarAPIScraper): + def __init__(self, *args, **kwargs): + """ + Initialize the Bill scraper with a `scrape_restricted` property. + Do not collect private bills (i.e., bills with 'MatterRestrictViewViaWeb' + set as True in the API), unless the scrapers have access to them, + e.g., via a token. + """ + super().__init__(*args, **kwargs) + + self.scrape_restricted = False + + def matters(self, since_datetime=None): + # scrape from oldest to newest. This makes resuming big + # scraping jobs easier because upon a scrape failure we can + # import everything scraped and then scrape everything newer + # then the last bill we scraped + params = {"$orderby": "MatterLastModifiedUtc"} + + if since_datetime: + since_iso = since_datetime.isoformat() + + update_fields = ( + "MatterLastModifiedUtc", + "MatterIntroDate", + "MatterPassedDate", + "MatterDate1", + "MatterDate2", + # 'MatterEXDate1', # can't use all 17 search + # terms, this one always + # seems to be not set + "MatterEXDate2", + "MatterEXDate3", + "MatterEXDate4", + "MatterEXDate5", + "MatterEXDate6", + "MatterEXDate7", + "MatterEXDate8", + "MatterEXDate9", + "MatterEXDate10", + "MatterEnactmentDate", + "MatterAgendaDate", + ) + + since_fmt = "{field} gt datetime'{since_datetime}'" + since_filter = " or ".join( + since_fmt.format(field=field, since_datetime=since_iso) + for field in update_fields + ) + + params["$filter"] = since_filter + + matters_url = self.BASE_URL + "/matters" + + for matter in self.pages(matters_url, params=params, item_key="MatterId"): + try: + legistar_url = self.legislation_detail_url(matter["MatterId"]) + + except scrapelib.HTTPError as e: + if e.response.status_code > 403: + raise + + url = matters_url + "/{}".format(matter["MatterId"]) + self.warning("Bill could not be found in web interface: {}".format(url)) + if not self.scrape_restricted: + continue + + else: + matter["legistar_url"] = legistar_url + + yield matter + + def matter(self, matter_id): + matter = self.endpoint("/matters/{}", matter_id) + + try: + legistar_url = self.legislation_detail_url(matter_id) + except scrapelib.HTTPError as e: + if e.response.status_code > 403: + raise + + url = self.BASE_URL + "/matters/{}".format(matter_id) + self.warning("Bill could not be found in web interface: {}".format(url)) + if not self.scrape_restricted: + return None + + else: + matter["legistar_url"] = legistar_url + + return matter + + def endpoint(self, route, *args): + url = self.BASE_URL + route + response = self.get(url.format(*args)) + return response.json() + + code_sections = partialmethod(endpoint, "matters/{0}/codesections") + + def topics(self, *args, **kwargs): + if args: + return self.endpoint("/matters/{0}/indexes", *args) + else: + matter_indexes_url = self.BASE_URL + "/indexes" + return self.pages(matter_indexes_url, params=kwargs, item_key="IndexId") + + def attachments(self, matter_id): + attachments = self.endpoint("/matters/{0}/attachments", matter_id) + + unique_attachments = [] + scraped_urls = set() + + # Handle matters with duplicate attachments. 
+ for attachment in attachments: + url = attachment["MatterAttachmentHyperlink"] + if url not in scraped_urls: + unique_attachments.append(attachment) + scraped_urls.add(url) + + return unique_attachments + + def votes(self, history_id): + url = self.BASE_URL + "/eventitems/{0}/votes".format(history_id) + + try: + response = self.get(url) + except requests.HTTPError as e: + if e.response.status_code == 404: + return [] + else: + raise + + if self._missing_votes(response): + return [] + else: + return response.json() + + def history(self, matter_id): + actions = self.endpoint("/matters/{0}/histories", matter_id) + for action in actions: + action["MatterHistoryActionName"] = action[ + "MatterHistoryActionName" + ].strip() + + actions = sorted( + ( + action + for action in actions + if ( + action["MatterHistoryActionDate"] + and action["MatterHistoryActionName"] + and action["MatterHistoryActionBodyName"] + ) + ), + key=lambda action: action["MatterHistoryActionDate"], + ) + + # sometimes there are exact duplicates of actions. while this + # is a a data entry problem that ideally the source system + # would fix, they ain't always the way the world works. + # + # so, remove adjacent duplicate items. + uniq_actions = [] + + previous_key = None + for action in actions: + # these are the attributes that pupa uses for + # checking for duplicate vote events + current_key = ( + action["MatterHistoryActionName"], + action["MatterHistoryActionBodyName"], + ) + if current_key != previous_key: + uniq_actions.append(action) + previous_key = current_key + else: + self.warning( + '"{0} by {1}" appears more than once in ' + "{2}/matters/{3}/histories. Duplicate actions have been " + "removed.".format( + current_key[0], current_key[1], self.BASE_URL, matter_id + ) + ) + + return uniq_actions + + def sponsors(self, matter_id): + spons = self.endpoint("/matters/{0}/sponsors", matter_id) + + if spons: + max_version = max( + (sponsor["MatterSponsorMatterVersion"] for sponsor in spons), + key=lambda version: self._version_rank(version), + ) + + spons = [ + sponsor + for sponsor in spons + if sponsor["MatterSponsorMatterVersion"] == str(max_version) + ] + + return sorted(spons, key=lambda sponsor: sponsor["MatterSponsorSequence"]) + + else: + return [] + + def _version_rank(self, version): + """ + In general, matter versions are numbers. This method provides an + override opportunity for handling versions that are not numbers. + """ + return int(version) + + def relations(self, matter_id): + relations = self.endpoint("/matters/{0}/relations", matter_id) + + if relations: + return self._filter_relations(relations) + + else: + return [] + + def _filter_relations(self, relations): + """ + Sometimes, many versions of a bill are related. This method returns the + most recent version of each relation. Override this method to apply a + different filter or return the full array of relations. + """ + # Sort relations such that the latest version of each matter + # ID is returned first. 
+ sorted_relations = sorted( + relations, + key=lambda x: (x["MatterRelationMatterId"], x["MatterRelationFlag"]), + reverse=True, + ) + + seen_relations = set() + + for relation in sorted_relations: + relation_id = relation["MatterRelationMatterId"] + + if relation_id not in seen_relations: + yield relation + seen_relations.add(relation_id) + + def text(self, matter_id, latest_version_value=None): + """Historically, we have determined the latest version of a bill + by finding the version with the highest value (either numerical + or alphabetical). + + However, the `MatterVersion` field on the matter detail page + most accurately identifies the latest version of a bill. + This proves to be true for Metro, in particular. + + Other municipalities may share this characteristic with Metro. + Until we know more, the `text` function accepts `latest_version_value`, + i.e., matter['MatterVersion'], as an optional argument.""" + + version_route = "/matters/{0}/versions" + text_route = "/matters/{0}/texts/{1}" + + versions = self.endpoint(version_route, matter_id) + + if latest_version_value: + latest_version = next( + version + for version in versions + if version["Value"] == latest_version_value + ) + else: + latest_version = max(versions, key=lambda x: self._version_rank(x["Value"])) + + text_url = self.BASE_URL + text_route.format(matter_id, latest_version["Key"]) + response = self.get(text_url, stream=True) + if int(response.headers["Content-Length"]) < 21052630: + return response.json() + + def legislation_detail_url(self, matter_id): + gateway_url = self.BASE_WEB_URL + "/gateway.aspx?m=l&id={0}".format(matter_id) + + # We want to supress any session level params for this head request, + # since they could lead to an additonal level of redirect. + # + # Per + # http://docs.python-requests.org/en/master/user/advanced/, we + # have to do this by setting session level params to None + response = self.head(gateway_url, params={k: None for k in self.params}) + + # If the gateway URL redirects, the matter is publicly viewable. Grab + # its detail URL from the response headers. + if response.status_code == 302: + legislation_detail_route = response.headers["Location"] + return urljoin(self.BASE_WEB_URL, legislation_detail_route) + + # If the gateway URL returns a 200, it has not redirected, i.e., the + # matter is not publicly viewable. Return an unauthorized response. + elif response.status_code == 200: + response.status_code = 403 + raise scrapelib.HTTPError(response) + + # If the status code is anything but a 200 or 302, something is wrong. + # Raise an HTTPError to interrupt the scrape. + else: + self.error( + "{0} returned an unexpected status code: {1}".format( + gateway_url, response.status_code + ) + ) + response.status_code = 500 + raise scrapelib.HTTPError(response) + + def _missing_votes(self, response): + """ + Check to see if a response has the particular status code and + error message that corresponds to inaccessible eventitem votes. + + see `accept_response` for more discussion of why we are doing this. + """ + missing = response.status_code == 500 and response.json().get( + "InnerException", {} + ).get("ExceptionMessage", "") == ( + "The cast to value type 'System.Int32' failed because the " + "materialized value is null. Either the result type's " + "generic parameter or the query must use a nullable type." 
+ ) + return missing + + def accept_response(self, response, **kwargs): + """ + Sometimes there ought to be votes on an eventitem but when we + visit the votes page, the API returns a 500 status code and a + particular error message. + + Typically, on 500 errors, we'll retry a few times because the + errors are often transient. In this particular case, the errors + are never transient. + + This happens frequently. If we retried on all those + cases, it would really slow down the scraping. To avoid that + we short circuit scrapelib's retry mechanism for this particular + error. + """ + accept = ( + super().accept_response(response) + or self._missing_votes(response) + or response.status_code <= 403 + ) + return accept diff --git a/src/legistar/api/events.py b/src/legistar/api/events.py new file mode 100644 index 0000000..689a520 --- /dev/null +++ b/src/legistar/api/events.py @@ -0,0 +1,326 @@ +from abc import ABCMeta, abstractmethod +import datetime +import time + +import pytz +import scrapelib + +from .base import LegistarAPIScraper +from ..ui.events import LegistarEventsScraper + + +class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta): + webscraper_class = LegistarEventsScraper + WEB_RETRY_EVENTS = 3 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._webscraper = self._init_webscraper() + + def _init_webscraper(self): + webscraper = self.webscraper_class( + requests_per_minute=self.requests_per_minute, + retry_attempts=self.WEB_RETRY_EVENTS, + ) + + if self.cache_storage: + webscraper.cache_storage = self.cache_storage + + webscraper.cache_write_only = self.cache_write_only + + webscraper.BASE_URL = self.WEB_URL + webscraper.EVENTSPAGE = self.EVENTSPAGE + webscraper.BASE_URL = self.WEB_URL + webscraper.TIMEZONE = self.TIMEZONE + webscraper.date_format = "%m/%d/%Y" + + return webscraper + + @abstractmethod + def _get_web_event(self, api_event): + pass + + def api_events(self, since_datetime=None): + # scrape from oldest to newest. This makes resuming big + # scraping jobs easier because upon a scrape failure we can + # import everything scraped and then scrape everything newer + # then the last event we scraped + params = {"$orderby": "EventLastModifiedUtc"} + + if since_datetime: + # We include events three days before the given start date + # to make sure we grab updated fields (e.g. audio recordings) + # that don't update the last modified timestamp. + backwards_window = datetime.timedelta(hours=72) + since_iso = (since_datetime - backwards_window).isoformat() + + # Minutes are often published after an event occurs – without a + # corresponding event modification. Query all update fields so later + # changes are always caught by our scraper, particularly when + # scraping narrower windows of time. 
+ update_fields = ( + "EventDate", + "EventLastModifiedUtc", + "EventAgendaLastPublishedUTC", + "EventMinutesLastPublishedUTC", + ) + + since_fmt = "{field} gt datetime'{since_datetime}'" + since_filter = " or ".join( + since_fmt.format(field=field, since_datetime=since_iso) + for field in update_fields + ) + + params["$filter"] = since_filter + + events_url = self.BASE_URL + "/events/" + + yield from self.pages(events_url, params=params, item_key="EventId") + + def events(self, since_datetime=None): + for api_event in self.api_events(since_datetime=since_datetime): + if event := self.event(api_event): + yield event + + def event(self, api_event): + time_str = api_event["EventTime"] + if not time_str: # If we don't have an event time, skip it + return + try: + # Start times are entered manually. Sometimes, they don't + # conform to this format. Log events with invalid start times, + # but don't interrupt the scrape for them. + start_time = time.strptime(time_str, self.time_string_format) + except ValueError: + event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"]) + self.logger.error( + 'API event has invalid start time "{0}": {1}'.format( + time_str, event_url + ) + ) + return + + start = self.to_time(api_event["EventDate"]) + api_event["start"] = start.replace( + hour=start_time.tm_hour, minute=start_time.tm_min + ) + + api_event["status"] = self._event_status(api_event) + + web_event = self._get_web_event(api_event) + + if web_event: + return api_event, web_event + + else: + event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"]) + self.warning( + "API event could not be found in web interface: {0}".format(event_url) + ) + + def agenda(self, event): + agenda_url = self.BASE_URL + "/events/{}/eventitems".format(event["EventId"]) + + response = self.get(agenda_url) + + # If an event item does not have a value for + # EventItemAgendaSequence, it is not on the agenda + filtered_items = ( + item + for item in response.json() + if (item["EventItemTitle"] and item["EventItemAgendaSequence"]) + ) + sorted_items = sorted( + filtered_items, key=lambda item: item["EventItemAgendaSequence"] + ) + + for item in sorted_items: + self._suppress_item_matter(item, agenda_url) + yield item + + def minutes(self, event): + minutes_url = self.BASE_URL + "/events/{}/eventitems".format(event["EventId"]) + + response = self.get(minutes_url) + + # If an event item does not have a value for + # EventItemMinutesSequence, it is not in the minutes + filtered_items = ( + item + for item in response.json() + if (item["EventItemTitle"] and item["EventItemMinutesSequence"]) + ) + sorted_items = sorted( + filtered_items, key=lambda item: item["EventItemMinutesSequence"] + ) + + for item in sorted_items: + self._suppress_item_matter(item, minutes_url) + yield item + + def _suppress_item_matter(self, item, agenda_url): + """ + Agenda items in Legistar do not always display links to + associated matter files even if the same agenda item + in the API references a Matter File. The agenda items + we scrape should honor the suppression on the Legistar + agendas. + + This is also practical because matter files that are hidden + in the Legistar Agenda do not seem to available for scraping + on Legistar or through the API + + Since we are not completely sure that the same suppression + logic should be used for all Legislative Bodies, this method + is currently just a hook for being overridden in particular + scrapers. As of now, at least LA Metro uses this hook. 
+ """ + pass + + def rollcalls(self, event): + for item in self.agenda(event): + if item["EventItemRollCallFlag"]: + rollcall_url = self.BASE_URL + "/eventitems/{}/rollcalls".format( + item["EventItemId"] + ) + + response = self.get(rollcall_url) + + for item in response.json(): + yield item + + def add_docs(self, e, events, doc_type): + try: + if events[doc_type] != "Not\xa0available": + e.add_document( + note=events[doc_type]["label"], + url=events[doc_type]["url"], + media_type="application/pdf", + ) + except ValueError: + pass + + def _event_status(self, event): + """Events can have a status of tentative, confirmed, cancelled, or + passed (http://docs.opencivicdata.org/en/latest/data/event.html). By + default, set status to passed if the current date and time exceeds the + event date and time, or confirmed otherwise. Available for override in + jurisdictional scrapers. + """ + if datetime.datetime.utcnow().replace(tzinfo=pytz.utc) > event["start"]: + status = "passed" + else: + status = "confirmed" + + return status + + +class LegistarAPIEventScraper(LegistarAPIEventScraperBase): + + def _get_web_event(self, api_event): + return self.web_detail(api_event) + + def web_detail(self, event): + """ + Grabs the information for an event from the Legistar website + and returns as a dictionary. + """ + insite_url = event["EventInSiteURL"] + + try: + event_page = self._webscraper.lxmlize(insite_url) + except scrapelib.HTTPError as e: + if e.response.status_code == 410: + return None + elif e.response.status_code == 503: + # Events with draft agendas sometimes have an EventInSiteURL + # that resolves to a 503 status code + self.logger.error( + f"Error while fetching event detail at {insite_url}: {e}" + ) + return None + else: + raise + + div_id = "ctl00_ContentPlaceHolder1_pageTop1" + detail_div = event_page.xpath(".//div[@id='%s']" % div_id)[0] + + event_page_details = self._webscraper.parse_details(detail_div) + event_page_details["Meeting Details"] = {"url": insite_url} + + return event_page_details + + +class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase): + """ + There are some inSite sites that have information that only appears + event listing page, like NYC's 'Meeting Topic.' This scraper visits + the listing page and attempts to zip API and web events together + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Set attribute equal to an instance of our generator yielding events + # scraped from the Legistar web interface. This allows us to pause + # and resume iteration as needed. + self._events = self._scrapeWebCalendar() + + # Instantiate dictionary where events from generator are stored as they + # are scraped. + self._scraped_events = {} + + def _get_web_event(self, api_event): + if self._not_in_web_interface(api_event): + return None + else: + # None if entire web calendar scraped but API event not found + return self.web_results(api_event) + + def web_results(self, event): + api_key = (event["EventBodyName"].strip(), event["start"]) + + # Check the cache of events we've already scraped from the web interface + # for the API event at hand. + if api_key in self._scraped_events: + return self._scraped_events[api_key] + + else: + # If API event not in web scrape cache, continue scraping the web + # interface. + for web_key, event in self._events: + self._scraped_events[web_key] = event + # When we find the API event, stop scraping. 
+ if web_key == api_key: + return event + + def _scrapeWebCalendar(self): + """Generator yielding events from Legistar in roughly reverse + chronological order. + """ + for event, _ in self._webscraper.events(follow_links=False): + event_key = self._event_key(event, self._webscraper) + yield event_key, event + + def _event_key(self, event, web_scraper): + """Since Legistar InSite contains more information about events than + are available in the API, we need to scrape both. Then, we have + to line them up. This method makes a key that should be + uniquely identify every event and will allow us to link + events from the two data sources. + """ + response = web_scraper.get(event["iCalendar"]["url"], verify=False) + event_time = web_scraper.ical(response.text).subcomponents[0]["DTSTART"].dt + event_time = pytz.timezone(self.TIMEZONE).localize(event_time) + + key = (event["Name"]["label"], event_time) + + return key + + def _not_in_web_interface(self, event): + """Occasionally, an event will appear in the API, but not in the web + interface. This method checks attributes of the API event that tell us + whether the given event is one of those cases, returning True if so, and + False otherwise. Available for override in jurisdictional scrapers. + """ + return False diff --git a/src/legistar/api/people.py b/src/legistar/api/people.py new file mode 100644 index 0000000..c557ce9 --- /dev/null +++ b/src/legistar/api/people.py @@ -0,0 +1,45 @@ +from .base import LegistarAPIScraper + + +class LegistarAPIPersonScraper(LegistarAPIScraper): + date_format = "%Y-%m-%dT%H:%M:%S" + + def body_types(self): + body_types_url = self.BASE_URL + "/bodytypes/" + response = self.get(body_types_url) + + types = { + body_type["BodyTypeName"]: body_type["BodyTypeId"] + for body_type in response.json() + } + + return types + + def bodies(self): + bodies_url = self.BASE_URL + "/bodies/" + + for body in self.pages(bodies_url, item_key="BodyId"): + yield body + + def body_offices(self, body): + body_id = body["BodyId"] + + offices_url = self.BASE_URL + "/bodies/{}/OfficeRecords".format(body_id) + + for office in self.pages(offices_url, item_key="OfficeRecordId"): + yield office + + def to_date(self, text): + return self.to_time(text).date() + + def person_sources_from_office(self, office): + person_api_url = self.BASE_URL + "/persons/{OfficeRecordPersonId}".format( + **office + ) + + response = self.get(person_api_url) + + route = "/PersonDetail.aspx?ID={PersonId}&GUID={PersonGuid}" + person_web_url = self.WEB_URL + route.format(**response.json()) + + return person_api_url, person_web_url diff --git a/src/legistar/ui/__init__.py b/src/legistar/ui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/legistar/ui/base.py b/src/legistar/ui/base.py new file mode 100644 index 0000000..9f8434d --- /dev/null +++ b/src/legistar/ui/base.py @@ -0,0 +1,295 @@ +import datetime +import itertools +import traceback +from collections import defaultdict +import re +import requests +import json + +import scrapelib +import lxml.html +import lxml.etree as etree +import pytz + + +class LegistarSession(requests.Session): + + def request(self, method, url, **kwargs): + response = super(LegistarSession, self).request(method, url, **kwargs) + payload = kwargs.get("data") + + self._check_errors(response, payload) + + return response + + def _check_errors(self, response, payload): + if response.url.endswith("Error.aspx"): + response.status_code = 503 + raise scrapelib.HTTPError(response) + + if not response.text: + if 
response.request.method.lower() in {"get", "post"}: + response.status_code = 520 + raise scrapelib.HTTPError(response) + + if "This record no longer exists. It might have been deleted." in response.text: + response.status_code = 410 + raise scrapelib.HTTPError(response) + + if payload: + self._range_error(response, payload) + + def _range_error(self, response, payload): + """Legistar intermittently does not return the expected response when + selecting a time range when searching for events. Right now we + are only handling the 'All' range + """ + + if self._range_is_all(payload): + + expected_range = "All Years" + + page = lxml.html.fromstring(response.text) + (returned_range,) = page.xpath( + "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']" + ) + + returned_range = returned_range.value + + if returned_range != expected_range: + response.status_code = 520 + # In the event of a retry, the new request does not + # contain the correct payload data. This comes as a + # result of not updating the payload via sessionSecrets: + # so, we do that here. + payload.update(self.session_secrets(page)) + + raise scrapelib.HTTPError(response) + + def _range_is_all(self, payload): + range_var = "ctl00_ContentPlaceHolder1_lstYears_ClientState" + all_range = ( + range_var in payload and json.loads(payload[range_var])["value"] == "All" + ) + return all_range + + +class LegistarScraper(scrapelib.Scraper, LegistarSession): + date_format = "%m/%d/%Y" + + def __init__(self, *args, **kwargs): + super(LegistarScraper, self).__init__(*args, **kwargs) + + def lxmlize(self, url, payload=None): + """ + Gets page and returns as XML + """ + if payload: + response = self.post(url, payload, verify=False) + else: + response = self.get(url, verify=False) + entry = response.text + page = lxml.html.fromstring(entry) + page.make_links_absolute(url) + return page + + def pages(self, url, payload=None): + page = self.lxmlize(url, payload) + + yield page + + next_page = page.xpath("//a[@class='rgCurrentPage']/following-sibling::a[1]") + if payload and "ctl00$ContentPlaceHolder1$btnSearch" in payload: + del payload["ctl00$ContentPlaceHolder1$btnSearch"] + + while len(next_page) > 0: + if payload is None: + payload = {} + + payload.update(self.session_secrets(page)) + + event_target = next_page[0].attrib["href"].split("'")[1] + + payload["__EVENTTARGET"] = event_target + + page = self.lxmlize(url, payload) + + yield page + + next_page = page.xpath( + "//a[@class='rgCurrentPage']/following-sibling::a[1]" + ) + + def parse_details(self, detail_div): + """ + Parse the data in the top section of a detail page. 
+ """ + detail_query = ( + ".//*[starts-with(@id, 'ctl00_ContentPlaceHolder1_lbl')" + " or starts-with(@id, 'ctl00_ContentPlaceHolder1_hyp')" + " or starts-with(@id, 'ctl00_ContentPlaceHolder1_Label')]" + ) + fields = detail_div.xpath(detail_query) + + details = {} + + for field_key, field in itertools.groupby(fields, _field_key): + field = list(field) + field_1, field_2 = field[0], field[-1] + + key = field_1.text_content().replace(":", "").strip() + + if field_2.find(".//a") is not None: + value = [] + for link in field_2.xpath(".//a"): + value.append( + { + "label": link.text_content().strip(), + "url": self._get_link_address(link), + } + ) + + elif "href" in field_2.attrib: + value = { + "label": field_2.text_content().strip(), + "url": self._get_link_address(field_2), + } + + elif self._parse_detail(key, field_1, field_2): + value = self._parse_detail(key, field_1, field_2) + + else: + value = field_2.text_content().strip() + + details[key] = value + + return details + + def parse_data_table(self, table): + """ + Legistar uses the same kind of data table in a number of + places. This will return a list of dictionaries using the + table headers as keys. + """ + headers = table.xpath(".//th[starts-with(@class, 'rgHeader')]") + rows = table.xpath(".//tr[@class='rgRow' or @class='rgAltRow']") + + keys = [] + + for header in headers: + text_content = header.text_content().replace(" ", " ").strip() + inputs = header.xpath(".//input") + if text_content: + keys.append(text_content) + elif len(inputs) > 0: + keys.append(header.xpath(".//input")[0].value) + else: + keys.append(header.xpath(".//img")[0].get("alt")) + + for row in rows: + data, row = self._parse_table_row(row, keys) + yield dict(data), keys, row + + def _parse_table_row(self, row, keys): + for key, field in zip(keys, row.xpath("./td")): + data = defaultdict(lambda: None) + + try: + text_content = self._stringify(field) + + if field.find(".//a") is not None: + address = self._get_link_address(field.find(".//a")) + if address: + if key.strip() in ["", "ics"] and "View.ashx?M=IC" in address: + key = "iCalendar" + value = {"url": address} + else: + value = {"label": text_content, "url": address} + else: + value = text_content + else: + value = text_content + + except Exception as e: + print("Problem parsing row:") + print(etree.tostring(row)) + print(traceback.format_exc()) + raise e + + else: + data[key] = value + + return data, row + + def _get_link_address(self, link): + url = None + if "onclick" in link.attrib: + onclick = link.attrib["onclick"] + if onclick is not None and onclick.startswith( + ("radopen('", "window.open", "OpenTelerikWindow") + ): + onclick_path = onclick.split("'")[1] + if not onclick_path.startswith("/"): + onclick_path = "/" + onclick_path + url = self.BASE_URL + onclick_path + elif "href" in link.attrib: + url = link.attrib["href"] + + return url + + def _parse_detail(self, key, field_1, field_2): + """ + Perform custom parsing on a given key and field from a detail table. + Available for override on web scraper base classes. 
+ """ + return None + + def _stringify(self, field): + for br in field.xpath("*//br"): + br.tail = "\n" + br.tail if br.tail else "\n" + for em in field.xpath("*//em"): + if em.text: + em.text = "--em--" + em.text + "--em--" + return field.text_content().replace(" ", " ").strip() + + def to_time(self, text): + time = datetime.datetime.strptime(text, self.date_format) + time = pytz.timezone(self.TIMEZONE).localize(time) + return time + + def to_date(self, text): + return self.to_time(text).date().isoformat() + + def now(self): + return datetime.datetime.utcnow().replace(tzinfo=pytz.utc) + + def mdY2Ymd(self, text): + month, day, year = text.split("/") + return "%d-%02d-%02d" % (int(year), int(month), int(day)) + + def session_secrets(self, page): + + payload = {} + payload["__EVENTARGUMENT"] = None + payload["__VIEWSTATE"] = page.xpath("//input[@name='__VIEWSTATE']/@value")[0] + try: + payload["__EVENTVALIDATION"] = page.xpath( + "//input[@name='__EVENTVALIDATION']/@value" + )[0] + except IndexError: + pass + + return payload + + def accept_response(self, response, **kwargs): + if response.status_code == 410: + return True + return super().accept_response(response, **kwargs) + + +def _field_key(x): + field_id = x.attrib["id"] + field = re.split(r"hyp|lbl|Label", field_id)[-1] + field = field.split("Prompt")[0] + field = field.rstrip("X21") + return field diff --git a/src/legistar/ui/bills.py b/src/legistar/ui/bills.py new file mode 100644 index 0000000..eefc0d9 --- /dev/null +++ b/src/legistar/ui/bills.py @@ -0,0 +1,228 @@ +from .base import LegistarScraper +from lxml.etree import tostring +from collections import deque + + +class LegistarBillScraper(LegistarScraper): + def legislation(self, search_text="", created_after=None, created_before=None): + + # If legislation is added to the the legistar system while we + # are scraping, it will shift the list of legislation down and + # we might revisit the same legislation. So, we keep track of + # the last few pieces of legislation we've visited in order to + # make sure we are not revisiting + scraped_leg = deque([], maxlen=10) + + for page in self.search_legislation(search_text, created_after, created_before): + for legislation_summary in self.parse_search_results(page): + if not legislation_summary["url"] in scraped_leg: + yield legislation_summary + scraped_leg.append(legislation_summary["url"]) + + def search_legislation( + self, search_text="", created_after=None, created_before=None + ): + """ + Submit a search query on the legislation search page, and return a list + of summary results. + """ + + page = self.lxmlize(self.LEGISLATION_URL) + + page = self._advancedSearch(page) + + payload = {} + + # Enter the search parameters TODO: Each of the possible form + # fields should be represented as keyword arguments to this + # function. The default query string should be for the the + # default 'Legislative text' field. 
+ payload["ctl00$ContentPlaceHolder1$txtText"] = search_text + + if created_after and created_before: + payload.update(date_within(created_after, created_before)) + + elif created_before: + payload.update(date_bound(created_before)) + payload["ctl00$ContentPlaceHolder1$radFileCreated"] = "<" + + elif created_after: + payload.update(date_bound(created_after)) + payload["ctl00$ContentPlaceHolder1$radFileCreated"] = ">" + + # Return up to one million search results + payload["ctl00_ContentPlaceHolder1_lstMax_ClientState"] = '{"value":"1000000"}' + payload["ctl00_ContentPlaceHolder1_lstYearsAdvanced_ClientState"] = ( + '{"value":"All"}' + ) + payload["ctl00$ContentPlaceHolder1$btnSearch"] = "Search Legislation" + + payload.update(self.session_secrets(page)) + + return self.pages(self.LEGISLATION_URL, payload) + + def parse_search_results(self, page): + """Take a page of search results and return a sequence of data + of tuples about the legislation, of the form + + ('Document ID', 'Document URL', 'Type', 'Status', 'Introduction Date' + 'Passed Date', 'Main Sponsor', 'Title') + """ + table = page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0] + for legislation, headers, row in self.parse_data_table(table): + # Do legislation search-specific stuff + # ------------------------------------ + # First column should be the ID of the record. + id_key = headers[0] + try: + legislation_id = legislation[id_key]["label"] + except TypeError: + continue + legislation_url = legislation[id_key]["url"].split(self.BASE_URL)[-1] + legislation[id_key] = legislation_id + legislation["url"] = ( + self.BASE_URL + legislation_url.split("&Options")[0] + "&FullText=1" + ) + + yield legislation + + def _advancedSearch(self, page): + search_switcher = page.xpath( + "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']" + )[0] + + if "simple search" in search_switcher.value.lower(): + return page + else: + payload = self.session_secrets(page) + payload[search_switcher.name] = search_switcher.value + + page = self.lxmlize(self.LEGISLATION_URL, payload) + + search_button = page.xpath( + "//input[@id='ctl00_ContentPlaceHolder1_btnSwitch']" + )[0] + if "simple search" not in search_button.value.lower(): + raise ValueError("Not on the advanced search page") + + return page + + def details(self, detail_url, div_id): + detail_page = self.lxmlize(detail_url) + + detail_div = detail_page.xpath(".//div[@id='%s']" % div_id)[0] + + return self.parse_details(detail_div) + + def leg_details(self, detail_url): + div_id = "ctl00_ContentPlaceHolder1_pageDetails" + return self.details(detail_url, div_id) + + def action_details(self, detail_url): + div_id = "ctl00_ContentPlaceHolder1_pageTop1" + return self.details(detail_url, div_id) + + def history(self, detail_url): + detail_page = self.lxmlize(detail_url) + + try: + history_table = detail_page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']" + )[0] + except IndexError: + print(detail_url) + raise + + history = [row[0] for row in self.parse_data_table(history_table)] + + try: + history = sorted(history, key=self._action_sort_key) + except (TypeError, ValueError): + pass + + for action in history: + yield action + + def _action_sort_key(self, action): + action_date = self.to_date(action["Date"]) + action_url = action["Action\xa0Details"]["url"] + + return (action_date, action_url) + + def text(self, detail_url): + detail_page = self.lxmlize(detail_url) + + text_div = detail_page.xpath("//div[@id='ctl00_ContentPlaceHolder1_divText']") + + if 
len(text_div): + return tostring(text_div[0], pretty_print=True).decode() + else: + return None + + def extract_votes(self, action_detail_url): + action_detail_page = self.lxmlize(action_detail_url) + try: + vote_table = action_detail_page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']" + )[0] + except IndexError: + self.warning("No votes found in table") + return None, [] + votes = list(self.parse_data_table(vote_table)) + vote_list = [] + for vote, _, _ in votes: + raw_option = vote["Vote"].lower() + vote_list.append( + ( + self.VOTE_OPTIONS.get(raw_option, raw_option), + vote["Person Name"]["label"], + ) + ) + + action_detail_div = action_detail_page.xpath( + ".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']" + )[0] + action_details = self.parse_details(action_detail_div) + result = action_details["Result"].lower() + + return result, vote_list + + +def date_within(created_after, created_before): + payload = date_bound(created_after) + + payload["ctl00$ContentPlaceHolder1$txtFileCreated2"] = ( + "{d.year}-{d.month:02}-{d.day:02}".format(d=created_before) + ) + payload["ctl00$ContentPlaceHolder1$txtFileCreated2$dateInput"] = ( + "{d.month}/{d.day}/{d.year}".format(d=created_before) + ) + + payload["ctl00_ContentPlaceHolder1_txtFileCreated2_dateInput_ClientState"] = ( + '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 + d=created_before + ) + ) + + payload["ctl00$ContentPlaceHolder1$radFileCreated"] = "between" + + return payload + + +def date_bound(creation_date): + payload = {} + + payload["ctl00$ContentPlaceHolder1$txtFileCreated1"] = ( + "{d.year}-{d.month:02}-{d.day:02}".format(d=creation_date) + ) + payload["ctl00$ContentPlaceHolder1$txtFileCreated1$dateInput"] = ( + "{d.month}/{d.day}/{d.year}".format(d=creation_date) + ) + + payload["ctl00_ContentPlaceHolder1_txtFileCreated1_dateInput_ClientState"] = ( + '{{"enabled":true, "emptyMessage":"","validationText":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","valueAsString":"{d.year}-{d.month:02}-{d.day:02}-00-00-00","minDateStr":"1980-01-01-00-00-00","maxDateStr":"2099-12-31-00-00-00", "lastSetTextBoxValue":"{d.month}/{d.day}/{d.year}"}}'.format( # noqa : E501 + d=creation_date + ) + ) + + return payload diff --git a/src/legistar/ui/events.py b/src/legistar/ui/events.py new file mode 100644 index 0000000..01eedf2 --- /dev/null +++ b/src/legistar/ui/events.py @@ -0,0 +1,182 @@ +from collections import deque + +import esprima +import icalendar + +from .base import LegistarScraper + + +class LegistarEventsScraper(LegistarScraper): + ECOMMENT_JS_URLS = ( + "https://metro.granicusideas.com/meetings.js", + "https://metro.granicusideas.com/meetings.js?scope=past", + ) + + def __init__(self, *args, event_info_key="Meeting Details", **kwargs): + super().__init__(*args, **kwargs) + self.event_info_key = event_info_key + + @property + def ecomment_dict(self): + """ + Parse event IDs and eComment links from JavaScript file with lines like: + activateEcomment( + '750', + '138A085F-0AC1-4A33-B2F3-AC3D6D9F710B', + 'https://metro.granicusideas.com/meetings/750-finance-budget-and-audit-committee-on-2020-03-16-5-00-pm-test'. 
# noqa + ); + """ + if getattr(self, "_ecomment_dict", None) is None: + ecomment_dict = {} + + # Define a callback to apply to each node, e.g., + # https://esprima.readthedocs.io/en/latest/syntactic-analysis.html#example-console-calls-removal + def is_activateEcomment(node, metadata): + if node.callee and node.callee.name == "activateEcomment": + event_id, _, comment_url = node.arguments + ecomment_dict[event_id.value] = comment_url.value + + for url in self.ECOMMENT_JS_URLS: + response = self.get(url) + esprima.parse(response.text, delegate=is_activateEcomment) + + self._ecomment_dict = ecomment_dict + + return self._ecomment_dict + + def event_pages(self, since): + + page = self.lxmlize(self.EVENTSPAGE) + for page in self.event_search(page, since): + yield page + + def should_cache_response(self, response): + # Never cache the top level events page, because that may result in + # expired .NET state values. + return ( + super().should_cache_response(response) and response.url != self.EVENTSPAGE + ) + + def event_search(self, page, since): + payload = self.session_secrets(page) + + payload["ctl00_ContentPlaceHolder1_lstYears_ClientState"] = ( + '{"value":"%s"}' % since + ) + + payload["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$lstYears" + + return self.pages(self.EVENTSPAGE, payload) + + def events(self, follow_links=True, since=None): + # If an event is added to the the legistar system while we + # are scraping, it will shift the list of events down and + # we might revisit the same event. So, we keep track of + # the last few events we've visited in order to + # make sure we are not revisiting + scraped_events = deque([], maxlen=10) + + current_year = self.now().year + + if since: + if since > current_year: + raise ValueError( + "Value of :since cannot exceed {}".format(current_year) + ) + else: + since_year = since - 1 + + else: + since_year = 0 + + # Anticipate events will be scheduled for the following year to avoid + # missing upcoming events during scrapes near the end of the current + # year. + for year in range(current_year + 1, since_year, -1): + no_events_in_year = True + + for page in self.event_pages(year): + events_table = page.xpath( + "//div[@id='ctl00_ContentPlaceHolder1_MultiPageCalendar']//table[@class='rgMasterTable']" # noqa + )[0] + for event, _, _ in self.parse_data_table(events_table): + ical_url = event["iCalendar"]["url"] + if ical_url in scraped_events: + continue + else: + scraped_events.append(ical_url) + + if follow_links and isinstance(event[self.event_info_key], dict): + agenda = self.agenda(event[self.event_info_key]["url"]) + else: + agenda = None + + yield event, agenda + no_events_in_year = False + + # We scrape events in reverse chronological order, starting one year + # in the future. Stop scraping if there are no events in a given + # year, unless that year is in the future, because whether events + # have been scheduled in the future is not a reliable indication of + # whether any happened in the previous year. 
+ if no_events_in_year and year <= current_year: + break + + def agenda(self, detail_url): + page = self.lxmlize(detail_url) + + payload = self.session_secrets(page) + + payload.update( + { + "__EVENTARGUMENT": "3:1", + "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain", + } + ) + + for page in self.pages(detail_url, payload): + agenda_table = page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']" + )[0] + agenda = self.parse_data_table(agenda_table) + yield from agenda + + def add_docs(self, e, events, doc_type): + try: + if events[doc_type] != "Not\xa0available": + e.add_document( + note=events[doc_type]["label"], + url=events[doc_type]["url"], + media_type="application/pdf", + ) + except ValueError: + pass + + def extract_roll_call(self, action_detail_url): + action_detail_page = self.lxmlize(action_detail_url) + try: + rollcall_table = action_detail_page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']" + )[0] + except IndexError: + self.warning("No rollcall found in table") + return [] + roll_call = list(self.parse_data_table(rollcall_table)) + call_list = [] + for call, _, _ in roll_call: + option = call["Attendance"] + call_list.append((option, call["Person Name"]["label"])) + + return call_list + + def ical(self, ical_text): + value = icalendar.Calendar.from_ical(ical_text) + return value + + def _parse_detail(self, key, field_1, field_2): + if key == "eComment": + return self._get_ecomment_link(field_2) or field_2.text_content().strip() + + def _get_ecomment_link(self, link): + event_id = link.attrib["data-event-id"] + return self.ecomment_dict.get(event_id, None) diff --git a/src/legistar/ui/people.py b/src/legistar/ui/people.py new file mode 100644 index 0000000..6017b79 --- /dev/null +++ b/src/legistar/ui/people.py @@ -0,0 +1,49 @@ +from .base import LegistarScraper + + +class LegistarPersonScraper(LegistarScraper): + MEMBERLIST = None + ALL_MEMBERS = None + + def council_members(self, extra_args=None, follow_links=True): + payload = {} + if extra_args: + payload.update(extra_args) + page = self.lxmlize(self.MEMBERLIST, payload) + payload.update(self.session_secrets(page)) + + if self.ALL_MEMBERS: + payload["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$menuPeople" + payload["__EVENTARGUMENT"] = self.ALL_MEMBERS + + for page in self.pages(self.MEMBERLIST, payload): + table = page.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridPeople_ctl00']" + )[0] + + for councilman, headers, row in self.parse_data_table(table): + if follow_links and isinstance(councilman["Person Name"], dict): + + detail_url = councilman["Person Name"]["url"] + councilman_details = self.lxmlize(detail_url) + detail_div = councilman_details.xpath( + ".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']" + )[0] + + councilman.update(self.parse_details(detail_div)) + + img = councilman_details.xpath( + "//img[@id='ctl00_ContentPlaceHolder1_imgPhoto']" + ) + if img: + councilman["Photo"] = img[0].get("src") + + committee_table = councilman_details.xpath( + "//table[@id='ctl00_ContentPlaceHolder1_gridDepartments_ctl00']" + )[0] + committees = self.parse_data_table(committee_table) + + yield councilman, committees + + else: + yield councilman diff --git a/tests/conftest.py b/tests/conftest.py index 5be6406..950819f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,14 +3,14 @@ import pytest -from legistar import base -from legistar.bills import LegistarAPIBillScraper +from src.legistar.api import base +from src.legistar.api.bills import LegistarAPIBillScraper 
@pytest.fixture(scope="module") def scraper(): scraper = base.LegistarAPIScraper() - scraper.BASE_URL = 'http://webapi.legistar.com/v1/chicago' + scraper.BASE_URL = "http://webapi.legistar.com/v1/chicago" scraper.retry_attempts = 0 scraper.requests_per_minute = 0 return scraper @@ -19,19 +19,19 @@ def scraper(): @pytest.fixture def project_directory(): test_directory = os.path.abspath(os.path.dirname(__file__)) - return os.path.join(test_directory, '..') + return os.path.join(test_directory, "..") @pytest.fixture def fixtures_directory(): test_directory = os.path.abspath(os.path.dirname(__file__)) - return os.path.join(test_directory, 'fixtures') + return os.path.join(test_directory, "fixtures") @pytest.fixture def metro_api_bill_scraper(): scraper = LegistarAPIBillScraper() - scraper.BASE_URL = 'https://webapi.legistar.com/v1/metro' + scraper.BASE_URL = "https://webapi.legistar.com/v1/metro" scraper.retry_attempts = 0 scraper.requests_per_minute = 0 return scraper @@ -40,7 +40,7 @@ def metro_api_bill_scraper(): @pytest.fixture def chicago_api_bill_scraper(): scraper = LegistarAPIBillScraper() - scraper.BASE_URL = 'https://webapi.legistar.com/v1/chicago' + scraper.BASE_URL = "https://webapi.legistar.com/v1/chicago" scraper.retry_attempts = 0 scraper.requests_per_minute = 0 return scraper @@ -48,31 +48,31 @@ def chicago_api_bill_scraper(): @pytest.fixture def matter_index(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'metro', 'matter_index.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "metro", "matter_index.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture @pytest.fixture def all_indexes(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'metro', 'all_indexes.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "metro", "all_indexes.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture @pytest.fixture def dupe_event(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'chicago', 'dupe_event.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "chicago", "dupe_event.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture @pytest.fixture def no_dupe_event(fixtures_directory): - fixture_file = os.path.join(fixtures_directory, 'chicago', 'no_dupe_event.json') - with open(fixture_file, 'r') as f: + fixture_file = os.path.join(fixtures_directory, "chicago", "no_dupe_event.json") + with open(fixture_file, "r") as f: fixture = json.load(f) return fixture diff --git a/tests/refresh_fixtures.py b/tests/refresh_fixtures.py index 173f952..0e4c065 100644 --- a/tests/refresh_fixtures.py +++ b/tests/refresh_fixtures.py @@ -3,52 +3,55 @@ import lxml -from legistar.bills import LegistarBillScraper -from legistar.events import LegistarEventsScraper -from legistar.people import LegistarPersonScraper +from src.legistar.ui.bills import LegistarBillScraper +from src.legistar.ui.events import LegistarEventsScraper +from src.legistar.ui.people import LegistarPersonScraper def save_page(page, jurisdiction, outfile): test_directory = os.path.abspath(os.path.dirname(__file__)) - project_directory = os.path.join(test_directory, '..') + project_directory = os.path.join(test_directory, "..") - with open(os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, outfile), 'wb') as f: + fixture_path = os.path.join( + project_directory, 
"tests", "fixtures", jurisdiction, outfile + ) + with open(fixture_path, "wb") as f: f.write(lxml.html.tostring(page)) def refresh_bills(jurisdiction): s = LegistarBillScraper() - s.LEGISLATION_URL = 'https://{}.legistar.com/Legislation.aspx'.format(jurisdiction) + s.LEGISLATION_URL = "https://{}.legistar.com/Legislation.aspx".format(jurisdiction) - page = next(s.searchLegislation('bus')) + page = next(s.search_legislation("bus")) - save_page(page, jurisdiction, 'bills.html') + save_page(page, jurisdiction, "bills.html") def refresh_events(jurisdiction): s = LegistarEventsScraper() - s.EVENTSPAGE = 'https://{}.legistar.com/Calendar.aspx'.format(jurisdiction) + s.EVENTSPAGE = "https://{}.legistar.com/Calendar.aspx".format(jurisdiction) - page = next(s.eventPages('2018-01-01')) + page = next(s.event_pages("2018-01-01")) - save_page(page, jurisdiction, 'events.html') + save_page(page, jurisdiction, "events.html") def refresh_people(jurisdiction): s = LegistarPersonScraper() - MEMBERLIST = 'https://{}.legistar.com/People.aspx'.format(jurisdiction) + MEMBERLIST = "https://{}.legistar.com/People.aspx".format(jurisdiction) page = next(s.pages(MEMBERLIST)) - save_page(page, jurisdiction, 'people.html') + save_page(page, jurisdiction, "people.html") -if __name__ == '__main__': +if __name__ == "__main__": try: _, jurisdictions = sys.argv - jurisdictions = jurisdictions.split(',') + jurisdictions = jurisdictions.split(",") except ValueError: - jurisdictions = ('chicago', 'metro', 'nyc') + jurisdictions = ("chicago", "metro", "nyc") for j in jurisdictions: refresh_bills(j) diff --git a/tests/test_bills.py b/tests/test_bills.py index 82413cf..a9ab1eb 100644 --- a/tests/test_bills.py +++ b/tests/test_bills.py @@ -5,10 +5,10 @@ def test_topics(metro_api_bill_scraper, matter_index, all_indexes): with requests_mock.Mocker() as m: - matter_matcher = re.compile(r'/matters/5036/indexes') + matter_matcher = re.compile(r"/matters/5036/indexes") m.get(matter_matcher, json=matter_index, status_code=200) - all_matcher = re.compile(r'/metro/indexes') + all_matcher = re.compile(r"/metro/indexes") m.get(all_matcher, json=all_indexes, status_code=200) matter_topics = metro_api_bill_scraper.topics(5036) @@ -23,38 +23,42 @@ def test_topics(metro_api_bill_scraper, matter_index, all_indexes): def test_duplicate_events(chicago_api_bill_scraper, caplog, dupe_event): with requests_mock.Mocker() as m: - event_matcher = re.compile('/matters/38768/histories') + event_matcher = re.compile("/matters/38768/histories") m.get(event_matcher, json=dupe_event, status_code=200) - chicago_api_bill_scraper.history('38768') - assert 'appears more than once' in caplog.text + chicago_api_bill_scraper.history("38768") + assert "appears more than once" in caplog.text def test_no_duplicate(chicago_api_bill_scraper, caplog, no_dupe_event): with requests_mock.Mocker() as m: - event_matcher = re.compile('/matters/38769/histories') + event_matcher = re.compile("/matters/38769/histories") m.get(event_matcher, json=no_dupe_event, status_code=200) - chicago_api_bill_scraper.history('38769') - assert 'appears more than once' not in caplog.text + chicago_api_bill_scraper.history("38769") + assert "appears more than once" not in caplog.text def test_404_votes(chicago_api_bill_scraper): with requests_mock.Mocker() as m: - m.get(re.compile(r'.*'), status_code=404) - votes = chicago_api_bill_scraper.votes('408134') + m.get(re.compile(r".*"), status_code=404) + votes = chicago_api_bill_scraper.votes("408134") assert votes == [] def 
test_500_votes(chicago_api_bill_scraper): with requests_mock.Mocker() as m: - m.get(re.compile(r'.*'), - json={'InnerException': - {'ExceptionMessage': - "The cast to value type 'System.Int32' failed " - "because the materialized value is null. Either " - "the result type's generic parameter or the query " - "must use a nullable type."}}, - status_code=500) - votes = chicago_api_bill_scraper.votes('408134') + m.get( + re.compile(r".*"), + json={ + "InnerException": { + "ExceptionMessage": "The cast to value type 'System.Int32' failed " + "because the materialized value is null. Either " + "the result type's generic parameter or the query " + "must use a nullable type." + } + }, + status_code=500, + ) + votes = chicago_api_bill_scraper.votes("408134") assert votes == [] diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 167c0c9..d00d82a 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -3,47 +3,53 @@ import lxml import pytest -from legistar.bills import LegistarBillScraper -from legistar.events import LegistarEventsScraper -from legistar.people import LegistarPersonScraper +from src.legistar.ui.bills import LegistarBillScraper +from src.legistar.ui.events import LegistarEventsScraper +from src.legistar.ui.people import LegistarPersonScraper -@pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) +@pytest.mark.parametrize("jurisdiction", ["chicago", "metro", "nyc"]) def test_parse_bills(project_directory, jurisdiction): - bills_fixture = os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, 'bills.html') + bills_fixture = os.path.join( + project_directory, "tests", "fixtures", jurisdiction, "bills.html" + ) scraper = LegistarBillScraper() - scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) + scraper.BASE_URL = "{}.legistar.com".format(jurisdiction) - with open(bills_fixture, 'r') as f: + with open(bills_fixture, "r") as f: page = lxml.html.fromstring(f.read()) - result = next(scraper.parseSearchResults(page)) + result = next(scraper.parse_search_results(page)) print(result) -@pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) +@pytest.mark.parametrize("jurisdiction", ["chicago", "metro", "nyc"]) def test_parse_events(project_directory, mocker, jurisdiction): - events_fixture = os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, 'events.html') + events_fixture = os.path.join( + project_directory, "tests", "fixtures", jurisdiction, "events.html" + ) scraper = LegistarEventsScraper() - scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) + scraper.BASE_URL = "{}.legistar.com".format(jurisdiction) - with open(events_fixture, 'r') as f: + with open(events_fixture, "r") as f: page = lxml.html.fromstring(f.read()) - mocker.patch.object(scraper, 'eventPages', return_value=page) + mocker.patch.object(scraper, "event_pages", return_value=page) result, _ = next(scraper.events(follow_links=False)) print(result) -@pytest.mark.parametrize('jurisdiction', ['chicago', 'metro', 'nyc']) +@pytest.mark.parametrize("jurisdiction", ["chicago", "metro", "nyc"]) def test_parse_people(project_directory, mocker, jurisdiction): - events_fixture = os.path.join(project_directory, 'tests', 'fixtures', jurisdiction, 'people.html') + people_fixture = os.path.join( + project_directory, "tests", "fixtures", jurisdiction, "people.html" + ) scraper = LegistarPersonScraper() - scraper.BASE_URL = '{}.legistar.com'.format(jurisdiction) + scraper.BASE_URL = "{}.legistar.com".format(jurisdiction) - with open(events_fixture, 'r') 
as f: + with open(people_fixture, "r") as f: page = lxml.html.fromstring(f.read()) - mocker.patch.object(scraper, 'pages', return_value=page) - result = next(scraper.councilMembers(follow_links=False)) + mocker.patch.object(scraper, "pages", return_value=page) + result = next(scraper.council_members(follow_links=False)) print(result) diff --git a/tests/test_search.py b/tests/test_search.py index 4a4a3dd..7a89ab1 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -5,12 +5,12 @@ class TestAPISearch(object): def test_search_raises(self, scraper): with pytest.raises(ValueError): - results = scraper.search('/events/', 'EventId', - "MatterFile eq 'O2010-5046'") + results = scraper.search( + "/events/", "EventId", "MatterFile eq 'O2010-5046'" + ) list(results) def test_search(self, scraper): - results = scraper.search('/matters/', 'MatterId', - "MatterFile eq 'O2010-5046'") + results = scraper.search("/matters/", "MatterId", "MatterFile eq 'O2010-5046'") assert len(list(results)) == 1