# Source: CouncilML/data.py (hackla-engage/CouncilML, master branch on GitHub)
from bs4 import BeautifulSoup
import requests
import json
import io
import datetime
import pytz
from calendar import timegm
import re

# Time zone used to interpret the (zone-less) meeting times scraped from the agenda page.
local_tz = pytz.timezone("America/Los_Angeles")
# Year prepended to scraped dates before parsing; evaluated once, when the module is imported.
# NOTE(review): agendas dated in a different year than the import date would be
# parsed with the wrong year -- confirm the agenda page only lists the current year.
current_year = datetime.date.today().year
# Index page fetched by get_data(); it is searched for a table with class "agendaTable".
city_council_agendas_url = "https://www.smgov.net/departments/clerk/agendas.aspx"

def get_data():
    '''Scrape the agenda index page and every consent-calendar agenda it links to.

    Fetches city_council_agendas_url, walks the rows of the table with class
    "agendaTable" and, for each row whose first cell parses as a meeting date
    and whose second cell is an "Agenda" link, downloads the linked agenda.
    Agendas whose HTML contains the text "CONSENT CALENDAR" are passed to
    scrape_agenda().

    Returns:
        dict mapping the meeting time (UTC epoch seconds, as returned by
        agenda_date_to_epoch) to the value returned by scrape_agenda for that
        meeting. Empty when the agenda table is not found on the page.
    '''
    agendas = dict()
    with requests.Session() as sess:
        r = sess.get(city_council_agendas_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find('table', {'class': 'agendaTable'})
        if table is None:
            # No agenda table on the page (layout change or error page):
            # there is nothing to scrape.
            return agendas
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                # Header rows and malformed rows carry no date/link pair.
                continue
            try:
                date = agenda_date_to_epoch(cells[0])
            except Exception:
                # Best effort: rows whose first cell is not a parseable
                # meeting date are skipped. Catching Exception (rather than
                # a bare except) lets KeyboardInterrupt/SystemExit through.
                continue
            if cells[1].string != "Agenda":
                continue
            link = cells[1].find('a', {'href': True})
            if link is None:
                # Cell is labelled "Agenda" but has no hyperlink to follow.
                continue
            agenda = sess.get(link['href']).text
            # Plain substring test on the raw HTML; the page is only parsed
            # (inside scrape_agenda) when it has a consent calendar.
            if "CONSENT CALENDAR" in agenda:
                agendas[date] = scrape_agenda(agenda, sess)
    return agendas

def agenda_date_to_epoch(date_str):
    '''Convert a scraped agenda date cell to UTC epoch seconds.

    date_str is an object exposing a ``.string`` attribute (the scraped table
    cell); its stripped text must look like "<Month> <day> <hour>:<minute> <AM/PM>".
    The module-level current_year is prepended before parsing, and the result
    is interpreted as wall-clock time in local_tz.

    Returns the number of seconds since the epoch for that instant.
    '''
    # The page omits the year, so supply it ahead of the scraped text.
    stamp_text = ' '.join([str(current_year), date_str.string.strip()])
    wall_clock = datetime.datetime.strptime(stamp_text, '%Y %B %d %I:%M %p')
    # is_dst=None asks pytz not to guess around DST transitions.
    zoned = local_tz.localize(wall_clock, is_dst=None)
    in_utc = zoned.astimezone(pytz.utc)
    return timegm(in_utc.timetuple())

def scrape_agenda(agenda, sess):
    '''Collect the staff reports linked from one agenda page, grouped by section.

    The agenda's "MeetingDetail" table is scanned row by row, starting after
    the row that holds the first <strong> element. A row containing <strong>
    is treated as a section heading and closes the current group; any other
    row whose third cell has the class "Title" and contains a link is fetched
    and handed to process_staff_report().

    Args:
        agenda: HTML text of the agenda page.
        sess: requests session used to download each staff report.

    Returns:
        A list of groups, one per section that yielded at least one report;
        each group is a list of the non-empty results of process_staff_report.
        Empty when the table or its first heading cannot be found.

    Raises:
        SystemExit: (status 1) when a staff report cannot be downloaded or
            processed; the failing URL and the error are printed first.
    '''
    report_base_url = 'http://santamonicacityca.iqm2.com/Citizens/'

    soup_agenda = BeautifulSoup(agenda, 'html.parser')
    table_meeting = soup_agenda.find('table', {'id': 'MeetingDetail'})
    if table_meeting is None:
        return []
    first_heading = table_meeting.find('strong')
    if first_heading is None:
        return []
    parent = first_heading.find_parent("tr")
    if parent is None:
        return []

    # NOTE: groups are positional. To key them by agenda category instead,
    # the leading number of each heading row's text could be used as a dict key.
    staff_reports = []
    reports_holder = []

    for sibling in parent.find_next_siblings("tr"):
        if sibling.find('strong'):
            # Section heading: close the group collected so far.
            if reports_holder:
                staff_reports.append(reports_holder)
                reports_holder = []
            continue

        cells = sibling.find_all('td')
        # .get() avoids a KeyError on cells that carry no class attribute.
        if len(cells) <= 2 or u'Title' not in cells[2].get('class', []):
            continue
        staff_report = cells[2].find('a', {'href': True})
        if staff_report is None:
            continue
        staff_report_href = report_base_url + staff_report['href']
        try:
            staff_report_html = sess.get(staff_report_href).text
            s_r_processed = process_staff_report(staff_report_html)
        except Exception as e:
            # We should create a log file to capture this output rather than just printing...
            print("could not get: " + staff_report_href)
            print(e)
            # Abort the scrape, but with a failure status (a bare exit()
            # reports success and depends on the interactive `site` helper).
            raise SystemExit(1)
        if s_r_processed:
            reports_holder.append(s_r_processed)

    # Flush the last section: it is not followed by another heading row, so
    # without this its reports would be silently dropped.
    if reports_holder:
        staff_reports.append(reports_holder)

    return staff_reports


def process_staff_report(staff_report_html):
    '''Extract the title, metadata, recommendations and body of a staff report.

    Args:
        staff_report_html: HTML text of the staff report page.

    Returns:
        An empty list when the page has no "LegiFileTitle" div (callers treat
        any empty result as "nothing to keep"). Otherwise a dict with:
          'Title'            -- stripped text of the title div;
          <header label>     -- one entry per th/td pair found in the
                                "LegiFileInfo" table, keyed by the header's
                                <strong> text;
          'Recommendations'  -- text of "divItemDiscussion" with the
                                'Recommended Action' label and non-breaking
                                spaces removed (only if that div exists);
          'Body'             -- list of non-empty paragraph texts from
                                "divBody" (only if that div exists).
    '''
    staff_report_soup = BeautifulSoup(staff_report_html, 'html.parser')
    title = staff_report_soup.find('div', {'class': 'LegiFileTitle'})
    if title is None:
        return []

    info_dict = dict()
    info_dict['Title'] = title.text.strip()

    info = staff_report_soup.find('div', {'class': 'LegiFileInfo'})
    # The metadata table is expected at info > div > table; tolerate pages
    # where part of that chain is missing instead of raising AttributeError.
    info_table = None
    if info is not None and info.div is not None:
        info_table = info.div.table
    if info_table is not None:
        for info_row in info_table.find_all('tr'):
            info_headers = info_row.find_all('th')
            info_values = info_row.find_all('td')
            # Pair each header with the value at the same position in the row;
            # zip() stops at the shorter list rather than raising IndexError.
            for header, value in zip(info_headers, info_values):
                label = header.strong.string if header.strong is not None else None
                if label is None:
                    # Header without a usable <strong> label: nothing to key on.
                    continue
                info_dict[label] = value.string

    discussion = staff_report_soup.find('div', {'id': 'divItemDiscussion'})
    if discussion is not None:
        discussion = discussion.text.replace('Recommended Action', '').replace('\xa0', '').strip()
        info_dict['Recommendations'] = discussion

    body = staff_report_soup.find('div', {'id': 'divBody'})
    if body is not None:
        # Paragraphs are expected two <div> levels below divBody; when that
        # wrapper is absent the body is reported as having no paragraphs.
        outer = body.div
        container = outer.div if outer is not None else None
        paragraphs = container.find_all('p') if container is not None else []
        body_paragraphs = []
        for paragraph in paragraphs:
            cleaned = paragraph.text.replace('\xa0', '').strip()
            if cleaned:
                body_paragraphs.append(cleaned)
        info_dict['Body'] = body_paragraphs

    return info_dict