course-catalog/course.py at master · HarvardOpenData/course-catalog · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import re

import utils


class Course(object):
    """
    One course entry, built up incrementally while parsing the catalog.

    Attributes created in __init__:
        number, name, instructors, semester, schedule, strings, id

    Attributes created lazily by the setters (absent until the setter runs):
        actual_number, level      -- see set_course_number()
        start_time, end_time, days -- see set_schedule(), only on a match
    """

    # static
    # this is the list of fields that are exported to CSV
    # see to_dict_for_csv()
    CSV_FIELDS = [
        'number',
        'semester',
        'name',
        'instructors',
        'schedule',
        'level',
        'id',
    ]

    # the only values `level` can take (besides None when it can't be guessed)
    LEVEL_UNDERGRADUATE = "Undergraduate"
    LEVEL_MIXED = "Undergraduate + Graduate"
    LEVEL_GRADUATE = "Graduate"

    # trailing course-number token, optionally followed by "Section: XYZ"
    _NUMBER_RE = re.compile(r"([\dA-Z]+)(?: *Section: [\dA-Z]+)?$")
    # first run of digits inside the course-number token
    _DIGITS_RE = re.compile(r"\d+")
    # e.g. "MW 1100 AM - 1159 AM"; groups are
    # (days, start time, start AM/PM, end time, end AM/PM)
    _SCHEDULE_RE = re.compile(
        r"([MTWRF]{1,5}) (\d{4}) ([AP]M) - (\d{4}) ([AP]M)")
    # e.g. "2016 Fall"
    _SEMESTER_RE = re.compile(r"20\d\d (?:Fall|Spring)")

    # single-letter day abbreviations used in schedule strings
    _DAY_NAMES = {
        "M": "Monday",
        "T": "Tuesday",
        "W": "Wednesday",
        "R": "Thursday",
        "F": "Friday",
    }

    def __init__(self):
        self.number = None
        self.name = None
        self.instructors = []
        self.semester = None
        self.schedule = None
        # unstructured description text; consumed by process_strings()
        self.strings = []
        self.id = None

    def set_id(self, raw_id_string):
        """
        Raw course ID strings are formatted in the catalog PDF as `(123456)  `
        or something similar. Here we clean it up for you.

        Surrounding whitespace is trimmed and one pair of enclosing
        parentheses is removed if present. If nothing is left after cleaning,
        `self.id` is left untouched.
        """
        cleaned = raw_id_string.strip()

        # only drop the first/last character when they really are parens;
        # blindly slicing would eat digits of an ID that came without them
        if cleaned.startswith("(") and cleaned.endswith(")"):
            cleaned = cleaned[1:-1].strip()

        if cleaned:
            self.id = cleaned

    def set_course_number(self, number):
        r"""
        Store the full course number and derive `actual_number` and `level`.

        e.g. "Computer Science 50"

        Sets:
            self.number        -- input with runs of spaces collapsed
            self.actual_number -- trailing number token as a string
                                  (e.g. "50"), or None if none was found
            self.level         -- one of the LEVEL_* constants, or None if
                                  `actual_number` could not be extracted

        TODO
        From the course number (e.g. Computer Science 50), extract the dept
        (e.g. Computer Science). Ideally we'd split up the course number into
        [Computer Science, 50] but I'm not sure how to name that without
        being confusing. Maybe department and number? idk.

        Actually no, keep the department and number together (makes sorting
        easier). But let someone extract the individual pieces.

        Regex for extracting course number:

            [\dA-Z]+$
        """

        # for now, just replace multiple spaces with a single space
        self.number = re.sub(r" +", " ", number.strip())

        # get the actual course number (e.g. "50" in "Computer Science 50")
        # these are formatted strangely; here's a corpus we have to match
        #
        # "Computer Science 50",
        # "Anthropology     1010",
        # "Hausa AA",
        # "Astronomy 91R",
        # "Astronomy 202A",
        # "Aramaic 300 Section: 002",
        # "Biological Sci in Public Hlth 389 Section: 01",
        # "Culture & Belief 61 Section: LEC"
        #
        # From those we want to extract 50, 1010, AA, 91R, 202A, 300, 389, 61
        # (all in string format)
        number_match = self._NUMBER_RE.search(self.number)
        if number_match is None:
            # nothing that looks like a course number: don't crash the whole
            # parse over one odd entry, and don't guess a level either
            self.actual_number = None
            self.level = None
            return

        self.actual_number = number_match.group(1)
        self.level = self._guess_level(self.actual_number)

    @classmethod
    def _guess_level(cls, actual_number):
        """
        Guess the course level from its number token (e.g. "91R").

        just letters: language course
        1-89: undergraduate, non-concentrators
        90-99: undergraduate seminars
        100-199: undergraduate, concentrators
        200-299: undergraduate & graduate
        300-399: graduate
        900-999: undergraduate seminars
        1000-1999: undergraduate, concentrators
        2000-2999: undergraduate & graduate
        3000-3999: graduate

        Or, more simply, for "level", just consider
        Undergraduate / Undergraduate + Graduate / Graduate.
        """
        digit_match = cls._DIGITS_RE.search(actual_number)
        if digit_match is None:
            # this is a language course with only letters
            # languages are all undergraduate!
            return cls.LEVEL_UNDERGRADUATE

        # this has digits, like '50' or '91' or '1010'
        digits = int(digit_match.group(0))

        if digits < 200:
            return cls.LEVEL_UNDERGRADUATE
        if digits < 300:
            return cls.LEVEL_MIXED
        if digits < 400:
            return cls.LEVEL_GRADUATE
        if digits < 2000:
            return cls.LEVEL_UNDERGRADUATE
        if digits < 3000:
            return cls.LEVEL_MIXED
        return cls.LEVEL_GRADUATE

    def set_schedule(self, schedule):
        """
        Store the raw schedule string and, if it contains exactly one
        "DAYS HHMM AM - HHMM PM" pattern, structure it.

        Always sets `self.schedule` to the raw value. On a match it also sets
        `self.start_time` / `self.end_time` (results of
        utils.minutes_since_midnight) and `self.days` (list of full weekday
        names). Without a match those three attributes are left as they were.
        """
        self.schedule = schedule
        if not schedule:
            # None or empty: nothing to structure
            return

        matches = self._SCHEDULE_RE.findall(schedule)
        if len(matches) != 1:
            return

        # we found something!
        # e.g. ('MW', '1100', 'AM', '1159', 'AM')
        days, start, start_meridiem, end, end_meridiem = matches[0]

        # Convert string to start and end times in military time
        start_military = utils.time_to_military(start, start_meridiem)
        end_military = utils.time_to_military(end, end_meridiem)

        # Convert from military time to minutes since midnight
        self.start_time = utils.minutes_since_midnight(start_military)
        self.end_time = utils.minutes_since_midnight(end_military)

        # convert the days into better-readable ones
        # "MW" => ["Monday", "Wednesday"]
        self.days = [self._DAY_NAMES[day] for day in days]

    def process_strings(self):
        """
        Once all `strings` (i.e. unstructured description text) are loaded,
        pull out useful information like the schedule.

        The first schedule-looking substring found (scanning `strings` in
        order) is passed to set_schedule(); if there is none, `self.schedule`
        is reset to None. The first "20xx Fall" / "20xx Spring" found is
        stored in `self.semester`; if there is none, `self.semester` is left
        unchanged.
        """
        # try to extract a course schedule
        first_schedule = None
        for string in self.strings:
            schedule_match = self._SCHEDULE_RE.search(string)
            if schedule_match is not None:
                first_schedule = schedule_match.group(0)
                break

        if first_schedule is not None:
            self.set_schedule(first_schedule)
        else:
            self.schedule = None

        # try to extract a semester
        for string in self.strings:
            semester_match = self._SEMESTER_RE.search(string)
            if semester_match is not None:
                # just find the first one then quit
                self.semester = semester_match.group()
                break

    def __str__(self):
        return "{}".format(self.name)

    def __repr__(self):
        # for nicer ipython debugging
        return str(self.__dict__)

    def to_dict(self):
        # ALIAS
        # NOTE: this is the live instance dict, not a copy
        return self.__dict__

    def to_dict_for_csv(self):
        """
        Return a flat dict with exactly the keys in CSV_FIELDS.

        Lists are flattened to scalars (instructors are joined with " & ")
        and free-text fields are reduced to ASCII text, with any non-ASCII
        character replaced by "?".
        """

        def to_ascii(text):
            if text is None:
                return None
            # decode back so the result is text, not a bytes object that a
            # CSV writer would render as "b'...'"
            return text.encode("ascii", "replace").decode("ascii")

        return dict(
            id=self.id,
            name=to_ascii(self.name),
            number=self.number,
            instructors=to_ascii(" & ".join(self.instructors)),
            semester=self.semester,
            schedule=self.schedule,
            # `level` only exists once set_course_number() has run
            level=getattr(self, "level", None),
        )