# facescrape/facescrape.py (master) - samfishman/facescrape on GitHub
# Copyright 2014 Sam Fishman
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Facescrape is a tool for scraping the Harvard College Facebook.

Usage:
>>> fs = FaceScraper(my_huid, my_pw)
>>> data = fs.scrape_students({'house': 'Kirkland House'})
>>> fs.export_csv('/path/to/export.csv')
"""

import csv
import re
import sys

import requests

# Login form endpoint: fetched with GET to read the hidden form tokens, then
# POSTed to with the credentials.  The URL-encoded ``service`` parameter decodes
# to https://www.pin1.harvard.edu/pin/authenticate?__authen_application=FAS_CS_FACEBOOK&original_request=/searchform
LOGIN_URL = "https://www.pin1.harvard.edu/cas/login?service=https%3A%2F%2Fwww.pin1.harvard.edu%2Fpin%2Fauthenticate%3F__authen_application%3DFAS_CS_FACEBOOK%26original_request%3D%2Fsearchform"
# Search endpoint: queried with the filter payload as query-string parameters;
# the response is scanned for student ids.
INDEX_URL = "http://facebook.college.harvard.edu//search"
# Per-student page: ``%s`` is filled with a student id taken from the search
# results.
INDIVIDUAL_URL = "http://facebook.college.harvard.edu//individual?id=%s"

class FaceScraper(object):
    """Log in to the Harvard College Facebook and scrape student records.

    Typical use is ``scrape_students()`` followed by ``export_csv()``; the
    most recent scrape result is kept in ``self.last_read`` so it can be
    exported afterwards.
    """

    # Labels of the ``<span class="field">Label:</span>`` entries read from a
    # student page; the lower-cased label is the key in the returned dict.
    _FIELDS = ('Name', 'House', 'Year', 'Concentration', 'Assigned House',
               'Dorm Address', 'Mail Address')
    # Columns written by export_csv() when the caller does not pick any.
    _DEFAULT_COLUMNS = _FIELDS + ('Email', 'Photo')

    def __init__(self, huid, pw):
        """Remember the credentials; no network traffic happens here.

        Args:
            huid: login id; converted with ``str()``.
            pw: password; converted with ``str()``.
        """
        self.username = str(huid)
        self.pw = str(pw)
        # Initialised here so export_csv() before any scrape writes just the
        # header row instead of failing with AttributeError.
        self.last_read = []

    def login(self):
        """Authenticate against LOGIN_URL and collect cookies in ``self.jar``.

        Raises:
            IndexError: if the login page lacks the hidden ``lt`` or
                ``execution`` form fields.
            Exception: with message 'Login Failed!' if no ``CASTGC`` cookie
                was received during the login exchange.
        """
        self.jar = {}
        login_page = requests.get(LOGIN_URL)
        # The form carries hidden tokens that are sent back with the
        # credentials in the POST below.
        lt = re.findall(r'<input type="hidden" name="lt" value="([\w\-]+)',
                        login_page.text)[0]
        ex = re.findall(r'<input type="hidden" name="execution" value="(\w+)',
                        login_page.text)[0]
        self.jar.update(login_page.cookies)
        last = requests.post(LOGIN_URL, data={
                'compositeAuthenticationSourceType': 'PIN',
                'username': self.username,
                'password': self.pw,
                '_eventId_submit': 'Login',
                'lt': lt,
                'execution': ex,
                'casPageDisplayType': 'DEFAULT',
                'nonMobileOptionOnMobile': ''},
            allow_redirects=False, cookies=self.jar)
        # Redirects are followed by hand so that the cookies set by every
        # intermediate response end up in the jar.
        while last.status_code == 302:
            self.jar.update(last.cookies)
            last = requests.get(last.headers['location'], cookies=self.jar,
                                allow_redirects=False)
        # CASTGC (presumably the CAS ticket-granting cookie) doubles as the
        # success marker: it is dropped from the jar, and its absence is
        # treated as a failed login.
        try:
            del self.jar['CASTGC']
        except KeyError:
            raise Exception('Login Failed!')

    def scrape_students(self, filters=None):
        """Log in, run a search and fetch every student it lists.

        Args:
            filters: optional dict of search parameters that override the
                defaults below, e.g. ``{'house': 'Kirkland House'}``.

        Returns:
            A list with one dict per student (see ``get_student``).  The same
            list is stored in ``self.last_read`` for ``export_csv``.
        """
        self.login()
        payload = {'name_last': '', 'name_first': '', 'house': '',
                   'assigned_house': '', 'year': '', 'concentration': '',
                   'num': '9999', 'Search': 'Search', 'view': 'photo'}
        if filters:
            payload.update(filters)
        index = requests.get(INDEX_URL, params=payload, cookies=self.jar,
                             allow_redirects=False)
        # Build a real list: on Python 3 ``map`` is a one-shot iterator, so
        # a caller iterating the return value would leave ``last_read``
        # exhausted for a later export_csv().
        students = [self.get_student(sid)
                    for sid in self._extract_ids(index.text)]
        self.last_read = students
        return students

    @staticmethod
    def _extract_ids(html):
        """Return the student ids linked from the photo entries in *html*."""
        return re.findall(
                r'<div class="photo">\n<a href="individual\?id=([a-f0-9]+)',
                html)

    def get_student(self, sid):
        """Fetch the page for student id *sid* and parse it.

        Uses the cookies in ``self.jar``, so ``login()`` must have run.

        Returns:
            A dict keyed by 'name', 'house', 'year', 'concentration',
            'assigned house', 'dorm address', 'mail address', 'email' and
            'photo'.  Any value not found on the page is ``None``.
        """
        r = requests.get(INDIVIDUAL_URL % sid, cookies=self.jar)
        return self._parse_student(r.text)

    @classmethod
    def _parse_student(cls, html):
        """Extract the student fields from the HTML of an individual page."""
        body = html.replace('<br>', '')
        data = {}

        for key in cls._FIELDS:
            match = re.search(
                    r'<span class="field">%s:</span><span class="value">'
                    r'([\w\-\', ]+)<' % key, body)
            data[key.lower()] = match.group(1) if match else None

        for key, pat in [('email', r'mailto:([\w\-\.]+@college\.harvard\.edu)'),
                ('photo', r'<img alt="Image" width="250" src="([\w/\-\.]+)"')]:
            match = re.search(pat, body)
            data[key] = match.group(1) if match else None

        # Only turn the relative path into a URL when a photo was found;
        # otherwise the result would be the bogus URL ".../None".
        if data['photo'] is not None:
            data['photo'] = ('http://facebook.college.harvard.edu/%s'
                             % data['photo'])

        return data

    def export_csv(self, path, columns=None):
        """Write the most recent scrape (``self.last_read``) to a CSV file.

        Args:
            path: destination file; overwritten if it exists.
            columns: iterable of column headers.  Each header is lower-cased
                to find the matching key in a student dict; keys that are
                missing produce an empty cell.  Defaults to every field that
                ``get_student`` returns.
        """
        if columns is None:
            columns = self._DEFAULT_COLUMNS
        keys = [c.lower() for c in columns]
        # The csv module wants newline translation disabled: ``newline=''``
        # on Python 3, binary mode on Python 2.  Otherwise rows are separated
        # by blank lines on platforms that translate '\n'.
        if sys.version_info[0] >= 3:
            handle = open(path, 'w', newline='')
        else:
            handle = open(path, 'wb')
        with handle as f:
            exporter = csv.writer(f)
            exporter.writerow(columns)
            exporter.writerows([p.get(k, '') for k in keys]
                               for p in self.last_read)