# wise1/make_prediction.py (berkeley-stat222/wise1, master branch)
"""
The code does the following:
    1. Take in argument (project ID)
    2. Queries API to get data for the proj_id
    3. Transforms API data to same featurized format used in model building
    4. Loads a pre-trained random-forest model
    5. Makes predictions from this model
    6. Returns a confidence score of the project being fully funded
       within 30 days
"""

from __future__ import division
from urllib2 import urlopen
import numpy as np
import pandas as pd
import json
import sys
import string
import WiseGamma
from WiseGamma import DataSet, Model
import GammaLL
import requests
import ast

# Read in the external data files once, at import time.
donations = pd.read_csv('donations_counts.csv')

# school_zip and party are forced to strings, med_inc and pop to floats.
outside_dat = pd.read_csv(
    'outside_datmay16.csv',
    dtype={
        'school_zip': np.str_,
        'med_inc': np.float64,
        'pop': np.float64,
        'party': np.str_,
    }
)


# a few lookup tables
# "City, ST" labels that parse() keeps as their own category; a project whose
# city/state is not listed here is categorised as 'Other' there.
top_city_state = ['Charlotte, NC', 'Tucson, AZ', 'Tulsa, OK',
                  'Milwaukee, WI', 'Seattle, WA', 'San Antonio, TX',
                  'Newark, NJ', 'New York, NY', 'Winston Salem, NC',
                  'Saint Louis, MO', 'Detroit, MI', 'Richmond, VA',
                  'Baltimore, MD', 'Indianapolis, IN', 'Staten Island, NY',
                  'Sacramento, CA', 'Durham, NC', 'Bronx, NY',
                  'Dallas, TX', 'Tampa, FL', 'Las Vegas, NV',
                  'Washington, DC', 'Van Nuys, CA', 'Richmond, CA',
                  'Oakland, CA', 'Los Angeles, CA', 'Austin, TX',
                  'Oklahoma City, OK', 'Atlanta, GA', 'San Jose, CA',
                  'San Francisco, CA', 'Miami, FL', 'Bridgeport, CT',
                  'Anaheim, CA', 'Memphis, TN', 'Philadelphia, PA',
                  'New Orleans, LA', 'Louisville, KY', 'Denver, CO',
                  'Brooklyn, NY', 'Phoenix, AZ', 'Chicago, IL',
                  'Fort Worth, TX', 'San Diego, CA', 'Bakersfield, CA',
                  'Portland, OR', 'Joplin, MO', 'Houston, TX', 'Orlando, FL']

# Subject name -> value that parse() stores in the 'scaled_interest_par_sub'
# feature. parse() looks the subject up by exact key, so a subject name that
# is not listed here raises KeyError.
# Presumably these are per-subject proportions from the training data
# - TODO confirm.
# NOTE(review): 'Health &amp; Life Science' keeps the ';' of the HTML entity
# while every other '&amp ' key drops it - confirm which spelling the API
# actually returns.
sub_interest_dict = {np.nan: 5.7427204567211509e-05,
                     'Applied Sciences': 0.046910901426980053,
                     'Character Education': 0.011734209594511109,
                     'Civics &amp Government': 0.004503308399487903,
                     'College &amp Career Prep': 0.0089410822293279311,
                     'Community Service': 0.0027847007712426398,
                     'ESL': 0.012527814969187872,
                     'Early Development': 0.018895469981275802,
                     'Economics': 0.0031494696411866856,
                     'Environmental Science': 0.041058586686500816,
                     'Extracurricular': 0.0044112279435957096,
                     'Foreign Languages': 0.0086835485407214408,
                     'Gym &amp Fitness': 0.010195240246769564,
                     'Health &amp; Life Science': 0.037509555650384836,
                     'Health &amp Wellness': 0.013788715992077571,
                     'History &amp Geography': 0.024063208266852952,
                     'Literacy': 0.2898423422134363,
                     'Literature &amp Writing': 0.12423840541970035,
                     'Mathematics': 0.11526493598853993,
                     'Music': 0.051238013697982497,
                     'Nutrition': 0.0021274501290110691,
                     'Other': 0.010846548586201168,
                     'Parent Involvement': 0.0010599115648126695,
                     'Performing Arts': 0.019826055987753846,
                     'Social Sciences': 0.012042106653145562,
                     'Special Needs': 0.071217593377488869,
                     'Sports': 0.0069608265970106708,
                     'Visual Arts': 0.052359097335636214}

# Lower-cased poverty level -> value that parse() stores in the
# 'scaled_interest_par_pov' feature (parse() lower-cases the API value
# before the lookup).
# Presumably these are per-level proportions from the training data
# - TODO confirm.
pov_interest_dict = {'high poverty': 0.2494275125818487,
                     'highest poverty': 0.58440167076436256,
                     'low poverty': 0.02598288386748716,
                     'moderate poverty': 0.1404074103565012}

# Retrieve Google Trends data; implemented inside the parse function
def retrieve_goog():
    """
    Args:   None

    Output: The most recent value of the Google Trends time series for
            the "donors choose" query, multiplied by a fixed scale factor
            and by 7, or None when the reply carries no 'setResponse('
            wrapper (the form returned once the quota limit is reached).
    """
    url = (u"http://www.google.com/trends/fetchComponent?q="
           + u"donors+choose"
           + u"&cid=TIMESERIES_GRAPH_0&export=3")
    reply_text = requests.get(url).text

    pieces = reply_text.split('setResponse(')
    if len(pieces) == 1:
        # Quota limit reached: the reply has a different shape
        return None

    # Ugly formatting (from Java): drop the last two characters of the
    # wrapper and the 'new Date' constructors so that the payload can be
    # read as a Python literal
    payload = pieces[1].rstrip()[:-2].replace('new Date', '')
    trends_table = ast.literal_eval(payload)

    # Most recent Google Trends count: last cell of the last row
    latest_scaled = trends_table['table']['rows'][-1]['c'][-1]['v']
    scale = 0.09854
    # unscale by top # of counts, convert to weekly
    return scale * latest_scaled * 7

def parse(proj):
    """
    Args:   Dictionary holding the content of one project proposal from
            the donorschoose API request, i.e. one element of the list
            result_json['proposals']

    Output: Single-row dataframe (index 0) of features. It is meant to be
            concatenated to already existing dataframes (e.g. training
            and test sets from the CSV dumps) - the column names will
            match
    """

    def as_flag(condition):
        # boolean features are encoded as the strings 't' / 'f'
        return 't' if condition else 'f'

    school_kinds = [entry['name'] for entry in proj['schoolTypes']]
    teacher_kinds = [entry['name'] for entry in proj['teacherTypes']]

    # NOTE(review): city_state_cat is computed here but never added to the
    # feature row below - confirm whether it should be a column
    location = str(proj['city'] + ", " + proj['state'])
    city_state_cat = location if location in top_city_state else 'Other'

    # rows of outside_dat whose school_zip equals the part of the project's
    # zip that precedes the first '-'
    zip_prefix = proj['zip'].split('-')[0]
    zip_rows = outside_dat[outside_dat.school_zip == zip_prefix]

    subject_name = proj['subject']['name']
    poverty_key = proj['povertyLevel'].lower()

    # Google Trends value (None when the quota limit has been reached)
    google_count = retrieve_goog()

    features = {
        'school_latitude': proj['latitude'],
        'school_longitude': proj['longitude'],
        'school_charter': as_flag("Charter" in school_kinds),
        'school_magnet': as_flag("Magnet" in school_kinds),
        'school_year_round': as_flag("Year-round" in school_kinds),
        'school_nlns': as_flag("New Leaders" in school_kinds),
        # first whitespace-separated token of the teacher's name
        'teacher_prefix': proj['teacherName'].split()[0],
        'teacher_teach_for_america':
            as_flag("Teach for America" in teacher_kinds),
        'teacher_ny_teaching_fellow':
            as_flag("NY Teaching Fellow" in teacher_kinds),
        'primary_focus_subject': subject_name,
        'resource_type': proj['resource']['name'],
        'poverty_level': poverty_key,
        'grade_level': proj['gradeLevel']['name'],
        'total_price_excluding_optional_support': float(proj['totalPrice']),
        'free_shipping': as_flag(proj['freeShipping'] == "true"),
        # leading '-'-separated field of the expiration date, minus 2000
        'years_since_2000':
            float(proj['expirationDate'].split('-')[0]) - 2000,
        'scaled_interest_par_sub': sub_interest_dict[subject_name],
        'scaled_interest_par_pov': pov_interest_dict[poverty_key],
        'Google_query': google_count,
        # first matching row of the external data supplies these three
        'med_inc': zip_rows.iloc[0]['med_inc'],
        'pop': zip_rows.iloc[0]['pop'],
        'party': zip_rows.iloc[0]['party'],
    }

    return pd.DataFrame(features, index=[0])


def query_api(proj_id):
    """
    Args:   project ID of the project to be queried
            from the donorschoose API, given as a string of 7 digits

    Output: A pandas dataframe consisting of a single line that featurizes
            the API data into the same form as the data used to train the
            model so that we can predict on this new line.

    Raises: ValueError if proj_id is not exactly 7 digits, or if the API
            returns no proposals for it.
    """
    # local import: the file's import block does not provide contextlib
    from contextlib import closing

    if len(proj_id) != 7 or not proj_id.isdigit():
        raise ValueError('The project ID should be a 7 digit number.')

    # NOTE(review): the API key is hard-coded in source; consider moving it
    # to configuration
    dc_apikey = '80g5fqgy8nd2'
    url_string = 'http://api.donorschoose.org/common/json_feed.html?id='\
                 + proj_id + '&APIKey=' + dc_apikey

    # closing() releases the HTTP response even when decoding the JSON
    # fails; the original never closed it
    with closing(urlopen(url_string)) as query:
        projects_json = json.load(query)

    if not projects_json['proposals']:
        raise ValueError('Invalid project ID.')

    # only the first returned proposal is featurized
    return parse(projects_json['proposals'][0])


def load_model(model_file):
    """
    Args:   model_file - name of the file holding a pre-trained WiseGamma
            random forest model

    Output: A WiseGamma.Model wrapping the object that
            GammaLL.load_model_from_file returns for that file

    This is a workaround: WiseGamma.Model.load is not available in the
    current version
    """
    raw_model = GammaLL.load_model_from_file(model_file, "")
    return WiseGamma.Model(raw_model)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: python query_api.py <project ID number>"
        exit(-1)

    newline_df = query_api(sys.argv[1])
    """
    Write data frame out to csv, without the index and then read in as a
    WiseGamma.DataSet object. This is a workaround since the current
    version of WiseGamma does not have the appropriate pandas support.
    """
    newline_df.to_csv('temp.csv', index=FALSE)
    newline_ds = DataSet.load('temp.csv')


    trained_model = load_model("model1")
    pred = trained_model.probs(newline_ds)

    """ Version 2 (not currently available)
    pred = trained_model.probs(newline_df)
    """

    print pred['Yes'].values