kdd2013/model.py at master · bartjkdp/kdd2013 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
model.py
Bart J - 2014-06-01.
KDD Cup 2013 Track 1
"""
from __future__ import division
from lib import IO
from lib import PreProcess
from lib import InitialCalculation
from lib import GenerateFeatures
from time import sleep
from itertools import izip
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

import sys
import evaluate
import __builtin__

def Step1():
	print '================================================================'
	print 'KDD Cup 2013 - Track 1'
	print 'Model by Bart Jeukendrup'
	print '================================================================'
	print 'Step 1/6: Reading CSV files to memory & pre-process data'
	print '================================================================'

	# Read all the data from the files or the memory cache
	IO.readAuthors()
	IO.readVenues()
	IO.readPapers()
	IO.readAuthorPaper()

	IO.readTrainData()
	IO.readValidData()

	# Preprocess string values (strip HTML, lower, character encoding)
	__builtin__.authors = PreProcess.authors(__builtin__.authors)
	__builtin__.papers = PreProcess.papers(__builtin__.papers)
	__builtin__.venues = PreProcess.venues(__builtin__.venues)
	_builtin__.paperauthor = PreProcess.paperauthors(__builtin__.paperauthor)

	print '================================================================'
	print 'Step 2/6: Initial feature calculation'
	print '================================================================'

	# Calculate adjecency and probability matrixes of HeteSim
	InitialCalculation.calculate()

def Step2():
	print '================================================================'
	print 'Step 3/6: Generating features'
	print '================================================================'

	trainLabels = []
	trainFeatures = []
	validFeatures = []

	print 'Generating features for train data...'

	for trainRow in trainData:
		trainLabels.append(trainRow['label'])
		trainFeatures.append(GenerateFeatures.allFeatures(trainRow['authorId'], trainRow['paperId']))

	print 'Generating features for validation data...'
	for validRow in validData:
		validFeatures.append(GenerateFeatures.allFeatures(validRow['authorId'], validRow['paperId']))

	print '================================================================'
	print 'Step 4/6: Training models'
	print '================================================================'

	rfClassifier = RandomForestClassifier(n_estimators=100, verbose=1, n_jobs=-1, min_samples_split=10, random_state=1)
	rfClassifier.fit(trainFeatures, trainLabels)

	gbClassifier = GradientBoostingClassifier(n_estimators=100, verbose=1, learning_rate=1.0, max_depth=3, random_state=0)
	gbClassifier.fit(trainFeatures, trainLabels)

	print '================================================================'
	print 'Step 5/6: Applying models'
	print '================================================================'

	rfPredictions = list(rfClassifier.predict_proba(validFeatures)[:,1])
	gbPredictions = list(gbClassifier.predict_proba(validFeatures)[:,1])

	print 'Taking the weighted average of the two models'
	predictions = []
	for prediction in zip(rfPredictions, gbPredictions):
		# Take the weighted average of the two models
		predictions.append(0.5 * prediction[0] + 0.5 * prediction[1])

	print '================================================================'
	print 'Step 6/6: Saving results and calculate performance'
	print '================================================================'

	IO.writePredictions(predictions, validData)
	evaluate.calculate_map()

if __name__ == "__main__":
	Step1()
	Step2()