# predict.py -- from the TheF1rstPancake/DestinyProject repository.
# Trains a random forest on per-team game statistics and predicts,
# for every game in the held-out half of the data, which team wins.

import logging
import sys
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# Keep the third-party "requests" library quiet: only warnings and above.
logging.getLogger("requests").setLevel(logging.WARNING)

# Everything this module logs (DEBUG and up) is echoed to stdout with a
# timestamp, the module name, the process id and the severity.
_LOG_FORMAT = '%(asctime)s - %(module)s - %(process)d - %(levelname)s - %(message)s'
formatter = logging.Formatter(_LOG_FORMAT)

ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)
ch.setLevel(logging.DEBUG)

logger = logging.getLogger(__name__)
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)

def predictWinners(predictions):
	"""
	Turn per-team probabilities into a binary win/lose standing for each game.

	predictions is a DataFrame with (at least) these columns:
		gameId   -- identifier shared by the teams that played the same game
		standing -- score where LOWER is better (the caller passes the
		            predicted probability of losing)
	Other columns (id, team, ...) are carried along untouched.

	Within every game exactly one team is marked as the winner:
		standing 0 -> winner: the team with the lowest score; on a tie the
		              team that appears first; a team alone in its game
		              always wins
		standing 1 -> loser: every other team in that game

	The DataFrame is modified in place, written to
	"predictions_binomial.csv" in the current directory, and returned.
	"""
	# Resolved by name, so this is the very same logger object that the
	# module configures at import time.
	logging.getLogger(__name__).info("Converting probabilities to 0s and 1s")

	lossProbability = np.array(predictions["standing"], dtype=float)
	# Start from the existing values so that a row which ends up in no group
	# is left exactly as it was.
	standing = lossProbability.copy()

	# GroupBy.indices maps each gameId to the integer positions of its rows,
	# which avoids building a boolean mask over the whole frame per game and
	# does not depend on the index labels or on the id column.
	for rows in predictions.groupby("gameId").indices.values():
		if len(rows) == 0:
			continue
		# Everybody in the game loses ...
		standing[rows] = 1.0
		# ... except the team least likely to lose. argmin returns the first
		# minimum, so a tie goes to the earlier row.
		standing[rows[np.argmin(lossProbability[rows])]] = 0.0

	predictions["standing"] = standing
	predictions.to_csv("predictions_binomial.csv")
	return predictions


if __name__ == "__main__":
	# Build the training and testing sets by cutting the file in half, in
	# row order.
	teamData = pd.read_csv("datafiles/teamData.csv")

	# Floor division keeps the split point an integer on Python 3. Positional
	# slicing reproduces the original boundaries: rows 0..half (inclusive)
	# are used for training, everything after that for testing.
	half = len(teamData) // 2
	train = teamData.iloc[:half + 1]
	test = teamData.iloc[half + 1:]

	# Columns of teamData.csv fed to the model (names must match the file
	# header exactly, including the spelling of 'refrencedId').
	features = [
		'characterLevel',
		'combatRating',
		'combatRatingStd',
		'killsDeathsRatio',
		'killsDeathsAssists',
		'defensiveKills',
		'offensiveKills',
		'objectivesCompleted',
		'refrencedId',
		'team',
		'hasHighestScoringPlayer',
		'hasLowestScoringPlayer',
		'numberOfFireTeams',
		'weaponKillsHeavy',
		'players',
		'averageScorePerKill',
		'longestKillSpree',
		'dominationKills',
	]

	logger.info("Building random forest!")
	randomForest = RandomForestClassifier(n_estimators=1000)
	randomForest.fit(train[features], train['standing'])

	# One row per test team, one column per class.
	# NOTE(review): assumes the classes are ordered [0, 1] with 0 = win and
	# 1 = loss, so column 0 is P(win) and column 1 is P(loss) -- confirm
	# against randomForest.classes_.
	predictions = randomForest.predict_proba(test[features])

	submission = pd.DataFrame({
		"id": test.index.values,
		"standing": predictions[:, 1],
		"gameId": test['gameId'],
		"team": test['team'],
		"probabilityOfVictory": predictions[:, 0],
	})

	# Collapse the per-team probabilities into one winner (0) and
	# loser(s) (1) per game.
	submission = predictWinners(submission)

	rms = sqrt(mean_squared_error(test['standing'], submission['standing']))

	logger.info("RMSE: {0}".format(rms))

	submission.to_csv("datafiles/submission.csv")