Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions prefpy/GaussianRUMGMM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from scipy.stats import norm, rankdata
import time
import math
import numpy as np
import pytz
import pandas as pd
from numpy.linalg import inv, pinv

#probability of one alternative is preferred to another.
#x is the difference between the means of the two alternatives
def prPair(x):
return norm.cdf(x, 0, scale=math.sqrt(2))

#derivative of probability
def dPrPair(x):
return np.exp(-x ** 2/4)/(2 * math.sqrt(math.pi))

#second order derivative of probability
def ddPrPair(x):
return -x * np.exp(- x ** 2/4)/(4 * math.sqrt(math.pi))

#calculate the breakings
def dataBreaking(data, m, normalize):
breaking = np.zeros((m, m), int)
n = len(data)
for d in data:
for i in range(0, m):
for j in range(i+1, m):
breaking[d[i], d[j]] += 1
if normalize:
return breaking/n
else:
return breaking

#for debugging purpose
def trueBreaking(Mu):
m=len(Mu)
A=np.zeros((m,m), float)

for i in range(0,m):
for j in range(0,m):
x = Mu[i] - Mu[j]
A[i,j] = norm.cdf(x, 0, scale=math.sqrt(2))
np.fill_diagonal(A,0)
return A

#To calculate the gradient of objective function
def gradientPrPair(breaking, theta, n):
theta = theta[0]
m = len(theta)
grad = np.zeros((m, 1), float)
for i in range(0, m):
for j in range(0, m):
if j != i:
x = theta[i] - theta[j]
grad[i] += 2 * (breaking[i][j] - n * prPair(x)) * dPrPair(x)
return grad

#To calculate the Hessian matrix
def hessianPrPair(breaking, theta, n):
theta = theta[0]
m = len(theta)
hessian = np.zeros((m, m), float)
for i in range(0, m):
for j in range(0, m):
if j != i:
x = theta[i] - theta[j]
hessian[i][i] += 2*(breaking[i][j]-n*prPair(x))*ddPrPair(x) - 2*n*(dPrPair(x))**2
if j > i:
hessian[i][j] = 2*n*dPrPair(x)**2-2*ddPrPair(breaking[i][j]-n*prPair(x))
else:
hessian[i][j] = hessian[j][i]
return hessian

#' GMM Method for Estimating Random Utility Model wih Normal dsitributions
#'
#' @param Data.pairs data broken up into pairs
#' @param m number of alternatives
#' @param itr number of itrations to run
#' @param Var indicator for difference variance (default is FALSE)
#' @param prior magnitude of fake observations input into the model
#' @return Estimated mean parameters for distribution of underlying normal (variance is fixed at 1)
#' @export
#' @examples
#' data(Data.Test)
#' Data.Test.pairs <- Breaking(Data.Test, "full")
#' Estimation.Normal.GMM(Data.Test.pairs, 5)
def GMMGaussianRUM(data, m, n, itr=1):

t0 = time.time() #get starting time

muhat = np.ones((1, m), float)
breaking = dataBreaking(data, m, False)

for itr in range(1,itr + 1):
try:
Hinv = np.linalg.pinv(hessianPrPair(breaking, muhat, n))
except np.linalg.linalg.LinAlgError:
Hinv = 0.01 * np.identity(m)
Grad = gradientPrPair(breaking, muhat, n)
muhat = (muhat.transpose() - np.dot(Hinv, Grad)).transpose()
muhat = muhat - muhat.min()

t = time.time() - t0
print("Time used:", t)

return muhat
28 changes: 28 additions & 0 deletions prefpy/GaussianRUMtest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from generate import *
from likelihood_rum import *
from GaussianRUMGMM import *
#from scipy.stats import norm

if __name__ == '__main__':

m = 4
n = 1000
ntrials = 100
sse = 0

for t in range(0, ntrials):
print("Trial: ", t)
Params = GenerateRUMParameters(m, "normal")
Data = GenerateRUMData(Params,m,n,"normal")
GroundTruth = Params["Mean"]
GroundTruth = GroundTruth - GroundTruth.min()
mu = GMMGaussianRUM(Data, m, n, itr=1)
mu = mu - mu.min()
se = np.square(GroundTruth - mu[0]).sum()
print("Ground Truth: ", GroundTruth)
print("Estimate: ", mu)
print("Error: ", se)
sse += se

mse = sse/ntrials
print("MSE: ", mse)
140 changes: 140 additions & 0 deletions prefpy/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import numpy as np
from scipy.stats import rankdata

def GenerateRUMParameters(m, distribution):
arr = []
for i in range(0, m):
arr.append(1)
if distribution=='normal':
parameter = dict(m = m, Mean = np.random.uniform(0,1,(m,)), SD = np.array(arr))
parameter["order"] = parameter["Mean"].ravel().argsort()
elif distribution=='exponential':
unscaled = random.uniform(0,1,(m,))
parameter = dict(m = m, Mean = unscaled/unscaled.sum())
else:
raise ValueError("Distribution name \"", distribution, "\" not recognized")
return parameter

#' Generate observation of ranks given parameters
#'
#' Given a list of parameters (generated via the Generate RUM Parameters function),
#' generate random utilities from these models and then return their ranks
#'
#' @param Params inference object from an Estimation function, or parameters object from a generate function
#' @param m number of alternatives
#' @param n number of agents
#' @param distribution can be either 'normal' or 'exponential'
#' @return a matrix of observed rankings
#' @export
#' @examples
#' Params = Generate.RUM.Parameters(10, "normal")
#' Generate.RUM.Data(Params,m=10,n=5,"normal")
#' Params = Generate.RUM.Parameters(10, "exponential")
#' Generate.RUM.Data(Params,m=10,n=5,"exponential")
def GenerateRUMData(Params, m, n, distribution):
if distribution == "exponential":
return np.transpose(np.repeat(rankdata(np.random.exponential(size = m, scale = 1/Params["Mean"])), n))
elif distribution == "normal":
A = rankdata(-np.random.normal(size = m, loc = Params["Mean"], scale = Params["SD"]))-1
for i in range(0, n - 1):
A = np.vstack([A, (-np.random.normal(size = m, loc = Params["Mean"], scale = Params["SD"])).ravel().argsort()])
return A.astype(int)
else:
raise ValueError("Distribution name \"", distribution, "\" not recognized")#' Breaks full or partial orderings into pairwise comparisons
#'
#' Given full or partial orderings, this function will generate pairwise comparison
#' Options
#' 1. full - All available pairwise comparisons. This is used for partial
#' rank data where the ranked objects are a random subset of all objects
#' 2. adjacent - Only adjacent pairwise breakings
#' 3. top - also takes in k, will break within top k
#' and will also generate pairwise comparisons comparing the
#' top k with the rest of the data
#' 4. top.partial - This is used for partial rank data where the ranked
#' alternatives are preferred over the non-ranked alternatives
#'
#' The first column is the preferred alternative, and the second column is the
#' less preferred alternative. The third column gives the rank distance between
#' the two alternatives, and the fourth column enumerates the agent that the
#' pairwise comparison came from.
#'
#' @param Data data in either full or partial ranking format
#' @param method - can be full, adjacent, top or top.partial
#' @param k This applies to the top method, choose which top k to focus on
#' @return Pairwise breakings, where the three columns are winner, loser and rank distance (latter used for Zemel)
#' @export
#' @examples
#' data(Data.Test)
#' Data.Test.pairs <- Breaking(Data.Test, "full")

def Breaking(Data):
rows = Data.shape[0]
cols = Data.shape[1]
count = 0
final = np.zeros((int(rows*cols*(cols - 1) / 2), 4), int)
for i in range(0,len(Data)):
current_list = Data[i]
for j in range(0, len(current_list)):
for k in range(j + 1, len(current_list)):
final[count, 0] = current_list[j]
final[count, 1] = current_list[k]
final[count, 2] = k - j
final[count, 3] = i + 1
count = count + 1
return final
# m = Data.shape[1]

# pair_full <- function(rankings) pair.top.k(rankings, length(rankings))

# pair.top.k <- function(rankings, k) {
# pair.top.helper <- function(first, rest) {
# pc <- c();
# z <- length(rest)
# for(i in 1:z) pc <- rbind(pc, array(as.numeric(c(first, rest[i], i))))
# pc
# }
# if(length(rankings) <= 1 | k <= 0) c()
# else rbind(pair.top.helper(rankings[1], rankings[-1]), pair.top.k(rankings[-1], k - 1))
# }

# pair.adj <- function(rankings) {
# if(length(rankings) <= 1) c()
# else rbind(c(rankings[1], rankings[2]), pair.adj(rankings[-1]))
# }

# pair.top.partial <- function(rankings, m) {
# # this is used in the case when we have missing ranks that we can
# # fill in at the end of the ranking. We can assume here that all
# # ranked items have higher preferences than non-ranked items
# # (e.g. election data)

# # the number of alternatives that are not missing
# k <- length(rankings)

# # these are the missing rankings
# missing <- Filter(function(x) !(x %in% rankings), 1:m)

# # if there is more than one item missing, scramble the rest and place them in the ranking
# if(m - k > 1) missing <- scramble(missing)

# # now just apply the top k breaking, with the missing elements
# # inserted at the end
# pair.top.k(c(rankings, missing), k)
# }

# break.into <- function(Data, breakfunction, ...) {
# n <- nrow(Data)
# # applying a Filter(identity..., ) to each row removes all of the missing data
# # this is used in the case that only a partial ordering is provided
# tmp <- do.call(rbind, lapply(1:n, function(row) cbind(breakfunction(Filter(identity, Data[row, ]), ...), row)))
# colnames(tmp) <- c("V1", "V2", "distance", "agent")
# tmp
# }

# if(method == "full") Data.pairs <- break.into(Data, pair.full)
# if(method == "adjacent") Data.pairs <- break.into(Data, pair.adj)
# if(method == "top") Data.pairs <- break.into(Data, pair.top.k, k = k)
# if(method == "top.partial") Data.pairs <- break.into(Data, pair.top.partial, m = m)

# Data.pairs
# }