-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathlda_utils.py
More file actions
138 lines (112 loc) · 4.36 KB
/
lda_utils.py
File metadata and controls
138 lines (112 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import math
from scipy.special import psi, polygamma
import numpy as np
import scipy.io as sio
# http://stackoverflow.com/questions/2272149/round-to-5-or-other-number-in-python
def round_nicely(x, base=5):
return int(base * round(float(x)/base))
def threshold_matrix(matrix, epsilon=0.0):
thresholded = matrix.copy()
n_row, n_col = thresholded.shape
for i in range(n_row):
row = thresholded[i, :]
if epsilon > 0:
small = row < epsilon
row[small] = 0
else:
smallest_val = np.min(row)
smallest_arr = np.ones_like(row) * smallest_val
close = np.isclose(row, smallest_arr)
row[close] = 0
return thresholded
def word_indices(document, sparse=False):
"""
Turns a document vector of word counts into a vector of the indices
words that have non-zero counts, repeated for each count
e.g.
>>> word_indices(np.array([3, 0, 1, 2, 0, 5]))
[0, 0, 0, 2, 3, 3, 5, 5, 5, 5, 5]
"""
if not sparse: # document is a panda thing
results = []
for nnz in document.values.nonzero()[1]:
count = document.values[0].flatten()
for n in range(int(count[nnz])):
results.append(nnz)
return results
else: # document is a row slice of a scipy sparse matrix
nnz = document.nonzero()[1]
counts = []
results = []
for col in nnz:
count = int(document[0, col])
temp = [col] * count
results.extend(temp)
return results
def psi_inverse(initial_x, y, num_iter=5):
"""
Computes the inverse digamma function using Newton's method
See Appendix c of Minka, T. P. (2003). Estimating a Dirichlet distribution.
Annals of Physics, 2000(8), 1-13. http://doi.org/10.1007/s00256-007-0299-1 for details.
"""
# initialisation
if y >= -2.22:
x_old = np.exp(y)+0.5
else:
gamma_val = -psi(1)
x_old = -(1/(y+gamma_val))
# do Newton update here
for i in range(num_iter):
numerator = psi(x_old) - y
denumerator = polygamma(1, x_old)
x_new = x_old - (numerator/denumerator)
x_old = x_new
return x_new
def estimate_alpha_from_theta(D, K, initial_alpha, thetas, n_iter=100):
"""
Estimate posterior alpha of a Dirichlet multinomial from samples of the multinomial vectors.
This implements the Newton's method as described in Huang, J. (2005). Maximum Likelihood Estimation of
Dirichlet Distribution Parameters. Distribution, 40(2), 1-9. doi:10.1.1.93.2881
"""
# initialise old and new alphas before iteration
alpha_old = np.ones(K) * initial_alpha
alpha_new = np.zeros_like(alpha_old)
temp = np.log(thetas)
log_pk = np.sum(temp, axis=0)
log_pk /= D
for i in range(n_iter):
# update concentration parameter of a dirichlet from samples of the multinomial vector theta
psi_sum_alpha_old = psi(np.sum(alpha_old))
for k in range(K):
initial_x = alpha_old[k]
y = psi_sum_alpha_old+log_pk[k]
alpha_new[k] = psi_inverse(initial_x, y)
# set alpha_new to alpha_old for the next iteration update
alpha_old = alpha_new
return alpha_new
def generate_counts_from_matlab(matfile):
mat_contents = sio.loadmat(matfile)
a = mat_contents['a'][0]
data = mat_contents['data']
return a, data
def estimate_alpha_from_counts(D, K, initial_alpha, counts, n_iter=1000):
"""
Estimate posterior alpha of a Dirichlet multinomial from samples of the multinomial counts.
This implements the fixed point update as described in Minka, T. P. (2003). Estimating a Dirichlet distribution.
Annals of Physics, 2000(8), 1-13. http://doi.org/10.1007/s00256-007-0299-1
"""
counts = counts.astype(float)
sdata = np.sum(counts, axis=1)
# initialise old and new alphas before iteration
alpha_old = np.ones(K) * initial_alpha
for i in range(n_iter):
sa = np.sum(alpha_old)
temp = np.tile(alpha_old, (D, 1))
g = np.sum(psi(counts + temp), axis=0) - D*psi(alpha_old)
h = np.sum(psi(sdata + sa)) - D*psi(sa)
alpha_new = alpha_old * (g/h)
if np.max(np.abs(alpha_new-alpha_old)) < 1e-6:
break
# set alpha_new to alpha_old for the next iteration update
alpha_old = alpha_new
return alpha_new