5 changes: 5 additions & 0 deletions README.md
@@ -4,6 +4,11 @@

Lightweight implementations of generative label models for weakly supervised machine learning

### Installation
```
pip3 install git+https://github.com/yongzx/labelmodels.git@top-k-probable
```

# Example Usage - Naive Bayes Model
```python
# Let votes be an m x n matrix where m is the number of data examples, n is the
# number of labeling functions, and each entry is a vote in {0, ..., k}
# ...
```
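For orientation, here is a minimal sketch of calling the top-k API this branch adds. It is hedged: the `HMM` class name, its constructor arguments, and the `estimate_label_model` training call are assumptions; only `get_k_most_probable_labels` itself is defined in the diff below.

```python
import numpy as np
from labelmodels import HMM  # assumed export backing labelmodels/hmm.py

# Hypothetical setup: 3 classes, 5 labeling functions (argument names assumed)
model = HMM(num_classes=3, num_vars=5)

# votes: one row per token, one column per labeling function; entries are in
# {0, ..., num_classes}, with 0 meaning the labeling function abstained
votes = np.array([[1, 0, 2, 1, 1],
                  [2, 2, 0, 2, 1],
                  [0, 1, 1, 1, 3],
                  [3, 0, 3, 2, 3]])
seq_starts = np.array([0, 2])  # two sequences: rows 0-1 and rows 2-3

# model.estimate_label_model(votes, seq_starts)  # fit first (assumed API)

labels, scores = model.get_k_most_probable_labels(
    votes, seq_starts, topk=3, return_viterbi_scores=True)
print(labels.shape)  # (3, 4): row k holds the k-th best label per token
print(scores.shape)  # (3, 2): normalized Viterbi log-prob per sequence, rank
```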
134 changes: 133 additions & 1 deletion labelmodels/hmm.py
@@ -129,7 +129,6 @@ def get_most_probable_labels(self, votes, seq_starts):
        seq_starts = np.array(seq_starts, dtype=int)

        out = np.ndarray((votes.shape[0],), dtype=int)

        offset = 0
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            jll = self._get_labeling_function_likelihoods(votes)
@@ -166,6 +165,139 @@ def get_most_probable_labels(self, votes, seq_starts):
            offset += len(res)
        return out

    def get_k_most_probable_labels(self, votes, seq_starts, topk, return_viterbi_scores=False):
        """
        Computes the top-k most probable sequences of underlying true labels
        given the labeling function outputs.

        Based on https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of
                      the lengths of the sequences in the batch, n is the
                      number of labeling functions and k is the number of
                      classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]]
                           is the row vector of labeling function outputs for the
                           first element in the ith sequence
        :param topk: number of most probable label sequences to compute
        :param return_viterbi_scores: if True, also returns a (topk, l) matrix
                                      of normalized Viterbi log-probabilities,
                                      one per sequence and rank
        :return: matrix of shape (topk, m), where row k holds the k-th most
                 probable predicted label for each element; entries are -1
                 where a sequence admits fewer than k distinct label paths
        """
        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=int)
        seq_starts = np.array(seq_starts, dtype=int)

        out = np.ndarray((topk, votes.shape[0]), dtype=np.int32)
        out_scores = np.ndarray((topk, seq_starts.shape[0]), dtype=np.float64)

        EMPTY = -1
        offset = 0
        offset_scores = 0
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            # Initializes joint log likelihood with labeling function likelihood
            jll = self._get_labeling_function_likelihoods(votes)
            norm_start_balance = self._get_norm_start_balance()
            norm_transitions = self._get_norm_transitions()

            path_scores = []
            path_indices = []
            normalization = []
            T = votes.shape[0]
            seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0] - 1]

            # follows https://github.com/stanfordnlp/stanza/blob/b24d124156911f95e3c5715e9dc9f75c6076619c/stanza/models/common/crf.py#L77
            # for the normalization of Viterbi scores
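            # Subtracting a sequence's log-partition logZ (computed with the
            # forward/alpha recursion below) turns a Viterbi path score s into
            # a log-probability: log p(path | votes) = s - logZ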
            for i in range(T):
                if i in seq_starts:
                    path_scores.append((jll[i] + norm_start_balance).unsqueeze(0))
                    path_indices.append(torch.zeros([self.num_classes, self.num_classes]))

                    alphas = (jll[i] + norm_start_balance).unsqueeze(0)  # shape: (1, self.num_classes)
                else:
                    # Extends each surviving path by one transition step
                    p = path_scores[i - 1].clone().unsqueeze(2) + norm_transitions
                    p = p.view(-1, self.num_classes)  # shape: (k_prev * self.num_classes, self.num_classes)
                    maxk = min(p.size()[0], topk)
                    # For each current class, keeps the maxk best predecessors;
                    # paths holds flat indices into the previous step's paths
                    scores, paths = torch.topk(p, k=maxk, dim=0)

                    assert scores.shape == (maxk, self.num_classes)
                    assert paths.shape == (maxk, self.num_classes)
                    scores = jll[i] + scores

                    path_scores.append(scores)
                    path_indices.append(paths)

                    transition_scores = alphas.unsqueeze(2) + norm_transitions  # shape: (1, self.num_classes, self.num_classes)
                    alphas = jll[i] + torch.logsumexp(transition_scores, dim=1)

                if i in seq_ends:
                    log_norm = torch.logsumexp(alphas, dim=1)
                    normalization.append(log_norm.item())

            res = []
            res_scores = []
            for k in range(topk):
                j = T - 1
                viterbi_path = []
                viterbi_score = []
                while j >= 0:
                    if j in seq_ends:
                        seq_path_scores = path_scores[j].view(-1)
                        # A short sequence may admit fewer than topk label
                        # paths; ranks beyond that are filled with EMPTY
                        skip_rest = seq_path_scores.shape[0] <= k
                        viterbi_scores, best_paths = torch.topk(
                            seq_path_scores, k=min(topk, seq_path_scores.shape[0]), dim=0)
                        if skip_rest:
                            viterbi_path.append(EMPTY)
                            viterbi_score.append(EMPTY)
                        else:
                            viterbi_path.append(best_paths[k])
                            viterbi_score.append(viterbi_scores[k])
                    if j in seq_starts:
                        j -= 1
                        continue
                    if skip_rest:
                        viterbi_path.append(EMPTY)
                    else:
                        # Follows the stored backpointer to the previous step
                        viterbi_path.append(int(path_indices[j].view(-1)[viterbi_path[-1]]))
                    j -= 1
                # An index of EMPTY means there is no k-th Viterbi path for
                # that sequence; otherwise the flat index is mapped back to a
                # 1-indexed class label
                viterbi_path = [(int(path % self.num_classes) + 1) if path != EMPTY else EMPTY
                                for path in viterbi_path]
                viterbi_path.reverse()
                viterbi_score.reverse()
                res.append(viterbi_path)
                res_scores.append(viterbi_score)


            # Copies the minibatch results into the output arrays
            for k in range(topk):
                for i in range(len(res[k])):
                    out[k][offset + i] = res[k][i]

            for k in range(topk):
                for i in range(len(res_scores[k])):
                    if res_scores[k][i] != EMPTY:
                        # Subtracting the log-partition turns the Viterbi
                        # score into a log-probability
                        out_scores[k][offset_scores + i] = res_scores[k][i] - normalization[i]
                    else:
                        out_scores[k][offset_scores + i] = res_scores[k][i]

            offset += len(res[0])
            offset_scores += len(res_scores[0])

        if return_viterbi_scores:
            return out, out_scores
        return out
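    # Interpretation sketch (hypothetical call, names assumed): with
    #   labels, scores = model.get_k_most_probable_labels(
    #       votes, seq_starts, topk=2, return_viterbi_scores=True)
    # np.exp(scores[k][i]) would be the model's probability for the k-th best
    # label sequence of sequence i, since the scores are normalized by the
    # log-partition computed above.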

    def get_label_distribution(self, votes, seq_starts):
        """Returns the unary and pairwise marginals over true labels estimated
        by the model.