5 changes: 5 additions & 0 deletions README.md
@@ -4,6 +4,11 @@

Lightweight implementations of generative label models for weakly supervised machine learning

### Installation
```
pip3 install git+https://github.com/yongzx/labelmodels.git@top-k-probable
```

# Example Usage - Naive Bayes Model
```python
# Let votes be an m x n matrix where m is the number of data examples, n is the
# number of labeling functions, and each entry is a vote in {0, ..., k}
# ...
```
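For orientation, here is a minimal sketch of calling the top-k API this branch adds. It is hedged: the `HMM` class name, its constructor arguments, and the `estimate_label_model` training call are assumptions; only `get_k_most_probable_labels` itself is defined in the diff below.

```python
import numpy as np
from labelmodels import HMM  # assumed export backing labelmodels/hmm.py

# Hypothetical setup: 3 classes, 5 labeling functions (argument names assumed)
model = HMM(num_classes=3, num_vars=5)

# votes: one row per token, one column per labeling function; entries are in
# {0, ..., num_classes}, with 0 meaning the labeling function abstained
votes = np.array([[1, 0, 2, 1, 1],
                  [2, 2, 0, 2, 1],
                  [0, 1, 1, 1, 3],
                  [3, 0, 3, 2, 3]])
seq_starts = np.array([0, 2])  # two sequences: rows 0-1 and rows 2-3

# model.estimate_label_model(votes, seq_starts)  # fit first (assumed API)

labels, scores = model.get_k_most_probable_labels(
    votes, seq_starts, topk=3, return_viterbi_scores=True)
print(labels.shape)  # (3, 4): row k holds the k-th best label per token
print(scores.shape)  # (3, 2): normalized Viterbi log-prob per sequence, rank
```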
134 changes: 133 additions & 1 deletion labelmodels/hmm.py
@@ -129,7 +129,6 @@ def get_most_probable_labels(self, votes, seq_starts):
        seq_starts = np.array(seq_starts, dtype=int)

        out = np.ndarray((votes.shape[0],), dtype=int)

        offset = 0
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            jll = self._get_labeling_function_likelihoods(votes)
@@ -166,6 +165,139 @@ def get_most_probable_labels(self, votes, seq_starts):
            offset += len(res)
        return out

    def get_k_most_probable_labels(self, votes, seq_starts, topk, return_viterbi_scores=False):
        """
        Computes the top-k most probable sequences of underlying true labels
        given the labeling function outputs.

        Based on https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py

        :param votes: m x n matrix in {0, ..., k}, where m is the sum of
                      the lengths of the sequences in the batch, n is the
                      number of labeling functions and k is the number of
                      classes
        :param seq_starts: vector of length l of row indices in votes indicating
                           the start of each sequence, where l is the number of
                           sequences in the batch. So, votes[seq_starts[i]]
                           is the row vector of labeling function outputs for the
                           first element in the ith sequence
        :param topk: number of most probable label sequences to compute
        :param return_viterbi_scores: if True, also returns a (topk, l) matrix
                                      of normalized Viterbi log-probabilities,
                                      one per sequence and rank
        :return: matrix of shape (topk, m), where row k holds the k-th most
                 probable predicted label for each element; entries are -1
                 where a sequence admits fewer than k distinct label paths
        """
        # Converts to CSR and integers to standardize input
        votes = sparse.csr_matrix(votes, dtype=int)
        seq_starts = np.array(seq_starts, dtype=int)

        out = np.ndarray((topk, votes.shape[0]), dtype=np.int32)
        out_scores = np.ndarray((topk, seq_starts.shape[0]), dtype=np.float64)

        EMPTY = -1
        offset = 0
        offset_scores = 0
        for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32):
            # Initializes joint log likelihood with labeling function likelihood
            jll = self._get_labeling_function_likelihoods(votes)
            norm_start_balance = self._get_norm_start_balance()
            norm_transitions = self._get_norm_transitions()

            path_scores = []
            path_indices = []
            normalization = []
            T = votes.shape[0]
            seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0] - 1]

            # follows https://github.com/stanfordnlp/stanza/blob/b24d124156911f95e3c5715e9dc9f75c6076619c/stanza/models/common/crf.py#L77
            # for the normalization of Viterbi scores
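            # Subtracting a sequence's log-partition logZ (computed with the
            # forward/alpha recursion below) turns a Viterbi path score s into
            # a log-probability: log p(path | votes) = s - logZ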
            for i in range(T):
                if i in seq_starts:
                    path_scores.append((jll[i] + norm_start_balance).unsqueeze(0))
                    path_indices.append(torch.zeros([self.num_classes, self.num_classes]))

                    alphas = (jll[i] + norm_start_balance).unsqueeze(0)  # shape: (1, self.num_classes)
                else:
                    # Extends each surviving path by one transition step
                    p = path_scores[i - 1].clone().unsqueeze(2) + norm_transitions
                    p = p.view(-1, self.num_classes)  # shape: (k_prev * self.num_classes, self.num_classes)
                    maxk = min(p.size()[0], topk)
                    # For each current class, keeps the maxk best predecessors;
                    # paths holds flat indices into the previous step's paths
                    scores, paths = torch.topk(p, k=maxk, dim=0)

                    assert scores.shape == (maxk, self.num_classes)
                    assert paths.shape == (maxk, self.num_classes)
                    scores = jll[i] + scores

                    path_scores.append(scores)
                    path_indices.append(paths)

                    transition_scores = alphas.unsqueeze(2) + norm_transitions  # shape: (1, self.num_classes, self.num_classes)
                    alphas = jll[i] + torch.logsumexp(transition_scores, dim=1)

                if i in seq_ends:
                    log_norm = torch.logsumexp(alphas, dim=1)
                    normalization.append(log_norm.item())

            res = []
            res_scores = []
            for k in range(topk):
                j = T - 1
                viterbi_path = []
                viterbi_score = []
                while j >= 0:
                    if j in seq_ends:
                        seq_path_scores = path_scores[j].view(-1)
                        # A short sequence may admit fewer than topk label
                        # paths; ranks beyond that are filled with EMPTY
                        skip_rest = seq_path_scores.shape[0] <= k
                        viterbi_scores, best_paths = torch.topk(
                            seq_path_scores, k=min(topk, seq_path_scores.shape[0]), dim=0)
                        if skip_rest:
                            viterbi_path.append(EMPTY)
                            viterbi_score.append(EMPTY)
                        else:
                            viterbi_path.append(best_paths[k])
                            viterbi_score.append(viterbi_scores[k])
                    if j in seq_starts:
                        j -= 1
                        continue
                    if skip_rest:
                        viterbi_path.append(EMPTY)
                    else:
                        # Follows the stored backpointer to the previous step
                        viterbi_path.append(int(path_indices[j].view(-1)[viterbi_path[-1]]))
                    j -= 1
                # An index of EMPTY means there is no k-th Viterbi path for
                # that sequence; otherwise the flat index is mapped back to a
                # 1-indexed class label
                viterbi_path = [(int(path % self.num_classes) + 1) if path != EMPTY else EMPTY
                                for path in viterbi_path]
                viterbi_path.reverse()
                viterbi_score.reverse()
                res.append(viterbi_path)
                res_scores.append(viterbi_score)


            # Copies the minibatch results into the output arrays
            for k in range(topk):
                for i in range(len(res[k])):
                    out[k][offset + i] = res[k][i]

            for k in range(topk):
                for i in range(len(res_scores[k])):
                    if res_scores[k][i] != EMPTY:
                        # Subtracting the log-partition turns the Viterbi
                        # score into a log-probability
                        out_scores[k][offset_scores + i] = res_scores[k][i] - normalization[i]
                    else:
                        out_scores[k][offset_scores + i] = res_scores[k][i]

            offset += len(res[0])
            offset_scores += len(res_scores[0])

        if return_viterbi_scores:
            return out, out_scores
        return out
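    # Interpretation sketch (hypothetical call, names assumed): with
    #   labels, scores = model.get_k_most_probable_labels(
    #       votes, seq_starts, topk=2, return_viterbi_scores=True)
    # np.exp(scores[k][i]) would be the model's probability for the k-th best
    # label sequence of sequence i, since the scores are normalized by the
    # log-partition computed above.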

    def get_label_distribution(self, votes, seq_starts):
        """Returns the unary and pairwise marginals over true labels estimated
        by the model.