Tree_graphical_model/2_3.py at master · ujohn33/Tree_graphical_model · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
""" This file is created as the solution template for question 2.3 in DD2434 - Assignment 2.

    Please keep the fixed parameters in the function templates as is (in 2_3.py file).
    However if you need, you can add parameters as default parameters.
    i.e.
    Function template: def calculate_likelihood(tree_topology, theta, beta):
    You can change it to: def calculate_likelihood(tree_topology, theta, beta, new_param_1=[], new_param_2=123):

    You can write helper functions however you want.

    If you want, you can use the class structures provided to you (Node, Tree and TreeMixture classes in Tree.py
    file), and modify them as needed. In addition to the sample files given to you, it is very important for you to
    test your algorithm with your own simulated data for various cases and analyse the subsolutions.

    For those who do not want to use the provided structures, we also saved the properties of trees in .txt and .npy
    format.

    Also, I am aware that the file names and their extensions are not well-formed, especially in Tree.py file
    (i.e example_tree_mixture.pkl_samples.txt). I wanted to keep the template codes as simple as possible.
    You can change the file names however you want (i.e tmm_1_samples.txt).

    For this assignment, we gave you three different trees (q_2_3_small_tree, q_2_3_medium_tree, q_2_3_large_tree).
    Each tree have 5 samples (whose inner nodes are masked with np.nan values).
    We want you to calculate the likelihoods of each given sample and report it.
"""
import numpy as np
from Tree import Tree
from Tree import Node
from collections import defaultdict


# probability of child with assigned value cat with a parrent with assigned value parent_cat
def CPD(theta, node, cat, parent_cat=None):
    if parent_cat is None:
        return theta[node][cat]
    else:
        return theta[node][int(parent_cat)][int(cat)]

# non-nan values are assignments for leaves. nan values are inner nodes
def find_leaves(beta):
    leaves = []
    for index, value in enumerate(beta):
        if not np.isnan(value):
            leaves.append(index)
    return leaves

# if the parent value of a node checked in topology equals given node, then the checked node is a child of given node
def find_children(p, topology):
    children = []
    for index, parent in enumerate(topology):
        if parent == p:
            children.append(index)
    return children

# if the parent is the same but the node has a different index
def calculate_sibling(u, topology):
    for index, parent in enumerate(topology):
        if parent == topology[u] and u != index:
            return index
    return None

def calculate_likelihood(tree_topology, theta, beta):
    # initialize dictionaries for s and t
    likelihood = 0
    s_VALUES = defaultdict(dict)
    t_VALUES= defaultdict(dict)

    def calculate_s(tree_topology, theta, beta):
        def subproblem_S(u, j, children):
            if s_VALUES[u].get(j) is not None: # If it has already been calculated
                return s_VALUES[u].get(j)
            # Visit the vertices from leaves to root
            if len(children) < 1:           # identify leaves
                if int(beta[u]) == j:
                    s_VALUES[u][j] = 1
                    return 1
                else:
                    s_VALUES[u][j] = 0
                    return 0
            subsolution = np.zeros(len(children))
            # run down the tree solving the subproblem for children of children recursively
            for index, child in enumerate(children):
                for category in range(len(theta[0])):
                    subsolution[index] += subproblem_S(child, category, find_children(child, tree_topology)) * CPD(theta, child,category, j)
            s_subsolution = np.prod(subsolution)
            s_VALUES[u][j] = s_subsolution
            return s_subsolution
        # Start from root and find run the subproblem for children of the root node
        for i in range(len(theta[0])):
            subproblem_S(0, i, find_children(0, tree_topology))
        return s_VALUES

    s_VALUES = calculate_s(tree_topology, theta, beta)
    # Do the same for t, now siblings are taken into consideration
    def subproblem_t(u, i, parent, sibling):
        if t_VALUES[u].get(i) is not None:  # If it has already been calculated
            return t_VALUES[u].get(i)

        if np.isnan(parent):  # identify the root
            return CPD(theta, u, i)
        subsolution = 0
        parent = int(parent)
        for j in range(len(theta[0])):
            for k in range(len(theta[0])):
                subsolution += CPD(theta, u, i, j) * CPD(theta, sibling, k, j) * s_VALUES[sibling].get(k) * subproblem_t(parent,j,tree_topology[parent],calculate_sibling(parent,tree_topology))
        t_VALUES[u][i] = subsolution
        return subsolution

    # Expressing the joint
    for leaf, cat in enumerate(beta):
        if not np.isnan(cat):
            #print(beta)
            #print(cat)
            likelihood = subproblem_t(leaf, cat, int(tree_topology[leaf]),calculate_sibling(leaf, tree_topology)) * s_VALUES[leaf].get(cat)
            print("Calculating the likelihood...")
    #likelihood = np.random.rand()
            return likelihood


def main():
    print("Hello World!")
    print("This file is the solution template for question 2.3.")

    print("\n1. Load tree data from file and print it\n")
    filename = "data/q2_3_small_tree.pkl"  # "data/q2_3_medium_tree.pkl", "data/q2_3_large_tree.pkl"
    t = Tree()
    t.load_tree(filename)
    t.print()

    print("\n2. Calculate likelihood of each FILTERED sample\n")
    # These filtered samples already available in the tree object.
    # Alternatively, if you want, you can load them from corresponding .txt or .npy files

    for sample_idx in range(t.num_samples):
        beta = t.filtered_samples[sample_idx]
        print("\n\tSample: ", sample_idx, "\tBeta: ", beta)
        sample_likelihood = calculate_likelihood(t.get_topology_array(), t.get_theta_array(), beta)
        print("\tLikelihood: ", sample_likelihood)


if __name__ == "__main__":
    main()