# multisoft-normed.py
import numpy as np
import numpy.random as nr
from math import exp, log


# Brute-force reference: sum over all subsets of the given size of the
# product of exp of the elements. If `fixed` is a valid index, only the
# subsets containing that element are counted.
def sumprod_brute(elts, size, fixed=-1):
    if size > len(elts):
        return 0
    if 0 <= fixed < len(elts):
        if size == 0:
            return 0
        # Split each subset into the part left of `fixed` and the part
        # right of it, then multiply in exp(elts[fixed]) itself.
        z = 0
        for s in range(size):
            z += sumprod_brute(elts[:fixed], s) * sumprod_brute(elts[fixed+1:], size - 1 - s)
        return z * exp(elts[fixed])
    if size == 0:
        return 1
    # Either include the first element or skip it.
    return exp(elts[0]) * sumprod_brute(elts[1:], size - 1) + sumprod_brute(elts[1:], size)
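
# Worked example for intuition: with elts = [a, b, c] and size = 2,
#   sumprod_brute(elts, 2)          == exp(a+b) + exp(a+c) + exp(b+c)
#   sumprod_brute(elts, 2, fixed=0) == exp(a+b) + exp(a+c)
# since fixing index 0 keeps only the subsets that contain a.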


# Dynamic-programming version with per-column renormalization to avoid
# overflow/underflow. Returns, for each index j, the marginal probability
# y_j that element j is in the subset, when subsets of the given size are
# weighted by the product of exp of their elements (optionally conditioned
# on `fixed` being in the subset).
def sumprod(elts, size, fixed=-1):
    N = len(elts)
    B = np.zeros((size + 1, N + 1))  # Backward lattice
    B[0, N] = 1
    logBNorms = np.zeros(N + 1)
    # Backward pass
    for i in range(N - 1, -1, -1):
        B[0, i] = (i >= size and i > fixed) * B[0, i + 1]  # This can get quite small
        for s in range(max(1, size - i), size + 1):
            B[s, i] = B[s - 1, i + 1] * exp(elts[i]) + B[s, i + 1] * (fixed != i)
        norm = B[:, i].sum()
        B[:, i] /= norm
        logBNorms[i] = log(norm) + logBNorms[i + 1]
    F = np.zeros((size + 1,))  # Forward column
    F[0] = 1
    # Forward pass
    # Compute y_j for each j (marginal prob)
    y = np.zeros(N)
    logFNorm = -logBNorms[0]  # Subtract log partition function
    for i in range(1, N + 1):
        for s in range(size, -1, -1):
            if s < size:
                y[i - 1] += F[s] * B[size - 1 - s, i]
            if s > 0:
                F[s] = F[s - 1] * exp(elts[i - 1]) + F[s] * (fixed != i - 1)
            elif fixed == i - 1:
                F[0] = 0
        norm = F.sum()
        F /= norm
        y[i - 1] *= exp(elts[i - 1] + logBNorms[i] + logFNorm)
        logFNorm += log(norm)
    return y
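

# A quick consistency check one might run (a sketch; `_check_sumprod` is a
# hypothetical helper, not part of the original file): for small N the DP
# marginals should match the brute-force ratios z_j / Z.
def _check_sumprod(elts, size):
    N = len(elts)
    Z = sumprod_brute(elts, size)
    bf = np.zeros(N)
    for j in range(N):
        # Subsets containing j: pick s elements left of j, size-1-s right of it.
        for s in range(size):
            bf[j] += sumprod_brute(elts[:j], s) * sumprod_brute(elts[j+1:], size - 1 - s)
        bf[j] *= exp(elts[j])
    assert np.allclose(sumprod(elts, size), bf / Z)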


# Computes log(exp(x) + exp(y))
def logadd(x, y):
    if x == -np.inf and y == -np.inf:
        return -np.inf
    M = max(x, y)
    m = min(x, y)
    diff = M - m
    return M if diff >= 15 else M + log(1 + exp(-diff))
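
# Example: logadd(log(2.0), log(3.0)) == log(5.0) up to rounding. The
# diff >= 15 cutoff skips a correction of at most log(1 + exp(-15)) ~ 3e-7,
# trading a negligible amount of precision for speed.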


# Log-space version of sumprod: same marginals, but all lattice values are
# kept as logs, so no explicit renormalization is needed. Returns, for each
# index j, the marginal probability that element j is in the subset
# (optionally conditioned on `fixed` being in the subset).
def sumprod_logspace(elts, size, fixed=-1):
    N = len(elts)
    logB = -np.inf * np.ones((size + 1, N + 1))  # Backward lattice
    logB[0, :] = 0
    if fixed >= 0:
        logB[0, :fixed + 1] = -np.inf
    # Backward pass
    for i in range(N - 1, -1, -1):
        for s in range(max(1, size - i), size + 1):
            logB[s, i] = logadd(logB[s - 1, i + 1] + elts[i],
                                logB[s, i + 1] if fixed != i else -np.inf)
    logF = -np.inf * np.ones((size + 1,))  # Forward column
    logF[0] = 0
    # Forward pass
    # Compute y_j for each j (marginal prob)
    logy = -np.inf * np.ones(N)
    logFNorm = -logB[size, 0]  # Subtract log partition function
    for i in range(1, N + 1):
        for s in range(size, -1, -1):
            if s < size:
                logy[i - 1] = logadd(logy[i - 1], logF[s] + logB[size - 1 - s, i])
            if s > 0:
                logF[s] = logadd(logF[s - 1] + elts[i - 1],
                                 logF[s] if fixed != i - 1 else -np.inf)
            elif fixed == i - 1:
                logF[0] = -np.inf
        logy[i - 1] += elts[i - 1] + logFNorm
    return np.exp(logy)
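

# Agreement check between the two DP variants (a sketch; `_check_logspace`
# is a hypothetical helper, not part of the original file): on well-scaled
# inputs both should produce the same marginals.
def _check_logspace(elts, size, fixed=-1):
    assert np.allclose(sumprod(elts, size, fixed),
                       sumprod_logspace(elts, size, fixed))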


# Checks the analytic gradient of the objective
#   E = log(y_i)
# where y_i = z_i / Z and i is the index of the correct label,
# against a finite-difference estimate.
def check_grad(elts, size, correct=0):
    eps = 0.01
    N = len(elts)
    y = sumprod_logspace(elts, size)
    Cy = sumprod_logspace(elts, size, fixed=correct)
    grad = Cy - y
    print("Analytic gradient: ")
    print(grad)
    grad_num = np.zeros_like(grad)
    for i in range(N):
        tmp = elts[i]
        elts[i] += eps
        y_n = sumprod_logspace(elts, size)
        grad_num[i] = (log(y_n[correct]) - log(y[correct])) / eps
        elts[i] = tmp
    print("Numeric gradient: ")
    print(grad_num)
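
# Why grad = Cy - y (sketch of the derivation): E = log(z_i) - log(Z), and
#   d log(z_i) / dx_j = P(j in subset | i in subset) = Cy[j]
#   d log(Z)   / dx_j = P(j in subset)               = y[j]
# so dE/dx_j = Cy[j] - y[j].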


if __name__ == "__main__":
    nr.seed(2)
    N = 5       # The number of outputs in the softmax
    size = 2    # The size of the multisoft set
    fixed = -2  # Force this index to be on (negative = don't)
    elts = nr.randn(N)
    elts -= elts.max()
    # The random draw above is overridden by a fixed test vector.
    elts = np.array([-0.071459650993347, -0.517264485359192, -0.128548145294189,
                     -0.113207340240479, 0.000000000000000])
    print(elts)
    dp_y = sumprod_logspace(elts, size, fixed=fixed)
    bf_Z = sumprod_brute(elts, size, fixed=fixed)
    print("Brute force Z: %f" % bf_Z)
    print("Brute force z/Z:")
    bf_z = np.zeros(N)
    for i in range(N):
        # Subsets containing i: s elements from the left of i, the rest
        # from the right; shift `fixed` for the right-hand slice.
        for s in range(size):
            bf_z[i] += sumprod_brute(elts[:i], s, fixed=fixed) * \
                sumprod_brute(elts[i+1:], size - 1 - s, fixed=fixed-i-1)
        bf_z[i] *= exp(elts[i])
    print(bf_z / bf_Z)
    print("DP z/Z:")
    print(dp_y)
    check_grad(elts, size, correct=3)