forked from josephwon0310/Launchpad
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze.py
More file actions
101 lines (78 loc) · 3.05 KB
/
analyze.py
File metadata and controls
101 lines (78 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from bokeh.layouts import row
from bokeh.models import HoverTool
from bokeh.palettes import Category10
from bokeh.plotting import figure, output_file, show, ColumnDataSource
import cast
from collections import Counter
import numpy as np
import os
import pandas as pd
import parser
from sklearn.manifold import TSNE
import sys
import word2vec
# ---------------------------------------------------------------------------
# Command-line handling.
#
# Usage: analyze.py [-l | -w] MOVIE [MOVIE ...]
#   -l  analyze whole lines (the default)
#   -w  analyze individual words
#
# After this section `mode` is either 'lines' or 'words', and sys.argv[1:]
# holds only the movie names (all flag occurrences are stripped).
# ---------------------------------------------------------------------------
_MODE_FLAGS = ('-w', '-l')
_cli_args = sys.argv[1:]

# The two flags are mutually exclusive.
if '-w' in _cli_args and '-l' in _cli_args:
    print('Use -l to analyze lines (default) or -w to analyze words.')
    sys.exit(1)

mode = 'words' if '-w' in _cli_args else 'lines'

# Strip EVERY occurrence of the flags. list.remove() would only drop the
# first one, so a repeated flag (e.g. "-w movie -w") would be left behind and
# later be mistaken for a movie name. sys.argv is updated in place because the
# rest of the script reads the movie names from it.
sys.argv[1:] = [a for a in _cli_args if a not in _MODE_FLAGS]

if len(sys.argv) < 2:
    print('Please specify a movie to parse.')
    sys.exit(1)
def _embed_cast(script_cast, mode):
    """Collect one embedding vector per valid line or word of each character.

    Args:
        script_cast: iterable of character objects exposing ``lines`` and
            ``words`` (as used below).
        mode: ``'lines'`` to embed whole lines with
            ``word2vec.sentence_vector``, ``'words'`` to embed single words
            with ``word2vec.word_vector``.

    Returns:
        A tuple ``(X, char_index, labels)``:
        ``X`` is the array of all vectors stacked along axis 0 (an empty
        array when nothing was valid), ``char_index`` is a list giving, for
        each row of ``X``, the position of its character in ``script_cast``,
        and ``labels`` is the list of the corresponding line/word texts.

    Exits the process with status 1 if ``mode`` is neither value.
    """
    chunks = []
    char_index = []
    labels = []
    for j, character in enumerate(script_cast):
        if mode == 'lines':
            items = [line for line in character.lines
                     if word2vec.is_valid(parser.line_to_words(line))]
            vectors = np.array([word2vec.sentence_vector(line) for line in items])
        elif mode == 'words':
            items = [word for word in character.words if word2vec.is_valid(word)]
            vectors = np.array([word2vec.word_vector(word) for word in items])
        else:
            sys.exit(1)
        if vectors.size:
            # Accumulate in plain lists and stack once at the end; this keeps
            # labels as ordinary strings and rows/labels/indices aligned.
            chunks.append(vectors)
            char_index.extend([j] * vectors.shape[0])
            labels.extend(items)
    X = np.concatenate(chunks) if chunks else np.array([])
    return X, char_index, labels


word2vec.load()

# One Bokeh figure per movie given on the command line.
plots = []

for arg in sys.argv[1:]:
    # Each movie name maps to a "<name>.txt" script file.
    script_data = parser.parse_script(arg + '.txt')
    script_cast = cast.make_cast(script_data).filter(min_lines=20)

    X, char_index, labels = _embed_cast(script_cast, mode)
    if not X.size:
        # Without any vectors there is no feature dimension to build the
        # frame from (X.shape[1] would raise IndexError), so skip this movie.
        print('No valid {} found for {}; skipping.'.format(mode, arg), file=sys.stderr)
        continue

    feature_cols = ['d{}'.format(k) for k in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_cols)
    df['char_index'] = char_index
    df['label'] = labels

    # Project the vectors down to 2-D for plotting.
    X_embedded = TSNE(n_components=2).fit_transform(df.loc[:, feature_cols].values)
    df['x-tsne'] = X_embedded[:, 0]
    df['y-tsne'] = X_embedded[:, 1]

    # Point opacity is the inverse of how often the label occurs in the script.
    # NOTE(review): assumes every label appears in all_words/all_lines;
    # a label missing from the Counter would raise ZeroDivisionError - confirm.
    freq = Counter(script_cast.all_words if mode == 'words' else script_cast.all_lines)
    df['inv-freq'] = df['label'].map(lambda x: 1/freq[x])

    plot = figure(title=arg.replace('-', ' '),
                  tools='pan, wheel_zoom, box_zoom, reset',
                  active_scroll='wheel_zoom')
    plot.add_tools(HoverTool(tooltips='<div>@label</div>'))

    # NOTE(review): the `legend=` keyword was deprecated in Bokeh 1.4 in favour
    # of `legend_label=` and removed in 3.0; it is kept because the Bokeh
    # version this project targets is not visible from this file.
    for idx, character in enumerate(script_cast):
        color = Category10[10][idx % 10]  # palette has 10 colors; wrap around
        rows = df.loc[df['char_index'] == idx, ['x-tsne', 'y-tsne', 'label', 'inv-freq']]
        source = ColumnDataSource(data=dict(rows))
        plot.circle(0, 0, color=color, size=0, legend=character.name)  # fixes legend
        plot.circle('x-tsne', 'y-tsne', color=color, alpha='inv-freq', size=10,
                    legend=character.name, source=source)

    plot.legend.location = 'bottom_right'
    plot.legend.click_policy = 'hide'
    plot.toolbar.autohide = True
    plots.append(plot)
# Write all figures side by side into a single HTML page and open it.
# The file is named after the first movie, with a "+N" suffix when N further
# movies were given on the command line.
out_dir = 'out/analysis/' + mode
os.makedirs(out_dir, exist_ok=True)

extra_movies = len(sys.argv) - 2
if extra_movies > 0:
    suffix = '+' + str(extra_movies)
else:
    suffix = ''
html_name = '{}{}.html'.format(sys.argv[1], suffix)

output_file(os.path.join(out_dir, html_name), title='Analysis')
show(row(*plots))