-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_source_map_content_only.py
More file actions
200 lines (168 loc) · 7.19 KB
/
analyze_source_map_content_only.py
File metadata and controls
200 lines (168 loc) · 7.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""Post-hoc analysis: redo Op-2 CKA on content-only positions.
Reads output/exp_causal_source_map_3b.json. For each problem × language,
identifies which ablated positions correspond to chat-template scaffolding
(user, assistant, system, newlines, punctuation) vs actual content tokens
(math statement). Recomputes cross-language CKA using only content-token
positions.
If within-problem cross-lang CKA remains elevated on the content-only slice,
the operator signal is real. If it collapses to the null, it was all template.
"""
import json
import numpy as np
# Token strings that count as chat-template scaffolding rather than problem
# content. Lookups in is_template_token() happen after strip().lower(), so the
# whitespace-only entries below can only ever match via the empty string.
TEMPLATE_STOPWORDS = {
    'user', 'assistant', 'system', 'im_start', 'im_end',
    '<|im_start|>', '<|im_end|>', '<|endoftext|>',
    '\n', ' \n', '\n\n', '', ' ',
}


def is_template_token(tok):
    """Return True when `tok` is scaffolding rather than content.

    The token is stripped of surrounding whitespace and lower-cased, then
    classified as template if any of the following holds:

    * it is listed in TEMPLATE_STOPWORDS;
    * it has the ``<|...|>`` special-token shape;
    * it contains no alphanumeric character at all (empty, whitespace-only
      or punctuation/symbol-only tokens).

    Args:
        tok: Token string to classify.

    Returns:
        bool: True for template/scaffolding tokens, False for content tokens.
    """
    normalized = tok.strip().lower()
    looks_like_special = normalized.startswith('<|') and normalized.endswith('|>')
    if normalized in TEMPLATE_STOPWORDS or looks_like_special:
        return True
    # A token with no letter or digit carries no content (this also covers
    # the empty string, for which any() is False).
    return not any(ch.isalnum() for ch in normalized)
def linear_cka(X, Y):
    """Linear CKA between two row-aligned matrices.

    Both inputs are centred per column, turned into n x n linear Gram
    matrices (``Xc @ Xc.T``), and compared with the normalised HSIC ratio
    ``<K, L> / sqrt(<K, K> * <L, L>)``.

    Args:
        X: NumPy array with one sample per row.
        Y: NumPy array with the same number of rows as `X`; the number of
            columns may differ.

    Returns:
        float: The CKA value; ``nan`` when `X` has fewer than two rows, and
        ``0.0`` when either centred Gram matrix has (near-)zero norm.

    Raises:
        ValueError: If `X` and `Y` have a different number of rows.
    """
    n = X.shape[0]
    if n < 2:
        return float('nan')
    if Y.shape[0] != n:
        # Without this check the mismatch only shows up later as an opaque
        # matmul shape error on the two Gram matrices.
        raise ValueError(
            f"linear_cka needs the same number of rows in X and Y, "
            f"got {n} and {Y.shape[0]}"
        )

    Xc = X - X.mean(axis=0, keepdims=True)
    Yc = Y - Y.mean(axis=0, keepdims=True)
    gram_x = Xc @ Xc.T
    gram_y = Yc @ Yc.T

    # For symmetric matrices trace(A @ B) == sum(A * B); the elementwise form
    # avoids an O(n^3) matrix product whose off-diagonal is thrown away.
    scale = (n - 1) ** 2
    hsic = np.sum(gram_x * gram_y) / scale
    vx = np.sum(gram_x * gram_x) / scale
    vy = np.sum(gram_y * gram_y) / scale

    # Degenerate (constant) representation: report zero similarity instead of
    # dividing by ~0.
    if vx < 1e-12 or vy < 1e-12:
        return 0.0
    return float(hsic / np.sqrt(vx * vy))
def resample(profile, n_grid=20):
    """Linearly resample a 1-D profile onto `n_grid` evenly spaced points.

    Source and destination samples are both placed on evenly spaced grids
    over [0, 1], so profiles of different lengths become comparable
    fixed-length vectors.

    Args:
        profile: Sequence of numbers (converted to a float64 array).
        n_grid: Number of output samples.

    Returns:
        np.ndarray: Array of length `n_grid`. An empty profile gives all
        zeros; a single-value profile gives that value repeated.
    """
    values = np.asarray(profile, dtype=np.float64)
    n_src = len(values)
    if n_src >= 2:
        src_grid = np.linspace(0, 1, n_src)
        dst_grid = np.linspace(0, 1, n_grid)
        return np.interp(dst_grid, src_grid, values)
    if n_src == 1:
        # Nothing to interpolate between: broadcast the single value.
        return np.full(n_grid, values[0])
    return np.zeros(n_grid)
def build_content_profile(result, layer):
    """Slice one layer's source map down to content-token positions.

    Entry ``i`` of the layer's source map is paired with
    ``result['ablated_positions'][i]``, which in turn indexes into
    ``result['tokens']``. Entries whose token is classified as template by
    is_template_token() are dropped.

    Args:
        result: Per-problem record providing 'source_map' (keyed by the layer
            as a string), 'ablated_positions' and 'tokens'.
        layer: Layer identifier; converted with str() for the lookup.

    Returns:
        tuple: ``(profile, content_tokens)`` where `profile` is the source
        map restricted to the kept entries (as a NumPy array) and
        `content_tokens` lists the matching token strings in the same order.
    """
    layer_map = np.asarray(result['source_map'][str(layer)])
    positions = result['ablated_positions']
    tokens = result['tokens']

    # Pair each surviving source-map row index with its token in one pass.
    kept = [
        (row, tokens[pos])
        for row, pos in enumerate(positions)
        if not is_template_token(tokens[pos])
    ]
    keep_idx = [row for row, _ in kept]
    content_tokens = [tok for _, tok in kept]
    return layer_map[keep_idx], content_tokens
def _split_half_cka(mat):
    """CKA between the first and second half of the rows of `mat`.

    Both halves are cut to ``n // 2`` rows so they always have the same size;
    with an odd number of rows the last row is left out. (Slicing the second
    half as ``mat[n // 2:]`` makes it one row longer for odd n, which
    linear_cka cannot compare.)
    """
    half = mat.shape[0] // 2
    return linear_cka(mat[:half], mat[half:2 * half])


def main():
    """Recompute cross-language source-map CKA on content-only positions.

    Steps:
      1. Load the experiment JSON and keep only problems that have a record
         for every configured language.
      2. Print how many ablated positions survive template filtering for the
         first five complete problems.
      3. For every target layer, resample each problem's source map (all
         positions and content-only) to a fixed length, stack per language,
         and compute:
           * cross-language CKA between the first two configured languages;
           * a split-half null within the first language;
           * CKA of the stored baseline hidden states ("raw substrate").
      4. Print a table plus an interpretation guide and save the results.

    Only ``langs[0]`` and ``langs[1]`` are compared; any further configured
    languages only affect which problems count as complete.
    """
    with open('/home/greg/Desktop/Projects/BrainInsideTheMachine/output/exp_causal_source_map_3b.json',
              encoding='utf-8') as f:
        d = json.load(f)

    per_problem = d['per_problem']
    layer_targets = d['config']['layer_targets']
    langs = d['config']['langs']

    by_key = {(r['problem_idx'], r['lang']): r for r in per_problem}
    problem_indices = sorted({k[0] for k in by_key})
    complete = [p for p in problem_indices if all((p, l) in by_key for l in langs)]
    print(f"Complete problems: {len(complete)}")
    print(f"Languages: {langs}")

    # Diagnose content token counts per problem per language. The counts are
    # kept so the saved JSON can reuse them instead of recomputing.
    print("\nContent-token counts (P_total / P_content):")
    content_count_sample = []
    for p in complete[:5]:
        for lang in langs:
            r = by_key[(p, lang)]
            _, ct = build_content_profile(r, layer_targets[0])
            content_count_sample.append((p, lang, len(ct)))
            print(f" prob{p}/{lang}: {len(r['ablated_positions'])} / {len(ct)} — sample: {ct[:8]}")

    # For each layer, redo the CKA on content-only profiles
    print(f"\n{'=' * 80}")
    print("CKA of source maps — content tokens only (template excluded)")
    print(f"{'=' * 80}")
    header = (f"{'Layer':>6} "
              f"{'ALL xLang':>10} {'CONTENT xLang':>14} "
              f"{'ALL null':>10} {'CONTENT null':>14} "
              f"{'raw subst':>10}")
    print(header)
    print('-' * len(header))

    results = {}
    for layer in layer_targets:
        # Full (all positions) cross-language CKA — already in Op-2 output but
        # recomputed here for sanity, next to the content-only variant.
        lang_mats_all = {}
        lang_mats_content = {}
        for lang in langs:
            all_profiles = []
            content_profiles = []
            for p in complete:
                r = by_key[(p, lang)]
                sm_all = np.asarray(r['source_map'][str(layer)])
                sm_content, _ = build_content_profile(r, layer)
                # Resampling gives every problem the same profile length so
                # the rows can be stacked into one matrix per language.
                all_profiles.append(resample(sm_all))
                content_profiles.append(resample(sm_content))
            lang_mats_all[lang] = np.stack(all_profiles, axis=0)
            lang_mats_content[lang] = np.stack(content_profiles, axis=0)

        # Within-problem cross-language
        cka_all_xlang = linear_cka(lang_mats_all[langs[0]], lang_mats_all[langs[1]])
        cka_content_xlang = linear_cka(lang_mats_content[langs[0]], lang_mats_content[langs[1]])

        # Null: across-problem same-language (split half). Equal-sized halves
        # keep this working when the number of complete problems is odd.
        cka_all_null = _split_half_cka(lang_mats_all[langs[0]])
        cka_content_null = _split_half_cka(lang_mats_content[langs[0]])

        # Raw substrate (from baseline hidden states)
        # NOTE(review): assumes 'baseline_hidden' holds one equally shaped
        # vector per problem and layer so np.stack succeeds — confirm.
        en_h = np.stack([
            np.asarray(by_key[(p, langs[0])]['baseline_hidden'][str(layer)])
            for p in complete
        ], axis=0)
        zh_h = np.stack([
            np.asarray(by_key[(p, langs[1])]['baseline_hidden'][str(layer)])
            for p in complete
        ], axis=0)
        cka_substrate = linear_cka(en_h, zh_h)

        print(f" L{layer:02d} "
              f"{cka_all_xlang:>10.4f} {cka_content_xlang:>14.4f} "
              f"{cka_all_null:>10.4f} {cka_content_null:>14.4f} "
              f"{cka_substrate:>10.4f}")
        results[layer] = {
            'cka_all_xlang': cka_all_xlang,
            'cka_content_xlang': cka_content_xlang,
            'cka_all_null': cka_all_null,
            'cka_content_null': cka_content_null,
            'cka_raw_substrate': cka_substrate,
        }

    print(f"\n{'=' * 80}")
    print("Interpretation guide:")
    print(f"{'=' * 80}")
    print("""
CONTENT xLang vs CONTENT null:
If CONTENT xLang >> CONTENT null, the operator signal survives the
template confound — cross-language source maps share structure on
actual content positions. Strong evidence for the operator ansatz.
If CONTENT xLang ≈ CONTENT null, the original signal was all template
and the operator does not carry language-invariant structure.
CONTENT xLang vs raw substrate:
If CONTENT xLang > raw substrate on the same problems, the operator
is MORE language-invariant than the substrate. Strongest possible
outcome for the operator ansatz.
If raw substrate > CONTENT xLang, the substrate is already as
language-invariant as the operator; promoting to operator-thinking
adds no invariance but may add mechanistic clarity.
""")

    # Save
    out = {
        'analysis': 'content_only_cka',
        'layer_results': results,
        'per_layer_content_count_sample': content_count_sample,
    }
    with open('/home/greg/Desktop/Projects/BrainInsideTheMachine/output/source_map_content_only_cka.json', 'w',
              encoding='utf-8') as f:
        json.dump(out, f, indent=2)
    print("\nSaved: output/source_map_content_only_cka.json")


if __name__ == '__main__':
    main()