-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathINCLINATION.json
More file actions
277 lines (277 loc) · 13 KB
/
INCLINATION.json
File metadata and controls
277 lines (277 loc) · 13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
{
"schema_version": "inclination.v0",
"intents": [
{
"id": "INT00001",
"origin": "Engineering with LLMs in production; safety dependent on agent self-policing.",
"intent": "Agentic action can be bounded, observable, and verifiable, without trusting the agent or forfeiting expected utility.",
"horizon": "now"
},
{
"id": "INT00002",
"origin": "Frontier-model dependence in independent labs; open-model performance gap.",
"intent": "Open-model agent performance can be stabilized to approach prior-generation frontier performance, with closed vocabulary, deterministic tools, cross-model evaluations, and interpretability-informed training.",
"horizon": "next"
},
{
"id": "INT00003",
"origin": "Vendor-platform lock-in observed across operator workflows.",
"intent": "Rebuild-from-runtime sovereignty, achieved by deriving replacements for vendor components when a vendor dependency's friction exceeds its operational value.",
"horizon": "later"
}
],
"hypotheses": [
{
"id": "HYP00001",
"intent_ids": ["INT00001"],
"hypothesis": "Closed-vocabulary AST-CI gates reduce off-vocabulary generation rate compared to no gate, measured on generated code across imperative languages.",
"ablations": [
"Vocabulary as advisory warnings instead of a gate produces equivalent off-vocabulary rate.",
"Prompt instructions alone produce equivalent off-vocabulary rate.",
"Disabling the AST walker produces no measurable change in off-vocabulary rate."
],
"type": "empirical",
"dependencies": [],
"experiments": ["EXP00001"],
"conclusion": null
},
{
"id": "HYP00002",
"intent_ids": ["INT00001"],
"hypothesis": "Substrate records (Atlas, Git, Gateway) contain sufficient evidence to reconstruct any agent action without reliance on the agent's self-report.",
"ablations": [
"Reconstruction from substrate records alone fails to recover what occurred in audited sessions.",
"Agent self-report alone produces reconstruction equivalent to substrate records.",
"Atlas-disabled sessions remain equivalently reconstructable."
],
"type": "operational",
"dependencies": [],
"experiments": ["EXP00002"],
"conclusion": null
},
{
"id": "HYP00003",
"intent_ids": ["INT00001", "INT00003"],
"hypothesis": "Propagate's State load at session start re-orients a new agent session to current work context without operator intervention.",
"ablations": [
"Propagate's State load drifts from current work context in ways the operator must correct.",
"Operator must manually re-orient the new agent session despite Propagate's State load.",
"Vendor-managed memory (Claude memory, ChatGPT memory) provides equivalent re-orientation; Propagate's State load adds nothing.",
"Propagate's State load fails to support continuation when vendor memory is also disabled."
],
"type": "operational",
"dependencies": ["HYP00002"],
"experiments": ["EXP00010"],
"conclusion": null
},
{
"id": "HYP00004",
"intent_ids": ["INT00001"],
"hypothesis": "Substrate enforcement layers (Seatbelt, Intermediary, Atlas, identity isolation) compose such that bypassing any single layer does not produce unauthorized capability.",
"ablations": [
"Bypassing Seatbelt alone yields Atlas write capability.",
"Removing Intermediary mediation yields equivalent boundedness.",
"Collapsing identity isolation to a shared POSIX user yields no measurable change in bypass surface."
],
"type": "operational",
"dependencies": [],
"experiments": ["EXP00003"],
"conclusion": null
},
{
"id": "HYP00005",
"intent_ids": ["INT00001"],
"hypothesis": "Cross-model review composed with eval produces a higher defect detection rate than weaker baselines (eval-only, same-vendor review, review-without-eval) on a labeled task corpus.",
"ablations": [
"Removing the eval step produces equivalent detection rate.",
"Same-vendor self-review produces equivalent detection rate.",
"Random-verdict placebo produces equivalent detection rate."
],
"type": "empirical",
"dependencies": [],
"experiments": ["EXP00004"],
"conclusion": null
},
{
"id": "HYP00006",
"intent_ids": ["INT00001", "INT00003"],
"hypothesis": "Substrate enforcement holds equivalently under remote operator access (SSH-over-Tailscale) compared to local operation.",
"ablations": [
"Remote sessions exhibit measurable enforcement relaxation.",
"Vendor-mobile harness (Claude Code web, Codex cloud) covers operator mobile need without requiring remote substrate enforcement.",
"Remote operation produces measurably degraded enforcement integrity."
],
"type": "operational",
"dependencies": [],
"experiments": ["EXP00005"],
"conclusion": null
},
{
"id": "HYP00007",
"intent_ids": ["INT00001"],
"hypothesis": "Substrate-active agent work produces equivalent work-quality scores compared to substrate-ablated work, with model and task held constant.",
"ablations": [
"Substrate-active sessions produce measurably lower work-quality scores than ablated sessions.",
"Substrate-induced friction reduces task completion rate below ablated baseline."
],
"type": "empirical",
"dependencies": [],
"experiments": ["EXP00006"],
"conclusion": null
},
{
"id": "HYP00008",
"intent_ids": ["INT00002"],
"hypothesis": "Substrate-active agent runs produce lower across-run variance compared to substrate-ablated runs, with model and task held constant.",
"ablations": [
"Substrate-active runs produce equivalent or higher across-run variance.",
"Observed variance reduction is attributable to randomization rather than substrate composition."
],
"type": "empirical",
"dependencies": [],
"experiments": ["EXP00006"],
"conclusion": null
},
{
"id": "HYP00009",
"intent_ids": ["INT00002"],
"hypothesis": "Interpretability-informed training applied to an open model produces a tuned variant that performs measurably better in the substrate than the untuned base model.",
"ablations": [
"Training without an interpretability-derived signal produces equivalent improvement.",
"The tuned variant performs equivalently or worse than the untuned base model in the substrate.",
"The interpretability-derived preference signal correlates with no measurable model behavior."
],
"type": "empirical",
"dependencies": [],
"experiments": ["EXP00007"],
"conclusion": null
},
{
"id": "HYP00010",
"intent_ids": ["INT00002"],
"hypothesis": "An open model tuned via interpretability-informed training (HYP00009), run within the full apparatus, produces performance comparable to prior-generation frontier on a fixed work-task corpus.",
"ablations": [
"The tuned open model in the full apparatus fails to approach prior-generation frontier performance.",
"The untuned open model in the apparatus produces equivalent performance, showing tuning contributes nothing.",
"Performance improvement is attributable to raw model capability rather than substrate or tuning."
],
"type": "empirical",
"dependencies": ["HYP00009"],
"experiments": ["EXP00008"],
"conclusion": null
},
{
"id": "HYP00011",
"intent_ids": ["INT00002"],
"hypothesis": "Substrate stabilization holds across multiple open instrumentable model families (Qwen, Gemma, gpt-oss), not only the family used to develop the apparatus.",
"ablations": [
"Stabilization holds only for the family used during apparatus development.",
"Performance varies significantly across families with apparatus held constant, indicating family-specific tuning rather than substrate stabilization."
],
"type": "empirical",
"dependencies": ["HYP00010"],
"experiments": ["EXP00008"],
"conclusion": null
},
{
"id": "HYP00012",
"intent_ids": ["INT00002", "INT00003"],
"hypothesis": "Substrate-mediated runs produce reproducible per-task cost compared to ad-hoc vendor-API agent invocation, measured as low cost variance across reruns of fixed task sets.",
"ablations": [
"Substrate-mediated runs exhibit cost variance comparable to ad-hoc agent invocation.",
"Reproducibility requires extensive operator manual tuning per session rather than emerging from substrate properties.",
"Cost reproducibility requires holding non-cost variables constant in ways that diverge from normative use."
],
"type": "empirical",
"dependencies": ["HYP00010"],
"experiments": ["EXP00008"],
"conclusion": null
},
{
"id": "HYP00013",
"intent_ids": ["INT00003"],
"hypothesis": "Substrate enforcement, observability, and evidence guarantees hold equivalently regardless of which underlying infrastructure provides the substrate's runtime (harness identity, network layer, vendor service).",
"ablations": [
"Substrate guarantees vary measurably by underlying infrastructure component.",
"Apparent neutrality is achieved only through component-specific compensations in substrate code.",
"Substituting a previously-validated infrastructure component reveals contract gaps the original setup masked."
],
"type": "operational",
"dependencies": [],
"experiments": ["EXP00009"],
"conclusion": null
}
],
"experiments": [
{
"id": "EXP00001",
"intent_ids": ["INT00001"],
"hypothesis_ids": ["HYP00001"],
"label": "Measurement of off-vocabulary generation rate in agent-produced code under closed-vocabulary AST-CI gating compared to no gating, across imperative languages.",
"push": null
},
{
"id": "EXP00002",
"intent_ids": ["INT00001"],
"hypothesis_ids": ["HYP00002"],
"label": "Audit of agent action reconstruction fidelity from substrate records (Atlas, Git, Gateway) compared to reconstruction from agent self-report alone.",
"push": null
},
{
"id": "EXP00003",
"intent_ids": ["INT00001"],
"hypothesis_ids": ["HYP00004"],
"label": "Validation of substrate enforcement composition by attempting single-layer bypass against each enforcement layer (Seatbelt, Intermediary, Atlas, identity isolation) and observing whether unauthorized capability emerges.",
"push": null
},
{
"id": "EXP00004",
"intent_ids": ["INT00001"],
"hypothesis_ids": ["HYP00005"],
"label": "Measurement of defect detection rate against a labeled task corpus, comparing cross-model review composed with eval to weaker baselines (eval alone, same-vendor review, review without eval).",
"push": null
},
{
"id": "EXP00005",
"intent_ids": ["INT00001", "INT00003"],
"hypothesis_ids": ["HYP00006"],
"label": "Comparison of substrate enforcement integrity between local operator sessions and remote operator sessions over SSH-over-Tailscale, executing equivalent substrate operations.",
"push": null
},
{
"id": "EXP00006",
"intent_ids": ["INT00001", "INT00002"],
"hypothesis_ids": ["HYP00007", "HYP00008"],
"label": "Measurement of agent work quality and across-run variance on a fixed task set, comparing substrate-enabled sessions to substrate-ablated sessions with model and task held constant.",
"push": null
},
{
"id": "EXP00007",
"intent_ids": ["INT00002"],
"hypothesis_ids": ["HYP00009"],
"label": "Comparison of in-substrate performance between an open base model and its variant tuned via interpretability-informed training, on a fixed task set.",
"push": null
},
{
"id": "EXP00008",
"intent_ids": ["INT00002", "INT00003"],
"hypothesis_ids": ["HYP00010", "HYP00011", "HYP00012"],
"label": "Comparison of in-substrate performance and per-task cost across tuned open-model families (Qwen, Gemma, gpt-oss) against prior-generation frontier on a fixed work-task corpus; reruns measure cost variance.",
"push": null
},
{
"id": "EXP00009",
"intent_ids": ["INT00003"],
"hypothesis_ids": ["HYP00013"],
"label": "Comparison of substrate enforcement, observability, and evidence guarantees across substitutions of underlying infrastructure components (harness identity, network layer, vendor service).",
"push": null
},
{
"id": "EXP00010",
"intent_ids": ["INT00001", "INT00003"],
"hypothesis_ids": ["HYP00003"],
"label": "Comparison of new-session orientation under Propagate's State load, no load, and vendor-managed memory alone, measured by required operator intervention and drift from current work context.",
"push": null
}
]
}