Skip to content

Commit 2d79ecf

Browse files
rekmarksclaude
andcommitted
refactor(ocap-kernel): Clean up orphaned system subclusters on kernel boot
- Add #handlePersistedSystemSubclusters to delete orphaned subclusters (those without configs) before vat initialization - Split system subcluster init into restore (before vats) and launch (after queue) - Add SubclusterManager.deleteSubcluster for deleting without terminating - Make kernel facet registration idempotent with #ensureKernelFacetRegistered - Add test for orphaned system subcluster cleanup Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent a3a0711 commit 2d79ecf

3 files changed

Lines changed: 158 additions & 39 deletions

File tree

packages/ocap-kernel/src/Kernel.test.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,54 @@ describe('Kernel', () => {
873873
});
874874
});
875875

876+
describe('system subcluster cleanup', () => {
877+
it('deletes orphaned system subclusters without starting their vats', async () => {
878+
const db = makeMapKernelDatabase();
879+
const systemSubclusterConfig = {
880+
name: 'testSystemSubcluster',
881+
config: {
882+
bootstrap: 'testSystemSubcluster',
883+
vats: {
884+
testSystemSubcluster: {
885+
sourceSpec: 'system-vat.js',
886+
},
887+
},
888+
},
889+
};
890+
891+
// Create kernel with system subcluster
892+
const kernel1 = await Kernel.make(mockPlatformServices, db, {
893+
systemSubclusters: [systemSubclusterConfig],
894+
});
895+
expect(kernel1.getSubclusters()).toHaveLength(1);
896+
expect(kernel1.getVatIds()).toStrictEqual(['v1']);
897+
expect(
898+
kernel1.getSystemSubclusterRoot('testSystemSubcluster'),
899+
).toBeDefined();
900+
901+
// Stop kernel
902+
await kernel1.stop();
903+
904+
// Clear spies to track what happens on restart
905+
launchWorkerMock.mockClear();
906+
makeVatHandleMock.mockClear();
907+
908+
// Restart kernel WITHOUT the system subcluster config
909+
const kernel2 = await Kernel.make(mockPlatformServices, db, {
910+
systemSubclusters: [], // No system subclusters
911+
});
912+
913+
// The orphaned system subcluster should have been deleted without starting vats
914+
expect(launchWorkerMock).not.toHaveBeenCalled();
915+
expect(makeVatHandleMock).not.toHaveBeenCalled();
916+
expect(kernel2.getSubclusters()).toHaveLength(0);
917+
expect(kernel2.getVatIds()).toStrictEqual([]);
918+
expect(
919+
kernel2.getSystemSubclusterRoot('testSystemSubcluster'),
920+
).toBeUndefined();
921+
});
922+
});
923+
876924
describe('revoke and isRevoked', () => {
877925
it('reflect when an object is revoked', async () => {
878926
const kernel = await Kernel.make(

packages/ocap-kernel/src/Kernel.ts

Lines changed: 88 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ export class Kernel {
5454
/** Stores bootstrap root krefs of launched system subclusters */
5555
readonly #systemSubclusterRoots: Map<string, KRef> = new Map();
5656

57+
/** Whether the kernel facet has been registered */
58+
#kernelFacetRegistered = false;
59+
5760
/** Manages remote kernel connections */
5861
readonly #remoteManager: RemoteManager;
5962

@@ -222,19 +225,25 @@ export class Kernel {
222225
async #init(
223226
systemSubclusterConfigs?: SystemSubclusterConfig[],
224227
): Promise<void> {
228+
const configs = systemSubclusterConfigs ?? [];
229+
225230
// Set up the remote message handler
226231
this.#remoteManager.setMessageHandler(
227232
async (from: string, message: string) =>
228233
this.#remoteManager.handleRemoteMessage(from, message),
229234
);
230235

236+
// Handle persisted system subclusters before initializing vats:
237+
// - Delete orphaned ones (no longer in config) so their vats aren't started
238+
// - Restore mappings for existing ones (registers kernelFacet if needed)
239+
this.#handlePersistedSystemSubclusters(configs);
240+
231241
// Start all vats that were previously running before starting the queue
232242
// This ensures that any messages in the queue have their target vats ready
233243
await this.#vatManager.initializeAllVats();
234244

235245
// Start the kernel queue processing (non-blocking)
236246
// This runs for the entire lifetime of the kernel
237-
// Must start before initializing system subclusters since launchSubcluster awaits bootstrap results
238247
this.#kernelQueue
239248
.run(this.#kernelRouter.deliver.bind(this.#kernelRouter))
240249
.catch((error) => {
@@ -245,67 +254,101 @@ export class Kernel {
245254
// Don't re-throw to avoid unhandled rejection in this long-running task
246255
});
247256

248-
// Initialize system subclusters after queue is running
249-
await this.#initSystemSubclusters(systemSubclusterConfigs ?? []);
257+
// Launch any NEW system subclusters (requires queue to be running)
258+
// It's safe to do this last because messages and existing vats cannot possibly
259+
// be targeting these new subclusters.
260+
await this.#launchNewSystemSubclusters(configs);
250261
}
251262

252263
/**
253-
* Initialize system subclusters.
254-
* For existing system subclusters (from a previous session), restore their
255-
* root references from persistence. For new system subclusters, launch them.
264+
* Handle persisted system subclusters before vat initialization.
265+
* - Deletes orphaned subclusters (no longer in config) so their vats aren't started
266+
* - Restores mappings for existing subclusters (registers kernelFacet if needed)
256267
*
257268
* @param configs - Array of system subcluster configurations.
258269
*/
259-
async #initSystemSubclusters(
260-
configs: SystemSubclusterConfig[],
261-
): Promise<void> {
262-
// First, restore persisted system subcluster mappings to #systemSubclusterRoots
270+
#handlePersistedSystemSubclusters(configs: SystemSubclusterConfig[]): void {
263271
const persistedMappings =
264272
this.#kernelStore.getAllSystemSubclusterMappings();
265273

266-
// Early return if no system subclusters to handle
267-
if (persistedMappings.size === 0 && configs.length === 0) {
274+
if (persistedMappings.size === 0) {
268275
return;
269276
}
270277

271-
// Ensure kernel facet is registered before initializing system subclusters
272-
this.#registerKernelFacet();
278+
const configNames = new Set(configs.map((config) => config.name));
279+
280+
// Track if we have any valid persisted subclusters to restore
281+
let hasValidPersistedSubclusters = false;
273282

274283
for (const [name, subclusterId] of persistedMappings) {
284+
if (!configNames.has(name)) {
285+
// This system subcluster no longer has a config - delete it
286+
this.#logger.info(
287+
`System subcluster "${name}" no longer in config, deleting`,
288+
);
289+
this.#subclusterManager.deleteSubcluster(subclusterId);
290+
this.#kernelStore.deleteSystemSubclusterMapping(name);
291+
continue;
292+
}
293+
294+
// Subcluster has a config - try to restore it
275295
const subcluster = this.getSubcluster(subclusterId);
276-
if (subcluster) {
277-
// Subcluster exists - get its bootstrap root from the already-resuscitated vats
278-
const bootstrapVatId = subcluster.vats[0]; // bootstrap vat is first
279-
if (bootstrapVatId) {
280-
const rootKref = this.#kernelStore.getRootObject(bootstrapVatId);
281-
if (rootKref) {
282-
this.#systemSubclusterRoots.set(name, rootKref);
283-
this.#logger.info(`Restored system subcluster "${name}"`);
284-
} else {
285-
this.#logger.warn(
286-
`System subcluster "${name}" has no root object, will relaunch`,
287-
);
288-
// Clean up the stale mapping
289-
this.#kernelStore.deleteSystemSubclusterMapping(name);
290-
}
291-
}
292-
} else {
293-
// Subcluster no longer exists - clean up the stale mapping
296+
if (!subcluster) {
294297
this.#logger.warn(
295298
`System subcluster "${name}" mapping points to non-existent subcluster ${subclusterId}, cleaning up`,
296299
);
297300
this.#kernelStore.deleteSystemSubclusterMapping(name);
301+
continue;
302+
}
303+
304+
const bootstrapVatId = subcluster.vats[0];
305+
if (!bootstrapVatId) {
306+
continue;
298307
}
299-
}
300308

301-
// Then, launch any NEW system subclusters not already persisted
302-
for (const { name, config } of configs) {
303-
if (this.#systemSubclusterRoots.has(name)) {
304-
// Already restored from persistence - skip
309+
const rootKref = this.#kernelStore.getRootObject(bootstrapVatId);
310+
if (!rootKref) {
311+
this.#logger.warn(
312+
`System subcluster "${name}" has no root object, will relaunch`,
313+
);
314+
this.#kernelStore.deleteSystemSubclusterMapping(name);
305315
continue;
306316
}
307317

308-
// New system subcluster - launch it
318+
// Valid subcluster to restore - register kernelFacet on first valid one
319+
if (!hasValidPersistedSubclusters) {
320+
this.#ensureKernelFacetRegistered();
321+
hasValidPersistedSubclusters = true;
322+
}
323+
324+
this.#systemSubclusterRoots.set(name, rootKref);
325+
this.#logger.info(`Restored system subcluster "${name}"`);
326+
}
327+
}
328+
329+
/**
330+
* Launch new system subclusters that aren't already in persistence.
331+
* This must be called after the kernel queue is running since launchSubcluster
332+
* sends bootstrap messages.
333+
*
334+
* @param configs - Array of system subcluster configurations.
335+
*/
336+
async #launchNewSystemSubclusters(
337+
configs: SystemSubclusterConfig[],
338+
): Promise<void> {
339+
// Filter to only configs that weren't restored from persistence
340+
const newConfigs = configs.filter(
341+
({ name }) => !this.#systemSubclusterRoots.has(name),
342+
);
343+
344+
if (newConfigs.length === 0) {
345+
return;
346+
}
347+
348+
// Ensure kernelFacet is registered for new system subclusters
349+
this.#ensureKernelFacetRegistered();
350+
351+
for (const { name, config } of newConfigs) {
309352
const result = await this.launchSubcluster(config);
310353
this.#systemSubclusterRoots.set(name, result.bootstrapRootKref);
311354

@@ -318,8 +361,13 @@ export class Kernel {
318361

319362
/**
320363
* Register the kernel facet as a kernel service for system subclusters.
364+
* This is idempotent - calling it multiple times has no effect after the first.
321365
*/
322-
#registerKernelFacet(): void {
366+
#ensureKernelFacetRegistered(): void {
367+
if (this.#kernelFacetRegistered) {
368+
return;
369+
}
370+
323371
const kernelFacet = makeKernelFacet({
324372
launchSubcluster: this.launchSubcluster.bind(this),
325373
terminateSubcluster: this.terminateSubcluster.bind(this),
@@ -332,6 +380,7 @@ export class Kernel {
332380
'kernelFacet',
333381
kernelFacet,
334382
);
383+
this.#kernelFacetRegistered = true;
335384
}
336385

337386
/**

packages/ocap-kernel/src/vats/SubclusterManager.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,28 @@ export class SubclusterManager {
183183
return this.#kernelStore.getSubclusterVats(subclusterId);
184184
}
185185

186+
/**
187+
* Deletes a subcluster and its vat data from storage without terminating running vats.
188+
* This is used for cleaning up orphaned subclusters before vats are started.
189+
*
190+
* @param subclusterId - The ID of the subcluster to delete.
191+
*/
192+
deleteSubcluster(subclusterId: string): void {
193+
const subcluster = this.#kernelStore.getSubcluster(subclusterId);
194+
if (!subcluster) {
195+
return;
196+
}
197+
198+
// Delete vat configs and mark vats as terminated so their data will be cleaned up
199+
for (const vatId of subcluster.vats) {
200+
this.#kernelStore.deleteVatConfig(vatId);
201+
this.#kernelStore.markVatAsTerminated(vatId);
202+
}
203+
204+
// Delete the subcluster record
205+
this.#kernelStore.deleteSubcluster(subclusterId);
206+
}
207+
186208
/**
187209
* Launches all vats for a subcluster and sets up their bootstrap connections.
188210
*

0 commit comments

Comments
 (0)