Skip to content

Commit 3e37d50

Browse files
authored
channels: persistent intro-DM ledger for HTTP mode (H3 onboarding fix) (#119)
Production HTTP-mode tenants stayed at /health.onboarding="pending" forever and could re-DM the installer on every process restart. Both behaviours trace to the in-memory firstDmSent flag in SlackHttpChannel plus the fact that markOnboardingStarted only runs from the Socket Mode flow (gated on a channels.yaml that is never baked in HTTP-mode rootfs). Add an optional IntroductionLedger surface on SlackHttpChannel that production wires against the SQLite onboarding_state table. The receiver short-circuits the intro DM when the ledger reports sent and stamps it only AFTER a successful Slack send, so: - /health.onboarding flips to "complete" on the first successful DM - a process restart never re-DMs the installer - a transient Slack failure (rate limit, no ts returned, throw) leaves the row clear so the next process start retries The ledger is optional; tests and single-tenant dev fall back to the existing in-memory flag with the prior semantics. The SlackHttpChannel constructor stays compatible with all existing callers. Tests: 7 new ledger-path cases on slack-http-receiver plus a slack-channel-factory regression that pins the introductionLedger forward, all green. Full suite 2316 pass / 0 fail. Refs: real-vs-fake audit v2 finding H3
1 parent 50a3fd4 commit 3e37d50

5 files changed

Lines changed: 272 additions & 17 deletions

File tree

src/channels/__tests__/slack-channel-factory.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,43 @@ describe("createSlackChannel", () => {
251251
expect(ch).toBeInstanceOf(SlackHttpChannel);
252252
});
253253

254+
test("transport=http forwards introductionLedger to the SlackHttpChannel", async () => {
255+
// H3 fix regression guard: the SQLite-backed intro-DM ledger must
256+
// reach the channel constructor so connect() can short-circuit on
257+
// a process restart and stamp /health=onboarding-complete on a
258+
// successful send. A future refactor that drops the forward
259+
// breaks this test loud; without it, a tenant whose first DM
260+
// failed would silently re-fire (or never fire) on restart.
261+
const idFetcher = { get: () => Promise.resolve(HTTP_IDENTITY) };
262+
const secFetcher = makeSecretFetcher();
263+
let isCalled = 0;
264+
let markCalled = 0;
265+
const ledger = {
266+
isIntroSent: () => {
267+
isCalled++;
268+
return true;
269+
},
270+
markIntroSent: () => {
271+
markCalled++;
272+
},
273+
};
274+
const ch = await createSlackChannel({
275+
transport: "http",
276+
channelsConfig: null,
277+
identityFetcher: idFetcher,
278+
secretsFetcher: secFetcher,
279+
introductionLedger: ledger,
280+
});
281+
// Calling connect() exercises the ledger path. The mocked Bolt
282+
// client (loaded by this test's @slack/bolt mock) makes auth.test
283+
// resolve immediately; isIntroSent() === true short-circuits the
284+
// intro DM so we know the ledger reached the channel.
285+
expect(ch).toBeInstanceOf(SlackHttpChannel);
286+
await (ch as InstanceType<typeof SlackHttpChannel>).connect();
287+
expect(isCalled).toBeGreaterThanOrEqual(1);
288+
expect(markCalled).toBe(0);
289+
});
290+
254291
test("makeSecretFetcher fails-loud when production asks for an unknown name", async () => {
255292
// This is the audit Finding 1 regression guard. If the production code
256293
// in slack-channel-factory.ts ever drifts to ask for a different name

src/channels/__tests__/slack-http-receiver.test.ts

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ mock.module("@slack/bolt", () => ({
8383

8484
// Import the channel AFTER the module mock so the constructor uses our doubles.
8585
const { SlackHttpChannel } = await import("../slack-http-receiver.ts");
86+
type IntroductionLedger = import("../slack-http-receiver.ts").IntroductionLedger;
8687

8788
const SECRET = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
8889
const TEAM_ID = "T9TK3CUKW";
@@ -778,6 +779,142 @@ describe("synthetic first DM on connect", () => {
778779
});
779780
});
780781

782+
// ----- introductionLedger (persistent intro-DM idempotency) ----------------
783+
//
784+
// H3 root-cause fix: the in-memory `firstDmSent` flag lost its state on
785+
// every process restart, so a tenant whose first intro DM failed (or
786+
// silently never fired) had no record of the attempt. Phantom Cloud
787+
// production also lacked any signal in `/health` that the intro had
788+
// happened: `onboarding` stayed at "pending" because `markOnboardingStarted`
789+
// only ran from the Socket Mode `startOnboarding` path which is gated on a
790+
// `channels.yaml` that is never baked in HTTP-mode rootfs.
791+
//
792+
// The ledger surface fixes both legs in one place. Production wires it
793+
// against the SQLite `onboarding_state` table; these tests use a tiny
794+
// in-memory recorder that pins the contract:
795+
// 1. isIntroSent() === true short-circuits before any Slack call.
796+
// 2. markIntroSent() fires only AFTER a successful sendIntroductionDm.
797+
// 3. A throwing markIntroSent does not derail connect().
798+
799+
describe("introductionLedger persistent idempotency", () => {
800+
type LedgerStub = {
801+
state: { sent: boolean };
802+
isCalled: { isIntroSent: number; markIntroSent: number };
803+
isIntroSent: () => boolean;
804+
markIntroSent: () => void;
805+
};
806+
function makeLedger(initial: boolean): LedgerStub {
807+
const state = { sent: initial };
808+
const isCalled = { isIntroSent: 0, markIntroSent: 0 };
809+
return {
810+
state,
811+
isCalled,
812+
isIntroSent: () => {
813+
isCalled.isIntroSent++;
814+
return state.sent;
815+
},
816+
markIntroSent: () => {
817+
isCalled.markIntroSent++;
818+
state.sent = true;
819+
},
820+
};
821+
}
822+
823+
test("skips intro DM when ledger already records intro_sent (process restart case)", async () => {
824+
const ledger = makeLedger(true);
825+
const channel = new SlackHttpChannel({ ...baseConfig, introductionLedger: ledger });
826+
await channel.connect();
827+
const calls = mockPostMessage.mock.calls as unknown as Array<[{ channel?: string }]>;
828+
const introCalls = calls.filter((c) => c[0].channel === "D_DM_OPEN").length;
829+
expect(introCalls).toBe(0);
830+
expect(ledger.isCalled.isIntroSent).toBeGreaterThanOrEqual(1);
831+
expect(ledger.isCalled.markIntroSent).toBe(0);
832+
});
833+
834+
test("fires intro DM and stamps ledger on a successful send (first boot)", async () => {
835+
const ledger = makeLedger(false);
836+
const channel = new SlackHttpChannel({ ...baseConfig, introductionLedger: ledger });
837+
await channel.connect();
838+
const calls = mockPostMessage.mock.calls as unknown as Array<[{ channel?: string }]>;
839+
const introCalls = calls.filter((c) => c[0].channel === "D_DM_OPEN").length;
840+
expect(introCalls).toBe(1);
841+
expect(ledger.isCalled.markIntroSent).toBe(1);
842+
expect(ledger.state.sent).toBe(true);
843+
});
844+
845+
test("does NOT stamp ledger when sendIntroductionDm fails (transient Slack error preserves retry budget)", async () => {
846+
// Slack rate-limit on chat.postMessage. The DM never reaches the
847+
// user; the ledger must stay clear so a process restart retries.
848+
mockPostMessage.mockImplementation(() => Promise.reject(new Error("ratelimited")));
849+
const ledger = makeLedger(false);
850+
const channel = new SlackHttpChannel({ ...baseConfig, introductionLedger: ledger });
851+
await channel.connect();
852+
expect(channel.getConnectionState()).toBe("connected");
853+
expect(ledger.state.sent).toBe(false);
854+
expect(ledger.isCalled.markIntroSent).toBe(0);
855+
mockPostMessage.mockImplementation(() => Promise.resolve({ ts: "1234567890.123456" }));
856+
});
857+
858+
test("does NOT stamp ledger when chat.postMessage returns no ts (ledger preserves retry across restart)", async () => {
859+
mockPostMessage.mockImplementationOnce(() => Promise.resolve({ ts: "" } as { ts: string }));
860+
const ledger = makeLedger(false);
861+
const channel = new SlackHttpChannel({ ...baseConfig, introductionLedger: ledger });
862+
await channel.connect();
863+
expect(ledger.state.sent).toBe(false);
864+
expect(ledger.isCalled.markIntroSent).toBe(0);
865+
});
866+
867+
test("ledger stamp survives reconnect: a subsequent connect() does not re-DM", async () => {
868+
const ledger = makeLedger(false);
869+
const channel = new SlackHttpChannel({ ...baseConfig, introductionLedger: ledger });
870+
await channel.connect();
871+
await channel.disconnect();
872+
mockPostMessage.mockClear();
873+
await channel.connect();
874+
const calls = mockPostMessage.mock.calls as unknown as Array<[{ channel?: string }]>;
875+
const introCalls = calls.filter((c) => c[0].channel === "D_DM_OPEN").length;
876+
expect(introCalls).toBe(0);
877+
});
878+
879+
test("a throwing markIntroSent does not derail connect() (defense in depth)", async () => {
880+
const ledger: IntroductionLedger = {
881+
isIntroSent: () => false,
882+
markIntroSent: () => {
883+
throw new Error("disk full");
884+
},
885+
};
886+
const warns: string[] = [];
887+
const original = console.warn;
888+
console.warn = (...args: unknown[]) => {
889+
warns.push(args.map(String).join(" "));
890+
};
891+
try {
892+
const channel = new SlackHttpChannel({ ...baseConfig, introductionLedger: ledger });
893+
await expect(channel.connect()).resolves.toBeUndefined();
894+
expect(channel.getConnectionState()).toBe("connected");
895+
} finally {
896+
console.warn = original;
897+
}
898+
const all = warns.join("\n");
899+
expect(all).toContain("markIntroSent failed");
900+
expect(all).toContain("disk full");
901+
});
902+
903+
test("without a ledger (single-tenant dev) the in-memory firstDmSent fallback is used", async () => {
904+
// No introductionLedger on baseConfig -> reconnect should still
905+
// short-circuit via the in-memory flag. This pins the back-compat
906+
// surface for self-hosted Socket Mode and unit-test fixtures.
907+
const channel = new SlackHttpChannel(baseConfig);
908+
await channel.connect();
909+
await channel.disconnect();
910+
mockPostMessage.mockClear();
911+
await channel.connect();
912+
const calls = mockPostMessage.mock.calls as unknown as Array<[{ channel?: string }]>;
913+
const introCalls = calls.filter((c) => c[0].channel === "D_DM_OPEN").length;
914+
expect(introCalls).toBe(0);
915+
});
916+
});
917+
781918
// ----- send and outbound API ----------------------------------------------
782919

783920
describe("send / outbound", () => {

src/channels/slack-channel-factory.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import { DEFAULT_METADATA_BASE_URL, MetadataIdentityFetcher, type SlackIdentity } from "../config/identity-fetcher.ts";
77
import { MetadataSecretFetcher } from "../config/metadata-fetcher.ts";
88
import type { ChannelsConfig } from "../config/schemas.ts";
9-
import { SlackHttpChannel } from "./slack-http-receiver.ts";
9+
import { type IntroductionLedger, SlackHttpChannel } from "./slack-http-receiver.ts";
1010
import type { SlackMetricsEmitter } from "./slack-metrics.ts";
1111
import type { SlackTransport } from "./slack-transport.ts";
1212
import { SlackChannel } from "./slack.ts";
@@ -68,6 +68,15 @@ export type CreateSlackChannelInput = {
6868
* follow-up generalizes the surface (Phase 17).
6969
*/
7070
metrics?: SlackMetricsEmitter;
71+
/**
72+
* Persistent intro-DM ledger forwarded to the HTTP receiver. The
73+
* receiver consults it before firing the synthetic first DM so a
74+
* process restart does not re-DM the installer, and stamps it on a
75+
* successful send so /health reports onboarding complete. Socket
76+
* Mode does not use this surface (its onboarding lives in
77+
* `src/onboarding/flow.ts`).
78+
*/
79+
introductionLedger?: IntroductionLedger;
7180
};
7281

7382
/**
@@ -125,6 +134,7 @@ export async function createSlackChannel(input: CreateSlackChannelInput): Promis
125134
teamId: identity.slack.teamId,
126135
installerUserId: identity.slack.installerUserId,
127136
teamName: identity.slack.teamName,
137+
introductionLedger: input.introductionLedger,
128138
});
129139
}
130140

src/channels/slack-http-receiver.ts

Lines changed: 73 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,42 @@ import { redactTokens } from "./slack-http-utils.ts";
4343
import { sendIntroductionDm } from "./slack-introduction.ts";
4444
import type { Channel, ChannelCapabilities, InboundMessage, OutboundMessage, SentMessage } from "./types.ts";
4545

46+
/**
47+
* Persistent introduction ledger surface. The HTTP receiver consults
48+
* `isIntroSent` before firing the synthetic first DM and calls
49+
* `markIntroSent` only AFTER a successful Slack send. The wiring lives
50+
* in `index.ts` against the SQLite `onboarding_state` table; this
51+
* indirection keeps the channel free of a direct SQLite handle so the
52+
* unit tests do not have to spin up a database.
53+
*
54+
* Two outcomes the ledger pins:
55+
* 1. `/health.onboarding` flips from "pending" to "complete" after the
56+
* first successful intro DM, mirroring the Socket Mode flow.
57+
* 2. A process restart never re-fires the intro DM once it has gone
58+
* out (the in-memory `firstDmSent` flag was lost on restart and
59+
* could re-DM the user; SQLite is now authoritative).
60+
*
61+
* Both callbacks are optional: when omitted (single-tenant dev / tests),
62+
* the channel falls back to the in-memory `firstDmSent` flag with the
63+
* same semantics it had before the ledger was introduced.
64+
*/
65+
export type IntroductionLedger = {
66+
isIntroSent(): boolean;
67+
markIntroSent(): void;
68+
};
69+
4670
export type SlackHttpChannelConfig = {
4771
botToken: string;
4872
gatewaySigningSecret: string;
4973
teamId: string;
5074
installerUserId: string;
5175
teamName: string;
76+
/**
77+
* Persistent intro-DM ledger. Production wires this against
78+
* `onboarding_state` so the /health endpoint reports onboarding
79+
* complete and a process restart does not re-DM the installer.
80+
*/
81+
introductionLedger?: IntroductionLedger;
5282
};
5383

5484
type ConnectionState = "disconnected" | "connecting" | "connected" | "error";
@@ -79,11 +109,12 @@ export class SlackHttpChannel implements Channel, EventDispatchHost {
79109
private connectionState: ConnectionState = "disconnected";
80110
private botUserId: string | null = null;
81111
private phantomName = "Phantom";
82-
// Instance-level guard against re-introducing the agent after a
83-
// transient disconnect plus reconnect. A process restart resets the
84-
// flag intentionally: a fresh user-visible DM beats a silent UX
85-
// failure when the operator has had to restart the channel.
112+
// In-memory fallback when no persistent ledger is wired (tests,
113+
// single-tenant dev). Production reads/writes the SQLite ledger
114+
// passed in via `introductionLedger`; once the ledger is set it
115+
// is the authoritative source and this flag is unused.
86116
private firstDmSent = false;
117+
private readonly introductionLedger: IntroductionLedger | null;
87118

88119
constructor(config: SlackHttpChannelConfig) {
89120
if (!config.botToken) throw new Error("SlackHttpChannel: botToken is required");
@@ -94,6 +125,7 @@ export class SlackHttpChannel implements Channel, EventDispatchHost {
94125
this.installerUserId = config.installerUserId;
95126
this.teamName = config.teamName;
96127
this.gatewaySigningSecret = config.gatewaySigningSecret;
128+
this.introductionLedger = config.introductionLedger ?? null;
97129

98130
// Bolt's `App` constructor calls `receiver.init(app)` synchronously
99131
// to wire the App reference; we reuse that App for processEvent
@@ -201,17 +233,43 @@ export class SlackHttpChannel implements Channel, EventDispatchHost {
201233
}
202234

203235
// Synthetic first DM. Fire after the channel state flips to connected
204-
// so the user can reply immediately; gate on firstDmSent so a
205-
// reconnect-after-drop does not re-introduce.
206-
if (!this.firstDmSent && this.installerUserId) {
207-
const result = await sendIntroductionDm({
208-
phantomName: this.phantomName,
209-
teamName: this.teamName,
210-
installerUserId: this.installerUserId,
211-
sendDm: (userId, text) => this.sendDm(userId, text),
212-
});
213-
if (result.sent) {
214-
this.firstDmSent = true;
236+
// so the user can reply immediately. Idempotency:
237+
// - production wires `introductionLedger` against the SQLite
238+
// onboarding_state table; a true ledger means the DM has
239+
// already gone out (in this process or a prior restart) and
240+
// we skip without retrying. The ledger is also what /health
241+
// reads to flip onboarding from "pending" to "complete".
242+
// - without a ledger (tests, single-tenant dev) we fall back
243+
// to the in-memory `firstDmSent` flag; a restart re-fires.
244+
if (!this.installerUserId) {
245+
return;
246+
}
247+
if (this.introductionLedger?.isIntroSent() === true) {
248+
console.log(`[${LOG_TAG}] introduction ledger already records intro_sent; skipping intro DM`);
249+
return;
250+
}
251+
if (this.introductionLedger === null && this.firstDmSent) {
252+
return;
253+
}
254+
const result = await sendIntroductionDm({
255+
phantomName: this.phantomName,
256+
teamName: this.teamName,
257+
installerUserId: this.installerUserId,
258+
sendDm: (userId, text) => this.sendDm(userId, text),
259+
});
260+
if (result.sent) {
261+
this.firstDmSent = true;
262+
// The ledger is stamped only AFTER a successful Slack send so
263+
// a transient Slack failure leaves the row clear and the next
264+
// process start retries. This mirrors the Phase 12
265+
// firstboot_state pattern in the Socket Mode flow.
266+
if (this.introductionLedger !== null) {
267+
try {
268+
this.introductionLedger.markIntroSent();
269+
} catch (err: unknown) {
270+
const msg = err instanceof Error ? err.message : String(err);
271+
console.warn(`[${LOG_TAG}] introduction ledger markIntroSent failed: ${msg}`);
272+
}
215273
}
216274
}
217275
}

src/index.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ import { MemorySystem } from "./memory/system.ts";
5757
import { isFirstRun, isOnboardingInProgress } from "./onboarding/detection.ts";
5858
import { type OnboardingTarget, startOnboarding } from "./onboarding/flow.ts";
5959
import { buildOnboardingPrompt } from "./onboarding/prompt.ts";
60-
import { getOnboardingStatus } from "./onboarding/state.ts";
60+
import { getOnboardingStatus, markOnboardingComplete, markOnboardingStarted } from "./onboarding/state.ts";
6161
import { createRoleRegistry } from "./roles/registry.ts";
6262
import type { RoleTemplate } from "./roles/types.ts";
6363
import { Scheduler } from "./scheduler/service.ts";
@@ -364,11 +364,24 @@ async function main(): Promise<void> {
364364
// sees both metric families. Per-emitter registries keep names from
365365
// colliding across channels.
366366
setMetricsRegistryProvider(() => [slackMetrics.registry, emailMetrics.registry]);
367+
// HTTP-mode tenants need a persistent intro-DM ledger so /health
368+
// reports onboarding=complete after the first DM lands and a process
369+
// restart does not re-DM the installer. We bind it to the SQLite
370+
// `onboarding_state` table that the Socket Mode flow already drives.
371+
// The factory ignores this surface for the Socket Mode path; it is
372+
// only consumed by SlackHttpChannel.connect().
367373
const slackChannel: SlackTransport | null = await createSlackChannel({
368374
transport: slackTransport,
369375
channelsConfig,
370376
metadataBaseUrl: process.env.METADATA_BASE_URL,
371377
metrics: slackMetrics,
378+
introductionLedger: {
379+
isIntroSent: () => getOnboardingStatus(db).status === "complete",
380+
markIntroSent: () => {
381+
markOnboardingStarted(db);
382+
markOnboardingComplete(db);
383+
},
384+
},
372385
});
373386

374387
if (slackChannel) {

0 commit comments

Comments
 (0)