Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
09942e1
Merge pull request #404 from elizaOS/dev
lalalune Mar 25, 2026
0fd653a
Merge origin/main into validated dev promotion
lalalune Mar 25, 2026
d0acbcc
fix(cloud): set MILADY_CLOUD_PROVISIONED env var for cloud containers
0xSolace Mar 20, 2026
4cab4b8
fix: use agent:latest instead of hardcoded version
0xSolace Mar 28, 2026
5bc67b4
fix(milady): wallet proxy, Neon branches, remove provisioning cron, d…
0xSolace Mar 30, 2026
9ec941c
fix(ci): biome lint/format fixes - unused vars, imports, formatting
0xSolace Mar 30, 2026
4be0db1
fix(billing): reuse Settings billing tab on /dashboard/billing page
0xSolace Mar 30, 2026
f44f93f
fix(ci): resolve 24 unit test failures caused by bun mock.module poll…
0xSolace Mar 30, 2026
29dcca1
fix(ci): fix compat-envelope domain assertion and mcp-tools credits m…
0xSolace Mar 30, 2026
9513fe4
fix(security): address 4 critical review items - path validation, que…
0xSolace Mar 30, 2026
98adc83
fix: restore Neon project ID fallback with type annotation to fix TS …
0xSolace Mar 30, 2026
34cea41
fix(ci): add missing UsersRepository and writeTransaction exports to …
0xSolace Mar 30, 2026
43b3e43
fix(ci): add resetWhatsAppColumnSupportCacheForTests to UsersReposito…
0xSolace Mar 30, 2026
79afb80
fix(ci): preserve real UsersRepository class in mocks to fix downstre…
0xSolace Mar 30, 2026
914360a
fix: use correct Neon parent project ID accessible by API key
0xSolace Mar 30, 2026
9d576d9
fix(provisioning): disable sync provision in production - always use …
0xSolace Mar 30, 2026
b95040f
fix: use primary DB for findRunningSandbox to avoid read replica lag
0xSolace Mar 30, 2026
1a52ef0
fix: add detailed logging to wallet proxy for debugging 503s
0xSolace Mar 30, 2026
f8936d4
fix: use public agent domain (waifu.fun) for wallet proxy instead of …
0xSolace Mar 30, 2026
b68d97b
fix: completely disable Vercel provisioning cron - VPS worker only
0xSolace Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions app/api/cron/cleanup-stuck-provisioning/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/**
* Cleanup Stuck Provisioning Cron
*
* Detects and recovers agents that are stuck in "provisioning" status with no
* active job to drive them forward. This happens when:
*
* 1. A container crashes while the agent is running, and something (e.g.
* the Next.js sync provision path) sets status = 'provisioning' but
* never creates a jobs-table record.
* 2. A provision job is enqueued but the worker invocation dies before it
* can claim the record — in this case the job-recovery logic in
* process-provisioning-jobs will already handle it, but we add a belt-
* and-suspenders check here for the no-job case.
*
* Criteria for "stuck":
* - status = 'provisioning'
* - updated_at < NOW() - 10 minutes (well beyond any normal provision time)
* - no jobs row in ('pending', 'in_progress') whose data->>'agentId' matches
*
* Action: set status = 'error', write a descriptive error_message so the user
* can see what happened and re-provision.
*
* Schedule: every 5 minutes ("* /5 * * * *" in vercel.json)
* Protected by CRON_SECRET.
*/

import { and, eq, lt, sql } from "drizzle-orm";
import { NextRequest, NextResponse } from "next/server";
import { dbWrite } from "@/db/client";
import { jobs } from "@/db/schemas/jobs";
import { miladySandboxes } from "@/db/schemas/milady-sandboxes";
import { verifyCronSecret } from "@/lib/api/cron-auth";
import { logger } from "@/lib/utils/logger";

// Next.js route-segment config: run on the Node runtime, never cache cron
// responses, and allow up to 60s per invocation.
export const runtime = "nodejs";
export const dynamic = "force-dynamic";
export const maxDuration = 60;

/** How long an agent must sit in "provisioning" before we reset it (minutes). */
const STUCK_THRESHOLD_MINUTES = 10;

/** One reset-agent entry in the cron response payload. */
interface CleanupResult {
  agentId: string;
  agentName: string | null;
  organizationId: string;
  // Lower bound only — the cleanup UPDATE overwrites the row's updated_at,
  // so the true stuck duration may be longer than the threshold.
  stuckSinceMinutes: number;
}

/**
 * Finds agents stranded in "provisioning" with no live job to drive them
 * forward and flips them to "error" so their owners can re-provision.
 *
 * Auth: requires a valid CRON_SECRET (checked via verifyCronSecret).
 * Returns a JSON envelope with the count and identities of reset agents.
 */
async function handleCleanupStuckProvisioning(request: NextRequest) {
  try {
    const unauthorized = verifyCronSecret(request, "[Cleanup Stuck Provisioning]");
    if (unauthorized) return unauthorized;

    logger.info("[Cleanup Stuck Provisioning] Starting scan");

    // Rows untouched since this instant are considered stuck.
    const stuckCutoff = new Date(Date.now() - STUCK_THRESHOLD_MINUTES * 60 * 1000);

    // "No job is driving this agent": no pending/in_progress jobs row whose
    // data->>'agentId' points at the sandbox.
    const noActiveJob = sql`NOT EXISTS (
          SELECT 1 FROM ${jobs}
          WHERE ${jobs.data}->>'agentId' = ${miladySandboxes.id}::text
            AND ${jobs.status} IN ('pending', 'in_progress')
        )`;

    // Single UPDATE … RETURNING, issued through dbWrite so it lands on the
    // primary (not a read replica) and uses the write-path connection pool.
    const resetRows = await dbWrite
      .update(miladySandboxes)
      .set({
        status: "error",
        error_message:
          "Agent was stuck in provisioning state with no active provisioning job. " +
          "This usually means a container crashed before the provisioning job could be created, " +
          "or the job was lost. Please try starting the agent again.",
        updated_at: new Date(),
      })
      .where(
        and(
          eq(miladySandboxes.status, "provisioning"),
          lt(miladySandboxes.updated_at, stuckCutoff),
          noActiveJob,
        ),
      )
      .returning({
        agentId: miladySandboxes.id,
        agentName: miladySandboxes.agent_name,
        organizationId: miladySandboxes.organization_id,
        updatedAt: miladySandboxes.updated_at,
      });

    const cleaned: CleanupResult[] = resetRows.map((row) => ({
      agentId: row.agentId,
      agentName: row.agentName,
      organizationId: row.organizationId,
      // The UPDATE just overwrote updated_at, so the original stuck duration
      // is unrecoverable here — report the threshold as a lower bound.
      stuckSinceMinutes: STUCK_THRESHOLD_MINUTES,
    }));

    if (cleaned.length === 0) {
      logger.info("[Cleanup Stuck Provisioning] No stuck agents found");
    } else {
      logger.warn("[Cleanup Stuck Provisioning] Reset stuck agents", {
        count: cleaned.length,
        agents: cleaned.map(({ agentId, agentName, organizationId }) => ({
          agentId,
          agentName,
          organizationId,
        })),
      });
    }

    return NextResponse.json({
      success: true,
      data: {
        cleaned: cleaned.length,
        thresholdMinutes: STUCK_THRESHOLD_MINUTES,
        timestamp: new Date().toISOString(),
        agents: cleaned,
      },
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error("[Cleanup Stuck Provisioning] Failed:", message);

    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : "Cleanup failed",
      },
      { status: 500 },
    );
  }
}

/**
 * GET /api/cron/cleanup-stuck-provisioning
 * Cron endpoint — protected by CRON_SECRET (Vercel passes it automatically).
 */
export async function GET(req: NextRequest) {
  return handleCleanupStuckProvisioning(req);
}

/**
 * POST /api/cron/cleanup-stuck-provisioning
 * Manual trigger for testing — same CRON_SECRET requirement as GET.
 */
export async function POST(req: NextRequest) {
  return handleCleanupStuckProvisioning(req);
}
2 changes: 2 additions & 0 deletions app/api/stripe/create-checkout-session/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ const ALLOWED_ORIGINS = [
process.env.NEXT_PUBLIC_APP_URL,
"http://localhost:3000",
"http://localhost:3001",
"https://milady.ai",
"https://www.milady.ai",
].filter(Boolean) as string[];

// Configurable currency
Expand Down
2 changes: 1 addition & 1 deletion app/api/v1/admin/service-pricing/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import { NextRequest, NextResponse } from "next/server";
import { z } from "zod";
import { servicePricingRepository } from "@/db/repositories";
import { servicePricingRepository } from "@/db/repositories/service-pricing";
import { requireAdminWithResponse } from "@/lib/api/admin-auth";
import { invalidateServicePricingCache } from "@/lib/services/proxy/pricing";
import { logger } from "@/lib/utils/logger";
Expand Down
115 changes: 6 additions & 109 deletions app/api/v1/cron/process-provisioning-jobs/route.ts
Original file line number Diff line number Diff line change
@@ -1,115 +1,12 @@
import { timingSafeEqual } from "crypto";
import { NextRequest, NextResponse } from "next/server";
import { provisioningJobService } from "@/lib/services/provisioning-jobs";
import { logger } from "@/lib/utils/logger";
import { NextResponse } from "next/server";

// Route-segment config: never cache cron responses.
export const dynamic = "force-dynamic";
export const maxDuration = 120; // Provisioning can take up to ~90s per job

/**
 * Compares the caller's bearer token against CRON_SECRET in constant time.
 * Returns false (never throws) when the secret is unset, the header is
 * missing, or the values differ.
 */
function verifyCronSecret(request: NextRequest): boolean {
  const cronSecret = process.env.CRON_SECRET;
  if (!cronSecret) {
    logger.error("[Provisioning Jobs] CRON_SECRET not configured - rejecting request for security");
    return false;
  }

  const authHeader = request.headers.get("authorization");
  const provided = Buffer.from(authHeader?.replace("Bearer ", "") || "", "utf8");
  const expected = Buffer.from(cronSecret, "utf8");

  // timingSafeEqual requires equal-length buffers; rejecting on a length
  // mismatch is the canonical safe pattern (padding would leak nothing either,
  // but strict length equality avoids any ambiguity).
  return provided.length === expected.length && timingSafeEqual(provided, expected);
}

/**
 * Claims and executes pending provisioning jobs from the `jobs` table.
 *
 * Safe under overlapping cron invocations: JobsRepository claims rows with
 * FOR UPDATE SKIP LOCKED, so no job is processed twice. The service layer
 * also recovers stale jobs (stuck in_progress > 5 minutes) and retries them
 * with exponential backoff.
 *
 * Schedule: every minute (matches deployment-monitor), up to 5 jobs per run.
 */
async function handleProcessProvisioningJobs(request: NextRequest) {
  try {
    // Fail loudly when the deployment is missing its cron secret.
    if (!process.env.CRON_SECRET) {
      return NextResponse.json(
        {
          success: false,
          error: "Server configuration error: CRON_SECRET not set",
        },
        { status: 500 },
      );
    }

    if (!verifyCronSecret(request)) {
      logger.warn("[Provisioning Jobs] Unauthorized request", {
        ip: request.headers.get("x-forwarded-for"),
        timestamp: new Date().toISOString(),
      });
      return NextResponse.json({ success: false, error: "Unauthorized" }, { status: 401 });
    }

    logger.info("[Provisioning Jobs] Starting job processing cycle");

    const outcome = await provisioningJobService.processPendingJobs(5);

    // Stay quiet on empty cycles; only log when work was actually claimed.
    if (outcome.claimed > 0) {
      logger.info("[Provisioning Jobs] Processing complete", {
        claimed: outcome.claimed,
        succeeded: outcome.succeeded,
        failed: outcome.failed,
      });
    }

    return NextResponse.json({
      success: true,
      data: {
        ...outcome,
        timestamp: new Date().toISOString(),
      },
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error("[Provisioning Jobs] Failed:", message);

    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : "Provisioning job processing failed",
      },
      { status: 500 },
    );
  }
}

/**
 * GET /api/v1/cron/process-provisioning-jobs
 * Cron entry point — protected by CRON_SECRET.
 */
export async function GET(req: NextRequest) {
  return handleProcessProvisioningJobs(req);
}

/**
* POST /api/v1/cron/process-provisioning-jobs
* Protected by CRON_SECRET (for manual testing).
* DISABLED — Provisioning is handled exclusively by the standalone VPS worker.
* This route is kept as a no-op to prevent 404s from any lingering cron invocations.
* The VPS worker polls the jobs table directly and has SSH access to Docker nodes.
*/
export async function POST(request: NextRequest) {
return handleProcessProvisioningJobs(request);
export async function POST() {
return NextResponse.json({ success: true, message: "Provisioning handled by VPS worker", skipped: true });
}
Loading
Loading