Skip to content

Commit 91fe58c

Browse files
authored
ENG-1602: Add PDF extraction API route (#937)
* ENG-1602: Add PDF extraction API route — multi-provider (Anthropic, OpenAI, Gemini) endpoint for extracting discourse graph nodes from uploaded PDFs.
* ENG-1602: Unify provider configs for chat and extraction — widen Message.content to support multimodal content blocks and add systemPrompt/responseMimeType to Settings. Each provider's formatRequestBody now handles both text-only chat and PDF extraction, eliminating the parallel PROVIDERS block in the extraction route. OpenAI extraction switches from the Responses API to Chat Completions (which now supports PDF input). Gemini field casing fixed to match the REST API docs.
* ENG-1602: Enforce JSON schema output for all providers — add structured-output enforcement via each provider's native mechanism: Anthropic output_config, OpenAI response_format with strict mode, Gemini responseJsonSchema. Removes prompt-based JSON instructions and response-cleanup parsing, since constrained decoding guarantees valid JSON.
* ENG-1602: Use object destructuring for buildExtractionMessages — per AGENTS.md, functions with more than 2 parameters use named parameters via object destructuring.
* ENG-1602: Add eslint-disable for API-required snake_case fields — inline disables for response_format, json_schema (OpenAI), and output_config (Anthropic), which are external API contract names.
* ENG-1602: Fix Gemini text part format — Gemini parts use { text }, not { type: "text", text }; the shared textBlock was using the Anthropic/OpenAI format, which Gemini rejects.
1 parent ee245bb commit 91fe58c

File tree

6 files changed

+371
-4
lines changed

6 files changed

+371
-4
lines changed
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
import { NextRequest, NextResponse } from "next/server";
2+
import {
3+
ExtractionRequestSchema,
4+
EXTRACTION_RESULT_JSON_SCHEMA,
5+
type ExtractionResponse,
6+
type ProviderId,
7+
} from "~/types/extraction";
8+
import type { LLMProviderConfig, Message, Settings } from "~/types/llm";
9+
import {
10+
anthropicConfig,
11+
openaiConfig,
12+
geminiConfig,
13+
} from "~/utils/llm/providers";
14+
import {
15+
DEFAULT_EXTRACTION_PROMPT,
16+
buildUserPrompt,
17+
} from "~/prompts/extraction";
18+
import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
19+
20+
// Run on the Node.js runtime (not Edge). NOTE(review): presumably chosen for
// large base64 request bodies and full process.env access — confirm.
export const runtime = "nodejs";
// Route may run up to 300 s; the upstream fetch below aborts at 270 s,
// leaving headroom to serialize a structured error response.
export const maxDuration = 300;

// Lookup from a validated ProviderId to its shared provider config (URL
// builder, headers, request-body formatting, response-text extraction).
const PROVIDER_CONFIGS: Record<ProviderId, LLMProviderConfig> = {
  anthropic: anthropicConfig,
  openai: openaiConfig,
  gemini: geminiConfig,
};
28+
29+
const buildExtractionMessages = ({
30+
provider,
31+
pdfBase64,
32+
userPrompt,
33+
}: {
34+
provider: ProviderId;
35+
pdfBase64: string;
36+
userPrompt: string;
37+
}): Message[] => {
38+
switch (provider) {
39+
case "anthropic":
40+
return [
41+
{
42+
role: "user",
43+
content: [
44+
{
45+
type: "document",
46+
source: {
47+
type: "base64",
48+
media_type: "application/pdf", // eslint-disable-line @typescript-eslint/naming-convention
49+
data: pdfBase64,
50+
},
51+
},
52+
{ type: "text", text: userPrompt },
53+
],
54+
},
55+
];
56+
case "openai":
57+
return [
58+
{
59+
role: "user",
60+
content: [
61+
{
62+
type: "file",
63+
file: {
64+
filename: "paper.pdf",
65+
file_data: `data:application/pdf;base64,${pdfBase64}`, // eslint-disable-line @typescript-eslint/naming-convention
66+
},
67+
},
68+
{ type: "text", text: userPrompt },
69+
],
70+
},
71+
];
72+
case "gemini":
73+
return [
74+
{
75+
role: "user",
76+
content: [
77+
{
78+
inlineData: {
79+
mimeType: "application/pdf",
80+
data: pdfBase64,
81+
},
82+
},
83+
{ text: userPrompt },
84+
],
85+
},
86+
];
87+
}
88+
};
89+
90+
/**
 * POST handler: extracts discourse-graph nodes from an uploaded PDF by
 * calling the selected LLM provider with structured-output settings.
 *
 * Status codes on the ExtractionResponse union:
 * - 400: malformed JSON body, or request fails schema validation
 * - 500: provider API key not configured, or unexpected failure (timeout etc.)
 * - 502: provider API error, empty completion, or unparseable completion
 * - 200: { success: true, data } on success
 */
export const POST = async (
  request: NextRequest,
): Promise<NextResponse<ExtractionResponse>> => {
  // Parse the body explicitly so a malformed payload yields a clean 400
  // instead of an unhandled exception.
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return NextResponse.json(
      { success: false, error: "Invalid JSON body" },
      { status: 400 },
    );
  }

  const validated = ExtractionRequestSchema.safeParse(body);
  if (!validated.success) {
    return NextResponse.json(
      { success: false, error: validated.error.message },
      { status: 400 },
    );
  }

  const { pdfBase64, researchQuestion, model, provider, systemPrompt } =
    validated.data;

  const config = PROVIDER_CONFIGS[provider];
  // Keys are server-side only; each provider config names its own env var.
  const apiKey = process.env[config.apiKeyEnvVar];

  if (!apiKey) {
    return NextResponse.json(
      { success: false, error: `API key not configured for ${provider}.` },
      { status: 500 },
    );
  }

  const messages = buildExtractionMessages({
    provider,
    pdfBase64,
    userPrompt: buildUserPrompt(researchQuestion),
  });

  // Settings consumed by the provider's formatRequestBody. The caller may
  // override the system prompt; outputSchema drives each provider's
  // structured-output (constrained decoding) mode.
  const settings: Settings = {
    model,
    maxTokens: 16384,
    temperature: 0.6,
    systemPrompt: systemPrompt ?? DEFAULT_EXTRACTION_PROMPT,
    outputSchema: EXTRACTION_RESULT_JSON_SCHEMA,
  };

  // Some provider configs derive the URL from settings (e.g. Gemini embeds
  // the model in the path); others use a fixed string.
  const apiUrl =
    typeof config.apiUrl === "function"
      ? config.apiUrl(settings)
      : config.apiUrl;

  try {
    // Abort at 270 s — under the route's 300 s maxDuration, so we can still
    // return a structured error instead of being killed by the platform.
    const response = await fetch(apiUrl, {
      method: "POST",
      headers: config.apiHeaders(apiKey),
      body: JSON.stringify(config.formatRequestBody(messages, settings)),
      signal: AbortSignal.timeout(270_000),
    });

    if (!response.ok) {
      // Best-effort read of the provider's error body; truncated so the
      // client-facing message stays bounded.
      const errorText = await response.text().catch(() => "");
      return NextResponse.json(
        {
          success: false,
          error: `${provider} API error (${response.status}): ${errorText.slice(0, 200)}`,
        },
        { status: 502 },
      );
    }

    const responseData: unknown = await response.json();
    // Provider-specific extraction of the completion text from the payload.
    const rawText = config.extractResponseText(responseData);

    if (!rawText) {
      return NextResponse.json(
        { success: false, error: `Empty response from ${provider}` },
        { status: 502 },
      );
    }

    let result;
    try {
      result = parseExtractionResponse(rawText);
    } catch (parseError) {
      // SyntaxError means JSON.parse failed; anything else (zod validation)
      // means the JSON was valid but had the wrong shape.
      const message =
        parseError instanceof SyntaxError
          ? "LLM returned invalid JSON"
          : "LLM returned unexpected response structure";
      return NextResponse.json(
        {
          success: false,
          error: `Failed to parse extraction response — ${message}`,
        },
        { status: 502 },
      );
    }

    return NextResponse.json({ success: true, data: result });
  } catch (error) {
    // Network failures, the 270 s timeout, and JSON-decode errors on the
    // provider response all land here.
    const message =
      error instanceof Error
        ? `Extraction failed — ${error.message}`
        : "Extraction failed";
    console.error("AI extraction failed:", error);
    return NextResponse.json(
      { success: false, error: message },
      { status: 500 },
    );
  }
};
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/**
 * Default system prompt for PDF → discourse-graph node extraction. Defines
 * the six node types, the quality bar, and a worked example. The example's
 * JSON field names (nodeType / content / supportSnippet / sourceSection)
 * must stay consistent with ExtractedNodeSchema — keep them in sync.
 * Callers may override this via the request's optional systemPrompt.
 */
export const DEFAULT_EXTRACTION_PROMPT = `You are a research analyst extracting discourse graph nodes from academic papers.

Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.

## Node Types

- **Evidence**: A discrete observation from a published dataset or experiment cited in the paper (prior work). Past tense. Includes observable, model system, method. Quantitative details when available.
- **Claim**: An interpretive assertion by the authors. Debatable — goes beyond data to state what it means. Specific enough to test or argue against.
- **Question**: A research question — explicitly stated or implied by a gap in the literature. Open-ended.
- **Result**: A discrete observation from this paper's own experiments. Same structure as Evidence but from the current work, not prior studies. Past tense.
- **Theory**: A theoretical framework or model used or proposed. Name it, state its core proposition.
- **Source**: A cited publication. Author(s) and year.

## Quality

- Atomic: one idea per node. Split compound sentences.
- Self-contained: understandable without the paper.
- Faithful: no inference or editorializing.
- Specific: "X reduced Y by 43% in Z" not "X was effective."
- 8–25 nodes. Quality over quantity. Cover all sections.
- Evidence = prior work cited. Result = this paper's experiments.

## Example

Excerpt (Results):
"CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001). This correlated with elevated CD62L and CCR7 (Fig 3B), suggesting a memory-like phenotype resisting exhaustion."

{
  "nodes": [
    {
      "nodeType": "Result",
      "content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
      "supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
      "sourceSection": "Results"
    },
    {
      "nodeType": "Result",
      "content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
      "supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
      "sourceSection": "Results"
    },
    {
      "nodeType": "Claim",
      "content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
      "supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
      "sourceSection": "Results"
    }
  ]
}`;
50+
51+
export const buildUserPrompt = (researchQuestion?: string): string => {
52+
let prompt = "Extract discourse graph nodes from the attached paper.";
53+
54+
if (researchQuestion) {
55+
prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
56+
}
57+
58+
return prompt;
59+
};
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { z } from "zod";
2+
3+
/** Canonical list of LLM providers supported by the extraction route. */
export const PROVIDER_IDS = ["anthropic", "openai", "gemini"] as const;

/** Provider identifier union, derived from PROVIDER_IDS. */
export type ProviderId = (typeof PROVIDER_IDS)[number];

// Zod schema values are intentionally PascalCase (type-like), hence the
// naming-convention disables throughout this module.
// eslint-disable-next-line @typescript-eslint/naming-convention
export const ExtractedNodeSchema = z.object({
  nodeType: z.string(), // discourse node kind, e.g. "Claim", "Evidence", "Result"
  content: z.string(), // self-contained statement of the node's single idea
  supportSnippet: z.string(), // verbatim excerpt from the paper backing the node
  sourceSection: z.string().nullable(), // paper section name, or null when unknown
});

/** One extracted discourse-graph node. */
export type ExtractedNode = z.infer<typeof ExtractedNodeSchema>;

// eslint-disable-next-line @typescript-eslint/naming-convention
export const ExtractionResultSchema = z.object({
  nodes: z.array(ExtractedNodeSchema),
});

/** Full LLM extraction payload: the list of extracted nodes. */
export type ExtractionResult = z.infer<typeof ExtractionResultSchema>;

// eslint-disable-next-line @typescript-eslint/naming-convention
export const ExtractionRequestSchema = z.object({
  // Raw PDF as base64. The 44M-char cap bounds request size (≈33 MB binary).
  pdfBase64: z.string().min(1).max(44_000_000),
  provider: z.enum(PROVIDER_IDS),
  model: z.string().min(1),
  researchQuestion: z.string().optional(), // optional focus for the extraction
  systemPrompt: z.string().optional(), // overrides DEFAULT_EXTRACTION_PROMPT when set
});

/** Validated request body for the PDF extraction route. */
export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>;

/**
 * Hand-written JSON Schema mirror of ExtractionResultSchema, handed to each
 * provider's structured-output mechanism (constrained decoding). Keep in
 * sync with the zod schema above.
 */
export const EXTRACTION_RESULT_JSON_SCHEMA: Record<string, unknown> = {
  type: "object",
  properties: {
    nodes: {
      type: "array",
      items: {
        type: "object",
        properties: {
          nodeType: { type: "string" },
          content: { type: "string" },
          supportSnippet: { type: "string" },
          sourceSection: { type: ["string", "null"] },
        },
        required: ["nodeType", "content", "supportSnippet", "sourceSection"],
        additionalProperties: false,
      },
    },
  },
  required: ["nodes"],
  additionalProperties: false,
};

/** Discriminated union returned by the extraction route (on success/error). */
export type ExtractionResponse =
  | { success: true; data: ExtractionResult }
  | { success: false; error: string };

apps/website/app/types/llm.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1+
export type ContentBlock = Record<string, unknown>;
2+
13
export type Message = {
24
role: string;
3-
content: string;
5+
content: string | ContentBlock[];
46
};
57

68
export type Settings = {
79
model: string;
810
maxTokens: number;
911
temperature: number;
12+
systemPrompt?: string;
13+
outputSchema?: Record<string, unknown>;
1014
safetySettings?: Array<{
1115
category: string;
1216
threshold: string;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import {
2+
ExtractionResultSchema,
3+
type ExtractionResult,
4+
} from "~/types/extraction";
5+
6+
/**
 * Parses the raw LLM completion text into a validated ExtractionResult.
 *
 * @throws {SyntaxError} when `raw` is not valid JSON.
 * @throws ZodError when the JSON does not match ExtractionResultSchema —
 *   callers can distinguish the two failure modes via `instanceof`.
 */
export const parseExtractionResponse = (raw: string): ExtractionResult => {
  const parsed: unknown = JSON.parse(raw);
  return ExtractionResultSchema.parse(parsed);
};

0 commit comments

Comments
 (0)