Skip to content

Commit 91fe58c

Browse files
authored
ENG-1602: Add PDF extraction API route (#937)
* ENG-1602: Add PDF extraction API route — multi-provider (Anthropic, OpenAI, Gemini) endpoint for extracting discourse graph nodes from uploaded PDFs.
* ENG-1602: Unify provider configs for chat and extraction — widen Message.content to support multimodal content blocks and add systemPrompt/responseMimeType to Settings. Each provider's formatRequestBody now handles both text-only chat and PDF extraction, eliminating the parallel PROVIDERS block in the extraction route. OpenAI extraction switches from the Responses API to Chat Completions (which now supports PDF input). Gemini field casing fixed to match the REST API docs.
* ENG-1602: Enforce JSON schema output for all providers — add structured-output enforcement via each provider's native mechanism: Anthropic output_config, OpenAI response_format with strict mode, Gemini responseJsonSchema. Removes prompt-based JSON instructions and response-cleanup parsing, since constrained decoding guarantees valid JSON.
* ENG-1602: Use object destructuring for buildExtractionMessages — per AGENTS.md, functions with more than 2 parameters use named parameters via object destructuring.
* ENG-1602: Add eslint-disable for API-required snake_case fields — inline disables for response_format, json_schema (OpenAI), and output_config (Anthropic), which are external API contract names.
* ENG-1602: Fix Gemini text part format — Gemini parts use { text }, not { type: "text", text }; the shared textBlock was using the Anthropic/OpenAI format, which Gemini rejects.
1 parent ee245bb commit 91fe58c

File tree

6 files changed

+371
-4
lines changed

6 files changed

+371
-4
lines changed
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
import { NextRequest, NextResponse } from "next/server";
2+
import {
3+
ExtractionRequestSchema,
4+
EXTRACTION_RESULT_JSON_SCHEMA,
5+
type ExtractionResponse,
6+
type ProviderId,
7+
} from "~/types/extraction";
8+
import type { LLMProviderConfig, Message, Settings } from "~/types/llm";
9+
import {
10+
anthropicConfig,
11+
openaiConfig,
12+
geminiConfig,
13+
} from "~/utils/llm/providers";
14+
import {
15+
DEFAULT_EXTRACTION_PROMPT,
16+
buildUserPrompt,
17+
} from "~/prompts/extraction";
18+
import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
19+
20+
// Run on the Node.js runtime (not Edge). NOTE(review): presumably chosen for
// large base64 request bodies and full process.env access — confirm.
export const runtime = "nodejs";
// Route may run up to 300 s; the upstream fetch below aborts at 270 s,
// leaving headroom to serialize a structured error response.
export const maxDuration = 300;

// Lookup from a validated ProviderId to its shared provider config (URL
// builder, headers, request-body formatting, response-text extraction).
const PROVIDER_CONFIGS: Record<ProviderId, LLMProviderConfig> = {
  anthropic: anthropicConfig,
  openai: openaiConfig,
  gemini: geminiConfig,
};
28+
29+
const buildExtractionMessages = ({
30+
provider,
31+
pdfBase64,
32+
userPrompt,
33+
}: {
34+
provider: ProviderId;
35+
pdfBase64: string;
36+
userPrompt: string;
37+
}): Message[] => {
38+
switch (provider) {
39+
case "anthropic":
40+
return [
41+
{
42+
role: "user",
43+
content: [
44+
{
45+
type: "document",
46+
source: {
47+
type: "base64",
48+
media_type: "application/pdf", // eslint-disable-line @typescript-eslint/naming-convention
49+
data: pdfBase64,
50+
},
51+
},
52+
{ type: "text", text: userPrompt },
53+
],
54+
},
55+
];
56+
case "openai":
57+
return [
58+
{
59+
role: "user",
60+
content: [
61+
{
62+
type: "file",
63+
file: {
64+
filename: "paper.pdf",
65+
file_data: `data:application/pdf;base64,${pdfBase64}`, // eslint-disable-line @typescript-eslint/naming-convention
66+
},
67+
},
68+
{ type: "text", text: userPrompt },
69+
],
70+
},
71+
];
72+
case "gemini":
73+
return [
74+
{
75+
role: "user",
76+
content: [
77+
{
78+
inlineData: {
79+
mimeType: "application/pdf",
80+
data: pdfBase64,
81+
},
82+
},
83+
{ text: userPrompt },
84+
],
85+
},
86+
];
87+
}
88+
};
89+
90+
/**
 * POST handler: extracts discourse-graph nodes from an uploaded PDF by
 * calling the selected LLM provider with structured-output settings.
 *
 * Status codes on the ExtractionResponse union:
 * - 400: malformed JSON body, or request fails schema validation
 * - 500: provider API key not configured, or unexpected failure (timeout etc.)
 * - 502: provider API error, empty completion, or unparseable completion
 * - 200: { success: true, data } on success
 */
export const POST = async (
  request: NextRequest,
): Promise<NextResponse<ExtractionResponse>> => {
  // Parse the body explicitly so a malformed payload yields a clean 400
  // instead of an unhandled exception.
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return NextResponse.json(
      { success: false, error: "Invalid JSON body" },
      { status: 400 },
    );
  }

  const validated = ExtractionRequestSchema.safeParse(body);
  if (!validated.success) {
    return NextResponse.json(
      { success: false, error: validated.error.message },
      { status: 400 },
    );
  }

  const { pdfBase64, researchQuestion, model, provider, systemPrompt } =
    validated.data;

  const config = PROVIDER_CONFIGS[provider];
  // Keys are server-side only; each provider config names its own env var.
  const apiKey = process.env[config.apiKeyEnvVar];

  if (!apiKey) {
    return NextResponse.json(
      { success: false, error: `API key not configured for ${provider}.` },
      { status: 500 },
    );
  }

  const messages = buildExtractionMessages({
    provider,
    pdfBase64,
    userPrompt: buildUserPrompt(researchQuestion),
  });

  // Settings consumed by the provider's formatRequestBody. The caller may
  // override the system prompt; outputSchema drives each provider's
  // structured-output (constrained decoding) mode.
  const settings: Settings = {
    model,
    maxTokens: 16384,
    temperature: 0.6,
    systemPrompt: systemPrompt ?? DEFAULT_EXTRACTION_PROMPT,
    outputSchema: EXTRACTION_RESULT_JSON_SCHEMA,
  };

  // Some provider configs derive the URL from settings (e.g. Gemini embeds
  // the model in the path); others use a fixed string.
  const apiUrl =
    typeof config.apiUrl === "function"
      ? config.apiUrl(settings)
      : config.apiUrl;

  try {
    // Abort at 270 s — under the route's 300 s maxDuration, so we can still
    // return a structured error instead of being killed by the platform.
    const response = await fetch(apiUrl, {
      method: "POST",
      headers: config.apiHeaders(apiKey),
      body: JSON.stringify(config.formatRequestBody(messages, settings)),
      signal: AbortSignal.timeout(270_000),
    });

    if (!response.ok) {
      // Best-effort read of the provider's error body; truncated so the
      // client-facing message stays bounded.
      const errorText = await response.text().catch(() => "");
      return NextResponse.json(
        {
          success: false,
          error: `${provider} API error (${response.status}): ${errorText.slice(0, 200)}`,
        },
        { status: 502 },
      );
    }

    const responseData: unknown = await response.json();
    // Provider-specific extraction of the completion text from the payload.
    const rawText = config.extractResponseText(responseData);

    if (!rawText) {
      return NextResponse.json(
        { success: false, error: `Empty response from ${provider}` },
        { status: 502 },
      );
    }

    let result;
    try {
      result = parseExtractionResponse(rawText);
    } catch (parseError) {
      // SyntaxError means JSON.parse failed; anything else (zod validation)
      // means the JSON was valid but had the wrong shape.
      const message =
        parseError instanceof SyntaxError
          ? "LLM returned invalid JSON"
          : "LLM returned unexpected response structure";
      return NextResponse.json(
        {
          success: false,
          error: `Failed to parse extraction response — ${message}`,
        },
        { status: 502 },
      );
    }

    return NextResponse.json({ success: true, data: result });
  } catch (error) {
    // Network failures, the 270 s timeout, and JSON-decode errors on the
    // provider response all land here.
    const message =
      error instanceof Error
        ? `Extraction failed — ${error.message}`
        : "Extraction failed";
    console.error("AI extraction failed:", error);
    return NextResponse.json(
      { success: false, error: message },
      { status: 500 },
    );
  }
};
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/**
 * Default system prompt for PDF → discourse-graph node extraction. Defines
 * the six node types, the quality bar, and a worked example. The example's
 * JSON field names (nodeType / content / supportSnippet / sourceSection)
 * must stay consistent with ExtractedNodeSchema — keep them in sync.
 * Callers may override this via the request's optional systemPrompt.
 */
export const DEFAULT_EXTRACTION_PROMPT = `You are a research analyst extracting discourse graph nodes from academic papers.

Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.

## Node Types

- **Evidence**: A discrete observation from a published dataset or experiment cited in the paper (prior work). Past tense. Includes observable, model system, method. Quantitative details when available.
- **Claim**: An interpretive assertion by the authors. Debatable — goes beyond data to state what it means. Specific enough to test or argue against.
- **Question**: A research question — explicitly stated or implied by a gap in the literature. Open-ended.
- **Result**: A discrete observation from this paper's own experiments. Same structure as Evidence but from the current work, not prior studies. Past tense.
- **Theory**: A theoretical framework or model used or proposed. Name it, state its core proposition.
- **Source**: A cited publication. Author(s) and year.

## Quality

- Atomic: one idea per node. Split compound sentences.
- Self-contained: understandable without the paper.
- Faithful: no inference or editorializing.
- Specific: "X reduced Y by 43% in Z" not "X was effective."
- 8–25 nodes. Quality over quantity. Cover all sections.
- Evidence = prior work cited. Result = this paper's experiments.

## Example

Excerpt (Results):
"CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001). This correlated with elevated CD62L and CCR7 (Fig 3B), suggesting a memory-like phenotype resisting exhaustion."

{
  "nodes": [
    {
      "nodeType": "Result",
      "content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
      "supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
      "sourceSection": "Results"
    },
    {
      "nodeType": "Result",
      "content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
      "supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
      "sourceSection": "Results"
    },
    {
      "nodeType": "Claim",
      "content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
      "supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
      "sourceSection": "Results"
    }
  ]
}`;
50+
51+
export const buildUserPrompt = (researchQuestion?: string): string => {
52+
let prompt = "Extract discourse graph nodes from the attached paper.";
53+
54+
if (researchQuestion) {
55+
prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
56+
}
57+
58+
return prompt;
59+
};
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { z } from "zod";
2+
3+
/** Canonical list of LLM providers supported by the extraction route. */
export const PROVIDER_IDS = ["anthropic", "openai", "gemini"] as const;

/** Provider identifier union, derived from PROVIDER_IDS. */
export type ProviderId = (typeof PROVIDER_IDS)[number];

// Zod schema values are intentionally PascalCase (type-like), hence the
// naming-convention disables throughout this module.
// eslint-disable-next-line @typescript-eslint/naming-convention
export const ExtractedNodeSchema = z.object({
  nodeType: z.string(), // discourse node kind, e.g. "Claim", "Evidence", "Result"
  content: z.string(), // self-contained statement of the node's single idea
  supportSnippet: z.string(), // verbatim excerpt from the paper backing the node
  sourceSection: z.string().nullable(), // paper section name, or null when unknown
});

/** One extracted discourse-graph node. */
export type ExtractedNode = z.infer<typeof ExtractedNodeSchema>;

// eslint-disable-next-line @typescript-eslint/naming-convention
export const ExtractionResultSchema = z.object({
  nodes: z.array(ExtractedNodeSchema),
});

/** Full LLM extraction payload: the list of extracted nodes. */
export type ExtractionResult = z.infer<typeof ExtractionResultSchema>;

// eslint-disable-next-line @typescript-eslint/naming-convention
export const ExtractionRequestSchema = z.object({
  // Raw PDF as base64. The 44M-char cap bounds request size (≈33 MB binary).
  pdfBase64: z.string().min(1).max(44_000_000),
  provider: z.enum(PROVIDER_IDS),
  model: z.string().min(1),
  researchQuestion: z.string().optional(), // optional focus for the extraction
  systemPrompt: z.string().optional(), // overrides DEFAULT_EXTRACTION_PROMPT when set
});

/** Validated request body for the PDF extraction route. */
export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>;

/**
 * Hand-written JSON Schema mirror of ExtractionResultSchema, handed to each
 * provider's structured-output mechanism (constrained decoding). Keep in
 * sync with the zod schema above.
 */
export const EXTRACTION_RESULT_JSON_SCHEMA: Record<string, unknown> = {
  type: "object",
  properties: {
    nodes: {
      type: "array",
      items: {
        type: "object",
        properties: {
          nodeType: { type: "string" },
          content: { type: "string" },
          supportSnippet: { type: "string" },
          sourceSection: { type: ["string", "null"] },
        },
        required: ["nodeType", "content", "supportSnippet", "sourceSection"],
        additionalProperties: false,
      },
    },
  },
  required: ["nodes"],
  additionalProperties: false,
};

/** Discriminated union returned by the extraction route (on success/error). */
export type ExtractionResponse =
  | { success: true; data: ExtractionResult }
  | { success: false; error: string };

apps/website/app/types/llm.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1+
export type ContentBlock = Record<string, unknown>;
2+
13
export type Message = {
24
role: string;
3-
content: string;
5+
content: string | ContentBlock[];
46
};
57

68
export type Settings = {
79
model: string;
810
maxTokens: number;
911
temperature: number;
12+
systemPrompt?: string;
13+
outputSchema?: Record<string, unknown>;
1014
safetySettings?: Array<{
1115
category: string;
1216
threshold: string;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import {
2+
ExtractionResultSchema,
3+
type ExtractionResult,
4+
} from "~/types/extraction";
5+
6+
/**
 * Parses the raw LLM completion text into a validated ExtractionResult.
 *
 * @throws {SyntaxError} when `raw` is not valid JSON.
 * @throws ZodError when the JSON does not match ExtractionResultSchema —
 *   callers can distinguish the two failure modes via `instanceof`.
 */
export const parseExtractionResponse = (raw: string): ExtractionResult => {
  const parsed: unknown = JSON.parse(raw);
  return ExtractionResultSchema.parse(parsed);
};

0 commit comments

Comments
 (0)