diff --git a/.gitignore b/.gitignore index 2b8b5a9..88ac9fa 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,8 @@ coverage/ exports* debug/ logs/*.txt + +# AI tooling +.claude/ +.wolf/ +CLAUDE.md diff --git a/src/scraper/api-version.ts b/src/scraper/api-version.ts new file mode 100644 index 0000000..64033e6 --- /dev/null +++ b/src/scraper/api-version.ts @@ -0,0 +1,7 @@ +/** + * Perplexity versions its REST endpoints via a `?version=X.Y` query param + * (e.g. `/rest/thread/list_ask_threads?version=2.18`). Both library discovery + * and conversation extraction hit `/rest/` endpoints, so they share this default + * to keep the version in one place rather than duplicating the literal. + */ +export const DEFAULT_API_VERSION = '2.18' diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts index 0a3102a..62b961a 100644 --- a/src/scraper/conversation-extractor.ts +++ b/src/scraper/conversation-extractor.ts @@ -4,8 +4,9 @@ import { z } from 'zod' import { type Page, type BrowserContext, type Response } from '@playwright/test' import { logger } from '../utils/logger.js' import { waitStrategy } from '../utils/wait-strategy.js' -import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js' import { type Config } from '../utils/config.js' +import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js' +import { DEFAULT_API_VERSION } from './api-version.js' export interface ConversationMessage { role: 'user' | 'assistant' @@ -44,11 +45,25 @@ export class ConversationExtractor { blocks: z.array(ConversationExtractor.BlockSchema).optional(), }) + /** + * Validates the top-level shape of the `/rest/thread/{id}` HTTP response, + * confirmed against a live 2026 response. The endpoint returns either a bare + * array of entries or an object wrapping them; pagination is signalled by the + * top-level `has_next_page` / `next_cursor` pair. Fields are optional and the + * object is non-strict, so unknown/new keys don't reject an otherwise-valid + * response — shape drift is surfaced via diagnostics, and `EntrySchema` + * remains the per-entry fallback downstream. + */ private static readonly ApiResponseSchema = z.union([ z.array(ConversationExtractor.EntrySchema), z.object({ entries: z.array(ConversationExtractor.EntrySchema), background_entries: z.array(z.unknown()).optional(), + has_next_page: z.boolean().optional(), + next_cursor: z.string().nullable().optional(), + status: z.string().optional(), + thread_metadata: z.unknown().optional(), + // Legacy/list-style pagination — kept for backward compatibility. collection_info: z .object({ has_next_page: z.boolean().optional(), @@ -106,12 +121,6 @@ export class ConversationExtractor { } } - private static readonly TIMEOUT_MAX_MS = 30_000 - private static readonly TIMEOUT_MIN_MS = 8_000 - private static readonly TIMEOUT_STEP_DOWN_MS = 3_000 - private static readonly TIMEOUT_STEP_UP_MS = 1_000 - - private currentTimeoutMs = ConversationExtractor.TIMEOUT_MAX_MS private readonly diagnostics: ApiDiagnosticsWriter constructor( @@ -121,21 +130,6 @@ export class ConversationExtractor { this.diagnostics = new ApiDiagnosticsWriter(config) } - reduceTimeout(): void { - this.currentTimeoutMs = Math.max( - ConversationExtractor.TIMEOUT_MIN_MS, - this.currentTimeoutMs - ConversationExtractor.TIMEOUT_STEP_DOWN_MS - ) - logger.debug(`[extractor] timeout reduced to ${this.currentTimeoutMs}ms`) - } - - recoverTimeout(): void { - this.currentTimeoutMs = Math.min( - ConversationExtractor.TIMEOUT_MAX_MS, - this.currentTimeoutMs + ConversationExtractor.TIMEOUT_STEP_UP_MS - ) - } - async extract(conversationUrl: string): Promise { await this.ensureContextIsAlive() @@ -147,16 +141,12 @@ export class ConversationExtractor { throw new ConversationExtractor.ExtractionError(`Failed to create new page: ${errorMessage}`) } - const apiResponsePromise = this.captureConversationApiResponse(conversationPage) - try { await this.navigateToConversationUrl(conversationPage, conversationUrl) await waitStrategy(this.config).afterScroll(conversationPage) - const capturedApiData = await apiResponsePromise - if (!capturedApiData) { - throw new ConversationExtractor.NoDataError('API response timeout or not found') - } + const conversationId = this.extractIdFromUrl(conversationUrl) + const capturedApiData = await this.fetchThreadData(conversationPage, conversationId) const extractedConversation = this.parseConversationData(capturedApiData, conversationUrl) if (!extractedConversation) { @@ -176,6 +166,122 @@ export class ConversationExtractor { } } + // Safety cap on cursor pagination (~99 entries/page, so ~5k entries). + private static readonly MAX_PAGES = 50 + + private async fetchThreadData(page: Page, threadId: string): Promise { + const firstParsed = await this.fetchThreadPage(page, threadId, null) + const firstObj = + firstParsed && !Array.isArray(firstParsed) ? (firstParsed as Record) : null + + let cursor = typeof firstObj?.next_cursor === 'string' ? firstObj.next_cursor : null + + // Fast path: the whole thread fit in one response (the common case). + if (firstObj?.has_next_page !== true || !cursor) { + return firstParsed + } + + // Long thread: walk the cursor pages and accumulate entries in API order + // (the endpoint paginates in conversation order — the same order the previous + // response-listener relied on, so no reordering is needed here). + const entries: unknown[] = Array.isArray(firstObj.entries) ? [...firstObj.entries] : [] + const backgroundEntries: unknown[] = Array.isArray(firstObj.background_entries) + ? [...firstObj.background_entries] + : [] + let pageCount = 1 + + while (cursor && pageCount < ConversationExtractor.MAX_PAGES) { + const parsed = await this.fetchThreadPage(page, threadId, cursor) + const obj = Array.isArray(parsed) + ? { entries: parsed as unknown[] } + : (parsed as Record) + + if (Array.isArray(obj.entries)) entries.push(...obj.entries) + if (Array.isArray(obj.background_entries)) backgroundEntries.push(...obj.background_entries) + pageCount++ + + cursor = + obj.has_next_page === true && typeof obj.next_cursor === 'string' ? obj.next_cursor : null + } + + if (cursor) { + logger.warn( + `[extractor] thread ${threadId}: stopped at the ${ConversationExtractor.MAX_PAGES}-page ` + + 'cap; export may be truncated' + ) + } + logger.debug( + `[extractor] thread ${threadId}: assembled ${entries.length} entries from ${pageCount} pages` + ) + + // Preserve the first page's top-level metadata, but with the full entry set. + return { + ...firstObj, + entries, + background_entries: backgroundEntries, + has_next_page: false, + next_cursor: null, + } + } + + /** Fetches and validates a single page of the thread API (cursor = null for page 1). */ + private async fetchThreadPage( + page: Page, + threadId: string, + cursor: string | null + ): Promise { + let apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default` + if (cursor) apiUrl += `&cursor=${encodeURIComponent(cursor)}` + + const raw = await page.evaluate( + async ({ url }: { url: string }) => { + const res = await fetch(url, { credentials: 'include' }) + const text = await res.text() + return { status: res.status, body: text } + }, + { url: apiUrl } + ) + + logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}${cursor ? ' (paged)' : ''}`) + + if (raw.status === 401 || raw.status === 403) { + throw new ConversationExtractor.AuthError(`Authentication required (${raw.status})`) + } + if (raw.status === 404) { + throw new ConversationExtractor.NotFoundError('Thread not found (404)') + } + if (raw.status >= 500) { + throw new ConversationExtractor.ServerError(`Server error (${raw.status})`) + } + if (raw.status !== 200) { + throw new ConversationExtractor.ExtractionError(`Thread API returned HTTP ${raw.status}`) + } + + let parsed: unknown + try { + parsed = JSON.parse(raw.body) + } catch { + throw new ConversationExtractor.ExtractionError(`Thread API: invalid JSON for ${threadId}`) + } + + // Validate the HTTP response shape. On drift we record a diagnostic and fall + // through — the per-entry EntrySchema downstream is the resilient fallback, + // so a shape change is surfaced without breaking extraction. + const shapeResult = ConversationExtractor.ApiResponseSchema.safeParse(parsed) + if (!shapeResult.success) { + logger.warn(`[extractor] thread ${threadId}: unexpected response shape (see api-diagnostics)`) + this.diagnostics + .writeFailure({ + url: apiUrl, + errorType: 'zod_error', + zodErrorPaths: shapeResult.error.issues.map((issue) => issue.path.join('.')), + }) + .catch(() => {}) + } + + return parsed + } + private async ensureContextIsAlive(): Promise { if (!this.context) { throw new ConversationExtractor.ExtractionError('Browser context is missing') @@ -187,78 +293,6 @@ export class ConversationExtractor { } } - private captureConversationApiResponse(page: Page): Promise { - const accumulatedEntries: unknown[] = [] - let isRequestResolved = false - - return new Promise((resolve) => { - const timeoutId = setTimeout(() => { - if (!isRequestResolved) { - if (accumulatedEntries.length > 0) { - logger.info( - `API response timeout – resolving with ${accumulatedEntries.length} accumulated entries` - ) - resolve({ entries: accumulatedEntries }) - } else { - logger.warn('API response timeout – resolving with null') - resolve(null) - } - isRequestResolved = true - } - }, this.currentTimeoutMs) - - page.on('response', async (response: Response) => { - if (isRequestResolved) return - - const responseUrl = response.url() - const isThreadApiRequest = responseUrl.includes('/rest/thread/') - const isListRequest = - responseUrl.includes('list_ask_threads') || - responseUrl.includes('list_recent') || - responseUrl.includes('list_pinned') - - if (!isThreadApiRequest || isListRequest) return - if (page.isClosed()) return - - try { - const jsonResponse = await response.json() - if (isRequestResolved) return - - const parseResult = ConversationExtractor.ApiResponseSchema.safeParse(jsonResponse) - - if (!parseResult.success) { - this.diagnostics - .writeFailure({ - url: response.url(), - errorType: 'zod_error', - zodErrorPaths: parseResult.error.issues.map((issue) => issue.path.join('.')), - }) - .catch(() => {}) - } else { - const responseData = parseResult.data - const currentEntries = Array.isArray(responseData) ? responseData : responseData.entries - accumulatedEntries.push(...currentEntries) - - const hasNextPage = - !Array.isArray(responseData) && responseData.collection_info?.has_next_page === true - - if (!hasNextPage) { - clearTimeout(timeoutId) - isRequestResolved = true - resolve({ entries: accumulatedEntries }) - } else { - logger.info( - `Captured paginated response, ${accumulatedEntries.length} entries so far...` - ) - } - } - } catch (_error) { - // Silent catch for JSON parse errors from non-JSON responses - } - }) - }) - } - private async navigateToConversationUrl(page: Page, url: string): Promise { const NAVIGATION_TIMEOUT_MS = 30000 const navigationResponse = await page.goto(url, { @@ -369,8 +403,8 @@ export class ConversationExtractor { if (dataObject && Array.isArray(dataObject.entries)) return dataObject.entries as unknown[] if (dataObject && (dataObject.query_str || dataObject.blocks)) return [data] + logger.warn(`Unknown API response shape for ${url}`) this.diagnostics.writeFailure({ url, errorType: 'unknown_shape' }).catch(() => {}) - return [] } @@ -397,9 +431,12 @@ export class ConversationExtractor { } let answer = '' + const seenAnswers = new Set() for (const block of entry.blocks ?? []) { - if (block.markdown_block?.answer) { - answer += block.markdown_block.answer + '\n\n' + const blockAnswer = block.markdown_block?.answer + if (blockAnswer && !seenAnswers.has(blockAnswer)) { + seenAnswers.add(blockAnswer) + answer += blockAnswer + '\n\n' } } diff --git a/src/scraper/library-discovery.ts b/src/scraper/library-discovery.ts index e896cd8..db73c96 100644 --- a/src/scraper/library-discovery.ts +++ b/src/scraper/library-discovery.ts @@ -1,5 +1,6 @@ import type { Page } from '@playwright/test' import { logger } from '../utils/logger.js' +import { DEFAULT_API_VERSION } from './api-version.js' // ─── Constants ─────────────────────────────────────────────────────────────── @@ -27,22 +28,30 @@ const VERSIONED_URL_PATTERNS = [ interface RawThread { uuid: string - slug: string title: string - query_str: string - first_answer: string - answer_preview: string - last_query_datetime: string - mode: string - status: string - display_model: string - thread_access: number - has_next_page: boolean - total_threads: number - collection: Collection | null - sources: string[] - query_count: number - search_focus: string + // New format (2026+) + link?: string + variant?: string + unread?: boolean + context_uuid?: string + task_description?: string | null + mode_type?: number + // Old format (pre-2026, may still appear) + slug?: string + query_str?: string + first_answer?: string + answer_preview?: string | null + last_query_datetime?: string + mode?: string + status?: string + display_model?: string + thread_access?: number + has_next_page?: boolean + total_threads?: number + collection?: Collection | null + sources?: string[] + query_count?: number + search_focus?: string [key: string]: unknown } @@ -57,20 +66,28 @@ export interface ConversationMeta { id: string url: string uuid: string - slug: string title: string - query_str: string - first_answer: string - answer_preview: string - last_query_datetime: string - mode: string - status: string - display_model: string - thread_access: number - collection: Collection | null - sources: string[] - query_count: number - search_focus: string + status?: string + // New format + link?: string + variant?: string + unread?: boolean + context_uuid?: string + task_description?: string | null + mode_type?: number + // Old format + slug?: string + query_str?: string + first_answer?: string + answer_preview?: string | null + last_query_datetime?: string + mode?: string + display_model?: string + thread_access?: number + collection?: Collection | null + sources?: string[] + query_count?: number + search_focus?: string [key: string]: unknown } @@ -88,10 +105,11 @@ function extractVersionFromUrl(url: string): string | null { } function rawThreadToConversationMeta(thread: RawThread): ConversationMeta { + const path = thread.link ?? `/search/${thread.slug ?? thread.uuid}` return { ...thread, id: thread.uuid, - url: `${BASE_URL}/search/${thread.slug}`, + url: `${BASE_URL}${path}`, } } @@ -103,13 +121,13 @@ async function detectApiVersion(page: Page): Promise { (res) => VERSIONED_URL_PATTERNS.some((p) => res.url().includes(p)) && res.status() === 200, { timeout: 15_000 } ) - const version = extractVersionFromUrl(response.url()) ?? '2.18' + const version = extractVersionFromUrl(response.url()) ?? DEFAULT_API_VERSION const pathname = new URL(response.url()).pathname logger.debug(`Detected API version: ${version} (from ${pathname})`) return version } catch { - logger.debug('Version detection timeout — using fallback 2.18') - return '2.18' + logger.debug(`Version detection timeout — using fallback ${DEFAULT_API_VERSION}`) + return DEFAULT_API_VERSION } } @@ -185,7 +203,7 @@ async function fetchThreadBatch( } const threads = parsed as RawThread[] - const total = threads[0]?.total_threads ?? 0 + const total = (threads[0]?.total_threads as number | undefined) ?? 0 return { threads, diff --git a/src/scraper/worker-pool.ts b/src/scraper/worker-pool.ts index e64f3b1..d0adf56 100644 --- a/src/scraper/worker-pool.ts +++ b/src/scraper/worker-pool.ts @@ -92,14 +92,13 @@ export class WorkerPool { ): Promise { try { const result = await worker.extractor.extract(item.meta.url) - await this.handleSuccess(worker, item.meta, result) + await this.handleSuccess(item.meta, result) } catch (error) { - await this.handleFailure(worker, item, queue, error) + await this.handleFailure(item, queue, error) } } private async handleSuccess( - worker: ExtractionWorker, meta: ConversationMeta, result: Awaited> ): Promise { @@ -115,22 +114,12 @@ export class WorkerPool { this.checkpointManager.markAsProcessed(meta.id, result.contentHash) logger.info(`${progressLabel} Processed: ${result.title}`) } - - worker.extractor.recoverTimeout() } - private async handleFailure( - worker: ExtractionWorker, - item: QueueItem, - queue: QueueItem[], - error: unknown - ): Promise { - const isTimeout = error instanceof Error && error.message.includes('API response timeout') + private async handleFailure(item: QueueItem, queue: QueueItem[], error: unknown): Promise { const isContextLost = error instanceof Error && error.message.includes('context is no longer available') - if (isTimeout) worker.extractor.reduceTimeout() - if (isContextLost) { logger.warn('Browser context lost. Refreshing worker context...') await this.refreshContext() diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts index d087a01..484f29e 100644 --- a/test/unit/conversation-extractor.unit.test.ts +++ b/test/unit/conversation-extractor.unit.test.ts @@ -1,17 +1,7 @@ import { describe, it, expect, vi, beforeEach } from 'vitest' import { ConversationExtractor } from '../../src/scraper/conversation-extractor.js' -import { ApiDiagnosticsWriter } from '../../src/utils/api-diagnostics.js' -import type { BrowserContext } from '@playwright/test' - -vi.mock('../../src/utils/api-diagnostics.js', () => { - return { - ApiDiagnosticsWriter: vi.fn().mockImplementation(function () { - return { - writeFailure: vi.fn().mockResolvedValue(undefined), - } - }), - } -}) +import { logger } from '../../src/utils/logger.js' +import type { BrowserContext, Page } from '@playwright/test' describe('ConversationExtractor (Unit)', () => { let extractor: ConversationExtractor @@ -31,65 +21,307 @@ describe('ConversationExtractor (Unit)', () => { vi.clearAllMocks() }) + // ─── ensureEntriesFormat ────────────────────────────────────────────────────── + describe('ensureEntriesFormat', () => { - it('should return array if input is array', () => { + it('returns array as-is when input is already an array', () => { const data = [{ query_str: 'test' }] const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') expect(result).toEqual(data) - expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled() }) - it('should return data.entries if input has entries array', () => { + it('returns data.entries when input has an entries array', () => { const data = { entries: [{ query_str: 'test' }] } const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') expect(result).toEqual(data.entries) - expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled() }) - it('should return [data] if input has query_str', () => { + it('wraps single entry object when it has query_str', () => { const data = { query_str: 'test' } const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') expect(result).toEqual([data]) - expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled() }) - it('should return empty array and call diagnostics for unknown shape', () => { - const data = { foo: 'bar' } + it('wraps single entry object when it has blocks', () => { + const data = { blocks: [] } const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') + expect(result).toEqual([data]) + }) + + it('returns empty array for unknown shape', () => { + const result = (extractor as any).ensureEntriesFormat({ foo: 'bar' }, 'http://test.com') expect(result).toEqual([]) - expect((extractor as any).diagnostics.writeFailure).toHaveBeenCalledWith({ - url: 'http://test.com', - errorType: 'unknown_shape', - }) }) }) + // ─── parseConversationData ──────────────────────────────────────────────────── + describe('parseConversationData', () => { - it('should return null and call diagnostics if entries are empty', () => { - const data = { entries: [] } - const result = extractor.parseConversationData(data, 'http://test.com') + it('returns null when entries array is empty', () => { + const result = (extractor as any).parseConversationData( + { entries: [] }, + 'https://www.perplexity.ai/search/test-id' + ) expect(result).toBeNull() - expect((extractor as any).diagnostics.writeFailure).toHaveBeenCalledWith({ - url: 'http://test.com', - errorType: 'empty_entries', - }) }) - it('should parse valid entries correctly', () => { + it('parses new-format entries (blocks with markdown_block.answer)', () => { const data = { entries: [ { thread_title: 'Test Thread', query_str: 'What is 1+1?', - blocks: [{ markdown_block: { answer: '2' } }], + updated_datetime: '2026-01-01T00:00:00Z', + blocks: [ + { intended_usage: 'plan', plan_block: {} }, + { + intended_usage: 'ask_text_0_markdown', + markdown_block: { answer: 'The answer is 2.' }, + }, + ], }, ], } - const result = extractor.parseConversationData(data, 'https://perplexity.ai/search/uuid') + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) expect(result).not.toBeNull() - expect(result?.title).toBe('Test Thread') - expect(result?.content).toContain('What is 1+1?') - expect(result?.content).toContain('2') + expect(result.title).toBe('Test Thread') + expect(result.content).toContain('What is 1+1?') + expect(result.content).toContain('The answer is 2.') + }) + + it('deduplicates identical answers from ask_text and ask_text_0_markdown blocks', () => { + const answer = 'The answer is 2.' + const data = { + entries: [ + { + thread_title: 'Dedupe Test', + query_str: 'What is 1+1?', + blocks: [ + { intended_usage: 'ask_text_0_markdown', markdown_block: { answer } }, + { intended_usage: 'ask_text', markdown_block: { answer } }, + ], + }, + ], + } + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result).not.toBeNull() + const occurrences = (result.content.match(/The answer is 2\./g) ?? []).length + expect(occurrences).toBe(1) + }) + + it('extracts space name from collection_info', () => { + const data = { + entries: [ + { + thread_title: 'Test', + query_str: 'Question', + collection_info: { title: 'HomeLab' }, + blocks: [ + { intended_usage: 'ask_text_0_markdown', markdown_block: { answer: 'Answer' } }, + ], + }, + ], + } + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result?.spaceName).toBe('HomeLab') + }) + + it('handles multi-turn conversations', () => { + const data = { + entries: [ + { + thread_title: 'Multi-turn', + query_str: 'First question', + blocks: [ + { intended_usage: 'ask_text_0_markdown', markdown_block: { answer: 'First answer' } }, + ], + }, + { + query_str: 'Follow-up question', + blocks: [ + { + intended_usage: 'ask_text_0_markdown', + markdown_block: { answer: 'Follow-up answer' }, + }, + ], + }, + ], + } + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result?.messages).toHaveLength(4) + expect(result?.messages[0]).toEqual({ role: 'user', content: 'First question' }) + expect(result?.messages[1]).toEqual({ role: 'assistant', content: 'First answer' }) + expect(result?.messages[2]).toEqual({ role: 'user', content: 'Follow-up question' }) + expect(result?.messages[3]).toEqual({ role: 'assistant', content: 'Follow-up answer' }) + }) + }) + + // ─── fetchThreadData ────────────────────────────────────────────────────────── + + describe('fetchThreadData', () => { + const makePage = (status: number, body: string): Page => + ({ evaluate: vi.fn().mockResolvedValue({ status, body }) }) as unknown as Page + + it('returns parsed JSON on HTTP 200', async () => { + const payload = { + entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], + background_entries: [], + } + const page = makePage(200, JSON.stringify(payload)) + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual(payload) + }) + + it('throws AuthError on HTTP 401', async () => { + const page = makePage(401, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.AuthError + ) + }) + + it('throws AuthError on HTTP 403', async () => { + const page = makePage(403, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.AuthError + ) + }) + + it('throws NotFoundError on HTTP 404', async () => { + const page = makePage(404, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.NotFoundError + ) + }) + + it('throws ServerError on HTTP 500', async () => { + const page = makePage(500, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.ServerError + ) + }) + + it('throws ExtractionError on invalid JSON body', async () => { + const page = makePage(200, 'not-json') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.ExtractionError + ) + }) + + it('calls the correct API URL with thread ID', async () => { + const page = makePage(200, '{"entries":[],"background_entries":[]}') + await (extractor as any).fetchThreadData(page, 'my-thread-id').catch(() => {}) + const callArg = (page.evaluate as any).mock.calls[0][1] + expect(callArg.url).toContain('/rest/thread/my-thread-id') + }) + + it('returns valid data and writes no diagnostic when the shape matches', async () => { + // Mirrors the real 2026 /rest/thread/{id} shape: top-level entries + + // background_entries + has_next_page/next_cursor + extra metadata keys. + const payload = { + entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], + background_entries: [], + has_next_page: false, + next_cursor: null, + status: 'success', + thread_metadata: { title: 'Test' }, + } + const writeFailure = vi.spyOn((extractor as any).diagnostics, 'writeFailure') + const page = makePage(200, JSON.stringify(payload)) + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual(payload) + expect(writeFailure).not.toHaveBeenCalled() + }) + + it('returns the single page unchanged when there is no next page', async () => { + const payload = { + entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], + background_entries: [], + has_next_page: false, + next_cursor: null, + } + const page = makePage(200, JSON.stringify(payload)) + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual(payload) + expect(page.evaluate as any).toHaveBeenCalledTimes(1) + }) + + it('accumulates entries across cursor pages until has_next_page is false', async () => { + const page1 = { + entries: [{ query_str: 'q1', blocks: [] }], + background_entries: [], + has_next_page: true, + next_cursor: 'cur-1', + } + const page2 = { + entries: [{ query_str: 'q2', blocks: [] }], + background_entries: [], + has_next_page: false, + next_cursor: null, + } + const evaluate = vi + .fn() + .mockResolvedValueOnce({ status: 200, body: JSON.stringify(page1) }) + .mockResolvedValueOnce({ status: 200, body: JSON.stringify(page2) }) + const page = { evaluate } as unknown as Page + + const result: any = await (extractor as any).fetchThreadData(page, 'abc-123') + + expect(evaluate).toHaveBeenCalledTimes(2) + expect(result.entries.map((e: any) => e.query_str)).toEqual(['q1', 'q2']) + expect(result.has_next_page).toBe(false) + // the second request must carry the cursor returned by page 1 + expect((evaluate.mock.calls[1][1] as any).url).toContain('cursor=cur-1') + }) + + it('still returns the body but records a zod_error diagnostic on shape drift', async () => { + const writeFailure = vi + .spyOn((extractor as any).diagnostics, 'writeFailure') + .mockResolvedValue(undefined) + // entries must be an array of objects; a string violates the schema + const page = makePage(200, '{"entries":"not-an-array"}') + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual({ entries: 'not-an-array' }) + expect(writeFailure).toHaveBeenCalledWith(expect.objectContaining({ errorType: 'zod_error' })) + }) + }) + + // ─── diagnostics on parse failures ────────────────────────────────────────── + + describe('diagnostics', () => { + it('writes an unknown_shape diagnostic for an unrecognised response shape', () => { + const writeFailure = vi + .spyOn((extractor as any).diagnostics, 'writeFailure') + .mockResolvedValue(undefined) + ;(extractor as any).ensureEntriesFormat({ foo: 'bar' }, 'http://test.com') + expect(writeFailure).toHaveBeenCalledWith( + expect.objectContaining({ errorType: 'unknown_shape' }) + ) + }) + + it('writes an empty_entries diagnostic when there are no entries to validate', () => { + const writeFailure = vi + .spyOn((extractor as any).diagnostics, 'writeFailure') + .mockResolvedValue(undefined) + const result = (extractor as any).parseConversationData( + { foo: 'bar' }, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result).toBeNull() + expect(writeFailure).toHaveBeenCalledWith( + expect.objectContaining({ errorType: 'empty_entries' }) + ) }) }) })