From ddfa1651092334cc73f3f02995347e41dcfca387 Mon Sep 17 00:00:00 2001 From: Dave Jordan Date: Wed, 3 Jun 2026 22:59:27 +0200 Subject: [PATCH 1/5] fix: adapt scraper to Perplexity API changes (2026) --- src/scraper/conversation-extractor.ts | 170 +++++++------------------- src/scraper/library-discovery.ts | 77 +++++++----- 2 files changed, 92 insertions(+), 155 deletions(-) diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts index 0a3102a..77c4ef4 100644 --- a/src/scraper/conversation-extractor.ts +++ b/src/scraper/conversation-extractor.ts @@ -4,7 +4,6 @@ import { z } from 'zod' import { type Page, type BrowserContext, type Response } from '@playwright/test' import { logger } from '../utils/logger.js' import { waitStrategy } from '../utils/wait-strategy.js' -import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js' import { type Config } from '../utils/config.js' export interface ConversationMessage { @@ -44,19 +43,6 @@ export class ConversationExtractor { blocks: z.array(ConversationExtractor.BlockSchema).optional(), }) - private static readonly ApiResponseSchema = z.union([ - z.array(ConversationExtractor.EntrySchema), - z.object({ - entries: z.array(ConversationExtractor.EntrySchema), - background_entries: z.array(z.unknown()).optional(), - collection_info: z - .object({ - has_next_page: z.boolean().optional(), - }) - .optional(), - }), - ]) - static readonly ExtractionError = class extends Error { constructor(message: string) { super(message) @@ -106,35 +92,14 @@ export class ConversationExtractor { } } - private static readonly TIMEOUT_MAX_MS = 30_000 - private static readonly TIMEOUT_MIN_MS = 8_000 - private static readonly TIMEOUT_STEP_DOWN_MS = 3_000 - private static readonly TIMEOUT_STEP_UP_MS = 1_000 - - private currentTimeoutMs = ConversationExtractor.TIMEOUT_MAX_MS - private readonly diagnostics: ApiDiagnosticsWriter - constructor( private readonly config: Config, private readonly context: BrowserContext - ) { - this.diagnostics = new ApiDiagnosticsWriter(config) - } + ) {} - reduceTimeout(): void { - this.currentTimeoutMs = Math.max( - ConversationExtractor.TIMEOUT_MIN_MS, - this.currentTimeoutMs - ConversationExtractor.TIMEOUT_STEP_DOWN_MS - ) - logger.debug(`[extractor] timeout reduced to ${this.currentTimeoutMs}ms`) - } + reduceTimeout(): void {} - recoverTimeout(): void { - this.currentTimeoutMs = Math.min( - ConversationExtractor.TIMEOUT_MAX_MS, - this.currentTimeoutMs + ConversationExtractor.TIMEOUT_STEP_UP_MS - ) - } + recoverTimeout(): void {} async extract(conversationUrl: string): Promise { await this.ensureContextIsAlive() @@ -147,16 +112,12 @@ export class ConversationExtractor { throw new ConversationExtractor.ExtractionError(`Failed to create new page: ${errorMessage}`) } - const apiResponsePromise = this.captureConversationApiResponse(conversationPage) - try { await this.navigateToConversationUrl(conversationPage, conversationUrl) await waitStrategy(this.config).afterScroll(conversationPage) - const capturedApiData = await apiResponsePromise - if (!capturedApiData) { - throw new ConversationExtractor.NoDataError('API response timeout or not found') - } + const conversationId = this.extractIdFromUrl(conversationUrl) + const capturedApiData = await this.fetchThreadData(conversationPage, conversationId) const extractedConversation = this.parseConversationData(capturedApiData, conversationUrl) if (!extractedConversation) { @@ -176,6 +137,40 @@ export class ConversationExtractor { } } + private async fetchThreadData(page: Page, threadId: string): Promise { + const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=2.18&source=default` + + const raw = await page.evaluate( + async ({ url }: { url: string }) => { + const res = await fetch(url, { credentials: 'include' }) + const text = await res.text() + return { status: res.status, body: text } + }, + { url: apiUrl } + ) + + logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}`) + + if (raw.status === 401 || raw.status === 403) { + throw new ConversationExtractor.AuthError(`Authentication required (${raw.status})`) + } + if (raw.status === 404) { + throw new ConversationExtractor.NotFoundError('Thread not found (404)') + } + if (raw.status >= 500) { + throw new ConversationExtractor.ServerError(`Server error (${raw.status})`) + } + if (raw.status !== 200) { + throw new ConversationExtractor.ExtractionError(`Thread API returned HTTP ${raw.status}`) + } + + try { + return JSON.parse(raw.body) + } catch { + throw new ConversationExtractor.ExtractionError(`Thread API: invalid JSON for ${threadId}`) + } + } + private async ensureContextIsAlive(): Promise { if (!this.context) { throw new ConversationExtractor.ExtractionError('Browser context is missing') @@ -187,78 +182,6 @@ export class ConversationExtractor { } } - private captureConversationApiResponse(page: Page): Promise { - const accumulatedEntries: unknown[] = [] - let isRequestResolved = false - - return new Promise((resolve) => { - const timeoutId = setTimeout(() => { - if (!isRequestResolved) { - if (accumulatedEntries.length > 0) { - logger.info( - `API response timeout – resolving with ${accumulatedEntries.length} accumulated entries` - ) - resolve({ entries: accumulatedEntries }) - } else { - logger.warn('API response timeout – resolving with null') - resolve(null) - } - isRequestResolved = true - } - }, this.currentTimeoutMs) - - page.on('response', async (response: Response) => { - if (isRequestResolved) return - - const responseUrl = response.url() - const isThreadApiRequest = responseUrl.includes('/rest/thread/') - const isListRequest = - responseUrl.includes('list_ask_threads') || - responseUrl.includes('list_recent') || - responseUrl.includes('list_pinned') - - if (!isThreadApiRequest || isListRequest) return - if (page.isClosed()) return - - try { - const jsonResponse = await response.json() - if (isRequestResolved) return - - const parseResult = ConversationExtractor.ApiResponseSchema.safeParse(jsonResponse) - - if (!parseResult.success) { - this.diagnostics - .writeFailure({ - url: response.url(), - errorType: 'zod_error', - zodErrorPaths: parseResult.error.issues.map((issue) => issue.path.join('.')), - }) - .catch(() => {}) - } else { - const responseData = parseResult.data - const currentEntries = Array.isArray(responseData) ? responseData : responseData.entries - accumulatedEntries.push(...currentEntries) - - const hasNextPage = - !Array.isArray(responseData) && responseData.collection_info?.has_next_page === true - - if (!hasNextPage) { - clearTimeout(timeoutId) - isRequestResolved = true - resolve({ entries: accumulatedEntries }) - } else { - logger.info( - `Captured paginated response, ${accumulatedEntries.length} entries so far...` - ) - } - } - } catch (_error) { - // Silent catch for JSON parse errors from non-JSON responses - } - }) - }) - } - private async navigateToConversationUrl(page: Page, url: string): Promise { const NAVIGATION_TIMEOUT_MS = 30000 const navigationResponse = await page.goto(url, { @@ -317,11 +240,6 @@ export class ConversationExtractor { .safeParse(formattedEntries) if (!entriesValidationResult.success) { - if (formattedEntries.length === 0) { - this.diagnostics - .writeFailure({ url: conversationUrl, errorType: 'empty_entries' }) - .catch(() => {}) - } logger.warn( `Entry validation failed for ${conversationUrl}: ${entriesValidationResult.error.message}` ) @@ -369,8 +287,7 @@ export class ConversationExtractor { if (dataObject && Array.isArray(dataObject.entries)) return dataObject.entries as unknown[] if (dataObject && (dataObject.query_str || dataObject.blocks)) return [data] - this.diagnostics.writeFailure({ url, errorType: 'unknown_shape' }).catch(() => {}) - + logger.warn(`Unknown API response shape for ${url}`) return [] } @@ -397,9 +314,12 @@ export class ConversationExtractor { } let answer = '' + const seenAnswers = new Set() for (const block of entry.blocks ?? []) { - if (block.markdown_block?.answer) { - answer += block.markdown_block.answer + '\n\n' + const blockAnswer = block.markdown_block?.answer + if (blockAnswer && !seenAnswers.has(blockAnswer)) { + seenAnswers.add(blockAnswer) + answer += blockAnswer + '\n\n' } } diff --git a/src/scraper/library-discovery.ts b/src/scraper/library-discovery.ts index e896cd8..58f2848 100644 --- a/src/scraper/library-discovery.ts +++ b/src/scraper/library-discovery.ts @@ -27,22 +27,30 @@ const VERSIONED_URL_PATTERNS = [ interface RawThread { uuid: string - slug: string title: string - query_str: string - first_answer: string - answer_preview: string - last_query_datetime: string - mode: string - status: string - display_model: string - thread_access: number - has_next_page: boolean - total_threads: number - collection: Collection | null - sources: string[] - query_count: number - search_focus: string + // New format (2026+) + link?: string + variant?: string + unread?: boolean + context_uuid?: string + task_description?: string | null + mode_type?: number + // Old format (pre-2026, may still appear) + slug?: string + query_str?: string + first_answer?: string + answer_preview?: string | null + last_query_datetime?: string + mode?: string + status?: string + display_model?: string + thread_access?: number + has_next_page?: boolean + total_threads?: number + collection?: Collection | null + sources?: string[] + query_count?: number + search_focus?: string [key: string]: unknown } @@ -57,20 +65,28 @@ export interface ConversationMeta { id: string url: string uuid: string - slug: string title: string - query_str: string - first_answer: string - answer_preview: string - last_query_datetime: string - mode: string - status: string - display_model: string - thread_access: number - collection: Collection | null - sources: string[] - query_count: number - search_focus: string + status?: string + // New format + link?: string + variant?: string + unread?: boolean + context_uuid?: string + task_description?: string | null + mode_type?: number + // Old format + slug?: string + query_str?: string + first_answer?: string + answer_preview?: string | null + last_query_datetime?: string + mode?: string + display_model?: string + thread_access?: number + collection?: Collection | null + sources?: string[] + query_count?: number + search_focus?: string [key: string]: unknown } @@ -88,10 +104,11 @@ function extractVersionFromUrl(url: string): string | null { } function rawThreadToConversationMeta(thread: RawThread): ConversationMeta { + const path = thread.link ?? `/search/${thread.slug ?? thread.uuid}` return { ...thread, id: thread.uuid, - url: `${BASE_URL}/search/${thread.slug}`, + url: `${BASE_URL}${path}`, } } @@ -185,7 +202,7 @@ async function fetchThreadBatch( } const threads = parsed as RawThread[] - const total = threads[0]?.total_threads ?? 0 + const total = (threads[0]?.total_threads as number | undefined) ?? 0 return { threads, From 69e499e5d6259ca39b8c559dc9f9d51cdbd55e55 Mon Sep 17 00:00:00 2001 From: Dave Jordan Date: Wed, 3 Jun 2026 23:01:01 +0200 Subject: [PATCH 2/5] chore: ignore AI tooling directories --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 2b8b5a9..88ac9fa 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,8 @@ coverage/ exports* debug/ logs/*.txt + +# AI tooling +.claude/ +.wolf/ +CLAUDE.md From 7cfba60fc8ecfa5861f4f5b20a181e3ec82dffad Mon Sep 17 00:00:00 2001 From: Dave Jordan Date: Wed, 3 Jun 2026 23:13:15 +0200 Subject: [PATCH 3/5] test: update conversation-extractor tests for new API approach --- test/unit/conversation-extractor.unit.test.ts | 207 ++++++++++++++---- 1 file changed, 170 insertions(+), 37 deletions(-) diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts index d087a01..3ea62df 100644 --- a/test/unit/conversation-extractor.unit.test.ts +++ b/test/unit/conversation-extractor.unit.test.ts @@ -1,17 +1,6 @@ import { describe, it, expect, vi, beforeEach } from 'vitest' import { ConversationExtractor } from '../../src/scraper/conversation-extractor.js' -import { ApiDiagnosticsWriter } from '../../src/utils/api-diagnostics.js' -import type { BrowserContext } from '@playwright/test' - -vi.mock('../../src/utils/api-diagnostics.js', () => { - return { - ApiDiagnosticsWriter: vi.fn().mockImplementation(function () { - return { - writeFailure: vi.fn().mockResolvedValue(undefined), - } - }), - } -}) +import type { BrowserContext, Page } from '@playwright/test' describe('ConversationExtractor (Unit)', () => { let extractor: ConversationExtractor @@ -31,65 +20,209 @@ describe('ConversationExtractor (Unit)', () => { vi.clearAllMocks() }) + // ─── ensureEntriesFormat ────────────────────────────────────────────────────── + describe('ensureEntriesFormat', () => { - it('should return array if input is array', () => { + it('returns array as-is when input is already an array', () => { const data = [{ query_str: 'test' }] const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') expect(result).toEqual(data) - expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled() }) - it('should return data.entries if input has entries array', () => { + it('returns data.entries when input has an entries array', () => { const data = { entries: [{ query_str: 'test' }] } const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') expect(result).toEqual(data.entries) - expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled() }) - it('should return [data] if input has query_str', () => { + it('wraps single entry object when it has query_str', () => { const data = { query_str: 'test' } const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') expect(result).toEqual([data]) - expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled() }) - it('should return empty array and call diagnostics for unknown shape', () => { - const data = { foo: 'bar' } + it('wraps single entry object when it has blocks', () => { + const data = { blocks: [] } const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com') + expect(result).toEqual([data]) + }) + + it('returns empty array for unknown shape', () => { + const result = (extractor as any).ensureEntriesFormat({ foo: 'bar' }, 'http://test.com') expect(result).toEqual([]) - expect((extractor as any).diagnostics.writeFailure).toHaveBeenCalledWith({ - url: 'http://test.com', - errorType: 'unknown_shape', - }) }) }) + // ─── parseConversationData ──────────────────────────────────────────────────── + describe('parseConversationData', () => { - it('should return null and call diagnostics if entries are empty', () => { - const data = { entries: [] } - const result = extractor.parseConversationData(data, 'http://test.com') + it('returns null when entries array is empty', () => { + const result = (extractor as any).parseConversationData( + { entries: [] }, + 'https://www.perplexity.ai/search/test-id' + ) expect(result).toBeNull() - expect((extractor as any).diagnostics.writeFailure).toHaveBeenCalledWith({ - url: 'http://test.com', - errorType: 'empty_entries', - }) }) - it('should parse valid entries correctly', () => { + it('parses new-format entries (blocks with markdown_block.answer)', () => { const data = { entries: [ { thread_title: 'Test Thread', query_str: 'What is 1+1?', - blocks: [{ markdown_block: { answer: '2' } }], + updated_datetime: '2026-01-01T00:00:00Z', + blocks: [ + { intended_usage: 'plan', plan_block: {} }, + { + intended_usage: 'ask_text_0_markdown', + markdown_block: { answer: 'The answer is 2.' }, + }, + ], }, ], } - const result = extractor.parseConversationData(data, 'https://perplexity.ai/search/uuid') + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) expect(result).not.toBeNull() - expect(result?.title).toBe('Test Thread') - expect(result?.content).toContain('What is 1+1?') - expect(result?.content).toContain('2') + expect(result.title).toBe('Test Thread') + expect(result.content).toContain('What is 1+1?') + expect(result.content).toContain('The answer is 2.') + }) + + it('deduplicates identical answers from ask_text and ask_text_0_markdown blocks', () => { + const answer = 'The answer is 2.' + const data = { + entries: [ + { + thread_title: 'Dedupe Test', + query_str: 'What is 1+1?', + blocks: [ + { intended_usage: 'ask_text_0_markdown', markdown_block: { answer } }, + { intended_usage: 'ask_text', markdown_block: { answer } }, + ], + }, + ], + } + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result).not.toBeNull() + const occurrences = (result.content.match(/The answer is 2\./g) ?? []).length + expect(occurrences).toBe(1) + }) + + it('extracts space name from collection_info', () => { + const data = { + entries: [ + { + thread_title: 'Test', + query_str: 'Question', + collection_info: { title: 'HomeLab' }, + blocks: [ + { intended_usage: 'ask_text_0_markdown', markdown_block: { answer: 'Answer' } }, + ], + }, + ], + } + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result?.spaceName).toBe('HomeLab') + }) + + it('handles multi-turn conversations', () => { + const data = { + entries: [ + { + thread_title: 'Multi-turn', + query_str: 'First question', + blocks: [ + { intended_usage: 'ask_text_0_markdown', markdown_block: { answer: 'First answer' } }, + ], + }, + { + query_str: 'Follow-up question', + blocks: [ + { + intended_usage: 'ask_text_0_markdown', + markdown_block: { answer: 'Follow-up answer' }, + }, + ], + }, + ], + } + const result = (extractor as any).parseConversationData( + data, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result?.messages).toHaveLength(4) + expect(result?.messages[0]).toEqual({ role: 'user', content: 'First question' }) + expect(result?.messages[1]).toEqual({ role: 'assistant', content: 'First answer' }) + expect(result?.messages[2]).toEqual({ role: 'user', content: 'Follow-up question' }) + expect(result?.messages[3]).toEqual({ role: 'assistant', content: 'Follow-up answer' }) + }) + }) + + // ─── fetchThreadData ────────────────────────────────────────────────────────── + + describe('fetchThreadData', () => { + const makePage = (status: number, body: string): Page => + ({ evaluate: vi.fn().mockResolvedValue({ status, body }) }) as unknown as Page + + it('returns parsed JSON on HTTP 200', async () => { + const payload = { + entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], + background_entries: [], + } + const page = makePage(200, JSON.stringify(payload)) + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual(payload) + }) + + it('throws AuthError on HTTP 401', async () => { + const page = makePage(401, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.AuthError + ) + }) + + it('throws AuthError on HTTP 403', async () => { + const page = makePage(403, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.AuthError + ) + }) + + it('throws NotFoundError on HTTP 404', async () => { + const page = makePage(404, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.NotFoundError + ) + }) + + it('throws ServerError on HTTP 500', async () => { + const page = makePage(500, '{}') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.ServerError + ) + }) + + it('throws ExtractionError on invalid JSON body', async () => { + const page = makePage(200, 'not-json') + await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow( + ConversationExtractor.ExtractionError + ) + }) + + it('calls the correct API URL with thread ID', async () => { + const page = makePage(200, '{"entries":[],"background_entries":[]}') + await (extractor as any).fetchThreadData(page, 'my-thread-id').catch(() => {}) + const callArg = (page.evaluate as any).mock.calls[0][1] + expect(callArg.url).toContain('/rest/thread/my-thread-id') }) }) }) From cce6218c1d5dd70c22123b095b856c2e8ef1654e Mon Sep 17 00:00:00 2001 From: DaveG7 Date: Thu, 4 Jun 2026 22:16:38 +0200 Subject: [PATCH 4/5] fix(extractor): restore response validation + diagnostics, justify direct fetch Address PR #39 review feedback on conversation-extractor: - Restore ApiResponseSchema, validated against a live 2026 /rest/thread/{id} response. Pagination is the top-level has_next_page/next_cursor pair (not collection_info, which is the list endpoint). Diagnose-and-continue: shape drift writes a diagnostic and falls through to the per-entry EntrySchema gate. - Restore ApiDiagnosticsWriter calls (zod_error / unknown_shape / empty_entries) so the debug/api-diagnostics.jsonl path the REPL references works again. - Keep the page.evaluate()+fetch approach for consistency with library-discovery (the response-listener was the lone divergent /rest/ path); replace hardcoded version=2.18 with shared DEFAULT_API_VERSION. - Remove dead adaptive-timeout no-ops (reduceTimeout/recoverTimeout) and their now-unused worker-pool callers. Co-Authored-By: Claude Opus 4.8 --- src/scraper/api-version.ts | 7 ++ src/scraper/conversation-extractor.ts | 76 +++++++++++++++++-- src/scraper/library-discovery.ts | 7 +- src/scraper/worker-pool.ts | 17 +---- test/unit/conversation-extractor.unit.test.ts | 71 +++++++++++++++++ 5 files changed, 154 insertions(+), 24 deletions(-) create mode 100644 src/scraper/api-version.ts diff --git a/src/scraper/api-version.ts b/src/scraper/api-version.ts new file mode 100644 index 0000000..64033e6 --- /dev/null +++ b/src/scraper/api-version.ts @@ -0,0 +1,7 @@ +/** + * Perplexity versions its REST endpoints via a `?version=X.Y` query param + * (e.g. `/rest/thread/list_ask_threads?version=2.18`). Both library discovery + * and conversation extraction hit `/rest/` endpoints, so they share this default + * to keep the version in one place rather than duplicating the literal. + */ +export const DEFAULT_API_VERSION = '2.18' diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts index 77c4ef4..ca31725 100644 --- a/src/scraper/conversation-extractor.ts +++ b/src/scraper/conversation-extractor.ts @@ -5,6 +5,8 @@ import { type Page, type BrowserContext, type Response } from '@playwright/test' import { logger } from '../utils/logger.js' import { waitStrategy } from '../utils/wait-strategy.js' import { type Config } from '../utils/config.js' +import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js' +import { DEFAULT_API_VERSION } from './api-version.js' export interface ConversationMessage { role: 'user' | 'assistant' @@ -43,6 +45,33 @@ export class ConversationExtractor { blocks: z.array(ConversationExtractor.BlockSchema).optional(), }) + /** + * Validates the top-level shape of the `/rest/thread/{id}` HTTP response, + * confirmed against a live 2026 response. The endpoint returns either a bare + * array of entries or an object wrapping them; pagination is signalled by the + * top-level `has_next_page` / `next_cursor` pair. Fields are optional and the + * object is non-strict, so unknown/new keys don't reject an otherwise-valid + * response — shape drift is surfaced via diagnostics, and `EntrySchema` + * remains the per-entry fallback downstream. + */ + private static readonly ApiResponseSchema = z.union([ + z.array(ConversationExtractor.EntrySchema), + z.object({ + entries: z.array(ConversationExtractor.EntrySchema), + background_entries: z.array(z.unknown()).optional(), + has_next_page: z.boolean().optional(), + next_cursor: z.string().nullable().optional(), + status: z.string().optional(), + thread_metadata: z.unknown().optional(), + // Legacy/list-style pagination — kept for backward compatibility. + collection_info: z + .object({ + has_next_page: z.boolean().optional(), + }) + .optional(), + }), + ]) + static readonly ExtractionError = class extends Error { constructor(message: string) { super(message) @@ -92,14 +121,14 @@ export class ConversationExtractor { } } + private readonly diagnostics: ApiDiagnosticsWriter + constructor( private readonly config: Config, private readonly context: BrowserContext - ) {} - - reduceTimeout(): void {} - - recoverTimeout(): void {} + ) { + this.diagnostics = new ApiDiagnosticsWriter(config) + } async extract(conversationUrl: string): Promise { await this.ensureContextIsAlive() @@ -138,7 +167,7 @@ export class ConversationExtractor { } private async fetchThreadData(page: Page, threadId: string): Promise { - const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=2.18&source=default` + const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default` const raw = await page.evaluate( async ({ url }: { url: string }) => { @@ -164,11 +193,38 @@ export class ConversationExtractor { throw new ConversationExtractor.ExtractionError(`Thread API returned HTTP ${raw.status}`) } + let parsed: unknown try { - return JSON.parse(raw.body) + parsed = JSON.parse(raw.body) } catch { throw new ConversationExtractor.ExtractionError(`Thread API: invalid JSON for ${threadId}`) } + + // Validate the HTTP response shape. On drift we record a diagnostic and fall + // through — the per-entry EntrySchema downstream is the resilient fallback, + // so a shape change is surfaced without breaking extraction. + const shapeResult = ConversationExtractor.ApiResponseSchema.safeParse(parsed) + if (!shapeResult.success) { + logger.warn(`[extractor] thread ${threadId}: unexpected response shape (see api-diagnostics)`) + this.diagnostics + .writeFailure({ + url: apiUrl, + errorType: 'zod_error', + zodErrorPaths: shapeResult.error.issues.map((issue) => issue.path.join('.')), + }) + .catch(() => {}) + } else if (!Array.isArray(shapeResult.data)) { + const data = shapeResult.data + const hasNextPage = data.has_next_page ?? data.collection_info?.has_next_page + if (hasNextPage === true) { + logger.warn( + `[extractor] thread ${threadId}: response reports has_next_page; ` + + 'additional pages are not fetched and content may be truncated' + ) + } + } + + return parsed } private async ensureContextIsAlive(): Promise { @@ -240,6 +296,11 @@ export class ConversationExtractor { .safeParse(formattedEntries) if (!entriesValidationResult.success) { + if (formattedEntries.length === 0) { + this.diagnostics + .writeFailure({ url: conversationUrl, errorType: 'empty_entries' }) + .catch(() => {}) + } logger.warn( `Entry validation failed for ${conversationUrl}: ${entriesValidationResult.error.message}` ) @@ -288,6 +349,7 @@ export class ConversationExtractor { if (dataObject && (dataObject.query_str || dataObject.blocks)) return [data] logger.warn(`Unknown API response shape for ${url}`) + this.diagnostics.writeFailure({ url, errorType: 'unknown_shape' }).catch(() => {}) return [] } diff --git a/src/scraper/library-discovery.ts b/src/scraper/library-discovery.ts index 58f2848..db73c96 100644 --- a/src/scraper/library-discovery.ts +++ b/src/scraper/library-discovery.ts @@ -1,5 +1,6 @@ import type { Page } from '@playwright/test' import { logger } from '../utils/logger.js' +import { DEFAULT_API_VERSION } from './api-version.js' // ─── Constants ─────────────────────────────────────────────────────────────── @@ -120,13 +121,13 @@ async function detectApiVersion(page: Page): Promise { (res) => VERSIONED_URL_PATTERNS.some((p) => res.url().includes(p)) && res.status() === 200, { timeout: 15_000 } ) - const version = extractVersionFromUrl(response.url()) ?? '2.18' + const version = extractVersionFromUrl(response.url()) ?? DEFAULT_API_VERSION const pathname = new URL(response.url()).pathname logger.debug(`Detected API version: ${version} (from ${pathname})`) return version } catch { - logger.debug('Version detection timeout — using fallback 2.18') - return '2.18' + logger.debug(`Version detection timeout — using fallback ${DEFAULT_API_VERSION}`) + return DEFAULT_API_VERSION } } diff --git a/src/scraper/worker-pool.ts b/src/scraper/worker-pool.ts index e64f3b1..d0adf56 100644 --- a/src/scraper/worker-pool.ts +++ b/src/scraper/worker-pool.ts @@ -92,14 +92,13 @@ export class WorkerPool { ): Promise { try { const result = await worker.extractor.extract(item.meta.url) - await this.handleSuccess(worker, item.meta, result) + await this.handleSuccess(item.meta, result) } catch (error) { - await this.handleFailure(worker, item, queue, error) + await this.handleFailure(item, queue, error) } } private async handleSuccess( - worker: ExtractionWorker, meta: ConversationMeta, result: Awaited> ): Promise { @@ -115,22 +114,12 @@ export class WorkerPool { this.checkpointManager.markAsProcessed(meta.id, result.contentHash) logger.info(`${progressLabel} Processed: ${result.title}`) } - - worker.extractor.recoverTimeout() } - private async handleFailure( - worker: ExtractionWorker, - item: QueueItem, - queue: QueueItem[], - error: unknown - ): Promise { - const isTimeout = error instanceof Error && error.message.includes('API response timeout') + private async handleFailure(item: QueueItem, queue: QueueItem[], error: unknown): Promise { const isContextLost = error instanceof Error && error.message.includes('context is no longer available') - if (isTimeout) worker.extractor.reduceTimeout() - if (isContextLost) { logger.warn('Browser context lost. Refreshing worker context...') await this.refreshContext() diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts index 3ea62df..25a9ac1 100644 --- a/test/unit/conversation-extractor.unit.test.ts +++ b/test/unit/conversation-extractor.unit.test.ts @@ -1,5 +1,6 @@ import { describe, it, expect, vi, beforeEach } from 'vitest' import { ConversationExtractor } from '../../src/scraper/conversation-extractor.js' +import { logger } from '../../src/utils/logger.js' import type { BrowserContext, Page } from '@playwright/test' describe('ConversationExtractor (Unit)', () => { @@ -224,5 +225,75 @@ describe('ConversationExtractor (Unit)', () => { const callArg = (page.evaluate as any).mock.calls[0][1] expect(callArg.url).toContain('/rest/thread/my-thread-id') }) + + it('returns valid data and writes no diagnostic when the shape matches', async () => { + // Mirrors the real 2026 /rest/thread/{id} shape: top-level entries + + // background_entries + has_next_page/next_cursor + extra metadata keys. + const payload = { + entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], + background_entries: [], + has_next_page: false, + next_cursor: null, + status: 'success', + thread_metadata: { title: 'Test' }, + } + const writeFailure = vi.spyOn((extractor as any).diagnostics, 'writeFailure') + const page = makePage(200, JSON.stringify(payload)) + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual(payload) + expect(writeFailure).not.toHaveBeenCalled() + }) + + it('warns (without failing) when the response reports top-level has_next_page', async () => { + const warn = vi.spyOn(logger, 'warn').mockImplementation(() => {}) + const payload = { + entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], + has_next_page: true, + next_cursor: 'cursor-abc', + } + const page = makePage(200, JSON.stringify(payload)) + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual(payload) + expect(warn).toHaveBeenCalledWith(expect.stringContaining('has_next_page')) + }) + + it('still returns the body but records a zod_error diagnostic on shape drift', async () => { + const writeFailure = vi + .spyOn((extractor as any).diagnostics, 'writeFailure') + .mockResolvedValue(undefined) + // entries must be an array of objects; a string violates the schema + const page = makePage(200, '{"entries":"not-an-array"}') + const result = await (extractor as any).fetchThreadData(page, 'abc-123') + expect(result).toEqual({ entries: 'not-an-array' }) + expect(writeFailure).toHaveBeenCalledWith(expect.objectContaining({ errorType: 'zod_error' })) + }) + }) + + // ─── diagnostics on parse failures ────────────────────────────────────────── + + describe('diagnostics', () => { + it('writes an unknown_shape diagnostic for an unrecognised response shape', () => { + const writeFailure = vi + .spyOn((extractor as any).diagnostics, 'writeFailure') + .mockResolvedValue(undefined) + ;(extractor as any).ensureEntriesFormat({ foo: 'bar' }, 'http://test.com') + expect(writeFailure).toHaveBeenCalledWith( + expect.objectContaining({ errorType: 'unknown_shape' }) + ) + }) + + it('writes an empty_entries diagnostic when there are no entries to validate', () => { + const writeFailure = vi + .spyOn((extractor as any).diagnostics, 'writeFailure') + .mockResolvedValue(undefined) + const result = (extractor as any).parseConversationData( + { foo: 'bar' }, + 'https://www.perplexity.ai/search/test-id' + ) + expect(result).toBeNull() + expect(writeFailure).toHaveBeenCalledWith( + expect.objectContaining({ errorType: 'empty_entries' }) + ) + }) }) }) From 3f1f1de06b7078e5358edc9858899d64835ad854 Mon Sep 17 00:00:00 2001 From: DaveG7 Date: Thu, 4 Jun 2026 23:13:11 +0200 Subject: [PATCH 5/5] feat(extractor): paginate long threads via cursor accumulation A live thread with 212 entries paginates across 3 pages of ~99; the new single-fetch path only returned page 1, truncating long conversations. fetchThreadData now keeps the single-fetch fast path for normal threads and, when has_next_page is true, follows the top-level next_cursor (same URL + &cursor=) accumulating entries in API order until the thread is complete, capped at 50 pages. Split the per-page fetch+validate into fetchThreadPage. This restores the long-thread coverage the old response listener provided. Co-Authored-By: Claude Opus 4.8 --- src/scraper/conversation-extractor.ts | 77 ++++++++++++++++--- test/unit/conversation-extractor.unit.test.ts | 38 +++++++-- 2 files changed, 99 insertions(+), 16 deletions(-) diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts index ca31725..62b961a 100644 --- a/src/scraper/conversation-extractor.ts +++ b/src/scraper/conversation-extractor.ts @@ -166,8 +166,72 @@ export class ConversationExtractor { } } + // Safety cap on cursor pagination (~99 entries/page, so ~5k entries). + private static readonly MAX_PAGES = 50 + private async fetchThreadData(page: Page, threadId: string): Promise { - const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default` + const firstParsed = await this.fetchThreadPage(page, threadId, null) + const firstObj = + firstParsed && !Array.isArray(firstParsed) ? (firstParsed as Record) : null + + let cursor = typeof firstObj?.next_cursor === 'string' ? firstObj.next_cursor : null + + // Fast path: the whole thread fit in one response (the common case). + if (firstObj?.has_next_page !== true || !cursor) { + return firstParsed + } + + // Long thread: walk the cursor pages and accumulate entries in API order + // (the endpoint paginates in conversation order — the same order the previous + // response-listener relied on, so no reordering is needed here). + const entries: unknown[] = Array.isArray(firstObj.entries) ? [...firstObj.entries] : [] + const backgroundEntries: unknown[] = Array.isArray(firstObj.background_entries) + ? [...firstObj.background_entries] + : [] + let pageCount = 1 + + while (cursor && pageCount < ConversationExtractor.MAX_PAGES) { + const parsed = await this.fetchThreadPage(page, threadId, cursor) + const obj = Array.isArray(parsed) + ? { entries: parsed as unknown[] } + : (parsed as Record) + + if (Array.isArray(obj.entries)) entries.push(...obj.entries) + if (Array.isArray(obj.background_entries)) backgroundEntries.push(...obj.background_entries) + pageCount++ + + cursor = + obj.has_next_page === true && typeof obj.next_cursor === 'string' ? obj.next_cursor : null + } + + if (cursor) { + logger.warn( + `[extractor] thread ${threadId}: stopped at the ${ConversationExtractor.MAX_PAGES}-page ` + + 'cap; export may be truncated' + ) + } + logger.debug( + `[extractor] thread ${threadId}: assembled ${entries.length} entries from ${pageCount} pages` + ) + + // Preserve the first page's top-level metadata, but with the full entry set. + return { + ...firstObj, + entries, + background_entries: backgroundEntries, + has_next_page: false, + next_cursor: null, + } + } + + /** Fetches and validates a single page of the thread API (cursor = null for page 1). */ + private async fetchThreadPage( + page: Page, + threadId: string, + cursor: string | null + ): Promise { + let apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default` + if (cursor) apiUrl += `&cursor=${encodeURIComponent(cursor)}` const raw = await page.evaluate( async ({ url }: { url: string }) => { @@ -178,7 +242,7 @@ export class ConversationExtractor { { url: apiUrl } ) - logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}`) + logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}${cursor ? ' (paged)' : ''}`) if (raw.status === 401 || raw.status === 403) { throw new ConversationExtractor.AuthError(`Authentication required (${raw.status})`) @@ -213,15 +277,6 @@ export class ConversationExtractor { zodErrorPaths: shapeResult.error.issues.map((issue) => issue.path.join('.')), }) .catch(() => {}) - } else if (!Array.isArray(shapeResult.data)) { - const data = shapeResult.data - const hasNextPage = data.has_next_page ?? data.collection_info?.has_next_page - if (hasNextPage === true) { - logger.warn( - `[extractor] thread ${threadId}: response reports has_next_page; ` + - 'additional pages are not fetched and content may be truncated' - ) - } } return parsed diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts index 25a9ac1..484f29e 100644 --- a/test/unit/conversation-extractor.unit.test.ts +++ b/test/unit/conversation-extractor.unit.test.ts @@ -244,17 +244,45 @@ describe('ConversationExtractor (Unit)', () => { expect(writeFailure).not.toHaveBeenCalled() }) - it('warns (without failing) when the response reports top-level has_next_page', async () => { - const warn = vi.spyOn(logger, 'warn').mockImplementation(() => {}) + it('returns the single page unchanged when there is no next page', async () => { const payload = { entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }], - has_next_page: true, - next_cursor: 'cursor-abc', + background_entries: [], + has_next_page: false, + next_cursor: null, } const page = makePage(200, JSON.stringify(payload)) const result = await (extractor as any).fetchThreadData(page, 'abc-123') expect(result).toEqual(payload) - expect(warn).toHaveBeenCalledWith(expect.stringContaining('has_next_page')) + expect(page.evaluate as any).toHaveBeenCalledTimes(1) + }) + + it('accumulates entries across cursor pages until has_next_page is false', async () => { + const page1 = { + entries: [{ query_str: 'q1', blocks: [] }], + background_entries: [], + has_next_page: true, + next_cursor: 'cur-1', + } + const page2 = { + entries: [{ query_str: 'q2', blocks: [] }], + background_entries: [], + has_next_page: false, + next_cursor: null, + } + const evaluate = vi + .fn() + .mockResolvedValueOnce({ status: 200, body: JSON.stringify(page1) }) + .mockResolvedValueOnce({ status: 200, body: JSON.stringify(page2) }) + const page = { evaluate } as unknown as Page + + const result: any = await (extractor as any).fetchThreadData(page, 'abc-123') + + expect(evaluate).toHaveBeenCalledTimes(2) + expect(result.entries.map((e: any) => e.query_str)).toEqual(['q1', 'q2']) + expect(result.has_next_page).toBe(false) + // the second request must carry the cursor returned by page 1 + expect((evaluate.mock.calls[1][1] as any).url).toContain('cursor=cur-1') }) it('still returns the body but records a zod_error diagnostic on shape drift', async () => {