From ddfa1651092334cc73f3f02995347e41dcfca387 Mon Sep 17 00:00:00 2001
From: Dave Jordan <gd.giordi@gmail.com>
Date: Wed, 3 Jun 2026 22:59:27 +0200
Subject: [PATCH 1/5] fix: adapt scraper to Perplexity API changes (2026)

---
 src/scraper/conversation-extractor.ts | 170 +++++++-------------------
 src/scraper/library-discovery.ts      |  77 +++++++-----
 2 files changed, 92 insertions(+), 155 deletions(-)
diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts
index 0a3102a..77c4ef4 100644
--- a/src/scraper/conversation-extractor.ts
+++ b/src/scraper/conversation-extractor.ts
@@ -4,7 +4,6 @@ import { z } from 'zod'
 import { type Page, type BrowserContext, type Response } from '@playwright/test'
 import { logger } from '../utils/logger.js'
 import { waitStrategy } from '../utils/wait-strategy.js'
-import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js'
 import { type Config } from '../utils/config.js'
 
 export interface ConversationMessage {
@@ -44,19 +43,6 @@ export class ConversationExtractor {
     blocks: z.array(ConversationExtractor.BlockSchema).optional(),
   })
 
-  private static readonly ApiResponseSchema = z.union([
-    z.array(ConversationExtractor.EntrySchema),
-    z.object({
-      entries: z.array(ConversationExtractor.EntrySchema),
-      background_entries: z.array(z.unknown()).optional(),
-      collection_info: z
-        .object({
-          has_next_page: z.boolean().optional(),
-        })
-        .optional(),
-    }),
-  ])
-
   static readonly ExtractionError = class extends Error {
     constructor(message: string) {
       super(message)
@@ -106,35 +92,14 @@ export class ConversationExtractor {
     }
   }
 
-  private static readonly TIMEOUT_MAX_MS = 30_000
-  private static readonly TIMEOUT_MIN_MS = 8_000
-  private static readonly TIMEOUT_STEP_DOWN_MS = 3_000
-  private static readonly TIMEOUT_STEP_UP_MS = 1_000
-
-  private currentTimeoutMs = ConversationExtractor.TIMEOUT_MAX_MS
-  private readonly diagnostics: ApiDiagnosticsWriter
-
   constructor(
     private readonly config: Config,
     private readonly context: BrowserContext
-  ) {
-    this.diagnostics = new ApiDiagnosticsWriter(config)
-  }
+  ) {}
 
-  reduceTimeout(): void {
-    this.currentTimeoutMs = Math.max(
-      ConversationExtractor.TIMEOUT_MIN_MS,
-      this.currentTimeoutMs - ConversationExtractor.TIMEOUT_STEP_DOWN_MS
-    )
-    logger.debug(`[extractor] timeout reduced to ${this.currentTimeoutMs}ms`)
-  }
+  reduceTimeout(): void {}
 
-  recoverTimeout(): void {
-    this.currentTimeoutMs = Math.min(
-      ConversationExtractor.TIMEOUT_MAX_MS,
-      this.currentTimeoutMs + ConversationExtractor.TIMEOUT_STEP_UP_MS
-    )
-  }
+  recoverTimeout(): void {}
 
   async extract(conversationUrl: string): Promise<ExtractedConversation> {
     await this.ensureContextIsAlive()
@@ -147,16 +112,12 @@ export class ConversationExtractor {
       throw new ConversationExtractor.ExtractionError(`Failed to create new page: ${errorMessage}`)
     }
 
-    const apiResponsePromise = this.captureConversationApiResponse(conversationPage)
-
     try {
       await this.navigateToConversationUrl(conversationPage, conversationUrl)
       await waitStrategy(this.config).afterScroll(conversationPage)
 
-      const capturedApiData = await apiResponsePromise
-      if (!capturedApiData) {
-        throw new ConversationExtractor.NoDataError('API response timeout or not found')
-      }
+      const conversationId = this.extractIdFromUrl(conversationUrl)
+      const capturedApiData = await this.fetchThreadData(conversationPage, conversationId)
 
       const extractedConversation = this.parseConversationData(capturedApiData, conversationUrl)
       if (!extractedConversation) {
@@ -176,6 +137,40 @@ export class ConversationExtractor {
     }
   }
 
+  private async fetchThreadData(page: Page, threadId: string): Promise<unknown> {
+    const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=2.18&source=default`
+
+    const raw = await page.evaluate(
+      async ({ url }: { url: string }) => {
+        const res = await fetch(url, { credentials: 'include' })
+        const text = await res.text()
+        return { status: res.status, body: text }
+      },
+      { url: apiUrl }
+    )
+
+    logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}`)
+
+    if (raw.status === 401 || raw.status === 403) {
+      throw new ConversationExtractor.AuthError(`Authentication required (${raw.status})`)
+    }
+    if (raw.status === 404) {
+      throw new ConversationExtractor.NotFoundError('Thread not found (404)')
+    }
+    if (raw.status >= 500) {
+      throw new ConversationExtractor.ServerError(`Server error (${raw.status})`)
+    }
+    if (raw.status !== 200) {
+      throw new ConversationExtractor.ExtractionError(`Thread API returned HTTP ${raw.status}`)
+    }
+
+    try {
+      return JSON.parse(raw.body)
+    } catch {
+      throw new ConversationExtractor.ExtractionError(`Thread API: invalid JSON for ${threadId}`)
+    }
+  }
+
   private async ensureContextIsAlive(): Promise<void> {
     if (!this.context) {
       throw new ConversationExtractor.ExtractionError('Browser context is missing')
@@ -187,78 +182,6 @@ export class ConversationExtractor {
     }
   }
 
-  private captureConversationApiResponse(page: Page): Promise<unknown> {
-    const accumulatedEntries: unknown[] = []
-    let isRequestResolved = false
-
-    return new Promise((resolve) => {
-      const timeoutId = setTimeout(() => {
-        if (!isRequestResolved) {
-          if (accumulatedEntries.length > 0) {
-            logger.info(
-              `API response timeout – resolving with ${accumulatedEntries.length} accumulated entries`
-            )
-            resolve({ entries: accumulatedEntries })
-          } else {
-            logger.warn('API response timeout – resolving with null')
-            resolve(null)
-          }
-          isRequestResolved = true
-        }
-      }, this.currentTimeoutMs)
-
-      page.on('response', async (response: Response) => {
-        if (isRequestResolved) return
-
-        const responseUrl = response.url()
-        const isThreadApiRequest = responseUrl.includes('/rest/thread/')
-        const isListRequest =
-          responseUrl.includes('list_ask_threads') ||
-          responseUrl.includes('list_recent') ||
-          responseUrl.includes('list_pinned')
-
-        if (!isThreadApiRequest || isListRequest) return
-        if (page.isClosed()) return
-
-        try {
-          const jsonResponse = await response.json()
-          if (isRequestResolved) return
-
-          const parseResult = ConversationExtractor.ApiResponseSchema.safeParse(jsonResponse)
-
-          if (!parseResult.success) {
-            this.diagnostics
-              .writeFailure({
-                url: response.url(),
-                errorType: 'zod_error',
-                zodErrorPaths: parseResult.error.issues.map((issue) => issue.path.join('.')),
-              })
-              .catch(() => {})
-          } else {
-            const responseData = parseResult.data
-            const currentEntries = Array.isArray(responseData) ? responseData : responseData.entries
-            accumulatedEntries.push(...currentEntries)
-
-            const hasNextPage =
-              !Array.isArray(responseData) && responseData.collection_info?.has_next_page === true
-
-            if (!hasNextPage) {
-              clearTimeout(timeoutId)
-              isRequestResolved = true
-              resolve({ entries: accumulatedEntries })
-            } else {
-              logger.info(
-                `Captured paginated response, ${accumulatedEntries.length} entries so far...`
-              )
-            }
-          }
-        } catch (_error) {
-          // Silent catch for JSON parse errors from non-JSON responses
-        }
-      })
-    })
-  }
-
   private async navigateToConversationUrl(page: Page, url: string): Promise<void> {
     const NAVIGATION_TIMEOUT_MS = 30000
     const navigationResponse = await page.goto(url, {
@@ -317,11 +240,6 @@ export class ConversationExtractor {
         .safeParse(formattedEntries)
 
       if (!entriesValidationResult.success) {
-        if (formattedEntries.length === 0) {
-          this.diagnostics
-            .writeFailure({ url: conversationUrl, errorType: 'empty_entries' })
-            .catch(() => {})
-        }
         logger.warn(
           `Entry validation failed for ${conversationUrl}: ${entriesValidationResult.error.message}`
         )
@@ -369,8 +287,7 @@ export class ConversationExtractor {
     if (dataObject && Array.isArray(dataObject.entries)) return dataObject.entries as unknown[]
     if (dataObject && (dataObject.query_str || dataObject.blocks)) return [data]
 
-    this.diagnostics.writeFailure({ url, errorType: 'unknown_shape' }).catch(() => {})
-
+    logger.warn(`Unknown API response shape for ${url}`)
     return []
   }
 
@@ -397,9 +314,12 @@ export class ConversationExtractor {
       }
 
       let answer = ''
+      const seenAnswers = new Set<string>()
       for (const block of entry.blocks ?? []) {
-        if (block.markdown_block?.answer) {
-          answer += block.markdown_block.answer + '\n\n'
+        const blockAnswer = block.markdown_block?.answer
+        if (blockAnswer && !seenAnswers.has(blockAnswer)) {
+          seenAnswers.add(blockAnswer)
+          answer += blockAnswer + '\n\n'
         }
       }
 
diff --git a/src/scraper/library-discovery.ts b/src/scraper/library-discovery.ts
index e896cd8..58f2848 100644
--- a/src/scraper/library-discovery.ts
+++ b/src/scraper/library-discovery.ts
@@ -27,22 +27,30 @@ const VERSIONED_URL_PATTERNS = [
 
 interface RawThread {
   uuid: string
-  slug: string
   title: string
-  query_str: string
-  first_answer: string
-  answer_preview: string
-  last_query_datetime: string
-  mode: string
-  status: string
-  display_model: string
-  thread_access: number
-  has_next_page: boolean
-  total_threads: number
-  collection: Collection | null
-  sources: string[]
-  query_count: number
-  search_focus: string
+  // New format (2026+)
+  link?: string
+  variant?: string
+  unread?: boolean
+  context_uuid?: string
+  task_description?: string | null
+  mode_type?: number
+  // Old format (pre-2026, may still appear)
+  slug?: string
+  query_str?: string
+  first_answer?: string
+  answer_preview?: string | null
+  last_query_datetime?: string
+  mode?: string
+  status?: string
+  display_model?: string
+  thread_access?: number
+  has_next_page?: boolean
+  total_threads?: number
+  collection?: Collection | null
+  sources?: string[]
+  query_count?: number
+  search_focus?: string
   [key: string]: unknown
 }
 
@@ -57,20 +65,28 @@ export interface ConversationMeta {
   id: string
   url: string
   uuid: string
-  slug: string
   title: string
-  query_str: string
-  first_answer: string
-  answer_preview: string
-  last_query_datetime: string
-  mode: string
-  status: string
-  display_model: string
-  thread_access: number
-  collection: Collection | null
-  sources: string[]
-  query_count: number
-  search_focus: string
+  status?: string
+  // New format
+  link?: string
+  variant?: string
+  unread?: boolean
+  context_uuid?: string
+  task_description?: string | null
+  mode_type?: number
+  // Old format
+  slug?: string
+  query_str?: string
+  first_answer?: string
+  answer_preview?: string | null
+  last_query_datetime?: string
+  mode?: string
+  display_model?: string
+  thread_access?: number
+  collection?: Collection | null
+  sources?: string[]
+  query_count?: number
+  search_focus?: string
   [key: string]: unknown
 }
 
@@ -88,10 +104,11 @@ function extractVersionFromUrl(url: string): string | null {
 }
 
 function rawThreadToConversationMeta(thread: RawThread): ConversationMeta {
+  const path = thread.link ?? `/search/${thread.slug ?? thread.uuid}`
   return {
     ...thread,
     id: thread.uuid,
-    url: `${BASE_URL}/search/${thread.slug}`,
+    url: `${BASE_URL}${path}`,
   }
 }
 
@@ -185,7 +202,7 @@ async function fetchThreadBatch(
   }
 
   const threads = parsed as RawThread[]
-  const total = threads[0]?.total_threads ?? 0
+  const total = (threads[0]?.total_threads as number | undefined) ?? 0
 
   return {
     threads,

From 69e499e5d6259ca39b8c559dc9f9d51cdbd55e55 Mon Sep 17 00:00:00 2001
From: Dave Jordan <gd.giordi@gmail.com>
Date: Wed, 3 Jun 2026 23:01:01 +0200
Subject: [PATCH 2/5] chore: ignore AI tooling directories

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index 2b8b5a9..88ac9fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,8 @@ coverage/
 exports*
 debug/
 logs/*.txt
+
+# AI tooling
+.claude/
+.wolf/
+CLAUDE.md

From 7cfba60fc8ecfa5861f4f5b20a181e3ec82dffad Mon Sep 17 00:00:00 2001
From: Dave Jordan <gd.giordi@gmail.com>
Date: Wed, 3 Jun 2026 23:13:15 +0200
Subject: [PATCH 3/5] test: update conversation-extractor tests for new API
 approach

---
 test/unit/conversation-extractor.unit.test.ts | 207 ++++++++++++++----
 1 file changed, 170 insertions(+), 37 deletions(-)

diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts
index d087a01..3ea62df 100644
--- a/test/unit/conversation-extractor.unit.test.ts
+++ b/test/unit/conversation-extractor.unit.test.ts
@@ -1,17 +1,6 @@
 import { describe, it, expect, vi, beforeEach } from 'vitest'
 import { ConversationExtractor } from '../../src/scraper/conversation-extractor.js'
-import { ApiDiagnosticsWriter } from '../../src/utils/api-diagnostics.js'
-import type { BrowserContext } from '@playwright/test'
-
-vi.mock('../../src/utils/api-diagnostics.js', () => {
-  return {
-    ApiDiagnosticsWriter: vi.fn().mockImplementation(function () {
-      return {
-        writeFailure: vi.fn().mockResolvedValue(undefined),
-      }
-    }),
-  }
-})
+import type { BrowserContext, Page } from '@playwright/test'
 
 describe('ConversationExtractor (Unit)', () => {
   let extractor: ConversationExtractor
@@ -31,65 +20,209 @@ describe('ConversationExtractor (Unit)', () => {
     vi.clearAllMocks()
   })
 
+  // ─── ensureEntriesFormat ──────────────────────────────────────────────────────
+
   describe('ensureEntriesFormat', () => {
-    it('should return array if input is array', () => {
+    it('returns array as-is when input is already an array', () => {
       const data = [{ query_str: 'test' }]
       const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com')
       expect(result).toEqual(data)
-      expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled()
     })
 
-    it('should return data.entries if input has entries array', () => {
+    it('returns data.entries when input has an entries array', () => {
       const data = { entries: [{ query_str: 'test' }] }
       const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com')
       expect(result).toEqual(data.entries)
-      expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled()
     })
 
-    it('should return [data] if input has query_str', () => {
+    it('wraps single entry object when it has query_str', () => {
       const data = { query_str: 'test' }
       const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com')
       expect(result).toEqual([data])
-      expect((extractor as any).diagnostics.writeFailure).not.toHaveBeenCalled()
     })
 
-    it('should return empty array and call diagnostics for unknown shape', () => {
-      const data = { foo: 'bar' }
+    it('wraps single entry object when it has blocks', () => {
+      const data = { blocks: [] }
       const result = (extractor as any).ensureEntriesFormat(data, 'http://test.com')
+      expect(result).toEqual([data])
+    })
+
+    it('returns empty array for unknown shape', () => {
+      const result = (extractor as any).ensureEntriesFormat({ foo: 'bar' }, 'http://test.com')
       expect(result).toEqual([])
-      expect((extractor as any).diagnostics.writeFailure).toHaveBeenCalledWith({
-        url: 'http://test.com',
-        errorType: 'unknown_shape',
-      })
     })
   })
 
+  // ─── parseConversationData ────────────────────────────────────────────────────
+
   describe('parseConversationData', () => {
-    it('should return null and call diagnostics if entries are empty', () => {
-      const data = { entries: [] }
-      const result = extractor.parseConversationData(data, 'http://test.com')
+    it('returns null when entries array is empty', () => {
+      const result = (extractor as any).parseConversationData(
+        { entries: [] },
+        'https://www.perplexity.ai/search/test-id'
+      )
       expect(result).toBeNull()
-      expect((extractor as any).diagnostics.writeFailure).toHaveBeenCalledWith({
-        url: 'http://test.com',
-        errorType: 'empty_entries',
-      })
     })
 
-    it('should parse valid entries correctly', () => {
+    it('parses new-format entries (blocks with markdown_block.answer)', () => {
       const data = {
         entries: [
           {
             thread_title: 'Test Thread',
             query_str: 'What is 1+1?',
-            blocks: [{ markdown_block: { answer: '2' } }],
+            updated_datetime: '2026-01-01T00:00:00Z',
+            blocks: [
+              { intended_usage: 'plan', plan_block: {} },
+              {
+                intended_usage: 'ask_text_0_markdown',
+                markdown_block: { answer: 'The answer is 2.' },
+              },
+            ],
           },
         ],
       }
-      const result = extractor.parseConversationData(data, 'https://perplexity.ai/search/uuid')
+      const result = (extractor as any).parseConversationData(
+        data,
+        'https://www.perplexity.ai/search/test-id'
+      )
       expect(result).not.toBeNull()
-      expect(result?.title).toBe('Test Thread')
-      expect(result?.content).toContain('What is 1+1?')
-      expect(result?.content).toContain('2')
+      expect(result.title).toBe('Test Thread')
+      expect(result.content).toContain('What is 1+1?')
+      expect(result.content).toContain('The answer is 2.')
+    })
+
+    it('deduplicates identical answers from ask_text and ask_text_0_markdown blocks', () => {
+      const answer = 'The answer is 2.'
+      const data = {
+        entries: [
+          {
+            thread_title: 'Dedupe Test',
+            query_str: 'What is 1+1?',
+            blocks: [
+              { intended_usage: 'ask_text_0_markdown', markdown_block: { answer } },
+              { intended_usage: 'ask_text', markdown_block: { answer } },
+            ],
+          },
+        ],
+      }
+      const result = (extractor as any).parseConversationData(
+        data,
+        'https://www.perplexity.ai/search/test-id'
+      )
+      expect(result).not.toBeNull()
+      const occurrences = (result.content.match(/The answer is 2\./g) ?? []).length
+      expect(occurrences).toBe(1)
+    })
+
+    it('extracts space name from collection_info', () => {
+      const data = {
+        entries: [
+          {
+            thread_title: 'Test',
+            query_str: 'Question',
+            collection_info: { title: 'HomeLab' },
+            blocks: [
+              { intended_usage: 'ask_text_0_markdown', markdown_block: { answer: 'Answer' } },
+            ],
+          },
+        ],
+      }
+      const result = (extractor as any).parseConversationData(
+        data,
+        'https://www.perplexity.ai/search/test-id'
+      )
+      expect(result?.spaceName).toBe('HomeLab')
+    })
+
+    it('handles multi-turn conversations', () => {
+      const data = {
+        entries: [
+          {
+            thread_title: 'Multi-turn',
+            query_str: 'First question',
+            blocks: [
+              { intended_usage: 'ask_text_0_markdown', markdown_block: { answer: 'First answer' } },
+            ],
+          },
+          {
+            query_str: 'Follow-up question',
+            blocks: [
+              {
+                intended_usage: 'ask_text_0_markdown',
+                markdown_block: { answer: 'Follow-up answer' },
+              },
+            ],
+          },
+        ],
+      }
+      const result = (extractor as any).parseConversationData(
+        data,
+        'https://www.perplexity.ai/search/test-id'
+      )
+      expect(result?.messages).toHaveLength(4)
+      expect(result?.messages[0]).toEqual({ role: 'user', content: 'First question' })
+      expect(result?.messages[1]).toEqual({ role: 'assistant', content: 'First answer' })
+      expect(result?.messages[2]).toEqual({ role: 'user', content: 'Follow-up question' })
+      expect(result?.messages[3]).toEqual({ role: 'assistant', content: 'Follow-up answer' })
+    })
+  })
+
+  // ─── fetchThreadData ──────────────────────────────────────────────────────────
+
+  describe('fetchThreadData', () => {
+    const makePage = (status: number, body: string): Page =>
+      ({ evaluate: vi.fn().mockResolvedValue({ status, body }) }) as unknown as Page
+
+    it('returns parsed JSON on HTTP 200', async () => {
+      const payload = {
+        entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }],
+        background_entries: [],
+      }
+      const page = makePage(200, JSON.stringify(payload))
+      const result = await (extractor as any).fetchThreadData(page, 'abc-123')
+      expect(result).toEqual(payload)
+    })
+
+    it('throws AuthError on HTTP 401', async () => {
+      const page = makePage(401, '{}')
+      await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow(
+        ConversationExtractor.AuthError
+      )
+    })
+
+    it('throws AuthError on HTTP 403', async () => {
+      const page = makePage(403, '{}')
+      await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow(
+        ConversationExtractor.AuthError
+      )
+    })
+
+    it('throws NotFoundError on HTTP 404', async () => {
+      const page = makePage(404, '{}')
+      await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow(
+        ConversationExtractor.NotFoundError
+      )
+    })
+
+    it('throws ServerError on HTTP 500', async () => {
+      const page = makePage(500, '{}')
+      await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow(
+        ConversationExtractor.ServerError
+      )
+    })
+
+    it('throws ExtractionError on invalid JSON body', async () => {
+      const page = makePage(200, 'not-json')
+      await expect((extractor as any).fetchThreadData(page, 'abc-123')).rejects.toThrow(
+        ConversationExtractor.ExtractionError
+      )
+    })
+
+    it('calls the correct API URL with thread ID', async () => {
+      const page = makePage(200, '{"entries":[],"background_entries":[]}')
+      await (extractor as any).fetchThreadData(page, 'my-thread-id').catch(() => {})
+      const callArg = (page.evaluate as any).mock.calls[0][1]
+      expect(callArg.url).toContain('/rest/thread/my-thread-id')
     })
   })
 })

From cce6218c1d5dd70c22123b095b856c2e8ef1654e Mon Sep 17 00:00:00 2001
From: DaveG7 <gd.giordi@gmail.com>
Date: Thu, 4 Jun 2026 22:16:38 +0200
Subject: [PATCH 4/5] fix(extractor): restore response validation +
 diagnostics, justify direct fetch

Address PR #39 review feedback on conversation-extractor:

- Restore ApiResponseSchema, validated against a live 2026 /rest/thread/{id}
  response. Pagination is the top-level has_next_page/next_cursor pair (not
  collection_info, which is the list endpoint). Diagnose-and-continue: shape
  drift writes a diagnostic and falls through to the per-entry EntrySchema gate.
- Restore ApiDiagnosticsWriter calls (zod_error / unknown_shape / empty_entries)
  so the debug/api-diagnostics.jsonl path the REPL references works again.
- Keep the page.evaluate()+fetch approach for consistency with library-discovery
  (the response-listener was the lone divergent /rest/ path); replace hardcoded
  version=2.18 with shared DEFAULT_API_VERSION.
- Remove dead adaptive-timeout no-ops (reduceTimeout/recoverTimeout) and their
  now-unused worker-pool callers.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/scraper/api-version.ts                    |  7 ++
 src/scraper/conversation-extractor.ts         | 76 +++++++++++++++++--
 src/scraper/library-discovery.ts              |  7 +-
 src/scraper/worker-pool.ts                    | 17 +----
 test/unit/conversation-extractor.unit.test.ts | 71 +++++++++++++++++
 5 files changed, 154 insertions(+), 24 deletions(-)
 create mode 100644 src/scraper/api-version.ts

diff --git a/src/scraper/api-version.ts b/src/scraper/api-version.ts
new file mode 100644
index 0000000..64033e6
--- /dev/null
+++ b/src/scraper/api-version.ts
@@ -0,0 +1,7 @@
+/**
+ * Perplexity versions its REST endpoints via a `?version=X.Y` query param
+ * (e.g. `/rest/thread/list_ask_threads?version=2.18`). Both library discovery
+ * and conversation extraction hit `/rest/` endpoints, so they share this default
+ * to keep the version in one place rather than duplicating the literal.
+ */
+export const DEFAULT_API_VERSION = '2.18'
diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts
index 77c4ef4..ca31725 100644
--- a/src/scraper/conversation-extractor.ts
+++ b/src/scraper/conversation-extractor.ts
@@ -5,6 +5,8 @@ import { type Page, type BrowserContext, type Response } from '@playwright/test'
 import { logger } from '../utils/logger.js'
 import { waitStrategy } from '../utils/wait-strategy.js'
 import { type Config } from '../utils/config.js'
+import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js'
+import { DEFAULT_API_VERSION } from './api-version.js'
 
 export interface ConversationMessage {
   role: 'user' | 'assistant'
@@ -43,6 +45,33 @@ export class ConversationExtractor {
     blocks: z.array(ConversationExtractor.BlockSchema).optional(),
   })
 
+  /**
+   * Validates the top-level shape of the `/rest/thread/{id}` HTTP response,
+   * confirmed against a live 2026 response. The endpoint returns either a bare
+   * array of entries or an object wrapping them; pagination is signalled by the
+   * top-level `has_next_page` / `next_cursor` pair. Fields are optional and the
+   * object is non-strict, so unknown/new keys don't reject an otherwise-valid
+   * response — shape drift is surfaced via diagnostics, and `EntrySchema`
+   * remains the per-entry fallback downstream.
+   */
+  private static readonly ApiResponseSchema = z.union([
+    z.array(ConversationExtractor.EntrySchema),
+    z.object({
+      entries: z.array(ConversationExtractor.EntrySchema),
+      background_entries: z.array(z.unknown()).optional(),
+      has_next_page: z.boolean().optional(),
+      next_cursor: z.string().nullable().optional(),
+      status: z.string().optional(),
+      thread_metadata: z.unknown().optional(),
+      // Legacy/list-style pagination — kept for backward compatibility.
+      collection_info: z
+        .object({
+          has_next_page: z.boolean().optional(),
+        })
+        .optional(),
+    }),
+  ])
+
   static readonly ExtractionError = class extends Error {
     constructor(message: string) {
       super(message)
@@ -92,14 +121,14 @@ export class ConversationExtractor {
     }
   }
 
+  private readonly diagnostics: ApiDiagnosticsWriter
+
   constructor(
     private readonly config: Config,
     private readonly context: BrowserContext
-  ) {}
-
-  reduceTimeout(): void {}
-
-  recoverTimeout(): void {}
+  ) {
+    this.diagnostics = new ApiDiagnosticsWriter(config)
+  }
 
   async extract(conversationUrl: string): Promise<ExtractedConversation> {
     await this.ensureContextIsAlive()
@@ -138,7 +167,7 @@ export class ConversationExtractor {
   }
 
   private async fetchThreadData(page: Page, threadId: string): Promise<unknown> {
-    const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=2.18&source=default`
+    const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default`
 
     const raw = await page.evaluate(
       async ({ url }: { url: string }) => {
@@ -164,11 +193,38 @@ export class ConversationExtractor {
       throw new ConversationExtractor.ExtractionError(`Thread API returned HTTP ${raw.status}`)
     }
 
+    let parsed: unknown
     try {
-      return JSON.parse(raw.body)
+      parsed = JSON.parse(raw.body)
     } catch {
       throw new ConversationExtractor.ExtractionError(`Thread API: invalid JSON for ${threadId}`)
     }
+
+    // Validate the HTTP response shape. On drift we record a diagnostic and fall
+    // through — the per-entry EntrySchema downstream is the resilient fallback,
+    // so a shape change is surfaced without breaking extraction.
+    const shapeResult = ConversationExtractor.ApiResponseSchema.safeParse(parsed)
+    if (!shapeResult.success) {
+      logger.warn(`[extractor] thread ${threadId}: unexpected response shape (see api-diagnostics)`)
+      this.diagnostics
+        .writeFailure({
+          url: apiUrl,
+          errorType: 'zod_error',
+          zodErrorPaths: shapeResult.error.issues.map((issue) => issue.path.join('.')),
+        })
+        .catch(() => {})
+    } else if (!Array.isArray(shapeResult.data)) {
+      const data = shapeResult.data
+      const hasNextPage = data.has_next_page ?? data.collection_info?.has_next_page
+      if (hasNextPage === true) {
+        logger.warn(
+          `[extractor] thread ${threadId}: response reports has_next_page; ` +
+            'additional pages are not fetched and content may be truncated'
+        )
+      }
+    }
+
+    return parsed
   }
 
   private async ensureContextIsAlive(): Promise<void> {
@@ -240,6 +296,11 @@ export class ConversationExtractor {
         .safeParse(formattedEntries)
 
       if (!entriesValidationResult.success) {
+        if (formattedEntries.length === 0) {
+          this.diagnostics
+            .writeFailure({ url: conversationUrl, errorType: 'empty_entries' })
+            .catch(() => {})
+        }
         logger.warn(
           `Entry validation failed for ${conversationUrl}: ${entriesValidationResult.error.message}`
         )
@@ -288,6 +349,7 @@ export class ConversationExtractor {
     if (dataObject && (dataObject.query_str || dataObject.blocks)) return [data]
 
     logger.warn(`Unknown API response shape for ${url}`)
+    this.diagnostics.writeFailure({ url, errorType: 'unknown_shape' }).catch(() => {})
     return []
   }
 
diff --git a/src/scraper/library-discovery.ts b/src/scraper/library-discovery.ts
index 58f2848..db73c96 100644
--- a/src/scraper/library-discovery.ts
+++ b/src/scraper/library-discovery.ts
@@ -1,5 +1,6 @@
 import type { Page } from '@playwright/test'
 import { logger } from '../utils/logger.js'
+import { DEFAULT_API_VERSION } from './api-version.js'
 
 // ─── Constants ───────────────────────────────────────────────────────────────
 
@@ -120,13 +121,13 @@ async function detectApiVersion(page: Page): Promise<string> {
       (res) => VERSIONED_URL_PATTERNS.some((p) => res.url().includes(p)) && res.status() === 200,
       { timeout: 15_000 }
     )
-    const version = extractVersionFromUrl(response.url()) ?? '2.18'
+    const version = extractVersionFromUrl(response.url()) ?? DEFAULT_API_VERSION
     const pathname = new URL(response.url()).pathname
     logger.debug(`Detected API version: ${version} (from ${pathname})`)
     return version
   } catch {
-    logger.debug('Version detection timeout — using fallback 2.18')
-    return '2.18'
+    logger.debug(`Version detection timeout — using fallback ${DEFAULT_API_VERSION}`)
+    return DEFAULT_API_VERSION
   }
 }
 
diff --git a/src/scraper/worker-pool.ts b/src/scraper/worker-pool.ts
index e64f3b1..d0adf56 100644
--- a/src/scraper/worker-pool.ts
+++ b/src/scraper/worker-pool.ts
@@ -92,14 +92,13 @@ export class WorkerPool {
   ): Promise<void> {
     try {
       const result = await worker.extractor.extract(item.meta.url)
-      await this.handleSuccess(worker, item.meta, result)
+      await this.handleSuccess(item.meta, result)
     } catch (error) {
-      await this.handleFailure(worker, item, queue, error)
+      await this.handleFailure(item, queue, error)
     }
   }
 
   private async handleSuccess(
-    worker: ExtractionWorker,
     meta: ConversationMeta,
     result: Awaited<ReturnType<ConversationExtractor['extract']>>
   ): Promise<void> {
@@ -115,22 +114,12 @@ export class WorkerPool {
       this.checkpointManager.markAsProcessed(meta.id, result.contentHash)
       logger.info(`${progressLabel} Processed: ${result.title}`)
     }
-
-    worker.extractor.recoverTimeout()
   }
 
-  private async handleFailure(
-    worker: ExtractionWorker,
-    item: QueueItem,
-    queue: QueueItem[],
-    error: unknown
-  ): Promise<void> {
-    const isTimeout = error instanceof Error && error.message.includes('API response timeout')
+  private async handleFailure(item: QueueItem, queue: QueueItem[], error: unknown): Promise<void> {
     const isContextLost =
       error instanceof Error && error.message.includes('context is no longer available')
 
-    if (isTimeout) worker.extractor.reduceTimeout()
-
     if (isContextLost) {
       logger.warn('Browser context lost. Refreshing worker context...')
       await this.refreshContext()
diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts
index 3ea62df..25a9ac1 100644
--- a/test/unit/conversation-extractor.unit.test.ts
+++ b/test/unit/conversation-extractor.unit.test.ts
@@ -1,5 +1,6 @@
 import { describe, it, expect, vi, beforeEach } from 'vitest'
 import { ConversationExtractor } from '../../src/scraper/conversation-extractor.js'
+import { logger } from '../../src/utils/logger.js'
 import type { BrowserContext, Page } from '@playwright/test'
 
 describe('ConversationExtractor (Unit)', () => {
@@ -224,5 +225,75 @@ describe('ConversationExtractor (Unit)', () => {
       const callArg = (page.evaluate as any).mock.calls[0][1]
       expect(callArg.url).toContain('/rest/thread/my-thread-id')
     })
+
+    it('returns valid data and writes no diagnostic when the shape matches', async () => {
+      // Mirrors the real 2026 /rest/thread/{id} shape: top-level entries +
+      // background_entries + has_next_page/next_cursor + extra metadata keys.
+      const payload = {
+        entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }],
+        background_entries: [],
+        has_next_page: false,
+        next_cursor: null,
+        status: 'success',
+        thread_metadata: { title: 'Test' },
+      }
+      const writeFailure = vi.spyOn((extractor as any).diagnostics, 'writeFailure')
+      const page = makePage(200, JSON.stringify(payload))
+      const result = await (extractor as any).fetchThreadData(page, 'abc-123')
+      expect(result).toEqual(payload)
+      expect(writeFailure).not.toHaveBeenCalled()
+    })
+
+    it('warns (without failing) when the response reports top-level has_next_page', async () => {
+      const warn = vi.spyOn(logger, 'warn').mockImplementation(() => {})
+      const payload = {
+        entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }],
+        has_next_page: true,
+        next_cursor: 'cursor-abc',
+      }
+      const page = makePage(200, JSON.stringify(payload))
+      const result = await (extractor as any).fetchThreadData(page, 'abc-123')
+      expect(result).toEqual(payload)
+      expect(warn).toHaveBeenCalledWith(expect.stringContaining('has_next_page'))
+    })
+
+    it('still returns the body but records a zod_error diagnostic on shape drift', async () => {
+      const writeFailure = vi
+        .spyOn((extractor as any).diagnostics, 'writeFailure')
+        .mockResolvedValue(undefined)
+      // entries must be an array of objects; a string violates the schema
+      const page = makePage(200, '{"entries":"not-an-array"}')
+      const result = await (extractor as any).fetchThreadData(page, 'abc-123')
+      expect(result).toEqual({ entries: 'not-an-array' })
+      expect(writeFailure).toHaveBeenCalledWith(expect.objectContaining({ errorType: 'zod_error' }))
+    })
+  })
+
+  // ─── diagnostics on parse failures ──────────────────────────────────────────
+
+  describe('diagnostics', () => {
+    it('writes an unknown_shape diagnostic for an unrecognised response shape', () => {
+      const writeFailure = vi
+        .spyOn((extractor as any).diagnostics, 'writeFailure')
+        .mockResolvedValue(undefined)
+      ;(extractor as any).ensureEntriesFormat({ foo: 'bar' }, 'http://test.com')
+      expect(writeFailure).toHaveBeenCalledWith(
+        expect.objectContaining({ errorType: 'unknown_shape' })
+      )
+    })
+
+    it('writes an empty_entries diagnostic when there are no entries to validate', () => {
+      const writeFailure = vi
+        .spyOn((extractor as any).diagnostics, 'writeFailure')
+        .mockResolvedValue(undefined)
+      const result = (extractor as any).parseConversationData(
+        { foo: 'bar' },
+        'https://www.perplexity.ai/search/test-id'
+      )
+      expect(result).toBeNull()
+      expect(writeFailure).toHaveBeenCalledWith(
+        expect.objectContaining({ errorType: 'empty_entries' })
+      )
+    })
   })
 })

From 3f1f1de06b7078e5358edc9858899d64835ad854 Mon Sep 17 00:00:00 2001
From: DaveG7 <gd.giordi@gmail.com>
Date: Thu, 4 Jun 2026 23:13:11 +0200
Subject: [PATCH 5/5] feat(extractor): paginate long threads via cursor
 accumulation

A live thread with 212 entries paginates across 3 pages of ~99; the new
single-fetch path only returned page 1, truncating long conversations.

fetchThreadData now keeps the single-fetch fast path for normal threads and,
when has_next_page is true, follows the top-level next_cursor (same URL +
&cursor=<encoded>) accumulating entries in API order until the thread is
complete, capped at 50 pages. Split the per-page fetch+validate into
fetchThreadPage. This restores the long-thread coverage the old response
listener provided.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/scraper/conversation-extractor.ts         | 77 ++++++++++++++++---
 test/unit/conversation-extractor.unit.test.ts | 38 +++++++--
 2 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/src/scraper/conversation-extractor.ts b/src/scraper/conversation-extractor.ts
index ca31725..62b961a 100644
--- a/src/scraper/conversation-extractor.ts
+++ b/src/scraper/conversation-extractor.ts
@@ -166,8 +166,72 @@ export class ConversationExtractor {
     }
   }
 
+  // Safety cap on cursor pagination (~99 entries/page, so ~5k entries).
+  private static readonly MAX_PAGES = 50
+
   private async fetchThreadData(page: Page, threadId: string): Promise<unknown> {
-    const apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default`
+    const firstParsed = await this.fetchThreadPage(page, threadId, null)
+    const firstObj =
+      firstParsed && !Array.isArray(firstParsed) ? (firstParsed as Record<string, unknown>) : null
+
+    let cursor = typeof firstObj?.next_cursor === 'string' ? firstObj.next_cursor : null
+
+    // Fast path: the whole thread fit in one response (the common case).
+    if (firstObj?.has_next_page !== true || !cursor) {
+      return firstParsed
+    }
+
+    // Long thread: walk the cursor pages and accumulate entries in API order
+    // (the endpoint paginates in conversation order — the same order the previous
+    // response-listener relied on, so no reordering is needed here).
+    const entries: unknown[] = Array.isArray(firstObj.entries) ? [...firstObj.entries] : []
+    const backgroundEntries: unknown[] = Array.isArray(firstObj.background_entries)
+      ? [...firstObj.background_entries]
+      : []
+    let pageCount = 1
+
+    while (cursor && pageCount < ConversationExtractor.MAX_PAGES) {
+      const parsed = await this.fetchThreadPage(page, threadId, cursor)
+      const obj = Array.isArray(parsed)
+        ? { entries: parsed as unknown[] }
+        : (parsed as Record<string, unknown>)
+
+      if (Array.isArray(obj.entries)) entries.push(...obj.entries)
+      if (Array.isArray(obj.background_entries)) backgroundEntries.push(...obj.background_entries)
+      pageCount++
+
+      cursor =
+        obj.has_next_page === true && typeof obj.next_cursor === 'string' ? obj.next_cursor : null
+    }
+
+    if (cursor) {
+      logger.warn(
+        `[extractor] thread ${threadId}: stopped at the ${ConversationExtractor.MAX_PAGES}-page ` +
+          'cap; export may be truncated'
+      )
+    }
+    logger.debug(
+      `[extractor] thread ${threadId}: assembled ${entries.length} entries from ${pageCount} pages`
+    )
+
+    // Preserve the first page's top-level metadata, but with the full entry set.
+    return {
+      ...firstObj,
+      entries,
+      background_entries: backgroundEntries,
+      has_next_page: false,
+      next_cursor: null,
+    }
+  }
+
+  /** Fetches and validates a single page of the thread API (cursor = null for page 1). */
+  private async fetchThreadPage(
+    page: Page,
+    threadId: string,
+    cursor: string | null
+  ): Promise<unknown> {
+    let apiUrl = `https://www.perplexity.ai/rest/thread/${threadId}?version=${DEFAULT_API_VERSION}&source=default`
+    if (cursor) apiUrl += `&cursor=${encodeURIComponent(cursor)}`
 
     const raw = await page.evaluate(
       async ({ url }: { url: string }) => {
@@ -178,7 +242,7 @@ export class ConversationExtractor {
       { url: apiUrl }
     )
 
-    logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}`)
+    logger.debug(`[extractor] thread ${threadId}: HTTP ${raw.status}${cursor ? ' (paged)' : ''}`)
 
     if (raw.status === 401 || raw.status === 403) {
       throw new ConversationExtractor.AuthError(`Authentication required (${raw.status})`)
@@ -213,15 +277,6 @@ export class ConversationExtractor {
           zodErrorPaths: shapeResult.error.issues.map((issue) => issue.path.join('.')),
         })
         .catch(() => {})
-    } else if (!Array.isArray(shapeResult.data)) {
-      const data = shapeResult.data
-      const hasNextPage = data.has_next_page ?? data.collection_info?.has_next_page
-      if (hasNextPage === true) {
-        logger.warn(
-          `[extractor] thread ${threadId}: response reports has_next_page; ` +
-            'additional pages are not fetched and content may be truncated'
-        )
-      }
     }
 
     return parsed
diff --git a/test/unit/conversation-extractor.unit.test.ts b/test/unit/conversation-extractor.unit.test.ts
index 25a9ac1..484f29e 100644
--- a/test/unit/conversation-extractor.unit.test.ts
+++ b/test/unit/conversation-extractor.unit.test.ts
@@ -244,17 +244,45 @@ describe('ConversationExtractor (Unit)', () => {
       expect(writeFailure).not.toHaveBeenCalled()
     })
 
-    it('warns (without failing) when the response reports top-level has_next_page', async () => {
-      const warn = vi.spyOn(logger, 'warn').mockImplementation(() => {})
+    it('returns the single page unchanged when there is no next page', async () => {
       const payload = {
         entries: [{ thread_title: 'Test', query_str: 'q', blocks: [] }],
-        has_next_page: true,
-        next_cursor: 'cursor-abc',
+        background_entries: [],
+        has_next_page: false,
+        next_cursor: null,
       }
       const page = makePage(200, JSON.stringify(payload))
       const result = await (extractor as any).fetchThreadData(page, 'abc-123')
       expect(result).toEqual(payload)
-      expect(warn).toHaveBeenCalledWith(expect.stringContaining('has_next_page'))
+      expect(page.evaluate as any).toHaveBeenCalledTimes(1)
+    })
+
+    it('accumulates entries across cursor pages until has_next_page is false', async () => {
+      const page1 = {
+        entries: [{ query_str: 'q1', blocks: [] }],
+        background_entries: [],
+        has_next_page: true,
+        next_cursor: 'cur-1',
+      }
+      const page2 = {
+        entries: [{ query_str: 'q2', blocks: [] }],
+        background_entries: [],
+        has_next_page: false,
+        next_cursor: null,
+      }
+      const evaluate = vi
+        .fn()
+        .mockResolvedValueOnce({ status: 200, body: JSON.stringify(page1) })
+        .mockResolvedValueOnce({ status: 200, body: JSON.stringify(page2) })
+      const page = { evaluate } as unknown as Page
+
+      const result: any = await (extractor as any).fetchThreadData(page, 'abc-123')
+
+      expect(evaluate).toHaveBeenCalledTimes(2)
+      expect(result.entries.map((e: any) => e.query_str)).toEqual(['q1', 'q2'])
+      expect(result.has_next_page).toBe(false)
+      // the second request must carry the cursor returned by page 1
+      expect((evaluate.mock.calls[1][1] as any).url).toContain('cursor=cur-1')
     })
 
     it('still returns the body but records a zod_error diagnostic on shape drift', async () => {