@@ -263,86 +263,126 @@ export class ContentProcessor {
263263 let errorCount = 0 ;
264264 let hasNetworkErrors = false ;
265265
266- while ( queue . length > 0 ) {
267- const url = queue . shift ( ) ;
268- if ( ! url ) continue ;
266+ // Launch a single browser instance and reuse one page (tab) for all URLs
267+ let browser : Browser | null = null ;
268+ let page : Page | null = null ;
269269
270- const normalizedUrl = Utils . normalizeUrl ( url ) ;
271- if ( visitedUrls . has ( normalizedUrl ) ) continue ;
272- visitedUrls . add ( normalizedUrl ) ;
/**
 * Launches a Chromium instance and opens a single tab in it.
 *
 * The executable is taken from PUPPETEER_EXECUTABLE_PATH when set; otherwise
 * the two system Chromium locations below are probed in order, and the path
 * is left unset when neither exists.
 *
 * @returns the launched browser together with its first page
 * @throws whatever `puppeteer.launch` or `newPage` rejects with; if the tab
 *         cannot be opened, the browser is closed before the error is rethrown
 */
const launchBrowser = async (): Promise<{ browser: Browser; page: Page }> => {
  let executablePath: string | undefined = process.env.PUPPETEER_EXECUTABLE_PATH;
  if (!executablePath) {
    if (fs.existsSync('/usr/bin/chromium')) {
      executablePath = '/usr/bin/chromium';
    } else if (fs.existsSync('/usr/bin/chromium-browser')) {
      executablePath = '/usr/bin/chromium-browser';
    }
  }

  const launched = await puppeteer.launch({
    executablePath,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const firstPage = await launched.newPage();
    return { browser: launched, page: firstPage };
  } catch (error) {
    // The caller never receives the browser handle on this path, so it must be
    // closed here or the Chromium process would be leaked.
    // Best-effort: a failure to close must not mask the original error.
    await launched.close().catch(() => undefined);
    throw error;
  }
};
273286
274- if ( ! Utils . shouldProcessUrl ( url ) ) {
275- logger . debug ( `Skipping URL with unsupported extension: ${ url } ` ) ;
276- skippedCount ++ ;
277- continue ;
/**
 * Returns the shared page, (re)creating whatever is missing first.
 *
 * - No browser yet, or the browser has disconnected: launch a fresh browser
 *   via `launchBrowser` and adopt its page.
 * - Browser still connected but the shared page is missing or closed: open a
 *   replacement tab in the existing browser instead of handing back a dead one.
 *
 * @returns a page belonging to a connected browser
 * @throws whatever `launchBrowser` or `newPage` rejects with
 */
const ensureBrowser = async (): Promise<Page> => {
  if (!browser || !browser.isConnected()) {
    logger.info(browser ? 'Browser disconnected, relaunching...' : 'Launching browser...');
    const launched = await launchBrowser();
    browser = launched.browser;
    page = launched.page;
    return launched.page;
  }

  // A connected browser does not guarantee a usable tab, so check the page too.
  if (!page || page.isClosed()) {
    logger.info('Shared page closed, opening a new tab...');
    page = await browser.newPage();
  }
  return page;
};
279296
280- try {
281- logger . info ( `Crawling: ${ url } ` ) ;
282- const sources = referrers . get ( url ) ?? new Set ( [ baseUrl ] ) ;
283- const content = await this . processPage ( url , sourceConfig , ( reportedUrl , status ) => {
284- if ( status === 404 ) {
285- for ( const source of sources ) {
286- addBrokenLink ( source , reportedUrl ) ;
297+ try {
298+ while ( queue . length > 0 ) {
299+ const url = queue . shift ( ) ;
300+ if ( ! url ) continue ;
301+
302+ const normalizedUrl = Utils . normalizeUrl ( url ) ;
303+ if ( visitedUrls . has ( normalizedUrl ) ) continue ;
304+ visitedUrls . add ( normalizedUrl ) ;
305+
306+ if ( ! Utils . shouldProcessUrl ( url ) ) {
307+ logger . debug ( `Skipping URL with unsupported extension: ${ url } ` ) ;
308+ skippedCount ++ ;
309+ continue ;
310+ }
311+
312+ try {
313+ logger . info ( `Crawling: ${ url } ` ) ;
314+ const sources = referrers . get ( url ) ?? new Set ( [ baseUrl ] ) ;
315+
316+ // For HTML pages, ensure the browser is running and pass the shared page
317+ // For PDFs, processPage handles them without Puppeteer
318+ const currentPage = Utils . isPdfUrl ( url ) ? undefined : await ensureBrowser ( ) ;
319+
320+ const result = await this . processPage ( url , sourceConfig , ( reportedUrl , status ) => {
321+ if ( status === 404 ) {
322+ for ( const source of sources ) {
323+ addBrokenLink ( source , reportedUrl ) ;
324+ }
287325 }
288- }
289- } ) ;
326+ } , currentPage ) ;
290327
291- if ( content !== null ) {
292- await processPageContent ( url , content ) ;
293- if ( Utils . isPdfUrl ( url ) ) {
294- pdfProcessedCount ++ ;
328+ if ( result . content !== null ) {
329+ await processPageContent ( url , result . content ) ;
330+ if ( Utils . isPdfUrl ( url ) ) {
331+ pdfProcessedCount ++ ;
332+ } else {
333+ processedCount ++ ;
334+ }
295335 } else {
296- processedCount ++ ;
336+ skippedSizeCount ++ ;
297337 }
298- } else {
299- skippedSizeCount ++ ;
300- }
301338
302- // Only try to extract links from HTML pages, not PDFs
303- if ( ! Utils . isPdfUrl ( url ) ) {
304- const response = await axios . get ( url ) ;
305- const $ = load ( response . data ) ;
306- const pageUrlForLinks = response ?. request ?. res ?. responseUrl || url ;
307-
308- logger . debug ( `Finding links on page ${ url } ` ) ;
309- let newLinksFound = 0 ;
310-
311- $ ( 'a[href]' ) . each ( ( _ , element ) => {
312- const href = $ ( element ) . attr ( 'href' ) ;
313- if ( ! href || href . startsWith ( '#' ) || href . startsWith ( 'mailto:' ) ) return ;
314-
315- const fullUrl = Utils . buildUrl ( href , pageUrlForLinks ) ;
316- if ( fullUrl . startsWith ( sourceConfig . url ) ) {
317- addReferrer ( fullUrl , pageUrlForLinks ) ;
318- if ( ! visitedUrls . has ( Utils . normalizeUrl ( fullUrl ) ) ) {
319- if ( ! queue . includes ( fullUrl ) ) {
320- queue . push ( fullUrl ) ;
321- newLinksFound ++ ;
339+ // Use links extracted from the full rendered DOM by processPage
340+ // (no separate axios request needed)
341+ if ( result . links . length > 0 ) {
342+ const pageUrlForLinks = result . finalUrl || url ;
343+ logger . debug ( `Finding links on page ${ url } ` ) ;
344+ let newLinksFound = 0 ;
345+
346+ for ( const href of result . links ) {
347+ const fullUrl = Utils . buildUrl ( href , pageUrlForLinks ) ;
348+ if ( fullUrl . startsWith ( sourceConfig . url ) ) {
349+ addReferrer ( fullUrl , pageUrlForLinks ) ;
350+ if ( ! visitedUrls . has ( Utils . normalizeUrl ( fullUrl ) ) ) {
351+ if ( ! queue . includes ( fullUrl ) ) {
352+ queue . push ( fullUrl ) ;
353+ newLinksFound ++ ;
354+ }
322355 }
323356 }
324357 }
325- } ) ;
326358
327- logger . debug ( `Found ${ newLinksFound } new links on ${ url } ` ) ;
328- }
329- } catch ( error : any ) {
330- logger . error ( `Failed during processing or link discovery for ${ url } :` , error ) ;
331- errorCount ++ ;
359+ logger . debug ( `Found ${ newLinksFound } new links on ${ url } ` ) ;
360+ }
361+ } catch ( error : any ) {
362+ logger . error ( `Failed during processing or link discovery for ${ url } :` , error ) ;
363+ errorCount ++ ;
332364
333- const status = this . getHttpStatus ( error ) ;
334- if ( status === 404 ) {
335- const sources = referrers . get ( url ) ?? new Set ( [ baseUrl ] ) ;
336- for ( const source of sources ) {
337- addBrokenLink ( source , url ) ;
365+ const status = this . getHttpStatus ( error ) ;
366+ if ( status === 404 ) {
367+ const sources = referrers . get ( url ) ?? new Set ( [ baseUrl ] ) ;
368+ for ( const source of sources ) {
369+ addBrokenLink ( source , url ) ;
370+ }
371+ }
372+
373+ // Check if this is a network error (DNS resolution, connection issues, etc.)
374+ if ( this . isNetworkError ( error ) ) {
375+ hasNetworkErrors = true ;
376+ logger . warn ( `Network error detected for ${ url } , this may affect cleanup decisions` ) ;
338377 }
339378 }
340-
341- // Check if this is a network error (DNS resolution, connection issues, etc.)
342- if ( this . isNetworkError ( error ) ) {
343- hasNetworkErrors = true ;
344- logger . warn ( `Network error detected for ${ url } , this may affect cleanup decisions` ) ;
345- }
379+ }
380+ } finally {
381+ // Close the shared browser instance when the crawl is done
382+ const browserToClose = browser as Browser | null ;
383+ if ( browserToClose && browserToClose . isConnected ( ) ) {
384+ await browserToClose . close ( ) ;
385+ logger . debug ( 'Shared browser closed after crawl completed' ) ;
346386 }
347387 }
348388
@@ -394,8 +434,9 @@ export class ContentProcessor {
394434 async processPage (
395435 url : string ,
396436 sourceConfig : SourceConfig ,
397- onHttpStatus ?: ( url : string , status : number ) => void
398- ) : Promise < string | null > {
437+ onHttpStatus ?: ( url : string , status : number ) => void ,
438+ existingPage ?: Page
439+ ) : Promise < { content : string | null , links : string [ ] , finalUrl : string } > {
399440 const logger = this . logger . child ( 'page-processor' ) ;
400441 logger . debug ( `Processing content from ${ url } ` ) ;
401442
@@ -408,10 +449,10 @@ export class ContentProcessor {
408449 // Check size limit for PDF content
409450 if ( markdown . length > sourceConfig . max_size ) {
410451 logger . warn ( `PDF content (${ markdown . length } chars) exceeds max size (${ sourceConfig . max_size } ). Skipping ${ url } .` ) ;
411- return null ;
452+ return { content : null , links : [ ] , finalUrl : url } ;
412453 }
413454
414- return markdown ;
455+ return { content : markdown , links : [ ] , finalUrl : url } ;
415456 } catch ( error ) {
416457 const status = this . getHttpStatus ( error ) ;
417458 if ( status !== undefined && status >= 400 ) {
@@ -421,28 +462,36 @@ export class ContentProcessor {
421462 throw error ;
422463 }
423464 logger . error ( `Failed to process PDF ${ url } :` , error ) ;
424- return null ;
465+ return { content : null , links : [ ] , finalUrl : url } ;
425466 }
426467 }
427468
428- // Original HTML page processing logic
469+ // HTML page processing logic
470+ // If an existing page (tab) is provided, reuse it; otherwise launch a standalone browser
429471 let browser : Browser | null = null ;
472+ let page : Page ;
473+ const ownsTheBrowser = ! existingPage ;
430474 try {
431- // Use system Chromium if available (for Docker environments)
432- let executablePath : string | undefined = process . env . PUPPETEER_EXECUTABLE_PATH ;
433- if ( ! executablePath ) {
434- if ( fs . existsSync ( '/usr/bin/chromium' ) ) {
435- executablePath = '/usr/bin/chromium' ;
436- } else if ( fs . existsSync ( '/usr/bin/chromium-browser' ) ) {
437- executablePath = '/usr/bin/chromium-browser' ;
475+ if ( existingPage ) {
476+ page = existingPage ;
477+ } else {
478+ // Standalone mode: launch a browser for this single page
479+ let executablePath : string | undefined = process . env . PUPPETEER_EXECUTABLE_PATH ;
480+ if ( ! executablePath ) {
481+ if ( fs . existsSync ( '/usr/bin/chromium' ) ) {
482+ executablePath = '/usr/bin/chromium' ;
483+ } else if ( fs . existsSync ( '/usr/bin/chromium-browser' ) ) {
484+ executablePath = '/usr/bin/chromium-browser' ;
485+ }
438486 }
487+
488+ browser = await puppeteer . launch ( {
489+ executablePath,
490+ args : [ '--no-sandbox' , '--disable-setuid-sandbox' ] ,
491+ } ) ;
492+ page = await browser . newPage ( ) ;
439493 }
440-
441- browser = await puppeteer . launch ( {
442- executablePath,
443- args : [ '--no-sandbox' , '--disable-setuid-sandbox' ] ,
444- } ) ;
445- const page : Page = await browser . newPage ( ) ;
494+
446495 logger . debug ( `Navigating to ${ url } ` ) ;
447496 const response = await page . goto ( url , { waitUntil : 'networkidle2' , timeout : 60000 } ) ;
448497 const status = response ?. status ( ) ;
@@ -452,8 +501,26 @@ export class ContentProcessor {
452501 throw error ;
453502 }
454503
504+ // Get the final URL after any redirects
505+ const finalUrl = page . url ( ) ;
506+
507+ // Extract ALL links from the full rendered DOM before any content filtering
508+ // This searches the entire document, not just the main content area
509+ const links : string [ ] = await page . evaluate ( ( ) => {
510+ const anchors = document . querySelectorAll ( 'a[href]' ) ;
511+ const hrefs : string [ ] = [ ] ;
512+ anchors . forEach ( a => {
513+ const href = a . getAttribute ( 'href' ) ;
514+ if ( href && ! href . startsWith ( '#' ) && ! href . startsWith ( 'mailto:' ) ) {
515+ hrefs . push ( href ) ;
516+ }
517+ } ) ;
518+ return hrefs ;
519+ } ) ;
520+ logger . debug ( `Extracted ${ links . length } links from full DOM of ${ url } ` ) ;
521+
455522 const htmlContent : string = await page . evaluate ( ( ) => {
456- // 💡 Try specific content selectors first, then fall back to broader ones
523+ // Try specific content selectors first, then fall back to broader ones
457524 const mainContentElement =
458525 document . querySelector ( '.docs-content' ) || // Common docs pattern
459526 document . querySelector ( '.doc-content' ) || // Alternative docs pattern
@@ -467,8 +534,7 @@ export class ContentProcessor {
467534
468535 if ( htmlContent . length > sourceConfig . max_size ) {
469536 logger . warn ( `Raw HTML content (${ htmlContent . length } chars) exceeds max size (${ sourceConfig . max_size } ). Skipping detailed processing for ${ url } .` ) ;
470- await browser . close ( ) ;
471- return null ;
537+ return { content : null , links, finalUrl } ;
472538 }
473539
474540 logger . debug ( `Got HTML content (${ htmlContent . length } chars), creating DOM` ) ;
@@ -481,7 +547,7 @@ export class ContentProcessor {
481547 this . markCodeParents ( pre . parentElement ) ;
482548 } ) ;
483549
484- // 💡 Extract H1s BEFORE Readability - it often strips them as "chrome"
550+ // Extract H1s BEFORE Readability - it often strips them as "chrome"
485551 // We'll inject them back after Readability processing
486552 const h1Elements = document . querySelectorAll ( 'h1' ) ;
487553 const extractedH1s : string [ ] = [ ] ;
@@ -505,8 +571,7 @@ export class ContentProcessor {
505571
506572 if ( ! article ) {
507573 logger . warn ( `Failed to parse article content with Readability for ${ url } ` ) ;
508- await browser . close ( ) ;
509- return null ;
574+ return { content : null , links, finalUrl } ;
510575 }
511576
512577 // Debug: Log what Readability extracted
@@ -517,7 +582,7 @@ export class ContentProcessor {
517582 logger . debug ( `[Readability Debug] Contains H2 tag: ${ article . content ?. includes ( '<h2' ) } ` ) ;
518583 logger . debug ( `[Readability Debug] Contains original-h1 class: ${ article . content ?. includes ( 'original-h1' ) } ` ) ;
519584
520- // 💡 Restore H1s: find elements with our marker class and convert back from H2
585+ // Restore H1s: find elements with our marker class and convert back from H2
521586 const articleDom = new JSDOM ( article . content ) ;
522587 const articleDoc = articleDom . window . document ;
523588 const originalH1Elements = articleDoc . querySelectorAll ( '.original-h1' ) ;
@@ -557,7 +622,7 @@ export class ContentProcessor {
557622 logger . debug ( `Converting HTML to Markdown` ) ;
558623 let markdown = this . turndownService . turndown ( cleanHtml ) ;
559624
560- // 💡 Inject extracted H1s back if they're not in the markdown
625+ // Inject extracted H1s back if they're not in the markdown
561626 // Readability often strips them as "page chrome"
562627 // Use article.title as fallback if no H1 was extracted
563628 const pageTitle = extractedH1s . length > 0 ? extractedH1s [ 0 ] : ( article . title ?. trim ( ) || '' ) ;
@@ -578,7 +643,7 @@ export class ContentProcessor {
578643 }
579644
580645 logger . debug ( `Markdown conversion complete (${ markdown . length } chars)` ) ;
581- return markdown ;
646+ return { content : markdown , links , finalUrl } ;
582647 } catch ( error ) {
583648 const status = this . getHttpStatus ( error ) ;
584649 if ( status !== undefined && status >= 400 ) {
@@ -588,9 +653,10 @@ export class ContentProcessor {
588653 throw error ;
589654 }
590655 logger . error ( `Error processing page ${ url } :` , error ) ;
591- return null ;
656+ return { content : null , links : [ ] , finalUrl : url } ;
592657 } finally {
593- if ( browser && browser . isConnected ( ) ) {
658+ // Only close the browser if we launched it ourselves (standalone mode)
659+ if ( ownsTheBrowser && browser && browser . isConnected ( ) ) {
594660 await browser . close ( ) ;
595661 logger . debug ( `Browser closed for ${ url } ` ) ;
596662 }
0 commit comments