Skip to content

Commit 0052fcb

Browse files
authored
3x performance improvement for websites (#53)
* Fixing issues and improving performance

Signed-off-by: Denis Jannot <denis.jannot@solo.io>

* Update version

Signed-off-by: Denis Jannot <denis.jannot@solo.io>

---------

Signed-off-by: Denis Jannot <denis.jannot@solo.io>
1 parent 1b7376e commit 0052fcb

File tree

6 files changed

+219
-136
lines changed

6 files changed

+219
-136
lines changed

content-processor.ts

Lines changed: 160 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -263,86 +263,126 @@ export class ContentProcessor {
263263
let errorCount = 0;
264264
let hasNetworkErrors = false;
265265

266-
while (queue.length > 0) {
267-
const url = queue.shift();
268-
if (!url) continue;
266+
// Launch a single browser instance and reuse one page (tab) for all URLs
267+
let browser: Browser | null = null;
268+
let page: Page | null = null;
269269

270-
const normalizedUrl = Utils.normalizeUrl(url);
271-
if (visitedUrls.has(normalizedUrl)) continue;
272-
visitedUrls.add(normalizedUrl);
270+
const launchBrowser = async (): Promise<{ browser: Browser; page: Page }> => {
271+
let executablePath: string | undefined = process.env.PUPPETEER_EXECUTABLE_PATH;
272+
if (!executablePath) {
273+
if (fs.existsSync('/usr/bin/chromium')) {
274+
executablePath = '/usr/bin/chromium';
275+
} else if (fs.existsSync('/usr/bin/chromium-browser')) {
276+
executablePath = '/usr/bin/chromium-browser';
277+
}
278+
}
279+
const b = await puppeteer.launch({
280+
executablePath,
281+
args: ['--no-sandbox', '--disable-setuid-sandbox'],
282+
});
283+
const p = await b.newPage();
284+
return { browser: b, page: p };
285+
};
273286

274-
if (!Utils.shouldProcessUrl(url)) {
275-
logger.debug(`Skipping URL with unsupported extension: ${url}`);
276-
skippedCount++;
277-
continue;
287+
const ensureBrowser = async (): Promise<Page> => {
288+
if (!browser || !browser.isConnected()) {
289+
logger.info(browser ? 'Browser disconnected, relaunching...' : 'Launching browser...');
290+
const launched = await launchBrowser();
291+
browser = launched.browser;
292+
page = launched.page;
278293
}
294+
return page!;
295+
};
279296

280-
try {
281-
logger.info(`Crawling: ${url}`);
282-
const sources = referrers.get(url) ?? new Set([baseUrl]);
283-
const content = await this.processPage(url, sourceConfig, (reportedUrl, status) => {
284-
if (status === 404) {
285-
for (const source of sources) {
286-
addBrokenLink(source, reportedUrl);
297+
try {
298+
while (queue.length > 0) {
299+
const url = queue.shift();
300+
if (!url) continue;
301+
302+
const normalizedUrl = Utils.normalizeUrl(url);
303+
if (visitedUrls.has(normalizedUrl)) continue;
304+
visitedUrls.add(normalizedUrl);
305+
306+
if (!Utils.shouldProcessUrl(url)) {
307+
logger.debug(`Skipping URL with unsupported extension: ${url}`);
308+
skippedCount++;
309+
continue;
310+
}
311+
312+
try {
313+
logger.info(`Crawling: ${url}`);
314+
const sources = referrers.get(url) ?? new Set([baseUrl]);
315+
316+
// For HTML pages, ensure the browser is running and pass the shared page
317+
// For PDFs, processPage handles them without Puppeteer
318+
const currentPage = Utils.isPdfUrl(url) ? undefined : await ensureBrowser();
319+
320+
const result = await this.processPage(url, sourceConfig, (reportedUrl, status) => {
321+
if (status === 404) {
322+
for (const source of sources) {
323+
addBrokenLink(source, reportedUrl);
324+
}
287325
}
288-
}
289-
});
326+
}, currentPage);
290327

291-
if (content !== null) {
292-
await processPageContent(url, content);
293-
if (Utils.isPdfUrl(url)) {
294-
pdfProcessedCount++;
328+
if (result.content !== null) {
329+
await processPageContent(url, result.content);
330+
if (Utils.isPdfUrl(url)) {
331+
pdfProcessedCount++;
332+
} else {
333+
processedCount++;
334+
}
295335
} else {
296-
processedCount++;
336+
skippedSizeCount++;
297337
}
298-
} else {
299-
skippedSizeCount++;
300-
}
301338

302-
// Only try to extract links from HTML pages, not PDFs
303-
if (!Utils.isPdfUrl(url)) {
304-
const response = await axios.get(url);
305-
const $ = load(response.data);
306-
const pageUrlForLinks = response?.request?.res?.responseUrl || url;
307-
308-
logger.debug(`Finding links on page ${url}`);
309-
let newLinksFound = 0;
310-
311-
$('a[href]').each((_, element) => {
312-
const href = $(element).attr('href');
313-
if (!href || href.startsWith('#') || href.startsWith('mailto:')) return;
314-
315-
const fullUrl = Utils.buildUrl(href, pageUrlForLinks);
316-
if (fullUrl.startsWith(sourceConfig.url)) {
317-
addReferrer(fullUrl, pageUrlForLinks);
318-
if (!visitedUrls.has(Utils.normalizeUrl(fullUrl))) {
319-
if (!queue.includes(fullUrl)) {
320-
queue.push(fullUrl);
321-
newLinksFound++;
339+
// Use links extracted from the full rendered DOM by processPage
340+
// (no separate axios request needed)
341+
if (result.links.length > 0) {
342+
const pageUrlForLinks = result.finalUrl || url;
343+
logger.debug(`Finding links on page ${url}`);
344+
let newLinksFound = 0;
345+
346+
for (const href of result.links) {
347+
const fullUrl = Utils.buildUrl(href, pageUrlForLinks);
348+
if (fullUrl.startsWith(sourceConfig.url)) {
349+
addReferrer(fullUrl, pageUrlForLinks);
350+
if (!visitedUrls.has(Utils.normalizeUrl(fullUrl))) {
351+
if (!queue.includes(fullUrl)) {
352+
queue.push(fullUrl);
353+
newLinksFound++;
354+
}
322355
}
323356
}
324357
}
325-
});
326358

327-
logger.debug(`Found ${newLinksFound} new links on ${url}`);
328-
}
329-
} catch (error: any) {
330-
logger.error(`Failed during processing or link discovery for ${url}:`, error);
331-
errorCount++;
359+
logger.debug(`Found ${newLinksFound} new links on ${url}`);
360+
}
361+
} catch (error: any) {
362+
logger.error(`Failed during processing or link discovery for ${url}:`, error);
363+
errorCount++;
332364

333-
const status = this.getHttpStatus(error);
334-
if (status === 404) {
335-
const sources = referrers.get(url) ?? new Set([baseUrl]);
336-
for (const source of sources) {
337-
addBrokenLink(source, url);
365+
const status = this.getHttpStatus(error);
366+
if (status === 404) {
367+
const sources = referrers.get(url) ?? new Set([baseUrl]);
368+
for (const source of sources) {
369+
addBrokenLink(source, url);
370+
}
371+
}
372+
373+
// Check if this is a network error (DNS resolution, connection issues, etc.)
374+
if (this.isNetworkError(error)) {
375+
hasNetworkErrors = true;
376+
logger.warn(`Network error detected for ${url}, this may affect cleanup decisions`);
338377
}
339378
}
340-
341-
// Check if this is a network error (DNS resolution, connection issues, etc.)
342-
if (this.isNetworkError(error)) {
343-
hasNetworkErrors = true;
344-
logger.warn(`Network error detected for ${url}, this may affect cleanup decisions`);
345-
}
379+
}
380+
} finally {
381+
// Close the shared browser instance when the crawl is done
382+
const browserToClose = browser as Browser | null;
383+
if (browserToClose && browserToClose.isConnected()) {
384+
await browserToClose.close();
385+
logger.debug('Shared browser closed after crawl completed');
346386
}
347387
}
348388

@@ -394,8 +434,9 @@ export class ContentProcessor {
394434
async processPage(
395435
url: string,
396436
sourceConfig: SourceConfig,
397-
onHttpStatus?: (url: string, status: number) => void
398-
): Promise<string | null> {
437+
onHttpStatus?: (url: string, status: number) => void,
438+
existingPage?: Page
439+
): Promise<{ content: string | null, links: string[], finalUrl: string }> {
399440
const logger = this.logger.child('page-processor');
400441
logger.debug(`Processing content from ${url}`);
401442

@@ -408,10 +449,10 @@ export class ContentProcessor {
408449
// Check size limit for PDF content
409450
if (markdown.length > sourceConfig.max_size) {
410451
logger.warn(`PDF content (${markdown.length} chars) exceeds max size (${sourceConfig.max_size}). Skipping ${url}.`);
411-
return null;
452+
return { content: null, links: [], finalUrl: url };
412453
}
413454

414-
return markdown;
455+
return { content: markdown, links: [], finalUrl: url };
415456
} catch (error) {
416457
const status = this.getHttpStatus(error);
417458
if (status !== undefined && status >= 400) {
@@ -421,28 +462,36 @@ export class ContentProcessor {
421462
throw error;
422463
}
423464
logger.error(`Failed to process PDF ${url}:`, error);
424-
return null;
465+
return { content: null, links: [], finalUrl: url };
425466
}
426467
}
427468

428-
// Original HTML page processing logic
469+
// HTML page processing logic
470+
// If an existing page (tab) is provided, reuse it; otherwise launch a standalone browser
429471
let browser: Browser | null = null;
472+
let page: Page;
473+
const ownsTheBrowser = !existingPage;
430474
try {
431-
// Use system Chromium if available (for Docker environments)
432-
let executablePath: string | undefined = process.env.PUPPETEER_EXECUTABLE_PATH;
433-
if (!executablePath) {
434-
if (fs.existsSync('/usr/bin/chromium')) {
435-
executablePath = '/usr/bin/chromium';
436-
} else if (fs.existsSync('/usr/bin/chromium-browser')) {
437-
executablePath = '/usr/bin/chromium-browser';
475+
if (existingPage) {
476+
page = existingPage;
477+
} else {
478+
// Standalone mode: launch a browser for this single page
479+
let executablePath: string | undefined = process.env.PUPPETEER_EXECUTABLE_PATH;
480+
if (!executablePath) {
481+
if (fs.existsSync('/usr/bin/chromium')) {
482+
executablePath = '/usr/bin/chromium';
483+
} else if (fs.existsSync('/usr/bin/chromium-browser')) {
484+
executablePath = '/usr/bin/chromium-browser';
485+
}
438486
}
487+
488+
browser = await puppeteer.launch({
489+
executablePath,
490+
args: ['--no-sandbox', '--disable-setuid-sandbox'],
491+
});
492+
page = await browser.newPage();
439493
}
440-
441-
browser = await puppeteer.launch({
442-
executablePath,
443-
args: ['--no-sandbox', '--disable-setuid-sandbox'],
444-
});
445-
const page: Page = await browser.newPage();
494+
446495
logger.debug(`Navigating to ${url}`);
447496
const response = await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
448497
const status = response?.status();
@@ -452,8 +501,26 @@ export class ContentProcessor {
452501
throw error;
453502
}
454503

504+
// Get the final URL after any redirects
505+
const finalUrl = page.url();
506+
507+
// Extract ALL links from the full rendered DOM before any content filtering
508+
// This searches the entire document, not just the main content area
509+
const links: string[] = await page.evaluate(() => {
510+
const anchors = document.querySelectorAll('a[href]');
511+
const hrefs: string[] = [];
512+
anchors.forEach(a => {
513+
const href = a.getAttribute('href');
514+
if (href && !href.startsWith('#') && !href.startsWith('mailto:')) {
515+
hrefs.push(href);
516+
}
517+
});
518+
return hrefs;
519+
});
520+
logger.debug(`Extracted ${links.length} links from full DOM of ${url}`);
521+
455522
const htmlContent: string = await page.evaluate(() => {
456-
// 💡 Try specific content selectors first, then fall back to broader ones
523+
// Try specific content selectors first, then fall back to broader ones
457524
const mainContentElement =
458525
document.querySelector('.docs-content') || // Common docs pattern
459526
document.querySelector('.doc-content') || // Alternative docs pattern
@@ -467,8 +534,7 @@ export class ContentProcessor {
467534

468535
if (htmlContent.length > sourceConfig.max_size) {
469536
logger.warn(`Raw HTML content (${htmlContent.length} chars) exceeds max size (${sourceConfig.max_size}). Skipping detailed processing for ${url}.`);
470-
await browser.close();
471-
return null;
537+
return { content: null, links, finalUrl };
472538
}
473539

474540
logger.debug(`Got HTML content (${htmlContent.length} chars), creating DOM`);
@@ -481,7 +547,7 @@ export class ContentProcessor {
481547
this.markCodeParents(pre.parentElement);
482548
});
483549

484-
// 💡 Extract H1s BEFORE Readability - it often strips them as "chrome"
550+
// Extract H1s BEFORE Readability - it often strips them as "chrome"
485551
// We'll inject them back after Readability processing
486552
const h1Elements = document.querySelectorAll('h1');
487553
const extractedH1s: string[] = [];
@@ -505,8 +571,7 @@ export class ContentProcessor {
505571

506572
if (!article) {
507573
logger.warn(`Failed to parse article content with Readability for ${url}`);
508-
await browser.close();
509-
return null;
574+
return { content: null, links, finalUrl };
510575
}
511576

512577
// Debug: Log what Readability extracted
@@ -517,7 +582,7 @@ export class ContentProcessor {
517582
logger.debug(`[Readability Debug] Contains H2 tag: ${article.content?.includes('<h2')}`);
518583
logger.debug(`[Readability Debug] Contains original-h1 class: ${article.content?.includes('original-h1')}`);
519584

520-
// 💡 Restore H1s: find elements with our marker class and convert back from H2
585+
// Restore H1s: find elements with our marker class and convert back from H2
521586
const articleDom = new JSDOM(article.content);
522587
const articleDoc = articleDom.window.document;
523588
const originalH1Elements = articleDoc.querySelectorAll('.original-h1');
@@ -557,7 +622,7 @@ export class ContentProcessor {
557622
logger.debug(`Converting HTML to Markdown`);
558623
let markdown = this.turndownService.turndown(cleanHtml);
559624

560-
// 💡 Inject extracted H1s back if they're not in the markdown
625+
// Inject extracted H1s back if they're not in the markdown
561626
// Readability often strips them as "page chrome"
562627
// Use article.title as fallback if no H1 was extracted
563628
const pageTitle = extractedH1s.length > 0 ? extractedH1s[0] : (article.title?.trim() || '');
@@ -578,7 +643,7 @@ export class ContentProcessor {
578643
}
579644

580645
logger.debug(`Markdown conversion complete (${markdown.length} chars)`);
581-
return markdown;
646+
return { content: markdown, links, finalUrl };
582647
} catch (error) {
583648
const status = this.getHttpStatus(error);
584649
if (status !== undefined && status >= 400) {
@@ -588,9 +653,10 @@ export class ContentProcessor {
588653
throw error;
589654
}
590655
logger.error(`Error processing page ${url}:`, error);
591-
return null;
656+
return { content: null, links: [], finalUrl: url };
592657
} finally {
593-
if (browser && browser.isConnected()) {
658+
// Only close the browser if we launched it ourselves (standalone mode)
659+
if (ownsTheBrowser && browser && browser.isConnected()) {
594660
await browser.close();
595661
logger.debug(`Browser closed for ${url}`);
596662
}

database.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -385,11 +385,11 @@ export class DatabaseManager {
385385
];
386386

387387
if (hasBranchColumn) {
388-
insertValues.push(chunk.metadata.branch ?? null);
388+
insertValues.push(chunk.metadata.branch ?? '');
389389
}
390390

391391
if (hasRepoColumn) {
392-
insertValues.push(chunk.metadata.repo ?? null);
392+
insertValues.push(chunk.metadata.repo ?? '');
393393
}
394394

395395
insertValues.push(
@@ -412,11 +412,11 @@ export class DatabaseManager {
412412
];
413413

414414
if (hasBranchColumn) {
415-
updateValues.push(chunk.metadata.branch ?? null);
415+
updateValues.push(chunk.metadata.branch ?? '');
416416
}
417417

418418
if (hasRepoColumn) {
419-
updateValues.push(chunk.metadata.repo ?? null);
419+
updateValues.push(chunk.metadata.repo ?? '');
420420
}
421421

422422
updateValues.push(

0 commit comments

Comments (0)