|
| 1 | +/** |
| 2 | + * Google Web Search via browser DOM extraction. |
| 3 | + * Uses browser mode to navigate google.com and extract results from the DOM. |
| 4 | + * |
| 5 | + * Extraction strategy (2026-03): Google no longer uses `.g` class containers. |
| 6 | + * Instead, we find all `a` tags containing `h3` within `#rso`, then walk up |
| 7 | + * (at most six ancestors) to the nearest element with a `data-hveid` attribute |
| 8 | + * to find snippets. Organic results need no class names, so renames do not break them. |
| 9 | + */ |
| 10 | + |
| 11 | +import { cli, Strategy } from '../../registry.js'; |
| 12 | +import { CliError } from '../../errors.js'; |
| 13 | + |
| 14 | +cli({ |
| 15 | + site: 'google', |
| 16 | + name: 'search', |
| 17 | + description: 'Search Google', |
| 18 | + domain: 'google.com', |
| 19 | + strategy: Strategy.PUBLIC, |
| 20 | + browser: true, |
| 21 | + args: [ |
| 22 | + { name: 'keyword', positional: true, required: true, help: 'Search query' }, |
| 23 | + { name: 'limit', type: 'int', default: 10, help: 'Number of results (1-100)' }, |
| 24 | + { name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' }, |
| 25 | + ], |
| 26 | + columns: ['type', 'title', 'url', 'snippet'], |
| 27 | + func: async (page, args) => { |
| 28 | + const limit = Math.max(1, Math.min(Number(args.limit), 100)); |
| 29 | + const keyword = encodeURIComponent(args.keyword); |
| 30 | + const lang = encodeURIComponent(args.lang); |
| 31 | + const url = `https://www.google.com/search?q=${keyword}&hl=${lang}&num=${limit}`; |
| 32 | + |
| 33 | + await page.goto(url); |
| 34 | + await page.wait(2); |
| 35 | + |
| 36 | + const results = await page.evaluate(` |
| 37 | + (function() { |
| 38 | + var results = []; |
| 39 | + var seenUrls = {}; |
| 40 | + var rso = document.querySelector('#rso'); |
| 41 | + if (!rso) return results; |
| 42 | +
|
| 43 | + // -- Featured snippet (scoped to #rso to avoid matching unrelated elements) -- |
| 44 | + var featuredEl = rso.querySelector('.xpdopen .hgKElc') |
| 45 | + || rso.querySelector('.IZ6rdc'); |
| 46 | + if (featuredEl) { |
| 47 | + var parentBlock = featuredEl.closest('[data-hveid]') || featuredEl.parentElement; |
| 48 | + var fLink = parentBlock ? parentBlock.querySelector('a[href]') : null; |
| 49 | + var fUrl = fLink ? fLink.href : ''; |
| 50 | + if (fUrl) seenUrls[fUrl] = true; |
| 51 | + results.push({ |
| 52 | + type: 'snippet', |
| 53 | + title: featuredEl.textContent.trim().slice(0, 200), |
| 54 | + url: fUrl, |
| 55 | + snippet: '', |
| 56 | + }); |
| 57 | + } |
| 58 | +
|
| 59 | + // -- Standard search results -- |
| 60 | + // Strategy: find all links containing h3 within #rso |
| 61 | + var allLinks = rso.querySelectorAll('a'); |
| 62 | + for (var i = 0; i < allLinks.length; i++) { |
| 63 | + var link = allLinks[i]; |
| 64 | + var h3 = link.querySelector('h3'); |
| 65 | + if (!h3) continue; |
| 66 | +
|
| 67 | + var href = link.href || ''; |
| 68 | + // Skip non-http, Google internal links, and duplicates |
| 69 | + if (!href.match(/^https?:\\/\\//)) continue; |
| 70 | + if (href.indexOf('google.com/search') !== -1) continue; |
| 71 | + if (seenUrls[href]) continue; |
| 72 | + seenUrls[href] = true; |
| 73 | +
|
| 74 | + // Walk up to find result container for snippet extraction |
| 75 | + var container = link; |
| 76 | + for (var j = 0; j < 6; j++) { |
| 77 | + if (container.parentElement && container.parentElement !== rso) { |
| 78 | + container = container.parentElement; |
| 79 | + } |
| 80 | + // Stop at a known result boundary |
| 81 | + if (container.getAttribute && container.getAttribute('data-hveid')) break; |
| 82 | + } |
| 83 | +
|
| 84 | + // Find snippet: look for descriptive text, skip breadcrumbs and metadata |
| 85 | + var snippetText = ''; |
| 86 | + var titleText = h3.textContent.trim(); |
| 87 | + var candidates = container.querySelectorAll('span, div'); |
| 88 | + for (var k = 0; k < candidates.length; k++) { |
| 89 | + var el = candidates[k]; |
| 90 | + if (el.querySelector('h3') || el.querySelector('a[href]')) continue; |
| 91 | + var text = el.textContent.trim(); |
| 92 | + if (text.length < 40 || text.length > 500) continue; |
| 93 | + if (text === titleText) continue; |
| 94 | + // Skip URL breadcrumbs (e.g. "https://example.com › path..." or "Site Namehttps://...") |
| 95 | + if (text.indexOf('\u203A') !== -1) continue; |
| 96 | + if (new RegExp('https?://').test(text.slice(0, 60))) continue; |
| 97 | + snippetText = text; |
| 98 | + break; |
| 99 | + } |
| 100 | +
|
| 101 | + results.push({ |
| 102 | + type: 'result', |
| 103 | + title: h3.textContent.trim(), |
| 104 | + url: href, |
| 105 | + snippet: snippetText.slice(0, 300), |
| 106 | + }); |
| 107 | + } |
| 108 | +
|
| 109 | + // -- People Also Ask -- |
| 110 | + var paaContainers = document.querySelectorAll('[data-sgrd="true"]'); |
| 111 | + for (var i = 0; i < paaContainers.length; i++) { |
| 112 | + var questionEl = paaContainers[i].querySelector('span.CSkcDe'); |
| 113 | + if (questionEl) { |
| 114 | + results.push({ |
| 115 | + type: 'paa', |
| 116 | + title: questionEl.textContent.trim(), |
| 117 | + url: '', |
| 118 | + snippet: '', |
| 119 | + }); |
| 120 | + } |
| 121 | + } |
| 122 | +
|
| 123 | + return results; |
| 124 | + })() |
| 125 | + `); |
| 126 | + |
| 127 | + if (!Array.isArray(results) || results.length === 0) { |
| 128 | + throw new CliError('NOT_FOUND', 'No search results found', 'Try a different keyword or check for CAPTCHA'); |
| 129 | + } |
| 130 | + |
| 131 | + return results; |
| 132 | + }, |
| 133 | +}); |
0 commit comments