Skip to content

Commit 4e32599

Browse files
authored
feat(google): add search, suggest, news, and trends adapters (#184)
* feat(google): add search, suggest, news, and trends adapters Four new commands under `google`: - search: browser-based DOM extraction from google.com/search - suggest: public JSON API (suggestqueries.google.com) - news: public RSS feed (top stories + keyword search) - trends: public RSS feed (daily trending searches by region) Shared RSS parser in utils.ts with attribute/CDATA support. Unit tests for parseRssItems, E2E tests with network skip guards. * refactor(google): downgrade search strategy from COOKIE to PUBLIC Google search results are public data, no login needed. Browser is required for DOM rendering, not authentication. Standalone mode confirmed working in testing. * fix: update test comment to reflect PUBLIC strategy
1 parent bdf5967 commit 4e32599

9 files changed

Lines changed: 526 additions & 0 deletions

File tree

docs/adapters/browser/google.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Google
2+
3+
**Mode**: 🌐 / 🔐 Mixed · **Domains**: `google.com`, `suggestqueries.google.com`, `news.google.com`, `trends.google.com`
4+
5+
## Commands
6+
7+
| Command | Description |
8+
|---------|-------------|
9+
| `opencli google search <keyword>` | Search Google and extract results from the page |
10+
| `opencli google suggest <keyword>` | Get Google search suggestions |
11+
| `opencli google news [keyword]` | Get Google News headlines (top stories or search) |
12+
| `opencli google trends` | Get Google Trends daily trending searches |
13+
14+
## What works today
15+
16+
- Public API commands work without a browser:
17+
- `suggest` — JSON API, no auth needed
18+
- `news` — RSS feed, supports top stories and keyword search
19+
- `trends` — RSS feed, supports different regions
20+
- `google search` uses browser mode to extract results from google.com.
21+
22+
## Current limitations
23+
24+
- `google search` may trigger CAPTCHA in Standalone browser mode. Extension mode (with an established Chrome session) is more reliable.
25+
- Google frequently changes its DOM structure. If `search` stops returning results, selectors may need updating.
26+
- Snippet extraction may return empty for some results depending on Google's layout.
27+
28+
## Usage Examples
29+
30+
```bash
31+
# Search Google
32+
opencli google search "typescript tutorial" --limit 10
33+
34+
# Get search suggestions
35+
opencli google suggest python
36+
37+
# Get top news headlines
38+
opencli google news --limit 5
39+
40+
# Search news for a topic
41+
opencli google news "artificial intelligence" --limit 10 --lang en --region US
42+
43+
# Get trending searches in Japan
44+
opencli google trends --region JP --limit 10
45+
46+
# Output as JSON
47+
opencli google search "machine learning" -f json
48+
```
49+
50+
## Prerequisites
51+
52+
- `suggest`, `news`, `trends` do not require Chrome.
53+
- `search` requires:
54+
- Chrome running (or Standalone mode will auto-launch)
55+
- For best results, use the [Browser Bridge extension](/guide/browser-bridge) with an established Google session
56+
57+
## Notes
58+
59+
- `suggest` defaults to `--lang zh-CN`; `search` and `news` default to `--lang en`. `trends` has no `--lang` option.
60+
- `news` supports `--lang` and `--region` parameters for localized results.
61+
- `trends` traffic values are raw strings (e.g. "500K+", "1,000,000+"), not numeric.
62+
- `search` output includes three result types: `result` (standard), `snippet` (featured answer box), and `paa` (People Also Ask).

src/clis/google/news.ts

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/**
2+
* Google News via public RSS feed.
3+
* Supports top stories (no keyword) and search (with keyword).
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
import { parseRssItems } from './utils.js';
9+
10+
// Registers the `google news` command.
//
// Fetches the Google News RSS feed (the top-stories feed when no keyword is
// given, the search feed otherwise) and maps each parsed item to a
// { title, source, date, url } row.
//
// Throws CliError('FETCH_ERROR') on a non-2xx response and
// CliError('NOT_FOUND') when the feed yields no items.
cli({
  site: 'google',
  name: 'news',
  description: 'Get Google News headlines',
  strategy: Strategy.PUBLIC,
  browser: false,
  args: [
    { name: 'keyword', positional: true, help: 'Search query (omit for top stories)' },
    { name: 'limit', type: 'int', default: 10, help: 'Number of results' },
    { name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
    { name: 'region', default: 'US', help: 'Region code (e.g. US, CN)' },
  ],
  columns: ['title', 'source', 'date', 'url'],
  func: async (_page, args) => {
    // NaN survives Math.min/Math.max and makes slice(0, NaN) return nothing,
    // so a non-numeric limit falls back to the declared default of 10.
    const requestedLimit = Number(args.limit);
    const limit = Math.max(1, Math.min(Number.isNaN(requestedLimit) ? 10 : requestedLimit, 100));
    const lang = encodeURIComponent(args.lang);
    const region = encodeURIComponent(args.region);
    // Build ceid from the *encoded* parts: raw values containing `&`, `#` or
    // spaces would otherwise break out of the ceid parameter. For plain codes
    // such as US/en the resulting URL is unchanged.
    const ceid = `${region}:${lang}`;

    // Top stories or search
    const base = args.keyword
      ? `https://news.google.com/rss/search?q=${encodeURIComponent(args.keyword)}&hl=${lang}&gl=${region}&ceid=${ceid}`
      : `https://news.google.com/rss?hl=${lang}&gl=${region}&ceid=${ceid}`;

    const resp = await fetch(base);
    if (!resp.ok) {
      throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection');
    }

    const xml = await resp.text();
    const items = parseRssItems(xml, ['title', 'link', 'pubDate', 'source']);

    if (!items.length) {
      throw new CliError('NOT_FOUND', 'No news articles found', 'Try a different keyword or region');
    }

    return items.slice(0, limit).map(item => {
      // Prefer the <source> element; when it is absent, split a trailing
      // " - Publisher" suffix off the title and use that as the source.
      let title = item['title'] || '';
      let source = item['source'] || '';
      if (!source) {
        const idx = title.lastIndexOf(' - ');
        if (idx !== -1) {
          source = title.slice(idx + 3);
          title = title.slice(0, idx);
        }
      }

      return {
        title,
        source,
        date: item['pubDate'] || '',
        url: item['link'] || '',
      };
    });
  },
});

src/clis/google/search.ts

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/**
2+
* Google Web Search via browser DOM extraction.
3+
* Uses browser mode to navigate google.com and extract results from the DOM.
4+
*
5+
* Extraction strategy (2026-03): Google no longer uses `.g` class containers.
6+
* Instead, we find all `a` tags containing `h3` within `#rso`, then walk up
7+
* at most six ancestors (stopping at the first one carrying `data-hveid`) to find
8+
* snippets. This approach is resilient to class name changes.
9+
*/
10+
11+
import { cli, Strategy } from '../../registry.js';
12+
import { CliError } from '../../errors.js';
13+
14+
// Registers the `google search` command.
//
// Navigates the browser page to google.com/search, waits, then runs an in-page
// script that collects three kinds of rows from the DOM:
//   - 'snippet': the featured snippet inside #rso, if one is found
//   - 'result' : every <a> inside #rso that contains an <h3>
//   - 'paa'    : "People Also Ask" questions
// Rows of type 'result' are capped at --limit; 'snippet' and 'paa' rows are
// always kept.
//
// Throws CliError('NOT_FOUND') when the script returns no rows.
cli({
  site: 'google',
  name: 'search',
  description: 'Search Google',
  domain: 'google.com',
  strategy: Strategy.PUBLIC,
  browser: true,
  args: [
    { name: 'keyword', positional: true, required: true, help: 'Search query' },
    { name: 'limit', type: 'int', default: 10, help: 'Number of results (1-100)' },
    { name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
  ],
  columns: ['type', 'title', 'url', 'snippet'],
  func: async (page, args) => {
    // A non-numeric limit would otherwise end up as `num=NaN` in the URL;
    // fall back to the declared default of 10.
    const requestedLimit = Number(args.limit);
    const limit = Math.max(1, Math.min(Number.isNaN(requestedLimit) ? 10 : requestedLimit, 100));
    const keyword = encodeURIComponent(args.keyword);
    const lang = encodeURIComponent(args.lang);
    const url = `https://www.google.com/search?q=${keyword}&hl=${lang}&num=${limit}`;

    await page.goto(url);
    await page.wait(2);

    // The script is passed as a string and evaluated in the page, so it is
    // written in ES5 style and cannot reference variables from this scope.
    const results = await page.evaluate(`
      (function() {
        var results = [];
        var seenUrls = {};
        var rso = document.querySelector('#rso');
        if (!rso) return results;

        // -- Featured snippet (scoped to #rso to avoid matching unrelated elements) --
        var featuredEl = rso.querySelector('.xpdopen .hgKElc')
          || rso.querySelector('.IZ6rdc');
        if (featuredEl) {
          var parentBlock = featuredEl.closest('[data-hveid]') || featuredEl.parentElement;
          var fLink = parentBlock ? parentBlock.querySelector('a[href]') : null;
          var fUrl = fLink ? fLink.href : '';
          if (fUrl) seenUrls[fUrl] = true;
          results.push({
            type: 'snippet',
            title: featuredEl.textContent.trim().slice(0, 200),
            url: fUrl,
            snippet: '',
          });
        }

        // -- Standard search results --
        // Strategy: find all links containing h3 within #rso
        var allLinks = rso.querySelectorAll('a');
        for (var i = 0; i < allLinks.length; i++) {
          var link = allLinks[i];
          var h3 = link.querySelector('h3');
          if (!h3) continue;

          var href = link.href || '';
          // Skip non-http, Google internal links, and duplicates
          if (!href.match(/^https?:\\/\\//)) continue;
          if (href.indexOf('google.com/search') !== -1) continue;
          if (seenUrls[href]) continue;
          seenUrls[href] = true;

          // Walk up to find result container for snippet extraction
          var container = link;
          for (var j = 0; j < 6; j++) {
            if (container.parentElement && container.parentElement !== rso) {
              container = container.parentElement;
            }
            // Stop at a known result boundary
            if (container.getAttribute && container.getAttribute('data-hveid')) break;
          }

          // Find snippet: look for descriptive text, skip breadcrumbs and metadata
          var snippetText = '';
          var titleText = h3.textContent.trim();
          var candidates = container.querySelectorAll('span, div');
          for (var k = 0; k < candidates.length; k++) {
            var el = candidates[k];
            if (el.querySelector('h3') || el.querySelector('a[href]')) continue;
            var text = el.textContent.trim();
            if (text.length < 40 || text.length > 500) continue;
            if (text === titleText) continue;
            // Skip URL breadcrumbs (e.g. "https://example.com › path..." or "Site Namehttps://...")
            if (text.indexOf('\u203A') !== -1) continue;
            if (new RegExp('https?://').test(text.slice(0, 60))) continue;
            snippetText = text;
            break;
          }

          results.push({
            type: 'result',
            title: h3.textContent.trim(),
            url: href,
            snippet: snippetText.slice(0, 300),
          });
        }

        // -- People Also Ask --
        var paaContainers = document.querySelectorAll('[data-sgrd="true"]');
        for (var i = 0; i < paaContainers.length; i++) {
          var questionEl = paaContainers[i].querySelector('span.CSkcDe');
          if (questionEl) {
            results.push({
              type: 'paa',
              title: questionEl.textContent.trim(),
              url: '',
              snippet: '',
            });
          }
        }

        return results;
      })()
    `);

    if (!Array.isArray(results) || results.length === 0) {
      throw new CliError('NOT_FOUND', 'No search results found', 'Try a different keyword or check for CAPTCHA');
    }

    // `num` is only a request sent to Google; the scrape returns whatever the
    // page rendered. Enforce --limit here on the organic rows.
    let organicCount = 0;
    return results.filter((row) => {
      if (row.type !== 'result') return true;
      organicCount += 1;
      return organicCount <= limit;
    });
  },
});

src/clis/google/suggest.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/**
2+
* Google Search Suggestions via public JSON API.
3+
* Uses suggestqueries.google.com with client=firefox for pure JSON (not JSONP).
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
9+
// Registers the `google suggest` command.
//
// Calls the suggestqueries.google.com completion endpoint with client=firefox
// and turns the second element of the JSON array response into one
// { suggestion } row per entry.
//
// Throws CliError('FETCH_ERROR') on a non-2xx response and
// CliError('NOT_FOUND') when no suggestions come back.
cli({
  site: 'google',
  name: 'suggest',
  description: 'Get Google search suggestions',
  strategy: Strategy.PUBLIC,
  browser: false,
  args: [
    { name: 'keyword', positional: true, required: true, help: 'Search query' },
    { name: 'lang', default: 'zh-CN', help: 'Language code' },
  ],
  columns: ['suggestion'],
  func: async (_page, args) => {
    const encodedQuery = encodeURIComponent(args.keyword);
    const encodedLang = encodeURIComponent(args.lang);
    const endpoint = 'https://suggestqueries.google.com/complete/search';
    const requestUrl = endpoint + '?client=firefox&q=' + encodedQuery + '&hl=' + encodedLang;

    const response = await fetch(requestUrl);
    if (!response.ok) {
      throw new CliError('FETCH_ERROR', `HTTP ${response.status}`, 'Check your network connection');
    }

    // Expected payload shape: ["query", ["suggestion1", "suggestion2", ...]]
    const payload = await response.json();
    let suggestions: string[] = [];
    if (Array.isArray(payload)) {
      const candidates = payload[1];
      if (Array.isArray(candidates)) {
        suggestions = candidates;
      }
    }

    if (suggestions.length === 0) {
      throw new CliError('NOT_FOUND', 'No suggestions found', 'Try a different keyword');
    }

    const rows: Array<{ suggestion: string }> = [];
    for (const entry of suggestions) {
      rows.push({ suggestion: entry });
    }
    return rows;
  },
});

src/clis/google/trends.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/**
2+
* Google Trends via public RSS feed.
3+
* Shows daily trending searches for a given region.
4+
*/
5+
6+
import { cli, Strategy } from '../../registry.js';
7+
import { CliError } from '../../errors.js';
8+
import { parseRssItems } from './utils.js';
9+
10+
// Registers the `google trends` command.
//
// Fetches the Google Trends "trending" RSS feed for a region and maps each
// parsed item to a { title, traffic, date } row.
//
// Throws CliError('FETCH_ERROR') on a non-2xx response and
// CliError('NOT_FOUND') when the feed yields no items.
cli({
  site: 'google',
  name: 'trends',
  description: 'Get Google Trends daily trending searches',
  strategy: Strategy.PUBLIC,
  browser: false,
  args: [
    { name: 'region', default: 'US', help: 'Region code (e.g. US, CN, JP)' },
    { name: 'limit', type: 'int', default: 20, help: 'Number of results' },
  ],
  columns: ['title', 'traffic', 'date'],
  func: async (_page, args) => {
    // NaN survives Math.min/Math.max and makes slice(0, NaN) return nothing,
    // so a non-numeric limit falls back to the declared default of 20.
    const requestedLimit = Number(args.limit);
    const limit = Math.max(1, Math.min(Number.isNaN(requestedLimit) ? 20 : requestedLimit, 100));
    const region = encodeURIComponent(args.region);
    const url = `https://trends.google.com/trending/rss?geo=${region}`;

    const resp = await fetch(url);
    if (!resp.ok) {
      throw new CliError('FETCH_ERROR', `HTTP ${resp.status}`, 'Check your network connection or region code');
    }

    const xml = await resp.text();
    const items = parseRssItems(xml, ['title', 'pubDate', 'ht:approx_traffic']);

    if (!items.length) {
      throw new CliError('NOT_FOUND', 'No trending data found', 'Try a different region code');
    }

    // Default missing fields to '' so a row never carries `undefined`.
    return items.slice(0, limit).map(item => ({
      title: item['title'] || '',
      traffic: item['ht:approx_traffic'] || '', // raw string e.g. "1,000,000+", no numeric conversion
      date: item['pubDate'] || '',
    }));
  },
});

0 commit comments

Comments
 (0)