diff --git a/CHANGELOG.md b/CHANGELOG.md index b047e0a..0d687dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,10 @@ All notable changes to SearchMob Desktop are documented here. The version scheme for, other people on your network. You can **export, import, and reset** the learned model; the export uses a portable format shared with the Android app, so you can back it up or move it between devices. +- **Learns from the served browser page too.** When you search from your browser through the local + server, clicking a result trains the model the same way the in-app results do, through an + owner-only redirect that only ever sends you to the result you clicked. People on your network get + plain result links and never train or see your personalization. - **Setup wizard re-appears once after a feature update.** Existing users see the wizard again (now framed as "What's new") when an update adds a step worth seeing, so new opt-in features are not hidden from people who already onboarded. diff --git a/openspec/changes/add-click-personalization/tasks.md b/openspec/changes/add-click-personalization/tasks.md index 7f6673c..eff87f3 100644 --- a/openspec/changes/add-click-personalization/tasks.md +++ b/openspec/changes/add-click-personalization/tasks.md @@ -29,9 +29,9 @@ ## 6. Served-page learning (PR D2) -- [ ] 6.1 Add owner-only `/click` route in `server/app.py` with a bounded in-memory `rid -> ordered (url, host)` map; record the skip-above update and 302 to the recorded destination for `rid+pos`; 404 non-owner callers; add `/click` to the owner-gated paths. -- [ ] 6.2 Render result links as `/click?...` only for owner/loopback requests in `server/templates.py`; LAN clients keep bare `<a href>` destination links and are never tracked or personalized. -- [ ] 6.3 Tests: owner click records and redirects correctly; LAN client gets no tracking link and cannot forge `rid`/poison the model; bad `rid`/`pos` fail safe. Gate green; open PR D2. 
+- [x] 6.1 Add owner-only `/click` route in `server/app.py` with a bounded in-memory `rid -> ordered (url, host)` map; record the skip-above update and 302 to the recorded destination for `rid+pos`; 404 non-owner callers; add `/click` to the owner-gated paths. +- [x] 6.2 Render result links as `/click?...` only for owner/loopback requests in `server/templates.py`; LAN clients keep bare `<a href>` destination links and are never tracked or personalized. +- [x] 6.3 Tests: owner click records and redirects correctly; LAN client gets no tracking link and cannot forge `rid`/poison the model; bad `rid`/`pos` fail safe. Gate green; open PR D2. ## 7. Android parity (separate PRs A1/A2, mobile repo) diff --git a/src/searchmob_desktop/gui/server_controller.py b/src/searchmob_desktop/gui/server_controller.py index f248286..7cb0d5a 100644 --- a/src/searchmob_desktop/gui/server_controller.py +++ b/src/searchmob_desktop/gui/server_controller.py @@ -20,7 +20,10 @@ from searchmob_desktop.data.api_keys import read_vault_api_keys, resolve_api_key from searchmob_desktop.data.history import HistoryStore, InMemoryHistoryStore -from searchmob_desktop.data.personalization_store import load_personalization +from searchmob_desktop.data.personalization_store import ( + load_personalization, + save_personalization, +) from searchmob_desktop.data.ranking_store import load_ranking_rules, save_ranking_rules from searchmob_desktop.engines import ( EngineFn, @@ -203,6 +206,13 @@ def _personalization() -> PersonalizationModel | None: return None return load_personalization() + def _save_personalization(model: PersonalizationModel) -> bool: + # Persist the model after an owner click on the served page. Only writes when enabled, + # so a stale click cannot resurrect learning the owner has turned off. 
+ if not self._prefs_store.load().personalization_enabled: + return False + return save_personalization(model) + app = build_app( self._engines, bound_port_getter=lambda: self._port, @@ -214,7 +224,9 @@ def _personalization() -> PersonalizationModel | None: ranking_rules_provider=load_ranking_rules, ranking_rules_saver=save_ranking_rules, # The learned click model, applied only for the loopback owner (never network visitors). + # The saver persists owner clicks on the served page back to the encrypted vault. personalization_provider=_personalization, + personalization_saver=_save_personalization, # Live prefs so the served Settings page reads and persists preferences without a # restart. Summary is always wired; the live `summary_enabled` pref gates it. prefs_provider=self._prefs_store.load, diff --git a/src/searchmob_desktop/server/app.py b/src/searchmob_desktop/server/app.py index f536ba8..ada352b 100644 --- a/src/searchmob_desktop/server/app.py +++ b/src/searchmob_desktop/server/app.py @@ -27,10 +27,12 @@ import asyncio import inspect import json +import secrets import socket import time +from collections import OrderedDict from collections.abc import Awaitable, Callable, Sequence -from dataclasses import asdict, replace +from dataclasses import asdict, dataclass, replace from ipaddress import ip_address from urllib.parse import parse_qsl, urlsplit @@ -60,6 +62,7 @@ parse_goggles, personalize_reorder, ) +from searchmob_desktop.engines.rank.personalize import query_terms, update_from_click from searchmob_desktop.engines.rank.slop_blocklist import load_slop_domains from searchmob_desktop.engines.sort import SortMode, sort_results from searchmob_desktop.engines.verticals import Vertical, default_sort, transform_query @@ -324,6 +327,22 @@ def _suggestions_body(query: str, suggestions: Sequence[str]) -> str: # through `aggregate`. The default plumbs through to the real `aggregate(ctx, engines)` call. 
_MetasearchFn = Callable[[EngineContext, Sequence[EngineFn]], Awaitable[list[SearchResult]]] +# Upper bound on remembered result renders for owner click-tracking (one per recent owner search). +# Small on purpose: only the most recent few pages need clickable tracking links at once. +_RENDER_CACHE_MAX = 64 + + +@dataclass(frozen=True) +class _RenderedResults: + """One owner-rendered result page: the query and the displayed (url, host) order. + + Held briefly in memory so the owner-only `/click` endpoint can resolve a click to its result and + its skipped-above neighbors using server state, never a client-supplied URL. Never persisted. + """ + + query: str + items: list[tuple[str, str | None]] + def build_app( engines: Sequence[EngineFn], @@ -336,6 +355,7 @@ def build_app( ranking_rules_provider: Callable[[], RankingRules] | None = None, ranking_rules_saver: Callable[[RankingRules], bool] | None = None, personalization_provider: Callable[[], PersonalizationModel | None] | None = None, + personalization_saver: Callable[[PersonalizationModel], bool] | None = None, prefs_provider: Callable[[], UserPreferences] | None = None, prefs_saver: Callable[[UserPreferences], bool] | None = None, history_provider: Callable[[], list[HistoryEntry]] | None = None, @@ -399,6 +419,10 @@ def _clamp(raw: str | None) -> str: static_rules = ranking_rules if ranking_rules is not None else RankingRules() rules_provider: Callable[[], RankingRules] = ranking_rules_provider or (lambda: static_rules) + # Per-app, in-memory map of recent owner renders (render id -> displayed order), used only to + # resolve owner clicks on the served page back to their result for the `/click` endpoint. + _render_cache: OrderedDict[str, _RenderedResults] = OrderedDict() + def _load_prefs() -> UserPreferences | None: # Read live preferences per request so the served Settings toggles apply without a restart. # Fail-soft: any error reading prefs falls back to the static defaults passed to build_app. 
@@ -414,7 +438,7 @@ async def _run_metasearch( sort_mode: SortMode = SortMode.FRESH_RELEVANT, vertical: Vertical = Vertical.WEB, *, - owner: bool = False, + model: PersonalizationModel | None = None, ) -> list[SearchResult]: if not query.strip() or not engines: return [] @@ -428,16 +452,14 @@ async def _run_metasearch( prefs = _load_prefs() slop_mode = prefs.ai_slop_mode if prefs is not None else ai_slop_mode # Sort (relevance/date/freshness blend), then nudge by the owner's learned click model (only - # for the loopback owner, never a network visitor), then apply the user's rules so the - # served results match the in-app results and PIN/RAISE preserve the order. + # when a model is passed, which the caller does for the loopback owner), then apply the + # user's rules so the served results match the in-app results and PIN/RAISE preserve order. now_ms = int(time.time() * 1000) ordered = sort_results(results, sort_mode, query, now_ms) - if owner and personalization_provider is not None: - model = personalization_provider() - if model is not None: - ordered = personalize_reorder( - ordered, lambda r: host_of_url(r.url), query, model, now_ms - ) + if model is not None: + ordered = personalize_reorder( + ordered, lambda r: host_of_url(r.url), query, model, now_ms + ) return apply_ranking( ordered, rules_provider(), @@ -447,6 +469,27 @@ async def _run_metasearch( slop_mode=slop_mode, ) + def _owner_model(request: Request) -> PersonalizationModel | None: + # The learned model for the loopback owner, or None when off / disabled / not the owner. + # Used to personalize the owner's results and to gate click-tracking links and `/click`. 
+ if personalization_provider is None or not _is_loopback_request(request): + return None + try: + return personalization_provider() + except Exception: + return None + + def _register_render(query: str, results: list[SearchResult]) -> str: + # Remember the exact displayed order under a fresh, unguessable id so an owner click can be + # matched to its result (and its skipped-above neighbors) without trusting any client input. + # Bounded and in-memory only; never persisted. + rid = secrets.token_urlsafe(9) + items = [(r.url, host_of_url(r.url)) for r in results] + _render_cache[rid] = _RenderedResults(query=query, items=items) + while len(_render_cache) > _RENDER_CACHE_MAX: + _render_cache.popitem(last=False) # evict the oldest render + return rid + def _correction(query: str) -> str | None: # On-device "did you mean". `suggest` is fail-soft and already returns None when the # corrected query equals the input, so any non-None result is a genuine suggestion. @@ -517,10 +560,15 @@ async def search_html(request: Request) -> Response: # Fetch the contextual summary concurrently with the metasearch so the box never adds # latency to the results path. summary_task = asyncio.ensure_future(_maybe_summary(query)) - results = await _run_metasearch( - query, sort_mode, vertical, owner=_is_loopback_request(request) - ) + model = _owner_model(request) + results = await _run_metasearch(query, sort_mode, vertical, model=model) summary = await summary_task + # When personalization is on for the owner, route result links through `/click` so a click + # can train the model; everyone else (and a disabled owner) gets the plain destination link. 
+ link_builder: Callable[[int, str], str] | None = None + if model is not None and results: + rid = _register_render(query, results) + link_builder = lambda pos, _url, _rid=rid: f"/click?rid={_rid}&pos={pos}" # noqa: E731 body = render_results_page( query, results, @@ -532,9 +580,41 @@ async def search_html(request: Request) -> Response: sort_mode=sort_mode.value, vertical=vertical.value, settings_link=_is_settings_owner(request), + link_builder=link_builder, ) return Response(body, media_type="text/html; charset=utf-8") + async def click(request: Request) -> Response: + # Owner-only redirector that learns from a click on the served results page. It resolves the + # destination from server-side render state (never a caller-supplied URL), so it cannot be + # an open redirect, and it trains only the loopback owner's model. + if not _is_loopback_request(request): + return PlainTextResponse("not found", status_code=404) + render = _render_cache.get(request.query_params.get("rid", "")) + if render is None: + return RedirectResponse("/", status_code=303) + try: + pos = int(request.query_params.get("pos", "")) + except ValueError: + return RedirectResponse("/", status_code=303) + if pos < 0 or pos >= len(render.items): + return RedirectResponse("/", status_code=303) + dest_url, _host = render.items[pos] + if not is_safe_http_url(dest_url): + return RedirectResponse("/", status_code=303) + if personalization_provider is not None and personalization_saver is not None: + model = _owner_model(request) + if model is not None: + try: + hosts = [host for _u, host in render.items] + update_from_click( + model, hosts, pos, query_terms(render.query), int(time.time() * 1000) + ) + personalization_saver(model) + except Exception: + pass # Learning is best-effort; never block the navigation on it. + return RedirectResponse(dest_url, status_code=302) + def _redirect_back(request: Request) -> Response: # Return to the page the POST came from when it is one of our own origins; else home. 
303 # makes the browser re-fetch with GET so a refresh does not re-POST. @@ -683,9 +763,7 @@ async def search_json(request: Request) -> Response: vertical = Vertical.from_value(request.query_params.get("vertical")) sort_param = request.query_params.get("sort") sort_mode = SortMode.from_value(sort_param) if sort_param else default_sort(vertical) - results = await _run_metasearch( - query, sort_mode, vertical, owner=_is_loopback_request(request) - ) + results = await _run_metasearch(query, sort_mode, vertical, model=_owner_model(request)) payload = { "query": query, "results": [asdict(result) for result in results], @@ -720,6 +798,9 @@ async def suggest(request: Request) -> Response: Route("/api/search", search_json, methods=["GET"]), Route("/opensearch.xml", opensearch_xml, methods=["GET"]), Route("/suggest", suggest, methods=["GET"]), + # Owner-only click redirector that trains the personalization model (loopback-only; a + # non-owner gets 404, and it only redirects to a server-recorded result URL). + Route("/click", click, methods=["GET"]), # Personalization edits from the served UI. Gated loopback-only + same-origin in the # middleware; no-op (503) when no saver is wired. Route("/rules/domain", set_domain_rule, methods=["POST"]), diff --git a/src/searchmob_desktop/server/templates.py b/src/searchmob_desktop/server/templates.py index a6ce789..52a33f9 100644 --- a/src/searchmob_desktop/server/templates.py +++ b/src/searchmob_desktop/server/templates.py @@ -447,6 +447,7 @@ def render_results_page( sort_mode: str = "fresh", vertical: str = "web", settings_link: bool = False, + link_builder: Callable[[int, str], str] | None = None, ) -> str: """The results page. Empty/blank query -> a placeholder; otherwise -> the merged results. @@ -510,14 +511,18 @@ def render_results_page( parts.append(_sort_bar(query, sort_mode)) if editable: parts.append(_scope_bar(active_rules)) - for result in results_list: + for index, result in enumerate(results_list): parts.append('
') parts.append(f'
{escape(_display_url(result.url))}
') if is_safe_http_url(result.url): # rel=noreferrer backs up the Referrer-Policy header so the query (in the loopback - # URL) never leaks to the destination; noopener severs window.opener. + # URL) never leaks to the destination; noopener severs window.opener. When a + # link_builder is wired (owner + personalization on), the anchor points at the + # owner-only `/click` redirector so the click can train the model; otherwise it is + # the plain destination URL. + href = link_builder(index, result.url) if link_builder is not None else result.url parts.append( - f'
{escape(result.title)}' ) else: diff --git a/tests/server/test_click_route.py b/tests/server/test_click_route.py new file mode 100644 index 0000000..c59877d --- /dev/null +++ b/tests/server/test_click_route.py @@ -0,0 +1,122 @@ +"""Owner-only `/click` redirector: it learns from owner clicks on the served page and is safe. + +Drives `/search` to populate the render cache, then `/click` to exercise: owner click records the +skip-above update and redirects to the recorded URL; a network visitor is refused and gets no +tracking links; a forged or stale `rid`/`pos` fails safe without redirecting off-site. +""" + +from __future__ import annotations + +import time + +from starlette.testclient import TestClient + +from searchmob_desktop.engines import EngineContext, SearchResult +from searchmob_desktop.engines.rank import personalize as p +from searchmob_desktop.server.app import build_app + +_NOW = int(time.time() * 1000) + + +class _Model: + """A live model holder so the provider and saver share one instance across requests.""" + + def __init__(self) -> None: + self.model = p.PersonalizationModel(config=p.PersonalizationConfig(epsilon=0.0)) + + def provide(self) -> p.PersonalizationModel: + return self.model + + def save(self, model: p.PersonalizationModel) -> bool: + self.model = model + return True + + +async def _metasearch(_ctx: EngineContext, _engines: object) -> list[SearchResult]: + return [ + SearchResult(title="One", url="https://a.example/1", snippet="s", engine="e"), + SearchResult(title="Two", url="https://b.example/2", snippet="s", engine="e"), + SearchResult(title="Three", url="https://liked.example/3", snippet="s", engine="e"), + ] + + +def _app(holder: _Model | None) -> object: + return build_app( + [lambda _c, _ctx: []], + bound_port_getter=lambda: 8787, + bound_host_getter=lambda: "0.0.0.0", + personalization_provider=holder.provide if holder else None, + personalization_saver=holder.save if holder else None, + metasearch=_metasearch, # type: 
ignore[arg-type] + host_allowlist_enabled=False, + ) + + +def _loopback(app: object) -> TestClient: + return TestClient(app, client=("127.0.0.1", 9)) # type: ignore[arg-type] + + +def _remote(app: object) -> TestClient: + return TestClient(app, client=("192.168.1.20", 9)) # type: ignore[arg-type] + + +def _rid_from_search(client: TestClient) -> str: + html = client.get("/search", params={"q": "python list"}).text + # The owner page routes result links through /click?rid=..&pos=.. + start = html.index("/click?rid=") + len("/click?rid=") + return html[start : html.index("&", start)] + + +def test_owner_page_uses_click_links_and_records_then_redirects() -> None: + holder = _Model() + client = _loopback(_app(holder)) + rid = _rid_from_search(client) + + # Click the third result (the skipped-above ones are a.example and b.example). + resp = client.get(f"/click?rid={rid}&pos=2", follow_redirects=False) + assert resp.status_code == 302 + assert resp.headers["location"] == "https://liked.example/3" + # The model learned: clicked host got a click, the two above it got skips. + assert holder.model.domains["liked.example"].alpha == holder.model.config.alpha_prior + 1 + assert holder.model.domains["a.example"].beta == holder.model.config.beta_prior + 1 + assert "c.example" not in holder.model.domains # nothing below the click was touched + + +def test_network_visitor_gets_no_click_links_and_cannot_use_click() -> None: + holder = _Model() + app = _app(holder) + # The remote page renders bare destination links, not /click. + html = _remote(app).get("/search", params={"q": "python list"}).text + assert "/click?rid=" not in html + assert 'href="https://liked.example/3"' in html + # And the /click endpoint itself refuses a non-owner (404), so it cannot be used to train. 
+ resp = _remote(app).get("/click?rid=anything&pos=0", follow_redirects=False) + assert resp.status_code == 404 + + +def test_owner_cannot_be_redirected_with_a_forged_or_stale_rid() -> None: + holder = _Model() + client = _loopback(_app(holder)) + # An unknown rid never redirects off-site; it falls back to home (303), and nothing is learned. + resp = client.get("/click?rid=forged&pos=0", follow_redirects=False) + assert resp.status_code == 303 + assert resp.headers["location"] == "/" + assert holder.model.is_empty() + + +def test_bad_position_fails_safe() -> None: + holder = _Model() + client = _loopback(_app(holder)) + rid = _rid_from_search(client) + for pos in ("99", "-1", "notanint"): + resp = client.get(f"/click?rid={rid}&pos={pos}", follow_redirects=False) + assert resp.status_code == 303 + assert resp.headers["location"] == "/" + assert holder.model.is_empty() + + +def test_disabled_personalization_renders_plain_links() -> None: + # No provider/saver wired (feature off): the owner page has no /click links. + html = _loopback(_app(None)).get("/search", params={"q": "python list"}).text + assert "/click?rid=" not in html + assert 'href="https://liked.example/3"' in html