// Reconstructed from commit 3eb9781 "morelike" (Lu Wilson, 2026-05-26): the helper
// added to src/lib/config.ts, followed by the new src/lib/fetchMorelikeSearch.ts.

// ---- src/lib/config.ts (added helper) ----

/**
 * Hostname for a Wikipedia language edition (default English).
 *
 * The code is trimmed and lower-cased; a blank value falls back to `en`.
 *
 * @param lang - Language subdomain such as `en`, `de` or `zh-min-nan`.
 * @returns The host name, e.g. `en.wikipedia.org`.
 * @throws RangeError when `lang` contains anything other than letters, digits
 *   and single interior hyphens, so the value can never change the host part
 *   of a URL built from the result.
 */
export function wikiHostFromLang(lang = 'en'): string {
  const code = lang.trim().toLowerCase() || 'en'
  if (!/^[a-z0-9]+(?:-[a-z0-9]+)*$/.test(code)) {
    throw new RangeError(`Invalid Wikipedia language code: "${lang}"`)
  }
  return `${code}.wikipedia.org`
}

// ---- src/lib/fetchMorelikeSearch.ts ----
// In the split file layout `wikiHostFromLang` is imported from '@/lib/config'.

const API_USER_AGENT =
  'ProtoWiki/0.1 (https://github.com/wikimedia-research/protowiki) morelike-search'

const DEFAULT_LIMIT = 20

/** Upper bound applied to `limit` before it is sent as `srlimit`. */
const MAX_LIMIT = 50

/** Error raised by {@link fetchMorelikeSearch}; `code` says which stage failed. */
export class FetchMorelikeSearchError extends Error {
  constructor(
    message: string,
    public readonly code: 'empty_seeds' | 'aborted' | 'http',
  ) {
    super(message)
    this.name = 'FetchMorelikeSearchError'
  }
}

export interface MorelikeSearchResult {
  title: string
  /** Search snippet with markup removed (see {@link stripSearchSnippetHtml}). */
  snippet: string
  /** Page id from the API, or `0` when the hit carried none. */
  pageid: number
}

export interface FetchMorelikeSearchResponse {
  results: MorelikeSearchResult[]
  /** Offset to pass back for the next page; absent when the API sent no `continue.sroffset`. */
  nextOffset?: number
}

export interface FetchMorelikeSearchOptions {
  signal?: AbortSignal
  /** Wikipedia language code (default `en`). */
  lang?: string
  /** Result limit per request (default 20). */
  limit?: number
  /** Pagination offset from a prior response. */
  offset?: number
}

/** Throws an `aborted` error if the caller's signal has already fired. */
function assertNotAborted(signal?: AbortSignal): void {
  if (signal?.aborted) {
    throw new FetchMorelikeSearchError('Request aborted', 'aborted')
  }
}

/** Builds an Action API URL; `origin=*` is added to every request. */
function actionUrl(wikiHost: string, params: Record<string, string>): string {
  const search = new URLSearchParams({
    ...params,
    format: 'json',
    formatversion: '2',
    origin: '*',
  })
  return `https://${wikiHost}/w/api.php?${search.toString()}`
}

/** Comparison key for titles: trimmed, underscores as spaces, lower-cased. */
function normalizeTitleKey(title: string): string {
  return title.trim().replace(/_/g, ' ').toLowerCase()
}

/** Named entities decoded by the non-DOM fallback. A Map avoids prototype-key lookups. */
const BASIC_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
])

/**
 * Decodes numeric and a handful of named character references in one pass, so
 * `&amp;lt;` becomes `&lt;` rather than `<`. Unknown references are left as-is.
 */
function decodeBasicEntities(text: string): string {
  return text.replace(
    /&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g,
    (match: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X'
        const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10)
        // Out-of-range values would make String.fromCodePoint throw.
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match
      }
      return BASIC_ENTITIES.get(body) ?? match
    },
  )
}

/**
 * Reduces a search snippet (HTML) to plain text with collapsed whitespace.
 *
 * When `DOMParser` is available the markup is parsed into a separate, inert
 * document instead of being assigned to `innerHTML` of an element owned by the
 * page. Elsewhere, tags are dropped without inserting a separator (matching
 * `textContent`) and basic character references are decoded.
 *
 * @param html - Snippet markup as returned by the API.
 * @returns Plain text, trimmed, with runs of whitespace collapsed to one space.
 */
export function stripSearchSnippetHtml(html: string): string {
  let text: string
  if (typeof DOMParser !== 'undefined') {
    const parsed = new DOMParser().parseFromString(html, 'text/html')
    text = parsed.body?.textContent ?? ''
  } else {
    text = decodeBasicEntities(html.replace(/<[^>]+>/g, ''))
  }
  return text.replace(/\s+/g, ' ').trim()
}

function buildMorelikeQuery(seedTitles: string[]): string {
  return `morelike:${seedTitles.join('|')}`
}

/** Truncates to an integer and clamps to `[min, max]`; missing or NaN input yields `fallback`. */
function clampInt(value: number | undefined, fallback: number, min: number, max: number): number {
  const base = value == null || Number.isNaN(value) ? fallback : Math.trunc(value)
  return Math.min(max, Math.max(min, base))
}

/**
 * Cirrus Search "more like this" via Action API `list=search`.
 *
 * @param seedTitles - Seed page titles; blank entries are ignored.
 * @param options - Abort signal, language, page size and pagination offset.
 * @returns Hits (seed pages themselves are filtered out) plus the next offset, if any.
 * @throws FetchMorelikeSearchError `empty_seeds` when no usable seed remains,
 *   `aborted` when the signal had already fired, `http` for a non-OK status or
 *   an `error` object in the response body.
 */
export async function fetchMorelikeSearch(
  seedTitles: string[],
  options: FetchMorelikeSearchOptions = {},
): Promise<FetchMorelikeSearchResponse> {
  const seeds = seedTitles.map((title) => title.trim()).filter(Boolean)
  if (!seeds.length) {
    throw new FetchMorelikeSearchError('Enter at least one seed page', 'empty_seeds')
  }

  const wikiHost = wikiHostFromLang(options.lang ?? 'en')
  const limit = clampInt(options.limit, DEFAULT_LIMIT, 1, MAX_LIMIT)
  const offset = clampInt(options.offset, 0, 0, Number.MAX_SAFE_INTEGER)
  const seedKeys = new Set(seeds.map(normalizeTitleKey))

  assertNotAborted(options.signal)

  const params: Record<string, string> = {
    action: 'query',
    list: 'search',
    srsearch: buildMorelikeQuery(seeds),
    srwhat: 'text',
    srnamespace: '0',
    srlimit: String(limit),
    sroffset: String(offset),
  }

  const response = await fetch(actionUrl(wikiHost, params), {
    signal: options.signal,
    headers: { 'Api-User-Agent': API_USER_AGENT },
  })

  if (!response.ok) {
    throw new FetchMorelikeSearchError(`HTTP ${response.status}`, 'http')
  }

  const data = (await response.json()) as {
    error?: { code?: string; info?: string }
    query?: {
      search?: Array<{
        title?: string
        pageid?: number
        snippet?: string
      }>
    }
    continue?: { sroffset?: number }
  }

  // An `error` object in the body must not be mistaken for "no results".
  if (data.error) {
    throw new FetchMorelikeSearchError(
      data.error.info || data.error.code || 'Wikipedia API error',
      'http',
    )
  }

  const results: MorelikeSearchResult[] = []

  for (const hit of data.query?.search ?? []) {
    const title = typeof hit.title === 'string' ? hit.title.trim() : ''
    if (!title.length) continue
    // A seed page is never a useful recommendation for itself.
    if (seedKeys.has(normalizeTitleKey(title))) continue

    results.push({
      title,
      pageid: typeof hit.pageid === 'number' ? hit.pageid : 0,
      snippet: stripSearchSnippetHtml(typeof hit.snippet === 'string' ? hit.snippet : ''),
    })
  }

  const nextOffset =
    typeof data.continue?.sroffset === 'number' ? data.continue.sroffset : undefined

  return { results, nextOffset }
}
// ---- src/lib/fetchPageThumbnailsBatch.ts ----
import { wikiHostFromLang } from '@/lib/config'

const API_USER_AGENT =
  'ProtoWiki/0.1 (https://github.com/wikimedia-research/protowiki) morelike-search'

const THUMB_SIZE = 96
const MAX_TITLES_PER_REQUEST = 50

/** Safety cap when following chained entries of the `redirects` list. */
const MAX_REDIRECT_HOPS = 5

/** Error raised by {@link fetchPageThumbnailsBatch}. */
export class FetchPageThumbnailsBatchError extends Error {
  constructor(
    message: string,
    public readonly code: 'aborted' | 'http',
  ) {
    super(message)
    this.name = 'FetchPageThumbnailsBatchError'
  }
}

export interface FetchPageThumbnailsBatchOptions {
  signal?: AbortSignal
  /** Wikipedia language code (default `en`). */
  lang?: string
}

/** One `{ from, to }` entry of the API's `normalized` / `redirects` lists. */
interface TitleMapping {
  from?: string
  to?: string
}

/** Throws an `aborted` error if the caller's signal has already fired. */
function assertNotAborted(signal?: AbortSignal): void {
  if (signal?.aborted) {
    throw new FetchPageThumbnailsBatchError('Request aborted', 'aborted')
  }
}

/** Builds an Action API URL; `origin=*` is added to every request. */
function actionUrl(wikiHost: string, params: Record<string, string>): string {
  const search = new URLSearchParams({
    ...params,
    format: 'json',
    formatversion: '2',
    origin: '*',
  })
  return `https://${wikiHost}/w/api.php?${search.toString()}`
}

/** Indexes `{ from, to }` entries by `from`, skipping malformed ones. */
function toTitleMap(entries: TitleMapping[] | undefined): Map<string, string> {
  const map = new Map<string, string>()
  for (const entry of entries ?? []) {
    if (typeof entry.from === 'string' && typeof entry.to === 'string') {
      map.set(entry.from, entry.to)
    }
  }
  return map
}

/** Title the API reported a requested title under, after normalization and redirects. */
function resolveTitle(
  requested: string,
  normalized: Map<string, string>,
  redirects: Map<string, string>,
): string {
  let current = normalized.get(requested) ?? requested
  for (let hop = 0; hop < MAX_REDIRECT_HOPS; hop++) {
    const next = redirects.get(current)
    if (next === undefined || next === current) break
    current = next
  }
  return current
}

function hasOwn(record: object, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(record, key)
}

/**
 * Fetches thumbnails for one chunk of titles (`prop=pageimages`, redirects followed).
 *
 * @returns A record keyed by the title the API reports for each page; the value
 *   is the thumbnail URL, or `undefined` for a page without one. Requested
 *   titles that the API normalized or redirected also get an entry under the
 *   title as requested, so callers can look results up by the titles they passed in.
 * @throws FetchPageThumbnailsBatchError `aborted` when the signal had already
 *   fired, `http` for a non-OK status or an `error` object in the response body.
 */
async function fetchThumbnailBatch(
  wikiHost: string,
  titles: string[],
  signal?: AbortSignal,
): Promise<Record<string, string | undefined>> {
  assertNotAborted(signal)

  const response = await fetch(
    actionUrl(wikiHost, {
      action: 'query',
      prop: 'pageimages',
      pithumbsize: String(THUMB_SIZE),
      redirects: '1',
      titles: titles.join('|'),
    }),
    {
      signal,
      headers: { 'Api-User-Agent': API_USER_AGENT },
    },
  )

  if (!response.ok) {
    throw new FetchPageThumbnailsBatchError(`HTTP ${response.status}`, 'http')
  }

  const data = (await response.json()) as {
    error?: { code?: string; info?: string }
    query?: {
      normalized?: TitleMapping[]
      redirects?: TitleMapping[]
      pages?: Array<{
        title?: string
        thumbnail?: { source?: string }
      }>
    }
  }

  if (data.error) {
    throw new FetchPageThumbnailsBatchError(
      data.error.info || data.error.code || 'Wikipedia API error',
      'http',
    )
  }

  const out: Record<string, string | undefined> = {}
  for (const page of data.query?.pages ?? []) {
    const title = typeof page.title === 'string' ? page.title : ''
    if (!title.length) continue
    out[title] = page.thumbnail?.source
  }

  // Pages come back under their resolved titles; mirror each result under the
  // title that was actually requested when the two differ.
  const normalized = toTitleMap(data.query?.normalized)
  const redirects = toTitleMap(data.query?.redirects)
  for (const requested of titles) {
    const resolved = resolveTitle(requested, normalized, redirects)
    if (resolved !== requested && hasOwn(out, resolved) && !hasOwn(out, requested)) {
      out[requested] = out[resolved]
    }
  }

  return out
}

/**
 * Batch-fetch lead-image thumbnails for article titles.
 *
 * Titles are trimmed and de-duplicated, then requested in chunks of
 * `MAX_TITLES_PER_REQUEST`, one request after another.
 *
 * @param titles - Article titles; blank entries are ignored.
 * @param options - Abort signal and language code.
 * @returns Thumbnail URL (or `undefined`) per title; `{}` when no usable title was given.
 * @throws FetchPageThumbnailsBatchError see {@link fetchThumbnailBatch}.
 */
export async function fetchPageThumbnailsBatch(
  titles: string[],
  options: FetchPageThumbnailsBatchOptions = {},
): Promise<Record<string, string | undefined>> {
  const unique = [...new Set(titles.map((title) => title.trim()).filter(Boolean))]
  if (!unique.length) return {}

  const wikiHost = wikiHostFromLang(options.lang ?? 'en')
  const merged: Record<string, string | undefined> = {}

  for (let i = 0; i < unique.length; i += MAX_TITLES_PER_REQUEST) {
    const chunk = unique.slice(i, i + MAX_TITLES_PER_REQUEST)
    const batch = await fetchThumbnailBatch(wikiHost, chunk, options.signal)
    Object.assign(merged, batch)
  }

  return merged
}
// ---- src/lib/fetchUserEditedPageTitles.ts ----
import { normalizeWikiUsername, wikiHostFromLang } from '@/lib/config'

const API_USER_AGENT =
  'ProtoWiki/0.1 (https://github.com/wikimedia-research/protowiki) morelike-search'

const CONTRIBS_PER_PAGE = 500
const MAX_CONTRIB_PAGES = 5
export const MAX_SEED_PAGES = 20

/** Error raised by {@link fetchUserEditedPageTitles}; `code` says which stage failed. */
export class FetchUserEditedPageTitlesError extends Error {
  constructor(
    message: string,
    public readonly code:
      | 'missing_username'
      | 'user_not_found'
      | 'no_edits'
      | 'aborted'
      | 'http',
  ) {
    super(message)
    this.name = 'FetchUserEditedPageTitlesError'
  }
}

export interface FetchUserEditedPageTitlesOptions {
  signal?: AbortSignal
  /** Wikipedia language code (default `en`). */
  lang?: string
  /** Max unique article titles to return (default 20). */
  limit?: number
}

/** Part of an Action API body that signals a failure. */
interface ApiErrorEnvelope {
  error?: { code?: string; info?: string }
}

/** Throws an `aborted` error if the caller's signal has already fired. */
function assertNotAborted(signal?: AbortSignal): void {
  if (signal?.aborted) {
    throw new FetchUserEditedPageTitlesError('Request aborted', 'aborted')
  }
}

/** Builds an Action API URL; `origin=*` is added to every request. */
function actionUrl(wikiHost: string, params: Record<string, string>): string {
  const search = new URLSearchParams({
    ...params,
    format: 'json',
    formatversion: '2',
    origin: '*',
  })
  return `https://${wikiHost}/w/api.php?${search.toString()}`
}

/** Comparison key for titles: trimmed, underscores as spaces, lower-cased. */
function normalizeTitleKey(title: string): string {
  return title.trim().replace(/_/g, ' ').toLowerCase()
}

/**
 * Performs one Action API GET and returns the decoded body.
 *
 * @throws FetchUserEditedPageTitlesError `aborted` when the signal had already
 *   fired, `http` for a non-OK status or an `error` object in the body.
 */
async function fetchApiJson<T extends ApiErrorEnvelope>(
  wikiHost: string,
  params: Record<string, string>,
  signal?: AbortSignal,
): Promise<T> {
  assertNotAborted(signal)

  const response = await fetch(actionUrl(wikiHost, params), {
    signal,
    headers: { 'Api-User-Agent': API_USER_AGENT },
  })

  if (!response.ok) {
    throw new FetchUserEditedPageTitlesError(`HTTP ${response.status}`, 'http')
  }

  const data = (await response.json()) as T
  // Without this check a rejected query would read as "user has no edits".
  if (data.error) {
    throw new FetchUserEditedPageTitlesError(
      data.error.info || data.error.code || 'Wikipedia API error',
      'http',
    )
  }
  return data
}

/**
 * Unique article titles from a user's namespace-0 edit history (most recent first).
 *
 * The user is looked up first, then up to `MAX_CONTRIB_PAGES` pages of
 * `CONTRIBS_PER_PAGE` contributions are scanned until `limit` distinct titles
 * have been collected.
 *
 * @param rawUsername - Username as typed; normalized with `normalizeWikiUsername`.
 * @param options - Abort signal, language code and title limit (capped at
 *   `MAX_SEED_PAGES`; a NaN limit falls back to that cap).
 * @returns Distinct titles in the order they were first encountered.
 * @throws FetchUserEditedPageTitlesError `missing_username`, `user_not_found`,
 *   `no_edits`, `aborted` or `http`.
 */
export async function fetchUserEditedPageTitles(
  rawUsername: string,
  options: FetchUserEditedPageTitlesOptions = {},
): Promise<string[]> {
  const username = normalizeWikiUsername(rawUsername)
  if (!username.length) {
    throw new FetchUserEditedPageTitlesError('Enter a Wikipedia username', 'missing_username')
  }

  const wikiHost = wikiHostFromLang(options.lang ?? 'en')
  const requestedLimit = options.limit ?? MAX_SEED_PAGES
  // `length >= NaN` is never true, so a NaN limit would otherwise lift the cap.
  const limit = Math.max(
    1,
    Math.min(Number.isNaN(requestedLimit) ? MAX_SEED_PAGES : requestedLimit, MAX_SEED_PAGES),
  )

  const userData = await fetchApiJson<
    ApiErrorEnvelope & { query?: { users?: Array<{ name?: string; missing?: boolean }> } }
  >(
    wikiHost,
    {
      action: 'query',
      list: 'users',
      ususers: username,
    },
    options.signal,
  )

  const userInfo = userData.query?.users?.[0]
  if (!userInfo || userInfo.missing) {
    throw new FetchUserEditedPageTitlesError(`User "${username}" not found`, 'user_not_found')
  }

  const seen = new Set<string>()
  const titles: string[] = []
  let uccontinue: string | undefined

  for (let page = 0; page < MAX_CONTRIB_PAGES; page++) {
    const params: Record<string, string> = {
      action: 'query',
      list: 'usercontribs',
      ucuser: username,
      ucnamespace: '0',
      uclimit: String(CONTRIBS_PER_PAGE),
    }
    if (uccontinue) params.uccontinue = uccontinue

    const data = await fetchApiJson<
      ApiErrorEnvelope & {
        query?: { usercontribs?: Array<{ title?: string }> }
        continue?: { uccontinue?: string }
      }
    >(wikiHost, params, options.signal)

    for (const contrib of data.query?.usercontribs ?? []) {
      const title = typeof contrib.title === 'string' ? contrib.title.trim() : ''
      if (!title.length) continue

      // Repeated edits to one page count once.
      const key = normalizeTitleKey(title)
      if (seen.has(key)) continue

      seen.add(key)
      titles.push(title)

      if (titles.length >= limit) {
        return titles
      }
    }

    uccontinue = data.continue?.uccontinue
    if (!uccontinue) break
  }

  if (!titles.length) {
    throw new FetchUserEditedPageTitlesError('No article edits found', 'no_edits')
  }

  return titles
}
// ---- src/prototypes/morelike-search/morelikeSearchStorage.ts ----

export type MorelikeInputMode = 'manual' | 'userEdits'

const STORAGE_KEY = 'protowiki-morelike-search'

/** Form state persisted between visits. */
export interface MorelikeSearchStoredInput {
  inputMode: MorelikeInputMode
  seedPagesInput: string
  username: string
}

export const DEFAULT_MORELIKE_SEARCH_INPUT: MorelikeSearchStoredInput = {
  inputMode: 'manual',
  seedPagesInput: 'Earth, Mars',
  username: '',
}

function isInputMode(value: unknown): value is MorelikeInputMode {
  return value === 'manual' || value === 'userEdits'
}

/** Fresh copy of the defaults, so callers never share or mutate the constant. */
function defaultInput(): MorelikeSearchStoredInput {
  return { ...DEFAULT_MORELIKE_SEARCH_INPUT }
}

function stringOr(value: unknown, fallback: string): string {
  return typeof value === 'string' ? value : fallback
}

/**
 * Reads the persisted form state from `localStorage`.
 *
 * Never throws: without a `window`, with nothing stored, with unparsable JSON
 * or with a non-object payload the defaults are returned, and any individual
 * field of the wrong type falls back to its default.
 */
export function loadMorelikeSearchInput(): MorelikeSearchStoredInput {
  if (typeof window === 'undefined') {
    return defaultInput()
  }

  try {
    const raw = window.localStorage.getItem(STORAGE_KEY)
    if (!raw) return defaultInput()

    const parsed: unknown = JSON.parse(raw)
    if (typeof parsed !== 'object' || parsed === null) {
      return defaultInput()
    }

    const record = parsed as Record<string, unknown>
    return {
      inputMode: isInputMode(record.inputMode)
        ? record.inputMode
        : DEFAULT_MORELIKE_SEARCH_INPUT.inputMode,
      seedPagesInput: stringOr(record.seedPagesInput, DEFAULT_MORELIKE_SEARCH_INPUT.seedPagesInput),
      username: stringOr(record.username, DEFAULT_MORELIKE_SEARCH_INPUT.username),
    }
  } catch {
    // Storage access or JSON.parse failed; behave as if nothing was stored.
    return defaultInput()
  }
}

/** Persists the form state; a no-op without a `window` or when storage rejects the write. */
export function saveMorelikeSearchInput(input: MorelikeSearchStoredInput): void {
  if (typeof window === 'undefined') return

  try {
    window.localStorage.setItem(STORAGE_KEY, JSON.stringify(input))
  } catch {
    // Quota or private-mode failures — ignore.
  }
}
// ---- src/prototypes/morelike-search/useMorelikeSearch.ts ----
import { computed, ref, watch, type ComputedRef, type Ref } from 'vue'

import { useConfig } from '@/composables/useConfig'
import { parsePageList, wikiHostFromLang } from '@/lib/config'
import {
  FetchMorelikeSearchError,
  fetchMorelikeSearch,
  type MorelikeSearchResult,
} from '@/lib/fetchMorelikeSearch'
import { FetchPageThumbnailsBatchError, fetchPageThumbnailsBatch } from '@/lib/fetchPageThumbnailsBatch'
import {
  FetchUserEditedPageTitlesError,
  fetchUserEditedPageTitles,
  MAX_SEED_PAGES,
} from '@/lib/fetchUserEditedPageTitles'
import {
  loadMorelikeSearchInput,
  saveMorelikeSearchInput,
  type MorelikeInputMode,
} from './morelikeSearchStorage'

export type { MorelikeInputMode }

const LANG = 'en'
const RESULT_LIMIT = 20

/** Article URL for a title: spaces become underscores, the rest is percent-encoded. */
function wikiArticleUrl(title: string, lang = LANG): string {
  const slug = encodeURIComponent(title.trim().replace(/ /g, '_'))
  return `https://${wikiHostFromLang(lang)}/wiki/${slug}`
}

/** Comparison key for titles: trimmed, underscores as spaces, lower-cased. */
function normalizeTitleKey(title: string): string {
  return title.trim().replace(/_/g, ' ').toLowerCase()
}

/**
 * Text to show for a failure. Returns `''` for the fetchers' own `aborted`
 * errors so callers can substitute their own wording or stay silent.
 */
function errorMessage(error: unknown): string {
  if (
    error instanceof FetchMorelikeSearchError ||
    error instanceof FetchUserEditedPageTitlesError ||
    error instanceof FetchPageThumbnailsBatchError
  ) {
    return error.code === 'aborted' ? '' : error.message
  }
  if (error instanceof Error && error.message) return error.message
  return 'Something went wrong. Try again.'
}

/** Stops a continuation from writing state once its request has been superseded. */
function throwIfAborted(signal: AbortSignal): void {
  if (signal.aborted) {
    throw new FetchMorelikeSearchError('Request aborted', 'aborted')
  }
}

/**
 * State and actions for the "more like this" search prototype.
 *
 * Seeds come either from the manually entered page list or from the edit
 * history of a user. The form inputs are persisted through
 * `saveMorelikeSearchInput` whenever they change, and only one request chain is
 * live at a time: starting a new one, or switching input mode, aborts the previous.
 */
export function useMorelikeSearch(): {
  inputMode: Ref<MorelikeInputMode>
  inputModeOptions: { value: MorelikeInputMode; label: string }[]
  seedPagesInput: Ref<string>
  username: Ref<string>
  resolvedSeeds: Ref<string[]>
  results: Ref<MorelikeSearchResult[]>
  thumbnailsByTitle: Ref<Record<string, string | undefined>>
  loadingHistory: Ref<boolean>
  loading: Ref<boolean>
  loadingMore: Ref<boolean>
  error: Ref<string | null>
  hasSearched: Ref<boolean>
  resultsEmpty: ComputedRef<boolean>
  canSubmit: ComputedRef<boolean>
  canShowMore: ComputedRef<boolean>
  loadingLabel: ComputedRef<string>
  wikiArticleUrl: (title: string) => string
  onSubmit: () => Promise<void>
  onShowMore: () => Promise<void>
} {
  const { realUsername } = useConfig()

  const stored = loadMorelikeSearchInput()
  const inputMode = ref<MorelikeInputMode>(stored.inputMode)
  const seedPagesInput = ref(stored.seedPagesInput)
  // A stored username wins; otherwise start from the configured one.
  const username = ref(stored.username.trim() ? stored.username : realUsername.value)
  const resolvedSeeds = ref<string[]>([])
  const results = ref<MorelikeSearchResult[]>([])
  const thumbnailsByTitle = ref<Record<string, string | undefined>>({})
  const nextOffset = ref<number | undefined>(undefined)
  const loadingHistory = ref(false)
  const loading = ref(false)
  const loadingMore = ref(false)
  const error = ref<string | null>(null)
  const hasSearched = ref(false)

  let abortController: AbortController | null = null

  const inputModeOptions = [
    { value: 'manual' as const, label: 'Enter pages manually' },
    { value: 'userEdits' as const, label: 'User editing history' },
  ]

  // Adopt the configured username once it arrives, but never overwrite typed input.
  watch(realUsername, (name) => {
    if (!username.value.trim() && name.trim()) {
      username.value = name
    }
  })

  watch([inputMode, seedPagesInput, username], () => {
    saveMorelikeSearchInput({
      inputMode: inputMode.value,
      seedPagesInput: seedPagesInput.value,
      username: username.value,
    })
  })

  const resultsEmpty = computed(() => hasSearched.value && results.value.length === 0)

  const canSubmit = computed(
    () =>
      !loadingHistory.value &&
      !loading.value &&
      !loadingMore.value &&
      (inputMode.value === 'manual'
        ? seedPagesInput.value.trim().length > 0
        : username.value.trim().length > 0),
  )

  const canShowMore = computed(
    () =>
      hasSearched.value &&
      !resultsEmpty.value &&
      nextOffset.value != null &&
      !loading.value &&
      !loadingMore.value &&
      !loadingHistory.value,
  )

  const loadingLabel = computed(() => {
    if (loadingHistory.value) return 'Loading edit history…'
    if (loading.value) return 'Searching…'
    if (loadingMore.value) return 'Loading more…'
    return ''
  })

  function resetResults(): void {
    results.value = []
    thumbnailsByTitle.value = {}
    resolvedSeeds.value = []
    nextOffset.value = undefined
    hasSearched.value = false
    error.value = null
  }

  // Results belong to the mode that produced them; switching modes discards them.
  watch(inputMode, () => {
    abortController?.abort()
    resetResults()
  })

  /**
   * Loads thumbnails for a page of results. On `reset` the map is replaced;
   * otherwise only titles not already present are fetched and merged in.
   */
  async function enrichThumbnails(
    newResults: MorelikeSearchResult[],
    signal: AbortSignal,
    reset: boolean,
  ): Promise<void> {
    const titlesToFetch = reset
      ? newResults.map((result) => result.title)
      : newResults
          .map((result) => result.title)
          .filter((title) => !(title in thumbnailsByTitle.value))

    if (!titlesToFetch.length) {
      if (reset) thumbnailsByTitle.value = {}
      return
    }

    const batch = await fetchPageThumbnailsBatch(titlesToFetch, {
      lang: LANG,
      signal,
    })
    throwIfAborted(signal)

    thumbnailsByTitle.value = reset
      ? batch
      : { ...thumbnailsByTitle.value, ...batch }
  }

  /** Seeds for the current mode: the parsed page list, or titles the user has edited. */
  async function resolveSeeds(signal: AbortSignal): Promise<string[]> {
    if (inputMode.value === 'manual') {
      const seeds = parsePageList(seedPagesInput.value)
      if (!seeds.length) {
        throw new FetchMorelikeSearchError('Enter at least one seed page', 'empty_seeds')
      }
      return seeds
    }

    loadingHistory.value = true
    try {
      return await fetchUserEditedPageTitles(username.value, {
        lang: LANG,
        limit: MAX_SEED_PAGES,
        signal,
      })
    } finally {
      loadingHistory.value = false
    }
  }

  /**
   * Fetches one page of results and stores it, replacing (`reset`) or
   * appending to the current list with duplicates dropped.
   */
  async function runSearch(
    seeds: string[],
    offset: number | undefined,
    signal: AbortSignal,
    reset: boolean,
  ): Promise<void> {
    const response = await fetchMorelikeSearch(seeds, {
      lang: LANG,
      limit: RESULT_LIMIT,
      offset,
      signal,
    })
    // The request may have been superseded while the response was in flight.
    throwIfAborted(signal)

    nextOffset.value = response.nextOffset

    const seen = reset
      ? new Set<string>()
      : new Set(results.value.map((result) => normalizeTitleKey(result.title)))

    const merged = reset ? [] : [...results.value]
    for (const result of response.results) {
      const key = normalizeTitleKey(result.title)
      if (seen.has(key)) continue
      seen.add(key)
      merged.push(result)
    }

    results.value = merged

    try {
      await enrichThumbnails(response.results, signal, reset)
    } catch (err) {
      if (signal.aborted) throw err
      // Thumbnails are an enrichment only: keep the results that already
      // loaded instead of reporting the whole search as failed.
      console.warn('Thumbnail lookup failed', err)
    }

    throwIfAborted(signal)
    hasSearched.value = true
  }

  /** Starts a fresh search from the current inputs, discarding previous results. */
  async function onSubmit(): Promise<void> {
    if (!canSubmit.value) return

    abortController?.abort()
    const controller = new AbortController()
    abortController = controller

    loading.value = true
    resetResults()

    try {
      const seeds = await resolveSeeds(controller.signal)
      throwIfAborted(controller.signal)
      resolvedSeeds.value = seeds
      await runSearch(seeds, 0, controller.signal, true)
    } catch (err) {
      if (controller.signal.aborted) return
      error.value = errorMessage(err) || 'Could not load results.'
      hasSearched.value = false
    } finally {
      // Only the latest request may clear the flag.
      if (abortController === controller) {
        loading.value = false
      }
    }
  }

  /** Appends the next page of results for the seeds of the last search. */
  async function onShowMore(): Promise<void> {
    if (!canShowMore.value || resolvedSeeds.value.length === 0) return

    abortController?.abort()
    const controller = new AbortController()
    abortController = controller

    loadingMore.value = true
    error.value = null

    try {
      await runSearch(resolvedSeeds.value, nextOffset.value, controller.signal, false)
    } catch (err) {
      if (controller.signal.aborted) return
      error.value = errorMessage(err) || 'Could not load more results.'
    } finally {
      if (abortController === controller) {
        loadingMore.value = false
      }
    }
  }

  return {
    inputMode,
    inputModeOptions,
    seedPagesInput,
    username,
    resolvedSeeds,
    results,
    thumbnailsByTitle,
    loadingHistory,
    loading,
    loadingMore,
    error,
    hasSearched,
    resultsEmpty,
    canSubmit,
    canShowMore,
    loadingLabel,
    wikiArticleUrl,
    onSubmit,
    onShowMore,
  }
}