Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/lib/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ export function normalizeWikiUsername(raw: string): string {
return trimmed.charAt(0).toUpperCase() + trimmed.slice(1)
}

/**
 * Hostname for a Wikipedia language edition (default English).
 *
 * The code is trimmed and lower-cased before use. Anything that is not a
 * plain subdomain label (letters/digits, optionally hyphen-separated, at
 * most 63 characters) falls back to `en`, so input containing `/`, `.`, `@`
 * or `:` can never change which host a request is sent to.
 *
 * @param lang - Wikipedia language code, e.g. `en`, `fr`, `zh-min-nan`.
 * @returns Hostname of the form `<lang>.wikipedia.org`.
 */
export function wikiHostFromLang(lang = 'en'): string {
  const candidate = typeof lang === 'string' ? lang.trim().toLowerCase() : ''
  const isSubdomainLabel =
    candidate.length <= 63 && /^[a-z0-9]+(?:-[a-z0-9]+)*$/.test(candidate)
  return `${isSubdomainLabel ? candidate : 'en'}.wikipedia.org`
}

export function configUserDisplayName(user: ConfigUser, realUsername = ''): string {
if (user === 'real') {
const name = normalizeWikiUsername(realUsername)
Expand Down
142 changes: 142 additions & 0 deletions src/lib/fetchMorelikeSearch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { wikiHostFromLang } from '@/lib/config'

/** Client identification string sent in the `Api-User-Agent` request header. */
const API_USER_AGENT =
'ProtoWiki/0.1 (https://github.com/wikimedia-research/protowiki) morelike-search'

/** Page size used when the caller does not supply `options.limit`. */
const DEFAULT_LIMIT = 20

/**
 * Error raised by the morelike search fetcher.
 *
 * `code` names the failure category so callers can branch on it instead of
 * parsing the message: `empty_seeds` (no usable seed titles), `aborted`
 * (the abort signal was already set) or `http` (non-OK HTTP response).
 */
export class FetchMorelikeSearchError extends Error {
  /** Machine-readable failure category. */
  public readonly code: 'empty_seeds' | 'aborted' | 'http'

  constructor(message: string, code: 'empty_seeds' | 'aborted' | 'http') {
    super(message)
    this.code = code
    this.name = 'FetchMorelikeSearchError'
  }
}

/** One search hit as returned to callers of `fetchMorelikeSearch`. */
export interface MorelikeSearchResult {
/** Page title, trimmed; hits without a title are never emitted. */
title: string
/** Hit snippet after being passed through `stripSearchSnippetHtml`. */
snippet: string
/** Page id from the API hit; `0` when the hit carries no numeric `pageid`. */
pageid: number
}

/** One page of morelike search results plus the continuation offset, if any. */
export interface FetchMorelikeSearchResponse {
/** Hits for this page, with the seed titles themselves filtered out. */
results: MorelikeSearchResult[]
/** Value of `continue.sroffset` from the API response; undefined when it is absent or not a number. */
nextOffset?: number
}

/** Optional settings accepted by `fetchMorelikeSearch`. */
export interface FetchMorelikeSearchOptions {
/** Abort signal; checked before the request is issued and also handed to `fetch`. */
signal?: AbortSignal
/** Wikipedia language code (default `en`). */
lang?: string
/** Result limit per request (default 20); values are clamped to the range 1..50. */
limit?: number
/** Pagination offset from a prior response; negative values are treated as 0. */
offset?: number
}

/**
 * Guard used before starting network work.
 *
 * @param signal - Optional abort signal supplied by the caller.
 * @throws FetchMorelikeSearchError with code `aborted` when the signal is already aborted.
 */
function assertNotAborted(signal?: AbortSignal): void {
  const alreadyAborted = signal?.aborted
  if (!alreadyAborted) return
  throw new FetchMorelikeSearchError('Request aborted', 'aborted')
}

/**
 * Builds an Action API URL (`/w/api.php`) on the given wiki host.
 *
 * The fixed parameters `format=json`, `formatversion=2` and `origin=*` are
 * applied after the caller's parameters, so they win on a key clash.
 *
 * @param wikiHost - Hostname, e.g. `en.wikipedia.org`.
 * @param params - Request-specific query parameters.
 * @returns Absolute https URL with an encoded query string.
 */
function actionUrl(wikiHost: string, params: Record<string, string>): string {
  const allParams: Record<string, string> = {
    ...params,
    format: 'json',
    formatversion: '2',
    origin: '*',
  }
  const queryString = new URLSearchParams(allParams).toString()
  return `https://${wikiHost}/w/api.php?${queryString}`
}

/**
 * Produces a comparison key for a page title: surrounding whitespace removed,
 * underscores turned into spaces, everything lower-cased.
 */
function normalizeTitleKey(title: string): string {
  const spaced = title.trim().split('_').join(' ')
  return spaced.toLowerCase()
}

/** Named entities handled by the no-DOM fallback; a Map avoids prototype-key lookups. */
const BASIC_HTML_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
])

/** Decodes numeric character references and a small set of named entities; unknown ones are left as-is. */
function decodeBasicHtmlEntities(text: string): string {
  return text.replace(
    /&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g,
    (whole: string, body: string): string => {
      if (body.charAt(0) !== '#') {
        return BASIC_HTML_ENTITIES.get(body) ?? whole
      }
      const isHex = body.charAt(1) === 'x' || body.charAt(1) === 'X'
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10)
      // String.fromCodePoint throws outside 0..0x10FFFF, so keep the raw text instead.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole
    },
  )
}

/**
 * Converts a search-snippet HTML fragment to plain text with whitespace
 * collapsed to single spaces and the ends trimmed.
 *
 * When `DOMParser` is available the fragment is parsed into a separate
 * document and its text content is read, instead of assigning untrusted HTML
 * to `innerHTML` on an element of the live document. Otherwise a regex
 * fallback removes tags and decodes basic entities. Tags are removed without
 * inserting a space so that markup inside a word (`<span>Cat</span>s`) does
 * not split it, matching what text-content extraction yields.
 *
 * @param html - HTML fragment, e.g. the `snippet` field of a search hit.
 * @returns Plain-text rendering of the fragment.
 */
export function stripSearchSnippetHtml(html: string): string {
  if (typeof DOMParser !== 'undefined') {
    const parsed = new DOMParser().parseFromString(html, 'text/html')
    return (parsed.body.textContent ?? '').replace(/\s+/g, ' ').trim()
  }
  // Strip tags before decoding so that escaped markup such as `&lt;b&gt;` survives as text.
  const withoutTags = html.replace(/<[^>]+>/g, '')
  return decodeBasicHtmlEntities(withoutTags).replace(/\s+/g, ' ').trim()
}

/** Joins the seed titles with `|` and prefixes the `morelike:` search keyword. */
function buildMorelikeQuery(seedTitles: string[]): string {
  const joinedSeeds = seedTitles.join('|')
  return 'morelike:' + joinedSeeds
}

/**
 * Cirrus Search "more like this" via Action API `list=search`.
 *
 * Seed titles are trimmed and blank entries dropped. Hits whose title matches
 * a seed (case-insensitively, underscores treated as spaces) are filtered out
 * of the results.
 *
 * `limit` and `offset` are sanitized before being sent: fractional values are
 * truncated, `limit` is clamped to 1..50 and `offset` to >= 0, and `NaN` (or a
 * non-finite offset) falls back to the default, so the request never carries
 * `srlimit=NaN` or a non-integer value.
 *
 * @param seedTitles - Titles of the pages to find similar articles for.
 * @param options - Abort signal, language, page size and pagination offset.
 * @returns The hits for this page and, when the API reports one, the offset
 *   to pass as `options.offset` for the next page.
 * @throws FetchMorelikeSearchError `empty_seeds` when no non-blank seed is
 *   given, `aborted` when the signal is already aborted before the request,
 *   `http` on a non-OK HTTP status. Rejections raised by `fetch` itself are
 *   not caught here and propagate unchanged.
 */
export async function fetchMorelikeSearch(
  seedTitles: string[],
  options: FetchMorelikeSearchOptions = {},
): Promise<FetchMorelikeSearchResponse> {
  const seeds = seedTitles.map((title) => title.trim()).filter(Boolean)
  if (!seeds.length) {
    throw new FetchMorelikeSearchError('Enter at least one seed page', 'empty_seeds')
  }

  const wikiHost = wikiHostFromLang(options.lang ?? 'en')

  const requestedLimit = options.limit ?? DEFAULT_LIMIT
  // NaN would otherwise pass straight through Math.min/Math.max.
  const limit = Number.isNaN(requestedLimit)
    ? DEFAULT_LIMIT
    : Math.max(1, Math.min(Math.trunc(requestedLimit), 50))

  const requestedOffset = options.offset ?? 0
  const offset = Number.isFinite(requestedOffset)
    ? Math.max(0, Math.trunc(requestedOffset))
    : 0

  const seedKeys = new Set(seeds.map(normalizeTitleKey))

  assertNotAborted(options.signal)

  const params: Record<string, string> = {
    action: 'query',
    list: 'search',
    srsearch: buildMorelikeQuery(seeds),
    srwhat: 'text',
    srnamespace: '0',
    srlimit: String(limit),
    sroffset: String(offset),
  }

  const response = await fetch(actionUrl(wikiHost, params), {
    signal: options.signal,
    headers: { 'Api-User-Agent': API_USER_AGENT },
  })

  if (!response.ok) {
    throw new FetchMorelikeSearchError(`HTTP ${response.status}`, 'http')
  }

  const data = (await response.json()) as {
    query?: {
      search?: Array<{
        title?: string
        pageid?: number
        snippet?: string
      }>
    }
    continue?: { sroffset?: number }
  }

  const results: MorelikeSearchResult[] = []

  for (const hit of data.query?.search ?? []) {
    const title = typeof hit.title === 'string' ? hit.title.trim() : ''
    if (!title.length) continue
    // The seeds themselves are not useful as "similar" suggestions.
    if (seedKeys.has(normalizeTitleKey(title))) continue

    results.push({
      title,
      pageid: typeof hit.pageid === 'number' ? hit.pageid : 0,
      snippet: stripSearchSnippetHtml(typeof hit.snippet === 'string' ? hit.snippet : ''),
    })
  }

  const nextOffset =
    typeof data.continue?.sroffset === 'number' ? data.continue.sroffset : undefined

  return { results, nextOffset }
}
105 changes: 105 additions & 0 deletions src/lib/fetchPageThumbnailsBatch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import { wikiHostFromLang } from '@/lib/config'

/**
 * Client identification string sent in the `Api-User-Agent` request header.
 * NOTE(review): the trailing token reads `morelike-search` although this
 * module fetches thumbnails — confirm whether that is intentional.
 */
const API_USER_AGENT =
'ProtoWiki/0.1 (https://github.com/wikimedia-research/protowiki) morelike-search'

/** Value sent as `pithumbsize` with each thumbnail request. */
const THUMB_SIZE = 96
/** Maximum number of titles placed in a single `titles=` request. */
const MAX_TITLES_PER_REQUEST = 50

/**
 * Error raised by the thumbnail batch fetcher.
 *
 * `code` is `aborted` when the abort signal was already set before a request,
 * or `http` when the API answered with a non-OK HTTP status.
 */
export class FetchPageThumbnailsBatchError extends Error {
  /** Machine-readable failure category. */
  public readonly code: 'aborted' | 'http'

  constructor(message: string, code: 'aborted' | 'http') {
    super(message)
    this.code = code
    this.name = 'FetchPageThumbnailsBatchError'
  }
}

/** Optional settings accepted by `fetchPageThumbnailsBatch`. */
export interface FetchPageThumbnailsBatchOptions {
/** Abort signal; checked before each batch request and also handed to `fetch`. */
signal?: AbortSignal
/** Wikipedia language code (default `en`). */
lang?: string
}

/**
 * Guard used before starting a batch request.
 *
 * @param signal - Optional abort signal supplied by the caller.
 * @throws FetchPageThumbnailsBatchError with code `aborted` when the signal is already aborted.
 */
function assertNotAborted(signal?: AbortSignal): void {
  const alreadyAborted = signal?.aborted
  if (!alreadyAborted) return
  throw new FetchPageThumbnailsBatchError('Request aborted', 'aborted')
}

/**
 * Builds an Action API URL (`/w/api.php`) on the given wiki host.
 *
 * `format=json`, `formatversion=2` and `origin=*` are layered on top of the
 * caller's parameters and override any clashing keys.
 *
 * @param wikiHost - Hostname, e.g. `en.wikipedia.org`.
 * @param params - Request-specific query parameters.
 * @returns Absolute https URL with an encoded query string.
 */
function actionUrl(wikiHost: string, params: Record<string, string>): string {
  const fixedParams = { format: 'json', formatversion: '2', origin: '*' }
  const encoded = new URLSearchParams({ ...params, ...fixedParams }).toString()
  return `https://${wikiHost}/w/api.php?${encoded}`
}

/**
 * Fetches lead-image thumbnails for one batch of titles via
 * `action=query&prop=pageimages`.
 *
 * Because the request sets `redirects=1`, the API reports pages under their
 * resolved titles. To keep lookups by the *requested* title working, any
 * `normalized` / `redirects` mappings in the response are followed and the
 * requested title is added as an extra key pointing at the same thumbnail.
 * Keys for the resolved titles are still present.
 *
 * @param wikiHost - Hostname of the wiki to query.
 * @param titles - Titles for this request; the caller is responsible for
 *   keeping the batch within the API's per-request title limit.
 * @param signal - Optional abort signal, checked up front and passed to `fetch`.
 * @returns Map from title to thumbnail URL; the value is undefined for pages
 *   the API returned without a thumbnail.
 * @throws FetchPageThumbnailsBatchError `aborted` when the signal is already
 *   aborted, `http` on a non-OK HTTP status.
 */
async function fetchThumbnailBatch(
  wikiHost: string,
  titles: string[],
  signal?: AbortSignal,
): Promise<Record<string, string | undefined>> {
  assertNotAborted(signal)

  const response = await fetch(
    actionUrl(wikiHost, {
      action: 'query',
      prop: 'pageimages',
      pithumbsize: String(THUMB_SIZE),
      redirects: '1',
      titles: titles.join('|'),
    }),
    {
      signal,
      headers: { 'Api-User-Agent': API_USER_AGENT },
    },
  )

  if (!response.ok) {
    throw new FetchPageThumbnailsBatchError(`HTTP ${response.status}`, 'http')
  }

  const data = (await response.json()) as {
    query?: {
      normalized?: Array<{ from?: string; to?: string }>
      redirects?: Array<{ from?: string; to?: string }>
      pages?: Array<{
        title?: string
        thumbnail?: { source?: string }
      }>
    }
  }

  const out: Record<string, string | undefined> = {}
  for (const page of data.query?.pages ?? []) {
    const title = typeof page.title === 'string' ? page.title : ''
    if (!title.length) continue
    out[title] = page.thumbnail?.source
  }

  // Collect title rewrites reported by the API (normalization first, then redirects).
  const rewrites = new Map<string, string>()
  const normalized = Array.isArray(data.query?.normalized) ? data.query.normalized : []
  const redirects = Array.isArray(data.query?.redirects) ? data.query.redirects : []
  for (const step of [...normalized, ...redirects]) {
    if (typeof step.from === 'string' && typeof step.to === 'string') {
      rewrites.set(step.from, step.to)
    }
  }

  if (rewrites.size > 0) {
    const maxHops = 5 // bound the walk so a cyclic mapping cannot loop forever
    const hasOwn = (key: string): boolean => Object.prototype.hasOwnProperty.call(out, key)
    for (const requested of titles) {
      let resolved = requested
      for (let hop = 0; hop < maxHops; hop++) {
        const next = rewrites.get(resolved)
        if (next === undefined || next === resolved) break
        resolved = next
      }
      // Alias only when the requested title is not already a key of its own.
      if (resolved !== requested && hasOwn(resolved) && !hasOwn(requested)) {
        out[requested] = out[resolved]
      }
    }
  }

  return out
}

/**
 * Batch-fetch lead-image thumbnails for article titles.
 *
 * Titles are trimmed, blanks dropped and duplicates removed, then requested
 * in chunks of `MAX_TITLES_PER_REQUEST`, one request after another. The
 * per-chunk results are merged into a single map.
 *
 * @param titles - Article titles to look up.
 * @param options - Abort signal and wiki language (default `en`).
 * @returns Map from title to thumbnail URL (undefined where a page has no
 *   thumbnail); an empty object when no usable title was supplied.
 */
export async function fetchPageThumbnailsBatch(
  titles: string[],
  options: FetchPageThumbnailsBatchOptions = {},
): Promise<Record<string, string | undefined>> {
  const seen = new Set<string>()
  for (const raw of titles) {
    const cleaned = raw.trim()
    if (cleaned) seen.add(cleaned)
  }
  const distinctTitles = Array.from(seen)
  if (distinctTitles.length === 0) return {}

  const host = wikiHostFromLang(options.lang ?? 'en')
  const combined: Record<string, string | undefined> = {}

  let start = 0
  while (start < distinctTitles.length) {
    const end = start + MAX_TITLES_PER_REQUEST
    const part = await fetchThumbnailBatch(host, distinctTitles.slice(start, end), options.signal)
    Object.assign(combined, part)
    start = end
  }

  return combined
}
Loading
Loading