From f78a5eb533eb12a290e029982c9004b7dddc0ef3 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 17:19:00 +0200 Subject: [PATCH 01/14] add HF source from the space repository --- package.json | 1 + src/components/App/App.tsx | 5 +- src/lib/sources/huggingFaceSource.ts | 268 +++++++++++++++++++++ src/lib/sources/index.ts | 1 + test/lib/sources/huggingFaceSource.test.ts | 257 ++++++++++++++++++++ 5 files changed, 531 insertions(+), 1 deletion(-) create mode 100644 src/lib/sources/huggingFaceSource.ts create mode 100644 test/lib/sources/huggingFaceSource.test.ts diff --git a/package.json b/package.json index 1296aadb..03248854 100644 --- a/package.json +++ b/package.json @@ -55,6 +55,7 @@ "watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet" }, "dependencies": { + "@huggingface/hub": "2.6.12", "hightable": "0.20.2", "hyparquet": "1.20.0", "hyparquet-compressors": "1.1.1", diff --git a/src/components/App/App.tsx b/src/components/App/App.tsx index 2181cb6f..3a539adc 100644 --- a/src/components/App/App.tsx +++ b/src/components/App/App.tsx @@ -1,6 +1,7 @@ import { useMemo } from 'react' import { Config, ConfigProvider } from '../../hooks/useConfig.js' import { getHttpSource } from '../../lib/sources/httpSource.js' +import { getHuggingFaceSource } from '../../lib/sources/huggingFaceSource.js' import { getHyperparamSource } from '../../lib/sources/hyperparamSource.js' import Page from '../Page/Page.js' @@ -10,7 +11,9 @@ export default function App() { const row = search.get('row') === null ? undefined : Number(search.get('row')) const col = search.get('col') === null ? undefined : Number(search.get('col')) - const source = getHttpSource(sourceId) ?? getHyperparamSource(sourceId, { endpoint: location.origin }) + const source = getHuggingFaceSource(sourceId) ?? + getHttpSource(sourceId) ?? + getHyperparamSource(sourceId, { endpoint: location.origin }) // Memoize the config to avoid creating a new object on each render const config: Config = useMemo(() => ({ diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts new file mode 100644 index 00000000..39af506d --- /dev/null +++ b/src/lib/sources/huggingFaceSource.ts @@ -0,0 +1,268 @@ +import { listFiles } from '@huggingface/hub' +import type { DirSource, FileMetadata, FileSource, SourcePart } from 'hyperparam' +import { getFileName } from 'hyperparam' + +export const baseUrl = 'https://huggingface.co/datasets' + +function getSourceParts(url: HFUrl): SourcePart[] { + const sourceParts: SourcePart[] = [{ + sourceId: `${baseUrl}/${url.repo}/tree/${url.branch}/`, + text: `${baseUrl}/${url.repo}/${url.action}/${url.branch}/`, + }] + + const pathParts = url.path.split('/').filter(d => d.length > 0) + const lastPart = pathParts.at(-1) + if (lastPart) { + for (const [i, part] of pathParts.slice(0, -1).entries()) { + sourceParts.push({ + sourceId: `${baseUrl}/${url.repo}/tree/${url.branch}/${pathParts.slice(0, i + 1).join('/')}`, + text: part + '/', + }) + } + sourceParts.push({ + sourceId: `${baseUrl}/${url.repo}/${url.action}/${url.branch}${url.path}`, + text: lastPart, + }) + } + return sourceParts +} +function getPrefix(url: DirectoryUrl): string { + return `${url.origin}/datasets/${url.repo}/tree/${url.branch}${url.path}`.replace(/\/$/, '') +} +async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise { + const filesIterator = listFiles({ + repo: `datasets/${url.repo}`, + revision: url.branch, + path: 'path' in url ? url.path.replace(/^\//, '') : '', // remove leading slash if any + expand: true, + accessToken: options?.accessToken, + }) + const files: FileMetadata[] = [] + for await (const file of filesIterator) { + files.push({ + name: getFileName(file.path), + eTag: file.lastCommit?.id, + size: file.size, + lastModified: file.lastCommit?.date, + sourceId: `${url.origin}/datasets/${url.repo}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), + kind: file.type === 'file' ? 'file' : 'directory', // 'unknown' is considered as a directory + }) + } + return files +} +export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined { + try { + const url = parseHuggingFaceUrl(sourceId) + async function fetchVersions() { + const refsList = await fetchRefsList(url.repo, options) + return { + label: 'Branches', + versions: refsList.map(({ refType, name, ref }) => { + const label = refType === 'branches' ? name : + refType === 'converts' ? `[convert] ${name}` : + refType === 'tags' ? `[tag] ${name}` : + `[pr] ${name}` + // remove refs/heads/ from the ref name + // e.g. refs/heads/main -> main + const fixedRef = refType === 'branches' ? ref.replace(/refs\/heads\//, '') : ref + const branchSourceId = `${url.origin}/datasets/${url.repo}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}` + return { + label, + sourceId: branchSourceId, + } + }), + } + } + if (url.kind === 'file') { + return { + kind: 'file', + sourceId, + sourceParts: getSourceParts(url), + fileName: getFileName(url.path), + resolveUrl: url.resolveUrl, + requestInit: options?.requestInit, + fetchVersions, + } + } else { + return { + kind: 'directory', + sourceId, + sourceParts: getSourceParts(url), + prefix: getPrefix(url), + listFiles: () => fetchFilesList(url, options), + fetchVersions, + } + } + } catch { + return undefined + } +} + +export interface DirectoryUrl { + kind: 'directory'; + source: string; + origin: string; + repo: string; + action: 'tree'; + branch: string; + path: string; +} + +export interface FileUrl { + kind: 'file'; + source: string; + origin: string; + repo: string; + action: 'resolve' | 'blob'; + branch: string; + path: string; + resolveUrl: string; +} + +type HFUrl = DirectoryUrl | FileUrl; + +export function parseHuggingFaceUrl(url: string): HFUrl { + const urlObject = new URL(url) + // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL + + if (urlObject.protocol !== 'https:' && urlObject.protocol !== 'http:') { + throw new Error('url must be a HTTP URL') + } + + if ( + !['huggingface.co', 'huggingface.co', 'hf.co'].includes(urlObject.host) || + urlObject.protocol !== 'https:' + ) { + throw new Error('Not a Hugging Face URL') + } + + const repoGroups = /^\/datasets\/(?[^/]+)\/(?[^/]+)\/?$/.exec( + urlObject.pathname + )?.groups + if (repoGroups?.namespace !== undefined && repoGroups.dataset !== undefined) { + return { + kind: 'directory', + source: url, + origin: urlObject.origin, + repo: repoGroups.namespace + '/' + repoGroups.dataset, + action: 'tree', + branch: 'main', // hardcode the default branch + path: '', + } + } + + const folderGroups = + /^\/datasets\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)*)\/?$/.exec( + urlObject.pathname + )?.groups + if ( + folderGroups?.namespace !== undefined && + folderGroups.dataset !== undefined && + folderGroups.action !== undefined && + folderGroups.branch !== undefined && + folderGroups.path !== undefined && + folderGroups.branch !== 'refs' + ) { + const branch = folderGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}/datasets/${folderGroups.namespace}/${folderGroups.dataset}/${folderGroups.action}/${branch}${folderGroups.path}` + return { + kind: 'directory', + source, + origin: urlObject.origin, + repo: folderGroups.namespace + '/' + folderGroups.dataset, + action: 'tree', + branch, + path: folderGroups.path, + } + } + + const fileGroups = + /^\/datasets\/(?[^/]+)\/(?[^/]+)\/(?blob|resolve)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)+)$/.exec( + urlObject.pathname + )?.groups + if ( + fileGroups?.namespace !== undefined && + fileGroups.dataset !== undefined && + fileGroups.action !== undefined && + fileGroups.branch !== undefined && + fileGroups.path !== undefined && + fileGroups.branch !== 'refs' + ) { + const branch = fileGroups.branch.replace(/\//g, '%2F') + const source = `${urlObject.origin}/datasets/${fileGroups.namespace}/${fileGroups.dataset}/${fileGroups.action}/${branch}${fileGroups.path}` + return { + kind: 'file', + source, + origin: urlObject.origin, + repo: fileGroups.namespace + '/' + fileGroups.dataset, + action: fileGroups.action === 'blob' ? 'blob' : 'resolve', + branch, + path: fileGroups.path, + resolveUrl: `${urlObject.origin}/datasets/${fileGroups.namespace}/${fileGroups.dataset}/resolve/${branch}${fileGroups.path}`, + } + } + + throw new Error('Unsupported Hugging Face URL') +} + +interface RefResponse { + name: string; + ref: string; + targetCommit: string; +} + +export const refTypes = [ + 'branches', + 'tags', + 'converts', + 'pullRequests', +] as const +type RefType = (typeof refTypes)[number]; +type RefsResponse = Partial>; + +export interface RefMetadata extends RefResponse { + refType: RefType; // TODO(SL): use it to style the refs differently? +} + +/** + * List refs in a HF dataset repo + * + * Example API URL: https://huggingface.co/api/datasets/codeparrot/github-code/refs + * + * @param repo (namespace/repo) + * @param [options] + * @param [options.requestInit] - request init object to pass to fetch + * @param [options.accessToken] - access token to use for authentication + * + * @returns the list of branches, tags, pull requests, and converts + */ +export async function fetchRefsList( + repo: string, + options?: {requestInit?: RequestInit, accessToken?: string} +): Promise { + if (options?.accessToken && !options.accessToken.startsWith('hf_')) { + throw new TypeError('Your access token must start with \'hf_\'') + } + const headers = new Headers(options?.requestInit?.headers) + headers.set('accept', 'application/json') + if (options?.accessToken) { + headers.set('Authorization', `Bearer ${options.accessToken}`) + } + const response = await fetch(`https://huggingface.co/api/datasets/${repo}/refs`, { ...options?.requestInit, headers }) + if (!response.ok) { + throw new Error(`HTTP error ${response.status.toString()}`) + } + const refsByType = await response.json() as RefsResponse + return refTypes.flatMap((refType) => { + const refResponse = refsByType[refType] + if (!refResponse) { + return [] + } + return refResponse.map((refResponse) => { + return { + refType, + ...refResponse, + } + }) + }) +} diff --git a/src/lib/sources/index.ts b/src/lib/sources/index.ts index 52437851..648af03d 100644 --- a/src/lib/sources/index.ts +++ b/src/lib/sources/index.ts @@ -1,5 +1,6 @@ export { getHttpSource } from './httpSource.js' export { getHyperparamSource } from './hyperparamSource.js' +export { getHuggingFaceSource } from './huggingFaceSource.js' export type { HyperparamFileMetadata } from './hyperparamSource.js' export type { DirSource, FileKind, FileMetadata, FileSource, Source, SourcePart } from './types.js' export { getFileName } from './utils.js' diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts new file mode 100644 index 00000000..02ee2e26 --- /dev/null +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -0,0 +1,257 @@ +import { expect, test } from 'vitest' +import { parseHuggingFaceUrl } from '../../../src/lib/sources/huggingFaceSource.js' + +const origin = 'https://huggingface.co' + +test.for([[''], ['abc']])('non-url string \'%s\' throws an error', ([url]) => { + expect(() => parseHuggingFaceUrl(url)).to.throw() +}) + +test.for([['ftp:'], ['email:']])('\'%s\' scheme throws an error', ([scheme]) => { + expect(() => parseHuggingFaceUrl(`${scheme}//abc`)).to.throw() +}) + +test.for([['https://some.url'], ['https://some.url/with/a/path']])( + 'non-huggingface URL throws: %s', + ([url]) => { + expect(() => parseHuggingFaceUrl(url)).to.throw() + } +) + +test.for([ + ['https://huggingface.co'], + ['https://hf.co'], + ['https://huggingface.co/'], + ['https://huggingface.co/datasets'], + ['https://huggingface.co/datasets/'], + ['https://huggingface.co/datasets/namespace'], + ['https://huggingface.co/datasets/namespace/'], +])('base huggingface URL throws: %s', ([url]) => { + expect(() => parseHuggingFaceUrl(url)).to.throw() +}) + +test.for([ + ['https://huggingface.co/datasets/namespace/repo', 'namespace/repo'], + ['https://huggingface.co/datasets/namespace/repo/', 'namespace/repo'], + ['https://huggingface.co/datasets/namespace/123', 'namespace/123'], +])('dataset repo URL returns a RepoUrl: %s', ([url, repo]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'directory', + origin, + repo, + source: url, + action: 'tree', + branch: 'main', + path: '', + }) +}) + +test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch', + 'https://huggingface.co/datasets/namespace/repo/tree/branch', + 'namespace/repo', + 'branch', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/', + 'https://huggingface.co/datasets/namespace/repo/tree/branch', + 'namespace/repo', + 'branch', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/refs/convert/parquet', + // also accepted because of URLSearchParams (see note in https://url.spec.whatwg.org/#dom-urlsearchparams-urlsearchparams) + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/refs/pr/9', + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fpr%2F9', + 'namespace/repo', + 'refs%2Fpr%2F9', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', + 'namespace/repo', + 'branch', + '/folder', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c/', + 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c', + 'namespace/repo', + 'branch', + '/a/b/c', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', + 'namespace/repo', + 'branch', + '/folder.parquet', + ], +])( + 'tree repo URL with a branch and an optional path returns a FolderUrl: %s', + ([url, source, repo, branch, path]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'directory', + origin, + repo, + source, + action: 'tree', + branch, + path, + }) + } +) + +test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', + 'namespace/repo', + 'branch', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', + 'namespace/repo', + 'branch', + '/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', + 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/refs/convert/parquet/file', + 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', + 'namespace/repo', + 'branch', + '/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + ], +])( + 'blob repo URL with a branch and a path returns a FileUrl: %s', + ([url, source, repo, branch, path, resolveUrl]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'file', + origin, + repo, + source, + action: 'blob', + branch, + path, + resolveUrl, + }) + } +) + +test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + 'namespace/repo', + 'branch', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file?download=true', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + 'namespace/repo', + 'branch', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + 'namespace/repo', + 'branch', + '/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + 'namespace/repo', + 'branch', + '/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + ], +])( + 'resolve repo URL with a branch and a path returns a FileUrl: %s', + ([url, source, repo, branch, path, resolveUrl]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'file', + origin, + repo, + source, + action: 'resolve', + branch, + path, + resolveUrl, + }) + } +) + +test.for([ + ['https://huggingface.co/not-supported'], + ['https://huggingface.co/not/supported'], + ['https://huggingface.co/tasks'], + ['https://huggingface.co/models'], + ['https://huggingface.co/spaces'], + ['https://huggingface.co/datasets/namespace/repo/branch'], + ['https://huggingface.co/datasets/namespace/repo/tree'], + ['https://huggingface.co/datasets/namespace/repo/tree/'], + ['https://huggingface.co/datasets/namespace/repo/blob'], + ['https://huggingface.co/datasets/namespace/repo/blob/'], + ['https://huggingface.co/datasets/namespace/repo/blob/branch'], + ['https://huggingface.co/datasets/namespace/repo/blob/branch/'], + ['https://huggingface.co/datasets/namespace/repo/blob/branch/file/'], + ['https://huggingface.co/datasets/namespace/repo/resolve'], + ['https://huggingface.co/datasets/namespace/repo/resolve/'], + ['https://huggingface.co/datasets/namespace/repo/resolve/branch'], + ['https://huggingface.co/datasets/namespace/repo/resolve/branch/'], + ['https://huggingface.co/datasets/namespace/repo/resolve/branch/file/'], +])('unrelated huggingface URL throws and error: %s', ([url]) => { + expect(() => parseHuggingFaceUrl(url)).to.throw() +}) From c4c119022922d53f3783da06dc6d195014064b5f Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 17:27:04 +0200 Subject: [PATCH 02/14] fix style --- src/components/Breadcrumb/Breadcrumb.module.css | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/components/Breadcrumb/Breadcrumb.module.css b/src/components/Breadcrumb/Breadcrumb.module.css index 34dcb4c6..79fd9069 100644 --- a/src/components/Breadcrumb/Breadcrumb.module.css +++ b/src/components/Breadcrumb/Breadcrumb.module.css @@ -15,7 +15,7 @@ min-height: 32px; border-bottom: 1px solid #ddd; background: var(--color-background-dark); - padding: 0 10px 0 20px; + padding: 0 20px; border-radius: var(--border-radius-lg); margin: var(--space-3xs); /* TODO(SL): forbid overflow? */ @@ -56,6 +56,13 @@ .versions { padding-left: 4px; + & > button { + color: #eee; + + &:hover { + color: #fff + } + } [aria-current] { font-weight: bold; From a35286a716fedd4a9def578bb8ecfc8c3530815b Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 17:38:06 +0200 Subject: [PATCH 03/14] fix imports --- src/lib/sources/huggingFaceSource.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts index 39af506d..4287fdde 100644 --- a/src/lib/sources/huggingFaceSource.ts +++ b/src/lib/sources/huggingFaceSource.ts @@ -1,6 +1,6 @@ import { listFiles } from '@huggingface/hub' -import type { DirSource, FileMetadata, FileSource, SourcePart } from 'hyperparam' -import { getFileName } from 'hyperparam' +import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' +import { getFileName } from './utils.js' export const baseUrl = 'https://huggingface.co/datasets' From 45b7ec8bb5116ae371754ab85401dd17216a5aae Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 18:02:33 +0200 Subject: [PATCH 04/14] add tests about no support for models and spaces --- test/lib/sources/huggingFaceSource.test.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 02ee2e26..350cf13e 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -30,6 +30,15 @@ test.for([ expect(() => parseHuggingFaceUrl(url)).to.throw() }) +test.for([ + ['https://huggingface.co/namespace/model'], + ['https://huggingface.co/namespace/model/'], + ['https://huggingface.co/spaces/namespace/space'], + ['https://huggingface.co/spaces/namespace/space/'], +])('model or space huggingface URL throws: %s', ([url]) => { + expect(() => parseHuggingFaceUrl(url)).to.throw() +}) + test.for([ ['https://huggingface.co/datasets/namespace/repo', 'namespace/repo'], ['https://huggingface.co/datasets/namespace/repo/', 'namespace/repo'], From d06b6176d4ed559ec9088e3a2755a76f4f7f4671 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 21:58:53 +0200 Subject: [PATCH 05/14] simplify a bit --- src/lib/sources/huggingFaceSource.ts | 11 +- test/lib/sources/huggingFaceSource.test.ts | 502 +++++++++++---------- 2 files changed, 258 insertions(+), 255 deletions(-) diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts index 4287fdde..387400dd 100644 --- a/src/lib/sources/huggingFaceSource.ts +++ b/src/lib/sources/huggingFaceSource.ts @@ -125,13 +125,12 @@ export function parseHuggingFaceUrl(url: string): HFUrl { const urlObject = new URL(url) // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL - if (urlObject.protocol !== 'https:' && urlObject.protocol !== 'http:') { - throw new Error('url must be a HTTP URL') - } - if ( - !['huggingface.co', 'huggingface.co', 'hf.co'].includes(urlObject.host) || - urlObject.protocol !== 'https:' + urlObject.protocol !== 'https:' || + ![ + 'huggingface.co', 'huggingface.com', 'hf.co', + // hf.com is not a HF domain + ].includes(urlObject.host) ) { throw new Error('Not a Hugging Face URL') } diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 350cf13e..54987ca4 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -1,266 +1,270 @@ -import { expect, test } from 'vitest' +import { describe, expect, test } from 'vitest' import { parseHuggingFaceUrl } from '../../../src/lib/sources/huggingFaceSource.js' const origin = 'https://huggingface.co' -test.for([[''], ['abc']])('non-url string \'%s\' throws an error', ([url]) => { - expect(() => parseHuggingFaceUrl(url)).to.throw() -}) - -test.for([['ftp:'], ['email:']])('\'%s\' scheme throws an error', ([scheme]) => { - expect(() => parseHuggingFaceUrl(`${scheme}//abc`)).to.throw() -}) - -test.for([['https://some.url'], ['https://some.url/with/a/path']])( - 'non-huggingface URL throws: %s', - ([url]) => { - expect(() => parseHuggingFaceUrl(url)).to.throw() - } -) - -test.for([ - ['https://huggingface.co'], - ['https://hf.co'], - ['https://huggingface.co/'], - ['https://huggingface.co/datasets'], - ['https://huggingface.co/datasets/'], - ['https://huggingface.co/datasets/namespace'], - ['https://huggingface.co/datasets/namespace/'], -])('base huggingface URL throws: %s', ([url]) => { - expect(() => parseHuggingFaceUrl(url)).to.throw() -}) - -test.for([ - ['https://huggingface.co/namespace/model'], - ['https://huggingface.co/namespace/model/'], - ['https://huggingface.co/spaces/namespace/space'], - ['https://huggingface.co/spaces/namespace/space/'], -])('model or space huggingface URL throws: %s', ([url]) => { - expect(() => parseHuggingFaceUrl(url)).to.throw() -}) - -test.for([ - ['https://huggingface.co/datasets/namespace/repo', 'namespace/repo'], - ['https://huggingface.co/datasets/namespace/repo/', 'namespace/repo'], - ['https://huggingface.co/datasets/namespace/123', 'namespace/123'], -])('dataset repo URL returns a RepoUrl: %s', ([url, repo]) => { - expect(parseHuggingFaceUrl(url)).toEqual({ - kind: 'directory', - origin, - repo, - source: url, - action: 'tree', - branch: 'main', - path: '', - }) -}) - -test.for([ - [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch', - 'https://huggingface.co/datasets/namespace/repo/tree/branch', - 'namespace/repo', - 'branch', - '', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/', - 'https://huggingface.co/datasets/namespace/repo/tree/branch', - 'namespace/repo', - 'branch', - '', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/refs/convert/parquet', - // also accepted because of URLSearchParams (see note in https://url.spec.whatwg.org/#dom-urlsearchparams-urlsearchparams) - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/refs/pr/9', - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fpr%2F9', - 'namespace/repo', - 'refs%2Fpr%2F9', - '', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', - 'namespace/repo', - 'branch', - '/folder', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c/', - 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c', - 'namespace/repo', - 'branch', - '/a/b/c', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', - 'namespace/repo', - 'branch', - '/folder.parquet', - ], -])( - 'tree repo URL with a branch and an optional path returns a FolderUrl: %s', - ([url, source, repo, branch, path]) => { +describe('parseHuggingFaceUrl', () => { + test.for([ + 'huggingface.co', + 'huggingface.com', + 'hf.co', + ])('accepts domain: %s', (domain) => { + const origin = `https://${domain}` + const url = `${origin}/datasets/namespace/repo` expect(parseHuggingFaceUrl(url)).toEqual({ kind: 'directory', origin, - repo, - source, + repo: 'namespace/repo', + source: url, action: 'tree', - branch, - path, + branch: 'main', + path: '', }) - } -) + }) + test.for([ + 'ftp://huggingface.co', + 'email://huggingface.co', + 'http://huggingface.co', + 'https://hf.com', + 'https://github.com', + 'huggingface.co', + ])('throws for unsupported scheme or domain: \'%s\'', (host) => { + expect(() => parseHuggingFaceUrl(`${host}/datasets/namespace/repo`)).to.throw() + }) -test.for([ - [ - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', - 'namespace/repo', - 'branch', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', - 'namespace/repo', - 'branch', - '/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', - 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/refs/convert/parquet/file', - 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', - 'namespace/repo', - 'branch', - '/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - ], -])( - 'blob repo URL with a branch and a path returns a FileUrl: %s', - ([url, source, repo, branch, path, resolveUrl]) => { - expect(parseHuggingFaceUrl(url)).toEqual({ - kind: 'file', - origin, - repo, - source, - action: 'blob', - branch, - path, - resolveUrl, - }) - } -) + test.for([ + '', + '/', + '/anything', + '/tasks', + '/models', + '/namespace/model', // TODO(SL): support model + '/settings/profile', // TODO(SL): add a block/allow list? + '/datasets', + '/datasets/', + '/datasets/namespace', + '/datasets/namespace/', + '/spaces', + '/spaces/namespace', + '/spaces/namespace/space', // TODO(SL): support space + '/datasets/namespace/repo/branch', + '/datasets/namespace/repo/tree', + '/datasets/namespace/repo/tree/', + '/datasets/namespace/repo/blob', + '/datasets/namespace/repo/blob/', + '/datasets/namespace/repo/blob/branch', + '/datasets/namespace/repo/blob/branch/', + '/datasets/namespace/repo/blob/branch/file/', + '/datasets/namespace/repo/resolve', + '/datasets/namespace/repo/resolve/', + '/datasets/namespace/repo/resolve/branch', + '/datasets/namespace/repo/resolve/branch/', + '/datasets/namespace/repo/resolve/branch/file/', + ])('throws for invalid path: %s', (path) => { + expect(() => parseHuggingFaceUrl(`https://huggingface.co${path}`)).to.throw() + }) -test.for([ - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - 'namespace/repo', - 'branch', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file?download=true', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - 'namespace/repo', - 'branch', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - 'namespace/repo', - 'branch', - '/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - 'namespace/repo', - 'branch', - '/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - ], -])( - 'resolve repo URL with a branch and a path returns a FileUrl: %s', - ([url, source, repo, branch, path, resolveUrl]) => { + test.for([ + ['/datasets/namespace/repo', 'namespace/repo'], + ['/datasets/namespace/repo/', 'namespace/repo'], + ['/datasets/namespace/123', 'namespace/123'], /* all-number identifier is not a valid HF repo name, but we accept any string */ + ])('returns a RepoURL for dataset repository URL: %s', ([path, repo]) => { + const url = `https://huggingface.co${path}` expect(parseHuggingFaceUrl(url)).toEqual({ - kind: 'file', + kind: 'directory', origin, repo, - source, - action: 'resolve', - branch, - path, - resolveUrl, + source: url, + action: 'tree', + branch: 'main', + path: '', }) - } -) + }) + + test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch', + 'https://huggingface.co/datasets/namespace/repo/tree/branch', + 'namespace/repo', + 'branch', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/', + 'https://huggingface.co/datasets/namespace/repo/tree/branch', + 'namespace/repo', + 'branch', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/refs/convert/parquet', + // also accepted because of URLSearchParams (see note in https://url.spec.whatwg.org/#dom-urlsearchparams-urlsearchparams) + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/refs/pr/9', + 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fpr%2F9', + 'namespace/repo', + 'refs%2Fpr%2F9', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', + 'namespace/repo', + 'branch', + '/folder', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c/', + 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c', + 'namespace/repo', + 'branch', + '/a/b/c', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', + 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', + 'namespace/repo', + 'branch', + '/folder.parquet', + ], + ])( + 'tree repo URL with a branch and an optional path returns a FolderUrl: %s', + ([url, source, repo, branch, path]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'directory', + origin, + repo, + source, + action: 'tree', + branch, + path, + }) + } + ) + + test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', + 'namespace/repo', + 'branch', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', + 'namespace/repo', + 'branch', + '/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', + 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/refs/convert/parquet/file', + 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', + 'namespace/repo', + 'branch', + '/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + ], + ])( + 'blob repo URL with a branch and a path returns a FileUrl: %s', + ([url, source, repo, branch, path, resolveUrl]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'file', + origin, + repo, + source, + action: 'blob', + branch, + path, + resolveUrl, + }) + } + ) -test.for([ - ['https://huggingface.co/not-supported'], - ['https://huggingface.co/not/supported'], - ['https://huggingface.co/tasks'], - ['https://huggingface.co/models'], - ['https://huggingface.co/spaces'], - ['https://huggingface.co/datasets/namespace/repo/branch'], - ['https://huggingface.co/datasets/namespace/repo/tree'], - ['https://huggingface.co/datasets/namespace/repo/tree/'], - ['https://huggingface.co/datasets/namespace/repo/blob'], - ['https://huggingface.co/datasets/namespace/repo/blob/'], - ['https://huggingface.co/datasets/namespace/repo/blob/branch'], - ['https://huggingface.co/datasets/namespace/repo/blob/branch/'], - ['https://huggingface.co/datasets/namespace/repo/blob/branch/file/'], - ['https://huggingface.co/datasets/namespace/repo/resolve'], - ['https://huggingface.co/datasets/namespace/repo/resolve/'], - ['https://huggingface.co/datasets/namespace/repo/resolve/branch'], - ['https://huggingface.co/datasets/namespace/repo/resolve/branch/'], - ['https://huggingface.co/datasets/namespace/repo/resolve/branch/file/'], -])('unrelated huggingface URL throws and error: %s', ([url]) => { - expect(() => parseHuggingFaceUrl(url)).to.throw() + test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + 'namespace/repo', + 'branch', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file?download=true', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + 'namespace/repo', + 'branch', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + 'namespace/repo', + 'branch', + '/path/to/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + 'namespace/repo', + 'refs%2Fconvert%2Fparquet', + '/file', + 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + 'namespace/repo', + 'branch', + '/file.parquet', + 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', + ], + ])( + 'resolve repo URL with a branch and a path returns a FileUrl: %s', + ([url, source, repo, branch, path, resolveUrl]) => { + expect(parseHuggingFaceUrl(url)).toEqual({ + kind: 'file', + origin, + repo, + source, + action: 'resolve', + branch, + path, + resolveUrl, + }) + } + ) }) From d71562e0d174087682b4ba0eb4dddca4abae815e Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 22:04:00 +0200 Subject: [PATCH 06/14] simplify --- test/lib/sources/huggingFaceSource.test.ts | 38 ++++++++++++---------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 54987ca4..635084ee 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -65,23 +65,27 @@ describe('parseHuggingFaceUrl', () => { }) test.for([ - ['/datasets/namespace/repo', 'namespace/repo'], - ['/datasets/namespace/repo/', 'namespace/repo'], - ['/datasets/namespace/123', 'namespace/123'], /* all-number identifier is not a valid HF repo name, but we accept any string */ - ])('returns a RepoURL for dataset repository URL: %s', ([path, repo]) => { - const url = `https://huggingface.co${path}` - expect(parseHuggingFaceUrl(url)).toEqual({ - kind: 'directory', - origin, - repo, - source: url, - action: 'tree', - branch: 'main', - path: '', - }) - }) - - test.for([ + [ + 'https://huggingface.co/datasets/namespace/repo', + 'https://huggingface.co/datasets/namespace/repo', + 'namespace/repo', + 'main', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/repo/', + 'https://huggingface.co/datasets/namespace/repo/', + 'namespace/repo', + 'main', + '', + ], + [ + 'https://huggingface.co/datasets/namespace/123', + 'https://huggingface.co/datasets/namespace/123', + // all-number identifier is not a valid HF repo name, but we accept any string + 'namespace/123', + 'main', + ''], [ 'https://huggingface.co/datasets/namespace/repo/tree/branch', 'https://huggingface.co/datasets/namespace/repo/tree/branch', From 4fb5188a9779ab42a214e10b1f893deb93a6531c Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 23:19:56 +0200 Subject: [PATCH 07/14] support space URLs --- src/lib/sources/huggingFaceSource.ts | 99 +++++----- test/lib/sources/huggingFaceSource.test.ts | 207 ++++++++------------- 2 files changed, 130 insertions(+), 176 deletions(-) diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts index 387400dd..586a369b 100644 --- a/src/lib/sources/huggingFaceSource.ts +++ b/src/lib/sources/huggingFaceSource.ts @@ -1,13 +1,14 @@ -import { listFiles } from '@huggingface/hub' +import { type RepoFullName, type RepoType, listFiles, parseRepoType } from '@huggingface/hub' import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' import { getFileName } from './utils.js' -export const baseUrl = 'https://huggingface.co/datasets' +export const baseUrl = 'https://huggingface.co' function getSourceParts(url: HFUrl): SourcePart[] { + const fullName = getFullName(url) const sourceParts: SourcePart[] = [{ - sourceId: `${baseUrl}/${url.repo}/tree/${url.branch}/`, - text: `${baseUrl}/${url.repo}/${url.action}/${url.branch}/`, + sourceId: `${baseUrl}/${fullName}/tree/${url.branch}/`, + text: `${baseUrl}/${fullName}/${url.action}/${url.branch}/`, }] const pathParts = url.path.split('/').filter(d => d.length > 0) @@ -15,23 +16,29 @@ function getSourceParts(url: HFUrl): SourcePart[] { if (lastPart) { for (const [i, part] of pathParts.slice(0, -1).entries()) { sourceParts.push({ - sourceId: `${baseUrl}/${url.repo}/tree/${url.branch}/${pathParts.slice(0, i + 1).join('/')}`, + sourceId: `${baseUrl}/${fullName}/tree/${url.branch}/${pathParts.slice(0, i + 1).join('/')}`, text: part + '/', }) } sourceParts.push({ - sourceId: `${baseUrl}/${url.repo}/${url.action}/${url.branch}${url.path}`, + sourceId: `${baseUrl}/${fullName}/${url.action}/${url.branch}${url.path}`, text: lastPart, }) } return sourceParts } function getPrefix(url: DirectoryUrl): string { - return `${url.origin}/datasets/${url.repo}/tree/${url.branch}${url.path}`.replace(/\/$/, '') + return `${url.origin}/${getFullName(url)}/tree/${url.branch}${url.path}`.replace(/\/$/, '') +} +function getFullName(url: HFUrl): RepoFullName { + return url.type === 'dataset' ? `datasets/${url.repo}` : url.type === 'space' ? `spaces/${url.repo}` : url.repo } async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise { const filesIterator = listFiles({ - repo: `datasets/${url.repo}`, + repo: { + name: url.repo, + type: url.type, + }, revision: url.branch, path: 'path' in url ? url.path.replace(/^\//, '') : '', // remove leading slash if any expand: true, @@ -44,7 +51,7 @@ async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: Reques eTag: file.lastCommit?.id, size: file.size, lastModified: file.lastCommit?.date, - sourceId: `${url.origin}/datasets/${url.repo}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), + sourceId: `${url.origin}/${getFullName(url)}/${file.type === 'file' ? 'blob' : 'tree'}/${url.branch}/${file.path}`.replace(/\/$/, ''), kind: file.type === 'file' ? 'file' : 'directory', // 'unknown' is considered as a directory }) } @@ -54,7 +61,7 @@ export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: try { const url = parseHuggingFaceUrl(sourceId) async function fetchVersions() { - const refsList = await fetchRefsList(url.repo, options) + const refsList = await fetchRefsList(url, options) return { label: 'Branches', versions: refsList.map(({ refType, name, ref }) => { @@ -65,7 +72,7 @@ export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: // remove refs/heads/ from the ref name // e.g. refs/heads/main -> main const fixedRef = refType === 'branches' ? ref.replace(/refs\/heads\//, '') : ref - const branchSourceId = `${url.origin}/datasets/${url.repo}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}` + const branchSourceId = `${url.origin}/${getFullName(url)}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}` return { label, sourceId: branchSourceId, @@ -98,25 +105,24 @@ export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: } } -export interface DirectoryUrl { - kind: 'directory'; - source: string; - origin: string; - repo: string; - action: 'tree'; - branch: string; - path: string; +interface BaseUrl { + source: string + origin: string + type: RepoType + repo: string + branch: string + path: string +} + +export interface DirectoryUrl extends BaseUrl { + kind: 'directory' + action: 'tree' } -export interface FileUrl { - kind: 'file'; - source: string; - origin: string; - repo: string; - action: 'resolve' | 'blob'; - branch: string; - path: string; - resolveUrl: string; +export interface FileUrl extends BaseUrl { + kind: 'file' + action: 'resolve' | 'blob' + resolveUrl: string } type HFUrl = DirectoryUrl | FileUrl; @@ -135,15 +141,16 @@ export function parseHuggingFaceUrl(url: string): HFUrl { throw new Error('Not a Hugging Face URL') } - const repoGroups = /^\/datasets\/(?[^/]+)\/(?[^/]+)\/?$/.exec( + const repoGroups = /^(?\/datasets|\/spaces)\/(?[^/]+)\/(?[^/]+)\/?$/.exec( urlObject.pathname )?.groups - if (repoGroups?.namespace !== undefined && repoGroups.dataset !== undefined) { + if (repoGroups?.type !== undefined && repoGroups.namespace !== undefined && repoGroups.repo !== undefined) { return { kind: 'directory', source: url, origin: urlObject.origin, - repo: repoGroups.namespace + '/' + repoGroups.dataset, + type: parseRepoType(repoGroups.type.slice(1)), + repo: repoGroups.namespace + '/' + repoGroups.repo, action: 'tree', branch: 'main', // hardcode the default branch path: '', @@ -151,24 +158,26 @@ export function parseHuggingFaceUrl(url: string): HFUrl { } const folderGroups = - /^\/datasets\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)*)\/?$/.exec( + /^(?\/datasets|\/spaces)\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)*)\/?$/.exec( urlObject.pathname )?.groups if ( - folderGroups?.namespace !== undefined && - folderGroups.dataset !== undefined && + folderGroups?.type !== undefined && + folderGroups.namespace !== undefined && + folderGroups.repo !== undefined && folderGroups.action !== undefined && folderGroups.branch !== undefined && folderGroups.path !== undefined && folderGroups.branch !== 'refs' ) { const branch = folderGroups.branch.replace(/\//g, '%2F') - const source = `${urlObject.origin}/datasets/${folderGroups.namespace}/${folderGroups.dataset}/${folderGroups.action}/${branch}${folderGroups.path}` + const source = `${urlObject.origin}${folderGroups.type}/${folderGroups.namespace}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}` return { kind: 'directory', source, origin: urlObject.origin, - repo: folderGroups.namespace + '/' + folderGroups.dataset, + type: parseRepoType(folderGroups.type.slice(1)), + repo: folderGroups.namespace + '/' + folderGroups.repo, action: 'tree', branch, path: folderGroups.path, @@ -176,28 +185,30 @@ export function parseHuggingFaceUrl(url: string): HFUrl { } const fileGroups = - /^\/datasets\/(?[^/]+)\/(?[^/]+)\/(?blob|resolve)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)+)$/.exec( + /^(?\/datasets|\/spaces)\/(?[^/]+)\/(?[^/]+)\/(?blob|resolve)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)+)$/.exec( urlObject.pathname )?.groups if ( - fileGroups?.namespace !== undefined && - fileGroups.dataset !== undefined && + fileGroups?.type !== undefined && + fileGroups.namespace !== undefined && + fileGroups.repo !== undefined && fileGroups.action !== undefined && fileGroups.branch !== undefined && fileGroups.path !== undefined && fileGroups.branch !== 'refs' ) { const branch = fileGroups.branch.replace(/\//g, '%2F') - const source = `${urlObject.origin}/datasets/${fileGroups.namespace}/${fileGroups.dataset}/${fileGroups.action}/${branch}${fileGroups.path}` + const source = `${urlObject.origin}${fileGroups.type}/${fileGroups.namespace}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}` return { kind: 'file', source, origin: urlObject.origin, - repo: fileGroups.namespace + '/' + fileGroups.dataset, + type: parseRepoType(fileGroups.type.slice(1)), + repo: fileGroups.namespace + '/' + fileGroups.repo, action: fileGroups.action === 'blob' ? 'blob' : 'resolve', branch, path: fileGroups.path, - resolveUrl: `${urlObject.origin}/datasets/${fileGroups.namespace}/${fileGroups.dataset}/resolve/${branch}${fileGroups.path}`, + resolveUrl: `${urlObject.origin}${fileGroups.type}/${fileGroups.namespace}/${fileGroups.repo}/resolve/${branch}${fileGroups.path}`, } } @@ -236,7 +247,7 @@ export interface RefMetadata extends RefResponse { * @returns the list of branches, tags, pull requests, and converts */ export async function fetchRefsList( - repo: string, + url: HFUrl, options?: {requestInit?: RequestInit, accessToken?: string} ): Promise { if (options?.accessToken && !options.accessToken.startsWith('hf_')) { @@ -247,7 +258,7 @@ export async function fetchRefsList( if (options?.accessToken) { headers.set('Authorization', `Bearer ${options.accessToken}`) } - const response = await fetch(`https://huggingface.co/api/datasets/${repo}/refs`, { ...options?.requestInit, headers }) + const response = await fetch(`https://huggingface.co/api/${getFullName(url)}/refs`, { ...options?.requestInit, headers }) if (!response.ok) { throw new Error(`HTTP error ${response.status.toString()}`) } diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 635084ee..5aba2911 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -15,6 +15,7 @@ describe('parseHuggingFaceUrl', () => { kind: 'directory', origin, repo: 'namespace/repo', + type: 'dataset', source: url, action: 'tree', branch: 'main', @@ -46,7 +47,6 @@ describe('parseHuggingFaceUrl', () => { '/datasets/namespace/', '/spaces', '/spaces/namespace', - '/spaces/namespace/space', // TODO(SL): support space '/datasets/namespace/repo/branch', '/datasets/namespace/repo/tree', '/datasets/namespace/repo/tree/', @@ -65,91 +65,114 @@ describe('parseHuggingFaceUrl', () => { }) test.for([ + { type: 'dataset', typePath: 'datasets/' }, + { type: 'space', typePath: 'spaces/' }, + // { type: 'model', typePath: '' }, + ].flatMap(({ type, typePath }) => [ + // Root directory [ - 'https://huggingface.co/datasets/namespace/repo', - 'https://huggingface.co/datasets/namespace/repo', + `https://huggingface.co/${typePath}namespace/repo`, + `https://huggingface.co/${typePath}namespace/repo`, 'namespace/repo', + type, 'main', '', ], [ - 'https://huggingface.co/datasets/namespace/repo/', - 'https://huggingface.co/datasets/namespace/repo/', + `https://huggingface.co/${typePath}namespace/repo/`, + `https://huggingface.co/${typePath}namespace/repo/`, 'namespace/repo', + type, 'main', '', ], + // all-number identifier is not a valid HF repo name, but we accept any string [ - 'https://huggingface.co/datasets/namespace/123', - 'https://huggingface.co/datasets/namespace/123', - // all-number identifier is not a valid HF repo name, but we accept any string + `https://huggingface.co/${typePath}namespace/123`, + `https://huggingface.co/${typePath}namespace/123`, 'namespace/123', + type, 'main', - ''], + '', + ], + // Branches [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch', - 'https://huggingface.co/datasets/namespace/repo/tree/branch', + `https://huggingface.co/${typePath}namespace/repo/tree/branch`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch`, 'namespace/repo', + type, 'branch', '', ], [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/', - 'https://huggingface.co/datasets/namespace/repo/tree/branch', + `https://huggingface.co/${typePath}namespace/repo/tree/branch/`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch`, 'namespace/repo', + type, 'branch', '', ], + // special case: both forms 'refs/convert/parquet' and 'refs%2Fconvert%2Fparquet' are accepted + // see note in https://url.spec.whatwg.org/#dom-urlsearchparams-urlsearchparams [ - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fconvert%2Fparquet`, + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fconvert%2Fparquet`, 'namespace/repo', + type, 'refs%2Fconvert%2Fparquet', '', ], [ - 'https://huggingface.co/datasets/namespace/repo/tree/refs/convert/parquet', - // also accepted because of URLSearchParams (see note in https://url.spec.whatwg.org/#dom-urlsearchparams-urlsearchparams) - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fconvert%2Fparquet', + `https://huggingface.co/${typePath}namespace/repo/tree/refs/convert/parquet`, + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fconvert%2Fparquet`, 'namespace/repo', + type, 'refs%2Fconvert%2Fparquet', '', ], + // PRs are also accepted [ - 'https://huggingface.co/datasets/namespace/repo/tree/refs/pr/9', - 'https://huggingface.co/datasets/namespace/repo/tree/refs%2Fpr%2F9', + `https://huggingface.co/${typePath}namespace/repo/tree/refs/pr/9`, + `https://huggingface.co/${typePath}namespace/repo/tree/refs%2Fpr%2F9`, 'namespace/repo', + type, 'refs%2Fpr%2F9', '', ], + // Subdirectories [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder', + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder`, 'namespace/repo', + type, 'branch', '/folder', ], [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c/', - 'https://huggingface.co/datasets/namespace/repo/tree/branch/a/b/c', + `https://huggingface.co/${typePath}namespace/repo/tree/branch/a/b/c/`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch/a/b/c`, 'namespace/repo', + type, 'branch', '/a/b/c', ], + // A subdirectory can have a dot in its name (what matters is 'tree' vs 'blob' or 'resolve') [ - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', - 'https://huggingface.co/datasets/namespace/repo/tree/branch/folder.parquet', + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder.parquet`, + `https://huggingface.co/${typePath}namespace/repo/tree/branch/folder.parquet`, 'namespace/repo', + type, 'branch', '/folder.parquet', ], - ])( - 'tree repo URL with a branch and an optional path returns a FolderUrl: %s', - ([url, source, repo, branch, path]) => { + ]))( + 'parses a DirectoryUrl for dataset/space/model root or subdirectory: %s', + ([url, source, repo, type, branch, path]) => { expect(parseHuggingFaceUrl(url)).toEqual({ kind: 'directory', origin, repo, + type, source, action: 'tree', branch, @@ -158,114 +181,34 @@ describe('parseHuggingFaceUrl', () => { } ) - test.for([ - [ - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file', - 'namespace/repo', - 'branch', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/blob/branch/path/to/file', - 'namespace/repo', - 'branch', - '/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', - 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/refs/convert/parquet/file', - 'https://huggingface.co/datasets/namespace/repo/blob/refs%2Fconvert%2Fparquet/file', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/blob/branch/file.parquet', - 'namespace/repo', - 'branch', - '/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - ], - ])( - 'blob repo URL with a branch and a path returns a FileUrl: %s', - ([url, source, repo, branch, path, resolveUrl]) => { + const repo = 'namespace/repo' + const path = '/path/to/file.parquet' + test.for( + [ + { type: 'dataset', typePath: 'datasets/' }, + { type: 'space', typePath: 'spaces/' }, + ].flatMap(d => [ + { ...d, branch: 'branch', sanitizedBranch: 'branch' }, + { ...d, branch: 'refs/convert/parquet', sanitizedBranch: 'refs%2Fconvert%2Fparquet' }, + { ...d, branch: 'refs%2Fconvert%2Fparquet', sanitizedBranch: 'refs%2Fconvert%2Fparquet' }, + ]).flatMap(d => [ + { ...d, action: 'blob' }, + { ...d, action: 'resolve' }, + ]).flatMap(d => [ + { ...d, url: `https://huggingface.co/${d.typePath}${repo}/${d.action}/${d.branch}${path}` }, + ]))( + 'parses a FileUrl for dataset/space/model file URL: $url', + ({ type, typePath, sanitizedBranch, action, url }) => { + const source = `https://huggingface.co/${typePath}${repo}/${action}/${sanitizedBranch}${path}` + const resolveUrl = `https://huggingface.co/${typePath}${repo}/resolve/${sanitizedBranch}${path}` expect(parseHuggingFaceUrl(url)).toEqual({ kind: 'file', origin, repo, + type, source, - action: 'blob', - branch, - path, - resolveUrl, - }) - } - ) - - test.for([ - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - 'namespace/repo', - 'branch', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file?download=true', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - 'namespace/repo', - 'branch', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - 'namespace/repo', - 'branch', - '/path/to/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/path/to/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - 'namespace/repo', - 'refs%2Fconvert%2Fparquet', - '/file', - 'https://huggingface.co/datasets/namespace/repo/resolve/refs%2Fconvert%2Fparquet/file', - ], - [ - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - 'namespace/repo', - 'branch', - '/file.parquet', - 'https://huggingface.co/datasets/namespace/repo/resolve/branch/file.parquet', - ], - ])( - 'resolve repo URL with a branch and a path returns a FileUrl: %s', - ([url, source, repo, branch, path, resolveUrl]) => { - expect(parseHuggingFaceUrl(url)).toEqual({ - kind: 'file', - origin, - repo, - source, - action: 'resolve', - branch, + action, + branch: sanitizedBranch, path, resolveUrl, }) From 7f72e1759c33f994d0939ca73aed45cce25b54ef Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Mon, 20 Oct 2025 23:52:08 +0200 Subject: [PATCH 08/14] support models --- src/lib/sources/huggingFaceSource.ts | 54 +++++++++++++--------- test/lib/sources/huggingFaceSource.test.ts | 12 ++--- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts index 586a369b..edaa721a 100644 --- a/src/lib/sources/huggingFaceSource.ts +++ b/src/lib/sources/huggingFaceSource.ts @@ -1,9 +1,12 @@ -import { type RepoFullName, type RepoType, listFiles, parseRepoType } from '@huggingface/hub' +import { type RepoFullName, type RepoType, listFiles } from '@huggingface/hub' import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' import { getFileName } from './utils.js' export const baseUrl = 'https://huggingface.co' +function getFullName(url: HFUrl): RepoFullName { + return url.type === 'dataset' ? `datasets/${url.repo}` : url.type === 'space' ? `spaces/${url.repo}` : url.repo +} function getSourceParts(url: HFUrl): SourcePart[] { const fullName = getFullName(url) const sourceParts: SourcePart[] = [{ @@ -30,9 +33,6 @@ function getSourceParts(url: HFUrl): SourcePart[] { function getPrefix(url: DirectoryUrl): string { return `${url.origin}/${getFullName(url)}/tree/${url.branch}${url.path}`.replace(/\/$/, '') } -function getFullName(url: HFUrl): RepoFullName { - return url.type === 'dataset' ? `datasets/${url.repo}` : url.type === 'space' ? `spaces/${url.repo}` : url.repo -} async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise { const filesIterator = listFiles({ repo: { @@ -141,15 +141,25 @@ export function parseHuggingFaceUrl(url: string): HFUrl { throw new Error('Not a Hugging Face URL') } - const repoGroups = /^(?\/datasets|\/spaces)\/(?[^/]+)\/(?[^/]+)\/?$/.exec( - urlObject.pathname + let { pathname } = urlObject + let type: RepoType = 'model' + if (pathname.startsWith('/datasets')) { + type = 'dataset' + pathname = pathname.slice('/datasets'.length) + } else if (pathname.startsWith('/spaces')) { + type = 'space' + pathname = pathname.slice('/spaces'.length) + } + + const repoGroups = /^\/(?[^/]+)\/(?[^/]+)\/?$/.exec( + pathname )?.groups - if (repoGroups?.type !== undefined && repoGroups.namespace !== undefined && repoGroups.repo !== undefined) { + if (repoGroups?.namespace !== undefined && repoGroups.repo !== undefined) { return { kind: 'directory', source: url, origin: urlObject.origin, - type: parseRepoType(repoGroups.type.slice(1)), + type, repo: repoGroups.namespace + '/' + repoGroups.repo, action: 'tree', branch: 'main', // hardcode the default branch @@ -158,25 +168,25 @@ export function parseHuggingFaceUrl(url: string): HFUrl { } const folderGroups = - /^(?\/datasets|\/spaces)\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)*)\/?$/.exec( - urlObject.pathname + /^\/(?[^/]+)\/(?[^/]+)\/(?tree)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)*)\/?$/.exec( + pathname )?.groups if ( - folderGroups?.type !== undefined && - folderGroups.namespace !== undefined && + folderGroups?.namespace !== undefined && folderGroups.repo !== undefined && folderGroups.action !== undefined && folderGroups.branch !== undefined && folderGroups.path !== undefined && folderGroups.branch !== 'refs' ) { + const typePath = type === 'dataset' ? '/datasets' : type === 'space' ? '/spaces' : '' const branch = folderGroups.branch.replace(/\//g, '%2F') - const source = `${urlObject.origin}${folderGroups.type}/${folderGroups.namespace}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}` + const source = `${urlObject.origin}${typePath}/${folderGroups.namespace}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}` return { kind: 'directory', source, origin: urlObject.origin, - type: parseRepoType(folderGroups.type.slice(1)), + type, repo: folderGroups.namespace + '/' + folderGroups.repo, action: 'tree', branch, @@ -185,30 +195,30 @@ export function parseHuggingFaceUrl(url: string): HFUrl { } const fileGroups = - /^(?\/datasets|\/spaces)\/(?[^/]+)\/(?[^/]+)\/(?blob|resolve)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)+)$/.exec( - urlObject.pathname + /^\/(?[^/]+)\/(?[^/]+)\/(?blob|resolve)\/(?(refs\/(convert|pr)\/)?[^/]+)(?(\/[^/]+)+)$/.exec( + pathname )?.groups if ( - fileGroups?.type !== undefined && - fileGroups.namespace !== undefined && + fileGroups?.namespace !== undefined && fileGroups.repo !== undefined && fileGroups.action !== undefined && fileGroups.branch !== undefined && fileGroups.path !== undefined && fileGroups.branch !== 'refs' ) { + const typePath = type === 'dataset' ? '/datasets' : type === 'space' ? '/spaces' : '' const branch = fileGroups.branch.replace(/\//g, '%2F') - const source = `${urlObject.origin}${fileGroups.type}/${fileGroups.namespace}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}` + const source = `${urlObject.origin}${typePath}/${fileGroups.namespace}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}` return { kind: 'file', source, origin: urlObject.origin, - type: parseRepoType(fileGroups.type.slice(1)), + type, repo: fileGroups.namespace + '/' + fileGroups.repo, action: fileGroups.action === 'blob' ? 'blob' : 'resolve', branch, path: fileGroups.path, - resolveUrl: `${urlObject.origin}${fileGroups.type}/${fileGroups.namespace}/${fileGroups.repo}/resolve/${branch}${fileGroups.path}`, + resolveUrl: `${urlObject.origin}${typePath}/${fileGroups.namespace}/${fileGroups.repo}/resolve/${branch}${fileGroups.path}`, } } @@ -258,7 +268,7 @@ export async function fetchRefsList( if (options?.accessToken) { headers.set('Authorization', `Bearer ${options.accessToken}`) } - const response = await fetch(`https://huggingface.co/api/${getFullName(url)}/refs`, { ...options?.requestInit, headers }) + const response = await fetch(`https://huggingface.co/api/${url.type}s/${url.repo}/refs`, { ...options?.requestInit, headers }) if (!response.ok) { throw new Error(`HTTP error ${response.status.toString()}`) } diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 5aba2911..84cbe9a8 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -39,14 +39,11 @@ describe('parseHuggingFaceUrl', () => { '/anything', '/tasks', '/models', - '/namespace/model', // TODO(SL): support model - '/settings/profile', // TODO(SL): add a block/allow list? + '/spaces', '/datasets', '/datasets/', '/datasets/namespace', '/datasets/namespace/', - '/spaces', - '/spaces/namespace', '/datasets/namespace/repo/branch', '/datasets/namespace/repo/tree', '/datasets/namespace/repo/tree/', @@ -67,7 +64,7 @@ describe('parseHuggingFaceUrl', () => { test.for([ { type: 'dataset', typePath: 'datasets/' }, { type: 'space', typePath: 'spaces/' }, - // { type: 'model', typePath: '' }, + { type: 'model', typePath: '' }, ].flatMap(({ type, typePath }) => [ // Root directory [ @@ -166,7 +163,7 @@ describe('parseHuggingFaceUrl', () => { '/folder.parquet', ], ]))( - 'parses a DirectoryUrl for dataset/space/model root or subdirectory: %s', + 'parses a DirectoryUrl for $type root or subdirectory: %s', ([url, source, repo, type, branch, path]) => { expect(parseHuggingFaceUrl(url)).toEqual({ kind: 'directory', @@ -187,6 +184,7 @@ describe('parseHuggingFaceUrl', () => { [ { type: 'dataset', typePath: 'datasets/' }, { type: 'space', typePath: 'spaces/' }, + { type: 'model', typePath: '' }, ].flatMap(d => [ { ...d, branch: 'branch', sanitizedBranch: 'branch' }, { ...d, branch: 'refs/convert/parquet', sanitizedBranch: 'refs%2Fconvert%2Fparquet' }, @@ -197,7 +195,7 @@ describe('parseHuggingFaceUrl', () => { ]).flatMap(d => [ { ...d, url: `https://huggingface.co/${d.typePath}${repo}/${d.action}/${d.branch}${path}` }, ]))( - 'parses a FileUrl for dataset/space/model file URL: $url', + 'parses a FileUrl for $type file URL: $url', ({ type, typePath, sanitizedBranch, action, url }) => { const source = `https://huggingface.co/${typePath}${repo}/${action}/${sanitizedBranch}${path}` const resolveUrl = `https://huggingface.co/${typePath}${repo}/resolve/${sanitizedBranch}${path}` From 0e93ebb58d74d8b549fc57c05563adedd4959ca1 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 21 Oct 2025 00:04:59 +0200 Subject: [PATCH 09/14] fix focus color --- src/components/Breadcrumb/Breadcrumb.module.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/Breadcrumb/Breadcrumb.module.css b/src/components/Breadcrumb/Breadcrumb.module.css index 79fd9069..f9e6df98 100644 --- a/src/components/Breadcrumb/Breadcrumb.module.css +++ b/src/components/Breadcrumb/Breadcrumb.module.css @@ -59,7 +59,7 @@ & > button { color: #eee; - &:hover { + &:hover, &:focus { color: #fff } } From fa12e878222ffeb42cb5da0c575a171625f0eb61 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 22 Oct 2025 16:15:06 +0200 Subject: [PATCH 10/14] simplify the tests a bit --- test/lib/sources/huggingFaceSource.test.ts | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 84cbe9a8..cb393c04 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -36,26 +36,20 @@ describe('parseHuggingFaceUrl', () => { test.for([ '', '/', - '/anything', - '/tasks', - '/models', - '/spaces', + // for the following tests, the same is true: + // - with a trailing slash + // - replacing /datasets with /anything, /spaces, /models or /. + // Avoiding for brevity. '/datasets', - '/datasets/', '/datasets/namespace', - '/datasets/namespace/', '/datasets/namespace/repo/branch', '/datasets/namespace/repo/tree', - '/datasets/namespace/repo/tree/', '/datasets/namespace/repo/blob', - '/datasets/namespace/repo/blob/', - '/datasets/namespace/repo/blob/branch', - '/datasets/namespace/repo/blob/branch/', - '/datasets/namespace/repo/blob/branch/file/', '/datasets/namespace/repo/resolve', - '/datasets/namespace/repo/resolve/', + '/datasets/namespace/repo/blob/branch', '/datasets/namespace/repo/resolve/branch', - '/datasets/namespace/repo/resolve/branch/', + // note the trailing slash + '/datasets/namespace/repo/blob/branch/file/', '/datasets/namespace/repo/resolve/branch/file/', ])('throws for invalid path: %s', (path) => { expect(() => parseHuggingFaceUrl(`https://huggingface.co${path}`)).to.throw() From 5d161473b6897e3bbadcff3e498b46df1567c381 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 22 Oct 2025 16:17:26 +0200 Subject: [PATCH 11/14] move type definitions to the top, and don't export them --- src/lib/sources/huggingFaceSource.ts | 84 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts index edaa721a..11285f49 100644 --- a/src/lib/sources/huggingFaceSource.ts +++ b/src/lib/sources/huggingFaceSource.ts @@ -2,7 +2,48 @@ import { type RepoFullName, type RepoType, listFiles } from '@huggingface/hub' import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' import { getFileName } from './utils.js' -export const baseUrl = 'https://huggingface.co' +interface BaseUrl { + source: string + origin: string + type: RepoType + repo: string + branch: string + path: string +} + +interface DirectoryUrl extends BaseUrl { + kind: 'directory' + action: 'tree' +} + +interface FileUrl extends BaseUrl { + kind: 'file' + action: 'resolve' | 'blob' + resolveUrl: string +} + +type HFUrl = DirectoryUrl | FileUrl; + +interface RefResponse { + name: string; + ref: string; + targetCommit: string; +} + +const refTypes = [ + 'branches', + 'tags', + 'converts', + 'pullRequests', +] as const +type RefType = (typeof refTypes)[number]; +type RefsResponse = Partial>; + +interface RefMetadata extends RefResponse { + refType: RefType; // TODO(SL): use it to style the refs differently? +} + +const baseUrl = 'https://huggingface.co' function getFullName(url: HFUrl): RepoFullName { return url.type === 'dataset' ? `datasets/${url.repo}` : url.type === 'space' ? `spaces/${url.repo}` : url.repo @@ -105,28 +146,6 @@ export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: } } -interface BaseUrl { - source: string - origin: string - type: RepoType - repo: string - branch: string - path: string -} - -export interface DirectoryUrl extends BaseUrl { - kind: 'directory' - action: 'tree' -} - -export interface FileUrl extends BaseUrl { - kind: 'file' - action: 'resolve' | 'blob' - resolveUrl: string -} - -type HFUrl = DirectoryUrl | FileUrl; - export function parseHuggingFaceUrl(url: string): HFUrl { const urlObject = new URL(url) // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL @@ -225,25 +244,6 @@ export function parseHuggingFaceUrl(url: string): HFUrl { throw new Error('Unsupported Hugging Face URL') } -interface RefResponse { - name: string; - ref: string; - targetCommit: string; -} - -export const refTypes = [ - 'branches', - 'tags', - 'converts', - 'pullRequests', -] as const -type RefType = (typeof refTypes)[number]; -type RefsResponse = Partial>; - -export interface RefMetadata extends RefResponse { - refType: RefType; // TODO(SL): use it to style the refs differently? -} - /** * List refs in a HF dataset repo * From 606ea70049b445d9a311fb44aca5405a27f45193 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 22 Oct 2025 16:37:44 +0200 Subject: [PATCH 12/14] vendor listFiles from @huggingface/hub --- package.json | 1 - src/lib/sources/huggingFaceSource.ts | 90 +++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/package.json b/package.json index 03248854..1296aadb 100644 --- a/package.json +++ b/package.json @@ -55,7 +55,6 @@ "watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet" }, "dependencies": { - "@huggingface/hub": "2.6.12", "hightable": "0.20.2", "hyparquet": "1.20.0", "hyparquet-compressors": "1.1.1", diff --git a/src/lib/sources/huggingFaceSource.ts b/src/lib/sources/huggingFaceSource.ts index 11285f49..f3bef842 100644 --- a/src/lib/sources/huggingFaceSource.ts +++ b/src/lib/sources/huggingFaceSource.ts @@ -1,7 +1,8 @@ -import { type RepoFullName, type RepoType, listFiles } from '@huggingface/hub' import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js' import { getFileName } from './utils.js' +type RepoType = 'model' | 'dataset' | 'space' + interface BaseUrl { source: string origin: string @@ -45,7 +46,7 @@ interface RefMetadata extends RefResponse { const baseUrl = 'https://huggingface.co' -function getFullName(url: HFUrl): RepoFullName { +function getFullName(url: HFUrl): string { return url.type === 'dataset' ? `datasets/${url.repo}` : url.type === 'space' ? `spaces/${url.repo}` : url.repo } function getSourceParts(url: HFUrl): SourcePart[] { @@ -74,15 +75,12 @@ function getSourceParts(url: HFUrl): SourcePart[] { function getPrefix(url: DirectoryUrl): string { return `${url.origin}/${getFullName(url)}/tree/${url.branch}${url.path}`.replace(/\/$/, '') } -async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise { +async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise { + const repoFullName = getFullName(url) const filesIterator = listFiles({ - repo: { - name: url.repo, - type: url.type, - }, + repoFullName, revision: url.branch, path: 'path' in url ? url.path.replace(/^\//, '') : '', // remove leading slash if any - expand: true, accessToken: options?.accessToken, }) const files: FileMetadata[] = [] @@ -256,7 +254,7 @@ export function parseHuggingFaceUrl(url: string): HFUrl { * * @returns the list of branches, tags, pull requests, and converts */ -export async function fetchRefsList( +async function fetchRefsList( url: HFUrl, options?: {requestInit?: RequestInit, accessToken?: string} ): Promise { @@ -286,3 +284,77 @@ export async function fetchRefsList( }) }) } + +/* + * Copied and adapted from https://github.com/huggingface/huggingface.js/blob/main/packages/hub + * MIT License, Copyright (c) 2023 Hugging Face + */ + +interface ListFileEntry { + type: 'file' | 'directory' | 'unknown'; + size: number; + path: string; + lastCommit?: { + date: string; + id: string; + }; +} + +const HUB_URL = 'https://huggingface.co' + +/** + * List files in a folder. To list ALL files in the directory, call it + * with {@link params.recursive} set to `true`. + */ +async function* listFiles( + params: { + repoFullName: string; + /** + * Eg 'data' for listing all files in the 'data' folder. Leave it empty to list all + * files in the repo. + */ + path?: string; + revision?: string; + /** + * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers. + */ + fetch?: typeof fetch; + accessToken?: string; + } +): AsyncGenerator { + let url: string | undefined = `${HUB_URL}/api/${params.repoFullName}/tree/${ + params.revision ?? 'main' + }${params.path ? '/' + params.path : ''}?expand=true` + + while (url) { + const res: Response = await (params.fetch ?? fetch)(url, { + headers: { + accept: 'application/json', + ...params.accessToken ? { Authorization: `Bearer ${params.accessToken}` } : undefined, + }, + }) + + if (!res.ok) { + throw new Error(`Failed to list files: ${res.status.toString()} ${res.statusText}`) + } + + const items = await res.json() as ListFileEntry[] + + for (const item of items) { + yield item + } + + const linkHeader = res.headers.get('Link') + + url = linkHeader ? parseLinkHeader(linkHeader).next : undefined + } +} + +/** + * Parse Link HTTP header, eg `; rel="next"` + */ +export function parseLinkHeader(header: string): Record { + const regex = /<(https?:[/][/][^>]+)>;\s+rel="([^"]+)"/g + + return Object.fromEntries([...header.matchAll(regex)].map(([, url, rel]) => [rel, url])) as Record +} From bb7604255beb23a4927d76784560a6ae0101e4ce Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 22 Oct 2025 17:32:35 -0400 Subject: [PATCH 13/14] Update test/lib/sources/huggingFaceSource.test.ts Co-authored-by: Kenny Daniel --- test/lib/sources/huggingFaceSource.test.ts | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index cb393c04..78887d4d 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -22,15 +22,13 @@ describe('parseHuggingFaceUrl', () => { path: '', }) }) - test.for([ - 'ftp://huggingface.co', - 'email://huggingface.co', - 'http://huggingface.co', - 'https://hf.com', - 'https://github.com', - 'huggingface.co', - ])('throws for unsupported scheme or domain: \'%s\'', (host) => { - expect(() => parseHuggingFaceUrl(`${host}/datasets/namespace/repo`)).to.throw() + it('throws for unsupported scheme or domain', () => { + expect(() => parseHuggingFaceUrl('ftp://huggingface.co/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('email://huggingface.co/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('http://huggingface.co/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('https://hf.com/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('https://github.com/datasets/namespace/repo')).toThrow() + expect(() => parseHuggingFaceUrl('huggingface.co/datasets/namespace/repo')).toThrow() }) test.for([ From 47de6a70ca511eae6dd70650f2a541f77a9520d0 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 22 Oct 2025 23:36:08 +0200 Subject: [PATCH 14/14] import it --- test/lib/sources/huggingFaceSource.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/sources/huggingFaceSource.test.ts b/test/lib/sources/huggingFaceSource.test.ts index 78887d4d..4fa073c6 100644 --- a/test/lib/sources/huggingFaceSource.test.ts +++ b/test/lib/sources/huggingFaceSource.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, test } from 'vitest' +import { describe, expect, it, test } from 'vitest' import { parseHuggingFaceUrl } from '../../../src/lib/sources/huggingFaceSource.js' const origin = 'https://huggingface.co'