From 4dc4cdcb24ab70c13525cf6be84b729c7ca8ae05 Mon Sep 17 00:00:00 2001 From: mschwab Date: Tue, 23 Jun 2026 16:09:32 -0700 Subject: [PATCH] fix: add known text extension allowlist to file binary detection The backend HEAD endpoint returns application/octet-stream for all files, causing useIsBinaryFile to incorrectly classify .jsonl and other text files as binary. Added KNOWN_TEXT_EXTENSIONS allowlist that short-circuits the HEAD request for recognized text extensions. - Add 50+ known text extensions (json, jsonl, csv, py, yaml, md, etc.) - Restructure hook to check text extensions before binary blocklist - Add unit tests for useIsBinaryFile hook Signed-off-by: mschwab --- .../filesets/hooks/useIsBinaryFile.test.ts | 123 ++++++++++++++++++ .../filesets/hooks/useIsBinaryFile.ts | 86 ++++++++++-- 2 files changed, 200 insertions(+), 9 deletions(-) create mode 100644 web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.test.ts diff --git a/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.test.ts b/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.test.ts new file mode 100644 index 0000000000..814b9f9d90 --- /dev/null +++ b/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.test.ts @@ -0,0 +1,123 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { useIsBinaryFile } from '@studio/components/filesets/hooks/useIsBinaryFile'; +import { useQuery, UseQueryResult } from '@tanstack/react-query'; +import { renderHook } from '@testing-library/react'; + +vi.mock('@tanstack/react-query'); +vi.mock('@nemo/sdk/generated/fetchers/platform'); +vi.mock('@nemo/sdk/generated/platform/api', () => ({ + getFilesDownloadFileQueryKey: vi.fn(() => ['/files/download']), +})); +vi.mock('axios', () => { + const mockInstance = { + head: vi.fn(), + interceptors: { + request: { use: vi.fn() }, + response: { use: vi.fn() }, + }, + }; + return { default: mockInstance }; +}); + +const mockUseQuery = vi.mocked(useQuery); + +describe('useIsBinaryFile', () => { + beforeEach(() => { + vi.clearAllMocks(); + mockUseQuery.mockReturnValue({ + data: false, + isPending: false, + refetch: vi.fn(), + isFetching: false, + isStale: false, + isError: false, + error: null, + failureCount: 0, + failureReason: null, + isFetchNextPageEnabled: undefined, + dataUpdateCount: 0, + dataUpdatedAt: 0, + errorUpdateCount: 0, + errorUpdatedAt: 0, + fetchStatus: 'idle', + status: 'success', + variables: undefined, + queryKey: [], + queryHash: '', + queryFn: undefined, + queryKeyHashFn: undefined, + meta: undefined, + isLoading: false, + isLoadingError: false, + isRefetchError: false, + isSuccess: true, + isObservingMisorderedIndex: false, + } as unknown as UseQueryResult); + }); + + it('returns isBinary=false for .jsonl files without a HEAD request', () => { + const { result } = renderHook(() => + useIsBinaryFile('default', 'test-dataset', 'data/test.jsonl') + ); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + // Query should be disabled for known text extensions + expect(mockUseQuery).toHaveBeenCalledWith(expect.objectContaining({ enabled: false })); + }); + + it('returns isBinary=false for .json files without a HEAD request', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'config.json')); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + }); + + it('returns isBinary=false for .csv files without a HEAD request', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'data.csv')); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + }); + + it('returns isBinary=false for .py files without a HEAD request', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'script.py')); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + }); + + it('returns isBinary=false for .yaml and .yml files', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'config.yaml')); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + }); + + it('returns isBinary=false for .md files', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'README.md')); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + }); + + it('returns isBinary=false when filePath is undefined', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', undefined)); + + expect(result.current).toEqual({ isBinary: false, isLoading: false }); + }); + + it('returns isBinary=true for .png files (binary blocklist)', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'image.png')); + + expect(result.current).toEqual({ isBinary: true, isLoading: false }); + }); + + it('returns isBinary=true for .zip files (binary blocklist)', () => { + const { result } = renderHook(() => useIsBinaryFile('default', 'test-dataset', 'archive.zip')); + + expect(result.current).toEqual({ isBinary: true, isLoading: false }); + }); + + it('enables the HEAD query for unknown extensions', () => { + renderHook(() => useIsBinaryFile('default', 'test-dataset', 'data.unknown')); + + expect(mockUseQuery).toHaveBeenCalledWith(expect.objectContaining({ enabled: true })); + }); +}); diff --git a/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.ts b/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.ts index e699f8c60a..62e99a0523 100644 --- a/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.ts +++ b/web/packages/studio/src/components/filesets/hooks/useIsBinaryFile.ts @@ -9,37 +9,104 @@ import { isBinaryExtension } from '@studio/util/binaryFile'; import { useQuery } from '@tanstack/react-query'; import axios from 'axios'; +/** + * Extensions that are unambiguously text. The backend HEAD endpoint returns + * `application/octet-stream` for all files, so Content-Type-based detection + * fails for these. This allowlist short-circuits the HEAD request. + */ +const KNOWN_TEXT_EXTENSIONS = new Set([ + // Data + 'json', + 'jsonl', + 'csv', + 'tsv', + // Code + 'py', + 'js', + 'jsx', + 'ts', + 'tsx', + 'java', + 'c', + 'cpp', + 'h', + 'go', + 'rs', + 'rb', + 'php', + 'swift', + 'kt', + 'scala', + 'r', + 'm', + 'sh', + 'bash', + 'zsh', + 'fish', + // Markup / Config + 'html', + 'htm', + 'xml', + 'yaml', + 'yml', + 'toml', + 'ini', + 'cfg', + 'conf', + 'jsonc', + 'env', + // Text + 'txt', + 'md', + 'rst', + 'log', + 'diff', + 'patch', + // Other + 'sql', + 'graphql', + 'proto', + 'dockerfile', + 'makefile', +]); + +function isKnownTextExtension(path: string): boolean { + const ext = path.split('.').at(-1)?.toLowerCase(); + return ext !== undefined && KNOWN_TEXT_EXTENSIONS.has(ext); +} + /** * Determine whether a fileset file should be treated as binary (no text preview). * - * Strategy (three tiers): - * 1. Extension in `BINARY_FILE_EXTENSIONS` → binary immediately, no network. - * 2. Extension not in blocklist → HEAD request; `Content-Type` header decides. + * Strategy (four tiers): + * 1. Extension in `KNOWN_TEXT_EXTENSIONS` → text immediately, no network. + * 2. Extension in `BINARY_FILE_EXTENSIONS` → binary immediately, no network. + * 3. Extension not in either list → HEAD request; `Content-Type` decides. * - `text/*` or known text application types → not binary. * - Everything else → binary. - * 3. HEAD fails / no Content-Type → assume text (fail-open for preview). + * 4. HEAD fails / no Content-Type → assume text (fail-open for preview). * * Returns `{ isBinary, isLoading }`. `isLoading` is true only during the HEAD - * request (tier-2 path); tier-1 resolves synchronously. + * request (tier-3 path); tiers 1-2 resolve synchronously. */ export function useIsBinaryFile( workspace: string, filesetName: string, filePath: string | undefined ): { isBinary: boolean; isLoading: boolean } { + const knownText = filePath !== undefined && isKnownTextExtension(filePath); const blocklisted = filePath !== undefined && isBinaryExtension(filePath); const { data: headBinary, isPending } = useQuery({ queryKey: ['file-content-type', workspace, filesetName, filePath], queryFn: async (): Promise => { - if (!filePath) return false; try { // axios.head() is auth-aware via the interceptor registered when // '@nemo/sdk/generated/fetchers/platform' is imported above. const [fileUrl] = getFilesDownloadFileQueryKey( encodeURIComponent(workspace), encodeURIComponent(filesetName), - encodeURIComponent(filePath) + encodeURIComponent(filePath!) ); const res = await axios.head(fileUrl); const ct = String(res.headers['content-type'] ?? ''); @@ -48,13 +115,14 @@ export function useIsBinaryFile( return false; // fail-open: assume text } }, - enabled: !!filePath && !blocklisted, + enabled: !!filePath && !knownText && !blocklisted, staleTime: Infinity, retry: false, }); - if (blocklisted) return { isBinary: true, isLoading: false }; if (!filePath) return { isBinary: false, isLoading: false }; + if (knownText) return { isBinary: false, isLoading: false }; + if (blocklisted) return { isBinary: true, isLoading: false }; return { isBinary: headBinary ?? false, isLoading: isPending }; }