From 0851044fb721f47abcfc547c417e33a9bf9a0095 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 5 Dec 2025 14:10:43 -0800 Subject: [PATCH 1/2] feat: reimplement `dandi.utils.is_url` So that it can determine more precisely whether a given string is a standard HTTP, HTTPS, or FTP URL, or a DANDI URL. Additionally, this solution supports DANDI URLs of different DANDI archive instances. --- dandi/utils.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/dandi/utils.py b/dandi/utils.py index 9d77f33cc..cd6ea7afd 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -28,7 +28,7 @@ import dateutil.parser from multidict import MultiDict # dependency of yarl -from pydantic import BaseModel, Field +from pydantic import AnyHttpUrl, BaseModel, Field, FtpUrl, TypeAdapter, ValidationError import requests import ruamel.yaml from semantic_version import Version @@ -648,13 +648,36 @@ def _get_instance( ) -def is_url(s: str) -> bool: - """Very primitive url detection for now +# This is defined at module level because repeatedly creating a +# TypeAdapter is expensive +_url_adapter: TypeAdapter[AnyHttpUrl | FtpUrl] = TypeAdapter(AnyHttpUrl | FtpUrl) + - TODO: redo +def is_url(s: str) -> bool: """ - return s.lower().startswith(("http://", "https://", "dandi:", "ftp://")) - # Slashes are not required after "dandi:" so as to support "DANDI:" + Determines whether the input string `s` is a valid URL (standard URL or DANDI URL). 
+ """ + + # Importing from within function to avoid possible circular imports + # since this is a utility module + from dandi.dandiarchive import parse_dandi_url + from dandi.exceptions import UnknownURLError + + try: + _url_adapter.validate_python(s) + except ValidationError: + # `s` is not a standard URL, try parsing it as a DANDI URL + try: + parse_dandi_url(s) + except UnknownURLError: + # `s` is neither a standard URL nor a DANDI URL, returning `False` + return False + + # `s` is a DANDI URL, returning `True` + return True + + # `s` is a standard URL + return True def get_module_version(module: str | types.ModuleType) -> str | None: From b8d137932f5cab03e3a93299465d89d2146ad9cf Mon Sep 17 00:00:00 2001 From: Isaac To Date: Mon, 8 Dec 2025 13:56:49 -0800 Subject: [PATCH 2/2] test: Add tests for `dandi.utils.is_url` --- dandi/tests/test_utils.py | 65 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/dandi/tests/test_utils.py b/dandi/tests/test_utils.py index 9f7eb8dfc..3a15043c5 100644 --- a/dandi/tests/test_utils.py +++ b/dandi/tests/test_utils.py @@ -29,6 +29,7 @@ get_utcnow_datetime, is_page2_url, is_same_time, + is_url, on_windows, post_upload_size_check, under_paths, @@ -589,3 +590,67 @@ def test_post_upload_size_check_erroring( logging.ERROR, f"Size of {p} was 42 at start of upload but is now 19 after upload", ) in caplog.record_tuples + + +class TestIsUrl: + @pytest.mark.parametrize( + "s", + [ + # Standard valid HTTP/FTP URLs + "http://example.com", + "https://example.com", + "https://example.com/path", + "https://example.com/path?query=1#frag", + "https://example.com:8443/path", + "http://127.0.0.1:8000", + "ftp://example.com/path/file.txt", + "ftp://user:pass@example.com/dir", + # These pass pydantic validation but are not very useful URLs + "http:/example.com", + # Typical DANDI Archive dandiset URLs (also valid HTTP URLs) + "https://dandiarchive.org/dandiset/000027", + "https://dandiarchive.org/dandiset/000027/draft", 
"https://dandiarchive.org/dandiset/000027/0.210428.2206", + # DANDI identifiers and ids + "DANDI:123456", + "DANDI:123456/draft", + "DANDI:123456/1.123456.1234", + "DANDI-SANDBOX:123456", + "DANDI-SANDBOX:123456/draft", + "DANDI-SANDBOX:123456/1.123456.1234", + # Customized DANDI URLs + "dandi://dandi/123456", + "dandi://dandi/123456/draft", + "dandi://dandi/123456/1.123456.1234", + "dandi://dandi-sandbox/123456", + "dandi://dandi-sandbox/123456/draft", + "dandi://dandi-sandbox/123456/1.123456.1234", + ], + ) + def test_valid_urls(self, s: str) -> None: + assert is_url(s) is True + + @pytest.mark.parametrize( + "s", + [ + # Clearly invalid URLs + "not a url", + "example", + "example .com", + "://example.com", + "", + " ", + # DANDI-like string that should not be treated as a valid DANDI URL + "dandi://not-a-real-dandiset", + # Invalid DANDI identifiers and ids because of unknown instance name + "FAKEDANDI:123456", + "FAKEDANDI:123456/draft", + "FAKEDANDI:123456/1.123456.1234", + # Customized DANDI URLs + "dandi://fakedandi/123456", + "dandi://fakedandi/123456/draft", + "dandi://fakedandi/123456/1.123456.1234", + ], + ) + def test_invalid_urls(self, s: str) -> None: + assert is_url(s) is False