From 2449be5987f265c541f869afe1ca3aaec22943b5 Mon Sep 17 00:00:00 2001 From: Yuta HIROKAWA Date: Wed, 17 Apr 2024 10:37:46 +0900 Subject: [PATCH 1/2] add FS.scheme property --- pfio/v2/fs.py | 32 +++++++++++++++++----------- pfio/v2/hdfs.py | 4 ++-- pfio/v2/http_cache.py | 2 +- pfio/v2/local.py | 4 ++-- pfio/v2/pathlib.py | 27 ++++------------------- pfio/v2/s3.py | 3 ++- pfio/v2/zip.py | 2 +- tests/v2_tests/test_custom_scheme.py | 4 ++++ tests/v2_tests/test_hdfs.py | 4 ++++ tests/v2_tests/test_http_cache.py | 12 +++++++++++ tests/v2_tests/test_local.py | 7 ++++++ tests/v2_tests/test_pathlib.py | 11 ++++++++++ tests/v2_tests/test_s3.py | 5 +++++ tests/v2_tests/test_s3_zip.py | 2 ++ tests/v2_tests/test_zip.py | 4 ++++ 15 files changed, 81 insertions(+), 42 deletions(-) diff --git a/pfio/v2/fs.py b/pfio/v2/fs.py index 07927821..ffe279f7 100644 --- a/pfio/v2/fs.py +++ b/pfio/v2/fs.py @@ -87,10 +87,17 @@ class FS(abc.ABC): ''' _cwd = '' + _scheme = '' - def __init__(self): + def __init__(self, scheme=None): self.pid = os.getpid() self.trace = False + if scheme: + self._scheme = str(scheme) + + @property + def scheme(self): + return self._scheme @property def cwd(self): @@ -432,32 +439,33 @@ def _zip_check_create_not_supported(): def _from_scheme(scheme, dirname, kwargs, bucket=None): known_scheme = ['file', 'hdfs', 's3'] + actual_scheme = scheme # Custom scheme; using configparser for older Python. Will # update to toml in Python 3.11 once 3.10 is in the end. - if scheme not in known_scheme: + if actual_scheme not in known_scheme: config_dict = config.get_custom_scheme(scheme) if config_dict is not None: - scheme = config_dict.pop('scheme') # Get the real scheme + actual_scheme = config_dict.pop('scheme') # Get the real scheme # Custom scheme expected here - if scheme not in known_scheme: - raise ValueError("Scheme {} is not supported", scheme) + if actual_scheme not in known_scheme: + raise ValueError("Scheme {} is not supported", actual_scheme) for k in config_dict: if k not in kwargs: # Don't overwrite with configuration value kwargs[k] = config_dict[k] - if scheme == 'file': + if actual_scheme == 'file': from .local import Local - fs = Local(dirname, **kwargs) - elif scheme == 'hdfs': + fs = Local(dirname, scheme=scheme, **kwargs) + elif actual_scheme == 'hdfs': from .hdfs import Hdfs - fs = Hdfs(dirname, **kwargs) - elif scheme == 's3': + fs = Hdfs(dirname, scheme=scheme, **kwargs) + elif actual_scheme == 's3': from .s3 import S3 - fs = S3(bucket=bucket, prefix=dirname, **kwargs) + fs = S3(bucket=bucket, prefix=dirname, scheme=scheme, **kwargs) else: - raise RuntimeError("scheme '{}' is not defined".format(scheme)) + raise RuntimeError("scheme '{}' is not defined".format(actual_scheme)) return fs diff --git a/pfio/v2/hdfs.py b/pfio/v2/hdfs.py index a54428d8..1efd7908 100644 --- a/pfio/v2/hdfs.py +++ b/pfio/v2/hdfs.py @@ -219,8 +219,8 @@ class Hdfs(FS): ''' - def __init__(self, cwd=None, create=False, **_): - super().__init__() + def __init__(self, cwd=None, create=False, scheme=None, **_): + super().__init__(scheme=scheme) self._nameservice, self._fs = _create_fs() assert self._fs is not None self.username = self._get_principal_name() diff --git a/pfio/v2/http_cache.py b/pfio/v2/http_cache.py index 2784d09a..b7e2703e 100644 --- a/pfio/v2/http_cache.py +++ b/pfio/v2/http_cache.py @@ -51,7 +51,7 @@ def __init__(self, max_cache_size: int = 1024 * 1024 * 1024, bearer_token_path: Optional[str] = None): assert not isinstance(fs, HTTPCachedFS) - super().__init__() + super().__init__(scheme=fs.scheme) self.fs = fs self.trace = self.fs.is_traced diff --git a/pfio/v2/local.py b/pfio/v2/local.py index 6d9f0a47..54214352 100644 --- a/pfio/v2/local.py +++ b/pfio/v2/local.py @@ -70,8 +70,8 @@ def __init__(self, _stat, filename): class Local(FS): - def __init__(self, cwd=None, trace=False, create=False, **_): - super().__init__() + def __init__(self, cwd=None, trace=False, create=False, scheme=None, **_): + super().__init__(scheme=scheme) self.trace = trace diff --git a/pfio/v2/pathlib.py b/pfio/v2/pathlib.py index e9bec563..3ac52d7c 100644 --- a/pfio/v2/pathlib.py +++ b/pfio/v2/pathlib.py @@ -95,7 +95,6 @@ class PurePath(PathLike): Args: args: construct paths. fs: target file system. - scheme: specify URL scheme. (for `as_uri` method) Note: It conforms to `pathlib.PurePosixPath` of Python 3.12 specification. @@ -103,32 +102,16 @@ class PurePath(PathLike): this class not inherits any `pathlib` classes because pfio filesystems is not suitable for pathlib abstact and helper classes. - - TODO: - `scheme` should moves to `FS`. """ def __init__( self, *args: Union[str, PathLike], fs: FS, - scheme: Optional[str] = None, ) -> None: - if isinstance(fs, Local): - scheme = scheme or "file" - elif isinstance(fs, S3): - scheme = scheme or "s3" - elif isinstance(fs, Hdfs): - scheme = scheme or "hdfs" - elif isinstance(fs, Zip): - scheme = scheme or "" - else: - raise ValueError(f"unsupported FS: {fs}") - self._fs: FS = fs - self._scheme = scheme self._pure = PurePosixPath(*args) - self._hash = hash(self._pure) + hash(self._fs) + hash(self._scheme) + self._hash = hash(self._pure) + hash(self._fs) + hash(self.scheme) @property def sep(self) -> str: @@ -136,7 +119,7 @@ def sep(self) -> str: @property def scheme(self) -> str: - return self._scheme + return self._fs.scheme def __hash__(self) -> int: return self._hash @@ -372,7 +355,7 @@ def with_segments( self: SelfPurePathType, *args: Union[str, PathLike], ) -> SelfPurePathType: - return type(self)(*args, fs=self._fs, scheme=self.scheme) + return type(self)(*args, fs=self._fs) class Path(PurePath): @@ -382,7 +365,6 @@ class Path(PurePath): Args: args: construct paths. fs: target file system. - scheme: specify URL scheme. (for `as_uri` method) Note: many methods raise `NotImplementedError` @@ -397,9 +379,8 @@ def __init__( self, *args: str, fs: FS, - scheme: Optional[str] = None, ) -> None: - super().__init__(*args, fs=fs, scheme=scheme) + super().__init__(*args, fs=fs) def _as_relative_to_fs(self) -> str: return self.as_posix().removeprefix(self.anchor) diff --git a/pfio/v2/s3.py b/pfio/v2/s3.py index c078169b..8435043b 100644 --- a/pfio/v2/s3.py +++ b/pfio/v2/s3.py @@ -357,10 +357,11 @@ def __init__(self, bucket, prefix=None, create=False, connect_timeout=None, read_timeout=None, + scheme=None, _skip_connect=None, # For test purpose trace=False, **_): - super().__init__() + super().__init__(scheme=scheme) self.trace = trace diff --git a/pfio/v2/zip.py b/pfio/v2/zip.py index d3a1d498..acc3b8a8 100644 --- a/pfio/v2/zip.py +++ b/pfio/v2/zip.py @@ -79,7 +79,7 @@ class Zip(FS): def __init__(self, backend, file_path, mode='r', create=False, local_cache=False, local_cachedir=None, trace=False, **kwargs): - super().__init__() + super().__init__(scheme=backend.scheme) self.backend = backend self.file_path = file_path self.mode = mode diff --git a/tests/v2_tests/test_custom_scheme.py b/tests/v2_tests/test_custom_scheme.py index b205de2d..cc6c559a 100644 --- a/tests/v2_tests/test_custom_scheme.py +++ b/tests/v2_tests/test_custom_scheme.py @@ -15,9 +15,11 @@ def test_ini(): with pfio.v2.from_url('foobar://pfio/') as fs: assert isinstance(fs, pfio.v2.Local) + assert fs.scheme == "foobar" with pfio.v2.from_url('baz://pfio/', _skip_connect=True) as s3: assert isinstance(s3, pfio.v2.S3) + assert s3.scheme == "baz" assert 'https://s3.example.com' == s3.kwargs['endpoint_url'] assert 'hoge' == s3.kwargs['aws_access_key_id'] @@ -58,9 +60,11 @@ def test_add_custom_scheme(): with pfio.v2.from_url('foobar2://pfio/') as fs: assert isinstance(fs, pfio.v2.Local) + assert fs.scheme == "foobar2" with pfio.v2.from_url('baz2://pfio/', _skip_connect=True) as s3: assert isinstance(s3, pfio.v2.S3) + assert s3.scheme == "baz2" assert 'https://s3.example.com' == s3.kwargs['endpoint_url'] assert 'hoge' == s3.kwargs['aws_access_key_id'] diff --git a/tests/v2_tests/test_hdfs.py b/tests/v2_tests/test_hdfs.py index 523cc5a3..925260d5 100644 --- a/tests/v2_tests/test_hdfs.py +++ b/tests/v2_tests/test_hdfs.py @@ -55,6 +55,10 @@ def test_repr_str(self): repr(fs) str(fs) + def test_scheme(self): + with Hdfs(self.dirname, scheme="hdfs") as fs: + assert fs.scheme == "hdfs" + def test_read_non_exist(self): non_exist_file = "non_exist_file.txt" diff --git a/tests/v2_tests/test_http_cache.py b/tests/v2_tests/test_http_cache.py index d47dbf7a..850add81 100644 --- a/tests/v2_tests/test_http_cache.py +++ b/tests/v2_tests/test_http_cache.py @@ -73,6 +73,8 @@ def test_httpcache_simple(target): with gen_fs(target) as underlay: fs = HTTPCachedFS(http_cache, underlay) + + assert fs.scheme == underlay.scheme with fs.open(filename, mode="wb") as fp: fp.write(content) with fs.open(filename, mode="rb") as fp: @@ -96,6 +98,8 @@ def test_httpcache_too_large(): with gen_fs("local") as underlay: fs = HTTPCachedFS(http_cache, underlay) + assert fs.scheme == underlay.scheme + with fs.open(filename, mode="wb") as fp: for _ in range(1024 + 1): # 1 MB exceeds fp.write(one_mb_array) @@ -137,6 +141,8 @@ def test_httpcache_zipfile_flat(target): with gen_fs(target) as underlay: with underlay.open_zip(zipfilename, mode="w") as zipfs: fs = HTTPCachedFS(http_cache, zipfs) + assert fs.scheme == underlay.scheme + with fs.open(filename1, mode="wb") as fp: fp.write(filecontent1) with fs.open(filename2, mode="wb") as fp: @@ -146,6 +152,8 @@ def test_httpcache_zipfile_flat(target): with underlay.open_zip(zipfilename, mode="r") as zipfs: fs = HTTPCachedFS(http_cache, zipfs) + assert fs.scheme == underlay.scheme + with fs.open(filename1, mode="rb") as fp: assert fp.read(-1) == filecontent1 with fs.open(filename2, mode="rb") as fp: @@ -180,6 +188,8 @@ def test_httpcache_zipfile_archived(target): cached_fs = HTTPCachedFS(http_cache, underlay) with cached_fs.open_zip(zipfilename, mode="w") as fs: + assert cached_fs.scheme == underlay.scheme + with fs.open(filename1, mode="wb") as fp: fp.write(filecontent1) with fs.open(filename2, mode="wb") as fp: @@ -188,6 +198,8 @@ def test_httpcache_zipfile_archived(target): assert len(cache_content) == 0 with cached_fs.open_zip(zipfilename, mode="r") as fs: + assert cached_fs.scheme == underlay.scheme + with fs.open(filename1, mode="rb") as fp: assert fp.read(-1) == filecontent1 with fs.open(filename2, mode="rb") as fp: diff --git a/tests/v2_tests/test_local.py b/tests/v2_tests/test_local.py index 9161465d..3bad06b0 100644 --- a/tests/v2_tests/test_local.py +++ b/tests/v2_tests/test_local.py @@ -27,6 +27,13 @@ def test_repr_str(self): str(fs) repr(fs) + def test_scheme(self): + with Local(self.testdir.name) as fs: + assert fs.scheme == "" + + with Local(self.testdir.name, scheme="file") as fs: + assert fs.scheme == "file" + def test_read_string(self): with Local() as fs: diff --git a/tests/v2_tests/test_pathlib.py b/tests/v2_tests/test_pathlib.py index 1a661901..eb7b7466 100644 --- a/tests/v2_tests/test_pathlib.py +++ b/tests/v2_tests/test_pathlib.py @@ -671,6 +671,17 @@ def test_path_iterdir(storage: str, path: str) -> None: assert sorted(actual_entries) == sorted(filtered) +def test_path_scheme_property(storage: str) -> None: + scheme = urlparse(storage).scheme or "file" + + with from_url(url=storage) as fs: + p = PurePath(fs=fs) + assert p.scheme == scheme + + q = Path(fs=fs) + assert q.scheme == scheme + + def test_unlink(storage: str) -> None: with from_url(url=storage) as fs: target = Path("my", fs=fs) diff --git a/tests/v2_tests/test_s3.py b/tests/v2_tests/test_s3.py index f98ded45..420bb939 100644 --- a/tests/v2_tests/test_s3.py +++ b/tests/v2_tests/test_s3.py @@ -55,6 +55,11 @@ def test_s3_repr_str(s3_fixture): str(s3) +def test_scheme(s3_fixture): + with from_url('s3://test-bucket/base', **s3_fixture.aws_kwargs) as s3: + assert s3.scheme == "s3" + + def test_s3_files(s3_fixture): with from_url('s3://test-bucket/base', **s3_fixture.aws_kwargs) as s3: diff --git a/tests/v2_tests/test_s3_zip.py b/tests/v2_tests/test_s3_zip.py index 89f109e3..5ec0e655 100644 --- a/tests/v2_tests/test_s3_zip.py +++ b/tests/v2_tests/test_s3_zip.py @@ -35,6 +35,7 @@ def test_s3_zip(): with from_url('s3://{}/test.zip'.format(bucket)) as z: assert isinstance(z, Zip) assert isinstance(z.fileobj, io.BufferedReader) + assert z.scheme == "s3" assert zipfile.is_zipfile(z.fileobj) with z.open('file', 'rb') as fp: @@ -43,6 +44,7 @@ def test_s3_zip(): with from_url('s3://{}/test.zip'.format(bucket), buffering=0) as z: assert isinstance(z, Zip) + assert z.scheme == "s3" assert 'buffering' in z.kwargs assert isinstance(z.fileobj, pfio.v2.s3._ObjectReader) diff --git a/tests/v2_tests/test_zip.py b/tests/v2_tests/test_zip.py index 07f63f24..c5f76b69 100644 --- a/tests/v2_tests/test_zip.py +++ b/tests/v2_tests/test_zip.py @@ -128,6 +128,10 @@ def test_repr_str(self): repr(z) str(z) + def test_scheme(self): + with local.open_zip(self.zip_file_path) as z: + assert z.scheme == local.scheme + def test_read_bytes(self): with local.open_zip(os.path.abspath(self.zip_file_path)) as z: with z.open(self.zipped_file_path, "rb") as zipped_file: From f0298ce7b0f073c8d280ebe44fb6cd25567469f1 Mon Sep 17 00:00:00 2001 From: "Masaki.Nakano" Date: Wed, 13 May 2026 11:31:11 +0900 Subject: [PATCH 2/2] add doc comment for FS.scheme. --- pfio/v2/fs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pfio/v2/fs.py b/pfio/v2/fs.py index ffe279f7..d2ca554f 100644 --- a/pfio/v2/fs.py +++ b/pfio/v2/fs.py @@ -97,6 +97,11 @@ def __init__(self, scheme=None): @property def scheme(self): + '''This property is used to identify the _nominal_ scheme. + + If the scheme is a custom scheme, this property returns the custom scheme name. + If the scheme is a standard scheme (hdfs, s3, fs, etc...), this property returns the standard scheme name. + ''' return self._scheme @property