diff --git a/README.md b/README.md index f5b27c6..4f0f367 100644 --- a/README.md +++ b/README.md @@ -124,8 +124,8 @@ The SDK automatically handles all dependency packaging for Data Cloud deployment ├── payload │ ├── config.json │ ├── entrypoint.py -├── files -│ ├── data.csv +│ ├── files +│ │ ├── data.csv ``` ## py-files directory @@ -137,10 +137,10 @@ Your Python dependencies can be packaged as .py files, .zip archives (containing ├── payload │ ├── config.json │ ├── entrypoint.py -├── py-files -│ ├── moduleA -│ │ ├── __init__.py -│ │ ├── moduleA.py +│ ├── py-files +│ │ ├── moduleA +│ │ │ ├── __init__.py +│ │ │ ├── moduleA.py ``` ## API @@ -148,7 +148,7 @@ Your Python dependencies can be packaged as .py files, .zip archives (containing Your entry point script will define logic using the `Client` object which wraps data access layers. You should only need the following methods: -* `find_file_path(file_name)` - Returns a file path +* `find_file_path(file_name)` – Resolve a bundled file (placed under `payload/files/`) to a `pathlib.Path` that exists. Works the same locally and inside Data Cloud — see [Bundled file resolution](#bundled-file-resolution) below for the full lookup order. Raises `FileNotFoundError` if the file isn't found. * `read_dlo(name)` – Read from a Data Lake Object by name * `read_dmo(name)` – Read from a Data Model Object by name * `write_to_dlo(name, spark_dataframe, write_mode)` – Write to a Data Model Object by name with a Spark dataframe @@ -169,6 +169,24 @@ client.write_to_dlo('output_DLO') > [!WARNING] > Currently we only support reading from DMOs and writing to DMOs or reading from DLOs and writing to DLOs, but they cannot mix. +### Bundled file resolution + +Place bundled files (CSVs, prompt files, etc.) under `payload/files/`. 
The same `client.find_file_path("data.csv")` call resolves consistently across all three runtimes: + +- `datacustomcode run` (local) → `<project_root>/payload/files/data.csv` +- Data Cloud script package → `$LIBRARY_PATH/files/data.csv` +- Data Cloud function package → `$LIBRARY_PATH/files/data.csv` + +Resolution order (first existing path wins): + +1. `$LIBRARY_PATH/files/<file_name>`, then `$LIBRARY_PATH/<file_name>` — when `LIBRARY_PATH` is set. Data Cloud sets this for you to the package root. +2. `payload/files/<file_name>` relative to the current working directory. +3. `<config_dir>/files/<file_name>` where `<config_dir>` is the directory of the nearest `config.json` discoverable by walking down from cwd. + +If none of these exist, `find_file_path` raises `FileNotFoundError` with the list of paths it tried. + +`$LIBRARY_PATH` is set automatically to the root of the package at runtime inside Data Cloud. + ## CLI diff --git a/src/datacustomcode/client.py b/src/datacustomcode/client.py index 9ad95be..596eeb7 100644 --- a/src/datacustomcode/client.py +++ b/src/datacustomcode/client.py @@ -221,8 +221,38 @@ def write_to_dmo( return self._writer.write_to_dmo(name, dataframe, write_mode, **kwargs) # type: ignore[no-any-return] def find_file_path(self, file_name: str) -> Path: - """Return a file path""" + """Resolve a bundled file shipped in the package to an absolute path. + + Resolution order (first existing path wins): + + 1. ``$LIBRARY_PATH/<file_folder>/<file_name>`` then + ``$LIBRARY_PATH/<file_name>`` — when the ``LIBRARY_PATH`` environment + variable is set. The Data Cloud runtime sets this to the directory + containing the extracted package. + 2. ``<code_package>/<file_folder>/<file_name>`` relative to the current + working directory — the default ``payload/files/<file_name>`` layout + used by ``datacustomcode run`` from a project root. + 3. ``<config_dir>/<file_folder>/<file_name>`` where ``<config_dir>`` is + the directory containing the nearest ``config.json`` discoverable + by walking the cwd subtree. 
+ ``LIBRARY_PATH`` must point to the directory that *contains* + ``files/`` — i.e., the package root, the same directory that holds + ``config.json`` and ``entrypoint.py``. See + ``docs/byoc_runtime_contract.md`` for the full runtime contract. + Args: + file_name: A file under the package's ``files/`` folder. Relative + subpaths (e.g., ``"file/data2.csv"``) are supported. + + Returns: + A ``pathlib.Path`` that exists. + + Raises: + FileNotFoundError: If the file does not exist at any of the + resolution-order locations. The message lists every candidate + path that was tried. + """ return self._file.find_file_path(file_name) # type: ignore[no-any-return] def _validate_data_layer_history_does_not_contain( diff --git a/src/datacustomcode/file/path/default.py b/src/datacustomcode/file/path/default.py index 96d2f51..3c120d7 100644 --- a/src/datacustomcode/file/path/default.py +++ b/src/datacustomcode/file/path/default.py @@ -16,7 +16,7 @@ import os from pathlib import Path -from typing import Optional +from typing import Iterator, Optional from datacustomcode.file.base import BaseDataAccessLayer @@ -66,7 +66,7 @@ def find_file_path(self, file_name: str) -> Path: file_name: The name of the file to open Returns: - A file path + A file path that exists Raises: FileNotFoundError: If the file cannot be found @@ -74,46 +74,40 @@ def find_file_path(self, file_name: str) -> Path: if not file_name: raise ValueError("file_name cannot be empty") - file_path = self._resolve_file_path(file_name) + tried: list[Path] = [] + for candidate in self._candidate_paths(file_name): + tried.append(candidate) + if candidate.exists(): + return candidate - if not file_path.exists(): - raise FileNotFoundError( - f"File '{file_name}' not found in any search location" - ) + raise FileNotFoundError( + f"File '{file_name}' not found in any search location. 
" + f"Tried: {[str(p) for p in tried]}" + ) - return file_path - - def _resolve_file_path(self, file_name: str) -> Path: - """Resolve the full path to a file. + def _candidate_paths(self, file_name: str) -> Iterator[Path]: + """Yield candidate paths for ``file_name`` in resolution order. Args: file_name: The name of the file to resolve Returns: - The full path to the file + An iterator of candidate paths """ - # First check if environment variable is set + # 1. $LIBRARY_PATH/<file_folder>/<file_name>, then $LIBRARY_PATH/<file_name> env_path = os.getenv(self.DEFAULT_ENV_VAR) if env_path: - file_path = Path(env_path) / file_name - if file_path.exists(): - return file_path + yield Path(env_path) / self.file_folder / file_name + yield Path(env_path) / file_name - # First try the default code package location + # 2. <code_package>/<file_folder>/<file_name> relative to cwd if self._code_package_exists(): - file_path = self._get_code_package_file_path(file_name) - if file_path.exists(): - return file_path + yield self._get_code_package_file_path(file_name) - # Fall back to config.json-based location + # 3. <config_dir>/<file_folder>/<file_name> via config.json discovery config_path = self._find_config_file() - if config_path: - file_path = self._get_config_based_file_path(file_name, config_path) - if file_path.exists(): - return file_path - - # Return the file name as a Path if not found in any location - return Path(file_name) + if config_path is not None: + yield self._get_config_based_file_path(file_name, config_path) def _code_package_exists(self) -> bool: """Check if the default code package directory exists. @@ -146,6 +140,10 @@ def _find_config_file(self) -> Optional[Path]: def _get_config_based_file_path(self, file_name: str, config_path: Path) -> Path: """Get the file path relative to the config file location. + Anchors on the directory containing the discovered ``config.json`` so a + package found by walking down from cwd resolves files relative to its own + root, not the caller's cwd. 
+ Args: file_name: The name of the file config_path: The path to the config file @@ -153,8 +151,7 @@ def _get_config_based_file_path(self, file_name: str, config_path: Path) -> Path Returns: The full path to the file """ - relative_path = f"{self.file_folder}/{file_name}" - return Path(relative_path) + return config_path.parent / self.file_folder / file_name def _find_file_in_tree(self, filename: str, search_path: Path) -> Optional[Path]: """Find a file within a directory tree. diff --git a/tests/file/test_path_default.py b/tests/file/test_path_default.py index 8350122..f6f8f78 100644 --- a/tests/file/test_path_default.py +++ b/tests/file/test_path_default.py @@ -51,6 +51,79 @@ def test_init_with_custom_values(self): assert finder.file_folder == "custom_files" assert finder.config_file == "custom_config.json" + def test_resolve_library_path_files_subdir(self, tmp_path, monkeypatch): + """$LIBRARY_PATH// resolves the BYOC layout.""" + files_dir = tmp_path / "files" + files_dir.mkdir() + target = files_dir / "data1.csv" + target.write_text("hello") + monkeypatch.setenv("LIBRARY_PATH", str(tmp_path)) + + finder = DefaultFindFilePath() + assert finder.find_file_path("data1.csv") == target + + def test_resolve_library_path_root_fallback(self, tmp_path, monkeypatch): + """Fall back to $LIBRARY_PATH/ when files/ is missing.""" + target = tmp_path / "data1.csv" + target.write_text("hello") + monkeypatch.setenv("LIBRARY_PATH", str(tmp_path)) + + finder = DefaultFindFilePath() + assert finder.find_file_path("data1.csv") == target + + def test_resolve_library_path_subpath_under_files(self, tmp_path, monkeypatch): + """Relative subpaths like 'file/data2.csv' resolve under $LIBRARY_PATH/files.""" + nested = tmp_path / "files" / "file" + nested.mkdir(parents=True) + target = nested / "data2.csv" + target.write_text("hello") + monkeypatch.setenv("LIBRARY_PATH", str(tmp_path)) + + finder = DefaultFindFilePath() + assert finder.find_file_path("file/data2.csv") == target + + def 
test_local_run_payload_files_default_layout(self, tmp_path, monkeypatch): + """AC1: local-run resolves payload/files/ with no LIBRARY_PATH set. + + Mirrors ``datacustomcode run payload/entrypoint.py`` from a freshly + ``init``ed package. + """ + monkeypatch.delenv("LIBRARY_PATH", raising=False) + package_dir = tmp_path / "my_package" + files_dir = package_dir / "payload" / "files" + files_dir.mkdir(parents=True) + target = files_dir / "data1.csv" + target.write_text("hello") + monkeypatch.chdir(package_dir) + + finder = DefaultFindFilePath() + result = finder.find_file_path("data1.csv") + + assert result.resolve() == target.resolve() + + def test_resolve_config_based_anchors_on_config_dir(self, tmp_path, monkeypatch): + """config.json discovery anchors on the config's parent, not cwd. + + ``_find_config_file`` walks down from cwd via ``rglob``, so we put cwd + at an ancestor of the package. The file lives only under + ``<config_dir>/files/`` — a cwd-relative ``files/`` would miss it. + """ + monkeypatch.delenv("LIBRARY_PATH", raising=False) + package_dir = tmp_path / "pkg" + files_dir = package_dir / "files" + files_dir.mkdir(parents=True) + (package_dir / "config.json").write_text("{}") + target = files_dir / "data1.csv" + target.write_text("hello") + + monkeypatch.chdir(tmp_path) + + # Use a code_package that doesn't exist relative to tmp_path, so step 2 + # is skipped and resolution falls through to config.json discovery. 
+ finder = DefaultFindFilePath(code_package="nonexistent_pkg") + result = finder.find_file_path("data1.csv") + assert result.resolve() == target.resolve() + def test_find_file_path_empty_filename(self): """Test find_file_path with empty filename raises ValueError.""" finder = DefaultFindFilePath() @@ -65,10 +138,10 @@ def test_find_file_path_file_not_found(self): """Test find_file_path when file doesn't exist raises FileNotFoundError.""" finder = DefaultFindFilePath() - with patch.object(finder, "_resolve_file_path") as mock_resolve: + with patch.object(finder, "_candidate_paths") as mock_candidates: mock_path = MagicMock() mock_path.exists.return_value = False - mock_resolve.return_value = mock_path + mock_candidates.return_value = iter([mock_path]) with pytest.raises( FileNotFoundError, @@ -80,33 +153,34 @@ def test_find_file_path_success(self): """Test find_file_path when file exists returns Path.""" finder = DefaultFindFilePath() - with patch.object(finder, "_resolve_file_path") as mock_resolve: + with patch.object(finder, "_candidate_paths") as mock_candidates: mock_path = MagicMock() mock_path.exists.return_value = True - mock_resolve.return_value = mock_path + mock_candidates.return_value = iter([mock_path]) result = finder.find_file_path("test.txt") assert result == mock_path - mock_resolve.assert_called_once_with("test.txt") + mock_candidates.assert_called_once_with("test.txt") - def test_resolve_file_path_env_var_set_file_exists(self): - """Test _resolve_file_path when environment variable is set and file exists.""" + def test_find_file_path_env_var_set_file_exists(self): + """find_file_path returns $LIBRARY_PATH/files/ when present.""" finder = DefaultFindFilePath() with tempfile.TemporaryDirectory() as temp_dir: - test_file = Path(temp_dir) / "test.txt" + files_dir = Path(temp_dir) / "files" + files_dir.mkdir() + test_file = files_dir / "test.txt" test_file.write_text("test content") with patch.dict(os.environ, {finder.DEFAULT_ENV_VAR: str(temp_dir)}): 
- result = finder._resolve_file_path("test.txt") + result = finder.find_file_path("test.txt") assert result == test_file assert result.exists() - def test_resolve_file_path_env_var_set_file_not_found(self): - """Test _resolve_file_path when environment variable is set but file not found, - falls back to code package.""" + def test_find_file_path_env_var_set_falls_through_to_code_package(self): + """When $LIBRARY_PATH has no match, resolution falls through to code_package.""" finder = DefaultFindFilePath() with tempfile.TemporaryDirectory() as temp_dir: @@ -122,68 +196,33 @@ def test_resolve_file_path_env_var_set_file_not_found(self): mock_path.exists.return_value = True mock_get_path.return_value = mock_path - result = finder._resolve_file_path("test.txt") + result = finder.find_file_path("test.txt") assert result == mock_path mock_exists.assert_called_once() mock_get_path.assert_called_once_with("test.txt") - def test_resolve_file_path_env_var_not_set(self): - """Test _resolve_file_path when environment variable is not set, - uses normal flow.""" + def test_find_file_path_env_var_not_set_uses_code_package(self, monkeypatch): + """With LIBRARY_PATH unset, code_package is the next candidate.""" + monkeypatch.delenv("LIBRARY_PATH", raising=False) finder = DefaultFindFilePath() - # Ensure env var is not set - env_backup = os.environ.pop(finder.DEFAULT_ENV_VAR, None) - try: - with patch.object( - finder, "_code_package_exists", return_value=True - ) as mock_exists: - with patch.object( - finder, "_get_code_package_file_path" - ) as mock_get_path: - mock_path = MagicMock() - mock_path.exists.return_value = True - mock_get_path.return_value = mock_path - - result = finder._resolve_file_path("test.txt") - - assert result == mock_path - mock_exists.assert_called_once() - mock_get_path.assert_called_once_with("test.txt") - finally: - if env_backup is not None: - os.environ[finder.DEFAULT_ENV_VAR] = env_backup - - def test_resolve_file_path_code_package_exists(self): - 
"""Test _resolve_file_path when code package exists and file is found.""" - finder = DefaultFindFilePath() + with patch.object( + finder, "_code_package_exists", return_value=True + ) as mock_exists: + with patch.object(finder, "_get_code_package_file_path") as mock_get_path: + mock_path = MagicMock() + mock_path.exists.return_value = True + mock_get_path.return_value = mock_path - # Ensure env var is not set to test normal flow - env_backup = os.environ.pop(finder.DEFAULT_ENV_VAR, None) - try: - with patch.object( - finder, "_code_package_exists", return_value=True - ) as mock_exists: - with patch.object( - finder, "_get_code_package_file_path" - ) as mock_get_path: - mock_path = MagicMock() - mock_path.exists.return_value = True - mock_get_path.return_value = mock_path - - result = finder._resolve_file_path("test.txt") - - assert result == mock_path - mock_exists.assert_called_once() - mock_get_path.assert_called_once_with("test.txt") - finally: - if env_backup is not None: - os.environ[finder.DEFAULT_ENV_VAR] = env_backup - - def test_resolve_file_path_code_package_exists_file_not_found(self): - """Test _resolve_file_path when code package exists but file not found, - falls back to config.""" + result = finder.find_file_path("test.txt") + + assert result == mock_path + mock_exists.assert_called_once() + mock_get_path.assert_called_once_with("test.txt") + + def test_find_file_path_code_package_exists_falls_through_to_config(self): + """When code_package candidate is missing, config.json discovery runs.""" finder = DefaultFindFilePath() with patch.object(finder, "_code_package_exists", return_value=True): @@ -205,7 +244,7 @@ def test_resolve_file_path_code_package_exists_file_not_found(self): mock_config_file_path.exists.return_value = True mock_get_config_path.return_value = mock_config_file_path - result = finder._resolve_file_path("test.txt") + result = finder.find_file_path("test.txt") assert result == mock_config_file_path 
mock_find_config.assert_called_once() @@ -213,16 +252,32 @@ def test_resolve_file_path_code_package_exists_file_not_found(self): "test.txt", mock_config_path ) - def test_resolve_file_path_fallback_to_filename(self): - """Test _resolve_file_path falls back to Path(filename) - when no other location works.""" + def test_find_file_path_no_candidates_raises(self, monkeypatch): + """When no candidate paths exist, find_file_path raises FileNotFoundError.""" + monkeypatch.delenv("LIBRARY_PATH", raising=False) finder = DefaultFindFilePath() with patch.object(finder, "_code_package_exists", return_value=False): with patch.object(finder, "_find_config_file", return_value=None): - result = finder._resolve_file_path("test.txt") + with pytest.raises(FileNotFoundError): + finder.find_file_path("test.txt") + + def test_find_file_path_error_lists_tried_locations(self, tmp_path, monkeypatch): + """FileNotFoundError lists every candidate location that was tried.""" + env_dir = tmp_path / "env" + env_dir.mkdir() + monkeypatch.setenv("LIBRARY_PATH", str(env_dir)) + + finder = DefaultFindFilePath() + with pytest.raises(FileNotFoundError) as exc_info: + finder.find_file_path("missing.txt") - assert result == Path("test.txt") + message = str(exc_info.value) + assert "missing.txt" in message + assert "Tried:" in message + # LIBRARY_PATH candidates should appear + assert str(env_dir / "files" / "missing.txt") in message + assert str(env_dir / "missing.txt") in message def test_code_package_exists_true(self): """Test _code_package_exists returns True when directory exists.""" @@ -281,23 +336,23 @@ def test_find_config_file_not_found(self): assert result is None def test_get_config_based_file_path(self): - """Test _get_config_based_file_path constructs correct path.""" + """_get_config_based_file_path anchors on the discovered config dir.""" finder = DefaultFindFilePath() config_path = Path("/some/path/config.json") result = finder._get_config_based_file_path("test.txt", config_path) - 
expected = Path("files/test.txt") + expected = Path("/some/path/files/test.txt") assert result == expected def test_get_config_based_file_path_custom_folder(self): - """Test _get_config_based_file_path with custom file folder.""" + """_get_config_based_file_path uses custom file_folder under config dir.""" finder = DefaultFindFilePath(file_folder="custom_files") config_path = Path("/some/path/config.json") result = finder._get_config_based_file_path("test.txt", config_path) - expected = Path("custom_files/test.txt") + expected = Path("/some/path/custom_files/test.txt") assert result == expected def test_find_file_in_tree_found(self):