From 3c66fc4d8efe9f9c481fe009853fb27aecd96615 Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:20:17 -0400 Subject: [PATCH 1/8] use lazy imports for pandas and numpy --- hamilton/base.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/hamilton/base.py b/hamilton/base.py index 02241aaca..4fa5050c8 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -19,15 +19,13 @@ It should only import hamilton.node, numpy, pandas. It cannot import hamilton.graph, or hamilton.driver. """ +from __future__ import annotations import abc import collections import logging -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union, TYPE_CHECKING -import numpy as np -import pandas as pd -from pandas.core.indexes import extension as pd_extension from hamilton.lifecycle import api as lifecycle_api @@ -36,6 +34,11 @@ except ImportError: import node +if TYPE_CHECKING: + import numpy as np + import pandas as pd + + logger = logging.getLogger(__name__) @@ -120,6 +123,8 @@ def pandas_index_types( :param outputs: the dict we're trying to create a result from. :return: dict of all index types, dict of time series/categorical index types, dict if there is no index """ + import pandas as pd + all_index_types = collections.defaultdict(list) time_indexes = collections.defaultdict(list) no_indexes = collections.defaultdict(list) @@ -131,6 +136,8 @@ def index_key_name(pd_object: Union[pd.DataFrame, pd.Series]) -> str: def get_parent_time_index_type(): """Helper to pull the right time index parent class.""" + from pandas.core.indexes import extension as pd_extension + if hasattr(pd_extension, "NDArrayBackedExtensionIndex"): index_type = pd_extension.NDArrayBackedExtensionIndex else: @@ -220,6 +227,8 @@ def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: :param outputs: the outputs to build a dataframe from. 
""" + import pandas as pd + # TODO check inputs are pd.Series, arrays, or scalars -- else error output_index_type_tuple = PandasDataFrameResult.pandas_index_types(outputs) # this next line just log warnings @@ -255,6 +264,7 @@ def build_dataframe_with_dataframes(outputs: Dict[str, Any]) -> pd.DataFrame: :param outputs: The outputs to build the dataframe from. :return: A dataframe with the outputs. """ + import pandas as pd def get_output_name(output_name: str, column_name: str) -> str: """Add function prefix to columns. @@ -300,6 +310,7 @@ def input_types(self) -> List[Type[Type]]: return [Any] def output_type(self) -> Type: + import pandas as pd return pd.DataFrame @@ -365,6 +376,7 @@ def build_result(**outputs: Dict[str, Any]) -> np.matrix: :param outputs: function_name -> np.array. :return: numpy matrix """ + import numpy as np # TODO check inputs are all numpy arrays/array like things -- else error num_rows = -1 columns_with_lengths = collections.OrderedDict() @@ -402,6 +414,7 @@ def input_types(self) -> List[Type[Type]]: return [Any] # Typing def output_type(self) -> Type: + import pandas as pd return pd.DataFrame From 588638ce91f7dc0dff7b7935e41a1d911e3c05ec Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:20:58 -0400 Subject: [PATCH 2/8] use absolute import as fallback; required by proxy --- hamilton/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hamilton/__init__.py b/hamilton/__init__.py index a302407d2..4d3fd39f9 100644 --- a/hamilton/__init__.py +++ b/hamilton/__init__.py @@ -1,7 +1,7 @@ try: from .version import VERSION as __version__ # noqa: F401 except ImportError: - from version import VERSION as __version__ # noqa: F401 + from hamilton.version import VERSION as __version__ # noqa: F401 # this supposedly is required for namespace packages to work. 
__path__ = __import__("pkgutil").extend_path(__path__, __name__) From c1deb44066074182e5d0dac0e237e3eec9093217 Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:38:25 -0400 Subject: [PATCH 3/8] add README explanations; wip --- hamilton-core/README.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 hamilton-core/README.md diff --git a/hamilton-core/README.md b/hamilton-core/README.md new file mode 100644 index 000000000..52accb309 --- /dev/null +++ b/hamilton-core/README.md @@ -0,0 +1,41 @@ +# Read carefully + +> Use at your own risk + +This directory contains code for the package `sf-hamilton-core`. It is a drop-in replacement for `sf-hamilton`, with two changes: +- disable plugin autoloading +- make `pandas` and `numpy` optional dependencies; and remove the `networkx` dependency (currently unused). + +This makes the Hamilton package a much lighter install and addresses long library loading times. + +## As a user +If you want to try `sf-hamilton-core`, you need to: +1. Remove your current Hamilton installation: `pip uninstall sf-hamilton` +2. Install Hamilton core: `pip install sf-hamilton-core` +3. Check the installation: `pip list` should only include `sf-hamilton-core`. + +This will install a different Python package with the name `hamilton`, with fewer dependencies and plugin autoloading disabled. + +It should be a drop-in replacement and your existing Hamilton code should just work. However, if you're relying on plugins (e.g., parquet materializers, dataframe result builders), you will need to manually load them. + + +## How does it work + + +## Why is another package `sf-hamilton` necessary +This exists to prevent backwards-incompatible changes for people who `pip install sf-hamilton` and use it in production. It is a temporary solution until a major release `sf-hamilton==2.0.0` could allow breaking changes and a more robust solution. 
+ +### Disable plugin autoloading +Hamilton has a generous number of plugins (`pandas`, `polars`, `mlflow`, `spark`). To give a good user experience, Hamilton autoloads plugins based on the available Python libraries in the current Python environment. For example, `to.mlflow()` becomes available if `mlflow` is installed. Autoloaded features notably include materializers like `from_.parquet` and `to.parquet` and data validators (pydantic, pandera, etc.). + +The issue with this approach is that Python environments with a lot of dependencies, common in data science, can be very slow to start because of all the imports. Currently, Hamilton allows disabling autoloading via a user config or Python code. This requires manual setup and is not the best default for some users. + +### `pandas` and `numpy` dependencies +Hamilton was initially created for workflows that used `pandas` and `numpy` heavily. For this reason, `numpy` and `pandas` are imported at the top level of the module `hamilton.base`. Because of the package structure, as a Hamilton user, you're importing `pandas` and `numpy` every time you import `hamilton`. + +A reasonable change would be to move `numpy` and `pandas` to a "lazy" location. Then, dependencies would only be imported when features requiring them are used and they could be removed from `pyproject.toml`. Unfortunately, plugin autoloading defaults make this solution a significant breaking change and unsatisfactory. + +Since plugins are loaded based on the Python packages available, removing `pandas` and `numpy` would disable the loading of these plugins. This would break popular CSV and parquet materializers. + +### `networkx` dependency +The `sf-hamilton[visualization]` extra currently includes `networkx` as a dependency, though it is never actually used. There's a single function requiring it and it could be implemented in pure Python. This has been made even easier with the addition of `graphlib` in the standard library in Python 3.9. 
From c63886b7a7da6f56f7b16a071c10e85f4a8e6803 Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:38:52 -0400 Subject: [PATCH 4/8] dynamically define package using setup.py from pyproject.toml --- hamilton-core/setup.py | 52 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 hamilton-core/setup.py diff --git a/hamilton-core/setup.py b/hamilton-core/setup.py new file mode 100644 index 000000000..da780c52a --- /dev/null +++ b/hamilton-core/setup.py @@ -0,0 +1,52 @@ + +import tomllib +import pathlib +import re +from setuptools import setup + +def get_version(): + version_path = pathlib.Path(__file__).parent / "hamilton" / "_hamilton" / "version.py" + content = version_path.read_text() + match = re.search(r'^VERSION\s*=\s*\(([^)]+)\)', content, re.MULTILINE) + if match: + version_tuple_str = match.group(1) # "1, 88, 0" + # Parse tuple string into list of integers + version_parts = [part.strip() for part in version_tuple_str.split(",")] + version_str = ".".join(version_parts) + return version_str + +pyproject_path = pathlib.Path(__file__).parents[1] / "pyproject.toml" +pyproject = tomllib.loads(pyproject_path.read_text()) +project = pyproject["project"] + +readme_file = project.get("readme", None) +console_scripts = [ + f"{name}={target}" for name, target in project.get("entry-points", {}).get("console_scripts", {}).items() +] +install_requires = list( + set(project.get("dependencies", [])).difference(set(["pandas", "numpy"])) +) +extras_require = { + **project.get("optional-dependencies", {}), + **{"visualization": ["graphviz"]}, # drop networkx +} + +setup( + name="sf-hamilton-core", + version=get_version(), + description=project.get("description", ""), + long_description=pathlib.Path(readme_file).read_text() if readme_file else "", + long_description_content_type="text/markdown" if readme_file else None, + python_requires=project.get("requires-python", None), + 
license=project.get("license", {}).get("text", None), + keywords=project.get("keywords", []), + author=", ".join(a["name"] for a in project.get("authors", [])), + author_email=", ".join(a["email"] for a in project.get("authors", [])), + classifiers=project.get("classifiers", []), + install_requires=install_requires, + extras_require=extras_require, + entry_points={"console_scripts": console_scripts}, + project_urls=project.get("urls", {}), + packages=["hamilton"], + package_data={"hamilton": ["*.json", "*.md", "*.txt"]}, +) From 543d2672aaac33a455a5ea712c98319dddce7b40 Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:39:22 -0400 Subject: [PATCH 5/8] implement module proxying logic --- hamilton-core/hamilton/__init__.py | 68 ++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 hamilton-core/hamilton/__init__.py diff --git a/hamilton-core/hamilton/__init__.py b/hamilton-core/hamilton/__init__.py new file mode 100644 index 000000000..3fdd2f4a0 --- /dev/null +++ b/hamilton-core/hamilton/__init__.py @@ -0,0 +1,68 @@ +import importlib.util +import pathlib +import sys +from typing import Any +from types import ModuleType + + +def _load_hamilton_module() -> ModuleType: + """Patch this relative import in the Hamilton core repository + + ```python + # hamilton/__init__.py + try: + from .version import VERSION as __version__ # noqa: F401 + except ImportError: + from version import VERSION as __version__ # noqa: F401 + ``` + """ + + origin_path = pathlib.Path(__file__).parent / "_hamilton" / "__init__.py" + origin_spec = importlib.util.spec_from_file_location("hamilton", origin_path) + origin_module = importlib.util.module_from_spec(origin_spec) + + # The following lines are only required if we don't modify `hamilton/__init__.py` + # source_segment = "from version import VERSION as __version__" + # # the namespace `hamilton._hamilton` is only temporarily available; it will be removed + # # by 
the end of this initialization + # patched_segment = "from hamilton._hamilton.version import VERSION as __version__" + + # source_code = pathlib.Path(origin_path).read_text() + # patched_code = source_code.replace(source_segment, patched_segment) + + # exec(patched_code, origin_module.__dict__) + # sys.modules["hamilton"] = origin_module + + origin_spec.loader.exec_module(origin_module) + return origin_module + + +def _load_hamilton_registry_module(): + module_path = pathlib.Path(__file__).parent / "_hamilton" / "registry.py" + module_spec = importlib.util.spec_from_file_location("hamilton.registry", module_path) + module = importlib.util.module_from_spec(module_spec) + module_spec.loader.exec_module(module) + return module + + +def _create_proxy_module() -> ModuleType: + proxy_module = ModuleType(__name__) + sys.modules[__name__] = proxy_module + return proxy_module + + +_registry_module = _load_hamilton_registry_module() +# disable plugin autoloading +_registry_module.disable_autoload() + +_origin_module = _load_hamilton_module() +_proxy_module = _create_proxy_module() + +def __getattr__(name: str) -> Any: + try: + return getattr(_origin_module, name) + except AttributeError: + raise AttributeError(f"module {__name__} has no attribute {name}") + +# `getattr()` must be available to build the package +_proxy_module.__getattr__ = __getattr__ From 399b4a7020919089fac603608263383b2f13e960 Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:57:10 -0400 Subject: [PATCH 6/8] setup.py now copies the main source code --- hamilton-core/.gitignore | 1 + hamilton-core/setup.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 hamilton-core/.gitignore diff --git a/hamilton-core/.gitignore b/hamilton-core/.gitignore new file mode 100644 index 000000000..750d8503c --- /dev/null +++ b/hamilton-core/.gitignore @@ -0,0 +1 @@ +hamilton/_hamilton diff --git a/hamilton-core/setup.py 
b/hamilton-core/setup.py index da780c52a..0a4dfa056 100644 --- a/hamilton-core/setup.py +++ b/hamilton-core/setup.py @@ -2,8 +2,38 @@ import tomllib import pathlib import re +import os +import shutil +import sys from setuptools import setup +# ensure the right current working directory +os.chdir(os.path.abspath(os.path.dirname(__file__))) + +def copy_hamilton_library(): + setup_dir = pathlib.Path(__file__).resolve().parent + source_dir = (setup_dir.parent / 'hamilton').resolve() + dest_dir = (setup_dir / 'hamilton' / '_hamilton').resolve() + + # Safety checks + if not source_dir.is_dir(): + print(f"Error: Source directory does not exist: {source_dir}") + sys.exit(1) + + if not str(dest_dir).startswith(str(setup_dir)): + print(f"Error: Destination directory {dest_dir} is outside the setup directory {setup_dir}") + sys.exit(1) + + # Remove destination if it exists to avoid errors or stale files + if dest_dir.exists(): + print("delete: ", dest_dir) + shutil.rmtree(dest_dir) + + # Copy entire directory tree from source to destination + print(f"copy from: {source_dir}; to {dest_dir}") + shutil.copytree(source_dir, dest_dir) + + def get_version(): version_path = pathlib.Path(__file__).parent / "hamilton" / "_hamilton" / "version.py" content = version_path.read_text() @@ -15,6 +45,8 @@ def get_version(): version_str = ".".join(version_parts) return version_str +copy_hamilton_library() + pyproject_path = pathlib.Path(__file__).parents[1] / "pyproject.toml" pyproject = tomllib.loads(pyproject_path.read_text()) project = pyproject["project"] From 6d4fbfaa247a33dc2ac81dab5ca3b8ecf6d48982 Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Fri, 5 Sep 2025 21:14:26 -0400 Subject: [PATCH 7/8] added CI workflow to test hamilton-core --- .github/workflows/hamilton-core-main.yml | 48 ++++++++++++++++++++++++ hamilton-core/setup.py | 24 +++++++++--- 2 files changed, 66 insertions(+), 6 deletions(-) create mode 100644 
.github/workflows/hamilton-core-main.yml diff --git a/.github/workflows/hamilton-core-main.yml b/.github/workflows/hamilton-core-main.yml new file mode 100644 index 000000000..f26ec478a --- /dev/null +++ b/.github/workflows/hamilton-core-main.yml @@ -0,0 +1,48 @@ +name: Unit tests (hamilton-core) + +on: + workflow_dispatch: + + pull_request: + branches: + - main + paths: + - '.github/**' + - 'hamilton/**' + - 'tests/**' + - 'pyproject.toml' + +jobs: + test: + name: "Unit Tests (hamilton-core)" + runs-on: ubuntu-latest + env: + UV_PRERELEASE: "allow" + HAMILTON_TELEMETRY_ENABLED: false + + steps: + - name: Install Graphviz on Linux + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install --yes --no-install-recommends graphviz + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: "3.12" # most popular Python version + enable-cache: true + cache-dependency-glob: "uv.lock" + activate-environment: true + + - name: Install dependencies + run: | + uv venv + . 
.venv/bin/activate + uv pip install ./hamilton-core[core-tests] + + # NOTE `test_caching.py` is the older caching mechanism + - name: Test hamilton main package + run: | + uv run pytest tests/ --ignore tests/integrations --ignore tests/plugins --ignore tests/test_caching.py diff --git a/hamilton-core/setup.py b/hamilton-core/setup.py index 0a4dfa056..d51a042ab 100644 --- a/hamilton-core/setup.py +++ b/hamilton-core/setup.py @@ -1,19 +1,20 @@ -import tomllib +import os import pathlib import re -import os import shutil import sys + +import tomllib from setuptools import setup -# ensure the right current working directory os.chdir(os.path.abspath(os.path.dirname(__file__))) + def copy_hamilton_library(): setup_dir = pathlib.Path(__file__).resolve().parent - source_dir = (setup_dir.parent / 'hamilton').resolve() - dest_dir = (setup_dir / 'hamilton' / '_hamilton').resolve() + source_dir = (setup_dir.parent / "hamilton").resolve() + dest_dir = (setup_dir / "hamilton" / "_hamilton").resolve() # Safety checks if not source_dir.is_dir(): @@ -37,7 +38,7 @@ def copy_hamilton_library(): def get_version(): version_path = pathlib.Path(__file__).parent / "hamilton" / "_hamilton" / "version.py" content = version_path.read_text() - match = re.search(r'^VERSION\s*=\s*\(([^)]+)\)', content, re.MULTILINE) + match = re.search(r"^VERSION\s*=\s*\(([^)]+)\)", content, re.MULTILINE) if match: version_tuple_str = match.group(1) # "1, 88, 0" # Parse tuple string into list of integers @@ -61,8 +62,19 @@ def get_version(): extras_require = { **project.get("optional-dependencies", {}), **{"visualization": ["graphviz"]}, # drop networkx + **{ + "core-tests": [ # dependencies required to run unit tests; used in CI + "pytest", + "pytest-asyncio", + "pandas", + "typer", + "networkx", + "graphviz", + ] + } } + setup( name="sf-hamilton-core", version=get_version(), From 44ba8d867481779f3629571c3e0420125d78d6be Mon Sep 17 00:00:00 2001 From: zilto <68975210+zilto@users.noreply.github.com> Date: Fri, 5 
Sep 2025 21:31:31 -0400 Subject: [PATCH 8/8] pre-commits; fix hamilton.base imports --- hamilton-core/hamilton/__init__.py | 6 ++++-- hamilton-core/setup.py | 11 +++++------ hamilton/base.py | 15 ++++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/hamilton-core/hamilton/__init__.py b/hamilton-core/hamilton/__init__.py index 3fdd2f4a0..469c2b01d 100644 --- a/hamilton-core/hamilton/__init__.py +++ b/hamilton-core/hamilton/__init__.py @@ -1,8 +1,8 @@ import importlib.util import pathlib import sys -from typing import Any from types import ModuleType +from typing import Any def _load_hamilton_module() -> ModuleType: @@ -58,11 +58,13 @@ def _create_proxy_module() -> ModuleType: _origin_module = _load_hamilton_module() _proxy_module = _create_proxy_module() + def __getattr__(name: str) -> Any: try: return getattr(_origin_module, name) except AttributeError: - raise AttributeError(f"module {__name__} has no attribute {name}") + raise + # `getattr()` must be available to build the package _proxy_module.__getattr__ = __getattr__ diff --git a/hamilton-core/setup.py b/hamilton-core/setup.py index d51a042ab..264eaea83 100644 --- a/hamilton-core/setup.py +++ b/hamilton-core/setup.py @@ -1,4 +1,3 @@ - import os import pathlib import re @@ -46,6 +45,7 @@ def get_version(): version_str = ".".join(version_parts) return version_str + copy_hamilton_library() pyproject_path = pathlib.Path(__file__).parents[1] / "pyproject.toml" @@ -54,11 +54,10 @@ def get_version(): readme_file = project.get("readme", None) console_scripts = [ - f"{name}={target}" for name, target in project.get("entry-points", {}).get("console_scripts", {}).items() + f"{name}={target}" + for name, target in project.get("entry-points", {}).get("console_scripts", {}).items() ] -install_requires = list( - set(project.get("dependencies", [])).difference(set(["pandas", "numpy"])) -) +install_requires = list(set(project.get("dependencies", [])).difference(set(["pandas", "numpy"]))) 
extras_require = { **project.get("optional-dependencies", {}), **{"visualization": ["graphviz"]}, # drop networkx @@ -71,7 +70,7 @@ def get_version(): "networkx", "graphviz", ] - } + }, } diff --git a/hamilton/base.py b/hamilton/base.py index 4fa5050c8..77fb46a9d 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -19,25 +19,23 @@ It should only import hamilton.node, numpy, pandas. It cannot import hamilton.graph, or hamilton.driver. """ + from __future__ import annotations import abc import collections import logging -from typing import Any, Dict, List, Optional, Tuple, Type, Union, TYPE_CHECKING - +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from hamilton import htypes from hamilton.lifecycle import api as lifecycle_api -try: - from . import htypes, node -except ImportError: - import node - if TYPE_CHECKING: import numpy as np import pandas as pd + import hamilton.node as node + logger = logging.getLogger(__name__) @@ -311,6 +309,7 @@ def input_types(self) -> List[Type[Type]]: def output_type(self) -> Type: import pandas as pd + return pd.DataFrame @@ -377,6 +376,7 @@ def build_result(**outputs: Dict[str, Any]) -> np.matrix: :return: numpy matrix """ import numpy as np + # TODO check inputs are all numpy arrays/array like things -- else error num_rows = -1 columns_with_lengths = collections.OrderedDict() @@ -415,6 +415,7 @@ def input_types(self) -> List[Type[Type]]: def output_type(self) -> Type: import pandas as pd + return pd.DataFrame