Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@ Changelog
=========


4.2.0 - UNRELEASED
------------------

**New feature:**

- :func:`tabmat.from_formula` now also supports any dataframe supported by narwhals.

**Other changes:**

- Require Python>=3.10


4.1.5 - 2025-12-17
------------------

Expand Down
2 changes: 1 addition & 1 deletion conda.recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ requirements:
run:
- python
- {{ pin_compatible('numpy') }}
- formulaic>=0.6
- formulaic>=1.2
- scipy
- narwhals>=2

Expand Down
3,104 changes: 1,111 additions & 1,993 deletions pixi.lock

Large diffs are not rendered by default.

13 changes: 5 additions & 8 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ xsimd = "<11|>12.1"
jemalloc-local = "*"

[host-dependencies]
python = ">=3.9"
python = ">=3.10"
pip = "*"
setuptools = "*"
wheel = "*"

[dependencies]
formulaic = ">=0.6.4"
formulaic = ">=1.2.0"
numpy = ">=1.24.0"
pandas = ">=1.4.4"
scipy = ">=1.7.3"
Expand All @@ -112,7 +112,7 @@ ipykernel = "*"
click = "*"
pytest = "*"
pytest-xdist = "*"
polars = "*" # exclusively for polars tests
polars = ">=1.35.0" # exclusively for polars tests
pyarrow = "*" # exclusively for polars tests
mypy = "*"

Expand Down Expand Up @@ -140,8 +140,6 @@ blas = { build = "*mkl" }
[feature.benchmark.target.linux-64.dependencies]
blas = { build = "*mkl" }

[feature.py39.dependencies]
python = "3.9.*"
[feature.py310.dependencies]
python = "3.10.*"
[feature.py311.dependencies]
Expand All @@ -153,18 +151,17 @@ python = "3.13.*"

[feature.oldies.dependencies]
setuptools = "62.*"
python = "=3.9.19"
python = "=3.10"
numpy = "=1.24.0"
pandas = "=1.4.4"
scipy = "=1.7.3"
formulaic = "=0.6.4"
formulaic = "1.2.*"
narwhals = "2.0.*"

[environments]
default = ["dev", "test"]
docs = ["docs"]
benchmark = ["benchmark"]
py39 = ["py39", "test"]
py310 = ["py310", "test"]
py311 = ["py311", "test"]
py312 = ["py312", "test"]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ select = [
known-first-party = ["tabmat"]

[tool.mypy]
python_version = '3.9'
python_version = '3.10'
exclude = [
"tests/",
]
Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,15 @@
author_email="noreply@quantco.com",
classifiers=[ # Optional
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
],
package_dir={"": "src"},
packages=find_packages(where="src"),
install_requires=["formulaic>=0.6", "narwhals", "numpy", "scipy"],
python_requires=">=3.9",
install_requires=["formulaic>=1.2", "narwhals", "numpy", "scipy"],
python_requires=">=3.10",
ext_modules=cythonize(
ext_modules,
annotate=False,
Expand Down
2 changes: 1 addition & 1 deletion src/tabmat/benchmark/generate_matrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def make_dense_matrices(n_rows: int, n_cols: int) -> dict:
dense_matrices = {"numpy_C": np.random.random((n_rows, n_cols))}
dense_matrices["numpy_F"] = dense_matrices["numpy_C"].copy(order="F")
assert dense_matrices["numpy_F"].flags["F_CONTIGUOUS"]
dense_matrices["tabmat"] = tm.DenseMatrix(dense_matrices["numpy_C"])
dense_matrices["tabmat"] = tm.DenseMatrix(dense_matrices["numpy_C"]) # type: ignore
return dense_matrices


Expand Down
10 changes: 8 additions & 2 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,13 @@ def _extract_codes_and_categories_pandas(cat_vec) -> tuple[np.ndarray, np.ndarra


def _extract_codes_and_categories_polars(cat_vec) -> tuple[np.ndarray, np.ndarray]:
if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)):
dtype = cat_vec.dtype
if isinstance(dtype, pl.Enum):
categories = cat_vec.cat.get_categories().to_numpy()
indices = cat_vec.to_physical().fill_null(-1).to_numpy()
return indices, categories

if not isinstance(cat_vec.dtype, pl.Categorical):
cat_vec = cat_vec.cast(pl.Categorical)
# as of polars 1.32, `get_categories()` won't yield a useful result as
# this is "not per column" anymore.
Expand Down Expand Up @@ -300,7 +306,7 @@ def _row_col_indexing(
is_col_indexed = not (cols is None or len(cols) == arr.shape[1])

if is_row_indexed and is_col_indexed:
return arr[np.ix_(rows, cols)]
return arr[np.ix_(rows, cols)] # type: ignore
elif is_row_indexed:
return arr[rows]
elif is_col_indexed:
Expand Down
99 changes: 52 additions & 47 deletions src/tabmat/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,21 @@
from collections.abc import Iterable
from typing import Any, Optional, Union

import narwhals.stable.v2 as nw
import numpy as np
import numpy.typing
import pandas as pd
from formulaic import ModelMatrix, ModelSpec
from formulaic.errors import FactorEncodingError
from formulaic.materializers import FormulaMaterializer
from formulaic.materializers.types import FactorValues, NAAction, ScopedTerm
from formulaic.materializers.types import FactorValues, ScopedTerm
from formulaic.parser.types import Term
from formulaic.transforms import stateful_transform
from formulaic.utils.null_handling import drop_rows as drop_nulls
from interface_meta import override
from scipy import sparse as sps

from .categorical_matrix import CategoricalMatrix
from .categorical_matrix import CategoricalMatrix, _extract_codes_and_categories
from .constructor_util import _split_sparse_and_dense_parts
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase
Expand Down Expand Up @@ -53,34 +55,24 @@ def _init(self):
self.cat_missing_method = self.params.get("cat_missing_method", "fail")
self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)")

# Always convert input to narwhals DataFrame
self.__narwhals_data = nw.from_native(self.data, eager_only=True)
self.__data_context = self.__narwhals_data.to_dict()

# We can override formulaic's C() function here
self.context["C"] = _C

@override
def _is_categorical(self, values):
if isinstance(values, (pd.Series, pd.Categorical)):
return values.dtype == object or isinstance(
values.dtype, (pd.CategoricalDtype, pd.StringDtype)
)
return super()._is_categorical(values)
@override # type: ignore
@property
def data_context(self):
return self.__data_context

@override
def _check_for_nulls(self, name, values, na_action, drop_rows):
if na_action is NAAction.IGNORE:
return

if na_action is NAAction.RAISE:
if isinstance(values, pd.Series) and values.isnull().values.any():
raise ValueError(f"`{name}` contains null values after evaluation.")

elif na_action is NAAction.DROP:
if isinstance(values, pd.Series):
drop_rows.update(np.flatnonzero(values.isnull().values))

else:
raise ValueError(
f"Do not know how to interpret `na_action` = {repr(na_action)}."
)
def _is_categorical(self, values: Any) -> bool:
if nw.dependencies.is_narwhals_series(values):
if not values.dtype.is_numeric():
return True
return super()._is_categorical(values)

@override
def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows):
Expand All @@ -90,9 +82,9 @@ def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows):
@override
def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows):
if drop_rows:
values = values.drop(index=values.index[drop_rows])
if isinstance(values, pd.Series):
values = values.to_numpy().astype(self.dtype, copy=False)
values = drop_nulls(values, indices=drop_rows)
if isinstance(values, nw.Series):
values = values.to_numpy().astype(self.dtype)
if (values != 0).mean() <= self.sparse_threshold:
return _InteractableSparseVector(sps.csc_matrix(values[:, np.newaxis]))
else:
Expand All @@ -104,7 +96,7 @@ def _encode_categorical(
):
# We do not do any encoding here as it is handled by tabmat
if drop_rows:
values = values.drop(index=values.index[drop_rows])
values = drop_nulls(values, indices=drop_rows)
return encode_contrasts(
values,
reduced_rank=reduced_rank,
Expand Down Expand Up @@ -428,17 +420,18 @@ def __init__(
self.name = name

@classmethod
def from_categorical(
def from_codes(
cls,
cat: pd.Categorical,
codes: np.ndarray,
categories: list,
reduced_rank: bool,
missing_method: str = "fail",
missing_name: str = "(MISSING)",
add_missing_category: bool = False,
) -> "_InteractableCategoricalVector":
"""Create an interactable categorical vector from codes and categories."""
categories = cat.categories.tolist()
codes = cat.codes.copy().astype(np.int64)
codes = codes.copy().astype(np.int64)
categories = categories.copy()

if reduced_rank:
codes[codes == 0] = -2
Expand All @@ -458,7 +451,7 @@ def from_categorical(
return cls(
codes=codes,
categories=categories,
multipliers=np.ones(len(cat.codes)),
multipliers=np.ones(len(codes)),
)

def __rmul__(self, other):
Expand Down Expand Up @@ -674,7 +667,7 @@ def _C(
data,
*,
levels: Optional[Iterable[str]] = None,
missing_method: str = "fail",
missing_method: Optional[str] = None,
missing_name: str = "(MISSING)",
spans_intercept: bool = True,
):
Expand All @@ -694,12 +687,13 @@ def encoder(
model_spec: ModelSpec,
):
if drop_rows:
values = values.drop(index=values.index[drop_rows])
values = drop_nulls(values, indices=drop_rows)
return encode_contrasts(
values,
levels=levels,
reduced_rank=reduced_rank,
missing_method=missing_method,
missing_method=missing_method
or model_spec.materializer_params.get("cat_missing_method", "fail"), # type: ignore
missing_name=missing_name,
_state=encoder_state,
_spec=model_spec,
Expand All @@ -715,14 +709,14 @@ def encoder(

@stateful_transform
def encode_contrasts(
data,
data: nw.Series,
*,
levels: Optional[Iterable[str]] = None,
missing_method: str = "fail",
missing_name: str = "(MISSING)",
reduced_rank: bool = False,
_state=None,
_spec=None,
_state: dict[str, Any] = {},
_spec: Optional[ModelSpec] = None,
) -> FactorValues[_InteractableCategoricalVector]:
"""
Encode a categorical dataset into an _InteractableCategoricalVector
Expand All @@ -738,6 +732,10 @@ def encode_contrasts(
levels = levels if levels is not None else _state.get("categories")
add_missing_category = _state.get("add_missing_category", False)

if data.dtype.is_numeric():
# Polars enums only support string values
data = data.cast(nw.String)

# Check for unseen categories when levels are specified
if levels is not None:
if missing_method == "convert" and not add_missing_category:
Expand All @@ -746,21 +744,28 @@ def encode_contrasts(
# - missings are no problem in the other cases
unseen_categories = set(data.unique()) - set(levels)
else:
unseen_categories = set(data.dropna().unique()) - set(levels)
unseen_categories = set(data.drop_nulls().unique()) - set(levels)

if unseen_categories:
raise ValueError(
f"Column {data.name} contains unseen categories: {unseen_categories}."
)
else:
# Not super efficient as we do it again in _extract_codes_and_categories
levels = list(data.drop_nulls().unique().sort())

cat = data.cast(nw.Enum(levels))
codes, categories = _extract_codes_and_categories(cat)
categories = list(categories)

cat = pd.Categorical(data._values, categories=levels)
_state["categories"] = cat.categories
_state["add_missing_category"] = add_missing_category or (
missing_method == "convert" and cat.isna().any()
_state["categories"] = categories
_state["add_missing_category"] = add_missing_category or bool(
missing_method == "convert" and cat.is_null().any()
)

return _InteractableCategoricalVector.from_categorical(
cat,
return _InteractableCategoricalVector.from_codes(
codes=codes,
categories=categories,
reduced_rank=reduced_rank,
missing_method=missing_method,
missing_name=missing_name,
Expand Down
Loading
Loading