Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7486594
add note to find spot later
Meghansaha Jan 8, 2026
db2b70a
add in polars lib check; prep for dict handling and schema creation
Meghansaha Jan 25, 2026
3f37b61
add schema
Meghansaha Feb 1, 2026
fc2b745
proto df
Meghansaha Feb 4, 2026
698fc48
Created draft df to be returned
Meghansaha Feb 10, 2026
2d7e9b8
Merge remote-tracking branch 'upstream/main' into add-get_dataframe
Meghansaha Feb 10, 2026
cf66d3f
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Feb 20, 2026
a958ded
first draft of polars table complete.
Meghansaha Feb 25, 2026
44f41c7
pandas done
Meghansaha Mar 7, 2026
b3099a8
add duckdb/ibis workflow
Meghansaha Mar 11, 2026
ee5264f
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Mar 11, 2026
c53e3ed
fix ruff errors
Meghansaha Mar 11, 2026
c3b3fa7
Merge branch 'add-get_dataframe' of https://github.com/Meghansaha/pointblank
Meghansaha Mar 11, 2026
9f47df9
reformat w/ ruff
Meghansaha Mar 11, 2026
3b5f38e
pull in updates from main
Meghansaha Mar 11, 2026
e3210cb
update documentation, add test placeholder
Meghansaha Mar 21, 2026
47237c1
add tests for `get_dataframe`; need to look at ibis workflow again
Meghansaha Mar 25, 2026
6ce63b3
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Mar 25, 2026
eefb6ce
debugging
Meghansaha Apr 2, 2026
68aa301
Merge branch 'add-get_dataframe' of https://github.com/Meghansaha/pointblank
Meghansaha Apr 2, 2026
bd970a3
typo fix
Meghansaha Apr 2, 2026
583e755
cleanup
Meghansaha Apr 7, 2026
17cb5d7
Merge branch 'posit-dev:main' into add-get_dataframe
Meghansaha Apr 7, 2026
9b838a7
typing fixes
Meghansaha Apr 12, 2026
73911ad
Merge branch 'add-get_dataframe' of https://github.com/Meghansaha/pointblank
Meghansaha Apr 12, 2026
694ae37
add todo note
Meghansaha Apr 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 268 additions & 1 deletion pointblank/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@

import commonmark
import narwhals as nw
from narwhals.dependencies import is_narwhals_lazyframe
from great_tables import GT, from_column, google_font, html, loc, md, style, vals
from great_tables.gt import _get_column_of_values
from great_tables.vals import fmt_integer, fmt_number
from importlib_resources import files
from narwhals.dependencies import is_narwhals_lazyframe

from pointblank._agg import (
is_valid_agg,
Expand Down Expand Up @@ -157,6 +157,7 @@
"missing_vals_tbl",
"get_action_metadata",
"get_column_count",
"get_dataframe",
"get_data_path",
"get_row_count",
"get_validation_summary",
Expand Down Expand Up @@ -4150,6 +4151,8 @@ def connect_to_table(connection_string: str) -> Any:
"Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
)

import os

import ibis

# Check if connection string includes table specification
Expand All @@ -4165,6 +4168,8 @@ def connect_to_table(connection_string: str) -> Any:
available_tables = []

conn.disconnect()
conn.close()
os.unlink(base_connection)

# Create helpful error message
if available_tables:
Expand Down Expand Up @@ -17805,6 +17810,268 @@ def get_step_report(

return step_report


def get_dataframe(
    self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polars"
) -> Any:
    """
    Return validation results as a tabular object (Polars by default).

    This method returns a compact, row-wise summary of validation step results that is suitable for
    logging and exporting (e.g., writing to CSV). The returned object depends on `tbl_type`.

    Parameters
    ----------
    tbl_type :
        The output backend. One of `"polars"`, `"pandas"`, or `"duckdb"`. Default is `"polars"`.

    Returns
    -------
    polars.DataFrame | pandas.DataFrame | ibis.expr.types.relations.Table
        A tabular summary of validation results. When `tbl_type="duckdb"`, the return value is an Ibis memtable (a `Table` expression).

    Raises
    ------
    ValueError
        If `tbl_type` is not one of the supported backends.
    ImportError
        If the library required for the requested backend is not installed.

    Supported DataFrame Types
    -------------------------
    The `tbl_type=` parameter can be set to one of the following:

    - `"polars"`: A Polars DataFrame.
    - `"pandas"`: A Pandas DataFrame.
    - `"duckdb"`: An Ibis memtable.

    Examples
    --------
    ```{python}
    import pointblank as pb

    validation = (
        pb.Validate(data=pb.load_dataset("small_table", tbl_type="polars"), label="My validation")
        .col_vals_gt(columns="d", value=100)
        .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
        .interrogate()
    )

    df_validation = validation.get_dataframe()
    ```
    """
    allowed_tbl_types = ("polars", "pandas", "duckdb")
    if tbl_type not in allowed_tbl_types:
        raise ValueError(
            f"The DataFrame type `{tbl_type}` is not valid. Choose one of the following:\n"
            "- `polars`\n"
            "- `pandas`\n"
            "- `duckdb`"
        )

    report_original = _validation_info_as_dict(self.validation_info)

    # Remove extracts (can be large / nested; not intended for summary logging)
    report_original.pop("extract", None)

    # Maps internal column names to the user-facing names in the returned table.
    names_dict = {
        "active": "active",
        "i": "step_number",
        "assertion_type": "step_description",
        "column": "columns",
        "values": "values",
        "pre": "original_pre",
        "segments": "original_segments",
        "eval_error": "step_evaluated",
        "n": "units",
        "all_passed": "all_units_passed",
        "n_passed": "pass_n",
        "f_passed": "pass_pct",
        "n_failed": "failed_n",
        "f_failed": "failed_pct",
        "warning": "warning",
        "error": "error",
        "critical": "critical",
        "brief": "input_brief",
        "autobrief": "autobrief",
    }

    final_report = {k: report_original[k] for k in names_dict if k in report_original}

    # Normalize `values`: if a dict contains a regex `pattern`, log just that pattern.
    values = final_report.get("values")
    if isinstance(values, list):
        final_report = {
            **final_report,
            "values": [
                v.get("pattern") if isinstance(v, dict) and "pattern" in v else v for v in values
            ],
        }

    # Columns that are blanked out (set to null) for inactive validation steps;
    # shared by all three backends.
    inactive_null_cols = [
        "step_evaluated",
        "units",
        "all_units_passed",
        "pass_n",
        "pass_pct",
        "failed_n",
        "failed_pct",
        "warning",
        "error",
        "critical",
    ]

    if tbl_type == "polars":
        if not _is_lib_present(lib_name="polars"):
            raise ImportError(
                "The Polars library is not installed but is required when specifying "
                '`tbl_type="polars"`.'
            )

        import polars as pl

        pl_schema = pl.Schema(
            {
                "active": pl.Boolean,
                "i": pl.Int64,
                "assertion_type": pl.String,
                "column": pl.String,
                "values": pl.Object,
                "pre": pl.Object,
                "segments": pl.String,
                "eval_error": pl.Boolean,
                "n": pl.Int64,
                "all_passed": pl.Boolean,
                "n_passed": pl.Int64,
                "f_passed": pl.Float64,
                "n_failed": pl.Int64,
                "f_failed": pl.Float64,
                "warning": pl.Boolean,
                "error": pl.Boolean,
                "critical": pl.Boolean,
                "brief": pl.String,
                "autobrief": pl.String,
            }
        )

        df_validation_results = (
            pl.DataFrame(data=final_report, schema=pl_schema)
            .rename(names_dict)
            .with_columns(
                # Prefer the user-supplied brief; fall back to the auto-generated one.
                brief=pl.coalesce(pl.col("input_brief"), pl.col("autobrief")),
                preprocessed=pl.col("original_pre").is_not_null(),
                segmented=pl.col("original_segments").is_not_null(),
            )
            .with_columns(
                [
                    pl.when(~pl.col("active")).then(pl.lit(None)).otherwise(pl.col(col)).alias(col)
                    for col in inactive_null_cols
                ]
            )
            .drop(["input_brief", "autobrief", "original_pre", "original_segments"])
        )

        return df_validation_results

    elif tbl_type == "pandas":
        if not _is_lib_present(lib_name="pandas"):
            raise ImportError(
                "The Pandas library is not installed but is required when specifying "
                '`tbl_type="pandas"`.'
            )

        import pandas as pd

        def transform_validation_results(df: pd.DataFrame) -> pd.DataFrame:
            # Prefer the user-supplied brief; fall back to the auto-generated one.
            df = df.assign(brief=df["input_brief"].fillna(df["autobrief"]))
            df = df.assign(
                preprocessed=df["original_pre"].notna(),
                segmented=df["original_segments"].notna(),
            )

            # Blank out result columns for steps that were not active.
            for col in inactive_null_cols:
                df[col] = df[col].where(df["active"], pd.NA)

            return df.drop(
                columns=["input_brief", "autobrief", "original_pre", "original_segments"]
            )

        df_validation_results = (
            pd.DataFrame(data=final_report)
            .rename(columns=names_dict)
            .pipe(transform_validation_results)
        )

        return df_validation_results

    else:  # tbl_type == "duckdb"
        if not _is_lib_present(lib_name="ibis"):
            raise ImportError(
                "The Ibis library is not installed but is required when specifying "
                '`tbl_type="duckdb"`.'
            )

        import ibis
        import ibis.expr.datatypes as dt

        ibis_schema = {
            "active": dt.Boolean(),
            "i": dt.Int64(),
            "assertion_type": dt.String(),
            "column": dt.String(),
            "values": dt.json(),
            "pre": dt.json(),
            "segments": dt.String(),
            "eval_error": dt.Boolean(),
            "n": dt.Int64(),
            "all_passed": dt.Boolean(),
            "n_passed": dt.Int64(),
            "f_passed": dt.Float64(),
            "n_failed": dt.Int64(),
            "f_failed": dt.Float64(),
            "warning": dt.Boolean(),
            "error": dt.Boolean(),
            "critical": dt.Boolean(),
            "brief": dt.String(),
            "autobrief": dt.String(),
        }

        report_table = ibis.memtable(final_report, schema=ibis_schema).rename(names_dict)

        df_validation_results = report_table.mutate(
            # Prefer the user-supplied brief; fall back to the auto-generated one.
            brief=ibis.coalesce(report_table.input_brief, report_table.autobrief),
            preprocessed=report_table.original_pre.notnull(),
            segmented=report_table.original_segments.notnull(),
            **{
                # Blank out result columns for inactive steps, preserving each
                # column's declared type in the null cast.
                col: ibis.ifelse(
                    ~report_table.active,
                    ibis.null().cast(report_table[col].type()),
                    report_table[col],
                )
                for col in inactive_null_cols
            },
        ).drop("input_brief", "autobrief", "original_pre", "original_segments")

        return df_validation_results

def _add_validation(self, validation_info):
"""
Add a validation to the list of validations.
Expand Down
45 changes: 44 additions & 1 deletion tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class StrEnum(str, Enum):

## If we specifically disable tests in pytest set the availability to False
if os.environ.get("SKIP_PYSPARK_TESTS", "").lower() in ("true", "1", "yes"):
    # Fixed typo: was `PYSPARKAVAILABLE`, which bound a new name and left the
    # real `PYSPARK_AVAILABLE` flag untouched (PySpark tests were never skipped).
    PYSPARK_AVAILABLE = False
SQLITE_AVAILABLE = True
if os.environ.get("SKIP_SQLITE_TESTS", "").lower() in ("true", "1", "yes"):
    SQLITE_AVAILABLE = False
Expand Down Expand Up @@ -13899,6 +13899,48 @@ def test_get_step_report_schema_checks(schema) -> None:
assert isinstance(validation.get_step_report(i=1), GT.GT)


def test_get_dataframe_wrong_tbl_type_messaging():
    # An unsupported `tbl_type=` value should raise a ValueError naming the bad type.
    data = pl.DataFrame({"name": ["Monica", "Erica", "Rita", "Tina"], "mambo_no": [2, 3, 4, 5]})

    validation = Validate(data=data).col_vals_gt(columns="mambo_no", value=5).interrogate()

    expected_msg = "The DataFrame type `polar` is not valid. Choose one of"
    with pytest.raises(ValueError, match=expected_msg):
        validation.get_dataframe("polar")


@pytest.mark.parametrize(
    "library, tbl_type", [("Polars", "polars"), ("Pandas", "pandas"), ("Ibis", "duckdb")]
)
def test_get_dataframe_missing_libraries(library, tbl_type):
    # Each backend should raise ImportError when its required library is unavailable.
    validation = Validate(data="small_table")

    # Simulate the backend library being absent.
    with patch("pointblank.validate._is_lib_present", return_value=False):
        with pytest.raises(ImportError, match=f"The {library} library is not installed"):
            validation.get_dataframe(tbl_type)


def test_get_dataframe_returns_polars_df():
    # The "polars" backend yields a polars DataFrame.
    result = Validate(data="small_table").get_dataframe("polars")
    assert isinstance(result, pl.DataFrame)


def test_get_dataframe_returns_pandas_df():
    # The "pandas" backend yields a pandas DataFrame.
    result = Validate(data="small_table").get_dataframe("pandas")
    assert isinstance(result, pd.DataFrame)


def test_get_dataframe_returns_ibis_memtable():
    # The "duckdb" backend yields an Ibis table expression (a memtable).
    result = Validate(data="small_table").get_dataframe("duckdb")
    assert isinstance(result, ibis.expr.types.relations.Table)

# TODO: MEGHAN - test col names, brief coalescing, values to dict check, test inactive steps output, empty validation check

def get_schema_info(
data_tbl,
schema,
Expand Down Expand Up @@ -19317,6 +19359,7 @@ def test_col_vals_ge_timezone_datetime_duckdb() -> None:

finally:
conn.close()
os.unlink(temp_db_path)


@pytest.mark.xfail(reason="Mixed timezone comparisons may not work correctly yet")
Expand Down
Loading