diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 93af3c0..144bff4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -35,7 +35,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then
-            pip install ".[test,pandas,spark,test_numpy_pre2]"
+            pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]"
           else
             pip install ".[test,pandas,spark]"
           fi
diff --git a/README.rst b/README.rst
index f65a832..e5adbca 100644
--- a/README.rst
+++ b/README.rst
@@ -17,8 +17,8 @@ more quickly via Numpy commands, rather than Python for loops.
 This Python implementation of histogrammar been tested to guarantee
 compatibility with its Scala implementation.
 
-Latest Python release: v1.1.1 (Aug 2025).
-Latest update: Aug 2025.
+Latest Python release: v1.1.2 (Sep 2025).
+Latest update: Sep 2025.
 
 References
 ==========
@@ -38,19 +38,19 @@ Changes
 See Changes log `here `_.
 
-Spark 3.X
----------
+Spark
+-----
 
-With Spark 3.X, based on Scala 2.12 or 2.13, make sure to pick up the correct histogrammar jar files:
+With Spark, make sure to pick up the correct histogrammar jar files. Spark 4.X is based on Scala 2.13; Spark 3.X is based on Scala 2.12 or 2.13.
 
 .. code-block:: python
 
-  spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate()
+  spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.13:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.13:1.0.30").getOrCreate()
 
-For Scala 2.13, in the string above simply replace "2.12" with "2.13".
+For Scala 2.12, in the string above simply replace "2.13" with "2.12".
 
-December, 2023
+September, 2025
 
 
 Example notebooks
diff --git a/histogrammar/dfinterface/spark_histogrammar.py b/histogrammar/dfinterface/spark_histogrammar.py
index 154ce31..97c14fb 100644
--- a/histogrammar/dfinterface/spark_histogrammar.py
+++ b/histogrammar/dfinterface/spark_histogrammar.py
@@ -225,7 +225,7 @@ def construct_empty_hist(self, df, features):
         for idx, col in enumerate(revcols):
             # histogram type depends on the data type
             dt = self.var_dtype[col]
-            quant = df[col]
+            quant = f.col(col)
             hist = self.get_hist_bin(hist, features, quant, col, dt)
 
         return hist
diff --git a/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb b/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
index 7e99bde..349153b 100644
--- a/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
+++ b/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
@@ -3,6 +3,7 @@
  {
   "cell_type": "markdown",
   "metadata": {
+   "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
@@ -118,9 +119,9 @@
    "# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
    "\n",
    "if pyspark_installed:\n",
-    "    scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
-    "    hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
-    "    hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
+    "    scala = '2.12' if int(pyspark_version[0]) == 3 else '2.13'\n",
+    "    hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.30'\n",
+    "    hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.30'\n",
    "\n",
    "    spark = SparkSession.builder.config(\n",
    "        \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
@@ -521,7 +522,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.11.11"
   },
  "nteract": {
   "version": "0.15.0"
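Note on the README and notebook hunks above: the version gate now assumes PySpark 3.x pairs with the Scala 2.12 jars and anything newer (PySpark 4.x) with Scala 2.13. A minimal standalone sketch of the same jar selection; the 1.0.30 coordinates come from this diff, while deriving the major version from pyspark.__version__ is an assumption (the notebook uses its own pyspark_version variable):

    # Sketch: pick histogrammar jar coordinates to match the installed PySpark.
    # Assumes Spark 3.x -> Scala 2.12 jars, Spark 4.x -> Scala 2.13 jars.
    import pyspark
    from pyspark.sql import SparkSession

    major = int(pyspark.__version__.split(".")[0])
    scala = "2.12" if major == 3 else "2.13"
    packages = ",".join(
        f"io.github.histogrammar:{artifact}_{scala}:1.0.30"
        for artifact in ("histogrammar", "histogrammar-sparksql")
    )
    spark = SparkSession.builder.config("spark.jars.packages", packages).getOrCreate()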
"pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.11.11" }, "nteract": { "version": "0.15.0" diff --git a/histogrammar/util.py b/histogrammar/util.py index 25987eb..cc2fd61 100644 --- a/histogrammar/util.py +++ b/histogrammar/util.py @@ -247,7 +247,7 @@ def __init__(self, expr, name=None): ok = False else: if isinstance(expr, Column) and self.name is None: - self.name = str(expr)[7:-1] + self.name = str(expr)[8:-2] ok = True if not ok: raise TypeError(f"quantity ({expr}) must be a string, function, or SparkSQL Column") diff --git a/histogrammar/version.py b/histogrammar/version.py index 54c25e9..dd17fa1 100644 --- a/histogrammar/version.py +++ b/histogrammar/version.py @@ -2,7 +2,7 @@ import re -version = "1.1.1" +version = "1.1.2" def split_version_string(version_string: str) -> tuple[int, int]: diff --git a/pyproject.toml b/pyproject.toml index 97da278..268fa69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "histogrammar" -description = "Composable histogram primitives for distributed data reduction" +description = "Histograms for Pandas/Spark/Numpy" keywords = [ "pandas", "spark", @@ -17,7 +17,7 @@ keywords = [ ] readme = "README.rst" requires-python = ">=3.9" -authors = [{ name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }, { name = "Max Baak", email = "maxbaak@gmail.com" }] +authors = [{ name = "Max Baak", email = "maxbaak@gmail.com" }, { name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }] maintainers = [{ name = "Max Baak", email = "maxbaak@gmail.com" }] license = { type = "Apache Software License v2", file = "LICENSE" } dependencies = [ @@ -40,7 +40,7 @@ pandas = [ "pandas" ] spark = [ - "pyspark<4; python_version <= '3.11'", + "pyspark", ] test = [ "ipykernel>=5.1.3", @@ -55,6 +55,9 @@ test_numpy_pre2 = [ "numpy<2", "pandas<2", ] +test_spark_pre2 = [ + "pyspark<4; python_version <= '3.11'", +] # files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks # after installation, these can be found with the functions in resources.py diff --git a/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar b/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar new file mode 100644 index 0000000..cf31b7e Binary files /dev/null and b/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar differ diff --git a/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar b/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar new file mode 100644 index 0000000..1d04384 Binary files /dev/null and b/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar differ diff --git a/tests/jars/histogrammar_2.12-1.0.30.jar b/tests/jars/histogrammar_2.12-1.0.30.jar new file mode 100644 index 0000000..e39dcc9 Binary files /dev/null and b/tests/jars/histogrammar_2.12-1.0.30.jar differ diff --git a/tests/jars/histogrammar_2.13-1.0.30.jar b/tests/jars/histogrammar_2.13-1.0.30.jar new file mode 100644 index 0000000..4bf3acd Binary files /dev/null and b/tests/jars/histogrammar_2.13-1.0.30.jar differ diff --git a/tests/test_spark_histogrammar.py b/tests/test_spark_histogrammar.py index 42aef1a..7b06e8f 100644 --- a/tests/test_spark_histogrammar.py +++ b/tests/test_spark_histogrammar.py @@ -21,9 +21,9 @@ def get_spark(): current_path = Path(__file__).resolve().parent - scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11" - hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.20.jar" - hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.20.jar" + scala = "2.12" if 
diff --git a/histogrammar/version.py b/histogrammar/version.py
index 54c25e9..dd17fa1 100644
--- a/histogrammar/version.py
+++ b/histogrammar/version.py
@@ -2,7 +2,7 @@
 
 import re
 
-version = "1.1.1"
+version = "1.1.2"
 
 
 def split_version_string(version_string: str) -> tuple[int, int]:
diff --git a/pyproject.toml b/pyproject.toml
index 97da278..268fa69 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "histogrammar"
-description = "Composable histogram primitives for distributed data reduction"
+description = "Histograms for Pandas/Spark/Numpy"
 keywords = [
     "pandas",
     "spark",
@@ -17,7 +17,7 @@ keywords = [
 ]
 readme = "README.rst"
 requires-python = ">=3.9"
-authors = [{ name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }, { name = "Max Baak", email = "maxbaak@gmail.com" }]
+authors = [{ name = "Max Baak", email = "maxbaak@gmail.com" }, { name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }]
 maintainers = [{ name = "Max Baak", email = "maxbaak@gmail.com" }]
 license = { type = "Apache Software License v2", file = "LICENSE" }
 dependencies = [
@@ -40,7 +40,7 @@ pandas = [
     "pandas"
 ]
 spark = [
-    "pyspark<4; python_version <= '3.11'",
+    "pyspark",
 ]
 test = [
     "ipykernel>=5.1.3",
@@ -55,6 +55,9 @@ test_numpy_pre2 = [
     "numpy<2",
     "pandas<2",
 ]
+test_spark_pre2 = [
+    "pyspark<4; python_version <= '3.11'",
+]
 
 # files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks
 # after installation, these can be found with the functions in resources.py
diff --git a/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar b/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar
new file mode 100644
index 0000000..cf31b7e
Binary files /dev/null and b/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar differ
diff --git a/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar b/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar
new file mode 100644
index 0000000..1d04384
Binary files /dev/null and b/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar differ
diff --git a/tests/jars/histogrammar_2.12-1.0.30.jar b/tests/jars/histogrammar_2.12-1.0.30.jar
new file mode 100644
index 0000000..e39dcc9
Binary files /dev/null and b/tests/jars/histogrammar_2.12-1.0.30.jar differ
diff --git a/tests/jars/histogrammar_2.13-1.0.30.jar b/tests/jars/histogrammar_2.13-1.0.30.jar
new file mode 100644
index 0000000..4bf3acd
Binary files /dev/null and b/tests/jars/histogrammar_2.13-1.0.30.jar differ
diff --git a/tests/test_spark_histogrammar.py b/tests/test_spark_histogrammar.py
index 42aef1a..7b06e8f 100644
--- a/tests/test_spark_histogrammar.py
+++ b/tests/test_spark_histogrammar.py
@@ -21,9 +21,9 @@ def get_spark():
     current_path = Path(__file__).resolve().parent
 
-    scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11"
-    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.20.jar"
-    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.20.jar"
+    scala = "2.12" if int(pyspark_version[0]) == 3 else "2.13"
+    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.30.jar"
+    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.30.jar"
 
     return (
         SparkSession.builder.master("local")
@@ -44,16 +44,16 @@ spark_co():
 @pytest.mark.skipif(not spark_found, reason="spark not found")
 @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
 def test_get_histograms(spark_co):
-    pytest.age["data"]["name"] = "'age'"
-    pytest.company["data"]["name"] = "'company'"
-    pytest.eyesColor["data"]["name"] = "'eyeColor'"
-    pytest.gender["data"]["name"] = "'gender'"
-    pytest.isActive["data"]["name"] = "'isActive'"
-    pytest.latitude["data"]["name"] = "'latitude'"
-    pytest.longitude["data"]["name"] = "'longitude'"
-    pytest.transaction["data"]["name"] = "'transaction'"
-
-    pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
+    pytest.age["data"]["name"] = "age"
+    pytest.company["data"]["name"] = "company"
+    pytest.eyesColor["data"]["name"] = "eyeColor"
+    pytest.gender["data"]["name"] = "gender"
+    pytest.isActive["data"]["name"] = "isActive"
+    pytest.latitude["data"]["name"] = "latitude"
+    pytest.longitude["data"]["name"] = "longitude"
+    pytest.transaction["data"]["name"] = "transaction"
+
+    pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
     pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
 
     spark = spark_co
@@ -104,15 +104,15 @@
 @pytest.mark.skipif(not spark_found, reason="spark not found")
 @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
 def test_get_histograms_module(spark_co):
-    pytest.age["data"]["name"] = "'age'"
-    pytest.company["data"]["name"] = "'company'"
-    pytest.eyesColor["data"]["name"] = "'eyeColor'"
-    pytest.gender["data"]["name"] = "'gender'"
-    pytest.isActive["data"]["name"] = "'isActive'"
-    pytest.latitude["data"]["name"] = "'latitude'"
-    pytest.longitude["data"]["name"] = "'longitude'"
-
-    pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
+    pytest.age["data"]["name"] = "age"
+    pytest.company["data"]["name"] = "company"
+    pytest.eyesColor["data"]["name"] = "eyeColor"
+    pytest.gender["data"]["name"] = "gender"
+    pytest.isActive["data"]["name"] = "isActive"
+    pytest.latitude["data"]["name"] = "latitude"
+    pytest.longitude["data"]["name"] = "longitude"
+
+    pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
     pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
 
     spark = spark_co
@@ -187,7 +187,7 @@ def test_get_histograms_timestamp(spark_co):
         "bins": {"108": 9.0, "109": 1.0},
         "bins:type": "Count",
         "entries": 10.0,
-        "name": "'dt'",
+        "name": "dt",
         "nanflow": 0.0,
         "nanflow:type": "Count",
         "origin": 1.2625632e18,
@@ -229,7 +229,7 @@ def test_get_histograms_date(spark_co):
         "bins": {"108": 9.0, "109": 1.0},
         "bins:type": "Count",
         "entries": 10.0,
-        "name": "'dt'",
+        "name": "dt",
         "nanflow": 0.0,
         "nanflow:type": "Count",
         "origin": 1.2625632e18,
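Note on the quant = f.col(col) change earlier in this diff: one plausible reading is that f.col builds an unbound column expression, so constructing empty histograms no longer needs to index into the DataFrame; since both forms stringify identically in PySpark >= 3, the [8:-2] name extraction still applies, which is what the unquoted expected names in the tests above verify. A hedged sketch, using a throwaway local SparkSession for illustration only:

    from pyspark.sql import SparkSession
    import pyspark.sql.functions as f

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(25, "brown")], ["age", "eyeColor"])

    # f.col and df[...] print the same, so str(expr)[8:-2] gives "age" either way
    assert str(f.col("age")) == str(df["age"]) == "Column<'age'>"
    assert str(f.col("age"))[8:-2] == "age"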