From de2f74c2cf4dab6b19a5e958641af090935a85cb Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 09:52:46 +0100
Subject: [PATCH 01/15] Relax numpy versioning constraint

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 4186a42..14a04da 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,7 +20,7 @@ package_dir =
 packages = find:
 python_requires = >=3.9
 install_requires =
-    numpy>=1.20.0,<2.0.0
+    numpy>=1.20.0,<3.0.0
     scipy>=1.6.0
     pandas>=1.2.0
     statsmodels>=0.13.0,<1.0.0

From 1ff55041c718176d9971a3e238db32478ab133c5 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 10:13:30 +0100
Subject: [PATCH 02/15] Modernize build infrastructure

---
 .flake8                              |  3 --
 .github/workflows/confidence.yml     | 10 ++--
 .github/workflows/python-publish.yml |  6 +--
 .gitignore                           |  6 ++-
 MANIFEST.in                          | 10 ----
 Makefile                             |  8 ++--
 pyproject.toml                       | 70 ++++++++++++++++++++++++++--
 requirements_dev.txt                 | 20 --------
 setup.cfg                            | 31 ------------
 setup.py                             |  3 --
 tox.ini                              | 38 ++++-----------
 11 files changed, 92 insertions(+), 113 deletions(-)
 delete mode 100644 .flake8
 delete mode 100644 MANIFEST.in
 delete mode 100644 requirements_dev.txt
 delete mode 100644 setup.cfg
 delete mode 100644 setup.py

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index adf399e..0000000
--- a/.flake8
+++ /dev/null
@@ -1,3 +0,0 @@
-[flake8]
-max-line-length = 120
-ignore = E203,E231,W503
diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml
index d657dc6..b2682e3 100644
--- a/.github/workflows/confidence.yml
+++ b/.github/workflows/confidence.yml
@@ -13,18 +13,18 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.9', '3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11', '3.12']
 
     steps:
-    - uses: actions/checkout@v1
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        if [ -f requirements_dev.txt ]; then pip install -r requirements_dev.txt; fi
-        python -m pip install tox tox-gh-actions
+        pip install -e ".[dev]"
+        pip install tox tox-gh-actions
     - name: Test with tox
       run: tox
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4f2e205..fb9d87f 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -18,11 +18,11 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
-        python-version: '3.9'
+        python-version: '3.11'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
diff --git a/.gitignore b/.gitignore
index 9cd5fd6..edcb3af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,4 +90,8 @@ ENV/
 
 .DS_store
 
-.idea/
\ No newline at end of file
+.idea/
+
+# uv
+uv.lock
+.venv/
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 2353e85..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,10 +0,0 @@
-include README.md
-include CONTRIBUTING.md
-include AUTHORS.md
-
-recursive-include tests *
-recursive-exclude * __pycache__
-recursive-exclude * *.py[co]
-
-recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.py
-recursive-include confidence/ *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.py
\ No newline at end of file
diff --git a/Makefile b/Makefile
index b8f0ff3..fd000df 100644
--- a/Makefile
+++ b/Makefile
@@ -48,13 +48,13 @@ clean-test: ## remove test and coverage artifacts
 	rm -fr htmlcov/
 
 lint: ## check style with flake8
-	flake8 confidence tests
+	flake8 spotify_confidence tests
 
 test: ## run tests quickly with the default Python
 	python3 -m pytest
 
 coverage: ## check code coverage quickly with the default Python
-	coverage run --source confidence -m pytest
+	coverage run --source spotify_confidence -m pytest
 	coverage report -m
 	coverage html
 	$(BROWSER) htmlcov/index.html
@@ -86,10 +86,10 @@ install: clean ## install the package to the active Python's site-packages
 	pip install -e .
 
 install-test: clean
-	pip3 install --index-url https://test.pypi.org/simple/ confidence-spotify
+	pip3 install --index-url https://test.pypi.org/simple/ spotify-confidence
 
 install-prod: clean
-	pip3 install confidence-spotify
+	pip3 install spotify-confidence
 
 black:
 	black spotify_confidence tests --line-length 119
diff --git a/pyproject.toml b/pyproject.toml
index f6c1689..59873e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,68 @@
 [build-system]
-requires = [
-    "setuptools>=42",
-    "wheel",
-]
+requires = ["setuptools>=61.2"]
 build-backend = "setuptools.build_meta"
+
+[project]
+name = "spotify-confidence"
+version = "4.0.0"
+description = "Package for calculating and visualising confidence intervals, e.g. for A/B test analysis."
+readme = "README.md"
+license = {file = "LICENSE"}
+authors = [{name = "Per Sillren", email = "pers@spotify.com"}]
+requires-python = ">=3.9"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "numpy>=1.20.0,<3.0.0",
+    "scipy>=1.6.0",
+    "pandas>=1.2.0",
+    "statsmodels>=0.13.0,<1.0.0",
+    "chartify>=5.0.1",
+    "ipywidgets>=8.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "build",
+    "twine",
+    "flake8>=4.0.1",
+    "tox>=4.4.7",
+    "pytest>=7.0.1",
+    "pytest-cov>=2.5.1",
+    "pytest-runner>=6.0.0",
+    "coverage>=4.5.1",
+    "black>=23.1.0",
+    "pylint>=1.7.4",
+    "jupyterlab>=3.2.9",
+]
+
+[project.urls]
+Homepage = "https://github.com/spotify/confidence"
+"Bug Tracker" = "https://github.com/spotify/confidence/issues"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["spotify_confidence*"]
+namespaces = false
+
+[tool.black]
+line-length = 119
+target-version = ["py39", "py310", "py311"]
+
+[tool.flake8]
+max-line-length = 120
+ignore = ["E203", "E231", "W503"]
+exclude = [".venv", ".tox", ".git", "dist", "docs", "*.egg", "build", "scratch.py", "confidence_dev"]
+
+[tool.pytest.ini_options]
+addopts = "-v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing"
+testpaths = ["tests"]
+
+[tool.coverage.run]
+source = ["spotify_confidence"]
+
+[tool.coverage.report]
+show_missing = true
diff --git a/requirements_dev.txt b/requirements_dev.txt
deleted file mode 100644
index cf02b2e..0000000
--- a/requirements_dev.txt
+++ /dev/null
@@ -1,20 +0,0 @@
--e .
-pip==23.0.1
-build
-twine
-bumpversion==0.5.3
-watchdog==0.8.3
-flake8==4.0.1
-tox==4.4.7
-Sphinx==1.4.8
-pytest-runner==6.0.0
-jupyterlab==3.2.9
-pylint==1.7.4
-coverage==4.5.1
-pytest==7.0.1
-pytest-cov==2.5.1
-ipywidgets>=7.1.0
-black==23.1.0
-ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability
-setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability
-tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 14a04da..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-[metadata]
-name = spotify-confidence
-version = 4.0.0
-author = Per Sillren
-author_email = pers@spotify.com
-description = Package for calculating and visualising confidence intervals, e.g. for A/B test analysis.
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/spotify/confidence
-project_urls =
-    Bug Tracker = https://github.com/spotify/confidence/issues
-classifiers =
-    Programming Language :: Python :: 3
-    License :: OSI Approved :: Apache Software License
-    Operating System :: OS Independent
-
-[options]
-package_dir =
-    = .
-packages = find:
-python_requires = >=3.9
-install_requires =
-    numpy>=1.20.0,<3.0.0
-    scipy>=1.6.0
-    pandas>=1.2.0
-    statsmodels>=0.13.0,<1.0.0
-    chartify>=5.0.1
-    ipywidgets>=8.0.0
-
-[options.packages.find]
-where = .
diff --git a/setup.py b/setup.py
deleted file mode 100644
index b908cbe..0000000
--- a/setup.py
+++ /dev/null
@@ -1,3 +0,0 @@
-import setuptools
-
-setuptools.setup()
diff --git a/tox.ini b/tox.ini
index fc4e692..38d0ebb 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,37 +1,17 @@
 [tox]
-envlist = python3.9, python3.10, python3.11
-skipsdist = True
-usedevelop = True
-
-[travis]
-python =
-    3.9: python3.9
-    3.10: python3.10
-    3.11: python3.11
+envlist = py39, py310, py311, py312
+isolated_build = True
 
 [gh-actions]
 python =
-    3.9: python3.9
-    3.10: python3.10
-    3.11: python3.11
+    3.9: py39
+    3.10: py310
+    3.11: py311
+    3.12: py312
 
 [testenv]
-setenv =
-    PYTHONPATH = {toxinidir}
-deps =
-    -r{toxinidir}/requirements_dev.txt
+extras = dev
 commands =
-    flake8 {posargs}
-    coverage erase
-    py.test {posargs}
-
-[flake8]
-show-source = true
-max-line-length = 120
-exclude = .venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev
-ignore = E203, W503
-
-[pytest]
-addopts = -v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing
-testpaths = tests
+    flake8 spotify_confidence tests
+    pytest {posargs}
 

From 1966f7af38609393f759680a7d84b7e4a8a08cc5 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 10:59:40 +0100
Subject: [PATCH 03/15] Fix flake8 errors and rename test setup() to setup_method()

---
 CONTRIBUTING.rst                              | 69 +++++++++++++------
 Makefile                                      |  5 +-
 pyproject.toml                                |  9 +--
 .../analysis/confidence_utils.py              |  2 +-
 .../confidence_computer.py                    |  2 +-
 .../samplesize/sample_size_calculator.py      | 36 +++++-----
 tests/bayesian/test_betabinomial.py           |  6 +-
 tests/frequentist/test_chisquared.py          |  6 +-
 tests/frequentist/test_experiment.py          |  2 +-
 tests/frequentist/test_ttest.py               | 10 +--
 tests/frequentist/test_ztest.py               | 42 +++++------
 tests/frequentist/test_ztest_linreg.py        | 16 ++---
 tox.ini                                       |  5 ++
 13 files changed, 119 insertions(+), 91 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index b175338..a46b8b7 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -57,41 +57,55 @@ Get Started!
 
 Ready to contribute? Here's how to set up `confidence` for local development.
 
+**Prerequisites:**
+
+* `uv <https://docs.astral.sh/uv/>`_ - Fast Python package installer (recommended)
+* Python 3.9 or later
+
 1. Fork the `confidence` repo on GitHub.
 2. Clone your fork locally::
 
-    $ git clone https://github.com/spotify/confidence
+    $ git clone git@github.com:your_username/confidence.git
+    $ cd confidence
+
+3. Set up your development environment using uv::
+
+    $ uv venv
+    $ uv pip install -e ".[dev]"
 
-3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
+   This creates a virtual environment and installs the package in editable mode with all development dependencies.
 
-    $ mkvirtualenv confidence_dev
-    $ cd confidence/
-    $ tox
+4. Verify your setup by running the tests::
 
-   The tox command will install the dev requirements in requirements_dev.txt and run all tests.
+    $ uv run pytest
 
-4. Create a branch for local development::
+   This should run all tests and show they pass.
+
+5. Create a branch for local development::
 
     $ git checkout -b name-of-your-bugfix-or-feature
 
    Now you can make your changes locally.
 
-5. When you're done making changes, format using `make black`, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
+6. When you're done making changes, check that your changes pass all quality checks::
+
+    $ uv run black spotify_confidence tests --line-length 119  # Format code
+    $ uv run flake8 spotify_confidence tests                   # Lint code
+    $ uv run pytest                                            # Run tests
+
+   To test across all supported Python versions (3.9, 3.10, 3.11, 3.12)::
 
-    $ make black
-    $ flake8 confidence tests
-    $ python setup.py test or py.test
-    $ tox
+    $ uv run tox
 
-   To get flake8 and tox, just pip install them into your virtualenv.
+   Note: tox requires all Python versions to be installed on your system.
 
-6. Commit your changes and push your branch to GitHub::
+7. Commit your changes and push your branch to GitHub::
 
     $ git add .
     $ git commit -m "Your detailed description of your changes."
     $ git push origin name-of-your-bugfix-or-feature
 
-7. Submit a pull request through the GitHub website.
+8. Submit a pull request through the GitHub website.
 
 Pull Request Guidelines
 -----------------------
@@ -101,23 +115,36 @@ Before you submit a pull request, check that it meets these guidelines:
 1. The pull request should include tests.
 2. If the pull request adds functionality, the docs should be updated. Put
    your new functionality into a function with a docstring, and add the
-   feature to the list in README.rst.
-3. The pull request should work for Python 3.6 and 3.7. Check
-   and make sure that the tests pass for all supported Python versions.
+   feature to the list in README.md.
+3. The pull request should work for Python 3.9, 3.10, 3.11, and 3.12. The CI
+   pipeline will automatically test all supported Python versions.
 
 Tips
 ----
 
 To run a subset of tests::
 
-$ py.test tests.test_confidence
+    $ uv run pytest tests/frequentist/test_ttest.py
+
+To run a specific test::
+
+    $ uv run pytest tests/frequentist/test_ttest.py::TestCategorical::test_summary
+
+To run tests with verbose output::
+
+    $ uv run pytest -v
+
+To see test coverage::
+
+    $ uv run pytest --cov=spotify_confidence --cov-report=html
+    $ open htmlcov/index.html  # macOS; on Linux use xdg-open
 
 
 Release Process
 -----------------------
 
 While commits and pull requests are welcome from  any contributor, we try to
-simplify the distribution process for everyone by managing the release 
+simplify the distribution process for everyone by managing the release
 process with specific contributors serving in the role of Release Managers.
 
 Release Managers are responsible for:
@@ -142,7 +169,7 @@ PATCH version when you make backwards-compatible bug fixes.
 
 Release Stategy
 ~~~~~~~~~~~~~~~~
-Each new release will be made on its own branch, with the branch Master 
+Each new release will be made on its own branch, with the branch Master
 representing the most recent, furthest release. Releases are published to PyPi
 automatically once a new release branch is merged to Master. Additionally,
 rew releases are also tracked manually on `github
diff --git a/Makefile b/Makefile
index fd000df..8c0ff85 100644
--- a/Makefile
+++ b/Makefile
@@ -47,6 +47,9 @@ clean-test: ## remove test and coverage artifacts
 	rm -f .coverage
 	rm -fr htmlcov/
 
+format: ## format code with black
+	black spotify_confidence tests --line-length 119
+
 lint: ## check style with flake8
 	flake8 spotify_confidence tests
 
@@ -91,5 +94,3 @@ install-test: clean
 install-prod: clean
 	pip3 install spotify-confidence
 
-black:
-	black spotify_confidence tests --line-length 119
diff --git a/pyproject.toml b/pyproject.toml
index 59873e5..8d0a969 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,13 +28,13 @@ dependencies = [
 dev = [
     "build",
     "twine",
+    "black>=23.1.0",
     "flake8>=4.0.1",
     "tox>=4.4.7",
     "pytest>=7.0.1",
     "pytest-cov>=2.5.1",
     "pytest-runner>=6.0.0",
     "coverage>=4.5.1",
-    "black>=23.1.0",
     "pylint>=1.7.4",
     "jupyterlab>=3.2.9",
 ]
@@ -50,12 +50,7 @@ namespaces = false
 
 [tool.black]
 line-length = 119
-target-version = ["py39", "py310", "py311"]
-
-[tool.flake8]
-max-line-length = 120
-ignore = ["E203", "E231", "W503"]
-exclude = [".venv", ".tox", ".git", "dist", "docs", "*.egg", "build", "scratch.py", "confidence_dev"]
+target-version = ["py39", "py310", "py311", "py312"]
 
 [tool.pytest.ini_options]
 addopts = "-v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing"
diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py
index 44db803..29f241a 100644
--- a/spotify_confidence/analysis/confidence_utils.py
+++ b/spotify_confidence/analysis/confidence_utils.py
@@ -250,5 +250,5 @@ def dfmatmul(x, y, outer=True):
 
 def de_list_if_length_one(x):
     """Return first element of x if x is a list of length one"""
-    is_iterable = type(x) != str and isinstance(x, Iterable)
+    is_iterable = not isinstance(x, str) and isinstance(x, Iterable)
     return x[0] if is_iterable and len(x) == 1 else x
diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py
index 4c47c22..4ea6105 100644
--- a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py
+++ b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py
@@ -351,7 +351,7 @@ def compute_differences(
         level_columns = get_remaning_groups(self._all_group_columns, groupby)
         difference_df = self._compute_differences(
             level_columns=level_columns,
-            levels=[levels] if type(levels) == tuple else levels,
+            levels=[levels] if isinstance(levels, tuple) else levels,
             absolute=absolute,
             groupby=groupby,
             level_as_reference=True,
diff --git a/spotify_confidence/samplesize/sample_size_calculator.py b/spotify_confidence/samplesize/sample_size_calculator.py
index 10b77d5..4c22506 100644
--- a/spotify_confidence/samplesize/sample_size_calculator.py
+++ b/spotify_confidence/samplesize/sample_size_calculator.py
@@ -422,18 +422,18 @@ def show_samplesize(
                 )
                 code_html = widgets.HTML(
                     "<pre><code>"
-                    f"SampleSize.continuous(average_absolute_mde={ mde },\n"
-                    f"                      baseline_variance={ baseline },\n"
-                    f"                      alpha={ alpha },\n"
-                    f"                      power={ power },\n"
-                    f"                      treatments={ treatments },\n"
+                    f"SampleSize.continuous(average_absolute_mde={mde},\n"
+                    f"                      baseline_variance={baseline},\n"
+                    f"                      alpha={alpha},\n"
+                    f"                      power={power},\n"
+                    f"                      treatments={treatments},\n"
                     f"                      comparisons="
-                    f"'{ comparisons }',\n"
+                    f"'{comparisons}',\n"
                     f"                      treatment_costs="
-                    f"{ list(treatment_costs) },\n"
+                    f"{list(treatment_costs)},\n"
                     f"                      treatment_allocations=None,\n"
                     f"                      bonferroni_correction="
-                    f"{ bonferroni_correction })"
+                    f"{bonferroni_correction})"
                     "<code></pre>"
                 )
             else:
@@ -461,19 +461,19 @@ def show_samplesize(
                 )
                 code_html = widgets.HTML(
                     "<pre><code>"
-                    f"SampleSize.binomial(absolute_percentage_mde={ mde },\n"
+                    f"SampleSize.binomial(absolute_percentage_mde={mde},\n"
                     f"                    baseline_proportion="
-                    f"{ baseline },\n"
-                    f"                    alpha={ alpha },\n"
-                    f"                    power={ power },\n"
-                    f"                    treatments={ treatments },\n"
+                    f"{baseline},\n"
+                    f"                    alpha={alpha},\n"
+                    f"                    power={power},\n"
+                    f"                    treatments={treatments},\n"
                     f"                    comparisons="
-                    f"'{ comparisons }',\n"
+                    f"'{comparisons}',\n"
                     f"                    treatment_costs="
-                    f"{ list(treatment_costs) },\n"
+                    f"{list(treatment_costs)},\n"
                     f"                    treatment_allocations=None,\n"
                     f"                    bonferroni_correction="
-                    f"{ bonferroni_correction })"
+                    f"{bonferroni_correction})"
                     "<code></pre>"
                 )
 
@@ -482,7 +482,7 @@ def compare_against_optimal(current, optimal):
                     return ""
                 else:
                     return (
-                        f"<br><small><em>{current/optimal:.1f}x "
+                        f"<br><small><em>{current / optimal:.1f}x "
                         f"optimal group allocation of {optimal:,}."
                         f"</em></small>"
                     )
@@ -501,7 +501,7 @@ def compare_against_optimal(current, optimal):
                 else:
                     treatment = "Variant " + str(i)
 
-                cell_str += f"<br><em>{treatment}:</em> " f"{n_cell[i]:,} ({prop_cell[i]*100:.1f}%)"
+                cell_str += f"<br><em>{treatment}:</em> " f"{n_cell[i]:,} ({prop_cell[i] * 100:.1f}%)"
 
             display(widgets.HTML(cell_str))
             display(code_html)
diff --git a/tests/bayesian/test_betabinomial.py b/tests/bayesian/test_betabinomial.py
index 1779623..3a6bf74 100644
--- a/tests/bayesian/test_betabinomial.py
+++ b/tests/bayesian/test_betabinomial.py
@@ -9,7 +9,7 @@
 
 
 class TestCategorical(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": ["test", "control", "test2", "test", "control", "test2"],
@@ -148,7 +148,7 @@ def test_multiple_difference_level_as_reference(self):
 
 
 class TestOrdinal:
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -307,7 +307,7 @@ def test_multiple_difference_plot(self):
 
 
 class TestOrdinalPlusTwoCategorical(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
diff --git a/tests/frequentist/test_chisquared.py b/tests/frequentist/test_chisquared.py
index 97e67e8..7b68aed 100644
--- a/tests/frequentist/test_chisquared.py
+++ b/tests/frequentist/test_chisquared.py
@@ -24,7 +24,7 @@ def chart_data(chart_object, series_name):
 
 
 class TestCategorical(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -335,7 +335,7 @@ def test_achieved_power_groupby(self):
 
 
 class TestOrdinal(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -514,7 +514,7 @@ def test_sample_ratio_test(self):
 
 
 class TestOrdinalPlusTwoCategorical(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
diff --git a/tests/frequentist/test_experiment.py b/tests/frequentist/test_experiment.py
index ac4775c..34946a4 100644
--- a/tests/frequentist/test_experiment.py
+++ b/tests/frequentist/test_experiment.py
@@ -7,7 +7,7 @@
 
 
 class TestBootstrap(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
         n_bootstraps = int(5e5)
         self.data = pd.DataFrame(
diff --git a/tests/frequentist/test_ttest.py b/tests/frequentist/test_ttest.py
index f310606..61735c6 100644
--- a/tests/frequentist/test_ttest.py
+++ b/tests/frequentist/test_ttest.py
@@ -36,7 +36,7 @@ def chart_data(chart_object, series_name):
 
 
 class TestCategorical(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -244,7 +244,7 @@ def test_achieved_power(self):
 
 
 class TestOrdinal(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -403,7 +403,7 @@ def test_achieved_power(self):
 
 
 class TestOrdinalPlusTwoCategorical(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -805,7 +805,7 @@ def test_differece_plot_with_nims(self):
 
 
 class TestCategoricalBinomialData(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -877,7 +877,7 @@ def test_multiple_difference(self):
 
 
 class TestWithNims(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             [
                 {
diff --git a/tests/frequentist/test_ztest.py b/tests/frequentist/test_ztest.py
index b03b4f1..957029b 100644
--- a/tests/frequentist/test_ztest.py
+++ b/tests/frequentist/test_ztest.py
@@ -26,7 +26,7 @@
 
 
 class TestPoweredEffectContinuousSingleMetric(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -87,7 +87,7 @@ def test_powered_effect2(self):
 
 
 class TestPoweredEffectContinuousMultipleSuccessMetrics(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": ["test", "control", "test2", "test", "control", "test2"],
@@ -149,7 +149,7 @@ def test_powered_effect1(self):
 
 
 class TestPoweredEffectContinuousMultipleMetricTypes(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": ["test", "control", "test2", "test", "control", "test2"],
@@ -212,7 +212,7 @@ def test_powered_effect(self):
 
 
 class TestPoweredEffectContinuousMultipleMetricsSegments(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -342,7 +342,7 @@ def test_powered_effect(self):
 
 
 class TestPoweredEffectContinuousMultipleMetricsSegments2(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -472,7 +472,7 @@ def test_powered_effect(self):
 
 
 class TestPoweredEffectContinuousMultipleMetricsSegments3(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -605,7 +605,7 @@ def test_powered_effect(self):
 
 
 class TestPoweredEffectBinary(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -750,7 +750,7 @@ def test_powered_effect(self):
 
 
 class TestPoweredEffectBinaryOnlyGuardrail(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -816,7 +816,7 @@ def test_powered_effect(self):
 
 
 class TestBinary(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -922,7 +922,7 @@ def test_multiple_difference_plot(self):
 
 
 class TestCategoricalBinary(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -1038,7 +1038,7 @@ def test_multiple_difference_plot_groupby(self):
 
 
 class TestCategoricalContinuous(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -1126,7 +1126,7 @@ def test_multiple_difference_plot_groupby(self):
 
 
 class TestOrdinal(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -1230,7 +1230,7 @@ def test_multiple_difference_plot_groupby(self):
 
 
 class TestOrdinalPlusTwoCategorical(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "variation_name": [
@@ -1892,7 +1892,7 @@ def test_differece_plot_with_nims_in_df(self):
 
 
 class TestCategoricalBinomialData(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         self.data = pd.DataFrame(
@@ -2004,7 +2004,7 @@ def test_multiple_difference(self):
 
 
 class TestWithNims(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             [
                 {
@@ -2109,7 +2109,7 @@ def test_one_sided_ztest_negative(self):
 
 
 class TestSequentialOrdinalPlusTwoCategorical(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
         d = 50 + 1 * np.random.randn(60)
         u = np.floor(2000 + np.linspace(0, 1000, 60) + 10 * np.random.randn(60))
@@ -2636,7 +2636,7 @@ def test_multiple_difference_groupby_mixed_nims(self):
 
 
 class TestSequentialOrdinalPlusTwoCategorical2(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             [
                 {
@@ -3406,7 +3406,7 @@ def test_multiple_difference_groupby(self):
 
 
 class TestSequentialOneSided(object):
-    def setup(self):
+    def setup_method(self):
         DATE = "date"
         COUNT = "count"
         SUM = "sum"
@@ -3448,7 +3448,7 @@ def test_multiple_difference_groupby(self):
 
 
 class TestSequentialTwoSided(object):
-    def setup(self):
+    def setup_method(self):
         DATE = "date"
         COUNT = "count"
         SUM = "sum"
@@ -3489,7 +3489,7 @@ def test_multiple_difference_groupby(self):
 
 
 class TestSequentialOneSidedThreeGroups(object):
-    def setup(self):
+    def setup_method(self):
         DATE = "date"
         COUNT = "count"
         SUM = "sum"
@@ -3534,7 +3534,7 @@ def test_multiple_difference_groupby(self):
 
 
 class TestNimsWithNaN(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             {
                 "count": {
diff --git a/tests/frequentist/test_ztest_linreg.py b/tests/frequentist/test_ztest_linreg.py
index eecd4ad..cb81a14 100644
--- a/tests/frequentist/test_ztest_linreg.py
+++ b/tests/frequentist/test_ztest_linreg.py
@@ -7,7 +7,7 @@
 
 
 class TestUnivariateSingleMetric(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
         n = 10000
         d = np.random.randint(2, size=n)
@@ -74,7 +74,7 @@ def linreg(X, y):
 
 
 class TestUnivariateMultiMetric(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
         n = 20000
         d = np.random.randint(2, size=n)
@@ -164,7 +164,7 @@ def linreg(X, y):
 
 
 class TestUnivariateNoFeatures(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             [
                 {
@@ -214,7 +214,7 @@ def test_summary(self):
 
 
 class TestMultivariateSingleMetric(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         n = 10000
@@ -316,7 +316,7 @@ def linreg(X, y):
 
 
 class TestMultivariateMultipleMetrics(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
 
         n = 10000
@@ -441,7 +441,7 @@ def linreg(X, y):
 
 
 class TestUnivariateMultiMetricRequiredSampleSize(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
         n = 2000000
         d = np.random.randint(2, size=n)
@@ -512,7 +512,7 @@ def test_parameters_univariate_required_sample_size(self):
 
 
 class TestUnivariateSingleMetricWithBadPreExposureData(object):
-    def setup(self):
+    def setup_method(self):
         np.random.seed(123)
         n = 10000
         d = np.random.randint(2, size=n)
@@ -569,7 +569,7 @@ def test_parameters_univariate(self):
 
 
 class TestUnivariateSingleMetricNegativeVariance(object):
-    def setup(self):
+    def setup_method(self):
         self.data = pd.DataFrame(
             [
                 {
diff --git a/tox.ini b/tox.ini
index 38d0ebb..ca49fa1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,3 +15,8 @@ commands =
     flake8 spotify_confidence tests
     pytest {posargs}
 
+[flake8]
+max-line-length = 120
+ignore = E203,E231,W503
+exclude = .venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev
+

From 6f9cc1c4596299f9f5505be8d120491f6182227e Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 11:27:34 +0100
Subject: [PATCH 04/15] Pick more sensible dependencies

---
 pyproject.toml | 23 ++++++++++-------------
 tox.ini        | 20 +++++++++++++++++++-
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8d0a969..27edacb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,11 +16,11 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "numpy>=1.20.0,<3.0.0",
-    "scipy>=1.6.0",
-    "pandas>=1.2.0",
-    "statsmodels>=0.13.0,<1.0.0",
-    "chartify>=5.0.1",
+    "numpy>=1.21.0",
+    "scipy>=1.9.0",
+    "pandas>=1.4.0",
+    "statsmodels>=0.13.5",
+    "chartify>=5.0.0",
     "ipywidgets>=8.0.0",
 ]
 
@@ -29,14 +29,11 @@ dev = [
     "build",
     "twine",
     "black>=23.1.0",
-    "flake8>=4.0.1",
-    "tox>=4.4.7",
-    "pytest>=7.0.1",
-    "pytest-cov>=2.5.1",
-    "pytest-runner>=6.0.0",
-    "coverage>=4.5.1",
-    "pylint>=1.7.4",
-    "jupyterlab>=3.2.9",
+    "flake8>=6.0.0",
+    "tox>=4.0.0",
+    "pytest>=7.0.0",
+    "pytest-cov>=4.0.0",
+    "coverage>=7.0.0",
 ]
 
 [project.urls]
diff --git a/tox.ini b/tox.ini
index ca49fa1..77a9b79 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py39, py310, py311, py312
+envlist = py39, py310, py311, py312, py39-min
 isolated_build = True
 
 [gh-actions]
@@ -15,6 +15,24 @@ commands =
     flake8 spotify_confidence tests
     pytest {posargs}
 
+[testenv:py39-min]
+basepython = python3.9
+deps =
+    numpy==1.21.0
+    scipy==1.9.0
+    pandas==1.4.0
+    statsmodels==0.13.5
+    chartify==5.0.0
+    ipywidgets==8.0.0
+    black==23.1.0
+    flake8==6.0.0
+    pytest==7.0.0
+    pytest-cov==4.0.0
+    coverage==7.0.0
+commands =
+    flake8 spotify_confidence tests
+    pytest {posargs}
+
 [flake8]
 max-line-length = 120
 ignore = E203,E231,W503

From 951e1bed7ee084364dbc25c773eb69e9d70498f4 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 11:29:13 +0100
Subject: [PATCH 05/15] Add black check to tox

---
 tox.ini | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tox.ini b/tox.ini
index 77a9b79..15d4141 100644
--- a/tox.ini
+++ b/tox.ini
@@ -12,6 +12,7 @@ python =
 [testenv]
 extras = dev
 commands =
+    black --check --diff spotify_confidence tests
     flake8 spotify_confidence tests
     pytest {posargs}
 
@@ -30,6 +31,7 @@ deps =
     pytest-cov==4.0.0
     coverage==7.0.0
 commands =
+    black --check --diff spotify_confidence tests
     flake8 spotify_confidence tests
     pytest {posargs}
 

From ca2d9de15f981d7655a5233f52b216829779339a Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 11:37:59 +0100
Subject: [PATCH 06/15] Fix version for pytest

---
 spotify_confidence/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spotify_confidence/__init__.py b/spotify_confidence/__init__.py
index 6369a1f..d8e9f24 100644
--- a/spotify_confidence/__init__.py
+++ b/spotify_confidence/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pkg_resources import require as _require
+from importlib.metadata import version as _version
 from .analysis.bayesian.bayesian_models import BetaBinomial
 from spotify_confidence.analysis.frequentist.chi_squared import ChiSquared
 from spotify_confidence.analysis.frequentist.t_test import StudentsTTest
@@ -25,7 +25,7 @@
 from . import examples
 from .options import options
 
-__version__ = _require("spotify_confidence")[0].version
+__version__ = _version("spotify_confidence")
 
 __all__ = [
     "BetaBinomial",

From 8ea27a18451d24f34b5141daeb39d005af976825 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 13:55:01 +0100
Subject: [PATCH 07/15] add CLAUDE.md

---
 CLAUDE.md | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..fae842d
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,157 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Spotify Confidence is a Python library for A/B test analysis. It provides convenience wrappers around statsmodels functions for computing p-values and confidence intervals. The library supports both frequentist (Z-test, Student's T-test, Chi-squared) and Bayesian (BetaBinomial) statistical methods, with features for variance reduction, sequential testing, and sample size calculations.
+
+## Development Commands
+
+### Setup
+```bash
+# Install with development dependencies (including tox-uv)
+uv pip install -e ".[dev]"
+```
+
+### Testing
+```bash
+# Run all tests with coverage
+uv run pytest
+
+# Run tests without coverage reports
+uv run pytest --no-cov
+
+# Run specific test file
+uv run pytest tests/frequentist/test_z_test.py
+
+# Run specific test
+uv run pytest tests/frequentist/test_z_test.py::test_name
+
+# Run all tests across Python versions
+uv run tox
+```
+
+### Code Quality
+```bash
+# Format code with black (line length: 119)
+uv run black spotify_confidence tests
+
+# Check formatting without making changes
+uv run black --check --diff spotify_confidence tests
+
+# Lint with flake8 (max line length: 120)
+uv run flake8 spotify_confidence tests
+
+# Run all quality checks (as done in CI)
+uv run black --check --diff spotify_confidence tests && uv run flake8 spotify_confidence tests && uv run pytest
+```
+
+### Build
+```bash
+# Build distribution packages
+uv run python -m build
+```
+
+## Architecture
+
+### Core Design Pattern
+
+The library follows an object-oriented design with separation of concerns:
+
+1. **Statistical Test Classes**: High-level APIs (`ZTest`, `StudentsTTest`, `ChiSquared`, `BetaBinomial`, `ZTestLinreg`)
+2. **Experiment Class**: Base class containing shared analysis methods for frequentist tests
+3. **Computer Classes**: Perform the actual statistical computations
+4. **Grapher Classes**: Generate visualizations using Chartify
+
+All main test classes inherit from abstract base classes in `spotify_confidence/analysis/abstract_base_classes/`:
+- `ConfidenceABC`: Base for all statistical test classes
+- `ConfidenceComputerABC`: Base for computation logic
+- `ConfidenceGrapherABC`: Base for visualization logic
+
+### Module Structure
+
+```
+spotify_confidence/
+├── analysis/
+│   ├── abstract_base_classes/    # ABC definitions for the framework
+│   ├── frequentist/               # Frequentist statistical methods
+│   │   ├── confidence_computers/  # Statistical computation logic
+│   │   ├── experiment.py          # Base class for frequentist tests
+│   │   ├── z_test.py              # Z-test implementation
+│   │   ├── t_test.py              # Student's T-test implementation
+│   │   ├── chi_squared.py         # Chi-squared test
+│   │   ├── z_test_linreg.py       # Z-test with linear regression variance reduction
+│   │   ├── sequential_bound_solver.py  # Group sequential testing
+│   │   ├── multiple_comparison.py # Multiple testing correction
+│   │   └── sample_size_calculator.py
+│   ├── bayesian/                  # Bayesian methods
+│   │   └── bayesian_models.py     # BetaBinomial implementation
+│   ├── constants.py               # Shared constants
+│   └── confidence_utils.py        # Shared utility functions
+├── samplesize/                    # Sample size calculations
+├── examples.py                    # Example data generators
+├── chartgrid.py                   # Chart grid utilities
+└── options.py                     # Global configuration
+```
+
+### Key Classes and Their Relationships
+
+- **Experiment** (in `frequentist/experiment.py`): The core base class for frequentist tests. Provides methods like:
+  - `summary()`: Overall metric summaries
+  - `difference()`: Pairwise comparisons
+  - `multiple_difference()`: Multiple comparisons with correction
+  - `difference_plot()`, `summary_plot()`, etc.: Visualization methods
+  - `sample_size()`: Required sample size calculations
+  - `statistical_power()`: Power analysis
+
+- **ZTest, StudentsTTest, ChiSquared**: Thin wrappers that initialize `Experiment` with the appropriate computer and method
+
+- **Computer Classes** (in `frequentist/confidence_computers/`): Handle the statistical calculations
+  - `ZTestComputer`, `TTestComputer`, `ChiSquaredComputer`: Specific computation implementations
+  - All inherit from `ConfidenceComputerABC`
+
+- **ChartifyGrapher**: Implements visualization using the Chartify library
+
+### Data Model
+
+The library works with DataFrames containing sufficient statistics:
+- `numerator_column`: Sum or count (e.g., sum of conversions)
+- `denominator_column`: Total observations (e.g., total users)
+- `numerator_sum_squares_column`: Sum of squares (optional, for variance calculations)
+- `categorical_group_columns`: Treatment/control groups and other dimensions
+- `ordinal_group_column`: Time-based grouping for sequential analysis
+
+### Important Conventions
+
+1. **Method Column**: Tests add a `METHOD_COLUMN_NAME` to data indicating the test type (e.g., "z-test", "t-test")
+
+2. **Multiple Comparison Correction**: Supported methods defined in `constants.py`:
+   - Standard: bonferroni, holm, hommel, sidak, FDR methods
+   - SPOT-1 variants: Custom Spotify methods for specific use cases
+
+3. **Non-Inferiority Margins (NIMs)**: Can be specified as absolute values or relative percentages
+
+4. **Sequential Testing**: The `sequential_bound_solver.py` module implements group sequential designs with spending functions
+
+5. **Variance Reduction**: `ZTestLinreg` uses pre-exposure data to fit a linear model and reduce variance (CUPED method)
+
+## Testing Guidelines
+
+- Tests are organized to mirror the source structure under `tests/`
+- Build shared test data in each test class's `setup_method` (the pattern used by the existing test classes), or in pytest fixtures
+- Tests check both DataFrame outputs and chart generation
+- Coverage target is configured in `pyproject.toml`
+
+## Python Version Support
+
+Supports Python 3.9, 3.10, 3.11, and 3.12. The `tox.ini` includes a `py39-min` environment that tests with minimum dependency versions.
+
+The project uses `tox-uv` to leverage uv's fast package installation and environment management in tox, significantly speeding up multi-environment testing. The GitHub Actions CI workflow also uses uv for faster dependency installation.
+
+## Code Style
+
+- Black formatting with 119 character line length
+- Flake8 linting with max line length 120
+- Ignored flake8 rules: E203, E231, W503
+- Excluded from linting: `.venv`, `.tox`, `.git`, `dist`, `docs`, `*.egg`, `build`, `scratch.py`, `confidence_dev`

From d611f9bda8c389268dddb461fe39fd957ecf04b8 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 13:55:05 +0100
Subject: [PATCH 08/15] Use uv for CI as well

---
 .github/workflows/confidence.yml | 7 ++++---
 pyproject.toml                   | 1 +
 tox.ini                          | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml
index b2682e3..2f8d32e 100644
--- a/.github/workflows/confidence.yml
+++ b/.github/workflows/confidence.yml
@@ -21,10 +21,11 @@ jobs:
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
+    - name: Install uv
+      uses: astral-sh/setup-uv@v5
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        pip install -e ".[dev]"
-        pip install tox tox-gh-actions
+        uv pip install --system -e ".[dev]"
+        uv pip install --system tox tox-gh-actions
     - name: Test with tox
       run: tox
diff --git a/pyproject.toml b/pyproject.toml
index 27edacb..8543725 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ dev = [
     "black>=23.1.0",
     "flake8>=6.0.0",
     "tox>=4.0.0",
+    "tox-uv>=1.0.0",
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
     "coverage>=7.0.0",
diff --git a/tox.ini b/tox.ini
index 15d4141..2b76cad 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,7 @@
 [tox]
 envlist = py39, py310, py311, py312, py39-min
 isolated_build = True
+requires = tox-uv
 
 [gh-actions]
 python =

From 08956078a99f49b228af989a5ce8b776c5091d4b Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 14:45:40 +0100
Subject: [PATCH 09/15] Fix pandas deprecation warnings

---
 .../analysis/bayesian/bayesian_base.py        |  4 ++-
 .../analysis/bayesian/bayesian_models.py      | 28 +++++++++++++++----
 .../analysis/confidence_utils.py              |  4 ++-
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/spotify_confidence/analysis/bayesian/bayesian_base.py b/spotify_confidence/analysis/bayesian/bayesian_base.py
index 13f666e..9629127 100644
--- a/spotify_confidence/analysis/bayesian/bayesian_base.py
+++ b/spotify_confidence/analysis/bayesian/bayesian_base.py
@@ -447,7 +447,9 @@ def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_
     @staticmethod
     def _validate_levels(level_df, remaining_groups, level):
         try:
-            level_df.groupby(remaining_groups).get_group(level)
+            # When grouping with a length-1 list, get_group expects a tuple
+            group_key = (level,) if isinstance(remaining_groups, list) and len(remaining_groups) == 1 else level
+            level_df.groupby(remaining_groups).get_group(group_key)
         except (KeyError, ValueError):
             raise ValueError(
                 """
diff --git a/spotify_confidence/analysis/bayesian/bayesian_models.py b/spotify_confidence/analysis/bayesian/bayesian_models.py
index cd3e93a..f58f89b 100644
--- a/spotify_confidence/analysis/bayesian/bayesian_models.py
+++ b/spotify_confidence/analysis/bayesian/bayesian_models.py
@@ -185,9 +185,17 @@ def _categorical_summary_plot(self, level_name, level_df, remaining_groups, grou
             ch.set_legend_location("outside_bottom")
         return ch
 
-    def _difference_posteriors(self, data, level_1, level_2, absolute=True):
-        posterior_1 = self._sample_posterior(data.get_group(level_1))
-        posterior_2 = self._sample_posterior(data.get_group(level_2))
+    def _difference_posteriors(self, data, level_1, level_2, absolute=True, remaining_groups=None):
+        # When grouping with a length-1 list, get_group expects a tuple
+        if isinstance(remaining_groups, list) and len(remaining_groups) == 1:
+            level_1_key = (level_1,)
+            level_2_key = (level_2,)
+        else:
+            level_1_key = level_1
+            level_2_key = level_2
+
+        posterior_1 = self._sample_posterior(data.get_group(level_1_key))
+        posterior_2 = self._sample_posterior(data.get_group(level_2_key))
 
         if absolute:
             difference_posterior = posterior_2 - posterior_1
@@ -256,7 +264,7 @@ def _difference_and_difference_posterior(self, level_df, remaining_groups, level
         self._validate_levels(level_df, remaining_groups, level_2)
         # difference is posterior_2 - posterior_1
         difference_posterior = self._difference_posteriors(
-            level_df.groupby(remaining_groups), level_1, level_2, absolute
+            level_df.groupby(remaining_groups), level_1, level_2, absolute, remaining_groups
         )
         difference_df = self._differences(difference_posterior, level_1, level_2, absolute)
         return difference_df, difference_posterior
@@ -384,7 +392,11 @@ def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups
 
         self._validate_levels(level_df, remaining_groups, level)
 
-        posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys]
+        # When grouping with a length-1 list, get_group expects a tuple
+        if isinstance(remaining_groups, list) and len(remaining_groups) == 1:
+            posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys]
+        else:
+            posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
 
         var_indx = grouped_df_keys.index(level)
         other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level]
@@ -627,7 +639,11 @@ def _categorical_multiple_difference_chart(
 
         self._validate_levels(level_df, remaining_groups, level)
 
-        posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys]
+        # When grouping with a length-1 list, get_group expects a tuple
+        if isinstance(remaining_groups, list) and len(remaining_groups) == 1:
+            posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys]
+        else:
+            posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
 
         var_indx = grouped_df_keys.index(level)
 
diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py
index 29f241a..73585dc 100644
--- a/spotify_confidence/analysis/confidence_utils.py
+++ b/spotify_confidence/analysis/confidence_utils.py
@@ -98,7 +98,9 @@ def get_all_categorical_group_columns(
 def validate_levels(df: DataFrame, level_columns: Union[str, Iterable], levels: Iterable):
     for level in levels:
         try:
-            df.groupby(level_columns).get_group(level)
+            # When grouping with a length-1 list, get_group expects a tuple
+            group_key = (level,) if isinstance(level_columns, list) and len(level_columns) == 1 else level
+            df.groupby(level_columns).get_group(group_key)
         except (KeyError, ValueError):
             raise ValueError(
                 """

From 6045a4e042d514236e02bf3597aac3df995ffaad Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 15:40:54 +0100
Subject: [PATCH 10/15] Bump black to fix python 3.9 - but requires revert for
 warnings

---
 pyproject.toml                                |  2 +-
 .../analysis/bayesian/bayesian_base.py        |  4 +---
 .../analysis/bayesian/bayesian_models.py      | 24 ++++---------------
 .../analysis/confidence_utils.py              |  4 +---
 tox.ini                                       |  3 +--
 5 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8543725..a4ee754 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
 dev = [
     "build",
     "twine",
-    "black>=23.1.0",
+    "black>=23.7.0",
     "flake8>=6.0.0",
     "tox>=4.0.0",
     "tox-uv>=1.0.0",
diff --git a/spotify_confidence/analysis/bayesian/bayesian_base.py b/spotify_confidence/analysis/bayesian/bayesian_base.py
index 9629127..13f666e 100644
--- a/spotify_confidence/analysis/bayesian/bayesian_base.py
+++ b/spotify_confidence/analysis/bayesian/bayesian_base.py
@@ -447,9 +447,7 @@ def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_
     @staticmethod
     def _validate_levels(level_df, remaining_groups, level):
         try:
-            # When grouping with a length-1 list, get_group expects a tuple
-            group_key = (level,) if isinstance(remaining_groups, list) and len(remaining_groups) == 1 else level
-            level_df.groupby(remaining_groups).get_group(group_key)
+            level_df.groupby(remaining_groups).get_group(level)
         except (KeyError, ValueError):
             raise ValueError(
                 """
diff --git a/spotify_confidence/analysis/bayesian/bayesian_models.py b/spotify_confidence/analysis/bayesian/bayesian_models.py
index f58f89b..2755e1d 100644
--- a/spotify_confidence/analysis/bayesian/bayesian_models.py
+++ b/spotify_confidence/analysis/bayesian/bayesian_models.py
@@ -186,16 +186,8 @@ def _categorical_summary_plot(self, level_name, level_df, remaining_groups, grou
         return ch
 
     def _difference_posteriors(self, data, level_1, level_2, absolute=True, remaining_groups=None):
-        # When grouping with a length-1 list, get_group expects a tuple
-        if isinstance(remaining_groups, list) and len(remaining_groups) == 1:
-            level_1_key = (level_1,)
-            level_2_key = (level_2,)
-        else:
-            level_1_key = level_1
-            level_2_key = level_2
-
-        posterior_1 = self._sample_posterior(data.get_group(level_1_key))
-        posterior_2 = self._sample_posterior(data.get_group(level_2_key))
+        posterior_1 = self._sample_posterior(data.get_group(level_1))
+        posterior_2 = self._sample_posterior(data.get_group(level_2))
 
         if absolute:
             difference_posterior = posterior_2 - posterior_1
@@ -392,11 +384,7 @@ def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups
 
         self._validate_levels(level_df, remaining_groups, level)
 
-        # When grouping with a length-1 list, get_group expects a tuple
-        if isinstance(remaining_groups, list) and len(remaining_groups) == 1:
-            posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys]
-        else:
-            posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
+        posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
 
         var_indx = grouped_df_keys.index(level)
         other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level]
@@ -639,11 +627,7 @@ def _categorical_multiple_difference_chart(
 
         self._validate_levels(level_df, remaining_groups, level)
 
-        # When grouping with a length-1 list, get_group expects a tuple
-        if isinstance(remaining_groups, list) and len(remaining_groups) == 1:
-            posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys]
-        else:
-            posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
+        posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
 
         var_indx = grouped_df_keys.index(level)
 
diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py
index 73585dc..29f241a 100644
--- a/spotify_confidence/analysis/confidence_utils.py
+++ b/spotify_confidence/analysis/confidence_utils.py
@@ -98,9 +98,7 @@ def get_all_categorical_group_columns(
 def validate_levels(df: DataFrame, level_columns: Union[str, Iterable], levels: Iterable):
     for level in levels:
         try:
-            # When grouping with a length-1 list, get_group expects a tuple
-            group_key = (level,) if isinstance(level_columns, list) and len(level_columns) == 1 else level
-            df.groupby(level_columns).get_group(group_key)
+            df.groupby(level_columns).get_group(level)
         except (KeyError, ValueError):
             raise ValueError(
                 """
diff --git a/tox.ini b/tox.ini
index 2b76cad..a12faa4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -26,7 +26,7 @@ deps =
     statsmodels==0.13.5
     chartify==5.0.0
     ipywidgets==8.0.0
-    black==23.1.0
+    black==23.7.0
     flake8==6.0.0
     pytest==7.0.0
     pytest-cov==4.0.0
@@ -40,4 +40,3 @@ commands =
 max-line-length = 120
 ignore = E203,E231,W503
 exclude = .venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev
-

From df4df5daa66ca510d9cce0c5f817a9ca7526c37e Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 15:51:11 +0100
Subject: [PATCH 11/15] Improve GH actions

---
 .github/workflows/confidence.yml | 3 +++
 tox.ini                          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml
index 2f8d32e..c4553ed 100644
--- a/.github/workflows/confidence.yml
+++ b/.github/workflows/confidence.yml
@@ -23,6 +23,9 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install uv
       uses: astral-sh/setup-uv@v5
+      with:
+        enable-cache: true
+        cache-dependency-glob: "**/pyproject.toml"
     - name: Install dependencies
       run: |
         uv pip install --system -e ".[dev]"
diff --git a/tox.ini b/tox.ini
index a12faa4..fa686cb 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@ requires = tox-uv
 
 [gh-actions]
 python =
-    3.9: py39
+    3.9: py39, py39-min
     3.10: py310
     3.11: py311
     3.12: py312

From 570e7f6be4f78beeeb6b53699a9c7b7761603d75 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 16:14:38 +0100
Subject: [PATCH 12/15] Run tox and tests in parallel

---
 CONTRIBUTING.rst |  2 +-
 pyproject.toml   |  1 +
 tox.ini          | 14 +++++++++++---
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index a46b8b7..86b3bd7 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -95,7 +95,7 @@ Ready to contribute? Here's how to set up `confidence` for local development.
 
    To test across all supported Python versions (3.9, 3.10, 3.11, 3.12)::
 
-    $ uv run tox
+    $ uv run tox -p auto
 
    Note: tox requires all Python versions to be installed on your system.
 
diff --git a/pyproject.toml b/pyproject.toml
index a4ee754..68a5837 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dev = [
     "tox-uv>=1.0.0",
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
+    "pytest-xdist>=3.0.2",
     "coverage>=7.0.0",
 ]
 
diff --git a/tox.ini b/tox.ini
index fa686cb..064fa09 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@ requires = tox-uv
 
 [gh-actions]
 python =
-    3.9: py39, py39-min
+    3.9: py39-min
     3.10: py310
     3.11: py311
     3.12: py312
@@ -15,7 +15,14 @@ extras = dev
 commands =
     black --check --diff spotify_confidence tests
     flake8 spotify_confidence tests
-    pytest {posargs}
+    pytest -n auto --no-cov --basetemp={envtmpdir} {posargs}
+
+[testenv:py312]
+extras = dev
+commands =
+    black --check --diff spotify_confidence tests
+    flake8 spotify_confidence tests
+    pytest -n auto --basetemp={envtmpdir} {posargs}
 
 [testenv:py39-min]
 basepython = python3.9
@@ -30,11 +37,12 @@ deps =
     flake8==6.0.0
     pytest==7.0.0
     pytest-cov==4.0.0
+    pytest-xdist==3.0.2
     coverage==7.0.0
 commands =
     black --check --diff spotify_confidence tests
     flake8 spotify_confidence tests
-    pytest {posargs}
+    pytest -n auto --no-cov --basetemp={envtmpdir} {posargs}
 
 [flake8]
 max-line-length = 120

From f6cc8b5b52a0f004269baf688cfcbbc9a4ba64aa Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 16:20:54 +0100
Subject: [PATCH 13/15] revert

---
 spotify_confidence/analysis/bayesian/bayesian_models.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spotify_confidence/analysis/bayesian/bayesian_models.py b/spotify_confidence/analysis/bayesian/bayesian_models.py
index 2755e1d..cd3e93a 100644
--- a/spotify_confidence/analysis/bayesian/bayesian_models.py
+++ b/spotify_confidence/analysis/bayesian/bayesian_models.py
@@ -185,7 +185,7 @@ def _categorical_summary_plot(self, level_name, level_df, remaining_groups, grou
             ch.set_legend_location("outside_bottom")
         return ch
 
-    def _difference_posteriors(self, data, level_1, level_2, absolute=True, remaining_groups=None):
+    def _difference_posteriors(self, data, level_1, level_2, absolute=True):
         posterior_1 = self._sample_posterior(data.get_group(level_1))
         posterior_2 = self._sample_posterior(data.get_group(level_2))
 
@@ -256,7 +256,7 @@ def _difference_and_difference_posterior(self, level_df, remaining_groups, level
         self._validate_levels(level_df, remaining_groups, level_2)
         # difference is posterior_2 - posterior_1
         difference_posterior = self._difference_posteriors(
-            level_df.groupby(remaining_groups), level_1, level_2, absolute, remaining_groups
+            level_df.groupby(remaining_groups), level_1, level_2, absolute
         )
         difference_df = self._differences(difference_posterior, level_1, level_2, absolute)
         return difference_df, difference_posterior
@@ -384,7 +384,7 @@ def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups
 
         self._validate_levels(level_df, remaining_groups, level)
 
-        posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
+        posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys]
 
         var_indx = grouped_df_keys.index(level)
         other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level]
@@ -627,7 +627,7 @@ def _categorical_multiple_difference_chart(
 
         self._validate_levels(level_df, remaining_groups, level)
 
-        posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys]
+        posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys]
 
         var_indx = grouped_df_keys.index(level)
 

From 2ef27f327215239bf7097dcc945db5ae73193902 Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 16:40:34 +0100
Subject: [PATCH 14/15] Execute tests in parallel

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 68a5837..57fda33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,7 @@ line-length = 119
 target-version = ["py39", "py310", "py311", "py312"]
 
 [tool.pytest.ini_options]
-addopts = "-v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing"
+addopts = "-v -n auto --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing"
 testpaths = ["tests"]
 
 [tool.coverage.run]

From 3884c88e941c57017c88149ebbd961c0e23567ac Mon Sep 17 00:00:00 2001
From: Benjamin Elbers <benjamine@spotify.com>
Date: Tue, 23 Dec 2025 16:55:10 +0100
Subject: [PATCH 15/15] don't skip tests that are actually not that slow

---
 tests/frequentist/test_bounds.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/frequentist/test_bounds.py b/tests/frequentist/test_bounds.py
index 5c2a0f8..c85d7e9 100644
--- a/tests/frequentist/test_bounds.py
+++ b/tests/frequentist/test_bounds.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import pytest
 import time
 import numpy as np
 from pandas import Timestamp
@@ -10,7 +9,6 @@
 )
 
 
-@pytest.mark.skip(reason="Skipping because this test is very slow")
 def test_many_days():
     """
     This input (based on a real experiment) is very long, which can cause slow calculation
@@ -404,7 +402,6 @@ def test_many_days():
     assert (time.time() - start_time) < 0.01
 
 
-@pytest.mark.skip(reason="Skipping because this test is very slow")
 def test_many_days_fast_and_no_crash():
     """
     This is based on experiment 1735 on 26.11.2020. The calculation of the corresponding bounds takes many minutes