diff --git a/.flake8 b/.flake8 deleted file mode 100644 index adf399e..0000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 120 -ignore = E203,E231,W503 diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml index d657dc6..c4553ed 100644 --- a/.github/workflows/confidence.yml +++ b/.github/workflows/confidence.yml @@ -13,18 +13,22 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + cache-dependency-glob: "**/pyproject.toml" - name: Install dependencies run: | - python -m pip install --upgrade pip - if [ -f requirements_dev.txt ]; then pip install -r requirements_dev.txt; fi - python -m pip install tox tox-gh-actions + uv pip install --system -e ".[dev]" + uv pip install --system tox tox-gh-actions - name: Test with tox run: tox diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4f2e205..fb9d87f 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -18,11 +18,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.gitignore b/.gitignore index 9cd5fd6..edcb3af 100644 --- a/.gitignore +++ b/.gitignore @@ -90,4 +90,8 @@ ENV/ .DS_store -.idea/ \ No newline at end of file +.idea/ + +# uv +uv.lock +.venv/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md 
new file mode 100644 index 0000000..fae842d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,157 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Spotify Confidence is a Python library for A/B test analysis. It provides convenience wrappers around statsmodel's functions for computing p-values and confidence intervals. The library supports both frequentist (Z-test, Student's T-test, Chi-squared) and Bayesian (BetaBinomial) statistical methods, with features for variance reduction, sequential testing, and sample size calculations. + +## Development Commands + +### Setup +```bash +# Install with development dependencies (including tox-uv) +uv pip install -e ".[dev]" +``` + +### Testing +```bash +# Run all tests with coverage +uv run pytest + +# Run tests without coverage reports +uv run pytest --no-cov + +# Run specific test file +uv run pytest tests/frequentist/test_z_test.py + +# Run specific test +uv run pytest tests/frequentist/test_z_test.py::test_name + +# Run all tests across Python versions +uv run tox +``` + +### Code Quality +```bash +# Format code with black (line length: 119) +uv run black spotify_confidence tests + +# Check formatting without making changes +uv run black --check --diff spotify_confidence tests + +# Lint with flake8 (max line length: 120) +uv run flake8 spotify_confidence tests + +# Run all quality checks (as done in CI) +uv run black --check --diff spotify_confidence tests && uv run flake8 spotify_confidence tests && uv run pytest +``` + +### Build +```bash +# Build distribution packages +uv run python -m build +``` + +## Architecture + +### Core Design Pattern + +The library follows an object-oriented design with separation of concerns: + +1. **Statistical Test Classes**: High-level APIs (`ZTest`, `StudentsTTest`, `ChiSquared`, `BetaBinomial`, `ZTestLinreg`) +2. 
**Experiment Class**: Base class containing shared analysis methods for frequentist tests +3. **Computer Classes**: Perform the actual statistical computations +4. **Grapher Classes**: Generate visualizations using Chartify + +All main test classes inherit from abstract base classes in `spotify_confidence/analysis/abstract_base_classes/`: +- `ConfidenceABC`: Base for all statistical test classes +- `ConfidenceComputerABC`: Base for computation logic +- `ConfidenceGrapherABC`: Base for visualization logic + +### Module Structure + +``` +spotify_confidence/ +├── analysis/ +│ ├── abstract_base_classes/ # ABC definitions for the framework +│ ├── frequentist/ # Frequentist statistical methods +│ │ ├── confidence_computers/ # Statistical computation logic +│ │ ├── experiment.py # Base class for frequentist tests +│ │ ├── z_test.py # Z-test implementation +│ │ ├── t_test.py # Student's T-test implementation +│ │ ├── chi_squared.py # Chi-squared test +│ │ ├── z_test_linreg.py # Z-test with linear regression variance reduction +│ │ ├── sequential_bound_solver.py # Group sequential testing +│ │ ├── multiple_comparison.py # Multiple testing correction +│ │ └── sample_size_calculator.py +│ ├── bayesian/ # Bayesian methods +│ │ └── bayesian_models.py # BetaBinomial implementation +│ ├── constants.py # Shared constants +│ └── confidence_utils.py # Shared utility functions +├── samplesize/ # Sample size calculations +├── examples.py # Example data generators +├── chartgrid.py # Chart grid utilities +└── options.py # Global configuration +``` + +### Key Classes and Their Relationships + +- **Experiment** (in `frequentist/experiment.py`): The core base class for frequentist tests. 
Provides methods like: + - `summary()`: Overall metric summaries + - `difference()`: Pairwise comparisons + - `multiple_difference()`: Multiple comparisons with correction + - `difference_plot()`, `summary_plot()`, etc.: Visualization methods + - `sample_size()`: Required sample size calculations + - `statistical_power()`: Power analysis + +- **ZTest, StudentsTTest, ChiSquared**: Thin wrappers that initialize `Experiment` with the appropriate computer and method + +- **Computer Classes** (in `frequentist/confidence_computers/`): Handle the statistical calculations + - `ZTestComputer`, `TTestComputer`, `ChiSquaredComputer`: Specific computation implementations + - All inherit from `ConfidenceComputerABC` + +- **ChartifyGrapher**: Implements visualization using the Chartify library + +### Data Model + +The library works with DataFrames containing sufficient statistics: +- `numerator_column`: Sum or count (e.g., sum of conversions) +- `denominator_column`: Total observations (e.g., total users) +- `numerator_sum_squares_column`: Sum of squares (optional, for variance calculations) +- `categorical_group_columns`: Treatment/control groups and other dimensions +- `ordinal_group_column`: Time-based grouping for sequential analysis + +### Important Conventions + +1. **Method Column**: Tests add a `METHOD_COLUMN_NAME` to data indicating the test type (e.g., "z-test", "t-test") + +2. **Multiple Comparison Correction**: Supported methods defined in `constants.py`: + - Standard: bonferroni, holm, hommel, sidak, FDR methods + - SPOT-1 variants: Custom Spotify methods for specific use cases + +3. **Non-Inferiority Margins (NIMs)**: Can be specified as absolute values or relative percentages + +4. **Sequential Testing**: The `sequential_bound_solver.py` module implements group sequential designs with spending functions + +5. 
**Variance Reduction**: `ZTestLinreg` uses pre-exposure data to fit a linear model and reduce variance (CUPED method) + +## Testing Guidelines + +- Tests are organized to mirror the source structure under `tests/` +- Use pytest fixtures for common test data +- Tests check both DataFrame outputs and chart generation +- Coverage target is configured in `pyproject.toml` + +## Python Version Support + +Supports Python 3.9, 3.10, 3.11, and 3.12. The `tox.ini` includes a `py39-min` environment that tests with minimum dependency versions. + +The project uses `tox-uv` to leverage uv's fast package installation and environment management in tox, significantly speeding up multi-environment testing. The GitHub Actions CI workflow also uses uv for faster dependency installation. + +## Code Style + +- Black formatting with 119 character line length +- Flake8 linting with max line length 120 +- Ignored flake8 rules: E203, E231, W503 +- Excluded from linting: `.venv`, `.tox`, `dist`, `build`, `scratch.py`, `confidence_dev` diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index b175338..86b3bd7 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -57,41 +57,55 @@ Get Started! Ready to contribute? Here's how to set up `confidence` for local development. +**Prerequisites:** + +* `uv `_ - Fast Python package installer (recommended) +* Python 3.9 or later + 1. Fork the `confidence` repo on GitHub. 2. Clone your fork locally:: - $ git clone https://github.com/spotify/confidence + $ git clone git@github.com:your_username/confidence.git + $ cd confidence + +3. Set up your development environment using uv:: + + $ uv venv + $ uv pip install -e ".[dev]" -3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: + This creates a virtual environment and installs the package in editable mode with all development dependencies. - $ mkvirtualenv confidence_dev - $ cd confidence/ - $ tox +4. 
Verify your setup by running the tests:: - The tox command will install the dev requirements in requirements_dev.txt and run all tests. + $ uv run pytest -4. Create a branch for local development:: + This should run all tests and show they pass. + +5. Create a branch for local development:: $ git checkout -b name-of-your-bugfix-or-feature Now you can make your changes locally. -5. When you're done making changes, format using `make black`, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: +6. When you're done making changes, check that your changes pass all quality checks:: + + $ uv run black spotify_confidence tests --line-length 119 # Format code + $ uv run flake8 spotify_confidence tests # Lint code + $ uv run pytest # Run tests + + To test across all supported Python versions (3.9, 3.10, 3.11, 3.12):: - $ make black - $ flake8 confidence tests - $ python setup.py test or py.test - $ tox + $ uv run tox -p auto - To get flake8 and tox, just pip install them into your virtualenv. + Note: tox requires all Python versions to be installed on your system. -6. Commit your changes and push your branch to GitHub:: +7. Commit your changes and push your branch to GitHub:: $ git add . $ git commit -m "Your detailed description of your changes." $ git push origin name-of-your-bugfix-or-feature -7. Submit a pull request through the GitHub website. +8. Submit a pull request through the GitHub website. Pull Request Guidelines ----------------------- @@ -101,23 +115,36 @@ Before you submit a pull request, check that it meets these guidelines: 1. The pull request should include tests. 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the - feature to the list in README.rst. -3. The pull request should work for Python 3.6 and 3.7. Check - and make sure that the tests pass for all supported Python versions. + feature to the list in README.md. 
+3. The pull request should work for Python 3.9, 3.10, 3.11, and 3.12. The CI + pipeline will automatically test all supported Python versions. Tips ---- To run a subset of tests:: -$ py.test tests.test_confidence + $ uv run pytest tests/frequentist/test_ttest.py + +To run a specific test:: + + $ uv run pytest tests/frequentist/test_ttest.py::TestCategorical::test_summary + +To run tests with verbose output:: + + $ uv run pytest -v + +To see test coverage:: + + $ uv run pytest --cov=spotify_confidence --cov-report=html + $ open htmlcov/index.html Release Process ----------------------- While commits and pull requests are welcome from any contributor, we try to -simplify the distribution process for everyone by managing the release +simplify the distribution process for everyone by managing the release process with specific contributors serving in the role of Release Managers. Release Managers are responsible for: @@ -142,7 +169,7 @@ PATCH version when you make backwards-compatible bug fixes. Release Strategy ~~~~~~~~~~~~~~~~ -Each new release will be made on its own branch, with the branch Master +Each new release will be made on its own branch, with the branch Master representing the most recent, furthest release. Releases are published to PyPi automatically once a new release branch is merged to Master. 
Additionally, new releases are also tracked manually on `github diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2353e85..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,10 +0,0 @@ -include README.md -include CONTRIBUTING.md -include AUTHORS.md - -recursive-include tests * -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] - -recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.py -recursive-include confidence/ *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.py \ No newline at end of file diff --git a/Makefile b/Makefile index b8f0ff3..8c0ff85 100644 --- a/Makefile +++ b/Makefile @@ -47,14 +47,17 @@ clean-test: ## remove test and coverage artifacts rm -f .coverage rm -fr htmlcov/ +format: ## format code with black + black spotify_confidence tests --line-length 119 + lint: ## check style with flake8 - flake8 confidence tests + flake8 spotify_confidence tests test: ## run tests quickly with the default Python python3 -m pytest coverage: ## check code coverage quickly with the default Python - coverage run --source confidence -m pytest + coverage run --source spotify_confidence -m pytest coverage report -m coverage html $(BROWSER) htmlcov/index.html @@ -86,10 +89,8 @@ install: clean ## install the package to the active Python's site-packages pip install -e . 
install-test: clean - pip3 install --index-url https://test.pypi.org/simple/ confidence-spotify + pip3 install --index-url https://test.pypi.org/simple/ spotify-confidence install-prod: clean - pip3 install confidence-spotify + pip3 install spotify-confidence -black: - black spotify_confidence tests --line-length 119 diff --git a/pyproject.toml b/pyproject.toml index f6c1689..57fda33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,62 @@ [build-system] -requires = [ - "setuptools>=42", - "wheel", -] +requires = ["setuptools>=61.2"] build-backend = "setuptools.build_meta" + +[project] +name = "spotify-confidence" +version = "4.0.0" +description = "Package for calculating and visualising confidence intervals, e.g. for A/B test analysis." +readme = "README.md" +license = {file = "LICENSE"} +authors = [{name = "Per Sillren", email = "pers@spotify.com"}] +requires-python = ">=3.9" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dependencies = [ + "numpy>=1.21.0", + "scipy>=1.9.0", + "pandas>=1.4.0", + "statsmodels>=0.13.5", + "chartify>=5.0.0", + "ipywidgets>=8.0.0", +] + +[project.optional-dependencies] +dev = [ + "build", + "twine", + "black>=23.7.0", + "flake8>=6.0.0", + "tox>=4.0.0", + "tox-uv>=1.0.0", + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "pytest-xdist>=3.0.2", + "coverage>=7.0.0", +] + +[project.urls] +Homepage = "https://github.com/spotify/confidence" +"Bug Tracker" = "https://github.com/spotify/confidence/issues" + +[tool.setuptools.packages.find] +where = ["."] +include = ["spotify_confidence*"] +namespaces = false + +[tool.black] +line-length = 119 +target-version = ["py39", "py310", "py311", "py312"] + +[tool.pytest.ini_options] +addopts = "-v -n auto --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing" +testpaths = ["tests"] + +[tool.coverage.run] +source = ["spotify_confidence"] + 
+[tool.coverage.report] +show_missing = true diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index cf02b2e..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,20 +0,0 @@ --e . -pip==23.0.1 -build -twine -bumpversion==0.5.3 -watchdog==0.8.3 -flake8==4.0.1 -tox==4.4.7 -Sphinx==1.4.8 -pytest-runner==6.0.0 -jupyterlab==3.2.9 -pylint==1.7.4 -coverage==4.5.1 -pytest==7.0.1 -pytest-cov==2.5.1 -ipywidgets>=7.1.0 -black==23.1.0 -ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability -setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability -tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 4186a42..0000000 --- a/setup.cfg +++ /dev/null @@ -1,31 +0,0 @@ -[metadata] -name = spotify-confidence -version = 4.0.0 -author = Per Sillren -author_email = pers@spotify.com -description = Package for calculating and visualising confidence intervals, e.g. for A/B test analysis. -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/spotify/confidence -project_urls = - Bug Tracker = https://github.com/spotify/confidence/issues -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: Apache Software License - Operating System :: OS Independent - -[options] -package_dir = - = . -packages = find: -python_requires = >=3.9 -install_requires = - numpy>=1.20.0,<2.0.0 - scipy>=1.6.0 - pandas>=1.2.0 - statsmodels>=0.13.0,<1.0.0 - chartify>=5.0.1 - ipywidgets>=8.0.0 - -[options.packages.find] -where = . 
diff --git a/setup.py b/setup.py deleted file mode 100644 index b908cbe..0000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -import setuptools - -setuptools.setup() diff --git a/spotify_confidence/__init__.py b/spotify_confidence/__init__.py index 6369a1f..d8e9f24 100644 --- a/spotify_confidence/__init__.py +++ b/spotify_confidence/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pkg_resources import require as _require +from importlib.metadata import version as _version from .analysis.bayesian.bayesian_models import BetaBinomial from spotify_confidence.analysis.frequentist.chi_squared import ChiSquared from spotify_confidence.analysis.frequentist.t_test import StudentsTTest @@ -25,7 +25,7 @@ from . import examples from .options import options -__version__ = _require("spotify_confidence")[0].version +__version__ = _version("spotify_confidence") __all__ = [ "BetaBinomial", diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py index 44db803..29f241a 100644 --- a/spotify_confidence/analysis/confidence_utils.py +++ b/spotify_confidence/analysis/confidence_utils.py @@ -250,5 +250,5 @@ def dfmatmul(x, y, outer=True): def de_list_if_length_one(x): """Return first element of x if x is a list of length one""" - is_iterable = type(x) != str and isinstance(x, Iterable) + is_iterable = not isinstance(x, str) and isinstance(x, Iterable) return x[0] if is_iterable and len(x) == 1 else x diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py index 4c47c22..4ea6105 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py @@ -351,7 +351,7 @@ def compute_differences( level_columns = 
get_remaning_groups(self._all_group_columns, groupby) difference_df = self._compute_differences( level_columns=level_columns, - levels=[levels] if type(levels) == tuple else levels, + levels=[levels] if isinstance(levels, tuple) else levels, absolute=absolute, groupby=groupby, level_as_reference=True, diff --git a/spotify_confidence/samplesize/sample_size_calculator.py b/spotify_confidence/samplesize/sample_size_calculator.py index 10b77d5..4c22506 100644 --- a/spotify_confidence/samplesize/sample_size_calculator.py +++ b/spotify_confidence/samplesize/sample_size_calculator.py @@ -422,18 +422,18 @@ def show_samplesize( ) code_html = widgets.HTML( "
"
-                    f"SampleSize.continuous(average_absolute_mde={ mde },\n"
-                    f"                      baseline_variance={ baseline },\n"
-                    f"                      alpha={ alpha },\n"
-                    f"                      power={ power },\n"
-                    f"                      treatments={ treatments },\n"
+                    f"SampleSize.continuous(average_absolute_mde={mde},\n"
+                    f"                      baseline_variance={baseline},\n"
+                    f"                      alpha={alpha},\n"
+                    f"                      power={power},\n"
+                    f"                      treatments={treatments},\n"
                     f"                      comparisons="
-                    f"'{ comparisons }',\n"
+                    f"'{comparisons}',\n"
                     f"                      treatment_costs="
-                    f"{ list(treatment_costs) },\n"
+                    f"{list(treatment_costs)},\n"
                     f"                      treatment_allocations=None,\n"
                     f"                      bonferroni_correction="
-                    f"{ bonferroni_correction })"
+                    f"{bonferroni_correction})"
                     "
" ) else: @@ -461,19 +461,19 @@ def show_samplesize( ) code_html = widgets.HTML( "
"
-                    f"SampleSize.binomial(absolute_percentage_mde={ mde },\n"
+                    f"SampleSize.binomial(absolute_percentage_mde={mde},\n"
                     f"                    baseline_proportion="
-                    f"{ baseline },\n"
-                    f"                    alpha={ alpha },\n"
-                    f"                    power={ power },\n"
-                    f"                    treatments={ treatments },\n"
+                    f"{baseline},\n"
+                    f"                    alpha={alpha},\n"
+                    f"                    power={power},\n"
+                    f"                    treatments={treatments},\n"
                     f"                    comparisons="
-                    f"'{ comparisons }',\n"
+                    f"'{comparisons}',\n"
                     f"                    treatment_costs="
-                    f"{ list(treatment_costs) },\n"
+                    f"{list(treatment_costs)},\n"
                     f"                    treatment_allocations=None,\n"
                     f"                    bonferroni_correction="
-                    f"{ bonferroni_correction })"
+                    f"{bonferroni_correction})"
                     "
" ) @@ -482,7 +482,7 @@ def compare_against_optimal(current, optimal): return "" else: return ( - f"
{current/optimal:.1f}x " + f"
{current / optimal:.1f}x " f"optimal group allocation of {optimal:,}." f"" ) @@ -501,7 +501,7 @@ def compare_against_optimal(current, optimal): else: treatment = "Variant " + str(i) - cell_str += f"
{treatment}: " f"{n_cell[i]:,} ({prop_cell[i]*100:.1f}%)" + cell_str += f"
{treatment}: " f"{n_cell[i]:,} ({prop_cell[i] * 100:.1f}%)" display(widgets.HTML(cell_str)) display(code_html) diff --git a/tests/bayesian/test_betabinomial.py b/tests/bayesian/test_betabinomial.py index 1779623..3a6bf74 100644 --- a/tests/bayesian/test_betabinomial.py +++ b/tests/bayesian/test_betabinomial.py @@ -9,7 +9,7 @@ class TestCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": ["test", "control", "test2", "test", "control", "test2"], @@ -148,7 +148,7 @@ def test_multiple_difference_level_as_reference(self): class TestOrdinal: - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -307,7 +307,7 @@ def test_multiple_difference_plot(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ diff --git a/tests/frequentist/test_bounds.py b/tests/frequentist/test_bounds.py index 5c2a0f8..c85d7e9 100644 --- a/tests/frequentist/test_bounds.py +++ b/tests/frequentist/test_bounds.py @@ -1,5 +1,4 @@ import pandas as pd -import pytest import time import numpy as np from pandas import Timestamp @@ -10,7 +9,6 @@ ) -@pytest.mark.skip(reason="Skipping because this test is very slow") def test_many_days(): """ This input (based on a real experiment) is very long, which can cause slow calculation @@ -404,7 +402,6 @@ def test_many_days(): assert (time.time() - start_time) < 0.01 -@pytest.mark.skip(reason="Skipping because this test is very slow") def test_many_days_fast_and_no_crash(): """ This is based on experiment 1735 on 26.11.2020. 
The calculation of the corresponding bounds takes many minutes diff --git a/tests/frequentist/test_chisquared.py b/tests/frequentist/test_chisquared.py index 97e67e8..7b68aed 100644 --- a/tests/frequentist/test_chisquared.py +++ b/tests/frequentist/test_chisquared.py @@ -24,7 +24,7 @@ def chart_data(chart_object, series_name): class TestCategorical(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -335,7 +335,7 @@ def test_achieved_power_groupby(self): class TestOrdinal(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -514,7 +514,7 @@ def test_sample_ratio_test(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ diff --git a/tests/frequentist/test_experiment.py b/tests/frequentist/test_experiment.py index ac4775c..34946a4 100644 --- a/tests/frequentist/test_experiment.py +++ b/tests/frequentist/test_experiment.py @@ -7,7 +7,7 @@ class TestBootstrap(object): - def setup(self): + def setup_method(self): np.random.seed(123) n_bootstraps = int(5e5) self.data = pd.DataFrame( diff --git a/tests/frequentist/test_ttest.py b/tests/frequentist/test_ttest.py index f310606..61735c6 100644 --- a/tests/frequentist/test_ttest.py +++ b/tests/frequentist/test_ttest.py @@ -36,7 +36,7 @@ def chart_data(chart_object, series_name): class TestCategorical(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -244,7 +244,7 @@ def test_achieved_power(self): class TestOrdinal(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -403,7 +403,7 @@ def test_achieved_power(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -805,7 +805,7 @@ def test_differece_plot_with_nims(self): class 
TestCategoricalBinomialData(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -877,7 +877,7 @@ def test_multiple_difference(self): class TestWithNims(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { diff --git a/tests/frequentist/test_ztest.py b/tests/frequentist/test_ztest.py index b03b4f1..957029b 100644 --- a/tests/frequentist/test_ztest.py +++ b/tests/frequentist/test_ztest.py @@ -26,7 +26,7 @@ class TestPoweredEffectContinuousSingleMetric(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -87,7 +87,7 @@ def test_powered_effect2(self): class TestPoweredEffectContinuousMultipleSuccessMetrics(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": ["test", "control", "test2", "test", "control", "test2"], @@ -149,7 +149,7 @@ def test_powered_effect1(self): class TestPoweredEffectContinuousMultipleMetricTypes(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": ["test", "control", "test2", "test", "control", "test2"], @@ -212,7 +212,7 @@ def test_powered_effect(self): class TestPoweredEffectContinuousMultipleMetricsSegments(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -342,7 +342,7 @@ def test_powered_effect(self): class TestPoweredEffectContinuousMultipleMetricsSegments2(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -472,7 +472,7 @@ def test_powered_effect(self): class TestPoweredEffectContinuousMultipleMetricsSegments3(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -605,7 +605,7 @@ def test_powered_effect(self): class TestPoweredEffectBinary(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -750,7 +750,7 @@ 
def test_powered_effect(self): class TestPoweredEffectBinaryOnlyGuardrail(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -816,7 +816,7 @@ def test_powered_effect(self): class TestBinary(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -922,7 +922,7 @@ def test_multiple_difference_plot(self): class TestCategoricalBinary(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -1038,7 +1038,7 @@ def test_multiple_difference_plot_groupby(self): class TestCategoricalContinuous(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -1126,7 +1126,7 @@ def test_multiple_difference_plot_groupby(self): class TestOrdinal(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -1230,7 +1230,7 @@ def test_multiple_difference_plot_groupby(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -1892,7 +1892,7 @@ def test_differece_plot_with_nims_in_df(self): class TestCategoricalBinomialData(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -2004,7 +2004,7 @@ def test_multiple_difference(self): class TestWithNims(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { @@ -2109,7 +2109,7 @@ def test_one_sided_ztest_negative(self): class TestSequentialOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): np.random.seed(123) d = 50 + 1 * np.random.randn(60) u = np.floor(2000 + np.linspace(0, 1000, 60) + 10 * np.random.randn(60)) @@ -2636,7 +2636,7 @@ def test_multiple_difference_groupby_mixed_nims(self): class TestSequentialOrdinalPlusTwoCategorical2(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { @@ 
-3406,7 +3406,7 @@ def test_multiple_difference_groupby(self): class TestSequentialOneSided(object): - def setup(self): + def setup_method(self): DATE = "date" COUNT = "count" SUM = "sum" @@ -3448,7 +3448,7 @@ def test_multiple_difference_groupby(self): class TestSequentialTwoSided(object): - def setup(self): + def setup_method(self): DATE = "date" COUNT = "count" SUM = "sum" @@ -3489,7 +3489,7 @@ def test_multiple_difference_groupby(self): class TestSequentialOneSidedThreeGroups(object): - def setup(self): + def setup_method(self): DATE = "date" COUNT = "count" SUM = "sum" @@ -3534,7 +3534,7 @@ def test_multiple_difference_groupby(self): class TestNimsWithNaN(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "count": { diff --git a/tests/frequentist/test_ztest_linreg.py b/tests/frequentist/test_ztest_linreg.py index eecd4ad..cb81a14 100644 --- a/tests/frequentist/test_ztest_linreg.py +++ b/tests/frequentist/test_ztest_linreg.py @@ -7,7 +7,7 @@ class TestUnivariateSingleMetric(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 d = np.random.randint(2, size=n) @@ -74,7 +74,7 @@ def linreg(X, y): class TestUnivariateMultiMetric(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 20000 d = np.random.randint(2, size=n) @@ -164,7 +164,7 @@ def linreg(X, y): class TestUnivariateNoFeatures(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { @@ -214,7 +214,7 @@ def test_summary(self): class TestMultivariateSingleMetric(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 @@ -316,7 +316,7 @@ def linreg(X, y): class TestMultivariateMultipleMetrics(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 @@ -441,7 +441,7 @@ def linreg(X, y): class TestUnivariateMultiMetricRequiredSampleSize(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 2000000 d = 
np.random.randint(2, size=n) @@ -512,7 +512,7 @@ def test_parameters_univariate_required_sample_size(self): class TestUnivariateSingleMetricWithBadPreExposureData(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 d = np.random.randint(2, size=n) @@ -569,7 +569,7 @@ def test_parameters_univariate(self): class TestUnivariateSingleMetricNegativeVariance(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { diff --git a/tox.ini b/tox.ini index fc4e692..064fa09 100644 --- a/tox.ini +++ b/tox.ini @@ -1,37 +1,50 @@ [tox] -envlist = python3.9, python3.10, python3.11 -skipsdist = True -usedevelop = True - -[travis] -python = - 3.9: python3.9 - 3.10: python3.10 - 3.11: python3.11 +envlist = py39, py310, py311, py312, py39-min +isolated_build = True +requires = tox-uv [gh-actions] python = - 3.9: python3.9 - 3.10: python3.10 - 3.11: python3.11 + 3.9: py39-min + 3.10: py310 + 3.11: py311 + 3.12: py312 [testenv] -setenv = - PYTHONPATH = {toxinidir} +extras = dev +commands = + black --check --diff spotify_confidence tests + flake8 spotify_confidence tests + pytest -n auto --no-cov --basetemp={envtmpdir} {posargs} + +[testenv:py312] +extras = dev +commands = + black --check --diff spotify_confidence tests + flake8 spotify_confidence tests + pytest -n auto --basetemp={envtmpdir} {posargs} + +[testenv:py39-min] +basepython = python3.9 deps = - -r{toxinidir}/requirements_dev.txt + numpy==1.21.0 + scipy==1.9.0 + pandas==1.4.0 + statsmodels==0.13.5 + chartify==5.0.0 + ipywidgets==8.0.0 + black==23.7.0 + flake8==6.0.0 + pytest==7.0.0 + pytest-cov==4.0.0 + pytest-xdist==3.0.2 + coverage==7.0.0 commands = - flake8 {posargs} - coverage erase - py.test {posargs} + black --check --diff spotify_confidence tests + flake8 spotify_confidence tests + pytest -n auto --no-cov --basetemp={envtmpdir} {posargs} [flake8] -show-source = true max-line-length = 120 +ignore = E203,E231,W503 exclude = 
.venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev -ignore = E203, W503 - -[pytest] -addopts = -v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing -testpaths = tests -