From de2f74c2cf4dab6b19a5e958641af090935a85cb Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 09:52:46 +0100 Subject: [PATCH 01/15] Relax numpy versioning constraint --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4186a42..14a04da 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,7 @@ package_dir = packages = find: python_requires = >=3.9 install_requires = - numpy>=1.20.0,<2.0.0 + numpy>=1.20.0,<3.0.0 scipy>=1.6.0 pandas>=1.2.0 statsmodels>=0.13.0,<1.0.0 From 1ff55041c718176d9971a3e238db32478ab133c5 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 10:13:30 +0100 Subject: [PATCH 02/15] Modernize build infrastructure --- .flake8 | 3 -- .github/workflows/confidence.yml | 10 ++-- .github/workflows/python-publish.yml | 6 +-- .gitignore | 6 ++- MANIFEST.in | 10 ---- Makefile | 8 ++-- pyproject.toml | 70 ++++++++++++++++++++++++++-- requirements_dev.txt | 20 -------- setup.cfg | 31 ------------ setup.py | 3 -- tox.ini | 38 ++++----------- 11 files changed, 92 insertions(+), 113 deletions(-) delete mode 100644 .flake8 delete mode 100644 MANIFEST.in delete mode 100644 requirements_dev.txt delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/.flake8 b/.flake8 deleted file mode 100644 index adf399e..0000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 120 -ignore = E203,E231,W503 diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml index d657dc6..b2682e3 100644 --- a/.github/workflows/confidence.yml +++ b/.github/workflows/confidence.yml @@ -13,18 +13,18 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ 
matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - if [ -f requirements_dev.txt ]; then pip install -r requirements_dev.txt; fi - python -m pip install tox tox-gh-actions + pip install -e ".[dev]" + pip install tox tox-gh-actions - name: Test with tox run: tox diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4f2e205..fb9d87f 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -18,11 +18,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.gitignore b/.gitignore index 9cd5fd6..edcb3af 100644 --- a/.gitignore +++ b/.gitignore @@ -90,4 +90,8 @@ ENV/ .DS_store -.idea/ \ No newline at end of file +.idea/ + +# uv +uv.lock +.venv/ \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2353e85..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,10 +0,0 @@ -include README.md -include CONTRIBUTING.md -include AUTHORS.md - -recursive-include tests * -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] - -recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.py -recursive-include confidence/ *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.py \ No newline at end of file diff --git a/Makefile b/Makefile index b8f0ff3..fd000df 100644 --- a/Makefile +++ b/Makefile @@ -48,13 +48,13 @@ clean-test: ## remove test and coverage artifacts rm -fr htmlcov/ lint: ## check style with flake8 - flake8 confidence tests + flake8 spotify_confidence tests test: ## run tests quickly with the default Python python3 -m pytest coverage: ## check code coverage quickly with the default Python - coverage run --source 
confidence -m pytest + coverage run --source spotify_confidence -m pytest coverage report -m coverage html $(BROWSER) htmlcov/index.html @@ -86,10 +86,10 @@ install: clean ## install the package to the active Python's site-packages pip install -e . install-test: clean - pip3 install --index-url https://test.pypi.org/simple/ confidence-spotify + pip3 install --index-url https://test.pypi.org/simple/ spotify-confidence install-prod: clean - pip3 install confidence-spotify + pip3 install spotify-confidence black: black spotify_confidence tests --line-length 119 diff --git a/pyproject.toml b/pyproject.toml index f6c1689..59873e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,68 @@ [build-system] -requires = [ - "setuptools>=42", - "wheel", -] +requires = ["setuptools>=61.2"] build-backend = "setuptools.build_meta" + +[project] +name = "spotify-confidence" +version = "4.0.0" +description = "Package for calculating and visualising confidence intervals, e.g. for A/B test analysis." 
+readme = "README.md" +license = {file = "LICENSE"} +authors = [{name = "Per Sillren", email = "pers@spotify.com"}] +requires-python = ">=3.9" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dependencies = [ + "numpy>=1.20.0,<3.0.0", + "scipy>=1.6.0", + "pandas>=1.2.0", + "statsmodels>=0.13.0,<1.0.0", + "chartify>=5.0.1", + "ipywidgets>=8.0.0", +] + +[project.optional-dependencies] +dev = [ + "build", + "twine", + "flake8>=4.0.1", + "tox>=4.4.7", + "pytest>=7.0.1", + "pytest-cov>=2.5.1", + "pytest-runner>=6.0.0", + "coverage>=4.5.1", + "black>=23.1.0", + "pylint>=1.7.4", + "jupyterlab>=3.2.9", +] + +[project.urls] +Homepage = "https://github.com/spotify/confidence" +"Bug Tracker" = "https://github.com/spotify/confidence/issues" + +[tool.setuptools.packages.find] +where = ["."] +include = ["spotify_confidence*"] +namespaces = false + +[tool.black] +line-length = 119 +target-version = ["py39", "py310", "py311"] + +[tool.flake8] +max-line-length = 120 +ignore = ["E203", "E231", "W503"] +exclude = [".venv", ".tox", ".git", "dist", "docs", "*.egg", "build", "scratch.py", "confidence_dev"] + +[tool.pytest.ini_options] +addopts = "-v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing" +testpaths = ["tests"] + +[tool.coverage.run] +source = ["spotify_confidence"] + +[tool.coverage.report] +show_missing = true diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index cf02b2e..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,20 +0,0 @@ --e . 
-pip==23.0.1 -build -twine -bumpversion==0.5.3 -watchdog==0.8.3 -flake8==4.0.1 -tox==4.4.7 -Sphinx==1.4.8 -pytest-runner==6.0.0 -jupyterlab==3.2.9 -pylint==1.7.4 -coverage==4.5.1 -pytest==7.0.1 -pytest-cov==2.5.1 -ipywidgets>=7.1.0 -black==23.1.0 -ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability -setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability -tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 14a04da..0000000 --- a/setup.cfg +++ /dev/null @@ -1,31 +0,0 @@ -[metadata] -name = spotify-confidence -version = 4.0.0 -author = Per Sillren -author_email = pers@spotify.com -description = Package for calculating and visualising confidence intervals, e.g. for A/B test analysis. -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/spotify/confidence -project_urls = - Bug Tracker = https://github.com/spotify/confidence/issues -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: Apache Software License - Operating System :: OS Independent - -[options] -package_dir = - = . -packages = find: -python_requires = >=3.9 -install_requires = - numpy>=1.20.0,<3.0.0 - scipy>=1.6.0 - pandas>=1.2.0 - statsmodels>=0.13.0,<1.0.0 - chartify>=5.0.1 - ipywidgets>=8.0.0 - -[options.packages.find] -where = . 
diff --git a/setup.py b/setup.py deleted file mode 100644 index b908cbe..0000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -import setuptools - -setuptools.setup() diff --git a/tox.ini b/tox.ini index fc4e692..38d0ebb 100644 --- a/tox.ini +++ b/tox.ini @@ -1,37 +1,17 @@ [tox] -envlist = python3.9, python3.10, python3.11 -skipsdist = True -usedevelop = True - -[travis] -python = - 3.9: python3.9 - 3.10: python3.10 - 3.11: python3.11 +envlist = py39, py310, py311, py312 +isolated_build = True [gh-actions] python = - 3.9: python3.9 - 3.10: python3.10 - 3.11: python3.11 + 3.9: py39 + 3.10: py310 + 3.11: py311 + 3.12: py312 [testenv] -setenv = - PYTHONPATH = {toxinidir} -deps = - -r{toxinidir}/requirements_dev.txt +extras = dev commands = - flake8 {posargs} - coverage erase - py.test {posargs} - -[flake8] -show-source = true -max-line-length = 120 -exclude = .venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev -ignore = E203, W503 - -[pytest] -addopts = -v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing -testpaths = tests + flake8 spotify_confidence tests + pytest {posargs} From 1966f7af38609393f759680a7d84b7e4a8a08cc5 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 10:59:40 +0100 Subject: [PATCH 03/15] Fix flake8/tests --- CONTRIBUTING.rst | 69 +++++++++++++------ Makefile | 5 +- pyproject.toml | 9 +-- .../analysis/confidence_utils.py | 2 +- .../confidence_computer.py | 2 +- .../samplesize/sample_size_calculator.py | 36 +++++----- tests/bayesian/test_betabinomial.py | 6 +- tests/frequentist/test_chisquared.py | 6 +- tests/frequentist/test_experiment.py | 2 +- tests/frequentist/test_ttest.py | 10 +-- tests/frequentist/test_ztest.py | 42 +++++------ tests/frequentist/test_ztest_linreg.py | 16 ++--- tox.ini | 5 ++ 13 files changed, 119 insertions(+), 91 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index b175338..a46b8b7 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -57,41 
+57,55 @@ Get Started! Ready to contribute? Here's how to set up `confidence` for local development. +**Prerequisites:** + +* `uv `_ - Fast Python package installer (recommended) +* Python 3.9 or later + 1. Fork the `confidence` repo on GitHub. 2. Clone your fork locally:: - $ git clone https://github.com/spotify/confidence + $ git clone git@github.com:your_username/confidence.git + $ cd confidence + +3. Set up your development environment using uv:: + + $ uv venv + $ uv pip install -e ".[dev]" -3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: + This creates a virtual environment and installs the package in editable mode with all development dependencies. - $ mkvirtualenv confidence_dev - $ cd confidence/ - $ tox +4. Verify your setup by running the tests:: - The tox command will install the dev requirements in requirements_dev.txt and run all tests. + $ uv run pytest -4. Create a branch for local development:: + This should run all tests and show they pass. + +5. Create a branch for local development:: $ git checkout -b name-of-your-bugfix-or-feature Now you can make your changes locally. -5. When you're done making changes, format using `make black`, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: +6. When you're done making changes, check that your changes pass all quality checks:: + + $ uv run black spotify_confidence tests --line-length 119 # Format code + $ uv run flake8 spotify_confidence tests # Lint code + $ uv run pytest # Run tests + + To test across all supported Python versions (3.9, 3.10, 3.11, 3.12):: - $ make black - $ flake8 confidence tests - $ python setup.py test or py.test - $ tox + $ uv run tox - To get flake8 and tox, just pip install them into your virtualenv. + Note: tox requires all Python versions to be installed on your system. -6. 
Commit your changes and push your branch to GitHub:: +7. Commit your changes and push your branch to GitHub:: $ git add . $ git commit -m "Your detailed description of your changes." $ git push origin name-of-your-bugfix-or-feature -7. Submit a pull request through the GitHub website. +8. Submit a pull request through the GitHub website. Pull Request Guidelines ----------------------- @@ -101,23 +115,36 @@ Before you submit a pull request, check that it meets these guidelines: 1. The pull request should include tests. 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the - feature to the list in README.rst. -3. The pull request should work for Python 3.6 and 3.7. Check - and make sure that the tests pass for all supported Python versions. + feature to the list in README.md. +3. The pull request should work for Python 3.9, 3.10, 3.11, and 3.12. The CI + pipeline will automatically test all supported Python versions. Tips ---- To run a subset of tests:: -$ py.test tests.test_confidence + $ uv run pytest tests/frequentist/test_ttest.py + +To run a specific test:: + + $ uv run pytest tests/frequentist/test_ttest.py::TestCategorical::test_summary + +To run tests with verbose output:: + + $ uv run pytest -v + +To see test coverage:: + + $ uv run pytest --cov=spotify_confidence --cov-report=html + $ open htmlcov/index.html Release Process ----------------------- While commits and pull requests are welcome from any contributor, we try to -simplify the distribution process for everyone by managing the release +simplify the distribution process for everyone by managing the release process with specific contributors serving in the role of Release Managers. Release Managers are responsible for: @@ -142,7 +169,7 @@ PATCH version when you make backwards-compatible bug fixes. 
Release Stategy ~~~~~~~~~~~~~~~~ -Each new release will be made on its own branch, with the branch Master +Each new release will be made on its own branch, with the branch Master representing the most recent, furthest release. Releases are published to PyPi automatically once a new release branch is merged to Master. Additionally, rew releases are also tracked manually on `github diff --git a/Makefile b/Makefile index fd000df..8c0ff85 100644 --- a/Makefile +++ b/Makefile @@ -47,6 +47,9 @@ clean-test: ## remove test and coverage artifacts rm -f .coverage rm -fr htmlcov/ +format: ## format code with black + black spotify_confidence tests --line-length 119 + lint: ## check style with flake8 flake8 spotify_confidence tests @@ -91,5 +94,3 @@ install-test: clean install-prod: clean pip3 install spotify-confidence -black: - black spotify_confidence tests --line-length 119 diff --git a/pyproject.toml b/pyproject.toml index 59873e5..8d0a969 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,13 +28,13 @@ dependencies = [ dev = [ "build", "twine", + "black>=23.1.0", "flake8>=4.0.1", "tox>=4.4.7", "pytest>=7.0.1", "pytest-cov>=2.5.1", "pytest-runner>=6.0.0", "coverage>=4.5.1", - "black>=23.1.0", "pylint>=1.7.4", "jupyterlab>=3.2.9", ] @@ -50,12 +50,7 @@ namespaces = false [tool.black] line-length = 119 -target-version = ["py39", "py310", "py311"] - -[tool.flake8] -max-line-length = 120 -ignore = ["E203", "E231", "W503"] -exclude = [".venv", ".tox", ".git", "dist", "docs", "*.egg", "build", "scratch.py", "confidence_dev"] +target-version = ["py39", "py310", "py311", "py312"] [tool.pytest.ini_options] addopts = "-v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing" diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py index 44db803..29f241a 100644 --- a/spotify_confidence/analysis/confidence_utils.py +++ b/spotify_confidence/analysis/confidence_utils.py @@ -250,5 +250,5 @@ def 
dfmatmul(x, y, outer=True): def de_list_if_length_one(x): """Return first element of x if x is a list of length one""" - is_iterable = type(x) != str and isinstance(x, Iterable) + is_iterable = not isinstance(x, str) and isinstance(x, Iterable) return x[0] if is_iterable and len(x) == 1 else x diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py index 4c47c22..4ea6105 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py @@ -351,7 +351,7 @@ def compute_differences( level_columns = get_remaning_groups(self._all_group_columns, groupby) difference_df = self._compute_differences( level_columns=level_columns, - levels=[levels] if type(levels) == tuple else levels, + levels=[levels] if isinstance(levels, tuple) else levels, absolute=absolute, groupby=groupby, level_as_reference=True, diff --git a/spotify_confidence/samplesize/sample_size_calculator.py b/spotify_confidence/samplesize/sample_size_calculator.py index 10b77d5..4c22506 100644 --- a/spotify_confidence/samplesize/sample_size_calculator.py +++ b/spotify_confidence/samplesize/sample_size_calculator.py @@ -422,18 +422,18 @@ def show_samplesize( ) code_html = widgets.HTML( "
"
-                    f"SampleSize.continuous(average_absolute_mde={ mde },\n"
-                    f"                      baseline_variance={ baseline },\n"
-                    f"                      alpha={ alpha },\n"
-                    f"                      power={ power },\n"
-                    f"                      treatments={ treatments },\n"
+                    f"SampleSize.continuous(average_absolute_mde={mde},\n"
+                    f"                      baseline_variance={baseline},\n"
+                    f"                      alpha={alpha},\n"
+                    f"                      power={power},\n"
+                    f"                      treatments={treatments},\n"
                     f"                      comparisons="
-                    f"'{ comparisons }',\n"
+                    f"'{comparisons}',\n"
                     f"                      treatment_costs="
-                    f"{ list(treatment_costs) },\n"
+                    f"{list(treatment_costs)},\n"
                     f"                      treatment_allocations=None,\n"
                     f"                      bonferroni_correction="
-                    f"{ bonferroni_correction })"
+                    f"{bonferroni_correction})"
                     "
" ) else: @@ -461,19 +461,19 @@ def show_samplesize( ) code_html = widgets.HTML( "
"
-                    f"SampleSize.binomial(absolute_percentage_mde={ mde },\n"
+                    f"SampleSize.binomial(absolute_percentage_mde={mde},\n"
                     f"                    baseline_proportion="
-                    f"{ baseline },\n"
-                    f"                    alpha={ alpha },\n"
-                    f"                    power={ power },\n"
-                    f"                    treatments={ treatments },\n"
+                    f"{baseline},\n"
+                    f"                    alpha={alpha},\n"
+                    f"                    power={power},\n"
+                    f"                    treatments={treatments},\n"
                     f"                    comparisons="
-                    f"'{ comparisons }',\n"
+                    f"'{comparisons}',\n"
                     f"                    treatment_costs="
-                    f"{ list(treatment_costs) },\n"
+                    f"{list(treatment_costs)},\n"
                     f"                    treatment_allocations=None,\n"
                     f"                    bonferroni_correction="
-                    f"{ bonferroni_correction })"
+                    f"{bonferroni_correction})"
                     "
" ) @@ -482,7 +482,7 @@ def compare_against_optimal(current, optimal): return "" else: return ( - f"
{current/optimal:.1f}x " + f"
{current / optimal:.1f}x " f"optimal group allocation of {optimal:,}." f"" ) @@ -501,7 +501,7 @@ def compare_against_optimal(current, optimal): else: treatment = "Variant " + str(i) - cell_str += f"
{treatment}: " f"{n_cell[i]:,} ({prop_cell[i]*100:.1f}%)" + cell_str += f"
{treatment}: " f"{n_cell[i]:,} ({prop_cell[i] * 100:.1f}%)" display(widgets.HTML(cell_str)) display(code_html) diff --git a/tests/bayesian/test_betabinomial.py b/tests/bayesian/test_betabinomial.py index 1779623..3a6bf74 100644 --- a/tests/bayesian/test_betabinomial.py +++ b/tests/bayesian/test_betabinomial.py @@ -9,7 +9,7 @@ class TestCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": ["test", "control", "test2", "test", "control", "test2"], @@ -148,7 +148,7 @@ def test_multiple_difference_level_as_reference(self): class TestOrdinal: - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -307,7 +307,7 @@ def test_multiple_difference_plot(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ diff --git a/tests/frequentist/test_chisquared.py b/tests/frequentist/test_chisquared.py index 97e67e8..7b68aed 100644 --- a/tests/frequentist/test_chisquared.py +++ b/tests/frequentist/test_chisquared.py @@ -24,7 +24,7 @@ def chart_data(chart_object, series_name): class TestCategorical(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -335,7 +335,7 @@ def test_achieved_power_groupby(self): class TestOrdinal(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -514,7 +514,7 @@ def test_sample_ratio_test(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ diff --git a/tests/frequentist/test_experiment.py b/tests/frequentist/test_experiment.py index ac4775c..34946a4 100644 --- a/tests/frequentist/test_experiment.py +++ b/tests/frequentist/test_experiment.py @@ -7,7 +7,7 @@ class TestBootstrap(object): - def setup(self): + def setup_method(self): np.random.seed(123) n_bootstraps = int(5e5) 
self.data = pd.DataFrame( diff --git a/tests/frequentist/test_ttest.py b/tests/frequentist/test_ttest.py index f310606..61735c6 100644 --- a/tests/frequentist/test_ttest.py +++ b/tests/frequentist/test_ttest.py @@ -36,7 +36,7 @@ def chart_data(chart_object, series_name): class TestCategorical(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -244,7 +244,7 @@ def test_achieved_power(self): class TestOrdinal(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -403,7 +403,7 @@ def test_achieved_power(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -805,7 +805,7 @@ def test_differece_plot_with_nims(self): class TestCategoricalBinomialData(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -877,7 +877,7 @@ def test_multiple_difference(self): class TestWithNims(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { diff --git a/tests/frequentist/test_ztest.py b/tests/frequentist/test_ztest.py index b03b4f1..957029b 100644 --- a/tests/frequentist/test_ztest.py +++ b/tests/frequentist/test_ztest.py @@ -26,7 +26,7 @@ class TestPoweredEffectContinuousSingleMetric(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -87,7 +87,7 @@ def test_powered_effect2(self): class TestPoweredEffectContinuousMultipleSuccessMetrics(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": ["test", "control", "test2", "test", "control", "test2"], @@ -149,7 +149,7 @@ def test_powered_effect1(self): class TestPoweredEffectContinuousMultipleMetricTypes(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": ["test", "control", "test2", "test", "control", "test2"], @@ -212,7 
+212,7 @@ def test_powered_effect(self): class TestPoweredEffectContinuousMultipleMetricsSegments(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -342,7 +342,7 @@ def test_powered_effect(self): class TestPoweredEffectContinuousMultipleMetricsSegments2(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -472,7 +472,7 @@ def test_powered_effect(self): class TestPoweredEffectContinuousMultipleMetricsSegments3(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -605,7 +605,7 @@ def test_powered_effect(self): class TestPoweredEffectBinary(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -750,7 +750,7 @@ def test_powered_effect(self): class TestPoweredEffectBinaryOnlyGuardrail(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -816,7 +816,7 @@ def test_powered_effect(self): class TestBinary(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -922,7 +922,7 @@ def test_multiple_difference_plot(self): class TestCategoricalBinary(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -1038,7 +1038,7 @@ def test_multiple_difference_plot_groupby(self): class TestCategoricalContinuous(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -1126,7 +1126,7 @@ def test_multiple_difference_plot_groupby(self): class TestOrdinal(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -1230,7 +1230,7 @@ def test_multiple_difference_plot_groupby(self): class TestOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "variation_name": [ @@ -1892,7 +1892,7 @@ def 
test_differece_plot_with_nims_in_df(self): class TestCategoricalBinomialData(object): - def setup(self): + def setup_method(self): np.random.seed(123) self.data = pd.DataFrame( @@ -2004,7 +2004,7 @@ def test_multiple_difference(self): class TestWithNims(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { @@ -2109,7 +2109,7 @@ def test_one_sided_ztest_negative(self): class TestSequentialOrdinalPlusTwoCategorical(object): - def setup(self): + def setup_method(self): np.random.seed(123) d = 50 + 1 * np.random.randn(60) u = np.floor(2000 + np.linspace(0, 1000, 60) + 10 * np.random.randn(60)) @@ -2636,7 +2636,7 @@ def test_multiple_difference_groupby_mixed_nims(self): class TestSequentialOrdinalPlusTwoCategorical2(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { @@ -3406,7 +3406,7 @@ def test_multiple_difference_groupby(self): class TestSequentialOneSided(object): - def setup(self): + def setup_method(self): DATE = "date" COUNT = "count" SUM = "sum" @@ -3448,7 +3448,7 @@ def test_multiple_difference_groupby(self): class TestSequentialTwoSided(object): - def setup(self): + def setup_method(self): DATE = "date" COUNT = "count" SUM = "sum" @@ -3489,7 +3489,7 @@ def test_multiple_difference_groupby(self): class TestSequentialOneSidedThreeGroups(object): - def setup(self): + def setup_method(self): DATE = "date" COUNT = "count" SUM = "sum" @@ -3534,7 +3534,7 @@ def test_multiple_difference_groupby(self): class TestNimsWithNaN(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( { "count": { diff --git a/tests/frequentist/test_ztest_linreg.py b/tests/frequentist/test_ztest_linreg.py index eecd4ad..cb81a14 100644 --- a/tests/frequentist/test_ztest_linreg.py +++ b/tests/frequentist/test_ztest_linreg.py @@ -7,7 +7,7 @@ class TestUnivariateSingleMetric(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 d = np.random.randint(2, size=n) @@ -74,7 +74,7 @@ 
def linreg(X, y): class TestUnivariateMultiMetric(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 20000 d = np.random.randint(2, size=n) @@ -164,7 +164,7 @@ def linreg(X, y): class TestUnivariateNoFeatures(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { @@ -214,7 +214,7 @@ def test_summary(self): class TestMultivariateSingleMetric(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 @@ -316,7 +316,7 @@ def linreg(X, y): class TestMultivariateMultipleMetrics(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 @@ -441,7 +441,7 @@ def linreg(X, y): class TestUnivariateMultiMetricRequiredSampleSize(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 2000000 d = np.random.randint(2, size=n) @@ -512,7 +512,7 @@ def test_parameters_univariate_required_sample_size(self): class TestUnivariateSingleMetricWithBadPreExposureData(object): - def setup(self): + def setup_method(self): np.random.seed(123) n = 10000 d = np.random.randint(2, size=n) @@ -569,7 +569,7 @@ def test_parameters_univariate(self): class TestUnivariateSingleMetricNegativeVariance(object): - def setup(self): + def setup_method(self): self.data = pd.DataFrame( [ { diff --git a/tox.ini b/tox.ini index 38d0ebb..ca49fa1 100644 --- a/tox.ini +++ b/tox.ini @@ -15,3 +15,8 @@ commands = flake8 spotify_confidence tests pytest {posargs} +[flake8] +max-line-length = 120 +ignore = E203,E231,W503 +exclude = .venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev + From 6f9cc1c4596299f9f5505be8d120491f6182227e Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 11:27:34 +0100 Subject: [PATCH 04/15] Pick more sensible dependencies --- pyproject.toml | 23 ++++++++++------------- tox.ini | 20 +++++++++++++++++++- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d0a969..27edacb 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -16,11 +16,11 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "numpy>=1.20.0,<3.0.0", - "scipy>=1.6.0", - "pandas>=1.2.0", - "statsmodels>=0.13.0,<1.0.0", - "chartify>=5.0.1", + "numpy>=1.21.0", + "scipy>=1.9.0", + "pandas>=1.4.0", + "statsmodels>=0.13.5", + "chartify>=5.0.0", "ipywidgets>=8.0.0", ] @@ -29,14 +29,11 @@ dev = [ "build", "twine", "black>=23.1.0", - "flake8>=4.0.1", - "tox>=4.4.7", - "pytest>=7.0.1", - "pytest-cov>=2.5.1", - "pytest-runner>=6.0.0", - "coverage>=4.5.1", - "pylint>=1.7.4", - "jupyterlab>=3.2.9", + "flake8>=6.0.0", + "tox>=4.0.0", + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "coverage>=7.0.0", ] [project.urls] diff --git a/tox.ini b/tox.ini index ca49fa1..77a9b79 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py39, py310, py311, py312 +envlist = py39, py310, py311, py312, py39-min isolated_build = True [gh-actions] @@ -15,6 +15,24 @@ commands = flake8 spotify_confidence tests pytest {posargs} +[testenv:py39-min] +basepython = python3.9 +deps = + numpy==1.21.0 + scipy==1.9.0 + pandas==1.4.0 + statsmodels==0.13.5 + chartify==5.0.0 + ipywidgets==8.0.0 + black==23.1.0 + flake8==6.0.0 + pytest==7.0.0 + pytest-cov==4.0.0 + coverage==7.0.0 +commands = + flake8 spotify_confidence tests + pytest {posargs} + [flake8] max-line-length = 120 ignore = E203,E231,W503 From 951e1bed7ee084364dbc25c773eb69e9d70498f4 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 11:29:13 +0100 Subject: [PATCH 05/15] Add black check to tox --- tox.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tox.ini b/tox.ini index 77a9b79..15d4141 100644 --- a/tox.ini +++ b/tox.ini @@ -12,6 +12,7 @@ python = [testenv] extras = dev commands = + black --check --diff spotify_confidence tests flake8 spotify_confidence tests pytest {posargs} @@ -30,6 +31,7 @@ deps = pytest-cov==4.0.0 coverage==7.0.0 commands = + black --check --diff spotify_confidence tests flake8 
spotify_confidence tests pytest {posargs} From ca2d9de15f981d7655a5233f52b216829779339a Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 11:37:59 +0100 Subject: [PATCH 06/15] Fix version for pytest --- spotify_confidence/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spotify_confidence/__init__.py b/spotify_confidence/__init__.py index 6369a1f..d8e9f24 100644 --- a/spotify_confidence/__init__.py +++ b/spotify_confidence/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pkg_resources import require as _require +from importlib.metadata import version as _version from .analysis.bayesian.bayesian_models import BetaBinomial from spotify_confidence.analysis.frequentist.chi_squared import ChiSquared from spotify_confidence.analysis.frequentist.t_test import StudentsTTest @@ -25,7 +25,7 @@ from . import examples from .options import options -__version__ = _require("spotify_confidence")[0].version +__version__ = _version("spotify_confidence") __all__ = [ "BetaBinomial", From 8ea27a18451d24f34b5141daeb39d005af976825 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 13:55:01 +0100 Subject: [PATCH 07/15] add CLAUDE.md --- CLAUDE.md | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..fae842d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,157 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Spotify Confidence is a Python library for A/B test analysis. It provides convenience wrappers around statsmodels' functions for computing p-values and confidence intervals.
The library supports both frequentist (Z-test, Student's T-test, Chi-squared) and Bayesian (BetaBinomial) statistical methods, with features for variance reduction, sequential testing, and sample size calculations. + +## Development Commands + +### Setup +```bash +# Install with development dependencies (including tox-uv) +uv pip install -e ".[dev]" +``` + +### Testing +```bash +# Run all tests with coverage +uv run pytest + +# Run tests without coverage reports +uv run pytest --no-cov + +# Run specific test file +uv run pytest tests/frequentist/test_z_test.py + +# Run specific test +uv run pytest tests/frequentist/test_z_test.py::test_name + +# Run all tests across Python versions +uv run tox +``` + +### Code Quality +```bash +# Format code with black (line length: 119) +uv run black spotify_confidence tests + +# Check formatting without making changes +uv run black --check --diff spotify_confidence tests + +# Lint with flake8 (max line length: 120) +uv run flake8 spotify_confidence tests + +# Run all quality checks (as done in CI) +uv run black --check --diff spotify_confidence tests && uv run flake8 spotify_confidence tests && uv run pytest +``` + +### Build +```bash +# Build distribution packages +uv run python -m build +``` + +## Architecture + +### Core Design Pattern + +The library follows an object-oriented design with separation of concerns: + +1. **Statistical Test Classes**: High-level APIs (`ZTest`, `StudentsTTest`, `ChiSquared`, `BetaBinomial`, `ZTestLinreg`) +2. **Experiment Class**: Base class containing shared analysis methods for frequentist tests +3. **Computer Classes**: Perform the actual statistical computations +4. 
**Grapher Classes**: Generate visualizations using Chartify + +All main test classes inherit from abstract base classes in `spotify_confidence/analysis/abstract_base_classes/`: +- `ConfidenceABC`: Base for all statistical test classes +- `ConfidenceComputerABC`: Base for computation logic +- `ConfidenceGrapherABC`: Base for visualization logic + +### Module Structure + +``` +spotify_confidence/ +├── analysis/ +│ ├── abstract_base_classes/ # ABC definitions for the framework +│ ├── frequentist/ # Frequentist statistical methods +│ │ ├── confidence_computers/ # Statistical computation logic +│ │ ├── experiment.py # Base class for frequentist tests +│ │ ├── z_test.py # Z-test implementation +│ │ ├── t_test.py # Student's T-test implementation +│ │ ├── chi_squared.py # Chi-squared test +│ │ ├── z_test_linreg.py # Z-test with linear regression variance reduction +│ │ ├── sequential_bound_solver.py # Group sequential testing +│ │ ├── multiple_comparison.py # Multiple testing correction +│ │ └── sample_size_calculator.py +│ ├── bayesian/ # Bayesian methods +│ │ └── bayesian_models.py # BetaBinomial implementation +│ ├── constants.py # Shared constants +│ └── confidence_utils.py # Shared utility functions +├── samplesize/ # Sample size calculations +├── examples.py # Example data generators +├── chartgrid.py # Chart grid utilities +└── options.py # Global configuration +``` + +### Key Classes and Their Relationships + +- **Experiment** (in `frequentist/experiment.py`): The core base class for frequentist tests. 
Provides methods like: + - `summary()`: Overall metric summaries + - `difference()`: Pairwise comparisons + - `multiple_difference()`: Multiple comparisons with correction + - `difference_plot()`, `summary_plot()`, etc.: Visualization methods + - `sample_size()`: Required sample size calculations + - `statistical_power()`: Power analysis + +- **ZTest, StudentsTTest, ChiSquared**: Thin wrappers that initialize `Experiment` with the appropriate computer and method + +- **Computer Classes** (in `frequentist/confidence_computers/`): Handle the statistical calculations + - `ZTestComputer`, `TTestComputer`, `ChiSquaredComputer`: Specific computation implementations + - All inherit from `ConfidenceComputerABC` + +- **ChartifyGrapher**: Implements visualization using the Chartify library + +### Data Model + +The library works with DataFrames containing sufficient statistics: +- `numerator_column`: Sum or count (e.g., sum of conversions) +- `denominator_column`: Total observations (e.g., total users) +- `numerator_sum_squares_column`: Sum of squares (optional, for variance calculations) +- `categorical_group_columns`: Treatment/control groups and other dimensions +- `ordinal_group_column`: Time-based grouping for sequential analysis + +### Important Conventions + +1. **Method Column**: Tests add a `METHOD_COLUMN_NAME` to data indicating the test type (e.g., "z-test", "t-test") + +2. **Multiple Comparison Correction**: Supported methods defined in `constants.py`: + - Standard: bonferroni, holm, hommel, sidak, FDR methods + - SPOT-1 variants: Custom Spotify methods for specific use cases + +3. **Non-Inferiority Margins (NIMs)**: Can be specified as absolute values or relative percentages + +4. **Sequential Testing**: The `sequential_bound_solver.py` module implements group sequential designs with spending functions + +5. 
**Variance Reduction**: `ZTestLinreg` uses pre-exposure data to fit a linear model and reduce variance (CUPED method) + +## Testing Guidelines + +- Tests are organized to mirror the source structure under `tests/` +- Use pytest fixtures for common test data +- Tests check both DataFrame outputs and chart generation +- Coverage target is configured in `pyproject.toml` + +## Python Version Support + +Supports Python 3.9, 3.10, 3.11, and 3.12. The `tox.ini` includes a `py39-min` environment that tests with minimum dependency versions. + +The project uses `tox-uv` to leverage uv's fast package installation and environment management in tox, significantly speeding up multi-environment testing. The GitHub Actions CI workflow also uses uv for faster dependency installation. + +## Code Style + +- Black formatting with 119 character line length +- Flake8 linting with max line length 120 +- Ignored flake8 rules: E203, E231, W503 +- Excluded from linting: `.venv`, `.tox`, `.git`, `dist`, `docs`, `*.egg`, `build`, `scratch.py`, `confidence_dev` From d611f9bda8c389268dddb461fe39fd957ecf04b8 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 13:55:05 +0100 Subject: [PATCH 08/15] Use uv for CI as well --- .github/workflows/confidence.yml | 7 ++++--- pyproject.toml | 1 + tox.ini | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml index b2682e3..2f8d32e 100644 --- a/.github/workflows/confidence.yml +++ b/.github/workflows/confidence.yml @@ -21,10 +21,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install uv + uses: astral-sh/setup-uv@v5 - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" - pip install tox tox-gh-actions + uv pip install --system -e ".[dev]" + uv pip install --system tox tox-gh-actions - name: Test with tox run: tox diff --git a/pyproject.toml b/pyproject.toml index 27edacb..8543725
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dev = [ "black>=23.1.0", "flake8>=6.0.0", "tox>=4.0.0", + "tox-uv>=1.0.0", "pytest>=7.0.0", "pytest-cov>=4.0.0", "coverage>=7.0.0", diff --git a/tox.ini b/tox.ini index 15d4141..2b76cad 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,7 @@ [tox] envlist = py39, py310, py311, py312, py39-min isolated_build = True +requires = tox-uv [gh-actions] python = From 08956078a99f49b228af989a5ce8b776c5091d4b Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 14:45:40 +0100 Subject: [PATCH 09/15] Fix pandas deprecation warnings --- .../analysis/bayesian/bayesian_base.py | 4 ++- .../analysis/bayesian/bayesian_models.py | 28 +++++++++++++++---- .../analysis/confidence_utils.py | 4 ++- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/spotify_confidence/analysis/bayesian/bayesian_base.py b/spotify_confidence/analysis/bayesian/bayesian_base.py index 13f666e..9629127 100644 --- a/spotify_confidence/analysis/bayesian/bayesian_base.py +++ b/spotify_confidence/analysis/bayesian/bayesian_base.py @@ -447,7 +447,9 @@ def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_ @staticmethod def _validate_levels(level_df, remaining_groups, level): try: - level_df.groupby(remaining_groups).get_group(level) + # When grouping with a length-1 list, get_group expects a tuple + group_key = (level,) if isinstance(remaining_groups, list) and len(remaining_groups) == 1 else level + level_df.groupby(remaining_groups).get_group(group_key) except (KeyError, ValueError): raise ValueError( """ diff --git a/spotify_confidence/analysis/bayesian/bayesian_models.py b/spotify_confidence/analysis/bayesian/bayesian_models.py index cd3e93a..f58f89b 100644 --- a/spotify_confidence/analysis/bayesian/bayesian_models.py +++ b/spotify_confidence/analysis/bayesian/bayesian_models.py @@ -185,9 +185,17 @@ def _categorical_summary_plot(self, level_name, level_df, remaining_groups, grou 
ch.set_legend_location("outside_bottom") return ch - def _difference_posteriors(self, data, level_1, level_2, absolute=True): - posterior_1 = self._sample_posterior(data.get_group(level_1)) - posterior_2 = self._sample_posterior(data.get_group(level_2)) + def _difference_posteriors(self, data, level_1, level_2, absolute=True, remaining_groups=None): + # When grouping with a length-1 list, get_group expects a tuple + if isinstance(remaining_groups, list) and len(remaining_groups) == 1: + level_1_key = (level_1,) + level_2_key = (level_2,) + else: + level_1_key = level_1 + level_2_key = level_2 + + posterior_1 = self._sample_posterior(data.get_group(level_1_key)) + posterior_2 = self._sample_posterior(data.get_group(level_2_key)) if absolute: difference_posterior = posterior_2 - posterior_1 @@ -256,7 +264,7 @@ def _difference_and_difference_posterior(self, level_df, remaining_groups, level self._validate_levels(level_df, remaining_groups, level_2) # difference is posterior_2 - posterior_1 difference_posterior = self._difference_posteriors( - level_df.groupby(remaining_groups), level_1, level_2, absolute + level_df.groupby(remaining_groups), level_1, level_2, absolute, remaining_groups ) difference_df = self._differences(difference_posterior, level_1, level_2, absolute) return difference_df, difference_posterior @@ -384,7 +392,11 @@ def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups self._validate_levels(level_df, remaining_groups, level) - posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys] + # When grouping with a length-1 list, get_group expects a tuple + if isinstance(remaining_groups, list) and len(remaining_groups) == 1: + posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys] + else: + posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] var_indx = grouped_df_keys.index(level) other_indx = [i for i, value in 
enumerate(grouped_df_keys) if value != level] @@ -627,7 +639,11 @@ def _categorical_multiple_difference_chart( self._validate_levels(level_df, remaining_groups, level) - posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys] + # When grouping with a length-1 list, get_group expects a tuple + if isinstance(remaining_groups, list) and len(remaining_groups) == 1: + posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys] + else: + posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] var_indx = grouped_df_keys.index(level) diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py index 29f241a..73585dc 100644 --- a/spotify_confidence/analysis/confidence_utils.py +++ b/spotify_confidence/analysis/confidence_utils.py @@ -98,7 +98,9 @@ def get_all_categorical_group_columns( def validate_levels(df: DataFrame, level_columns: Union[str, Iterable], levels: Iterable): for level in levels: try: - df.groupby(level_columns).get_group(level) + # When grouping with a length-1 list, get_group expects a tuple + group_key = (level,) if isinstance(level_columns, list) and len(level_columns) == 1 else level + df.groupby(level_columns).get_group(group_key) except (KeyError, ValueError): raise ValueError( """ From 6045a4e042d514236e02bf3597aac3df995ffaad Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 15:40:54 +0100 Subject: [PATCH 10/15] Bump black to fix python 3.9 - but requires revert for warnings --- pyproject.toml | 2 +- .../analysis/bayesian/bayesian_base.py | 4 +--- .../analysis/bayesian/bayesian_models.py | 24 ++++--------------- .../analysis/confidence_utils.py | 4 +--- tox.ini | 3 +-- 5 files changed, 8 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8543725..a4ee754 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ dev = 
[ "build", "twine", - "black>=23.1.0", + "black>=23.7.0", "flake8>=6.0.0", "tox>=4.0.0", "tox-uv>=1.0.0", diff --git a/spotify_confidence/analysis/bayesian/bayesian_base.py b/spotify_confidence/analysis/bayesian/bayesian_base.py index 9629127..13f666e 100644 --- a/spotify_confidence/analysis/bayesian/bayesian_base.py +++ b/spotify_confidence/analysis/bayesian/bayesian_base.py @@ -447,9 +447,7 @@ def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_ @staticmethod def _validate_levels(level_df, remaining_groups, level): try: - # When grouping with a length-1 list, get_group expects a tuple - group_key = (level,) if isinstance(remaining_groups, list) and len(remaining_groups) == 1 else level - level_df.groupby(remaining_groups).get_group(group_key) + level_df.groupby(remaining_groups).get_group(level) except (KeyError, ValueError): raise ValueError( """ diff --git a/spotify_confidence/analysis/bayesian/bayesian_models.py b/spotify_confidence/analysis/bayesian/bayesian_models.py index f58f89b..2755e1d 100644 --- a/spotify_confidence/analysis/bayesian/bayesian_models.py +++ b/spotify_confidence/analysis/bayesian/bayesian_models.py @@ -186,16 +186,8 @@ def _categorical_summary_plot(self, level_name, level_df, remaining_groups, grou return ch def _difference_posteriors(self, data, level_1, level_2, absolute=True, remaining_groups=None): - # When grouping with a length-1 list, get_group expects a tuple - if isinstance(remaining_groups, list) and len(remaining_groups) == 1: - level_1_key = (level_1,) - level_2_key = (level_2,) - else: - level_1_key = level_1 - level_2_key = level_2 - - posterior_1 = self._sample_posterior(data.get_group(level_1_key)) - posterior_2 = self._sample_posterior(data.get_group(level_2_key)) + posterior_1 = self._sample_posterior(data.get_group(level_1)) + posterior_2 = self._sample_posterior(data.get_group(level_2)) if absolute: difference_posterior = posterior_2 - posterior_1 @@ -392,11 +384,7 @@ def 
_multiple_difference_joint_base(self, level_name, level_df, remaining_groups self._validate_levels(level_df, remaining_groups, level) - # When grouping with a length-1 list, get_group expects a tuple - if isinstance(remaining_groups, list) and len(remaining_groups) == 1: - posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys] - else: - posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] + posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] var_indx = grouped_df_keys.index(level) other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level] @@ -639,11 +627,7 @@ def _categorical_multiple_difference_chart( self._validate_levels(level_df, remaining_groups, level) - # When grouping with a length-1 list, get_group expects a tuple - if isinstance(remaining_groups, list) and len(remaining_groups) == 1: - posteriors = [self._sample_posterior(grouped_df.get_group((lvl,))) for lvl in grouped_df_keys] - else: - posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] + posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] var_indx = grouped_df_keys.index(level) diff --git a/spotify_confidence/analysis/confidence_utils.py b/spotify_confidence/analysis/confidence_utils.py index 73585dc..29f241a 100644 --- a/spotify_confidence/analysis/confidence_utils.py +++ b/spotify_confidence/analysis/confidence_utils.py @@ -98,9 +98,7 @@ def get_all_categorical_group_columns( def validate_levels(df: DataFrame, level_columns: Union[str, Iterable], levels: Iterable): for level in levels: try: - # When grouping with a length-1 list, get_group expects a tuple - group_key = (level,) if isinstance(level_columns, list) and len(level_columns) == 1 else level - df.groupby(level_columns).get_group(group_key) + df.groupby(level_columns).get_group(level) except (KeyError, ValueError): raise 
ValueError( """ diff --git a/tox.ini b/tox.ini index 2b76cad..a12faa4 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ deps = statsmodels==0.13.5 chartify==5.0.0 ipywidgets==8.0.0 - black==23.1.0 + black==23.7.0 flake8==6.0.0 pytest==7.0.0 pytest-cov==4.0.0 @@ -40,4 +40,3 @@ commands = max-line-length = 120 ignore = E203,E231,W503 exclude = .venv,.tox,.git,dist,docs,*.egg,build,scratch.py,confidence_dev - From df4df5daa66ca510d9cce0c5f817a9ca7526c37e Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 15:51:11 +0100 Subject: [PATCH 11/15] Improve GH actions --- .github/workflows/confidence.yml | 3 +++ tox.ini | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml index 2f8d32e..c4553ed 100644 --- a/.github/workflows/confidence.yml +++ b/.github/workflows/confidence.yml @@ -23,6 +23,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install uv uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + cache-dependency-glob: "**/pyproject.toml" - name: Install dependencies run: | uv pip install --system -e ".[dev]" diff --git a/tox.ini b/tox.ini index a12faa4..fa686cb 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ requires = tox-uv [gh-actions] python = - 3.9: py39 + 3.9: py39, py39-min 3.10: py310 3.11: py311 3.12: py312 From 570e7f6be4f78beeeb6b53699a9c7b7761603d75 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 16:14:38 +0100 Subject: [PATCH 12/15] Run tox and tests in parallel --- CONTRIBUTING.rst | 2 +- pyproject.toml | 1 + tox.ini | 14 +++++++++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index a46b8b7..86b3bd7 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -95,7 +95,7 @@ Ready to contribute? Here's how to set up `confidence` for local development. 
To test across all supported Python versions (3.9, 3.10, 3.11, 3.12):: - $ uv run tox + $ uv run tox -p auto Note: tox requires all Python versions to be installed on your system. diff --git a/pyproject.toml b/pyproject.toml index a4ee754..68a5837 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dev = [ "tox-uv>=1.0.0", "pytest>=7.0.0", "pytest-cov>=4.0.0", + "pytest-xdist>=3.0.2", "coverage>=7.0.0", ] diff --git a/tox.ini b/tox.ini index fa686cb..064fa09 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ requires = tox-uv [gh-actions] python = - 3.9: py39, py39-min + 3.9: py39-min 3.10: py310 3.11: py311 3.12: py312 @@ -15,7 +15,14 @@ extras = dev commands = black --check --diff spotify_confidence tests flake8 spotify_confidence tests - pytest {posargs} + pytest -n auto --no-cov --basetemp={envtmpdir} {posargs} + +[testenv:py312] +extras = dev +commands = + black --check --diff spotify_confidence tests + flake8 spotify_confidence tests + pytest -n auto --basetemp={envtmpdir} {posargs} [testenv:py39-min] basepython = python3.9 @@ -30,11 +37,12 @@ deps = flake8==6.0.0 pytest==7.0.0 pytest-cov==4.0.0 + pytest-xdist==3.0.2 coverage==7.0.0 commands = black --check --diff spotify_confidence tests flake8 spotify_confidence tests - pytest {posargs} + pytest -n auto --no-cov --basetemp={envtmpdir} {posargs} [flake8] max-line-length = 120 From f6cc8b5b52a0f004269baf688cfcbbc9a4ba64aa Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 16:20:54 +0100 Subject: [PATCH 13/15] revert --- spotify_confidence/analysis/bayesian/bayesian_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spotify_confidence/analysis/bayesian/bayesian_models.py b/spotify_confidence/analysis/bayesian/bayesian_models.py index 2755e1d..cd3e93a 100644 --- a/spotify_confidence/analysis/bayesian/bayesian_models.py +++ b/spotify_confidence/analysis/bayesian/bayesian_models.py @@ -185,7 +185,7 @@ def _categorical_summary_plot(self, 
level_name, level_df, remaining_groups, grou ch.set_legend_location("outside_bottom") return ch - def _difference_posteriors(self, data, level_1, level_2, absolute=True, remaining_groups=None): + def _difference_posteriors(self, data, level_1, level_2, absolute=True): posterior_1 = self._sample_posterior(data.get_group(level_1)) posterior_2 = self._sample_posterior(data.get_group(level_2)) @@ -256,7 +256,7 @@ def _difference_and_difference_posterior(self, level_df, remaining_groups, level self._validate_levels(level_df, remaining_groups, level_2) # difference is posterior_2 - posterior_1 difference_posterior = self._difference_posteriors( - level_df.groupby(remaining_groups), level_1, level_2, absolute, remaining_groups + level_df.groupby(remaining_groups), level_1, level_2, absolute ) difference_df = self._differences(difference_posterior, level_1, level_2, absolute) return difference_df, difference_posterior @@ -384,7 +384,7 @@ def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups self._validate_levels(level_df, remaining_groups, level) - posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] + posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys] var_indx = grouped_df_keys.index(level) other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level] @@ -627,7 +627,7 @@ def _categorical_multiple_difference_chart( self._validate_levels(level_df, remaining_groups, level) - posteriors = [self._sample_posterior(grouped_df.get_group(lvl)) for lvl in grouped_df_keys] + posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys] var_indx = grouped_df_keys.index(level) From 2ef27f327215239bf7097dcc945db5ae73193902 Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 16:40:34 +0100 Subject: [PATCH 14/15] Execute tests in parallel --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/pyproject.toml b/pyproject.toml index 68a5837..57fda33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ line-length = 119 target-version = ["py39", "py310", "py311", "py312"] [tool.pytest.ini_options] -addopts = "-v --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing" +addopts = "-v -n auto --cov=spotify_confidence --cov-report=html --cov-report=xml --cov-report=term-missing" testpaths = ["tests"] [tool.coverage.run] From 3884c88e941c57017c88149ebbd961c0e23567ac Mon Sep 17 00:00:00 2001 From: Benjamin Elbers Date: Tue, 23 Dec 2025 16:55:10 +0100 Subject: [PATCH 15/15] don't skip tests that are actually not that slow --- tests/frequentist/test_bounds.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/frequentist/test_bounds.py b/tests/frequentist/test_bounds.py index 5c2a0f8..c85d7e9 100644 --- a/tests/frequentist/test_bounds.py +++ b/tests/frequentist/test_bounds.py @@ -1,5 +1,4 @@ import pandas as pd -import pytest import time import numpy as np from pandas import Timestamp @@ -10,7 +9,6 @@ ) -@pytest.mark.skip(reason="Skipping because this test is very slow") def test_many_days(): """ This input (based on a real experiment) is very long, which can cause slow calculation @@ -404,7 +402,6 @@ def test_many_days(): assert (time.time() - start_time) < 0.01 -@pytest.mark.skip(reason="Skipping because this test is very slow") def test_many_days_fast_and_no_crash(): """ This is based on experiment 1735 on 26.11.2020. The calculation of the corresponding bounds takes many minutes