From a08b7bb5eef68e0d8d5a60499991e0790bdaaf28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 21:41:58 +0300 Subject: [PATCH 01/33] Basic template of the project --- .github/ISSUE_TEMPLATE/bug_report.md | 32 +++++++++ .github/ISSUE_TEMPLATE/documentation.md | 26 +++++++ .github/ISSUE_TEMPLATE/feature_request.md | 17 +++++ .github/ISSUE_TEMPLATE/question.md | 12 ++++ .github/workflows/lint.yml | 65 +++++++++++++++++ .github/workflows/release.yml | 40 +++++++++++ .github/workflows/tests_and_coverage.yml | 64 +++++++++++++++++ .gitignore | 22 ++++++ microbenchmark/__init__.py | 0 microbenchmark/py.typed | 0 pyproject.toml | 63 +++++++++++++++++ requirements_dev.txt | 13 ++++ spec.md | 86 +++++++++++++++++++++++ tests/__init__.py | 0 tests/cli/__init__.py | 0 tests/documentation/__init__.py | 0 tests/documentation/test_readme.md | 0 tests/typing/__init__.py | 0 tests/units/__init__.py | 0 19 files changed, 440 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/documentation.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/ISSUE_TEMPLATE/question.md create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/tests_and_coverage.yml create mode 100644 .gitignore create mode 100644 microbenchmark/__init__.py create mode 100644 microbenchmark/py.typed create mode 100644 pyproject.toml create mode 100644 requirements_dev.txt create mode 100644 spec.md create mode 100644 tests/__init__.py create mode 100644 tests/cli/__init__.py create mode 100644 tests/documentation/__init__.py create mode 100644 tests/documentation/test_readme.md create mode 100644 tests/typing/__init__.py create mode 100644 tests/units/__init__.py diff --git a/.github/ISSUE_TEMPLATE/bug_report.md 
b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..95e7494 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: pomponchik + +--- + +## Short description + +Replace this text with a short description of the error and the behavior that you expected to see instead. + + +## Describe the bug in detail + +Please add a test that reproduces the bug (i.e., currently fails): + +```python +def test_your_bug(): + ... +``` + +When writing the test, please ensure compatibility with the [`pytest`](https://docs.pytest.org/) framework. + +If for some reason you cannot describe the error in the test format, describe the steps to reproduce it here. + + +## Environment + - OS: ... + - Python version (the output of the `python --version` command): ... + - Version of this package: ... diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md new file mode 100644 index 0000000..5f5fdc0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -0,0 +1,26 @@ +--- +name: Documentation fix +about: Add something to the documentation, delete it, or change it +title: '' +labels: documentation +assignees: pomponchik +--- + +## It's cool that you're here! + +Documentation is an important part of the project; we strive to make it high-quality and keep it up to date. Please adjust this template by outlining your proposal. + + +## Type of action + +What do you want to do: remove something, add something, or change something? + + +## Where? + +Specify which part of the documentation you want to change. For example, the name of an existing documentation section or a line number in `README.md`. + + +## The essence + +Please describe the essence of the proposed change. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..117d79f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,17 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: pomponchik + +--- + +## Short description + +What do you propose and why do you consider it important? + + +## Some details + +If you can, provide code examples that will show how your proposal will work. Also, if you can, indicate which alternative approaches you have considered. And finally, describe how you propose to verify that your idea is implemented correctly, if at all possible. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..6f86494 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,12 @@ +--- +name: Question or consultation +about: Ask anything about this project +title: '' +labels: question +assignees: pomponchik + +--- + +## Your question + +Here you can freely describe your question about the project. Please read the documentation provided before doing this, and ask the question only if it is not answered there. In addition, please keep in mind that this is a free non-commercial project and user support is optional for its author. Response times are not guaranteed. 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..a9e6850 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: Lint + +on: + push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14', '3.14t', '3.15.0-alpha.1'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + shell: bash + run: uv pip install --system -r requirements_dev.txt + + - name: Install the library + shell: bash + run: uv pip install --system . + + - name: Run ruff + shell: bash + run: ruff check microbenchmark + + - name: Run ruff for tests + shell: bash + run: ruff check tests + + - name: Run mypy + shell: bash + run: >- + mypy + --show-error-codes + --strict + --disallow-any-decorated + --disallow-any-explicit + --disallow-any-expr + --disallow-any-generics + --disallow-any-unimported + --disallow-subclassing-any + --warn-return-any + microbenchmark + + - name: Run mypy for tests + shell: bash + run: mypy --exclude '^tests/typing/' tests diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..0db2c72 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,40 @@ +name: Release + +on: + push: + branches: + - main + +jobs: + pypi-publish: + name: upload release to PyPI + runs-on: ubuntu-latest + timeout-minutes: 15 + # Specifying a GitHub environment is optional, but strongly encouraged + environment: release + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + steps: + - uses: 
actions/checkout@v4 + + - name: Set up Python 3.13 + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + shell: bash + run: uv pip install --system -r requirements_dev.txt + + - name: Build the project + shell: bash + run: python -m build . + + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests_and_coverage.yml b/.github/workflows/tests_and_coverage.yml new file mode 100644 index 0000000..099f18d --- /dev/null +++ b/.github/workflows/tests_and_coverage.yml @@ -0,0 +1,64 @@ +name: Tests + +on: + push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + + runs-on: ${{ matrix.os }} + timeout-minutes: 30 + strategy: + matrix: + os: [macos-latest, ubuntu-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14', '3.14t', '3.15.0-alpha.1'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + shell: bash + run: uv pip install --system -r requirements_dev.txt + + - name: Install the library + shell: bash + run: uv pip install --system . 
+ + - name: Print all libs + shell: bash + run: uv pip list --system + + - name: Run tests and show the branch coverage on the command line + shell: bash + run: | + pth_file="$(python -c 'import sysconfig; print(sysconfig.get_path("purelib"))')/microbenchmark_coverage_process_startup.pth" + printf "import os; os.getenv('COVERAGE_PROCESS_START') and __import__('coverage').process_startup()\n" > "$pth_file" + coverage erase + COVERAGE_PROCESS_START="$PWD/pyproject.toml" coverage run -m pytest -n auto --cache-clear --assert=plain + coverage combine + coverage report -m --fail-under=100 --omit='*tests*' + coverage xml --omit='*tests*' + + - name: Upload coverage to Coveralls + if: runner.os == 'Linux' && matrix.python-version == '3.13' + env: + COVERALLS_REPO_TOKEN: ${{secrets.COVERALLS_REPO_TOKEN}} + uses: coverallsapp/github-action@v2 + with: + format: cobertura + file: coverage.xml + flag-name: ubuntu-python-3.13-branch + continue-on-error: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..689d029 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +.DS_Store +__pycache__ +venv +.pytest_cache +build +dist +*.egg-info +test.py +.coverage +.coverage.* +.idea +.ruff_cache +.mutmut-cache +.mypy_cache +html +CLAUDE.md +.claude +mutants +planning_features.md +coverage.xml +.qwen +uv.lock diff --git a/microbenchmark/__init__.py b/microbenchmark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/microbenchmark/py.typed b/microbenchmark/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6c17998 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +requires = ["setuptools==68.0.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "microbenchmark" +version = "0.0.1" +authors = [ + { name="Evgeniy Blinov", email="zheni-b@yandex.ru" }, +] +description = '' +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + 
"Operating System :: OS Independent", + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', + 'Programming Language :: Python :: 3.14', + 'Programming Language :: Python :: 3.15', + 'Programming Language :: Python :: Free Threading', + 'Programming Language :: Python :: Free Threading :: 3 - Stable', + 'License :: OSI Approved :: MIT License', + 'Intended Audience :: Developers', + 'Topic :: Software Development :: Libraries', + 'Typing :: Typed', +] +keywords = [ + 'benchmarks' +] + +[tool.setuptools] +package-data = { microbenchmark = ["py.typed"] } +packages = { find = { include = ["microbenchmark"] } } + +[tool.mutmut] +paths_to_mutate=["microbenchmark"] + +[tool.coverage.run] +branch = true +parallel = true +plugins = ["coverage_pyver_pragma"] +source = ["microbenchmark"] + +[tool.pytest.ini_options] +norecursedirs = ["build", "mutants"] + +[tool.ruff] +lint.ignore = ['E501', 'E712', 'PTH123', 'PTH118', 'PLR2004', 'PTH107', 'SIM105', 'SIM102', 'RET503', 'PLR0912', 'C901', 'E731', 'F821'] +lint.select = ["ERA001", "YTT", "ASYNC", "BLE", "B", "A", "COM", "INP", "PIE", "T20", "PT", "RSE", "RET", "SIM", "SLOT", "TID252", "ARG", "PTH", "I", "C90", "N", "E", "W", "D201", "D202", "D419", "F", "PL", "PLE", "PLR", "PLW", "RUF", "TRY201", "TRY400", "TRY401"] +format.quote-style = "single" + +[project.urls] +'Source' = 'https://github.com/mutating/microbenchmark' +'Tracker' = 'https://github.com/mutating/microbenchmark/issues' diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..47af4f7 --- /dev/null +++ b/requirements_dev.txt @@ -0,0 
+1,13 @@ +pytest==8.3.5 +pytest-xdist==3.6.1; python_version < '3.9' +pytest-xdist==3.8.0; python_version >= '3.9' +coverage==7.6.1 +coverage-pyver-pragma==0.4.0 +build==1.2.2.post1 +mypy==1.14.1 +pytest-mypy-testing==0.1.3 +ruff==0.14.6 +mutmut==3.2.3 +cosmic-ray==8.3.15; python_version < '3.9' +cosmic-ray==8.4.6; python_version >= '3.9' +full_match==0.0.3 diff --git a/spec.md b/spec.md new file mode 100644 index 0000000..8fa1541 --- /dev/null +++ b/spec.md @@ -0,0 +1,86 @@ +# О проекте + +Ты сейчас в пустой обвязке для проекта, посвященном бенчмаркингу. Проект называется "microbenchmark", потому что я планирую оставлять код проекта очень простым и минималистичным. Нам предстоит разработать его сейчас с нуля. + +Основная цель проекта: дать разработчикам инструментарий для быстрого написания бенчмарков и включения их в состав своих библиотек. + +Главные принципы разработки следующие: + +- Мы много думаем прежде, чем что-то делать. +- Дизайн - важен. Мы делаем вещи красивыми и максимально минималистичными. +- Мы проводим обширное ревью каждого сделанного шага. Источник ревью - программа qwen code, которая установлена на этом компьютере. Мы просим у нее ревью постоянно, как можно чаще, по несколько раз подряд. +- Мы поддерживаем крайне высокий уровень тестирования. Покрытие тестами в 100% - сильно ниже минимальной планки, в которую мы целимся. Наша задача: покрыть тестами все мыслимые сценарии использования, а потом еще и все немыслимые, которые мы можем придумать. +- Мы избегаем использования моков в тестировании, стараемся все возможное тестировать "в живую". +- Мы придерживаемся принципов TDD: сначала продумываем тесты, и только потом приступаем к реализации. +- Документация (README) - это источник правды. Тесты - основаны на документации. Основной код - основан на тестах. +- *Каждое* содержательное утверждение в документации должно опираться на соответствующие тесты. + + +## Порядок работы + +Работа должна вестись в таком порядке: + +- Определяем цели. 
+- Определяемся с тем, какой именно код хотим написать. +- Детально продумываем тест-сьют. +- Пишем все тесты. +- Пишем основной код. +- Добиваемся прохождения новых тестов. +- Фиксим issues линтеров и проверяем, не упали ли старые тесты. +- Проверяем покрытие. +- Пишем или дополняем/исправляем документацию. + +После каждого этапа разработки (создание плана, написание основного кода, документации или тестов) нужно запрашивать детальное ревью у qwen code. Программа установлена на данном компьютере, ее можно запустить командой qwen. Qwen нужно промптить, описывая ему текущую подзадачу и наши основные принципы разработки. Нужно просить его быть максимально дотошным. Qwen нужно запустить минимум 5 раз (если доработка сложная - 10+), агрегировать результаты вызовов, "отделить зерна от плевел", и только потом приступать к исправлениям. + + +## Что именно делаем? + +Я хочу создать кодовую обвязку для бенчмаркинга проекта. Весь код мы положим в папку benchmarks, и все тесты нового кода - туда же. + +Важно: мы не создаем CLI-тулинг для бенчмарков. Это скорее набор сценариев / некий базовый код, который уже при желании можно вызвать из CLI-тулзы или из тестов производительности. 
+ +Базовый дизайн состоит из: +- Класса Сценария +- Класса Группы Сценариев +- Класса Результата Бенчмарка + +Подробнее о классе Сценария: + +- Нужно создать базовый класс Сценария +- Сценарий принимает в конструктор функцию, которую он должен вызывать, список аргументов для нее (именно список, а не *args и **kwargs, чтобы мы могли расширять API в будущем), параметр doc (текстовое описание бенчмарка), параметр name (имя конкретного сценария), а также параметр количества раз, которое будет вызван бенчмарк +- У Сценария есть метод run(), который возвращает объект Результата Бенчмарка +- Метод run() должен иметь также опциональный аргумент, отвечающий за прогрев - сколько раз запустить сценарий до того, как время прогонов начнет заменяться и счетчик прогонов начнет считаться - результаты этих прогонов не должны сохраняться в Результате +- Конструктор Сценария должен иметь опциональный аргумент - функцию, которая будет использоваться для генерации таймстемпов, со значением по умолчанию +- У Сценария есть метод cli(), который запускает конкретный сценарий как CLI-программу, принимающую в качестве опционального CLI-аргумента количество итераций бенчмарка и выводящая в консоль все базовые параметры Результата Бенчмарка +- Метод cli() также должен принимать опциональный CLI-параметр с средним временем бенчмарка, и если среднее время выше - он должен "падать" (чтобы можно было использовать в CI) + +О классе Группы Сценариев: + +- Группу сценариев можно создать 4 способами: создав инстанс класса Группы напрямую, (опционально) передав ему один или несколько Сценариев в качестве аргументов (через *args); просто суммировав несколько сценариев через оператор "+"; суммировав одну или несколько Групп; суммировав один или несколько Сценариев и одну или несколько групп. Все суммы должны быть "плоскими", то есть если будет два плюсика, то не должна образовываться вложенная иерархия. 
+- Группа сценариев имеет внешний API, похожий на API одного сценария: метод run(), возвращающий список Результатов Бенчмарка и метод cli(), запускающий все бенчмарки группы сразу и выводящий общий результат в консоль через разделители + +О классе Результата Бенчмарка: +- Результат Бенчмарка - это датакласс +- У Результата Бенчмарка которого должны быть следующие поля: среднее время, худшее время, лучшее время, а также объект Сценария, из которого получен этот результат +- Также Результат Бенчмарка должен хранить в себе продолжительность каждого отдельного запуска бенчмарка +- Результат должен иметь метод percentile(), который возвращает другой объект Результата Бенчмарка, суженный по данному персентилю +- Также в Результате должен быть кэшируемый property с 95 персентилем, и такой же с 99 персентилем, возвращающий тоже объекты Результатов Бенчмарка +- У результата бенчмарка должно быть bool-поле, показывающее, является он первичным или производным (то есть суженным, например по персентилю) +- Результат должен иметь метод сериализации в json и метод десериализации из json +- Для аггрегации результатов должны использоваться методы, минимизирующие погрешность из-за складывания чисел с плавающей точкой + +Про документацию: +- Документация должна содержаться только в README.md +- Она должна быть простой и понятной, но исчерпывающей +- Должна сопровождаться примерами кода +- В примерах кода не нужно дублировать импорты, вывод должен демонстрироваться через print'ы и потом строчку с комментарием, начинаяющимся с "#>" и потом реальный ожидаемый вывод +- Она должна иметь следующую структуру: краткое описание проекта и ключевые фичи, оглавление (с якорными ссылками на разделы), способ установки, быстрый старт, раздел про Сценарий, раздел про Группу Сценариев, раздел про Результат Бенчмарка, раздел сравнения с конкурентами +- Документация должна быть оформлена в спокойном и дружелюбном стиле, без злоупотребления эмодзи + +Про тесты: +- Минимальное покрытие - 100% +- Все, о 
чем сказано в README, должно быть тщательно протестировано +- Все mypy-контракты должны быть покрыты тестами с использованием библиотеки pytest-mypy-testing +- Все CLI-штуки должны быть протестированы с помощью библиотеки subprocess +- ClI-тесты должны лежать в tests/cli, обычные юнит-тесты в tests/units, а тесты типизации - в tests/typing +- На основе каждого примера кода нужно также создать по тесту, эти тесты должны лежать в tests/documentation/test_readme.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/documentation/__init__.py b/tests/documentation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/documentation/test_readme.md b/tests/documentation/test_readme.md new file mode 100644 index 0000000..e69de29 diff --git a/tests/typing/__init__.py b/tests/typing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/units/__init__.py b/tests/units/__init__.py new file mode 100644 index 0000000..e69de29 From eadf7bbb2f2f18e4ab3d86aaa135329d6818dee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:19:22 +0300 Subject: [PATCH 02/33] Add full documentation with examples and feature comparison --- README.md | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 324 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b79d431..f76c434 100644 --- a/README.md +++ b/README.md @@ -1 +1,324 @@ -# microbenchmark \ No newline at end of file +# microbenchmark + +A minimal Python library for writing and running benchmarks. + +`microbenchmark` gives you simple building blocks — `Scenario`, `ScenarioGroup`, and `BenchmarkResult` — that you can embed directly into your project or call from CI. 
There is no CLI tool to install and no configuration to manage. You write a Python file, call `.run()` or `.cli()`, and you're done. + +**Key features:** + +- A `Scenario` wraps any callable with a fixed argument list and runs it `n` times, collecting per-run timings. +- A `ScenarioGroup` lets you combine scenarios and run them together. +- `BenchmarkResult` holds every individual duration and gives you mean, best, worst, and percentile views. +- Results can be serialised to and restored from JSON. +- No external dependencies beyond the Python standard library. + +--- + +## Table of contents + +- [Installation](#installation) +- [Quick start](#quick-start) +- [Scenario](#scenario) +- [ScenarioGroup](#scenariogroup) +- [BenchmarkResult](#benchmarkresult) +- [Comparison with alternatives](#comparison-with-alternatives) + +--- + +## Installation + +``` +pip install microbenchmark +``` + +--- + +## Quick start + +```python +from microbenchmark import Scenario + +def build_list(): + return list(range(1000)) + +scenario = Scenario(build_list, name='build_list', number=500) +result = scenario.run() + +print(result.mean) +#> 0.000012 (example value, actual result will vary) +print(result.best) +#> 0.000010 +print(result.worst) +#> 0.000018 +``` + +--- + +## Scenario + +A `Scenario` describes a single benchmark: the function to call, what arguments to pass, and how many times to run it. + +### Constructor + +```python +Scenario( + function, + args=None, + *, + name, + doc='', + number=1000, + timer=time.perf_counter, +) +``` + +- `function` — the callable to benchmark. +- `args` — a list of positional arguments to pass on each call. `None` (the default) means the function is called with no arguments. The list is copied on construction, so mutating it afterwards has no effect. +- `name` — a short label for this scenario (required). +- `doc` — an optional longer description. +- `number` — how many times to call `function` per run. Must be at least `1`. 
+- `timer` — a callable that returns the current time as a `float`. Defaults to `time.perf_counter`. Useful for injecting a controlled clock in tests. + +```python +scenario = Scenario( + sorted, + args=[[3, 1, 2]], + name='sort_three_items', + doc='Sort a list of three integers.', + number=10000, +) +``` + +### `run(warmup=0)` + +Runs the benchmark and returns a `BenchmarkResult`. + +The optional `warmup` argument specifies how many calls to make before timing begins. Warm-up calls execute the function and consume timer ticks, but their timings are not included in the result. + +```python +result = scenario.run(warmup=100) +print(len(result.durations)) +#> 10000 +``` + +### `cli()` + +Turns the scenario into a small command-line programme. Call `scenario.cli()` as the entry point of a script and it will parse `sys.argv`, run the benchmark, and print the result. + +Supported arguments: + +- `--number N` — override the scenario's `number` for this run. +- `--max-mean THRESHOLD` — exit with code `1` if the mean time (in seconds) exceeds `THRESHOLD`. Useful in CI. + +```python +# benchmark.py +from microbenchmark import Scenario + +def build_list(): + return list(range(1000)) + +scenario = Scenario(build_list, name='build_list', number=500) + +if __name__ == '__main__': + scenario.cli() +``` + +``` +$ python benchmark.py --number 1000 +benchmark: build_list +mean: 0.000012s +best: 0.000010s +worst: 0.000018s +``` + +``` +$ python benchmark.py --max-mean 0.001 +benchmark: build_list +mean: 0.000012s +best: 0.000010s +worst: 0.000018s +``` + +``` +$ python benchmark.py --max-mean 0.000001 +benchmark: build_list +mean: 0.000012s +best: 0.000010s +worst: 0.000018s +$ echo $? +#> 1 +``` + +--- + +## ScenarioGroup + +A `ScenarioGroup` holds a flat collection of scenarios and lets you run them together. + +### Creating a group + +There are four ways to create a group. 
+ +**Direct construction** — pass any number of scenarios to the constructor: + +```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') + +group = ScenarioGroup(s1, s2) +``` + +**The `+` operator between scenarios** — adding two or more `Scenario` objects produces a `ScenarioGroup`: + +```python +group = s1 + s2 +``` + +**Adding a scenario to a group** — the result is always a flat group: + +```python +s3 = Scenario(lambda: None, name='s3') +group = s1 + s2 + s3 +print(type(group).__name__) +#> ScenarioGroup +``` + +**Adding two groups together** — the result is a single flat group containing the scenarios from both: + +```python +g1 = ScenarioGroup(s1) +g2 = ScenarioGroup(s2, s3) +combined = g1 + g2 +print(len(combined.run())) +#> 3 +``` + +### `run(warmup=0)` + +Runs every scenario in order and returns a list of `BenchmarkResult` objects. The order in the list matches the order the scenarios were added. + +```python +results = group.run(warmup=50) +for result in results: + print(result.scenario.name, result.mean) +#> s1 ... +#> s2 ... +#> s3 ... +``` + +### `cli()` + +Runs all scenarios and prints their results separated by dividers. + +Supported arguments: + +- `--number N` — passed to every scenario. +- `--max-mean THRESHOLD` — exits with code `1` if any scenario's mean exceeds the threshold. + +```python +# benchmarks.py +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: list(range(100)), name='range_100') +s2 = Scenario(lambda: list(range(1000)), name='range_1000') + +group = s1 + s2 + +if __name__ == '__main__': + group.cli() +``` + +``` +$ python benchmarks.py +benchmark: range_100 +mean: 0.000003s +best: 0.000002s +worst: 0.000005s +--- +benchmark: range_1000 +mean: 0.000012s +best: 0.000010s +worst: 0.000018s +``` + +--- + +## BenchmarkResult + +`BenchmarkResult` is a dataclass that holds the outcome of a single benchmark run. 
+ +### Fields + +- `scenario` — the `Scenario` that produced this result, or `None` if the result was restored from JSON. +- `durations` — a tuple of per-call timings in seconds, one entry per call. +- `mean` — arithmetic mean of `durations`, computed with `math.fsum` to minimise floating-point error. +- `best` — the shortest individual timing. +- `worst` — the longest individual timing. +- `is_primary` — `True` for results returned directly by `run()`, `False` for results derived via `percentile()`. + +```python +result = Scenario(lambda: None, name='noop', number=100).run() +print(len(result.durations)) +#> 100 +print(result.is_primary) +#> True +``` + +### `percentile(p)` + +Returns a new `BenchmarkResult` containing only the fastest `ceil(len(durations) * p / 100)` timings. The returned result has `is_primary=False`. + +```python +trimmed = result.percentile(95) +print(trimmed.is_primary) +#> False +print(len(trimmed.durations) <= len(result.durations)) +#> True +``` + +`p` must be in the range `(0, 100]`. Passing `0` or a value above `100` raises `ValueError`. + +### `p95` and `p99` + +Convenient cached properties that return `percentile(95)` and `percentile(99)` respectively. The value is computed once and cached for the lifetime of the result object. + +```python +print(result.p95.mean <= result.mean) +#> True +``` + +### `to_json()` and `from_json()` + +`to_json()` serialises the result to a JSON string. It stores all individual `durations`, `is_primary`, and the scenario's `name`, `doc`, and `number`. + +`from_json()` restores a `BenchmarkResult` from a JSON string produced by `to_json()`. Because the original callable cannot be serialised, the restored result has `scenario=None`. 
+ +```python +json_str = result.to_json() +restored = BenchmarkResult.from_json(json_str) + +print(restored.scenario) +#> None +print(restored.mean == result.mean) +#> True +print(restored.durations == result.durations) +#> True +``` + +--- + +## Comparison with alternatives + +| Feature | `microbenchmark` | `timeit` (stdlib) | `pytest-benchmark` | +|---|---|---|---| +| Per-call timings | yes | no | yes | +| Percentile views | yes | no | yes | +| JSON serialisation | yes | no | no | +| CI integration (`--max-mean`) | yes | no | via plugins | +| `+` operator for grouping | yes | no | no | +| External dependencies | none | none | several | +| Embeddable in your own code | yes | yes | test suite only | + +`timeit` from the standard library is great for interactive exploration but gives you only a single aggregate number. `pytest-benchmark` is powerful but is tightly coupled to the `pytest` runner and brings its own dependencies. `microbenchmark` occupies the space between: richer than `timeit`, lighter than `pytest-benchmark`, and not tied to any test framework. From 728d0bb8584c9dd7bf25e4199197af13a3eadee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:25:50 +0300 Subject: [PATCH 03/33] Update spec.md to clarify documentation language, code formatting, and test coverage requirements --- spec.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spec.md b/spec.md index 8fa1541..48a275c 100644 --- a/spec.md +++ b/spec.md @@ -14,6 +14,7 @@ - Мы придерживаемся принципов TDD: сначала продумываем тесты, и только потом приступаем к реализации. - Документация (README) - это источник правды. Тесты - основаны на документации. Основной код - основан на тестах. - *Каждое* содержательное утверждение в документации должно опираться на соответствующие тесты. 
+- Нет внешним зависимостям, разрешены только зависимости на проекты, находящиеся внутри организации https://github.com/mutating ## Порядок работы @@ -76,11 +77,15 @@ - В примерах кода не нужно дублировать импорты, вывод должен демонстрироваться через print'ы и потом строчку с комментарием, начинаяющимся с "#>" и потом реальный ожидаемый вывод - Она должна иметь следующую структуру: краткое описание проекта и ключевые фичи, оглавление (с якорными ссылками на разделы), способ установки, быстрый старт, раздел про Сценарий, раздел про Группу Сценариев, раздел про Результат Бенчмарка, раздел сравнения с конкурентами - Документация должна быть оформлена в спокойном и дружелюбном стиле, без злоупотребления эмодзи +- Документация должна быть на английском +- Нужно активно использовать бэктики в тексте для выделения сущностей из кода, имен библиотек и всякого подобного +- Хороший пример документации, на который можно равняться, показан в проекте https://github.com/mutating/pristan Про тесты: - Минимальное покрытие - 100% - Все, о чем сказано в README, должно быть тщательно протестировано -- Все mypy-контракты должны быть покрыты тестами с использованием библиотеки pytest-mypy-testing +- Не скупимся как на негативные, так и на позитивные кейсы в тестах +- Все mypy-контракты должны быть покрыты тестами с использованием библиотеки pytest-mypy-testing, также должно быть много негативных кейсов и все возможные положительные - Все CLI-штуки должны быть протестированы с помощью библиотеки subprocess - ClI-тесты должны лежать в tests/cli, обычные юнит-тесты в tests/units, а тесты типизации - в tests/typing - На основе каждого примера кода нужно также создать по тесту, эти тесты должны лежать в tests/documentation/test_readme.py From 05c4e1f420bd0b179ca34fe2d5fc227baf581818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:30:23 +0300 Subject: [PATCH 04/33] Better readme 
--- README.md | 156 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index f76c434..9a552e8 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,14 @@ A minimal Python library for writing and running benchmarks. -`microbenchmark` gives you simple building blocks — `Scenario`, `ScenarioGroup`, and `BenchmarkResult` — that you can embed directly into your project or call from CI. There is no CLI tool to install and no configuration to manage. You write a Python file, call `.run()` or `.cli()`, and you're done. +`microbenchmark` gives you simple building blocks — `Scenario`, `ScenarioGroup`, and `BenchmarkResult` — that you can embed directly into your project or call from CI. There is no CLI tool to install and no configuration to manage. You write a Python file, call `.run()` or `.cli()`, and you are done. **Key features:** - A `Scenario` wraps any callable with a fixed argument list and runs it `n` times, collecting per-run timings. -- A `ScenarioGroup` lets you combine scenarios and run them together. +- A `ScenarioGroup` lets you combine scenarios and run them together with a single call. - `BenchmarkResult` holds every individual duration and gives you mean, best, worst, and percentile views. -- Results can be serialised to and restored from JSON. +- Results can be serialized to and restored from JSON. - No external dependencies beyond the Python standard library. 
--- @@ -44,12 +44,11 @@ def build_list(): scenario = Scenario(build_list, name='build_list', number=500) result = scenario.run() -print(result.mean) -#> 0.000012 (example value, actual result will vary) +print(result.mean) # example — actual value depends on your hardware print(result.best) -#> 0.000010 print(result.worst) -#> 0.000018 +print(len(result.durations)) +#> 500 ``` --- @@ -68,18 +67,20 @@ Scenario( name, doc='', number=1000, - timer=time.perf_counter, + timer=..., # defaults to time.perf_counter ) ``` - `function` — the callable to benchmark. -- `args` — a list of positional arguments to pass on each call. `None` (the default) means the function is called with no arguments. The list is copied on construction, so mutating it afterwards has no effect. +- `args` — a list of positional arguments passed to `function` on every call. `None` (the default) and `[]` both mean the function is called with no positional arguments. The list is shallow-copied on construction, so appending to your original list afterward has no effect. Keyword arguments are not supported; wrap your callable in a `functools.partial` or a lambda if you need them. - `name` — a short label for this scenario (required). - `doc` — an optional longer description. -- `number` — how many times to call `function` per run. Must be at least `1`. -- `timer` — a callable that returns the current time as a `float`. Defaults to `time.perf_counter`. Useful for injecting a controlled clock in tests. +- `number` — how many times to call `function` per run. Must be at least `1`; passing `0` or a negative value raises `ValueError`. +- `timer` — a zero-argument callable that returns the current time as a `float`. Defaults to `time.perf_counter`. Useful for injecting a controlled clock in tests. 
```python +from microbenchmark import Scenario + scenario = Scenario( sorted, args=[[3, 1, 2]], @@ -89,26 +90,54 @@ scenario = Scenario( ) ``` +For keyword arguments, use `functools.partial`: + +```python +from functools import partial +from microbenchmark import Scenario + +scenario = Scenario( + partial(sorted, key=lambda x: -x), + args=[[3, 1, 2]], + name='sort_descending', +) +``` + ### `run(warmup=0)` Runs the benchmark and returns a `BenchmarkResult`. -The optional `warmup` argument specifies how many calls to make before timing begins. Warm-up calls execute the function and consume timer ticks, but their timings are not included in the result. +The optional `warmup` argument specifies how many calls to make before timing begins. Warm-up calls invoke the function and consume timer ticks, but their timings are not included in the result. ```python +from microbenchmark import Scenario + +scenario = Scenario(lambda: list(range(100)), name='build', number=1000) result = scenario.run(warmup=100) print(len(result.durations)) -#> 10000 +#> 1000 ``` ### `cli()` -Turns the scenario into a small command-line programme. Call `scenario.cli()` as the entry point of a script and it will parse `sys.argv`, run the benchmark, and print the result. +Turns the scenario into a small command-line program. Call `scenario.cli()` as the entry point of a script and it will parse `sys.argv`, run the benchmark, and print the result. Supported arguments: - `--number N` — override the scenario's `number` for this run. - `--max-mean THRESHOLD` — exit with code `1` if the mean time (in seconds) exceeds `THRESHOLD`. Useful in CI. +- `--help` — print usage information and exit. + +Output format: + +``` +benchmark: +mean: s +best: s +worst: s +``` + +Values are in seconds. The `mean`, `best`, and `worst` labels are padded to the same width. If `--max-mean` is supplied and the actual mean exceeds the threshold, the same output is printed but the process exits with code `1`. 
```python # benchmark.py @@ -124,7 +153,7 @@ if __name__ == '__main__': ``` ``` -$ python benchmark.py --number 1000 +$ python benchmark.py benchmark: build_list mean: 0.000012s best: 0.000010s @@ -137,6 +166,8 @@ benchmark: build_list mean: 0.000012s best: 0.000010s worst: 0.000018s +$ echo $? +0 ``` ``` @@ -146,7 +177,7 @@ mean: 0.000012s best: 0.000010s worst: 0.000018s $ echo $? -#> 1 +1 ``` --- @@ -170,15 +201,31 @@ s2 = Scenario(lambda: None, name='s2') group = ScenarioGroup(s1, s2) ``` +You can also create an empty group and combine it with others later: + +```python +empty = ScenarioGroup() +print(len(empty.run())) +#> 0 +``` + **The `+` operator between scenarios** — adding two or more `Scenario` objects produces a `ScenarioGroup`: ```python +from microbenchmark import Scenario + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') group = s1 + s2 ``` -**Adding a scenario to a group** — the result is always a flat group: +**Adding a scenario to a group** — the result is always a flat group with no nesting: ```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') s3 = Scenario(lambda: None, name='s3') group = s1 + s2 + s3 print(type(group).__name__) @@ -188,6 +235,11 @@ print(type(group).__name__) **Adding two groups together** — the result is a single flat group containing the scenarios from both: ```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') +s3 = Scenario(lambda: None, name='s3') g1 = ScenarioGroup(s1) g2 = ScenarioGroup(s2, s3) combined = g1 + g2 @@ -197,25 +249,30 @@ print(len(combined.run())) ### `run(warmup=0)` -Runs every scenario in order and returns a list of `BenchmarkResult` objects. The order in the list matches the order the scenarios were added. +Runs every scenario in order and returns a list of `BenchmarkResult` objects. 
The order in the list matches the order the scenarios were added. The `warmup` argument is forwarded to each scenario. ```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') +group = ScenarioGroup(s1, s2) results = group.run(warmup=50) for result in results: - print(result.scenario.name, result.mean) -#> s1 ... -#> s2 ... -#> s3 ... + print(result.scenario.name) +#> s1 +#> s2 ``` ### `cli()` -Runs all scenarios and prints their results separated by dividers. +Runs all scenarios and prints their results separated by `---` dividers. Supported arguments: - `--number N` — passed to every scenario. - `--max-mean THRESHOLD` — exits with code `1` if any scenario's mean exceeds the threshold. +- `--help` — print usage information and exit. ```python # benchmarks.py @@ -251,14 +308,16 @@ worst: 0.000018s ### Fields -- `scenario` — the `Scenario` that produced this result, or `None` if the result was restored from JSON. -- `durations` — a tuple of per-call timings in seconds, one entry per call. -- `mean` — arithmetic mean of `durations`, computed with `math.fsum` to minimise floating-point error. -- `best` — the shortest individual timing. -- `worst` — the longest individual timing. -- `is_primary` — `True` for results returned directly by `run()`, `False` for results derived via `percentile()`. +- `scenario: Scenario | None` — the `Scenario` that produced this result, or `None` if the result was restored from JSON. +- `durations: tuple[float, ...]` — per-call timings in seconds, one entry per call. +- `mean: float` — arithmetic mean of `durations`, computed with `math.fsum` to minimize floating-point error. +- `best: float` — the shortest individual timing. +- `worst: float` — the longest individual timing. +- `is_primary: bool` — `True` for results returned directly by `run()`, `False` for results derived via `percentile()`. 
```python +from microbenchmark import Scenario + result = Scenario(lambda: None, name='noop', number=100).run() print(len(result.durations)) #> 100 @@ -268,34 +327,45 @@ print(result.is_primary) ### `percentile(p)` -Returns a new `BenchmarkResult` containing only the fastest `ceil(len(durations) * p / 100)` timings. The returned result has `is_primary=False`. +Returns a new `BenchmarkResult` containing only the `ceil(len(durations) * p / 100)` fastest timings, sorted by duration ascending. The returned result has `is_primary=False`. `p` must be in the range `(0, 100]`; passing `0` or a value above `100` raises `ValueError`. ```python +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() trimmed = result.percentile(95) print(trimmed.is_primary) #> False -print(len(trimmed.durations) <= len(result.durations)) -#> True +print(len(trimmed.durations)) +#> 95 ``` -`p` must be in the range `(0, 100]`. Passing `0` or a value above `100` raises `ValueError`. - ### `p95` and `p99` Convenient cached properties that return `percentile(95)` and `percentile(99)` respectively. The value is computed once and cached for the lifetime of the result object. ```python -print(result.p95.mean <= result.mean) -#> True +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() +p95 = result.p95 +print(len(p95.durations)) +#> 95 +print(p95.is_primary) +#> False ``` ### `to_json()` and `from_json()` -`to_json()` serialises the result to a JSON string. It stores all individual `durations`, `is_primary`, and the scenario's `name`, `doc`, and `number`. +`to_json()` serializes the result to a JSON string. It stores all individual `durations`, `is_primary`, and the scenario's `name`, `doc`, and `number`. -`from_json()` restores a `BenchmarkResult` from a JSON string produced by `to_json()`. Because the original callable cannot be serialised, the restored result has `scenario=None`. 
+`from_json()` is a class method that restores a `BenchmarkResult` from a JSON string produced by `to_json()`. Because the original callable cannot be serialized, the restored result has `scenario=None`. The `mean`, `best`, and `worst` fields are recomputed from `durations` on restoration. ```python +from microbenchmark import Scenario, BenchmarkResult + +result = Scenario(lambda: None, name='noop', number=100).run() + json_str = result.to_json() restored = BenchmarkResult.from_json(json_str) @@ -315,10 +385,12 @@ print(restored.durations == result.durations) |---|---|---|---| | Per-call timings | yes | no | yes | | Percentile views | yes | no | yes | -| JSON serialisation | yes | no | no | -| CI integration (`--max-mean`) | yes | no | via plugins | +| JSON serialization | yes | no | yes (internal format) | +| Inject custom timer | yes | yes | no | +| Warmup support | yes | no | yes (calibration) | +| CI integration (`--max-mean`) | yes | no | via configuration | | `+` operator for grouping | yes | no | no | | External dependencies | none | none | several | -| Embeddable in your own code | yes | yes | test suite only | +| Embeddable in your own code | yes | yes | pytest plugin required | -`timeit` from the standard library is great for interactive exploration but gives you only a single aggregate number. `pytest-benchmark` is powerful but is tightly coupled to the `pytest` runner and brings its own dependencies. `microbenchmark` occupies the space between: richer than `timeit`, lighter than `pytest-benchmark`, and not tied to any test framework. +`timeit` from the standard library is great for interactive exploration but gives you only a single aggregate number and offers no per-call data. `pytest-benchmark` is powerful and well-integrated into the `pytest` ecosystem, but it is tightly coupled to the test runner and brings its own dependencies. 
`microbenchmark` sits between the two: richer than `timeit`, lighter and more portable than `pytest-benchmark`, and not tied to any test framework. From 0bb653b27a9ca2544f4daef6c8e12871327d5116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:43:50 +0300 Subject: [PATCH 05/33] Better readme --- README.md | 106 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 9a552e8..3d5c195 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ A minimal Python library for writing and running benchmarks. -`microbenchmark` gives you simple building blocks — `Scenario`, `ScenarioGroup`, and `BenchmarkResult` — that you can embed directly into your project or call from CI. There is no CLI tool to install and no configuration to manage. You write a Python file, call `.run()` or `.cli()`, and you are done. +`microbenchmark` gives you simple building blocks — `Scenario`, `ScenarioGroup`, and `BenchmarkResult` — that you can embed directly into your project or call from CI. No separate CLI package to install; `.cli()` is built in. You write a Python file, call `.run()` or `.cli()`, and you are done. **Key features:** @@ -44,11 +44,14 @@ def build_list(): scenario = Scenario(build_list, name='build_list', number=500) result = scenario.run() +print(len(result.durations)) +#> 500 print(result.mean) # example — actual value depends on your hardware +#> 0.000012 print(result.best) +#> 0.000010 print(result.worst) -print(len(result.durations)) -#> 500 +#> 0.000018 ``` --- @@ -67,18 +70,19 @@ Scenario( name, doc='', number=1000, - timer=..., # defaults to time.perf_counter + timer=time.perf_counter, ) ``` - `function` — the callable to benchmark. -- `args` — a list of positional arguments passed to `function` on every call. 
`None` (the default) and `[]` both mean the function is called with no positional arguments. The list is shallow-copied on construction, so appending to your original list afterward has no effect. Keyword arguments are not supported; wrap your callable in a `functools.partial` or a lambda if you need them. +- `args` — a list of positional arguments passed to `function` on every call as `function(*args)`. `None` (the default) and `[]` both mean the function is called with no arguments. The list is shallow-copied on construction, so appending to your original list afterward has no effect. Keyword arguments are not supported; wrap your callable in a `functools.partial` or a lambda if you need them. - `name` — a short label for this scenario (required). - `doc` — an optional longer description. - `number` — how many times to call `function` per run. Must be at least `1`; passing `0` or a negative value raises `ValueError`. -- `timer` — a zero-argument callable that returns the current time as a `float`. Defaults to `time.perf_counter`. Useful for injecting a controlled clock in tests. +- `timer` — a zero-argument callable that returns the current time as a `float`. Defaults to `time.perf_counter`. Supply a custom clock to get deterministic measurements in tests. ```python +import time from microbenchmark import Scenario scenario = Scenario( @@ -103,11 +107,22 @@ scenario = Scenario( ) ``` +For functions that take multiple positional arguments, list all of them in `args`: + +```python +from microbenchmark import Scenario + +scenario = Scenario(pow, args=[2, 10], name='power') +result = scenario.run() +print(result.mean) +#> 0.000000 # example — very fast operation +``` + ### `run(warmup=0)` Runs the benchmark and returns a `BenchmarkResult`. -The optional `warmup` argument specifies how many calls to make before timing begins. Warm-up calls invoke the function and consume timer ticks, but their timings are not included in the result. 
+The optional `warmup` argument specifies how many calls to make before timing begins. Warm-up calls execute the function but are not timed and their results are discarded. ```python from microbenchmark import Scenario @@ -120,7 +135,7 @@ print(len(result.durations)) ### `cli()` -Turns the scenario into a small command-line program. Call `scenario.cli()` as the entry point of a script and it will parse `sys.argv`, run the benchmark, and print the result. +Turns the scenario into a small command-line program. Call `scenario.cli()` as the entry point of a script and it will parse `sys.argv`, run the benchmark, and print the result to stdout. Supported arguments: @@ -141,6 +156,7 @@ Values are in seconds. The `mean`, `best`, and `worst` labels are padded to the ```python # benchmark.py +import time from microbenchmark import Scenario def build_list(): @@ -160,6 +176,14 @@ best: 0.000010s worst: 0.000018s ``` +``` +$ python benchmark.py --number 100 +benchmark: build_list +mean: 0.000013s +best: 0.000010s +worst: 0.000020s +``` + ``` $ python benchmark.py --max-mean 0.001 benchmark: build_list @@ -190,7 +214,7 @@ A `ScenarioGroup` holds a flat collection of scenarios and lets you run them tog There are four ways to create a group. -**Direct construction** — pass any number of scenarios to the constructor: +**Direct construction** — pass any number of scenarios to the constructor. 
Passing no scenarios creates an empty group: ```python from microbenchmark import Scenario, ScenarioGroup @@ -199,17 +223,12 @@ s1 = Scenario(lambda: None, name='s1') s2 = Scenario(lambda: None, name='s2') group = ScenarioGroup(s1, s2) -``` - -You can also create an empty group and combine it with others later: - -```python empty = ScenarioGroup() print(len(empty.run())) #> 0 ``` -**The `+` operator between scenarios** — adding two or more `Scenario` objects produces a `ScenarioGroup`: +**The `+` operator between two scenarios** produces a `ScenarioGroup`: ```python from microbenchmark import Scenario @@ -217,9 +236,11 @@ from microbenchmark import Scenario s1 = Scenario(lambda: None, name='s1') s2 = Scenario(lambda: None, name='s2') group = s1 + s2 +print(type(group).__name__) +#> ScenarioGroup ``` -**Adding a scenario to a group** — the result is always a flat group with no nesting: +**Adding a scenario to an existing group**, or vice versa — the result is always a new flat group with no nesting: ```python from microbenchmark import Scenario, ScenarioGroup @@ -227,12 +248,14 @@ from microbenchmark import Scenario, ScenarioGroup s1 = Scenario(lambda: None, name='s1') s2 = Scenario(lambda: None, name='s2') s3 = Scenario(lambda: None, name='s3') -group = s1 + s2 + s3 -print(type(group).__name__) -#> ScenarioGroup +group = ScenarioGroup(s1, s2) +extended = group + s3 # ScenarioGroup + Scenario +also_ok = s3 + group # Scenario + ScenarioGroup +print(len(extended.run())) +#> 3 ``` -**Adding two groups together** — the result is a single flat group containing the scenarios from both: +**Adding two groups together** produces a single flat group: ```python from microbenchmark import Scenario, ScenarioGroup @@ -249,7 +272,7 @@ print(len(combined.run())) ### `run(warmup=0)` -Runs every scenario in order and returns a list of `BenchmarkResult` objects. The order in the list matches the order the scenarios were added. The `warmup` argument is forwarded to each scenario. 
+Runs every scenario in order and returns a list of `BenchmarkResult` objects. The order of results matches the order the scenarios were added. The `warmup` argument is forwarded to each scenario individually. ```python from microbenchmark import Scenario, ScenarioGroup @@ -266,7 +289,7 @@ for result in results: ### `cli()` -Runs all scenarios and prints their results separated by `---` dividers. +Runs all scenarios and prints their results to stdout. Each scenario block follows the same format as `Scenario.cli()`, and blocks are separated by a `---` line. The separator appears only between blocks, not after the last one. Supported arguments: @@ -309,11 +332,13 @@ worst: 0.000018s ### Fields - `scenario: Scenario | None` — the `Scenario` that produced this result, or `None` if the result was restored from JSON. -- `durations: tuple[float, ...]` — per-call timings in seconds, one entry per call. -- `mean: float` — arithmetic mean of `durations`, computed with `math.fsum` to minimize floating-point error. -- `best: float` — the shortest individual timing. -- `worst: float` — the longest individual timing. -- `is_primary: bool` — `True` for results returned directly by `run()`, `False` for results derived via `percentile()`. +- `durations: tuple[float, ...]` — per-call timings in seconds, one entry per call, in the order they were measured. +- `mean: float` — arithmetic mean of `durations`, computed with `math.fsum` to minimize floating-point error. Computed automatically from `durations`. +- `best: float` — the shortest individual timing. Computed automatically. +- `worst: float` — the longest individual timing. Computed automatically. +- `is_primary: bool` — `True` for results returned directly by `run()`, `False` for results derived via `percentile()`. Preserved during JSON round-trips. + +The `mean`, `best`, and `worst` fields are read-only computed values; they are not accepted as constructor arguments. 
```python from microbenchmark import Scenario @@ -340,6 +365,16 @@ print(len(trimmed.durations)) #> 95 ``` +You can call `percentile()` on a derived result too: + +```python +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() +print(len(result.percentile(90).percentile(50).durations)) +#> 45 +``` + ### `p95` and `p99` Convenient cached properties that return `percentile(95)` and `percentile(99)` respectively. The value is computed once and cached for the lifetime of the result object. @@ -348,16 +383,17 @@ Convenient cached properties that return `percentile(95)` and `percentile(99)` r from microbenchmark import Scenario result = Scenario(lambda: None, name='noop', number=100).run() -p95 = result.p95 -print(len(p95.durations)) +print(len(result.p95.durations)) #> 95 -print(p95.is_primary) +print(result.p95.is_primary) #> False +print(result.p95 is result.p95) # cached — same object returned each time +#> True ``` ### `to_json()` and `from_json()` -`to_json()` serializes the result to a JSON string. It stores all individual `durations`, `is_primary`, and the scenario's `name`, `doc`, and `number`. +`to_json()` serializes the result to a JSON string. It stores `durations`, `is_primary`, and the scenario's `name`, `doc`, and `number`. `from_json()` is a class method that restores a `BenchmarkResult` from a JSON string produced by `to_json()`. Because the original callable cannot be serialized, the restored result has `scenario=None`. The `mean`, `best`, and `worst` fields are recomputed from `durations` on restoration. 
@@ -375,6 +411,8 @@ print(restored.mean == result.mean) #> True print(restored.durations == result.durations) #> True +print(restored.is_primary == result.is_primary) +#> True ``` --- @@ -383,9 +421,9 @@ print(restored.durations == result.durations) | Feature | `microbenchmark` | `timeit` (stdlib) | `pytest-benchmark` | |---|---|---|---| -| Per-call timings | yes | no | yes | +| Per-call timings | yes | via `repeat(number=1)` | yes | | Percentile views | yes | no | yes | -| JSON serialization | yes | no | yes (internal format) | +| JSON serialization | yes | no | yes | | Inject custom timer | yes | yes | no | | Warmup support | yes | no | yes (calibration) | | CI integration (`--max-mean`) | yes | no | via configuration | @@ -393,4 +431,4 @@ print(restored.durations == result.durations) | External dependencies | none | none | several | | Embeddable in your own code | yes | yes | pytest plugin required | -`timeit` from the standard library is great for interactive exploration but gives you only a single aggregate number and offers no per-call data. `pytest-benchmark` is powerful and well-integrated into the `pytest` ecosystem, but it is tightly coupled to the test runner and brings its own dependencies. `microbenchmark` sits between the two: richer than `timeit`, lighter and more portable than `pytest-benchmark`, and not tied to any test framework. +`timeit` from the standard library is great for interactive exploration, but it gives only a single aggregate number per call — you can get a list by using `repeat(number=1)`, though the interface is not designed around it. `pytest-benchmark` is powerful and well-integrated into the `pytest` ecosystem, but it is tightly coupled to the test runner and brings its own dependencies. `microbenchmark` sits between the two: richer than `timeit`, lighter and more portable than `pytest-benchmark`, and not tied to any test framework. 
From 074e8a336c2f932d763d6a1e4841f361af63d9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:53:21 +0300 Subject: [PATCH 06/33] Export core classes from microbenchmark.__init__.py --- microbenchmark/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/microbenchmark/__init__.py b/microbenchmark/__init__.py index e69de29..efc89df 100644 --- a/microbenchmark/__init__.py +++ b/microbenchmark/__init__.py @@ -0,0 +1,7 @@ +from __future__ import annotations + +from microbenchmark.benchmark_result import BenchmarkResult +from microbenchmark.scenario import Scenario +from microbenchmark.scenario_group import ScenarioGroup + +__all__ = ['BenchmarkResult', 'Scenario', 'ScenarioGroup'] From 128d85dab3487bb215a0cb451cf04f401e587b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:53:51 +0300 Subject: [PATCH 07/33] Implement core benchmarking classes: Scenario, ScenarioGroup, and BenchmarkResult --- microbenchmark/benchmark_result.py | 72 ++++++++++++++++++++++ microbenchmark/scenario.py | 99 ++++++++++++++++++++++++++++++ microbenchmark/scenario_group.py | 70 +++++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 microbenchmark/benchmark_result.py create mode 100644 microbenchmark/scenario.py create mode 100644 microbenchmark/scenario_group.py diff --git a/microbenchmark/benchmark_result.py b/microbenchmark/benchmark_result.py new file mode 100644 index 0000000..aef954d --- /dev/null +++ b/microbenchmark/benchmark_result.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import json +import math +from dataclasses import dataclass, field +from functools import cached_property +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from microbenchmark.scenario import Scenario + + +@dataclass +class 
@dataclass
class BenchmarkResult:
    """Timing results of a single benchmark run.

    Holds every individual call duration plus aggregate views. The
    aggregates (``mean``, ``best``, ``worst``) are derived from
    ``durations`` in ``__post_init__`` and are not constructor arguments.

    Attributes:
        scenario: The ``Scenario`` that produced this result, or ``None``
            when the result was restored from JSON.
        durations: Per-call timings in seconds, in measurement order.
        is_primary: ``True`` for results returned directly by ``run()``,
            ``False`` for results derived via ``percentile()``.

    Raises:
        ValueError: If ``durations`` is empty — an empty result has no
            meaningful mean/best/worst.
    """

    scenario: Scenario | None
    durations: tuple[float, ...]
    is_primary: bool = True

    # Derived aggregates — computed once from `durations`, never passed in.
    mean: float = field(init=False)
    best: float = field(init=False)
    worst: float = field(init=False)

    def __post_init__(self) -> None:
        # Fail loudly with a clear message instead of a ZeroDivisionError
        # from the mean computation (reachable e.g. via from_json with an
        # empty durations list).
        if not self.durations:
            raise ValueError('durations must contain at least one timing')
        # math.fsum minimizes floating-point rounding error when summing
        # many tiny values.
        self.mean = math.fsum(self.durations) / len(self.durations)
        self.best = min(self.durations)
        self.worst = max(self.durations)

    def percentile(self, p: float) -> BenchmarkResult:
        """Return a derived result keeping only the fastest ``p`` percent.

        Keeps the ``ceil(len(durations) * p / 100)`` fastest timings,
        sorted ascending. The derived result has ``is_primary=False``.

        Raises:
            ValueError: If ``p`` is not in the range ``(0, 100]``.
        """
        if p <= 0 or p > 100:
            raise ValueError(f'percentile must be in (0, 100], got {p}')
        k = math.ceil(len(self.durations) * p / 100)
        trimmed = tuple(sorted(self.durations)[:k])
        return BenchmarkResult(
            scenario=self.scenario,
            durations=trimmed,
            is_primary=False,
        )

    @cached_property
    def p95(self) -> BenchmarkResult:
        """``percentile(95)``, computed once and cached on this object."""
        return self.percentile(95)

    @cached_property
    def p99(self) -> BenchmarkResult:
        """``percentile(99)``, computed once and cached on this object."""
        return self.percentile(99)

    def to_json(self) -> str:
        """Serialize this result to a JSON string.

        Stores ``durations``, ``is_primary`` and, when a scenario is
        attached, its ``name``, ``doc`` and ``number``. The callable
        itself cannot be serialized.
        """
        scenario_data: dict[str, Any] | None
        if self.scenario is not None:
            scenario_data = {
                'name': self.scenario.name,
                'doc': self.scenario.doc,
                'number': self.scenario.number,
            }
        else:
            scenario_data = None
        return json.dumps({
            'durations': list(self.durations),
            'is_primary': self.is_primary,
            'scenario': scenario_data,
        })

    @classmethod
    def from_json(cls, data: str) -> BenchmarkResult:
        """Restore a ``BenchmarkResult`` from a ``to_json()`` string.

        The restored result has ``scenario=None`` because the original
        callable is not serializable; ``mean``/``best``/``worst`` are
        recomputed from the stored durations.

        Raises:
            ValueError: If required fields are missing, the stored
                durations list is empty, or the JSON itself is malformed
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        parsed = json.loads(data)
        if 'durations' not in parsed or 'is_primary' not in parsed:
            raise ValueError('JSON is missing required fields: durations, is_primary')
        return cls(
            scenario=None,
            durations=tuple(float(d) for d in parsed['durations']),
            is_primary=bool(parsed['is_primary']),
        )
class Scenario:
    """A single benchmark: one callable, fixed arguments, ``number`` calls.

    A scenario can be executed directly with ``run()``, turned into a tiny
    command-line program with ``cli()``, or combined with other scenarios
    and groups via the ``+`` operator.
    """

    def __init__(  # noqa: PLR0913
        self,
        function: Callable[..., Any],
        args: list[Any] | None = None,
        *,
        name: str,
        doc: str = '',
        number: int = 1000,
        timer: Callable[[], float] = time.perf_counter,
    ) -> None:
        """Store the callable and run parameters.

        ``args`` is shallow-copied so that later mutation of the caller's
        list has no effect. ``number`` must be at least 1.
        """
        if number < 1:
            raise ValueError(f'number must be at least 1, got {number}')
        self.function = function
        self._args: list[Any] = [] if args is None else list(args)
        self.name = name
        self.doc = doc
        self.number = number
        self._timer = timer

    def run(self, warmup: int = 0) -> BenchmarkResult:
        """Execute the benchmark and return a primary ``BenchmarkResult``.

        ``warmup`` extra calls are made first; each warm-up call consumes
        the same two timer ticks as a measured call, but is excluded from
        the recorded durations.
        """
        fn = self.function
        call_args = self._args
        clock = self._timer

        def timed_call() -> float:
            # One measured invocation: tick, call, tick.
            started = clock()
            fn(*call_args)
            return clock() - started

        for _ in range(warmup):
            clock()
            fn(*call_args)
            clock()

        measured = tuple(timed_call() for _ in range(self.number))
        return BenchmarkResult(scenario=self, durations=measured, is_primary=True)

    def cli(self) -> None:
        """Parse ``sys.argv``, run the benchmark, and print the result.

        Exits with status 1 when ``--max-mean`` is given and the measured
        mean exceeds it.
        """
        parser = argparse.ArgumentParser(description=self.doc or f'Benchmark: {self.name}')
        parser.add_argument('--number', type=int, default=None, help='Number of iterations')
        parser.add_argument('--max-mean', type=float, default=None, dest='max_mean',
                            help='Fail if mean time (seconds) exceeds this threshold')
        options = parser.parse_args()

        # --number overrides the iteration count via a fresh scenario;
        # everything else is carried over unchanged.
        target = self if options.number is None else Scenario(
            self.function,
            self._args,
            name=self.name,
            doc=self.doc,
            number=options.number,
            timer=self._timer,
        )

        outcome = target.run()
        _print_result(outcome)

        if options.max_mean is not None and outcome.mean > options.max_mean:
            sys.exit(1)

    def __add__(self, other: object) -> ScenarioGroup:
        """``scenario + scenario`` or ``scenario + group`` → flat group."""
        from microbenchmark.scenario_group import ScenarioGroup  # noqa: PLC0415
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(self, *other._scenarios)
        if isinstance(other, Scenario):
            return ScenarioGroup(self, other)
        return NotImplemented  # type: ignore[return-value]

    def __radd__(self, other: object) -> ScenarioGroup:
        """Mirror of ``__add__`` with this scenario on the right-hand side."""
        from microbenchmark.scenario_group import ScenarioGroup  # noqa: PLC0415
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(*other._scenarios, self)
        if isinstance(other, Scenario):
            return ScenarioGroup(other, self)
        return NotImplemented  # type: ignore[return-value]
class ScenarioGroup:
    """A flat, ordered collection of scenarios run together.

    Running a group is exactly equivalent to running each contained
    scenario one after another; the ``+`` operator combines scenarios
    and groups into new flat groups.
    """

    def __init__(self, *scenarios: Scenario) -> None:
        """Store the given scenarios in order; zero scenarios is allowed."""
        self._scenarios: list[Scenario] = list(scenarios)

    def run(self, warmup: int = 0) -> list[BenchmarkResult]:
        """Run every scenario in insertion order, forwarding ``warmup``."""
        return [member.run(warmup=warmup) for member in self._scenarios]

    def cli(self) -> None:
        """Parse ``sys.argv``, run all scenarios, and print the results.

        Blocks are separated by a ``---`` line (between blocks only).
        Exits with status 1 when ``--max-mean`` is given and any
        scenario's mean exceeds it.
        """
        parser = argparse.ArgumentParser(description='Run benchmark group')
        parser.add_argument('--number', type=int, default=None, help='Number of iterations')
        parser.add_argument('--max-mean', type=float, default=None, dest='max_mean',
                            help='Fail if any scenario mean time (seconds) exceeds this threshold')
        options = parser.parse_args()

        if options.number is None:
            to_run = self._scenarios
        else:
            to_run = [
                _make_scenario_with_number(member, options.number)
                for member in self._scenarios
            ]

        threshold_exceeded = False
        for position, member in enumerate(to_run):
            if position:
                # Separator goes between blocks, never after the last one.
                sys.stdout.write('---\n')
            outcome = member.run()
            _print_result(outcome)
            if options.max_mean is not None and outcome.mean > options.max_mean:
                threshold_exceeded = True

        if threshold_exceeded:
            sys.exit(1)

    def __add__(self, other: object) -> ScenarioGroup:
        """``group + scenario`` or ``group + group`` → new flat group."""
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(*self._scenarios, *other._scenarios)
        if isinstance(other, Scenario):
            return ScenarioGroup(*self._scenarios, other)
        return NotImplemented  # type: ignore[return-value]

    def __radd__(self, other: object) -> ScenarioGroup:
        """Mirror of ``__add__`` with this group on the right-hand side."""
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(*other._scenarios, *self._scenarios)
        if isinstance(other, Scenario):
            return ScenarioGroup(other, *self._scenarios)
        return NotImplemented  # type: ignore[return-value]


def _make_scenario_with_number(template: Scenario, number: int) -> Scenario:
    # Clone `template` with a different iteration count, keeping the
    # callable, args, name, doc, and timer unchanged.
    return Scenario(
        template.function,
        template._args,
        name=template.name,
        doc=template.doc,
        number=number,
        timer=template._timer,
    )
tests/units/test_scenario.py | 232 +++++++++++++++++++++ tests/units/test_scenario_group.py | 140 +++++++++++++ 4 files changed, 698 insertions(+) create mode 100644 tests/units/test_benchmark_result.py create mode 100644 tests/units/test_init.py create mode 100644 tests/units/test_scenario.py create mode 100644 tests/units/test_scenario_group.py diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py new file mode 100644 index 0000000..c30459d --- /dev/null +++ b/tests/units/test_benchmark_result.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import json +import math +from unittest.mock import MagicMock + +import pytest + +from microbenchmark import BenchmarkResult, Scenario + + +def make_result( + durations: tuple[float, ...], + scenario: Scenario | None = None, + is_primary: bool = True, +) -> BenchmarkResult: + if scenario is None: + scenario = Scenario(lambda: None, name='test', number=len(durations) or 1) + return BenchmarkResult(scenario=scenario, durations=durations, is_primary=is_primary) + + +class TestBenchmarkResultFields: + def test_all_fields_stored(self) -> None: + scenario = Scenario(lambda: None, name='s', number=3) + result = BenchmarkResult( + scenario=scenario, + durations=(0.1, 0.2, 0.3), + is_primary=True, + ) + assert result.scenario is scenario + assert result.durations == (0.1, 0.2, 0.3) + assert result.is_primary is True + + def test_durations_is_tuple(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + assert isinstance(result.durations, tuple) + + def test_mean_computed_correctly(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + expected = math.fsum([1.0, 2.0, 3.0]) / 3 + assert result.mean == expected + + def test_mean_uses_fsum_precision(self) -> None: + # floating point: sum of many small numbers may differ from fsum + durations = tuple(0.1 for _ in range(10)) + result = make_result(durations) + expected = math.fsum(durations) / len(durations) + assert result.mean == expected + 
+ def test_best_is_min(self) -> None: + result = make_result((3.0, 1.0, 2.0)) + assert result.best == 1.0 + + def test_worst_is_max(self) -> None: + result = make_result((3.0, 1.0, 2.0)) + assert result.worst == 3.0 + + def test_is_primary_true_by_default(self) -> None: + result = make_result((1.0,)) + assert result.is_primary is True + + def test_is_primary_false(self) -> None: + result = make_result((1.0,), is_primary=False) + assert result.is_primary is False + + def test_single_duration(self) -> None: + result = make_result((0.5,)) + assert result.best == 0.5 + assert result.worst == 0.5 + assert result.mean == 0.5 + + def test_all_equal_durations(self) -> None: + result = make_result((0.1, 0.1, 0.1)) + assert result.best == 0.1 + assert result.worst == 0.1 + assert result.mean == pytest.approx(0.1) + + def test_scenario_identity(self) -> None: + scenario = Scenario(lambda: None, name='check') + result = BenchmarkResult(scenario=scenario, durations=(0.1,), is_primary=True) + assert result.scenario is scenario + + def test_scenario_none(self) -> None: + result = BenchmarkResult(scenario=None, durations=(0.1,), is_primary=True) + assert result.scenario is None + + +class TestPercentile: + def test_percentile_returns_benchmark_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 11))) + trimmed = result.percentile(90) + assert isinstance(trimmed, BenchmarkResult) + + def test_percentile_is_primary_false(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 11))) + trimmed = result.percentile(90) + assert trimmed.is_primary is False + + def test_percentile_count_nearest_rank(self) -> None: + import math + result = make_result(tuple(float(i) for i in range(1, 101))) + trimmed = result.percentile(95) + expected_count = math.ceil(100 * 95 / 100) + assert len(trimmed.durations) == expected_count + + def test_percentile_contains_fastest(self) -> None: + result = make_result((5.0, 1.0, 3.0, 2.0, 4.0)) + trimmed = 
result.percentile(60) + import math + k = math.ceil(5 * 60 / 100) + assert len(trimmed.durations) == k + # should be the smallest k values + sorted_original = sorted(result.durations) + assert set(trimmed.durations) == set(sorted_original[:k]) + + def test_percentile_100_returns_all(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + trimmed = result.percentile(100) + assert len(trimmed.durations) == 3 + + def test_percentile_small_number(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + trimmed = result.percentile(50) + import math + expected = math.ceil(3 * 50 / 100) + assert len(trimmed.durations) == expected + + def test_percentile_99(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + trimmed = result.percentile(99) + import math + assert len(trimmed.durations) == math.ceil(100 * 99 / 100) + + def test_percentile_mean_recomputed(self) -> None: + result = make_result((1.0, 2.0, 3.0, 4.0, 10.0)) + trimmed = result.percentile(80) + expected_durations = sorted(result.durations)[:4] + expected_mean = math.fsum(expected_durations) / 4 + assert trimmed.mean == pytest.approx(expected_mean) + + def test_percentile_on_derived_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + derived = result.percentile(90).percentile(50) + assert isinstance(derived, BenchmarkResult) + assert derived.is_primary is False + + def test_percentile_scenario_preserved(self) -> None: + scenario = Scenario(lambda: None, name='s') + result = BenchmarkResult(scenario=scenario, durations=(1.0, 2.0, 3.0), is_primary=True) + trimmed = result.percentile(100) + assert trimmed.scenario is scenario + + def test_percentile_0_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(0) + + def test_percentile_negative_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + 
result.percentile(-5) + + def test_percentile_above_100_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(101) + + def test_percentile_preserves_fsum_mean(self) -> None: + durations = tuple(0.1 * i for i in range(1, 11)) + result = make_result(durations) + trimmed = result.percentile(80) + sorted_d = sorted(durations) + import math + k = math.ceil(10 * 80 / 100) + expected = math.fsum(sorted_d[:k]) / k + assert trimmed.mean == pytest.approx(expected) + + +class TestCachedProperties: + def test_p95_returns_benchmark_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert isinstance(result.p95, BenchmarkResult) + + def test_p99_returns_benchmark_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert isinstance(result.p99, BenchmarkResult) + + def test_p95_is_cached(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p95 is result.p95 + + def test_p99_is_cached(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p99 is result.p99 + + def test_p95_count(self) -> None: + import math + result = make_result(tuple(float(i) for i in range(1, 101))) + assert len(result.p95.durations) == math.ceil(100 * 95 / 100) + + def test_p99_count(self) -> None: + import math + result = make_result(tuple(float(i) for i in range(1, 101))) + assert len(result.p99.durations) == math.ceil(100 * 99 / 100) + + def test_p95_is_primary_false(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p95.is_primary is False + + def test_p99_is_primary_false(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p99.is_primary is False + + +class TestSerialization: + def test_to_json_returns_string(self) -> None: + result = make_result((0.1, 0.2)) + assert 
isinstance(result.to_json(), str) + + def test_to_json_valid_json(self) -> None: + result = make_result((0.1, 0.2)) + data = json.loads(result.to_json()) + assert isinstance(data, dict) + + def test_to_json_contains_durations(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + data = json.loads(result.to_json()) + assert 'durations' in data + + def test_to_json_contains_is_primary(self) -> None: + result = make_result((0.1,)) + data = json.loads(result.to_json()) + assert 'is_primary' in data + + def test_to_json_contains_scenario_metadata(self) -> None: + s = Scenario(lambda: None, name='myname', doc='mydoc', number=42) + result = BenchmarkResult(scenario=s, durations=(0.1,), is_primary=True) + data = json.loads(result.to_json()) + assert data['scenario']['name'] == 'myname' + assert data['scenario']['doc'] == 'mydoc' + assert data['scenario']['number'] == 42 + + def test_to_json_derived_is_primary_false(self) -> None: + result = make_result((1.0, 2.0, 3.0), is_primary=False) + data = json.loads(result.to_json()) + assert data['is_primary'] is False + + def test_from_json_round_trip_durations(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.durations == result.durations + + def test_from_json_round_trip_mean(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.mean == result.mean + + def test_from_json_round_trip_best_worst(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.best == result.best + assert restored.worst == result.worst + + def test_from_json_scenario_is_none(self) -> None: + result = make_result((0.1, 0.2)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.scenario is None + + def test_from_json_preserves_is_primary(self) -> None: + result = make_result((0.1,), is_primary=False) + 
restored = BenchmarkResult.from_json(result.to_json()) + assert restored.is_primary is False + + def test_from_json_primary_preserved(self) -> None: + result = make_result((0.1,), is_primary=True) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.is_primary is True + + def test_from_json_fp_precision(self) -> None: + durations = tuple(0.1 * i for i in range(1, 6)) + result = make_result(durations) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.mean == pytest.approx(result.mean) + + def test_from_json_invalid_json_raises(self) -> None: + with pytest.raises(json.JSONDecodeError): + BenchmarkResult.from_json('{not valid json}') + + def test_from_json_missing_fields_raises(self) -> None: + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json('{}') + + def test_from_json_missing_durations_raises(self) -> None: + data = json.dumps({'is_primary': True, 'scenario': {'name': 'x', 'doc': '', 'number': 1}}) + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json(data) diff --git a/tests/units/test_init.py b/tests/units/test_init.py new file mode 100644 index 0000000..d03e85a --- /dev/null +++ b/tests/units/test_init.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import microbenchmark +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestPublicImports: + def test_scenario_importable(self) -> None: + assert Scenario is not None + + def test_scenario_group_importable(self) -> None: + assert ScenarioGroup is not None + + def test_benchmark_result_importable(self) -> None: + assert BenchmarkResult is not None + + def test_all_defined(self) -> None: + assert hasattr(microbenchmark, '__all__') + + def test_all_contains_scenario(self) -> None: + assert 'Scenario' in microbenchmark.__all__ + + def test_all_contains_scenario_group(self) -> None: + assert 'ScenarioGroup' in microbenchmark.__all__ + + def 
test_all_contains_benchmark_result(self) -> None: + assert 'BenchmarkResult' in microbenchmark.__all__ + + def test_all_contains_exactly_three_items(self) -> None: + assert set(microbenchmark.__all__) == {'Scenario', 'ScenarioGroup', 'BenchmarkResult'} diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py new file mode 100644 index 0000000..77b562d --- /dev/null +++ b/tests/units/test_scenario.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import pytest + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestScenarioConstruction: + def test_minimal_construction(self) -> None: + s = Scenario(lambda: None, name='minimal') + assert s.name == 'minimal' + + def test_full_construction(self) -> None: + timer_calls = [0.0] + + def fake_timer() -> float: + timer_calls[0] += 0.001 + return timer_calls[0] + + s = Scenario( + sum, + args=[[1, 2, 3]], + name='full', + doc='A full scenario', + number=50, + timer=fake_timer, + ) + assert s.name == 'full' + assert s.doc == 'A full scenario' + + def test_name_stored(self) -> None: + s = Scenario(lambda: None, name='myname') + assert s.name == 'myname' + + def test_doc_stored(self) -> None: + s = Scenario(lambda: None, name='s', doc='my doc') + assert s.doc == 'my doc' + + def test_doc_default_empty(self) -> None: + s = Scenario(lambda: None, name='s') + assert s.doc == '' + + def test_number_default(self) -> None: + s = Scenario(lambda: None, name='s') + assert s.number == 1000 + + def test_number_custom(self) -> None: + s = Scenario(lambda: None, name='s', number=42) + assert s.number == 42 + + def test_args_none_default(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + s = Scenario(fn, name='s', number=1) + s.run() + assert call_log == [()] + + def test_args_empty_list(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + s = Scenario(fn, args=[], 
name='s', number=1) + s.run() + assert call_log == [()] + + def test_args_with_values(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + s = Scenario(fn, args=[1, 2, 3], name='s', number=1) + s.run() + assert call_log == [(1, 2, 3)] + + def test_args_copied_on_construction(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + original = [10, 20] + s = Scenario(fn, args=original, name='s', number=1) + original.append(30) # mutate after construction + s.run() + assert call_log == [(10, 20)] # should not include 30 + + def test_number_zero_raises(self) -> None: + with pytest.raises(ValueError, match='number'): + Scenario(lambda: None, name='s', number=0) + + def test_number_negative_raises(self) -> None: + with pytest.raises(ValueError, match='number'): + Scenario(lambda: None, name='s', number=-1) + + +class TestScenarioRun: + def test_run_returns_benchmark_result(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run() + assert isinstance(result, BenchmarkResult) + + def test_run_calls_function_number_times(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=7) + s.run() + assert counter[0] == 7 + + def test_run_durations_length_equals_number(self) -> None: + s = Scenario(lambda: None, name='s', number=10) + result = s.run() + assert len(result.durations) == 10 + + def test_run_with_warmup_total_calls(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + s.run(warmup=3) + assert counter[0] == 8 + + def test_run_warmup_not_in_durations(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run(warmup=10) + assert len(result.durations) == 5 + + def test_run_warmup_zero(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + 
s.run(warmup=0) + assert counter[0] == 5 + + def test_run_uses_custom_timer(self) -> None: + values = iter(t * 0.001 for t in range(200)) + + def fake_timer() -> float: + return next(values) + + s = Scenario(lambda: None, name='s', number=3, timer=fake_timer) + result = s.run() + assert len(result.durations) == 3 + for d in result.durations: + assert d > 0 + + def test_custom_timer_stateful(self) -> None: + # timer is called before and after each run; warmup also consumes timer calls + tick = [0] + + def fake_timer() -> float: + tick[0] += 1 + return float(tick[0]) + + s = Scenario(lambda: None, name='s', number=3, timer=fake_timer) + result = s.run(warmup=2) + # 2 warmup * 2 timer calls + 3 measured * 2 timer calls = 10 total timer calls + assert tick[0] == 10 + # only the 3 measured durations should be stored + assert len(result.durations) == 3 + + def test_run_result_scenario_is_self(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run() + assert result.scenario is s + + def test_run_twice_independent(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + r1 = s.run() + r2 = s.run() + assert len(r1.durations) == 5 + assert len(r2.durations) == 5 + assert r1 is not r2 + + def test_run_propagates_exception(self) -> None: + def bad() -> None: + raise RuntimeError('oops') + + s = Scenario(bad, name='s', number=1) + with pytest.raises(RuntimeError, match='oops'): + s.run() + + def test_run_result_is_primary(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run() + assert result.is_primary is True + + +class TestScenarioAdd: + def test_add_scenario_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = s1 + s2 + assert isinstance(group, ScenarioGroup) + + def test_add_scenario_group_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') 
+ g = ScenarioGroup(s2) + group = s1 + g + assert isinstance(group, ScenarioGroup) + + def test_add_unknown_type_returns_not_implemented(self) -> None: + s = Scenario(lambda: None, name='s') + result = s.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented + + def test_radd_group_scenario(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + g = ScenarioGroup(s1) + # g + s2 is g.__add__(s2), but we also want s2.__radd__(g) to work + group = s2.__radd__(g) + assert isinstance(group, ScenarioGroup) diff --git a/tests/units/test_scenario_group.py b/tests/units/test_scenario_group.py new file mode 100644 index 0000000..31cc46b --- /dev/null +++ b/tests/units/test_scenario_group.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +def make_scenario(name: str = 's', number: int = 5) -> Scenario: + return Scenario(lambda: None, name=name, number=number) + + +class TestScenarioGroupConstruction: + def test_empty_group(self) -> None: + g = ScenarioGroup() + assert isinstance(g, ScenarioGroup) + + def test_single_scenario(self) -> None: + s = make_scenario('s1') + g = ScenarioGroup(s) + results = g.run() + assert len(results) == 1 + + def test_multiple_scenarios(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + g = ScenarioGroup(s1, s2, s3) + results = g.run() + assert len(results) == 3 + + +class TestScenarioGroupOperator: + def test_scenario_plus_scenario(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + group = s1 + s2 + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 2 + + def test_group_plus_scenario(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = ScenarioGroup(s1, s2) + s3 + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 3 + + def test_scenario_plus_group(self) -> None: + 
s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = s1 + ScenarioGroup(s2, s3) + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 3 + + def test_group_plus_group(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = ScenarioGroup(s1) + ScenarioGroup(s2, s3) + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 3 + + def test_triple_sum_is_flat(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = s1 + s2 + s3 + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 3 + + def test_add_returns_new_group(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + g = ScenarioGroup(s1) + new_g = g + s2 + assert new_g is not g + + def test_add_unknown_type_returns_not_implemented(self) -> None: + g = ScenarioGroup() + result = g.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented + + def test_radd_scenario_to_group(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + g = ScenarioGroup(s1) + group = g.__radd__(s2) + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 2 + + def test_duplicate_scenarios(self) -> None: + s = make_scenario('s') + group = s + s + results = group.run() + assert len(results) == 2 + + def test_multiple_groups_flat(self) -> None: + scenarios = [make_scenario(f's{i}') for i in range(5)] + g1 = ScenarioGroup(scenarios[0], scenarios[1]) + g2 = ScenarioGroup(scenarios[2], scenarios[3]) + g3 = ScenarioGroup(scenarios[4]) + combined = g1 + g2 + g3 + assert len(combined.run()) == 5 + + +class TestScenarioGroupRun: + def test_run_returns_list(self) -> None: + g = ScenarioGroup() + result = g.run() + assert isinstance(result, list) + + def test_empty_group_returns_empty_list(self) -> None: + g = ScenarioGroup() + assert g.run() == [] + + def test_run_returns_benchmark_results(self) -> None: + s = make_scenario() + g = 
ScenarioGroup(s) + results = g.run() + for r in results: + assert isinstance(r, BenchmarkResult) + + def test_run_order_preserved(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + g = ScenarioGroup(s1, s2, s3) + results = g.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_run_with_warmup(self) -> None: + counters = [0, 0] + + def make_fn(idx: int) -> object: + def fn() -> None: + counters[idx] += 1 + return fn + + s1 = Scenario(make_fn(0), name='a', number=5) # type: ignore[arg-type] + s2 = Scenario(make_fn(1), name='b', number=5) # type: ignore[arg-type] + g = ScenarioGroup(s1, s2) + results = g.run(warmup=3) + # each scenario: 3 warmup + 5 measured = 8 calls + assert counters[0] == 8 + assert counters[1] == 8 + for r in results: + assert len(r.durations) == 5 + + def test_run_correct_scenario_reference(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + g = ScenarioGroup(s1, s2) + results = g.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 From 1aef2dd34ba033db4a795454209dff04849bec17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:55:08 +0300 Subject: [PATCH 10/33] Typing tests --- tests/typing/test_benchmark_result_types.py | 80 +++++++++++++++++++++ tests/typing/test_scenario_group_types.py | 51 +++++++++++++ tests/typing/test_scenario_types.py | 72 +++++++++++++++++++ 3 files changed, 203 insertions(+) create mode 100644 tests/typing/test_benchmark_result_types.py create mode 100644 tests/typing/test_scenario_group_types.py create mode 100644 tests/typing/test_scenario_types.py diff --git a/tests/typing/test_benchmark_result_types.py b/tests/typing/test_benchmark_result_types.py new file mode 100644 index 0000000..3f037ea --- /dev/null +++ b/tests/typing/test_benchmark_result_types.py 
@@ -0,0 +1,80 @@ +from __future__ import annotations + +import json + +import pytest + +from microbenchmark import BenchmarkResult, Scenario + + +def make_result() -> BenchmarkResult: + return Scenario(lambda: None, name='s', number=10).run() + + +class TestBenchmarkResultPositiveTypes: + def test_percentile_returns_benchmark_result(self) -> None: + result = make_result() + trimmed = result.percentile(50) + assert isinstance(trimmed, BenchmarkResult) + + def test_p95_returns_benchmark_result(self) -> None: + result = make_result() + assert isinstance(result.p95, BenchmarkResult) + + def test_p99_returns_benchmark_result(self) -> None: + result = make_result() + assert isinstance(result.p99, BenchmarkResult) + + def test_to_json_returns_str(self) -> None: + result = make_result() + assert isinstance(result.to_json(), str) + + def test_from_json_returns_benchmark_result(self) -> None: + result = make_result() + restored = BenchmarkResult.from_json(result.to_json()) + assert isinstance(restored, BenchmarkResult) + + def test_mean_is_float(self) -> None: + result = make_result() + assert isinstance(result.mean, float) + + def test_best_is_float(self) -> None: + result = make_result() + assert isinstance(result.best, float) + + def test_worst_is_float(self) -> None: + result = make_result() + assert isinstance(result.worst, float) + + def test_is_primary_is_bool(self) -> None: + result = make_result() + assert isinstance(result.is_primary, bool) + + def test_durations_is_tuple(self) -> None: + result = make_result() + assert isinstance(result.durations, tuple) + + +class TestBenchmarkResultNegativeTypes: + def test_percentile_zero_raises(self) -> None: + result = make_result() + with pytest.raises(ValueError): + result.percentile(0) + + def test_percentile_negative_raises(self) -> None: + result = make_result() + with pytest.raises(ValueError): + result.percentile(-1) + + def test_percentile_above_100_raises(self) -> None: + result = make_result() + with 
pytest.raises(ValueError): + result.percentile(101) + + def test_from_json_invalid_raises(self) -> None: + with pytest.raises(Exception): + BenchmarkResult.from_json('{not valid}') + + def test_from_json_empty_object_raises(self) -> None: + with pytest.raises(Exception): + BenchmarkResult.from_json('{}') diff --git a/tests/typing/test_scenario_group_types.py b/tests/typing/test_scenario_group_types.py new file mode 100644 index 0000000..9415151 --- /dev/null +++ b/tests/typing/test_scenario_group_types.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestScenarioGroupPositiveTypes: + def test_empty_construction(self) -> None: + g = ScenarioGroup() + assert isinstance(g, ScenarioGroup) + + def test_single_scenario(self) -> None: + s = Scenario(lambda: None, name='s') + g = ScenarioGroup(s) + assert isinstance(g, ScenarioGroup) + + def test_multiple_scenarios(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + g = ScenarioGroup(s1, s2) + assert isinstance(g, ScenarioGroup) + + def test_run_returns_list(self) -> None: + g = ScenarioGroup() + result = g.run() + assert isinstance(result, list) + + def test_run_returns_list_of_benchmark_results(self) -> None: + s = Scenario(lambda: None, name='s', number=1) + g = ScenarioGroup(s) + results = g.run() + for r in results: + assert isinstance(r, BenchmarkResult) + + def test_add_scenario_returns_group(self) -> None: + g = ScenarioGroup() + s = Scenario(lambda: None, name='s') + result = g + s + assert isinstance(result, ScenarioGroup) + + def test_add_group_returns_group(self) -> None: + g1 = ScenarioGroup() + g2 = ScenarioGroup() + result = g1 + g2 + assert isinstance(result, ScenarioGroup) + + +class TestScenarioGroupNegativeTypes: + def test_add_int_returns_not_implemented(self) -> None: + g = ScenarioGroup() + result = g.__add__(42) # type: ignore[arg-type] + assert result is 
NotImplemented diff --git a/tests/typing/test_scenario_types.py b/tests/typing/test_scenario_types.py new file mode 100644 index 0000000..6d2716c --- /dev/null +++ b/tests/typing/test_scenario_types.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import time + +import pytest + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +# --------------------------------------------------------------------------- +# Positive type checks (runtime — confirm valid usage works) +# --------------------------------------------------------------------------- + +class TestScenarioPositiveTypes: + def test_callable_function(self) -> None: + s = Scenario(lambda: None, name='s') + assert isinstance(s, Scenario) + + def test_args_none(self) -> None: + s = Scenario(lambda: None, args=None, name='s') + assert isinstance(s, Scenario) + + def test_args_list(self) -> None: + s = Scenario(sum, args=[[1, 2, 3]], name='s') + assert isinstance(s, Scenario) + + def test_number_int(self) -> None: + s = Scenario(lambda: None, name='s', number=100) + assert isinstance(s, Scenario) + + def test_timer_callable(self) -> None: + s = Scenario(lambda: None, name='s', timer=time.perf_counter) + assert isinstance(s, Scenario) + + def test_run_returns_benchmark_result(self) -> None: + result = Scenario(lambda: None, name='s', number=1).run() + assert isinstance(result, BenchmarkResult) + + def test_run_with_warmup_returns_benchmark_result(self) -> None: + result = Scenario(lambda: None, name='s', number=1).run(warmup=0) + assert isinstance(result, BenchmarkResult) + + def test_add_scenario_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = s1 + s2 + assert isinstance(group, ScenarioGroup) + + def test_add_group_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + g = ScenarioGroup() + group = s1 + g + assert isinstance(group, ScenarioGroup) + + +# 
--------------------------------------------------------------------------- +# Negative type checks (runtime ValueError/TypeError for invalid inputs) +# --------------------------------------------------------------------------- + +class TestScenarioNegativeTypes: + def test_number_zero_raises(self) -> None: + with pytest.raises(ValueError): + Scenario(lambda: None, name='s', number=0) + + def test_number_negative_raises(self) -> None: + with pytest.raises(ValueError): + Scenario(lambda: None, name='s', number=-5) + + def test_add_int_returns_not_implemented(self) -> None: + s = Scenario(lambda: None, name='s') + result = s.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented From faaa3951cbccb5ae7e6dc089b96bdcb12c295c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:55:34 +0300 Subject: [PATCH 11/33] Tests for CLI --- tests/cli/test_scenario_cli.py | 123 +++++++++++++++++++++++++++ tests/cli/test_scenario_group_cli.py | 99 +++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 tests/cli/test_scenario_cli.py create mode 100644 tests/cli/test_scenario_group_cli.py diff --git a/tests/cli/test_scenario_cli.py b/tests/cli/test_scenario_cli.py new file mode 100644 index 0000000..81ecf81 --- /dev/null +++ b/tests/cli/test_scenario_cli.py @@ -0,0 +1,123 @@ +from __future__ import annotations + +import subprocess +import sys +import textwrap + + +def run_script(script: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + """Run an inline Python script as a subprocess and return the completed process.""" + return subprocess.run( + [sys.executable, '-c', script, *args], + capture_output=True, + text=True, + encoding='utf-8', + timeout=timeout, + check=False, + ) + + +def scenario_script(extra: str = '') -> str: + """Return a self-contained script that creates a Scenario and calls cli().""" + return 
textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s = Scenario(lambda: None, name='bench', number=10, timer=fake_timer) + {extra} + s.cli() + ''') + + +class TestScenarioCliOutput: + def test_cli_outputs_name(self) -> None: + proc = run_script(scenario_script()) + assert 'benchmark: bench' in proc.stdout + + def test_cli_outputs_mean(self) -> None: + proc = run_script(scenario_script()) + assert 'mean:' in proc.stdout + + def test_cli_outputs_best(self) -> None: + proc = run_script(scenario_script()) + assert 'best:' in proc.stdout + + def test_cli_outputs_worst(self) -> None: + proc = run_script(scenario_script()) + assert 'worst:' in proc.stdout + + def test_cli_output_has_s_suffix(self) -> None: + proc = run_script(scenario_script()) + assert 's\n' in proc.stdout or proc.stdout.rstrip().endswith('s') + + def test_cli_exit_code_0_by_default(self) -> None: + proc = run_script(scenario_script()) + assert proc.returncode == 0 + + def test_cli_writes_to_stdout(self) -> None: + proc = run_script(scenario_script()) + assert proc.stdout.strip() != '' + assert proc.stderr == '' + + +class TestScenarioCliNumberArg: + def test_number_arg_changes_durations_count(self) -> None: + # We can't directly inspect durations from subprocess output, + # but we can verify the CLI accepts --number without error + proc = run_script(scenario_script(), '--number', '5') + assert proc.returncode == 0 + assert 'benchmark: bench' in proc.stdout + + def test_number_arg_default_uses_scenario_number(self) -> None: + proc = run_script(scenario_script()) + assert proc.returncode == 0 + + +class TestScenarioCliMaxMean: + def test_max_mean_below_threshold_exit_0(self) -> None: + # fake_timer increments by 0.001 each call, so mean per run = 0.001 + proc = run_script(scenario_script(), '--max-mean', '1.0') + assert 
proc.returncode == 0 + + def test_max_mean_above_threshold_exit_1(self) -> None: + # mean will be ~0.001s; threshold 0.000001 is far below + proc = run_script(scenario_script(), '--max-mean', '0.000001') + assert proc.returncode == 1 + + def test_max_mean_still_prints_output(self) -> None: + proc = run_script(scenario_script(), '--max-mean', '0.000001') + assert 'benchmark: bench' in proc.stdout + + def test_max_mean_equal_threshold_exit_0(self) -> None: + # mean = exactly threshold → should pass (mean <= threshold) + # With fake_timer: each call delta = 0.001, number=10 + # mean = 0.001. Set --max-mean = 10 to ensure it passes. + proc = run_script(scenario_script(), '--max-mean', '10') + assert proc.returncode == 0 + + +class TestScenarioCliHelp: + def test_help_exits_0(self) -> None: + proc = run_script(scenario_script(), '--help') + assert proc.returncode == 0 + + def test_help_output_not_empty(self) -> None: + proc = run_script(scenario_script(), '--help') + combined = proc.stdout + proc.stderr + assert len(combined) > 0 + + def test_help_mentions_number(self) -> None: + proc = run_script(scenario_script(), '--help') + combined = proc.stdout + proc.stderr + assert 'number' in combined.lower() + + def test_help_mentions_max_mean(self) -> None: + proc = run_script(scenario_script(), '--help') + combined = proc.stdout + proc.stderr + assert 'max-mean' in combined.lower() or 'max_mean' in combined.lower() diff --git a/tests/cli/test_scenario_group_cli.py b/tests/cli/test_scenario_group_cli.py new file mode 100644 index 0000000..ce9285c --- /dev/null +++ b/tests/cli/test_scenario_group_cli.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import subprocess +import sys +import textwrap + + +def run_script(script: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, '-c', script, *args], + capture_output=True, + text=True, + encoding='utf-8', + timeout=timeout, + check=False, + ) + + +def 
group_script(extra: str = '') -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario, ScenarioGroup + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s1 = Scenario(lambda: None, name='first', number=5, timer=fake_timer) + s2 = Scenario(lambda: None, name='second', number=5, timer=fake_timer) + group = s1 + s2 + {extra} + group.cli() + ''') + + +class TestScenarioGroupCliOutput: + def test_outputs_both_scenario_names(self) -> None: + proc = run_script(group_script()) + assert 'benchmark: first' in proc.stdout + assert 'benchmark: second' in proc.stdout + + def test_results_separated_by_divider(self) -> None: + proc = run_script(group_script()) + assert '---' in proc.stdout + + def test_divider_between_not_after_last(self) -> None: + proc = run_script(group_script()) + lines = proc.stdout.strip().splitlines() + # last line should NOT be --- + assert lines[-1] != '---' + + def test_exit_code_0_by_default(self) -> None: + proc = run_script(group_script()) + assert proc.returncode == 0 + + def test_outputs_mean_best_worst_for_each(self) -> None: + proc = run_script(group_script()) + assert proc.stdout.count('mean:') == 2 + assert proc.stdout.count('best:') == 2 + assert proc.stdout.count('worst:') == 2 + + def test_writes_to_stdout(self) -> None: + proc = run_script(group_script()) + assert proc.stdout.strip() != '' + assert proc.stderr == '' + + +class TestScenarioGroupCliNumberArg: + def test_number_arg_accepted(self) -> None: + proc = run_script(group_script(), '--number', '3') + assert proc.returncode == 0 + assert 'benchmark: first' in proc.stdout + + +class TestScenarioGroupCliMaxMean: + def test_max_mean_passes_when_below(self) -> None: + proc = run_script(group_script(), '--max-mean', '10.0') + assert proc.returncode == 0 + + def test_max_mean_fails_when_any_exceeds(self) -> None: + proc = 
run_script(group_script(), '--max-mean', '0.000001') + assert proc.returncode == 1 + + def test_max_mean_still_prints_output_on_failure(self) -> None: + proc = run_script(group_script(), '--max-mean', '0.000001') + assert 'benchmark:' in proc.stdout + + +class TestScenarioGroupCliHelp: + def test_help_exits_0(self) -> None: + proc = run_script(group_script(), '--help') + assert proc.returncode == 0 + + def test_help_mentions_number(self) -> None: + proc = run_script(group_script(), '--help') + combined = proc.stdout + proc.stderr + assert 'number' in combined.lower() From 251f303364b8e7bb12dd5afb624eaf35f7f9a9d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:55:45 +0300 Subject: [PATCH 12/33] Tests for docs --- tests/documentation/test_readme.py | 156 +++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 tests/documentation/test_readme.py diff --git a/tests/documentation/test_readme.py b/tests/documentation/test_readme.py new file mode 100644 index 0000000..25463fa --- /dev/null +++ b/tests/documentation/test_readme.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import math +from functools import partial + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestQuickStart: + def test_quick_start_basic(self) -> None: + def build_list() -> list[int]: + return list(range(1000)) + + scenario = Scenario(build_list, name='build_list', number=500) + result = scenario.run() + assert len(result.durations) == 500 + assert isinstance(result.mean, float) + assert isinstance(result.best, float) + assert isinstance(result.worst, float) + + +class TestScenarioConstructor: + def test_full_constructor(self) -> None: + scenario = Scenario( + sorted, + args=[[3, 1, 2]], + name='sort_three_items', + doc='Sort a list of three integers.', + number=10000, + ) + assert scenario.name == 'sort_three_items' + 
assert scenario.doc == 'Sort a list of three integers.' + assert scenario.number == 10000 + + def test_partial_kwargs(self) -> None: + scenario = Scenario( + partial(sorted, key=lambda x: -x), + args=[[3, 1, 2]], + name='sort_descending', + ) + result = scenario.run() + assert isinstance(result, BenchmarkResult) + + def test_multiple_positional_args(self) -> None: + scenario = Scenario(pow, args=[2, 10], name='power') + result = scenario.run() + assert isinstance(result.mean, float) + + +class TestScenarioRun: + def test_run_with_warmup(self) -> None: + scenario = Scenario(lambda: list(range(100)), name='build', number=1000) + result = scenario.run(warmup=100) + assert len(result.durations) == 1000 + + +class TestScenarioGroupCreation: + def test_direct_construction(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = ScenarioGroup(s1, s2) + assert isinstance(group, ScenarioGroup) + + def test_empty_group(self) -> None: + empty = ScenarioGroup() + assert len(empty.run()) == 0 + + def test_plus_operator_two_scenarios(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = s1 + s2 + assert type(group).__name__ == 'ScenarioGroup' + + def test_scenario_plus_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + s3 = Scenario(lambda: None, name='s3') + group = ScenarioGroup(s1, s2) + extended = group + s3 + assert len(extended.run()) == 3 + + def test_reverse_scenario_plus_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + s3 = Scenario(lambda: None, name='s3') + group = ScenarioGroup(s1, s2) + also_ok = s3 + group + assert len(also_ok.run()) == 3 + + def test_group_plus_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + s3 = Scenario(lambda: None, name='s3') + g1 = ScenarioGroup(s1) + g2 = ScenarioGroup(s2, s3) 
+ combined = g1 + g2 + assert len(combined.run()) == 3 + + +class TestScenarioGroupRun: + def test_run_order_preserved(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = ScenarioGroup(s1, s2) + results = group.run(warmup=50) + assert results[0].scenario.name == 's1' + assert results[1].scenario.name == 's2' + + +class TestBenchmarkResultFields: + def test_fields_documentation_example(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert len(result.durations) == 100 + assert result.is_primary is True + + def test_durations_is_tuple(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert isinstance(result.durations, tuple) + + +class TestPercentile: + def test_percentile_documentation_example(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + trimmed = result.percentile(95) + assert trimmed.is_primary is False + assert len(trimmed.durations) == math.ceil(100 * 95 / 100) + + def test_percentile_chaining(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + chained = result.percentile(90).percentile(50) + assert len(chained.durations) == math.ceil(math.ceil(100 * 90 / 100) * 50 / 100) + + +class TestP95P99: + def test_p95_cached(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert len(result.p95.durations) == math.ceil(100 * 95 / 100) + assert result.p95.is_primary is False + assert result.p95 is result.p95 + + def test_p99_cached(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert len(result.p99.durations) == math.ceil(100 * 99 / 100) + assert result.p99.is_primary is False + assert result.p99 is result.p99 + + +class TestJsonRoundTrip: + def test_json_round_trip(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + json_str = result.to_json() + restored = 
BenchmarkResult.from_json(json_str) + assert restored.scenario is None + assert restored.mean == result.mean + assert restored.durations == result.durations + assert restored.is_primary == result.is_primary From 22a709185440a91c74180728a5d9b4839be203a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:56:05 +0300 Subject: [PATCH 13/33] Fixes for typing tests --- tests/typing/test_benchmark_result_types.py | 10 +++++----- tests/typing/test_scenario_types.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/typing/test_benchmark_result_types.py b/tests/typing/test_benchmark_result_types.py index 3f037ea..80f4a41 100644 --- a/tests/typing/test_benchmark_result_types.py +++ b/tests/typing/test_benchmark_result_types.py @@ -58,23 +58,23 @@ def test_durations_is_tuple(self) -> None: class TestBenchmarkResultNegativeTypes: def test_percentile_zero_raises(self) -> None: result = make_result() - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='percentile'): result.percentile(0) def test_percentile_negative_raises(self) -> None: result = make_result() - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='percentile'): result.percentile(-1) def test_percentile_above_100_raises(self) -> None: result = make_result() - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='percentile'): result.percentile(101) def test_from_json_invalid_raises(self) -> None: - with pytest.raises(Exception): + with pytest.raises(json.JSONDecodeError): BenchmarkResult.from_json('{not valid}') def test_from_json_empty_object_raises(self) -> None: - with pytest.raises(Exception): + with pytest.raises(ValueError, match='required fields'): BenchmarkResult.from_json('{}') diff --git a/tests/typing/test_scenario_types.py b/tests/typing/test_scenario_types.py index 6d2716c..b834ec4 100644 --- 
a/tests/typing/test_scenario_types.py +++ b/tests/typing/test_scenario_types.py @@ -59,11 +59,11 @@ def test_add_group_returns_group(self) -> None: class TestScenarioNegativeTypes: def test_number_zero_raises(self) -> None: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='number'): Scenario(lambda: None, name='s', number=0) def test_number_negative_raises(self) -> None: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='number'): Scenario(lambda: None, name='s', number=-5) def test_add_int_returns_not_implemented(self) -> None: From 2d4d682cc6e97753129fe907565aa2edc8ac663a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 22:56:31 +0300 Subject: [PATCH 14/33] Some lint's issues fixed --- tests/typing/test_scenario_types.py | 1 - tests/units/test_benchmark_result.py | 8 -------- 2 files changed, 9 deletions(-) diff --git a/tests/typing/test_scenario_types.py b/tests/typing/test_scenario_types.py index b834ec4..08759f7 100644 --- a/tests/typing/test_scenario_types.py +++ b/tests/typing/test_scenario_types.py @@ -6,7 +6,6 @@ from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup - # --------------------------------------------------------------------------- # Positive type checks (runtime — confirm valid usage works) # --------------------------------------------------------------------------- diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index c30459d..21efbe5 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -2,7 +2,6 @@ import json import math -from unittest.mock import MagicMock import pytest @@ -97,7 +96,6 @@ def test_percentile_is_primary_false(self) -> None: assert trimmed.is_primary is False def test_percentile_count_nearest_rank(self) -> None: - import math result = make_result(tuple(float(i) for i in 
range(1, 101))) trimmed = result.percentile(95) expected_count = math.ceil(100 * 95 / 100) @@ -106,7 +104,6 @@ def test_percentile_count_nearest_rank(self) -> None: def test_percentile_contains_fastest(self) -> None: result = make_result((5.0, 1.0, 3.0, 2.0, 4.0)) trimmed = result.percentile(60) - import math k = math.ceil(5 * 60 / 100) assert len(trimmed.durations) == k # should be the smallest k values @@ -121,14 +118,12 @@ def test_percentile_100_returns_all(self) -> None: def test_percentile_small_number(self) -> None: result = make_result((1.0, 2.0, 3.0)) trimmed = result.percentile(50) - import math expected = math.ceil(3 * 50 / 100) assert len(trimmed.durations) == expected def test_percentile_99(self) -> None: result = make_result(tuple(float(i) for i in range(1, 101))) trimmed = result.percentile(99) - import math assert len(trimmed.durations) == math.ceil(100 * 99 / 100) def test_percentile_mean_recomputed(self) -> None: @@ -170,7 +165,6 @@ def test_percentile_preserves_fsum_mean(self) -> None: result = make_result(durations) trimmed = result.percentile(80) sorted_d = sorted(durations) - import math k = math.ceil(10 * 80 / 100) expected = math.fsum(sorted_d[:k]) / k assert trimmed.mean == pytest.approx(expected) @@ -194,12 +188,10 @@ def test_p99_is_cached(self) -> None: assert result.p99 is result.p99 def test_p95_count(self) -> None: - import math result = make_result(tuple(float(i) for i in range(1, 101))) assert len(result.p95.durations) == math.ceil(100 * 95 / 100) def test_p99_count(self) -> None: - import math result = make_result(tuple(float(i) for i in range(1, 101))) assert len(result.p99.durations) == math.ceil(100 * 99 / 100) From 862e925345237d0fd5898fcf454bada3d8983f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:01:01 +0300 Subject: [PATCH 15/33] Refactor to_json/from_json to use TypedDict for schema validation --- 
microbenchmark/benchmark_result.py | 55 +++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/microbenchmark/benchmark_result.py b/microbenchmark/benchmark_result.py index aef954d..7cd39f9 100644 --- a/microbenchmark/benchmark_result.py +++ b/microbenchmark/benchmark_result.py @@ -4,12 +4,24 @@ import math from dataclasses import dataclass, field from functools import cached_property -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, TypedDict if TYPE_CHECKING: from microbenchmark.scenario import Scenario +class _ScenarioMeta(TypedDict): + name: str + doc: str + number: int + + +class _ResultJson(TypedDict): + durations: list[float] + is_primary: bool + scenario: _ScenarioMeta | None + + @dataclass class BenchmarkResult: scenario: Scenario | None @@ -45,28 +57,37 @@ def p99(self) -> BenchmarkResult: return self.percentile(99) def to_json(self) -> str: - scenario_data: dict[str, Any] | None + scenario_meta: _ScenarioMeta | None if self.scenario is not None: - scenario_data = { - 'name': self.scenario.name, - 'doc': self.scenario.doc, - 'number': self.scenario.number, - } + scenario_meta = _ScenarioMeta( + name=self.scenario.name, + doc=self.scenario.doc, + number=self.scenario.number, + ) else: - scenario_data = None - return json.dumps({ - 'durations': list(self.durations), - 'is_primary': self.is_primary, - 'scenario': scenario_data, - }) + scenario_meta = None + data: _ResultJson = _ResultJson( + durations=list(self.durations), + is_primary=self.is_primary, + scenario=scenario_meta, + ) + return json.dumps(data) @classmethod def from_json(cls, data: str) -> BenchmarkResult: - parsed = json.loads(data) - if 'durations' not in parsed or 'is_primary' not in parsed: + raw: object = json.loads(data) + if not isinstance(raw, dict): + raise ValueError('JSON must be an object') + if 'durations' not in raw or 'is_primary' not in raw: raise ValueError('JSON is missing required fields: durations, is_primary') + 
raw_durations = raw['durations'] + raw_is_primary = raw['is_primary'] + if not isinstance(raw_durations, list): + raise ValueError('durations must be a list') + if not isinstance(raw_is_primary, bool): + raise ValueError('is_primary must be a bool') return cls( scenario=None, - durations=tuple(float(d) for d in parsed['durations']), - is_primary=bool(parsed['is_primary']), + durations=tuple(float(d) for d in raw_durations), + is_primary=raw_is_primary, ) From bbb7b9739d7db8260ecd732c6bad5a37b5d9fd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:06:26 +0300 Subject: [PATCH 16/33] Factor CLI args into shared _CliArgs class for Scenario and ScenarioGroup --- microbenchmark/scenario.py | 38 ++++++++++++++++++++------------ microbenchmark/scenario_group.py | 17 ++++++++------ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/microbenchmark/scenario.py b/microbenchmark/scenario.py index c53e4d0..bb9176f 100644 --- a/microbenchmark/scenario.py +++ b/microbenchmark/scenario.py @@ -3,7 +3,7 @@ import argparse import sys import time -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Callable from microbenchmark.benchmark_result import BenchmarkResult @@ -11,11 +11,17 @@ from microbenchmark.scenario_group import ScenarioGroup +class _CliArgs: + def __init__(self) -> None: + self.number: int | None = None + self.max_mean: float | None = None + + class Scenario: def __init__( # noqa: PLR0913 self, - function: Callable[..., Any], - args: list[Any] | None = None, + function: object, + args: list[object] | None = None, *, name: str, doc: str = '', @@ -24,25 +30,26 @@ def __init__( # noqa: PLR0913 ) -> None: if number < 1: raise ValueError(f'number must be at least 1, got {number}') - self.function = function - self._args: list[Any] = list(args) if args is not None else [] + self.function: object = function + self._args: 
list[object] = list(args) if args is not None else [] self.name = name self.doc = doc self.number = number self._timer = timer + def _call_once(self) -> None: + self.function(*self._args) # type: ignore[operator] + def run(self, warmup: int = 0) -> BenchmarkResult: - function = self.function - args = self._args timer = self._timer for _ in range(warmup): timer() - function(*args) + self._call_once() timer() durations: list[float] = [] for _ in range(self.number): start = timer() - function(*args) + self._call_once() end = timer() durations.append(end - start) return BenchmarkResult( @@ -56,23 +63,24 @@ def cli(self) -> None: parser.add_argument('--number', type=int, default=None, help='Number of iterations') parser.add_argument('--max-mean', type=float, default=None, dest='max_mean', help='Fail if mean time (seconds) exceeds this threshold') - parsed = parser.parse_args() + cli_args = _CliArgs() + parser.parse_args(namespace=cli_args) scenario = self - if parsed.number is not None: + if cli_args.number is not None: scenario = Scenario( self.function, self._args, name=self.name, doc=self.doc, - number=parsed.number, + number=cli_args.number, timer=self._timer, ) result = scenario.run() _print_result(result) - if parsed.max_mean is not None and result.mean > parsed.max_mean: + if cli_args.max_mean is not None and result.mean > cli_args.max_mean: sys.exit(1) def __add__(self, other: object) -> ScenarioGroup: @@ -93,7 +101,9 @@ def __radd__(self, other: object) -> ScenarioGroup: def _print_result(result: BenchmarkResult) -> None: - sys.stdout.write(f'benchmark: {result.scenario.name}\n') # type: ignore[union-attr] + scenario = result.scenario + assert scenario is not None + sys.stdout.write(f'benchmark: {scenario.name}\n') sys.stdout.write(f'mean: {result.mean:.6f}s\n') sys.stdout.write(f'best: {result.best:.6f}s\n') sys.stdout.write(f'worst: {result.worst:.6f}s\n') diff --git a/microbenchmark/scenario_group.py b/microbenchmark/scenario_group.py index d3caa52..a8530cf 
100644 --- a/microbenchmark/scenario_group.py +++ b/microbenchmark/scenario_group.py @@ -2,13 +2,15 @@ import argparse import sys -from typing import TYPE_CHECKING from microbenchmark.benchmark_result import BenchmarkResult from microbenchmark.scenario import Scenario, _print_result -if TYPE_CHECKING: - pass + +class _CliArgs: + def __init__(self) -> None: + self.number: int | None = None + self.max_mean: float | None = None class ScenarioGroup: @@ -23,12 +25,13 @@ def cli(self) -> None: parser.add_argument('--number', type=int, default=None, help='Number of iterations') parser.add_argument('--max-mean', type=float, default=None, dest='max_mean', help='Fail if any scenario mean time (seconds) exceeds this threshold') - parsed = parser.parse_args() + cli_args = _CliArgs() + parser.parse_args(namespace=cli_args) scenarios = self._scenarios - if parsed.number is not None: + if cli_args.number is not None: scenarios = [ - _make_scenario_with_number(s, parsed.number) + _make_scenario_with_number(s, cli_args.number) for s in self._scenarios ] @@ -38,7 +41,7 @@ def cli(self) -> None: _print_result(result) if i < len(scenarios) - 1: sys.stdout.write('---\n') - if parsed.max_mean is not None and result.mean > parsed.max_mean: + if cli_args.max_mean is not None and result.mean > cli_args.max_mean: failed = True if failed: From 188a2210e90d79a211abcdb47edce98d31d2400a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:16:27 +0300 Subject: [PATCH 17/33] Update args type hint to Sequence[object] for consistency and future extensibility --- microbenchmark/scenario.py | 8 ++++---- microbenchmark/scenario_group.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/microbenchmark/scenario.py b/microbenchmark/scenario.py index bb9176f..bda9dd6 100644 --- a/microbenchmark/scenario.py +++ b/microbenchmark/scenario.py @@ -3,7 +3,7 @@ import argparse import 
sys import time -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Callable, Sequence from microbenchmark.benchmark_result import BenchmarkResult @@ -21,7 +21,7 @@ class Scenario: def __init__( # noqa: PLR0913 self, function: object, - args: list[object] | None = None, + args: Sequence[object] | None = None, *, name: str, doc: str = '', @@ -89,7 +89,7 @@ def __add__(self, other: object) -> ScenarioGroup: return ScenarioGroup(self, other) if isinstance(other, ScenarioGroup): return ScenarioGroup(self, *other._scenarios) - return NotImplemented # type: ignore[return-value] + return NotImplemented def __radd__(self, other: object) -> ScenarioGroup: from microbenchmark.scenario_group import ScenarioGroup # noqa: PLC0415 @@ -97,7 +97,7 @@ def __radd__(self, other: object) -> ScenarioGroup: return ScenarioGroup(other, self) if isinstance(other, ScenarioGroup): return ScenarioGroup(*other._scenarios, self) - return NotImplemented # type: ignore[return-value] + return NotImplemented def _print_result(result: BenchmarkResult) -> None: diff --git a/microbenchmark/scenario_group.py b/microbenchmark/scenario_group.py index a8530cf..b97f52d 100644 --- a/microbenchmark/scenario_group.py +++ b/microbenchmark/scenario_group.py @@ -52,14 +52,14 @@ def __add__(self, other: object) -> ScenarioGroup: return ScenarioGroup(*self._scenarios, other) if isinstance(other, ScenarioGroup): return ScenarioGroup(*self._scenarios, *other._scenarios) - return NotImplemented # type: ignore[return-value] + return NotImplemented def __radd__(self, other: object) -> ScenarioGroup: if isinstance(other, Scenario): return ScenarioGroup(other, *self._scenarios) if isinstance(other, ScenarioGroup): return ScenarioGroup(*other._scenarios, *self._scenarios) - return NotImplemented # type: ignore[return-value] + return NotImplemented def _make_scenario_with_number(s: Scenario, number: int) -> Scenario: From 841cf3b3de6da03147d3f5eb845ec222ed01d8f2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:17:09 +0300 Subject: [PATCH 18/33] Add CLI edge case tests and scenario group divider logic --- tests/cli/test_scenario_cli.py | 15 +++++++++ tests/cli/test_scenario_group_cli.py | 46 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/tests/cli/test_scenario_cli.py b/tests/cli/test_scenario_cli.py index 81ecf81..66f385f 100644 --- a/tests/cli/test_scenario_cli.py +++ b/tests/cli/test_scenario_cli.py @@ -121,3 +121,18 @@ def test_help_mentions_max_mean(self) -> None: proc = run_script(scenario_script(), '--help') combined = proc.stdout + proc.stderr assert 'max-mean' in combined.lower() or 'max_mean' in combined.lower() + + +class TestScenarioCliEdgeCases: + def test_number_zero_fails(self) -> None: + proc = run_script(scenario_script(), '--number', '0') + assert proc.returncode != 0 + + def test_number_negative_fails(self) -> None: + proc = run_script(scenario_script(), '--number', '-1') + assert proc.returncode != 0 + + def test_number_and_max_mean_combined(self) -> None: + proc = run_script(scenario_script(), '--number', '5', '--max-mean', '10.0') + assert proc.returncode == 0 + assert 'benchmark: bench' in proc.stdout diff --git a/tests/cli/test_scenario_group_cli.py b/tests/cli/test_scenario_group_cli.py index ce9285c..9400e63 100644 --- a/tests/cli/test_scenario_group_cli.py +++ b/tests/cli/test_scenario_group_cli.py @@ -97,3 +97,49 @@ def test_help_mentions_number(self) -> None: proc = run_script(group_script(), '--help') combined = proc.stdout + proc.stderr assert 'number' in combined.lower() + + +def single_scenario_script() -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario, ScenarioGroup + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s1 = 
Scenario(lambda: None, name='only', number=5, timer=fake_timer) + group = ScenarioGroup(s1) + group.cli() + ''') + + +def three_scenario_script() -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario, ScenarioGroup + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s1 = Scenario(lambda: None, name='a', number=5, timer=fake_timer) + s2 = Scenario(lambda: None, name='b', number=5, timer=fake_timer) + s3 = Scenario(lambda: None, name='c', number=5, timer=fake_timer) + group = ScenarioGroup(s1, s2, s3) + group.cli() + ''') + + +class TestScenarioGroupCliDividers: + def test_single_scenario_no_divider(self) -> None: + proc = run_script(single_scenario_script()) + assert '---' not in proc.stdout + + def test_three_scenarios_two_dividers(self) -> None: + proc = run_script(three_scenario_script()) + assert proc.stdout.count('---') == 2 From 5802a891a3279f1027be6639b41656006e1f28de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:17:37 +0300 Subject: [PATCH 19/33] Add assertions to verify scenario references in results --- tests/documentation/test_readme.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/documentation/test_readme.py b/tests/documentation/test_readme.py index 25463fa..a537711 100644 --- a/tests/documentation/test_readme.py +++ b/tests/documentation/test_readme.py @@ -103,6 +103,8 @@ def test_run_order_preserved(self) -> None: s2 = Scenario(lambda: None, name='s2') group = ScenarioGroup(s1, s2) results = group.run(warmup=50) + assert results[0].scenario is not None + assert results[1].scenario is not None assert results[0].scenario.name == 's1' assert results[1].scenario.name == 's2' From 46a0d46dc4a4ec17a843d9ac73cd5da7902064b0 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:19:09 +0300 Subject: [PATCH 20/33] Add tests for BenchmarkResult serialization, Scenario construction/run behavior, and ScenarioGroup operator rules --- tests/units/test_benchmark_result.py | 34 ++++++++++++++++++++++++++-- tests/units/test_scenario.py | 28 ++++++++++++++++++++--- tests/units/test_scenario_group.py | 10 ++++++++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index 21efbe5..a72cdf8 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -106,9 +106,9 @@ def test_percentile_contains_fastest(self) -> None: trimmed = result.percentile(60) k = math.ceil(5 * 60 / 100) assert len(trimmed.durations) == k - # should be the smallest k values + # should be the smallest k values in sorted order sorted_original = sorted(result.durations) - assert set(trimmed.durations) == set(sorted_original[:k]) + assert trimmed.durations == tuple(sorted_original[:k]) def test_percentile_100_returns_all(self) -> None: result = make_result((1.0, 2.0, 3.0)) @@ -286,3 +286,33 @@ def test_from_json_missing_durations_raises(self) -> None: data = json.dumps({'is_primary': True, 'scenario': {'name': 'x', 'doc': '', 'number': 1}}) with pytest.raises(ValueError, match='required fields'): BenchmarkResult.from_json(data) + + def test_to_json_scenario_none_is_null(self) -> None: + result = BenchmarkResult(scenario=None, durations=(0.1,), is_primary=True) + data = json.loads(result.to_json()) + assert data['scenario'] is None + + def test_from_json_with_scenario_field_ignored(self) -> None: + payload = json.dumps({ + 'durations': [0.1, 0.2], + 'is_primary': True, + 'scenario': {'name': 'x', 'doc': '', 'number': 1}, + }) + restored = BenchmarkResult.from_json(payload) + assert restored.scenario is None + assert restored.durations 
== (0.1, 0.2) + + def test_from_json_durations_not_list_raises(self) -> None: + payload = json.dumps({'durations': 'not a list', 'is_primary': True}) + with pytest.raises(ValueError, match='durations'): + BenchmarkResult.from_json(payload) + + def test_from_json_is_primary_not_bool_raises(self) -> None: + payload = json.dumps({'durations': [0.1], 'is_primary': 'true'}) + with pytest.raises(ValueError, match='is_primary'): + BenchmarkResult.from_json(payload) + + def test_from_json_missing_is_primary_raises(self) -> None: + payload = json.dumps({'durations': [0.1, 0.2]}) + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json(payload) diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index 77b562d..cf52950 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -98,6 +98,10 @@ def test_number_negative_raises(self) -> None: with pytest.raises(ValueError, match='number'): Scenario(lambda: None, name='s', number=-1) + def test_name_required_raises(self) -> None: + with pytest.raises(TypeError): + Scenario(lambda: None) # type: ignore[call-arg] + class TestScenarioRun: def test_run_returns_benchmark_result(self) -> None: @@ -146,6 +150,8 @@ def fn() -> None: assert counter[0] == 5 def test_run_uses_custom_timer(self) -> None: + # timer produces: 0.000, 0.001, 0.002, 0.003, 0.004, 0.005, ... 
+ # each measured interval: end - start = 0.001 values = iter(t * 0.001 for t in range(200)) def fake_timer() -> float: @@ -153,9 +159,7 @@ def fake_timer() -> float: s = Scenario(lambda: None, name='s', number=3, timer=fake_timer) result = s.run() - assert len(result.durations) == 3 - for d in result.durations: - assert d > 0 + assert result.durations == pytest.approx((0.001, 0.001, 0.001)) def test_custom_timer_stateful(self) -> None: # timer is called before and after each run; warmup also consumes timer calls @@ -203,6 +207,24 @@ def test_run_result_is_primary(self) -> None: result = s.run() assert result.is_primary is True + def test_run_args_incompatible_raises_type_error(self) -> None: + s = Scenario(lambda: None, args=[1, 2], name='s', number=1) + with pytest.raises(TypeError): + s.run() + + def test_run_exception_mid_iteration(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + if counter[0] == 3: + raise RuntimeError('fail on 3rd call') + + s = Scenario(fn, name='s', number=5) + with pytest.raises(RuntimeError, match='fail on 3rd call'): + s.run() + assert counter[0] == 3 + class TestScenarioAdd: def test_add_scenario_returns_group(self) -> None: diff --git a/tests/units/test_scenario_group.py b/tests/units/test_scenario_group.py index 31cc46b..3b5f8eb 100644 --- a/tests/units/test_scenario_group.py +++ b/tests/units/test_scenario_group.py @@ -61,12 +61,18 @@ def test_add_returns_new_group(self) -> None: g = ScenarioGroup(s1) new_g = g + s2 assert new_g is not g + assert len(g._scenarios) == 1 # original not mutated def test_add_unknown_type_returns_not_implemented(self) -> None: g = ScenarioGroup() result = g.__add__(42) # type: ignore[arg-type] assert result is NotImplemented + def test_radd_unknown_type_returns_not_implemented(self) -> None: + g = ScenarioGroup() + result = g.__radd__(42) # type: ignore[arg-type] + assert result is NotImplemented + def test_radd_scenario_to_group(self) -> None: s1, s2 = make_scenario('s1'), 
make_scenario('s2') g = ScenarioGroup(s1) @@ -99,6 +105,10 @@ def test_empty_group_returns_empty_list(self) -> None: g = ScenarioGroup() assert g.run() == [] + def test_empty_group_run_with_warmup(self) -> None: + g = ScenarioGroup() + assert g.run(warmup=10) == [] + def test_run_returns_benchmark_results(self) -> None: s = make_scenario() g = ScenarioGroup(s) From a61b1e152513f08f6cfb984e35c8de72962bb520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:20:17 +0300 Subject: [PATCH 21/33] Add test to ensure --help does not run benchmark --- tests/cli/test_scenario_cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/cli/test_scenario_cli.py b/tests/cli/test_scenario_cli.py index 66f385f..4662015 100644 --- a/tests/cli/test_scenario_cli.py +++ b/tests/cli/test_scenario_cli.py @@ -122,6 +122,10 @@ def test_help_mentions_max_mean(self) -> None: combined = proc.stdout + proc.stderr assert 'max-mean' in combined.lower() or 'max_mean' in combined.lower() + def test_help_does_not_run_benchmark(self) -> None: + proc = run_script(scenario_script(), '--help') + assert 'benchmark:' not in proc.stdout + class TestScenarioCliEdgeCases: def test_number_zero_fails(self) -> None: From f5fcec96669ce00dcfaec3f7d415462d41e3b5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:20:40 +0300 Subject: [PATCH 22/33] Add tests for BenchmarkResult serialization, percentile handling, and ScenarioGroup addition behavior --- tests/units/test_benchmark_result.py | 26 ++++++++++++++++++++++---- tests/units/test_scenario.py | 7 +++++-- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index a72cdf8..93a08a6 100644 --- a/tests/units/test_benchmark_result.py +++ 
b/tests/units/test_benchmark_result.py @@ -40,11 +40,12 @@ def test_mean_computed_correctly(self) -> None: assert result.mean == expected def test_mean_uses_fsum_precision(self) -> None: - # floating point: sum of many small numbers may differ from fsum - durations = tuple(0.1 for _ in range(10)) + # fsum handles cancellation correctly; plain sum loses precision: + # sum([1e20, 1.0, -1e20]) == 0.0, fsum == 1.0 + durations = (1e20, 1.0, -1e20) result = make_result(durations) - expected = math.fsum(durations) / len(durations) - assert result.mean == expected + assert result.mean == pytest.approx(1.0 / 3) + assert result.mean != sum(durations) / len(durations) def test_best_is_min(self) -> None: result = make_result((3.0, 1.0, 2.0)) @@ -114,6 +115,8 @@ def test_percentile_100_returns_all(self) -> None: result = make_result((1.0, 2.0, 3.0)) trimmed = result.percentile(100) assert len(trimmed.durations) == 3 + assert trimmed.is_primary is False + assert trimmed.durations == tuple(sorted(result.durations)) def test_percentile_small_number(self) -> None: result = make_result((1.0, 2.0, 3.0)) @@ -213,6 +216,9 @@ def test_to_json_valid_json(self) -> None: result = make_result((0.1, 0.2)) data = json.loads(result.to_json()) assert isinstance(data, dict) + assert isinstance(data['durations'], list) + assert isinstance(data['is_primary'], bool) + assert 'scenario' in data def test_to_json_contains_durations(self) -> None: result = make_result((0.1, 0.2, 0.3)) @@ -312,6 +318,18 @@ def test_from_json_is_primary_not_bool_raises(self) -> None: with pytest.raises(ValueError, match='is_primary'): BenchmarkResult.from_json(payload) + def test_from_json_is_primary_int_raises(self) -> None: + # int 1 is not a bool even though bool is a subclass of int + payload = json.dumps({'durations': [0.1], 'is_primary': 1}) + with pytest.raises(ValueError, match='is_primary'): + BenchmarkResult.from_json(payload) + + def test_percentile_single_element(self) -> None: + result = make_result((5.0,)) 
+ trimmed = result.percentile(50) + assert trimmed.durations == (5.0,) + assert trimmed.is_primary is False + def test_from_json_missing_is_primary_raises(self) -> None: payload = json.dumps({'durations': [0.1, 0.2]}) with pytest.raises(ValueError, match='required fields'): diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index cf52950..1685868 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -209,7 +209,7 @@ def test_run_result_is_primary(self) -> None: def test_run_args_incompatible_raises_type_error(self) -> None: s = Scenario(lambda: None, args=[1, 2], name='s', number=1) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match='argument'): s.run() def test_run_exception_mid_iteration(self) -> None: @@ -249,6 +249,9 @@ def test_radd_group_scenario(self) -> None: s1 = Scenario(lambda: None, name='s1') s2 = Scenario(lambda: None, name='s2') g = ScenarioGroup(s1) - # g + s2 is g.__add__(s2), but we also want s2.__radd__(g) to work + # s2.__radd__(g) = ScenarioGroup(*g._scenarios, s2) = [s1, s2] group = s2.__radd__(g) assert isinstance(group, ScenarioGroup) + results = group.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 From 825fd3bb0b2c150594a774a15ad5bb37a706260b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:21:52 +0300 Subject: [PATCH 23/33] Add test for negative warmup handling in Scenario.run --- tests/units/test_scenario.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index 1685868..0b8bfe3 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -99,9 +99,20 @@ def test_number_negative_raises(self) -> None: Scenario(lambda: None, name='s', number=-1) def test_name_required_raises(self) -> None: - with 
pytest.raises(TypeError): + with pytest.raises(TypeError, match='required keyword-only argument'): Scenario(lambda: None) # type: ignore[call-arg] + def test_run_negative_warmup_acts_as_zero(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=3) + result = s.run(warmup=-5) + assert len(result.durations) == 3 + assert counter[0] == 3 # negative warmup = range(-5) = empty, silently ignored + class TestScenarioRun: def test_run_returns_benchmark_result(self) -> None: @@ -135,9 +146,15 @@ def fn() -> None: assert counter[0] == 8 def test_run_warmup_not_in_durations(self) -> None: - s = Scenario(lambda: None, name='s', number=5) + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) result = s.run(warmup=10) assert len(result.durations) == 5 + assert counter[0] == 15 # 10 warmup + 5 measured def test_run_warmup_zero(self) -> None: counter = [0] @@ -209,7 +226,7 @@ def test_run_result_is_primary(self) -> None: def test_run_args_incompatible_raises_type_error(self) -> None: s = Scenario(lambda: None, args=[1, 2], name='s', number=1) - with pytest.raises(TypeError, match='argument'): + with pytest.raises(TypeError, match='positional argument'): s.run() def test_run_exception_mid_iteration(self) -> None: From 6f265c11af20e88271c7122bfa05d97468e0f95c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:22:40 +0300 Subject: [PATCH 24/33] Add tests for ScenarioGroup CLI divider and percentile edge cases --- tests/cli/test_scenario_group_cli.py | 3 ++- tests/units/test_benchmark_result.py | 7 +++++++ tests/units/test_scenario_group.py | 8 ++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/cli/test_scenario_group_cli.py b/tests/cli/test_scenario_group_cli.py index 9400e63..2e02dc8 100644 --- a/tests/cli/test_scenario_group_cli.py +++ 
b/tests/cli/test_scenario_group_cli.py @@ -47,8 +47,9 @@ def test_results_separated_by_divider(self) -> None: def test_divider_between_not_after_last(self) -> None: proc = run_script(group_script()) + # group_script() has 2 scenarios → exactly 1 divider between them + assert proc.stdout.count('---\n') == 1 lines = proc.stdout.strip().splitlines() - # last line should NOT be --- assert lines[-1] != '---' def test_exit_code_0_by_default(self) -> None: diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index 93a08a6..f3f89b9 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -124,6 +124,13 @@ def test_percentile_small_number(self) -> None: expected = math.ceil(3 * 50 / 100) assert len(trimmed.durations) == expected + def test_percentile_very_small_positive(self) -> None: + result = make_result((1.0, 2.0, 3.0, 4.0, 5.0)) + trimmed = result.percentile(0.001) + # ceil(5 * 0.001 / 100) = ceil(0.00005) = 1 + assert len(trimmed.durations) == 1 + assert trimmed.durations == (1.0,) + def test_percentile_99(self) -> None: result = make_result(tuple(float(i) for i in range(1, 101))) trimmed = result.percentile(99) diff --git a/tests/units/test_scenario_group.py b/tests/units/test_scenario_group.py index 3b5f8eb..e2c1190 100644 --- a/tests/units/test_scenario_group.py +++ b/tests/units/test_scenario_group.py @@ -148,3 +148,11 @@ def test_run_correct_scenario_reference(self) -> None: results = g.run() assert results[0].scenario is s1 assert results[1].scenario is s2 + + def test_run_warmup_different_numbers(self) -> None: + s1 = Scenario(lambda: None, name='a', number=3) + s2 = Scenario(lambda: None, name='b', number=7) + g = ScenarioGroup(s1, s2) + results = g.run(warmup=2) + assert len(results[0].durations) == 3 + assert len(results[1].durations) == 7 From a95187a092e1939216770d51849d0ea176ccacc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= 
=?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:26:57 +0300 Subject: [PATCH 25/33] Add tests for CLI help output, empty/invalid durations, and ScenarioGroup radd --- tests/cli/test_scenario_cli.py | 2 +- tests/units/test_benchmark_result.py | 29 ++++++++++++++++++++++++++-- tests/units/test_scenario.py | 7 ++++--- tests/units/test_scenario_group.py | 25 ++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 6 deletions(-) diff --git a/tests/cli/test_scenario_cli.py b/tests/cli/test_scenario_cli.py index 4662015..4fd2856 100644 --- a/tests/cli/test_scenario_cli.py +++ b/tests/cli/test_scenario_cli.py @@ -120,7 +120,7 @@ def test_help_mentions_number(self) -> None: def test_help_mentions_max_mean(self) -> None: proc = run_script(scenario_script(), '--help') combined = proc.stdout + proc.stderr - assert 'max-mean' in combined.lower() or 'max_mean' in combined.lower() + assert '--max-mean' in combined def test_help_does_not_run_benchmark(self) -> None: proc = run_script(scenario_script(), '--help') diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index f3f89b9..61e7a8c 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -34,14 +34,33 @@ def test_durations_is_tuple(self) -> None: result = make_result((0.1, 0.2, 0.3)) assert isinstance(result.durations, tuple) + def test_empty_durations_raises(self) -> None: + # BenchmarkResult does not validate durations length; + # creating with empty tuple causes ZeroDivisionError in __post_init__ + s = Scenario(lambda: None, name='s', number=1) + with pytest.raises((ZeroDivisionError, ValueError)): + BenchmarkResult(scenario=s, durations=(), is_primary=True) + + def test_inf_durations_fields(self) -> None: + result = make_result((float('inf'), 1.0, 2.0)) + assert math.isinf(result.worst) + assert math.isinf(result.mean) + assert result.best == 1.0 + + def test_nan_durations_fields(self) -> None: + result = 
make_result((float('nan'),)) + assert math.isnan(result.mean) + assert math.isnan(result.best) + assert math.isnan(result.worst) + def test_mean_computed_correctly(self) -> None: result = make_result((1.0, 2.0, 3.0)) expected = math.fsum([1.0, 2.0, 3.0]) / 3 assert result.mean == expected def test_mean_uses_fsum_precision(self) -> None: - # fsum handles cancellation correctly; plain sum loses precision: - # sum([1e20, 1.0, -1e20]) == 0.0, fsum == 1.0 + # fsum handles cancellation correctly; plain sum loses precision + # for (1e20, 1.0, -1e20): fsum=1.0, sum=0.0 due to IEEE 754 durations = (1e20, 1.0, -1e20) result = make_result(durations) assert result.mean == pytest.approx(1.0 / 3) @@ -337,6 +356,12 @@ def test_percentile_single_element(self) -> None: assert trimmed.durations == (5.0,) assert trimmed.is_primary is False + def test_percentile_100_single_element(self) -> None: + result = make_result((5.0,)) + trimmed = result.percentile(100) + assert trimmed.durations == (5.0,) + assert trimmed.is_primary is False + def test_from_json_missing_is_primary_raises(self) -> None: payload = json.dumps({'durations': [0.1, 0.2]}) with pytest.raises(ValueError, match='required fields'): diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index 0b8bfe3..b8696da 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -167,12 +167,13 @@ def fn() -> None: assert counter[0] == 5 def test_run_uses_custom_timer(self) -> None: - # timer produces: 0.000, 0.001, 0.002, 0.003, 0.004, 0.005, ... + # timer produces: 0.000, 0.001, 0.002, 0.003, ... 
(infinite) # each measured interval: end - start = 0.001 - values = iter(t * 0.001 for t in range(200)) + import itertools # noqa: PLC0415 + counter = itertools.count(0) def fake_timer() -> float: - return next(values) + return next(counter) * 0.001 s = Scenario(lambda: None, name='s', number=3, timer=fake_timer) result = s.run() diff --git a/tests/units/test_scenario_group.py b/tests/units/test_scenario_group.py index e2c1190..74310b1 100644 --- a/tests/units/test_scenario_group.py +++ b/tests/units/test_scenario_group.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup @@ -80,6 +82,19 @@ def test_radd_scenario_to_group(self) -> None: assert isinstance(group, ScenarioGroup) assert len(group.run()) == 2 + def test_radd_group_to_group(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + g1 = ScenarioGroup(s1, s2) + g2 = ScenarioGroup(s3) + # g2.__radd__(g1) = ScenarioGroup(*g1._scenarios, *g2._scenarios) = [s1, s2, s3] + group = g2.__radd__(g1) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + def test_duplicate_scenarios(self) -> None: s = make_scenario('s') group = s + s @@ -156,3 +171,13 @@ def test_run_warmup_different_numbers(self) -> None: results = g.run(warmup=2) assert len(results[0].durations) == 3 assert len(results[1].durations) == 7 + + def test_run_propagates_exception_from_scenario(self) -> None: + def bad() -> None: + raise RuntimeError('scenario failed') + + s1 = make_scenario('s1') + s2 = Scenario(bad, name='s2', number=1) + g = ScenarioGroup(s1, s2) + with pytest.raises(RuntimeError, match='scenario failed'): + g.run() From c3791cde0c6db8fc4bfbe1239e8cef668c3d0a96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= 
=?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:38:20 +0300 Subject: [PATCH 26/33] Add tests for BenchmarkResult JSON handling and Scenario addition --- tests/units/test_benchmark_result.py | 31 ++++++++++++++++++++++++---- tests/units/test_scenario.py | 8 +++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index 61e7a8c..7df31bb 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -38,7 +38,7 @@ def test_empty_durations_raises(self) -> None: # BenchmarkResult does not validate durations length; # creating with empty tuple causes ZeroDivisionError in __post_init__ s = Scenario(lambda: None, name='s', number=1) - with pytest.raises((ZeroDivisionError, ValueError)): + with pytest.raises(ZeroDivisionError): BenchmarkResult(scenario=s, durations=(), is_primary=True) def test_inf_durations_fields(self) -> None: @@ -60,11 +60,10 @@ def test_mean_computed_correctly(self) -> None: def test_mean_uses_fsum_precision(self) -> None: # fsum handles cancellation correctly; plain sum loses precision - # for (1e20, 1.0, -1e20): fsum=1.0, sum=0.0 due to IEEE 754 + # for (1e20, 1.0, -1e20): fsum=1.0 (exact), but sum=0.0 (catastrophic cancellation) durations = (1e20, 1.0, -1e20) result = make_result(durations) - assert result.mean == pytest.approx(1.0 / 3) - assert result.mean != sum(durations) / len(durations) + assert result.mean == 1.0 / 3 # exact: fsum gives 1.0, divided by 3 def test_best_is_min(self) -> None: result = make_result((3.0, 1.0, 2.0)) @@ -167,6 +166,8 @@ def test_percentile_on_derived_result(self) -> None: derived = result.percentile(90).percentile(50) assert isinstance(derived, BenchmarkResult) assert derived.is_primary is False + # 100 → p90 → 90 elements → p50 → ceil(90 * 50/100) = 45 + assert len(derived.durations) == 45 def test_percentile_scenario_preserved(self) -> None: scenario = Scenario(lambda: None, 
name='s') @@ -350,6 +351,28 @@ def test_from_json_is_primary_int_raises(self) -> None: with pytest.raises(ValueError, match='is_primary'): BenchmarkResult.from_json(payload) + def test_from_json_durations_with_invalid_element_raises(self) -> None: + payload = '{"durations": [0.1, "not_a_number"], "is_primary": true}' + with pytest.raises(ValueError, match='could not convert'): + BenchmarkResult.from_json(payload) + + def test_from_json_empty_durations_list_raises(self) -> None: + payload = json.dumps({'durations': [], 'is_primary': True}) + with pytest.raises(ZeroDivisionError): + BenchmarkResult.from_json(payload) + + def test_to_json_inf_produces_non_standard_json(self) -> None: + # Python's json module allows_nan=True by default: inf/nan → Infinity/NaN + result = make_result((float('inf'), 1.0)) + j = result.to_json() + assert 'Infinity' in j + + def test_to_json_nan_round_trips_in_python(self) -> None: + # NaN round-trips through Python's json module (non-standard JSON) + result = make_result((float('nan'),)) + restored = BenchmarkResult.from_json(result.to_json()) + assert math.isnan(restored.mean) + def test_percentile_single_element(self) -> None: result = make_result((5.0,)) trimmed = result.percentile(50) diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index b8696da..4cf8ae8 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -257,6 +257,14 @@ def test_add_scenario_group_returns_group(self) -> None: g = ScenarioGroup(s2) group = s1 + g assert isinstance(group, ScenarioGroup) + results = group.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + + def test_add_int_raises_type_error(self) -> None: + s = Scenario(lambda: None, name='s') + with pytest.raises(TypeError): + _ = 42 + s # type: ignore[operator] def test_add_unknown_type_returns_not_implemented(self) -> None: s = Scenario(lambda: None, name='s') From 6e25ff492381badce6e20fd50720a87bd58e95e8 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:42:37 +0300 Subject: [PATCH 27/33] Fix percentile validation to reject non-positive values --- microbenchmark/benchmark_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/microbenchmark/benchmark_result.py b/microbenchmark/benchmark_result.py index 7cd39f9..69a440d 100644 --- a/microbenchmark/benchmark_result.py +++ b/microbenchmark/benchmark_result.py @@ -38,7 +38,7 @@ def __post_init__(self) -> None: self.worst = max(self.durations) def percentile(self, p: float) -> BenchmarkResult: - if p <= 0 or p > 100: + if not (0 < p <= 100): raise ValueError(f'percentile must be in (0, 100], got {p}') k = math.ceil(len(self.durations) * p / 100) trimmed = tuple(sorted(self.durations)[:k]) From 6ab1b11e7c9ef32d7b5d1ada8933cfff8b73189a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:43:23 +0300 Subject: [PATCH 28/33] Add test for combined --number and --max-mean CLI options --- tests/cli/test_scenario_group_cli.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/cli/test_scenario_group_cli.py b/tests/cli/test_scenario_group_cli.py index 2e02dc8..2266540 100644 --- a/tests/cli/test_scenario_group_cli.py +++ b/tests/cli/test_scenario_group_cli.py @@ -88,6 +88,12 @@ def test_max_mean_still_prints_output_on_failure(self) -> None: proc = run_script(group_script(), '--max-mean', '0.000001') assert 'benchmark:' in proc.stdout + def test_max_mean_and_number_combined(self) -> None: + proc = run_script(group_script(), '--number', '3', '--max-mean', '10.0') + assert proc.returncode == 0 + assert 'benchmark: first' in proc.stdout + assert 'benchmark: second' in proc.stdout + class TestScenarioGroupCliHelp: def test_help_exits_0(self) -> None: From 75211472427734b5c839b8e23bc567242d06cb3a Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:43:36 +0300 Subject: [PATCH 29/33] Add pragma to suppress type check warning in TYPE_CHECKING block --- microbenchmark/benchmark_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/microbenchmark/benchmark_result.py b/microbenchmark/benchmark_result.py index 69a440d..719a81d 100644 --- a/microbenchmark/benchmark_result.py +++ b/microbenchmark/benchmark_result.py @@ -6,7 +6,7 @@ from functools import cached_property from typing import TYPE_CHECKING, TypedDict -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover from microbenchmark.scenario import Scenario From 690d840fe3537f203919ba51059da499eec81b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:44:39 +0300 Subject: [PATCH 30/33] Add validation for percentile with NaN/inf and null durations in JSON --- tests/units/test_benchmark_result.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index 7df31bb..e6bb1f2 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -190,6 +190,16 @@ def test_percentile_above_100_raises(self) -> None: with pytest.raises(ValueError, match='percentile'): result.percentile(101) + def test_percentile_nan_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(float('nan')) + + def test_percentile_inf_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(float('inf')) + def test_percentile_preserves_fsum_mean(self) -> None: durations = tuple(0.1 * i for i in range(1, 11)) result = make_result(durations) @@ -356,6 
+366,11 @@ def test_from_json_durations_with_invalid_element_raises(self) -> None: with pytest.raises(ValueError, match='could not convert'): BenchmarkResult.from_json(payload) + def test_from_json_durations_with_null_element_raises(self) -> None: + payload = json.dumps({'durations': [0.1, None], 'is_primary': True}) + with pytest.raises(TypeError): + BenchmarkResult.from_json(payload) + def test_from_json_empty_durations_list_raises(self) -> None: payload = json.dumps({'durations': [], 'is_primary': True}) with pytest.raises(ZeroDivisionError): From d42dcc672993d296abdd9f5c8ee65d8e90a88ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:45:23 +0300 Subject: [PATCH 31/33] Add pragma to suppress TYPE_CHECKING import warning --- microbenchmark/scenario.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/microbenchmark/scenario.py b/microbenchmark/scenario.py index bda9dd6..32a0c5a 100644 --- a/microbenchmark/scenario.py +++ b/microbenchmark/scenario.py @@ -7,7 +7,7 @@ from microbenchmark.benchmark_result import BenchmarkResult -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover from microbenchmark.scenario_group import ScenarioGroup From 1f3041cc63671a75f2cffa330771b9443bb55c30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Thu, 9 Apr 2026 23:45:37 +0300 Subject: [PATCH 32/33] Add test for Scenario.__radd__ and validate JSON deserialization --- tests/units/test_benchmark_result.py | 5 +++++ tests/units/test_scenario.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py index e6bb1f2..e463af4 100644 --- a/tests/units/test_benchmark_result.py +++ b/tests/units/test_benchmark_result.py @@ -404,3 +404,8 @@ def 
test_from_json_missing_is_primary_raises(self) -> None: payload = json.dumps({'durations': [0.1, 0.2]}) with pytest.raises(ValueError, match='required fields'): BenchmarkResult.from_json(payload) + + def test_from_json_not_dict_raises(self) -> None: + payload = json.dumps([1, 2, 3]) + with pytest.raises(ValueError, match='JSON must be an object'): + BenchmarkResult.from_json(payload) diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index 4cf8ae8..806b302 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -271,6 +271,16 @@ def test_add_unknown_type_returns_not_implemented(self) -> None: result = s.__add__(42) # type: ignore[arg-type] assert result is NotImplemented + def test_radd_scenario_scenario(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + # s2.__radd__(s1) = ScenarioGroup(s1, s2) + group = s2.__radd__(s1) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + def test_radd_group_scenario(self) -> None: s1 = Scenario(lambda: None, name='s1') s2 = Scenario(lambda: None, name='s2') From e53b447ea18c451def89b1f35dae3f9daa749d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D0=B8=D0=B9=20=D0=91=D0=BB?= =?UTF-8?q?=D0=B8=D0=BD=D0=BE=D0=B2?= Date: Fri, 10 Apr 2026 00:05:03 +0300 Subject: [PATCH 33/33] Add CLI output validation and ScenarioGroup edge cases --- tests/cli/test_scenario_cli.py | 15 +++++++++- tests/cli/test_scenario_group_cli.py | 31 ++++++++++++++++++++ tests/units/test_scenario.py | 25 ++++++++++++++++ tests/units/test_scenario_group.py | 44 ++++++++++++++++++++++++---- 4 files changed, 109 insertions(+), 6 deletions(-) diff --git a/tests/cli/test_scenario_cli.py b/tests/cli/test_scenario_cli.py index 4fd2856..27b9c18 100644 --- a/tests/cli/test_scenario_cli.py +++ b/tests/cli/test_scenario_cli.py @@ -1,5 +1,6 @@ from __future__ import 
annotations +import re import subprocess import sys import textwrap @@ -54,7 +55,19 @@ def test_cli_outputs_worst(self) -> None: def test_cli_output_has_s_suffix(self) -> None: proc = run_script(scenario_script()) - assert 's\n' in proc.stdout or proc.stdout.rstrip().endswith('s') + # Each value line (mean/best/worst) must end with 's' + lines = proc.stdout.strip().splitlines() + for line in lines[1:]: # skip 'benchmark: bench' header + assert line.endswith('s'), f'line does not end with s: {line!r}' + + def test_cli_exact_output_format(self) -> None: + proc = run_script(scenario_script()) + lines = proc.stdout.strip().splitlines() + assert len(lines) == 4 + assert lines[0] == 'benchmark: bench' + assert re.match(r'^mean: \d+\.\d{6}s$', lines[1]), f'unexpected format: {lines[1]!r}' + assert re.match(r'^best: \d+\.\d{6}s$', lines[2]), f'unexpected format: {lines[2]!r}' + assert re.match(r'^worst: \d+\.\d{6}s$', lines[3]), f'unexpected format: {lines[3]!r}' def test_cli_exit_code_0_by_default(self) -> None: proc = run_script(scenario_script()) diff --git a/tests/cli/test_scenario_group_cli.py b/tests/cli/test_scenario_group_cli.py index 2266540..de35718 100644 --- a/tests/cli/test_scenario_group_cli.py +++ b/tests/cli/test_scenario_group_cli.py @@ -105,6 +105,37 @@ def test_help_mentions_number(self) -> None: combined = proc.stdout + proc.stderr assert 'number' in combined.lower() + def test_help_mentions_max_mean(self) -> None: + proc = run_script(group_script(), '--help') + combined = proc.stdout + proc.stderr + assert '--max-mean' in combined + + def test_help_does_not_run_benchmark(self) -> None: + proc = run_script(group_script(), '--help') + assert 'benchmark:' not in proc.stdout + + +def empty_group_script() -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import ScenarioGroup + + group = ScenarioGroup() + group.cli() + ''') + + +class 
TestScenarioGroupCliEmptyGroup: + def test_empty_group_exits_0(self) -> None: + proc = run_script(empty_group_script()) + assert proc.returncode == 0 + + def test_empty_group_no_output(self) -> None: + proc = run_script(empty_group_script()) + assert proc.stdout == '' + assert proc.stderr == '' + def single_scenario_script() -> str: return textwrap.dedent(f''' diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py index 806b302..047bcad 100644 --- a/tests/units/test_scenario.py +++ b/tests/units/test_scenario.py @@ -243,6 +243,31 @@ def fn() -> None: s.run() assert counter[0] == 3 + def test_run_exception_during_warmup_propagates(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + if counter[0] == 2: + raise RuntimeError('fail in warmup') + + s = Scenario(fn, name='s', number=5) + with pytest.raises(RuntimeError, match='fail in warmup'): + s.run(warmup=3) + assert counter[0] == 2 # stopped at 2nd warmup call + + def test_run_number_one(self) -> None: + tick = [0] + + def fake_timer() -> float: + tick[0] += 1 + return float(tick[0]) + + s = Scenario(lambda: None, name='s', number=1, timer=fake_timer) + result = s.run() + assert len(result.durations) == 1 + assert result.durations[0] == pytest.approx(1.0) # end(2) - start(1) = 1 + class TestScenarioAdd: def test_add_scenario_returns_group(self) -> None: diff --git a/tests/units/test_scenario_group.py b/tests/units/test_scenario_group.py index 74310b1..9d1c0ae 100644 --- a/tests/units/test_scenario_group.py +++ b/tests/units/test_scenario_group.py @@ -32,31 +32,50 @@ def test_scenario_plus_scenario(self) -> None: s1, s2 = make_scenario('s1'), make_scenario('s2') group = s1 + s2 assert isinstance(group, ScenarioGroup) - assert len(group.run()) == 2 + results = group.run() + assert len(results) == 2 + assert results[0].scenario is s1 + assert results[1].scenario is s2 def test_group_plus_scenario(self) -> None: s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), 
make_scenario('s3') group = ScenarioGroup(s1, s2) + s3 assert isinstance(group, ScenarioGroup) - assert len(group.run()) == 3 + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 def test_scenario_plus_group(self) -> None: s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') group = s1 + ScenarioGroup(s2, s3) assert isinstance(group, ScenarioGroup) - assert len(group.run()) == 3 + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 def test_group_plus_group(self) -> None: s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') group = ScenarioGroup(s1) + ScenarioGroup(s2, s3) assert isinstance(group, ScenarioGroup) - assert len(group.run()) == 3 + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 def test_triple_sum_is_flat(self) -> None: s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') group = s1 + s2 + s3 assert isinstance(group, ScenarioGroup) - assert len(group.run()) == 3 + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 def test_add_returns_new_group(self) -> None: s1, s2 = make_scenario('s1'), make_scenario('s2') @@ -64,6 +83,8 @@ def test_add_returns_new_group(self) -> None: new_g = g + s2 assert new_g is not g assert len(g._scenarios) == 1 # original not mutated + assert new_g._scenarios[0] is s1 + assert new_g._scenarios[1] is s2 def test_add_unknown_type_returns_not_implemented(self) -> None: g = ScenarioGroup() @@ -124,6 +145,19 @@ def test_empty_group_run_with_warmup(self) -> None: g = ScenarioGroup() assert g.run(warmup=10) == [] + def test_run_negative_warmup_acts_as_zero(self) 
-> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=3) + g = ScenarioGroup(s) + results = g.run(warmup=-5) + assert len(results) == 1 + assert len(results[0].durations) == 3 + assert counter[0] == 3 # range(-5) == empty, so no warmup calls + def test_run_returns_benchmark_results(self) -> None: s = make_scenario() g = ScenarioGroup(s)