diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..95e7494 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: pomponchik + +--- + +## Short description + +Replace this text with a short description of the error and the behavior that you expected to see instead. + + +## Describe the bug in detail + +Please add a test that reproduces the bug (i.e., currently fails): + +```python +def test_your_bug(): + ... +``` + +When writing the test, please ensure compatibility with the [`pytest`](https://docs.pytest.org/) framework. + +If for some reason you cannot describe the error in the test format, describe the steps to reproduce it here. + + +## Environment + - OS: ... + - Python version (the output of the `python --version` command): ... + - Version of this package: ... diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md new file mode 100644 index 0000000..5f5fdc0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -0,0 +1,26 @@ +--- +name: Documentation fix +about: Add something to the documentation, delete it, or change it +title: '' +labels: documentation +assignees: pomponchik +--- + +## It's cool that you're here! + +Documentation is an important part of the project; we strive to make it high-quality and keep it up to date. Please adjust this template by outlining your proposal. + + +## Type of action + +What do you want to do: remove something, add something, or change something? + + +## Where? + +Specify which part of the documentation you want to change. For example, the name of an existing documentation section or a line number in `README.md`. + + +## The essence + +Please describe the essence of the proposed change. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..117d79f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,17 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: pomponchik + +--- + +## Short description + +What do you propose and why do you consider it important? + + +## Some details + +If you can, provide code examples that will show how your proposal will work. Also, if you can, indicate which alternative approaches you have considered. And finally, describe how you propose to verify that your idea is implemented correctly, if at all possible. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..6f86494 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,12 @@ +--- +name: Question or consultation +about: Ask anything about this project +title: '' +labels: question +assignees: pomponchik + +--- + +## Your question + +Here you can freely describe your question about the project. Please read the documentation provided before doing this, and ask the question only if it is not answered there. In addition, please keep in mind that this is a free non-commercial project and user support is optional for its author. Response times are not guaranteed. 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..a9e6850 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: Lint + +on: + push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14', '3.14t', '3.15.0-alpha.1'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + shell: bash + run: uv pip install --system -r requirements_dev.txt + + - name: Install the library + shell: bash + run: uv pip install --system . + + - name: Run ruff + shell: bash + run: ruff check microbenchmark + + - name: Run ruff for tests + shell: bash + run: ruff check tests + + - name: Run mypy + shell: bash + run: >- + mypy + --show-error-codes + --strict + --disallow-any-decorated + --disallow-any-explicit + --disallow-any-expr + --disallow-any-generics + --disallow-any-unimported + --disallow-subclassing-any + --warn-return-any + microbenchmark + + - name: Run mypy for tests + shell: bash + run: mypy --exclude '^tests/typing/' tests diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..0db2c72 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,40 @@ +name: Release + +on: + push: + branches: + - main + +jobs: + pypi-publish: + name: upload release to PyPI + runs-on: ubuntu-latest + timeout-minutes: 15 + # Specifying a GitHub environment is optional, but strongly encouraged + environment: release + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + steps: + - uses: 
actions/checkout@v4 + + - name: Set up Python 3.13 + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + shell: bash + run: uv pip install --system -r requirements_dev.txt + + - name: Build the project + shell: bash + run: python -m build . + + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests_and_coverage.yml b/.github/workflows/tests_and_coverage.yml new file mode 100644 index 0000000..099f18d --- /dev/null +++ b/.github/workflows/tests_and_coverage.yml @@ -0,0 +1,64 @@ +name: Tests + +on: + push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + + runs-on: ${{ matrix.os }} + timeout-minutes: 30 + strategy: + matrix: + os: [macos-latest, ubuntu-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14', '3.14t', '3.15.0-alpha.1'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + shell: bash + run: uv pip install --system -r requirements_dev.txt + + - name: Install the library + shell: bash + run: uv pip install --system . 
+ + - name: Print all libs + shell: bash + run: uv pip list --system + + - name: Run tests and show the branch coverage on the command line + shell: bash + run: | + pth_file="$(python -c 'import sysconfig; print(sysconfig.get_path("purelib"))')/microbenchmark_coverage_process_startup.pth" + printf "import os; os.getenv('COVERAGE_PROCESS_START') and __import__('coverage').process_startup()\n" > "$pth_file" + coverage erase + COVERAGE_PROCESS_START="$PWD/pyproject.toml" coverage run -m pytest -n auto --cache-clear --assert=plain + coverage combine + coverage report -m --fail-under=100 --omit='*tests*' + coverage xml --omit='*tests*' + + - name: Upload coverage to Coveralls + if: runner.os == 'Linux' && matrix.python-version == '3.13' + env: + COVERALLS_REPO_TOKEN: ${{secrets.COVERALLS_REPO_TOKEN}} + uses: coverallsapp/github-action@v2 + with: + format: cobertura + file: coverage.xml + flag-name: ubuntu-python-3.13-branch + continue-on-error: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..689d029 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +.DS_Store +__pycache__ +venv +.pytest_cache +build +dist +*.egg-info +test.py +.coverage +.coverage.* +.idea +.ruff_cache +.mutmut-cache +.mypy_cache +html +CLAUDE.md +.claude +mutants +planning_features.md +coverage.xml +.qwen +uv.lock diff --git a/README.md b/README.md index b79d431..3d5c195 100644 --- a/README.md +++ b/README.md @@ -1 +1,434 @@ -# microbenchmark \ No newline at end of file +# microbenchmark + +A minimal Python library for writing and running benchmarks. + +`microbenchmark` gives you simple building blocks — `Scenario`, `ScenarioGroup`, and `BenchmarkResult` — that you can embed directly into your project or call from CI. No separate CLI package to install; `.cli()` is built in. You write a Python file, call `.run()` or `.cli()`, and you are done. 
+ +**Key features:** + +- A `Scenario` wraps any callable with a fixed argument list and runs it `n` times, collecting per-run timings. +- A `ScenarioGroup` lets you combine scenarios and run them together with a single call. +- `BenchmarkResult` holds every individual duration and gives you mean, best, worst, and percentile views. +- Results can be serialized to and restored from JSON. +- No external dependencies beyond the Python standard library. + +--- + +## Table of contents + +- [Installation](#installation) +- [Quick start](#quick-start) +- [Scenario](#scenario) +- [ScenarioGroup](#scenariogroup) +- [BenchmarkResult](#benchmarkresult) +- [Comparison with alternatives](#comparison-with-alternatives) + +--- + +## Installation + +``` +pip install microbenchmark +``` + +--- + +## Quick start + +```python +from microbenchmark import Scenario + +def build_list(): + return list(range(1000)) + +scenario = Scenario(build_list, name='build_list', number=500) +result = scenario.run() + +print(len(result.durations)) +#> 500 +print(result.mean) # example — actual value depends on your hardware +#> 0.000012 +print(result.best) +#> 0.000010 +print(result.worst) +#> 0.000018 +``` + +--- + +## Scenario + +A `Scenario` describes a single benchmark: the function to call, what arguments to pass, and how many times to run it. + +### Constructor + +```python +Scenario( + function, + args=None, + *, + name, + doc='', + number=1000, + timer=time.perf_counter, +) +``` + +- `function` — the callable to benchmark. +- `args` — a list of positional arguments passed to `function` on every call as `function(*args)`. `None` (the default) and `[]` both mean the function is called with no arguments. The list is shallow-copied on construction, so appending to your original list afterward has no effect. Keyword arguments are not supported; wrap your callable in a `functools.partial` or a lambda if you need them. +- `name` — a short label for this scenario (required). 
+- `doc` — an optional longer description. +- `number` — how many times to call `function` per run. Must be at least `1`; passing `0` or a negative value raises `ValueError`. +- `timer` — a zero-argument callable that returns the current time as a `float`. Defaults to `time.perf_counter`. Supply a custom clock to get deterministic measurements in tests. + +```python +import time +from microbenchmark import Scenario + +scenario = Scenario( + sorted, + args=[[3, 1, 2]], + name='sort_three_items', + doc='Sort a list of three integers.', + number=10000, +) +``` + +For keyword arguments, use `functools.partial`: + +```python +from functools import partial +from microbenchmark import Scenario + +scenario = Scenario( + partial(sorted, key=lambda x: -x), + args=[[3, 1, 2]], + name='sort_descending', +) +``` + +For functions that take multiple positional arguments, list all of them in `args`: + +```python +from microbenchmark import Scenario + +scenario = Scenario(pow, args=[2, 10], name='power') +result = scenario.run() +print(result.mean) +#> 0.000000 # example — very fast operation +``` + +### `run(warmup=0)` + +Runs the benchmark and returns a `BenchmarkResult`. + +The optional `warmup` argument specifies how many calls to make before timing begins. Warm-up calls execute the function but are not timed and their results are discarded. + +```python +from microbenchmark import Scenario + +scenario = Scenario(lambda: list(range(100)), name='build', number=1000) +result = scenario.run(warmup=100) +print(len(result.durations)) +#> 1000 +``` + +### `cli()` + +Turns the scenario into a small command-line program. Call `scenario.cli()` as the entry point of a script and it will parse `sys.argv`, run the benchmark, and print the result to stdout. + +Supported arguments: + +- `--number N` — override the scenario's `number` for this run. +- `--max-mean THRESHOLD` — exit with code `1` if the mean time (in seconds) exceeds `THRESHOLD`. Useful in CI. 
- `--help` — print usage information and exit.

Output format:

```
benchmark: <name>
mean: <mean>s
best: <best>s
worst: <worst>s
```

Values are in seconds, printed with six decimal places. If `--max-mean` is supplied and the actual mean exceeds the threshold, the same output is printed but the process exits with code `1`.

```python
# benchmark.py
from microbenchmark import Scenario

def build_list():
    return list(range(1000))

scenario = Scenario(build_list, name='build_list', number=500)

if __name__ == '__main__':
    scenario.cli()
```

```
$ python benchmark.py
benchmark: build_list
mean: 0.000012s
best: 0.000010s
worst: 0.000018s
```

```
$ python benchmark.py --number 100
benchmark: build_list
mean: 0.000013s
best: 0.000010s
worst: 0.000020s
```

```
$ python benchmark.py --max-mean 0.001
benchmark: build_list
mean: 0.000012s
best: 0.000010s
worst: 0.000018s
$ echo $?
0
```

```
$ python benchmark.py --max-mean 0.000001
benchmark: build_list
mean: 0.000012s
best: 0.000010s
worst: 0.000018s
$ echo $?
1
```

---

## ScenarioGroup

A `ScenarioGroup` holds a flat collection of scenarios and lets you run them together.

### Creating a group

There are four ways to create a group.

**Direct construction** — pass any number of scenarios to the constructor.
Passing no scenarios creates an empty group: + +```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') + +group = ScenarioGroup(s1, s2) +empty = ScenarioGroup() +print(len(empty.run())) +#> 0 +``` + +**The `+` operator between two scenarios** produces a `ScenarioGroup`: + +```python +from microbenchmark import Scenario + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') +group = s1 + s2 +print(type(group).__name__) +#> ScenarioGroup +``` + +**Adding a scenario to an existing group**, or vice versa — the result is always a new flat group with no nesting: + +```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') +s3 = Scenario(lambda: None, name='s3') +group = ScenarioGroup(s1, s2) +extended = group + s3 # ScenarioGroup + Scenario +also_ok = s3 + group # Scenario + ScenarioGroup +print(len(extended.run())) +#> 3 +``` + +**Adding two groups together** produces a single flat group: + +```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') +s3 = Scenario(lambda: None, name='s3') +g1 = ScenarioGroup(s1) +g2 = ScenarioGroup(s2, s3) +combined = g1 + g2 +print(len(combined.run())) +#> 3 +``` + +### `run(warmup=0)` + +Runs every scenario in order and returns a list of `BenchmarkResult` objects. The order of results matches the order the scenarios were added. The `warmup` argument is forwarded to each scenario individually. + +```python +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: None, name='s1') +s2 = Scenario(lambda: None, name='s2') +group = ScenarioGroup(s1, s2) +results = group.run(warmup=50) +for result in results: + print(result.scenario.name) +#> s1 +#> s2 +``` + +### `cli()` + +Runs all scenarios and prints their results to stdout. 
Each scenario block follows the same format as `Scenario.cli()`, and blocks are separated by a `---` line. The separator appears only between blocks, not after the last one. + +Supported arguments: + +- `--number N` — passed to every scenario. +- `--max-mean THRESHOLD` — exits with code `1` if any scenario's mean exceeds the threshold. +- `--help` — print usage information and exit. + +```python +# benchmarks.py +from microbenchmark import Scenario, ScenarioGroup + +s1 = Scenario(lambda: list(range(100)), name='range_100') +s2 = Scenario(lambda: list(range(1000)), name='range_1000') + +group = s1 + s2 + +if __name__ == '__main__': + group.cli() +``` + +``` +$ python benchmarks.py +benchmark: range_100 +mean: 0.000003s +best: 0.000002s +worst: 0.000005s +--- +benchmark: range_1000 +mean: 0.000012s +best: 0.000010s +worst: 0.000018s +``` + +--- + +## BenchmarkResult + +`BenchmarkResult` is a dataclass that holds the outcome of a single benchmark run. + +### Fields + +- `scenario: Scenario | None` — the `Scenario` that produced this result, or `None` if the result was restored from JSON. +- `durations: tuple[float, ...]` — per-call timings in seconds, one entry per call, in the order they were measured. +- `mean: float` — arithmetic mean of `durations`, computed with `math.fsum` to minimize floating-point error. Computed automatically from `durations`. +- `best: float` — the shortest individual timing. Computed automatically. +- `worst: float` — the longest individual timing. Computed automatically. +- `is_primary: bool` — `True` for results returned directly by `run()`, `False` for results derived via `percentile()`. Preserved during JSON round-trips. + +The `mean`, `best`, and `worst` fields are read-only computed values; they are not accepted as constructor arguments. 
+ +```python +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() +print(len(result.durations)) +#> 100 +print(result.is_primary) +#> True +``` + +### `percentile(p)` + +Returns a new `BenchmarkResult` containing only the `ceil(len(durations) * p / 100)` fastest timings, sorted by duration ascending. The returned result has `is_primary=False`. `p` must be in the range `(0, 100]`; passing `0` or a value above `100` raises `ValueError`. + +```python +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() +trimmed = result.percentile(95) +print(trimmed.is_primary) +#> False +print(len(trimmed.durations)) +#> 95 +``` + +You can call `percentile()` on a derived result too: + +```python +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() +print(len(result.percentile(90).percentile(50).durations)) +#> 45 +``` + +### `p95` and `p99` + +Convenient cached properties that return `percentile(95)` and `percentile(99)` respectively. The value is computed once and cached for the lifetime of the result object. + +```python +from microbenchmark import Scenario + +result = Scenario(lambda: None, name='noop', number=100).run() +print(len(result.p95.durations)) +#> 95 +print(result.p95.is_primary) +#> False +print(result.p95 is result.p95) # cached — same object returned each time +#> True +``` + +### `to_json()` and `from_json()` + +`to_json()` serializes the result to a JSON string. It stores `durations`, `is_primary`, and the scenario's `name`, `doc`, and `number`. + +`from_json()` is a class method that restores a `BenchmarkResult` from a JSON string produced by `to_json()`. Because the original callable cannot be serialized, the restored result has `scenario=None`. The `mean`, `best`, and `worst` fields are recomputed from `durations` on restoration. 
+ +```python +from microbenchmark import Scenario, BenchmarkResult + +result = Scenario(lambda: None, name='noop', number=100).run() + +json_str = result.to_json() +restored = BenchmarkResult.from_json(json_str) + +print(restored.scenario) +#> None +print(restored.mean == result.mean) +#> True +print(restored.durations == result.durations) +#> True +print(restored.is_primary == result.is_primary) +#> True +``` + +--- + +## Comparison with alternatives + +| Feature | `microbenchmark` | `timeit` (stdlib) | `pytest-benchmark` | +|---|---|---|---| +| Per-call timings | yes | via `repeat(number=1)` | yes | +| Percentile views | yes | no | yes | +| JSON serialization | yes | no | yes | +| Inject custom timer | yes | yes | no | +| Warmup support | yes | no | yes (calibration) | +| CI integration (`--max-mean`) | yes | no | via configuration | +| `+` operator for grouping | yes | no | no | +| External dependencies | none | none | several | +| Embeddable in your own code | yes | yes | pytest plugin required | + +`timeit` from the standard library is great for interactive exploration, but it gives only a single aggregate number per call — you can get a list by using `repeat(number=1)`, though the interface is not designed around it. `pytest-benchmark` is powerful and well-integrated into the `pytest` ecosystem, but it is tightly coupled to the test runner and brings its own dependencies. `microbenchmark` sits between the two: richer than `timeit`, lighter and more portable than `pytest-benchmark`, and not tied to any test framework. 
from __future__ import annotations

import json
import math
from dataclasses import dataclass, field
from functools import cached_property
from typing import TYPE_CHECKING, TypedDict

if TYPE_CHECKING:  # pragma: no cover
    from microbenchmark.scenario import Scenario


class _ScenarioMeta(TypedDict):
    # Serializable subset of a Scenario: everything except the callable.
    name: str
    doc: str
    number: int


class _ResultJson(TypedDict):
    # Wire format written by to_json() and accepted by from_json().
    durations: list[float]
    is_primary: bool
    scenario: _ScenarioMeta | None


@dataclass
class BenchmarkResult:
    """The outcome of a single benchmark run.

    Stores every per-call duration (in seconds) and derives ``mean``,
    ``best`` and ``worst`` from them in ``__post_init__``.  Results
    restored from JSON carry ``scenario=None`` because the original
    callable cannot be serialized.
    """

    # The Scenario that produced this result; None after a JSON round-trip.
    scenario: Scenario | None
    # Per-call timings in seconds, in the order they were measured.
    durations: tuple[float, ...]
    # True for results returned by run(); False for percentile() views.
    is_primary: bool = True

    # Computed values — not accepted as constructor arguments.
    mean: float = field(init=False)
    best: float = field(init=False)
    worst: float = field(init=False)

    def __post_init__(self) -> None:
        # Fail fast on an empty tuple (reachable e.g. via from_json on
        # '{"durations": [], ...}'): without this guard the division below
        # would surface as an unhelpful ZeroDivisionError.
        if not self.durations:
            raise ValueError('durations must not be empty')
        # math.fsum minimizes accumulated floating-point error in the sum.
        self.mean = math.fsum(self.durations) / len(self.durations)
        self.best = min(self.durations)
        self.worst = max(self.durations)

    def percentile(self, p: float) -> BenchmarkResult:
        """Return a derived result keeping only the fastest ``p`` percent.

        Keeps the ``ceil(len(durations) * p / 100)`` shortest timings,
        sorted ascending; the returned result has ``is_primary=False``.
        Raises ValueError unless ``0 < p <= 100``.
        """
        if not (0 < p <= 100):
            raise ValueError(f'percentile must be in (0, 100], got {p}')
        k = math.ceil(len(self.durations) * p / 100)
        trimmed = tuple(sorted(self.durations)[:k])
        return BenchmarkResult(
            scenario=self.scenario,
            durations=trimmed,
            is_primary=False,
        )

    @cached_property
    def p95(self) -> BenchmarkResult:
        # Cached: repeated access returns the same derived object.
        return self.percentile(95)

    @cached_property
    def p99(self) -> BenchmarkResult:
        # Cached: repeated access returns the same derived object.
        return self.percentile(99)

    def to_json(self) -> str:
        """Serialize to a JSON string (durations, is_primary, scenario meta)."""
        scenario_meta: _ScenarioMeta | None
        if self.scenario is not None:
            scenario_meta = _ScenarioMeta(
                name=self.scenario.name,
                doc=self.scenario.doc,
                number=self.scenario.number,
            )
        else:
            scenario_meta = None
        data: _ResultJson = _ResultJson(
            durations=list(self.durations),
            is_primary=self.is_primary,
            scenario=scenario_meta,
        )
        return json.dumps(data)

    @classmethod
    def from_json(cls, data: str) -> BenchmarkResult:
        """Restore a result serialized with :meth:`to_json`.

        The callable cannot be restored, so ``scenario`` is always None;
        ``mean``/``best``/``worst`` are recomputed from ``durations``.
        Raises ValueError on malformed input (including empty durations).
        """
        raw: object = json.loads(data)
        if not isinstance(raw, dict):
            raise ValueError('JSON must be an object')
        if 'durations' not in raw or 'is_primary' not in raw:
            raise ValueError('JSON is missing required fields: durations, is_primary')
        raw_durations = raw['durations']
        raw_is_primary = raw['is_primary']
        if not isinstance(raw_durations, list):
            raise ValueError('durations must be a list')
        if not isinstance(raw_is_primary, bool):
            raise ValueError('is_primary must be a bool')
        return cls(
            scenario=None,
            durations=tuple(float(d) for d in raw_durations),
            is_primary=raw_is_primary,
        )
from __future__ import annotations

import argparse
import sys
import time
from typing import TYPE_CHECKING, Callable, Sequence

from microbenchmark.benchmark_result import BenchmarkResult

if TYPE_CHECKING:  # pragma: no cover
    from microbenchmark.scenario_group import ScenarioGroup


class _CliArgs:
    """Typed namespace filled in by argparse in place of the default one."""

    def __init__(self) -> None:
        self.number: int | None = None
        self.max_mean: float | None = None


class Scenario:
    """A single benchmark: a callable, its arguments, and a repeat count."""

    def __init__(  # noqa: PLR0913
        self,
        function: object,
        args: Sequence[object] | None = None,
        *,
        name: str,
        doc: str = '',
        number: int = 1000,
        timer: Callable[[], float] = time.perf_counter,
    ) -> None:
        if number < 1:
            raise ValueError(f'number must be at least 1, got {number}')
        self.function: object = function
        # Shallow copy: later mutation of the caller's list must not leak in.
        self._args: list[object] = [] if args is None else list(args)
        self.name = name
        self.doc = doc
        self.number = number
        self._timer = timer

    def _call_once(self) -> None:
        # One invocation of the benchmarked callable; the result is discarded.
        self.function(*self._args)  # type: ignore[operator]

    def run(self, warmup: int = 0) -> BenchmarkResult:
        """Execute the benchmark and return a primary BenchmarkResult.

        The first ``warmup`` calls are executed but not recorded; the timer
        is still invoked around each of them so a custom clock observes the
        same call pattern as during measurement.
        """
        clock = self._timer
        remaining_warmup = warmup
        while remaining_warmup > 0:
            clock()
            self._call_once()
            clock()
            remaining_warmup -= 1
        measurements: list[float] = []
        record = measurements.append
        for _ in range(self.number):
            began = clock()
            self._call_once()
            record(clock() - began)
        return BenchmarkResult(
            scenario=self,
            durations=tuple(measurements),
            is_primary=True,
        )

    def cli(self) -> None:
        """Parse ``sys.argv``, run the benchmark, and print the result.

        ``--number`` overrides the iteration count for this run only;
        ``--max-mean`` makes the process exit with code 1 when the mean
        exceeds the threshold (the result is printed either way).
        """
        parser = argparse.ArgumentParser(description=self.doc or f'Benchmark: {self.name}')
        parser.add_argument('--number', type=int, default=None, help='Number of iterations')
        parser.add_argument('--max-mean', type=float, default=None, dest='max_mean',
                            help='Fail if mean time (seconds) exceeds this threshold')
        options = _CliArgs()
        parser.parse_args(namespace=options)

        target = self
        if options.number is not None:
            # Rebuild the scenario with the overridden iteration count; the
            # original object stays untouched.
            target = Scenario(
                self.function,
                self._args,
                name=self.name,
                doc=self.doc,
                number=options.number,
                timer=self._timer,
            )

        outcome = target.run()
        _print_result(outcome)

        threshold = options.max_mean
        if threshold is not None and outcome.mean > threshold:
            sys.exit(1)

    def __add__(self, other: object) -> ScenarioGroup:
        from microbenchmark.scenario_group import ScenarioGroup  # noqa: PLC0415
        if isinstance(other, ScenarioGroup):
            # Flatten: the group's scenarios follow this one, no nesting.
            return ScenarioGroup(self, *other._scenarios)
        if isinstance(other, Scenario):
            return ScenarioGroup(self, other)
        return NotImplemented

    def __radd__(self, other: object) -> ScenarioGroup:
        from microbenchmark.scenario_group import ScenarioGroup  # noqa: PLC0415
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(*other._scenarios, self)
        if isinstance(other, Scenario):
            return ScenarioGroup(other, self)
        return NotImplemented


def _print_result(result: BenchmarkResult) -> None:
    # Shared by Scenario.cli() and ScenarioGroup.cli(); the format is part
    # of the documented CLI contract, so every byte here matters.
    scenario = result.scenario
    assert scenario is not None
    out = sys.stdout
    out.write(f'benchmark: {scenario.name}\n')
    out.write(f'mean: {result.mean:.6f}s\n')
    out.write(f'best: {result.best:.6f}s\n')
    out.write(f'worst: {result.worst:.6f}s\n')
from __future__ import annotations

import argparse
import sys

from microbenchmark.benchmark_result import BenchmarkResult
from microbenchmark.scenario import Scenario, _CliArgs, _print_result


class ScenarioGroup:
    """A flat, ordered collection of scenarios that run together."""

    def __init__(self, *scenarios: Scenario) -> None:
        self._scenarios: list[Scenario] = list(scenarios)

    def run(self, warmup: int = 0) -> list[BenchmarkResult]:
        """Run every scenario in order; ``warmup`` is forwarded to each.

        The order of the returned results matches the order in which the
        scenarios were added.
        """
        return [s.run(warmup=warmup) for s in self._scenarios]

    def cli(self) -> None:
        """Parse ``sys.argv``, run all scenarios, and print their results.

        Result blocks are separated by a ``---`` line (between blocks only,
        never after the last one).  With ``--max-mean``, the process exits
        with code 1 if any scenario's mean exceeds the threshold — but only
        after every result has been printed.
        """
        parser = argparse.ArgumentParser(description='Run benchmark group')
        parser.add_argument('--number', type=int, default=None, help='Number of iterations')
        parser.add_argument('--max-mean', type=float, default=None, dest='max_mean',
                            help='Fail if any scenario mean time (seconds) exceeds this threshold')
        # _CliArgs is shared with Scenario.cli() (imported above) instead of
        # being duplicated here, so both CLIs parse into the same shape.
        cli_args = _CliArgs()
        parser.parse_args(namespace=cli_args)

        scenarios = self._scenarios
        if cli_args.number is not None:
            scenarios = [
                _make_scenario_with_number(s, cli_args.number)
                for s in self._scenarios
            ]

        failed = False
        for i, scenario in enumerate(scenarios):
            result = scenario.run()
            _print_result(result)
            if i < len(scenarios) - 1:
                sys.stdout.write('---\n')
            if cli_args.max_mean is not None and result.mean > cli_args.max_mean:
                failed = True

        if failed:
            sys.exit(1)

    def __add__(self, other: object) -> ScenarioGroup:
        # Always produce a new flat group; the operands are never mutated.
        if isinstance(other, Scenario):
            return ScenarioGroup(*self._scenarios, other)
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(*self._scenarios, *other._scenarios)
        return NotImplemented

    def __radd__(self, other: object) -> ScenarioGroup:
        if isinstance(other, Scenario):
            return ScenarioGroup(other, *self._scenarios)
        if isinstance(other, ScenarioGroup):
            return ScenarioGroup(*other._scenarios, *self._scenarios)
        return NotImplemented


def _make_scenario_with_number(s: Scenario, number: int) -> Scenario:
    # Clone a scenario with a different iteration count (used by --number).
    return Scenario(
        s.function,
        s._args,
        name=s.name,
        doc=s.doc,
        number=number,
        timer=s._timer,
    )
">=3.8" +classifiers = [ + "Operating System :: OS Independent", + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', + 'Programming Language :: Python :: 3.14', + 'Programming Language :: Python :: 3.15', + 'Programming Language :: Python :: Free Threading', + 'Programming Language :: Python :: Free Threading :: 3 - Stable', + 'License :: OSI Approved :: MIT License', + 'Intended Audience :: Developers', + 'Topic :: Software Development :: Libraries', + 'Typing :: Typed', +] +keywords = [ + 'benchmarks' +] + +[tool.setuptools] +package-data = { microbenchmark = ["py.typed"] } +packages = { find = { include = ["microbenchmark"] } } + +[tool.mutmut] +paths_to_mutate=["microbenchmark"] + +[tool.coverage.run] +branch = true +parallel = true +plugins = ["coverage_pyver_pragma"] +source = ["microbenchmark"] + +[tool.pytest.ini_options] +norecursedirs = ["build", "mutants"] + +[tool.ruff] +lint.ignore = ['E501', 'E712', 'PTH123', 'PTH118', 'PLR2004', 'PTH107', 'SIM105', 'SIM102', 'RET503', 'PLR0912', 'C901', 'E731', 'F821'] +lint.select = ["ERA001", "YTT", "ASYNC", "BLE", "B", "A", "COM", "INP", "PIE", "T20", "PT", "RSE", "RET", "SIM", "SLOT", "TID252", "ARG", "PTH", "I", "C90", "N", "E", "W", "D201", "D202", "D419", "F", "PL", "PLE", "PLR", "PLW", "RUF", "TRY201", "TRY400", "TRY401"] +format.quote-style = "single" + +[project.urls] +'Source' = 'https://github.com/mutating/microbenchmark' +'Tracker' = 'https://github.com/mutating/microbenchmark/issues' diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..47af4f7 --- /dev/null +++ 
b/requirements_dev.txt @@ -0,0 +1,13 @@ +pytest==8.3.5 +pytest-xdist==3.6.1; python_version < '3.9' +pytest-xdist==3.8.0; python_version >= '3.9' +coverage==7.6.1 +coverage-pyver-pragma==0.4.0 +build==1.2.2.post1 +mypy==1.14.1 +pytest-mypy-testing==0.1.3 +ruff==0.14.6 +mutmut==3.2.3 +cosmic-ray==8.3.15; python_version < '3.9' +cosmic-ray==8.4.6; python_version >= '3.9' +full_match==0.0.3 diff --git a/spec.md b/spec.md new file mode 100644 index 0000000..48a275c --- /dev/null +++ b/spec.md @@ -0,0 +1,91 @@ +# О проекте + +Ты сейчас в пустой обвязке для проекта, посвященном бенчмаркингу. Проект называется "microbenchmark", потому что я планирую оставлять код проекта очень простым и минималистичным. Нам предстоит разработать его сейчас с нуля. + +Основная цель проекта: дать разработчикам инструментарий для быстрого написания бенчмарков и включения их в состав своих библиотек. + +Главные принципы разработки следующие: + +- Мы много думаем прежде, чем что-то делать. +- Дизайн - важен. Мы делаем вещи красивыми и максимально минималистичными. +- Мы проводим обширное ревью каждого сделанного шага. Источник ревью - программа qwen code, которая установлена на этом компьютере. Мы просим у нее ревью постоянно, как можно чаще, по несколько раз подряд. +- Мы поддерживаем крайне высокий уровень тестирования. Покрытие тестами в 100% - сильно ниже минимальной планки, в которую мы целимся. Наша задача: покрыть тестами все мыслимые сценарии использования, а потом еще и все немыслимые, которые мы можем придумать. +- Мы избегаем использования моков в тестировании, стараемся все возможное тестировать "в живую". +- Мы придерживаемся принципов TDD: сначала продумываем тесты, и только потом приступаем к реализации. +- Документация (README) - это источник правды. Тесты - основаны на документации. Основной код - основан на тестах. +- *Каждое* содержательное утверждение в документации должно опираться на соответствующие тесты. 
+- Нет внешним зависимостям, разрешены только зависимости на проекты, находящиеся внутри организации https://github.com/mutating + + +## Порядок работы + +Работа должна вестись в таком порядке: + +- Определяем цели. +- Определяемся с тем, какой именно код хотим написать. +- Детально продумываем тест-сьют. +- Пишем все тесты. +- Пишем основной код. +- Добиваемся прохождения новых тестов. +- Фиксим issues линтеров и проверяем, не упали ли старые тесты. +- Проверяем покрытие. +- Пишем или дополняем/исправляем документацию. + +После каждого этапа разработки (создание плана, написание основного кода, документации или тестов) нужно запрашивать детальное ревью у qwen code. Программа установлена на данном компьютере, ее можно запустить командой qwen. Qwen нужно промптить, описывая ему текущую подзадачу и наши основные принципы разработки. Нужно просить его быть максимально дотошным. Qwen нужно запустить минимум 5 раз (если доработка сложная - 10+), агрегировать результаты вызовов, "отделить зерна от плевел", и только потом приступать к исправлениям. + + +## Что именно делаем? + +Я хочу создать кодовую обвязку для бенчмаркинга проекта. Весь код мы положим в папку benchmarks, и все тесты нового кода - туда же. + +Важно: мы не создаем CLI-тулинг для бенчмарков. Это скорее набор сценариев / некий базовый код, который уже при желании можно вызвать из CLI-тулзы или из тестов производительности. 
+
+Базовый дизайн состоит из:
+- Класса Сценария
+- Класса Группы Сценариев
+- Класса Результата Бенчмарка
+
+Подробнее о классе Сценария:
+
+- Нужно создать базовый класс Сценария
+- Сценарий принимает в конструктор функцию, которую он должен вызывать, список аргументов для нее (именно список, а не *args и **kwargs, чтобы мы могли расширять API в будущем), параметр doc (текстовое описание бенчмарка), параметр name (имя конкретного сценария), а также параметр количества раз, которое будет вызван бенчмарк
+- У Сценария есть метод run(), который возвращает объект Результата Бенчмарка
+- Метод run() должен иметь также опциональный аргумент, отвечающий за прогрев - сколько раз запустить сценарий до того, как время прогонов начнет замеряться и счетчик прогонов начнет считаться - результаты этих прогонов не должны сохраняться в Результате
+- Конструктор Сценария должен иметь опциональный аргумент - функцию, которая будет использоваться для генерации таймстемпов, со значением по умолчанию
+- У Сценария есть метод cli(), который запускает конкретный сценарий как CLI-программу, принимающую в качестве опционального CLI-аргумента количество итераций бенчмарка и выводящую в консоль все базовые параметры Результата Бенчмарка
+- Метод cli() также должен принимать опциональный CLI-параметр со средним временем бенчмарка, и если среднее время выше - он должен "падать" (чтобы можно было использовать в CI)
+
+О классе Группы Сценариев:
+
+- Группу сценариев можно создать 4 способами: создав инстанс класса Группы напрямую, (опционально) передав ему один или несколько Сценариев в качестве аргументов (через *args); просто суммировав несколько сценариев через оператор "+"; суммировав одну или несколько Групп; суммировав один или несколько Сценариев и одну или несколько групп. Все суммы должны быть "плоскими", то есть если будет два плюсика, то не должна образовываться вложенная иерархия.
+- Группа сценариев имеет внешний API, похожий на API одного сценария: метод run(), возвращающий список Результатов Бенчмарка и метод cli(), запускающий все бенчмарки группы сразу и выводящий общий результат в консоль через разделители
+
+О классе Результата Бенчмарка:
+- Результат Бенчмарка - это датакласс
+- У Результата Бенчмарка должны быть следующие поля: среднее время, худшее время, лучшее время, а также объект Сценария, из которого получен этот результат
+- Также Результат Бенчмарка должен хранить в себе продолжительность каждого отдельного запуска бенчмарка
+- Результат должен иметь метод percentile(), который возвращает другой объект Результата Бенчмарка, суженный по данному персентилю
+- Также в Результате должен быть кэшируемый property с 95 персентилем, и такой же с 99 персентилем, возвращающий тоже объекты Результатов Бенчмарка
+- У результата бенчмарка должно быть bool-поле, показывающее, является ли он первичным или производным (то есть суженным, например по персентилю)
+- Результат должен иметь метод сериализации в json и метод десериализации из json
+- Для агрегации результатов должны использоваться методы, минимизирующие погрешность из-за складывания чисел с плавающей точкой
+
+Про документацию:
+- Документация должна содержаться только в README.md
+- Она должна быть простой и понятной, но исчерпывающей
+- Должна сопровождаться примерами кода
+- В примерах кода не нужно дублировать импорты, вывод должен демонстрироваться через print'ы и потом строчку с комментарием, начинающимся с "#>" и потом реальный ожидаемый вывод
+- Она должна иметь следующую структуру: краткое описание проекта и ключевые фичи, оглавление (с якорными ссылками на разделы), способ установки, быстрый старт, раздел про Сценарий, раздел про Группу Сценариев, раздел про Результат Бенчмарка, раздел сравнения с конкурентами
+- Документация должна быть оформлена в спокойном и дружелюбном стиле, без злоупотребления эмодзи
+- Документация должна быть на английском
+- Нужно 
активно использовать бэктики в тексте для выделения сущностей из кода, имен библиотек и всякого подобного +- Хороший пример документации, на который можно равняться, показан в проекте https://github.com/mutating/pristan + +Про тесты: +- Минимальное покрытие - 100% +- Все, о чем сказано в README, должно быть тщательно протестировано +- Не скупимся как на негативные, так и на позитивные кейсы в тестах +- Все mypy-контракты должны быть покрыты тестами с использованием библиотеки pytest-mypy-testing, также должно быть много негативных кейсов и все возможные положительные +- Все CLI-штуки должны быть протестированы с помощью библиотеки subprocess +- ClI-тесты должны лежать в tests/cli, обычные юнит-тесты в tests/units, а тесты типизации - в tests/typing +- На основе каждого примера кода нужно также создать по тесту, эти тесты должны лежать в tests/documentation/test_readme.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cli/test_scenario_cli.py b/tests/cli/test_scenario_cli.py new file mode 100644 index 0000000..27b9c18 --- /dev/null +++ b/tests/cli/test_scenario_cli.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import re +import subprocess +import sys +import textwrap + + +def run_script(script: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + """Run an inline Python script as a subprocess and return the completed process.""" + return subprocess.run( + [sys.executable, '-c', script, *args], + capture_output=True, + text=True, + encoding='utf-8', + timeout=timeout, + check=False, + ) + + +def scenario_script(extra: str = '') -> str: + """Return a self-contained script that creates a Scenario and calls cli().""" + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark 
import Scenario + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s = Scenario(lambda: None, name='bench', number=10, timer=fake_timer) + {extra} + s.cli() + ''') + + +class TestScenarioCliOutput: + def test_cli_outputs_name(self) -> None: + proc = run_script(scenario_script()) + assert 'benchmark: bench' in proc.stdout + + def test_cli_outputs_mean(self) -> None: + proc = run_script(scenario_script()) + assert 'mean:' in proc.stdout + + def test_cli_outputs_best(self) -> None: + proc = run_script(scenario_script()) + assert 'best:' in proc.stdout + + def test_cli_outputs_worst(self) -> None: + proc = run_script(scenario_script()) + assert 'worst:' in proc.stdout + + def test_cli_output_has_s_suffix(self) -> None: + proc = run_script(scenario_script()) + # Each value line (mean/best/worst) must end with 's' + lines = proc.stdout.strip().splitlines() + for line in lines[1:]: # skip 'benchmark: bench' header + assert line.endswith('s'), f'line does not end with s: {line!r}' + + def test_cli_exact_output_format(self) -> None: + proc = run_script(scenario_script()) + lines = proc.stdout.strip().splitlines() + assert len(lines) == 4 + assert lines[0] == 'benchmark: bench' + assert re.match(r'^mean: \d+\.\d{6}s$', lines[1]), f'unexpected format: {lines[1]!r}' + assert re.match(r'^best: \d+\.\d{6}s$', lines[2]), f'unexpected format: {lines[2]!r}' + assert re.match(r'^worst: \d+\.\d{6}s$', lines[3]), f'unexpected format: {lines[3]!r}' + + def test_cli_exit_code_0_by_default(self) -> None: + proc = run_script(scenario_script()) + assert proc.returncode == 0 + + def test_cli_writes_to_stdout(self) -> None: + proc = run_script(scenario_script()) + assert proc.stdout.strip() != '' + assert proc.stderr == '' + + +class TestScenarioCliNumberArg: + def test_number_arg_changes_durations_count(self) -> None: + # We can't directly inspect durations from subprocess output, + # but we can verify the CLI accepts --number without error + proc = 
run_script(scenario_script(), '--number', '5') + assert proc.returncode == 0 + assert 'benchmark: bench' in proc.stdout + + def test_number_arg_default_uses_scenario_number(self) -> None: + proc = run_script(scenario_script()) + assert proc.returncode == 0 + + +class TestScenarioCliMaxMean: + def test_max_mean_below_threshold_exit_0(self) -> None: + # fake_timer increments by 0.001 each call, so mean per run = 0.001 + proc = run_script(scenario_script(), '--max-mean', '1.0') + assert proc.returncode == 0 + + def test_max_mean_above_threshold_exit_1(self) -> None: + # mean will be ~0.001s; threshold 0.000001 is far below + proc = run_script(scenario_script(), '--max-mean', '0.000001') + assert proc.returncode == 1 + + def test_max_mean_still_prints_output(self) -> None: + proc = run_script(scenario_script(), '--max-mean', '0.000001') + assert 'benchmark: bench' in proc.stdout + + def test_max_mean_equal_threshold_exit_0(self) -> None: + # mean = exactly threshold → should pass (mean <= threshold) + # With fake_timer: each call delta = 0.001, number=10 + # mean = 0.001. Set --max-mean = 10 to ensure it passes. 
+ proc = run_script(scenario_script(), '--max-mean', '10') + assert proc.returncode == 0 + + +class TestScenarioCliHelp: + def test_help_exits_0(self) -> None: + proc = run_script(scenario_script(), '--help') + assert proc.returncode == 0 + + def test_help_output_not_empty(self) -> None: + proc = run_script(scenario_script(), '--help') + combined = proc.stdout + proc.stderr + assert len(combined) > 0 + + def test_help_mentions_number(self) -> None: + proc = run_script(scenario_script(), '--help') + combined = proc.stdout + proc.stderr + assert 'number' in combined.lower() + + def test_help_mentions_max_mean(self) -> None: + proc = run_script(scenario_script(), '--help') + combined = proc.stdout + proc.stderr + assert '--max-mean' in combined + + def test_help_does_not_run_benchmark(self) -> None: + proc = run_script(scenario_script(), '--help') + assert 'benchmark:' not in proc.stdout + + +class TestScenarioCliEdgeCases: + def test_number_zero_fails(self) -> None: + proc = run_script(scenario_script(), '--number', '0') + assert proc.returncode != 0 + + def test_number_negative_fails(self) -> None: + proc = run_script(scenario_script(), '--number', '-1') + assert proc.returncode != 0 + + def test_number_and_max_mean_combined(self) -> None: + proc = run_script(scenario_script(), '--number', '5', '--max-mean', '10.0') + assert proc.returncode == 0 + assert 'benchmark: bench' in proc.stdout diff --git a/tests/cli/test_scenario_group_cli.py b/tests/cli/test_scenario_group_cli.py new file mode 100644 index 0000000..de35718 --- /dev/null +++ b/tests/cli/test_scenario_group_cli.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +import subprocess +import sys +import textwrap + + +def run_script(script: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, '-c', script, *args], + capture_output=True, + text=True, + encoding='utf-8', + timeout=timeout, + check=False, + ) + + +def group_script(extra: str = 
'') -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario, ScenarioGroup + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s1 = Scenario(lambda: None, name='first', number=5, timer=fake_timer) + s2 = Scenario(lambda: None, name='second', number=5, timer=fake_timer) + group = s1 + s2 + {extra} + group.cli() + ''') + + +class TestScenarioGroupCliOutput: + def test_outputs_both_scenario_names(self) -> None: + proc = run_script(group_script()) + assert 'benchmark: first' in proc.stdout + assert 'benchmark: second' in proc.stdout + + def test_results_separated_by_divider(self) -> None: + proc = run_script(group_script()) + assert '---' in proc.stdout + + def test_divider_between_not_after_last(self) -> None: + proc = run_script(group_script()) + # group_script() has 2 scenarios → exactly 1 divider between them + assert proc.stdout.count('---\n') == 1 + lines = proc.stdout.strip().splitlines() + assert lines[-1] != '---' + + def test_exit_code_0_by_default(self) -> None: + proc = run_script(group_script()) + assert proc.returncode == 0 + + def test_outputs_mean_best_worst_for_each(self) -> None: + proc = run_script(group_script()) + assert proc.stdout.count('mean:') == 2 + assert proc.stdout.count('best:') == 2 + assert proc.stdout.count('worst:') == 2 + + def test_writes_to_stdout(self) -> None: + proc = run_script(group_script()) + assert proc.stdout.strip() != '' + assert proc.stderr == '' + + +class TestScenarioGroupCliNumberArg: + def test_number_arg_accepted(self) -> None: + proc = run_script(group_script(), '--number', '3') + assert proc.returncode == 0 + assert 'benchmark: first' in proc.stdout + + +class TestScenarioGroupCliMaxMean: + def test_max_mean_passes_when_below(self) -> None: + proc = run_script(group_script(), '--max-mean', '10.0') + assert proc.returncode == 0 + + def 
test_max_mean_fails_when_any_exceeds(self) -> None: + proc = run_script(group_script(), '--max-mean', '0.000001') + assert proc.returncode == 1 + + def test_max_mean_still_prints_output_on_failure(self) -> None: + proc = run_script(group_script(), '--max-mean', '0.000001') + assert 'benchmark:' in proc.stdout + + def test_max_mean_and_number_combined(self) -> None: + proc = run_script(group_script(), '--number', '3', '--max-mean', '10.0') + assert proc.returncode == 0 + assert 'benchmark: first' in proc.stdout + assert 'benchmark: second' in proc.stdout + + +class TestScenarioGroupCliHelp: + def test_help_exits_0(self) -> None: + proc = run_script(group_script(), '--help') + assert proc.returncode == 0 + + def test_help_mentions_number(self) -> None: + proc = run_script(group_script(), '--help') + combined = proc.stdout + proc.stderr + assert 'number' in combined.lower() + + def test_help_mentions_max_mean(self) -> None: + proc = run_script(group_script(), '--help') + combined = proc.stdout + proc.stderr + assert '--max-mean' in combined + + def test_help_does_not_run_benchmark(self) -> None: + proc = run_script(group_script(), '--help') + assert 'benchmark:' not in proc.stdout + + +def empty_group_script() -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import ScenarioGroup + + group = ScenarioGroup() + group.cli() + ''') + + +class TestScenarioGroupCliEmptyGroup: + def test_empty_group_exits_0(self) -> None: + proc = run_script(empty_group_script()) + assert proc.returncode == 0 + + def test_empty_group_no_output(self) -> None: + proc = run_script(empty_group_script()) + assert proc.stdout == '' + assert proc.stderr == '' + + +def single_scenario_script() -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario, ScenarioGroup + + 
tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s1 = Scenario(lambda: None, name='only', number=5, timer=fake_timer) + group = ScenarioGroup(s1) + group.cli() + ''') + + +def three_scenario_script() -> str: + return textwrap.dedent(f''' + import sys + sys.path.insert(0, {str(__import__('pathlib').Path(__file__).parent.parent.parent)!r}) + from microbenchmark import Scenario, ScenarioGroup + + tick = [0.0] + def fake_timer(): + tick[0] += 0.001 + return tick[0] + + s1 = Scenario(lambda: None, name='a', number=5, timer=fake_timer) + s2 = Scenario(lambda: None, name='b', number=5, timer=fake_timer) + s3 = Scenario(lambda: None, name='c', number=5, timer=fake_timer) + group = ScenarioGroup(s1, s2, s3) + group.cli() + ''') + + +class TestScenarioGroupCliDividers: + def test_single_scenario_no_divider(self) -> None: + proc = run_script(single_scenario_script()) + assert '---' not in proc.stdout + + def test_three_scenarios_two_dividers(self) -> None: + proc = run_script(three_scenario_script()) + assert proc.stdout.count('---') == 2 diff --git a/tests/documentation/__init__.py b/tests/documentation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/documentation/test_readme.md b/tests/documentation/test_readme.md new file mode 100644 index 0000000..e69de29 diff --git a/tests/documentation/test_readme.py b/tests/documentation/test_readme.py new file mode 100644 index 0000000..a537711 --- /dev/null +++ b/tests/documentation/test_readme.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import math +from functools import partial + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestQuickStart: + def test_quick_start_basic(self) -> None: + def build_list() -> list[int]: + return list(range(1000)) + + scenario = Scenario(build_list, name='build_list', number=500) + result = scenario.run() + assert len(result.durations) == 500 + assert isinstance(result.mean, float) + assert 
isinstance(result.best, float) + assert isinstance(result.worst, float) + + +class TestScenarioConstructor: + def test_full_constructor(self) -> None: + scenario = Scenario( + sorted, + args=[[3, 1, 2]], + name='sort_three_items', + doc='Sort a list of three integers.', + number=10000, + ) + assert scenario.name == 'sort_three_items' + assert scenario.doc == 'Sort a list of three integers.' + assert scenario.number == 10000 + + def test_partial_kwargs(self) -> None: + scenario = Scenario( + partial(sorted, key=lambda x: -x), + args=[[3, 1, 2]], + name='sort_descending', + ) + result = scenario.run() + assert isinstance(result, BenchmarkResult) + + def test_multiple_positional_args(self) -> None: + scenario = Scenario(pow, args=[2, 10], name='power') + result = scenario.run() + assert isinstance(result.mean, float) + + +class TestScenarioRun: + def test_run_with_warmup(self) -> None: + scenario = Scenario(lambda: list(range(100)), name='build', number=1000) + result = scenario.run(warmup=100) + assert len(result.durations) == 1000 + + +class TestScenarioGroupCreation: + def test_direct_construction(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = ScenarioGroup(s1, s2) + assert isinstance(group, ScenarioGroup) + + def test_empty_group(self) -> None: + empty = ScenarioGroup() + assert len(empty.run()) == 0 + + def test_plus_operator_two_scenarios(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = s1 + s2 + assert type(group).__name__ == 'ScenarioGroup' + + def test_scenario_plus_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + s3 = Scenario(lambda: None, name='s3') + group = ScenarioGroup(s1, s2) + extended = group + s3 + assert len(extended.run()) == 3 + + def test_reverse_scenario_plus_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + s3 = 
Scenario(lambda: None, name='s3') + group = ScenarioGroup(s1, s2) + also_ok = s3 + group + assert len(also_ok.run()) == 3 + + def test_group_plus_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + s3 = Scenario(lambda: None, name='s3') + g1 = ScenarioGroup(s1) + g2 = ScenarioGroup(s2, s3) + combined = g1 + g2 + assert len(combined.run()) == 3 + + +class TestScenarioGroupRun: + def test_run_order_preserved(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = ScenarioGroup(s1, s2) + results = group.run(warmup=50) + assert results[0].scenario is not None + assert results[1].scenario is not None + assert results[0].scenario.name == 's1' + assert results[1].scenario.name == 's2' + + +class TestBenchmarkResultFields: + def test_fields_documentation_example(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert len(result.durations) == 100 + assert result.is_primary is True + + def test_durations_is_tuple(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert isinstance(result.durations, tuple) + + +class TestPercentile: + def test_percentile_documentation_example(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + trimmed = result.percentile(95) + assert trimmed.is_primary is False + assert len(trimmed.durations) == math.ceil(100 * 95 / 100) + + def test_percentile_chaining(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + chained = result.percentile(90).percentile(50) + assert len(chained.durations) == math.ceil(math.ceil(100 * 90 / 100) * 50 / 100) + + +class TestP95P99: + def test_p95_cached(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert len(result.p95.durations) == math.ceil(100 * 95 / 100) + assert result.p95.is_primary is False + assert result.p95 is result.p95 + + def test_p99_cached(self) 
-> None: + result = Scenario(lambda: None, name='noop', number=100).run() + assert len(result.p99.durations) == math.ceil(100 * 99 / 100) + assert result.p99.is_primary is False + assert result.p99 is result.p99 + + +class TestJsonRoundTrip: + def test_json_round_trip(self) -> None: + result = Scenario(lambda: None, name='noop', number=100).run() + json_str = result.to_json() + restored = BenchmarkResult.from_json(json_str) + assert restored.scenario is None + assert restored.mean == result.mean + assert restored.durations == result.durations + assert restored.is_primary == result.is_primary diff --git a/tests/typing/__init__.py b/tests/typing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/typing/test_benchmark_result_types.py b/tests/typing/test_benchmark_result_types.py new file mode 100644 index 0000000..80f4a41 --- /dev/null +++ b/tests/typing/test_benchmark_result_types.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import json + +import pytest + +from microbenchmark import BenchmarkResult, Scenario + + +def make_result() -> BenchmarkResult: + return Scenario(lambda: None, name='s', number=10).run() + + +class TestBenchmarkResultPositiveTypes: + def test_percentile_returns_benchmark_result(self) -> None: + result = make_result() + trimmed = result.percentile(50) + assert isinstance(trimmed, BenchmarkResult) + + def test_p95_returns_benchmark_result(self) -> None: + result = make_result() + assert isinstance(result.p95, BenchmarkResult) + + def test_p99_returns_benchmark_result(self) -> None: + result = make_result() + assert isinstance(result.p99, BenchmarkResult) + + def test_to_json_returns_str(self) -> None: + result = make_result() + assert isinstance(result.to_json(), str) + + def test_from_json_returns_benchmark_result(self) -> None: + result = make_result() + restored = BenchmarkResult.from_json(result.to_json()) + assert isinstance(restored, BenchmarkResult) + + def test_mean_is_float(self) -> None: + result = 
make_result() + assert isinstance(result.mean, float) + + def test_best_is_float(self) -> None: + result = make_result() + assert isinstance(result.best, float) + + def test_worst_is_float(self) -> None: + result = make_result() + assert isinstance(result.worst, float) + + def test_is_primary_is_bool(self) -> None: + result = make_result() + assert isinstance(result.is_primary, bool) + + def test_durations_is_tuple(self) -> None: + result = make_result() + assert isinstance(result.durations, tuple) + + +class TestBenchmarkResultNegativeTypes: + def test_percentile_zero_raises(self) -> None: + result = make_result() + with pytest.raises(ValueError, match='percentile'): + result.percentile(0) + + def test_percentile_negative_raises(self) -> None: + result = make_result() + with pytest.raises(ValueError, match='percentile'): + result.percentile(-1) + + def test_percentile_above_100_raises(self) -> None: + result = make_result() + with pytest.raises(ValueError, match='percentile'): + result.percentile(101) + + def test_from_json_invalid_raises(self) -> None: + with pytest.raises(json.JSONDecodeError): + BenchmarkResult.from_json('{not valid}') + + def test_from_json_empty_object_raises(self) -> None: + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json('{}') diff --git a/tests/typing/test_scenario_group_types.py b/tests/typing/test_scenario_group_types.py new file mode 100644 index 0000000..9415151 --- /dev/null +++ b/tests/typing/test_scenario_group_types.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestScenarioGroupPositiveTypes: + def test_empty_construction(self) -> None: + g = ScenarioGroup() + assert isinstance(g, ScenarioGroup) + + def test_single_scenario(self) -> None: + s = Scenario(lambda: None, name='s') + g = ScenarioGroup(s) + assert isinstance(g, ScenarioGroup) + + def test_multiple_scenarios(self) -> None: + s1 = 
Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + g = ScenarioGroup(s1, s2) + assert isinstance(g, ScenarioGroup) + + def test_run_returns_list(self) -> None: + g = ScenarioGroup() + result = g.run() + assert isinstance(result, list) + + def test_run_returns_list_of_benchmark_results(self) -> None: + s = Scenario(lambda: None, name='s', number=1) + g = ScenarioGroup(s) + results = g.run() + for r in results: + assert isinstance(r, BenchmarkResult) + + def test_add_scenario_returns_group(self) -> None: + g = ScenarioGroup() + s = Scenario(lambda: None, name='s') + result = g + s + assert isinstance(result, ScenarioGroup) + + def test_add_group_returns_group(self) -> None: + g1 = ScenarioGroup() + g2 = ScenarioGroup() + result = g1 + g2 + assert isinstance(result, ScenarioGroup) + + +class TestScenarioGroupNegativeTypes: + def test_add_int_returns_not_implemented(self) -> None: + g = ScenarioGroup() + result = g.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented diff --git a/tests/typing/test_scenario_types.py b/tests/typing/test_scenario_types.py new file mode 100644 index 0000000..08759f7 --- /dev/null +++ b/tests/typing/test_scenario_types.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +import time + +import pytest + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + +# --------------------------------------------------------------------------- +# Positive type checks (runtime — confirm valid usage works) +# --------------------------------------------------------------------------- + +class TestScenarioPositiveTypes: + def test_callable_function(self) -> None: + s = Scenario(lambda: None, name='s') + assert isinstance(s, Scenario) + + def test_args_none(self) -> None: + s = Scenario(lambda: None, args=None, name='s') + assert isinstance(s, Scenario) + + def test_args_list(self) -> None: + s = Scenario(sum, args=[[1, 2, 3]], name='s') + assert isinstance(s, Scenario) + + def 
test_number_int(self) -> None: + s = Scenario(lambda: None, name='s', number=100) + assert isinstance(s, Scenario) + + def test_timer_callable(self) -> None: + s = Scenario(lambda: None, name='s', timer=time.perf_counter) + assert isinstance(s, Scenario) + + def test_run_returns_benchmark_result(self) -> None: + result = Scenario(lambda: None, name='s', number=1).run() + assert isinstance(result, BenchmarkResult) + + def test_run_with_warmup_returns_benchmark_result(self) -> None: + result = Scenario(lambda: None, name='s', number=1).run(warmup=0) + assert isinstance(result, BenchmarkResult) + + def test_add_scenario_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = s1 + s2 + assert isinstance(group, ScenarioGroup) + + def test_add_group_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + g = ScenarioGroup() + group = s1 + g + assert isinstance(group, ScenarioGroup) + + +# --------------------------------------------------------------------------- +# Negative type checks (runtime ValueError/TypeError for invalid inputs) +# --------------------------------------------------------------------------- + +class TestScenarioNegativeTypes: + def test_number_zero_raises(self) -> None: + with pytest.raises(ValueError, match='number'): + Scenario(lambda: None, name='s', number=0) + + def test_number_negative_raises(self) -> None: + with pytest.raises(ValueError, match='number'): + Scenario(lambda: None, name='s', number=-5) + + def test_add_int_returns_not_implemented(self) -> None: + s = Scenario(lambda: None, name='s') + result = s.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented diff --git a/tests/units/__init__.py b/tests/units/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/units/test_benchmark_result.py b/tests/units/test_benchmark_result.py new file mode 100644 index 0000000..e463af4 --- /dev/null +++ 
b/tests/units/test_benchmark_result.py @@ -0,0 +1,411 @@ +from __future__ import annotations + +import json +import math + +import pytest + +from microbenchmark import BenchmarkResult, Scenario + + +def make_result( + durations: tuple[float, ...], + scenario: Scenario | None = None, + is_primary: bool = True, +) -> BenchmarkResult: + if scenario is None: + scenario = Scenario(lambda: None, name='test', number=len(durations) or 1) + return BenchmarkResult(scenario=scenario, durations=durations, is_primary=is_primary) + + +class TestBenchmarkResultFields: + def test_all_fields_stored(self) -> None: + scenario = Scenario(lambda: None, name='s', number=3) + result = BenchmarkResult( + scenario=scenario, + durations=(0.1, 0.2, 0.3), + is_primary=True, + ) + assert result.scenario is scenario + assert result.durations == (0.1, 0.2, 0.3) + assert result.is_primary is True + + def test_durations_is_tuple(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + assert isinstance(result.durations, tuple) + + def test_empty_durations_raises(self) -> None: + # BenchmarkResult does not validate durations length; + # creating with empty tuple causes ZeroDivisionError in __post_init__ + s = Scenario(lambda: None, name='s', number=1) + with pytest.raises(ZeroDivisionError): + BenchmarkResult(scenario=s, durations=(), is_primary=True) + + def test_inf_durations_fields(self) -> None: + result = make_result((float('inf'), 1.0, 2.0)) + assert math.isinf(result.worst) + assert math.isinf(result.mean) + assert result.best == 1.0 + + def test_nan_durations_fields(self) -> None: + result = make_result((float('nan'),)) + assert math.isnan(result.mean) + assert math.isnan(result.best) + assert math.isnan(result.worst) + + def test_mean_computed_correctly(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + expected = math.fsum([1.0, 2.0, 3.0]) / 3 + assert result.mean == expected + + def test_mean_uses_fsum_precision(self) -> None: + # fsum handles cancellation correctly; plain sum 
loses precision + # for (1e20, 1.0, -1e20): fsum=1.0 (exact), but sum=0.0 (catastrophic cancellation) + durations = (1e20, 1.0, -1e20) + result = make_result(durations) + assert result.mean == 1.0 / 3 # exact: fsum gives 1.0, divided by 3 + + def test_best_is_min(self) -> None: + result = make_result((3.0, 1.0, 2.0)) + assert result.best == 1.0 + + def test_worst_is_max(self) -> None: + result = make_result((3.0, 1.0, 2.0)) + assert result.worst == 3.0 + + def test_is_primary_true_by_default(self) -> None: + result = make_result((1.0,)) + assert result.is_primary is True + + def test_is_primary_false(self) -> None: + result = make_result((1.0,), is_primary=False) + assert result.is_primary is False + + def test_single_duration(self) -> None: + result = make_result((0.5,)) + assert result.best == 0.5 + assert result.worst == 0.5 + assert result.mean == 0.5 + + def test_all_equal_durations(self) -> None: + result = make_result((0.1, 0.1, 0.1)) + assert result.best == 0.1 + assert result.worst == 0.1 + assert result.mean == pytest.approx(0.1) + + def test_scenario_identity(self) -> None: + scenario = Scenario(lambda: None, name='check') + result = BenchmarkResult(scenario=scenario, durations=(0.1,), is_primary=True) + assert result.scenario is scenario + + def test_scenario_none(self) -> None: + result = BenchmarkResult(scenario=None, durations=(0.1,), is_primary=True) + assert result.scenario is None + + +class TestPercentile: + def test_percentile_returns_benchmark_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 11))) + trimmed = result.percentile(90) + assert isinstance(trimmed, BenchmarkResult) + + def test_percentile_is_primary_false(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 11))) + trimmed = result.percentile(90) + assert trimmed.is_primary is False + + def test_percentile_count_nearest_rank(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + trimmed = result.percentile(95) + 
expected_count = math.ceil(100 * 95 / 100) + assert len(trimmed.durations) == expected_count + + def test_percentile_contains_fastest(self) -> None: + result = make_result((5.0, 1.0, 3.0, 2.0, 4.0)) + trimmed = result.percentile(60) + k = math.ceil(5 * 60 / 100) + assert len(trimmed.durations) == k + # should be the smallest k values in sorted order + sorted_original = sorted(result.durations) + assert trimmed.durations == tuple(sorted_original[:k]) + + def test_percentile_100_returns_all(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + trimmed = result.percentile(100) + assert len(trimmed.durations) == 3 + assert trimmed.is_primary is False + assert trimmed.durations == tuple(sorted(result.durations)) + + def test_percentile_small_number(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + trimmed = result.percentile(50) + expected = math.ceil(3 * 50 / 100) + assert len(trimmed.durations) == expected + + def test_percentile_very_small_positive(self) -> None: + result = make_result((1.0, 2.0, 3.0, 4.0, 5.0)) + trimmed = result.percentile(0.001) + # ceil(5 * 0.001 / 100) = ceil(0.00005) = 1 + assert len(trimmed.durations) == 1 + assert trimmed.durations == (1.0,) + + def test_percentile_99(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + trimmed = result.percentile(99) + assert len(trimmed.durations) == math.ceil(100 * 99 / 100) + + def test_percentile_mean_recomputed(self) -> None: + result = make_result((1.0, 2.0, 3.0, 4.0, 10.0)) + trimmed = result.percentile(80) + expected_durations = sorted(result.durations)[:4] + expected_mean = math.fsum(expected_durations) / 4 + assert trimmed.mean == pytest.approx(expected_mean) + + def test_percentile_on_derived_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + derived = result.percentile(90).percentile(50) + assert isinstance(derived, BenchmarkResult) + assert derived.is_primary is False + # 100 → p90 → 90 elements → p50 → ceil(90 * 50/100) = 
45 + assert len(derived.durations) == 45 + + def test_percentile_scenario_preserved(self) -> None: + scenario = Scenario(lambda: None, name='s') + result = BenchmarkResult(scenario=scenario, durations=(1.0, 2.0, 3.0), is_primary=True) + trimmed = result.percentile(100) + assert trimmed.scenario is scenario + + def test_percentile_0_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(0) + + def test_percentile_negative_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(-5) + + def test_percentile_above_100_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(101) + + def test_percentile_nan_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(float('nan')) + + def test_percentile_inf_raises(self) -> None: + result = make_result((1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match='percentile'): + result.percentile(float('inf')) + + def test_percentile_preserves_fsum_mean(self) -> None: + durations = tuple(0.1 * i for i in range(1, 11)) + result = make_result(durations) + trimmed = result.percentile(80) + sorted_d = sorted(durations) + k = math.ceil(10 * 80 / 100) + expected = math.fsum(sorted_d[:k]) / k + assert trimmed.mean == pytest.approx(expected) + + +class TestCachedProperties: + def test_p95_returns_benchmark_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert isinstance(result.p95, BenchmarkResult) + + def test_p99_returns_benchmark_result(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert isinstance(result.p99, BenchmarkResult) + + def test_p95_is_cached(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert 
result.p95 is result.p95 + + def test_p99_is_cached(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p99 is result.p99 + + def test_p95_count(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert len(result.p95.durations) == math.ceil(100 * 95 / 100) + + def test_p99_count(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert len(result.p99.durations) == math.ceil(100 * 99 / 100) + + def test_p95_is_primary_false(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p95.is_primary is False + + def test_p99_is_primary_false(self) -> None: + result = make_result(tuple(float(i) for i in range(1, 101))) + assert result.p99.is_primary is False + + +class TestSerialization: + def test_to_json_returns_string(self) -> None: + result = make_result((0.1, 0.2)) + assert isinstance(result.to_json(), str) + + def test_to_json_valid_json(self) -> None: + result = make_result((0.1, 0.2)) + data = json.loads(result.to_json()) + assert isinstance(data, dict) + assert isinstance(data['durations'], list) + assert isinstance(data['is_primary'], bool) + assert 'scenario' in data + + def test_to_json_contains_durations(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + data = json.loads(result.to_json()) + assert 'durations' in data + + def test_to_json_contains_is_primary(self) -> None: + result = make_result((0.1,)) + data = json.loads(result.to_json()) + assert 'is_primary' in data + + def test_to_json_contains_scenario_metadata(self) -> None: + s = Scenario(lambda: None, name='myname', doc='mydoc', number=42) + result = BenchmarkResult(scenario=s, durations=(0.1,), is_primary=True) + data = json.loads(result.to_json()) + assert data['scenario']['name'] == 'myname' + assert data['scenario']['doc'] == 'mydoc' + assert data['scenario']['number'] == 42 + + def test_to_json_derived_is_primary_false(self) -> None: + result = 
make_result((1.0, 2.0, 3.0), is_primary=False) + data = json.loads(result.to_json()) + assert data['is_primary'] is False + + def test_from_json_round_trip_durations(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.durations == result.durations + + def test_from_json_round_trip_mean(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.mean == result.mean + + def test_from_json_round_trip_best_worst(self) -> None: + result = make_result((0.1, 0.2, 0.3)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.best == result.best + assert restored.worst == result.worst + + def test_from_json_scenario_is_none(self) -> None: + result = make_result((0.1, 0.2)) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.scenario is None + + def test_from_json_preserves_is_primary(self) -> None: + result = make_result((0.1,), is_primary=False) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.is_primary is False + + def test_from_json_primary_preserved(self) -> None: + result = make_result((0.1,), is_primary=True) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.is_primary is True + + def test_from_json_fp_precision(self) -> None: + durations = tuple(0.1 * i for i in range(1, 6)) + result = make_result(durations) + restored = BenchmarkResult.from_json(result.to_json()) + assert restored.mean == pytest.approx(result.mean) + + def test_from_json_invalid_json_raises(self) -> None: + with pytest.raises(json.JSONDecodeError): + BenchmarkResult.from_json('{not valid json}') + + def test_from_json_missing_fields_raises(self) -> None: + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json('{}') + + def test_from_json_missing_durations_raises(self) -> None: + data = json.dumps({'is_primary': True, 'scenario': 
{'name': 'x', 'doc': '', 'number': 1}}) + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json(data) + + def test_to_json_scenario_none_is_null(self) -> None: + result = BenchmarkResult(scenario=None, durations=(0.1,), is_primary=True) + data = json.loads(result.to_json()) + assert data['scenario'] is None + + def test_from_json_with_scenario_field_ignored(self) -> None: + payload = json.dumps({ + 'durations': [0.1, 0.2], + 'is_primary': True, + 'scenario': {'name': 'x', 'doc': '', 'number': 1}, + }) + restored = BenchmarkResult.from_json(payload) + assert restored.scenario is None + assert restored.durations == (0.1, 0.2) + + def test_from_json_durations_not_list_raises(self) -> None: + payload = json.dumps({'durations': 'not a list', 'is_primary': True}) + with pytest.raises(ValueError, match='durations'): + BenchmarkResult.from_json(payload) + + def test_from_json_is_primary_not_bool_raises(self) -> None: + payload = json.dumps({'durations': [0.1], 'is_primary': 'true'}) + with pytest.raises(ValueError, match='is_primary'): + BenchmarkResult.from_json(payload) + + def test_from_json_is_primary_int_raises(self) -> None: + # int 1 is not a bool even though bool is a subclass of int + payload = json.dumps({'durations': [0.1], 'is_primary': 1}) + with pytest.raises(ValueError, match='is_primary'): + BenchmarkResult.from_json(payload) + + def test_from_json_durations_with_invalid_element_raises(self) -> None: + payload = '{"durations": [0.1, "not_a_number"], "is_primary": true}' + with pytest.raises(ValueError, match='could not convert'): + BenchmarkResult.from_json(payload) + + def test_from_json_durations_with_null_element_raises(self) -> None: + payload = json.dumps({'durations': [0.1, None], 'is_primary': True}) + with pytest.raises(TypeError): + BenchmarkResult.from_json(payload) + + def test_from_json_empty_durations_list_raises(self) -> None: + payload = json.dumps({'durations': [], 'is_primary': True}) + with 
pytest.raises(ZeroDivisionError): + BenchmarkResult.from_json(payload) + + def test_to_json_inf_produces_non_standard_json(self) -> None: + # Python's json module allow_nan=True by default: inf/nan → Infinity/NaN + result = make_result((float('inf'), 1.0)) + j = result.to_json() + assert 'Infinity' in j + + def test_to_json_nan_round_trips_in_python(self) -> None: + # NaN round-trips through Python's json module (non-standard JSON) + result = make_result((float('nan'),)) + restored = BenchmarkResult.from_json(result.to_json()) + assert math.isnan(restored.mean) + + def test_percentile_single_element(self) -> None: + result = make_result((5.0,)) + trimmed = result.percentile(50) + assert trimmed.durations == (5.0,) + assert trimmed.is_primary is False + + def test_percentile_100_single_element(self) -> None: + result = make_result((5.0,)) + trimmed = result.percentile(100) + assert trimmed.durations == (5.0,) + assert trimmed.is_primary is False + + def test_from_json_missing_is_primary_raises(self) -> None: + payload = json.dumps({'durations': [0.1, 0.2]}) + with pytest.raises(ValueError, match='required fields'): + BenchmarkResult.from_json(payload) + + def test_from_json_not_dict_raises(self) -> None: + payload = json.dumps([1, 2, 3]) + with pytest.raises(ValueError, match='JSON must be an object'): + BenchmarkResult.from_json(payload) diff --git a/tests/units/test_init.py b/tests/units/test_init.py new file mode 100644 index 0000000..d03e85a --- /dev/null +++ b/tests/units/test_init.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import microbenchmark +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestPublicImports: + def test_scenario_importable(self) -> None: + assert Scenario is not None + + def test_scenario_group_importable(self) -> None: + assert ScenarioGroup is not None + + def test_benchmark_result_importable(self) -> None: + assert BenchmarkResult is not None + + def test_all_defined(self) -> None: + assert
hasattr(microbenchmark, '__all__') + + def test_all_contains_scenario(self) -> None: + assert 'Scenario' in microbenchmark.__all__ + + def test_all_contains_scenario_group(self) -> None: + assert 'ScenarioGroup' in microbenchmark.__all__ + + def test_all_contains_benchmark_result(self) -> None: + assert 'BenchmarkResult' in microbenchmark.__all__ + + def test_all_contains_exactly_three_items(self) -> None: + assert set(microbenchmark.__all__) == {'Scenario', 'ScenarioGroup', 'BenchmarkResult'} diff --git a/tests/units/test_scenario.py b/tests/units/test_scenario.py new file mode 100644 index 0000000..047bcad --- /dev/null +++ b/tests/units/test_scenario.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +import pytest + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +class TestScenarioConstruction: + def test_minimal_construction(self) -> None: + s = Scenario(lambda: None, name='minimal') + assert s.name == 'minimal' + + def test_full_construction(self) -> None: + timer_calls = [0.0] + + def fake_timer() -> float: + timer_calls[0] += 0.001 + return timer_calls[0] + + s = Scenario( + sum, + args=[[1, 2, 3]], + name='full', + doc='A full scenario', + number=50, + timer=fake_timer, + ) + assert s.name == 'full' + assert s.doc == 'A full scenario' + + def test_name_stored(self) -> None: + s = Scenario(lambda: None, name='myname') + assert s.name == 'myname' + + def test_doc_stored(self) -> None: + s = Scenario(lambda: None, name='s', doc='my doc') + assert s.doc == 'my doc' + + def test_doc_default_empty(self) -> None: + s = Scenario(lambda: None, name='s') + assert s.doc == '' + + def test_number_default(self) -> None: + s = Scenario(lambda: None, name='s') + assert s.number == 1000 + + def test_number_custom(self) -> None: + s = Scenario(lambda: None, name='s', number=42) + assert s.number == 42 + + def test_args_none_default(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) 
+ + s = Scenario(fn, name='s', number=1) + s.run() + assert call_log == [()] + + def test_args_empty_list(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + s = Scenario(fn, args=[], name='s', number=1) + s.run() + assert call_log == [()] + + def test_args_with_values(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + s = Scenario(fn, args=[1, 2, 3], name='s', number=1) + s.run() + assert call_log == [(1, 2, 3)] + + def test_args_copied_on_construction(self) -> None: + call_log: list[tuple[object, ...]] = [] + + def fn(*a: object) -> None: + call_log.append(a) + + original = [10, 20] + s = Scenario(fn, args=original, name='s', number=1) + original.append(30) # mutate after construction + s.run() + assert call_log == [(10, 20)] # should not include 30 + + def test_number_zero_raises(self) -> None: + with pytest.raises(ValueError, match='number'): + Scenario(lambda: None, name='s', number=0) + + def test_number_negative_raises(self) -> None: + with pytest.raises(ValueError, match='number'): + Scenario(lambda: None, name='s', number=-1) + + def test_name_required_raises(self) -> None: + with pytest.raises(TypeError, match='required keyword-only argument'): + Scenario(lambda: None) # type: ignore[call-arg] + + def test_run_negative_warmup_acts_as_zero(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=3) + result = s.run(warmup=-5) + assert len(result.durations) == 3 + assert counter[0] == 3 # negative warmup = range(-5) = empty, silently ignored + + +class TestScenarioRun: + def test_run_returns_benchmark_result(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run() + assert isinstance(result, BenchmarkResult) + + def test_run_calls_function_number_times(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', 
number=7) + s.run() + assert counter[0] == 7 + + def test_run_durations_length_equals_number(self) -> None: + s = Scenario(lambda: None, name='s', number=10) + result = s.run() + assert len(result.durations) == 10 + + def test_run_with_warmup_total_calls(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + s.run(warmup=3) + assert counter[0] == 8 + + def test_run_warmup_not_in_durations(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + result = s.run(warmup=10) + assert len(result.durations) == 5 + assert counter[0] == 15 # 10 warmup + 5 measured + + def test_run_warmup_zero(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + s.run(warmup=0) + assert counter[0] == 5 + + def test_run_uses_custom_timer(self) -> None: + # timer produces: 0.000, 0.001, 0.002, 0.003, ... (infinite) + # each measured interval: end - start = 0.001 + import itertools # noqa: PLC0415 + counter = itertools.count(0) + + def fake_timer() -> float: + return next(counter) * 0.001 + + s = Scenario(lambda: None, name='s', number=3, timer=fake_timer) + result = s.run() + assert result.durations == pytest.approx((0.001, 0.001, 0.001)) + + def test_custom_timer_stateful(self) -> None: + # timer is called before and after each run; warmup also consumes timer calls + tick = [0] + + def fake_timer() -> float: + tick[0] += 1 + return float(tick[0]) + + s = Scenario(lambda: None, name='s', number=3, timer=fake_timer) + result = s.run(warmup=2) + # 2 warmup * 2 timer calls + 3 measured * 2 timer calls = 10 total timer calls + assert tick[0] == 10 + # only the 3 measured durations should be stored + assert len(result.durations) == 3 + + def test_run_result_scenario_is_self(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run() + assert result.scenario is s + + def test_run_twice_independent(self) 
-> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=5) + r1 = s.run() + r2 = s.run() + assert len(r1.durations) == 5 + assert len(r2.durations) == 5 + assert r1 is not r2 + + def test_run_propagates_exception(self) -> None: + def bad() -> None: + raise RuntimeError('oops') + + s = Scenario(bad, name='s', number=1) + with pytest.raises(RuntimeError, match='oops'): + s.run() + + def test_run_result_is_primary(self) -> None: + s = Scenario(lambda: None, name='s', number=5) + result = s.run() + assert result.is_primary is True + + def test_run_args_incompatible_raises_type_error(self) -> None: + s = Scenario(lambda: None, args=[1, 2], name='s', number=1) + with pytest.raises(TypeError, match='positional argument'): + s.run() + + def test_run_exception_mid_iteration(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + if counter[0] == 3: + raise RuntimeError('fail on 3rd call') + + s = Scenario(fn, name='s', number=5) + with pytest.raises(RuntimeError, match='fail on 3rd call'): + s.run() + assert counter[0] == 3 + + def test_run_exception_during_warmup_propagates(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + if counter[0] == 2: + raise RuntimeError('fail in warmup') + + s = Scenario(fn, name='s', number=5) + with pytest.raises(RuntimeError, match='fail in warmup'): + s.run(warmup=3) + assert counter[0] == 2 # stopped at 2nd warmup call + + def test_run_number_one(self) -> None: + tick = [0] + + def fake_timer() -> float: + tick[0] += 1 + return float(tick[0]) + + s = Scenario(lambda: None, name='s', number=1, timer=fake_timer) + result = s.run() + assert len(result.durations) == 1 + assert result.durations[0] == pytest.approx(1.0) # end(2) - start(1) = 1 + + +class TestScenarioAdd: + def test_add_scenario_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + group = s1 + s2 + assert isinstance(group, ScenarioGroup) 
+ + def test_add_scenario_group_returns_group(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + g = ScenarioGroup(s2) + group = s1 + g + assert isinstance(group, ScenarioGroup) + results = group.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + + def test_add_int_raises_type_error(self) -> None: + s = Scenario(lambda: None, name='s') + with pytest.raises(TypeError): + _ = 42 + s # type: ignore[operator] + + def test_add_unknown_type_returns_not_implemented(self) -> None: + s = Scenario(lambda: None, name='s') + result = s.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented + + def test_radd_scenario_scenario(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + # s2.__radd__(s1) = ScenarioGroup(s1, s2) + group = s2.__radd__(s1) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + + def test_radd_group_scenario(self) -> None: + s1 = Scenario(lambda: None, name='s1') + s2 = Scenario(lambda: None, name='s2') + g = ScenarioGroup(s1) + # s2.__radd__(g) = ScenarioGroup(*g._scenarios, s2) = [s1, s2] + group = s2.__radd__(g) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 diff --git a/tests/units/test_scenario_group.py b/tests/units/test_scenario_group.py new file mode 100644 index 0000000..9d1c0ae --- /dev/null +++ b/tests/units/test_scenario_group.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import pytest + +from microbenchmark import BenchmarkResult, Scenario, ScenarioGroup + + +def make_scenario(name: str = 's', number: int = 5) -> Scenario: + return Scenario(lambda: None, name=name, number=number) + + +class TestScenarioGroupConstruction: + def test_empty_group(self) -> None: + g = ScenarioGroup() + assert isinstance(g, ScenarioGroup) + + def 
test_single_scenario(self) -> None: + s = make_scenario('s1') + g = ScenarioGroup(s) + results = g.run() + assert len(results) == 1 + + def test_multiple_scenarios(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + g = ScenarioGroup(s1, s2, s3) + results = g.run() + assert len(results) == 3 + + +class TestScenarioGroupOperator: + def test_scenario_plus_scenario(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + group = s1 + s2 + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 2 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + + def test_group_plus_scenario(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = ScenarioGroup(s1, s2) + s3 + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_scenario_plus_group(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = s1 + ScenarioGroup(s2, s3) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_group_plus_group(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = ScenarioGroup(s1) + ScenarioGroup(s2, s3) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_triple_sum_is_flat(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + group = s1 + s2 + s3 + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 3 + assert 
results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_add_returns_new_group(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + g = ScenarioGroup(s1) + new_g = g + s2 + assert new_g is not g + assert len(g._scenarios) == 1 # original not mutated + assert new_g._scenarios[0] is s1 + assert new_g._scenarios[1] is s2 + + def test_add_unknown_type_returns_not_implemented(self) -> None: + g = ScenarioGroup() + result = g.__add__(42) # type: ignore[arg-type] + assert result is NotImplemented + + def test_radd_unknown_type_returns_not_implemented(self) -> None: + g = ScenarioGroup() + result = g.__radd__(42) # type: ignore[arg-type] + assert result is NotImplemented + + def test_radd_scenario_to_group(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + g = ScenarioGroup(s1) + group = g.__radd__(s2) + assert isinstance(group, ScenarioGroup) + assert len(group.run()) == 2 + + def test_radd_group_to_group(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + g1 = ScenarioGroup(s1, s2) + g2 = ScenarioGroup(s3) + # g2.__radd__(g1) = ScenarioGroup(*g1._scenarios, *g2._scenarios) = [s1, s2, s3] + group = g2.__radd__(g1) + assert isinstance(group, ScenarioGroup) + results = group.run() + assert len(results) == 3 + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_duplicate_scenarios(self) -> None: + s = make_scenario('s') + group = s + s + results = group.run() + assert len(results) == 2 + + def test_multiple_groups_flat(self) -> None: + scenarios = [make_scenario(f's{i}') for i in range(5)] + g1 = ScenarioGroup(scenarios[0], scenarios[1]) + g2 = ScenarioGroup(scenarios[2], scenarios[3]) + g3 = ScenarioGroup(scenarios[4]) + combined = g1 + g2 + g3 + assert len(combined.run()) == 5 + + +class TestScenarioGroupRun: + def test_run_returns_list(self) -> None: + g = ScenarioGroup() + result = 
g.run() + assert isinstance(result, list) + + def test_empty_group_returns_empty_list(self) -> None: + g = ScenarioGroup() + assert g.run() == [] + + def test_empty_group_run_with_warmup(self) -> None: + g = ScenarioGroup() + assert g.run(warmup=10) == [] + + def test_run_negative_warmup_acts_as_zero(self) -> None: + counter = [0] + + def fn() -> None: + counter[0] += 1 + + s = Scenario(fn, name='s', number=3) + g = ScenarioGroup(s) + results = g.run(warmup=-5) + assert len(results) == 1 + assert len(results[0].durations) == 3 + assert counter[0] == 3 # range(-5) == empty, so no warmup calls + + def test_run_returns_benchmark_results(self) -> None: + s = make_scenario() + g = ScenarioGroup(s) + results = g.run() + for r in results: + assert isinstance(r, BenchmarkResult) + + def test_run_order_preserved(self) -> None: + s1, s2, s3 = make_scenario('s1'), make_scenario('s2'), make_scenario('s3') + g = ScenarioGroup(s1, s2, s3) + results = g.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + assert results[2].scenario is s3 + + def test_run_with_warmup(self) -> None: + counters = [0, 0] + + def make_fn(idx: int) -> object: + def fn() -> None: + counters[idx] += 1 + return fn + + s1 = Scenario(make_fn(0), name='a', number=5) # type: ignore[arg-type] + s2 = Scenario(make_fn(1), name='b', number=5) # type: ignore[arg-type] + g = ScenarioGroup(s1, s2) + results = g.run(warmup=3) + # each scenario: 3 warmup + 5 measured = 8 calls + assert counters[0] == 8 + assert counters[1] == 8 + for r in results: + assert len(r.durations) == 5 + + def test_run_correct_scenario_reference(self) -> None: + s1, s2 = make_scenario('s1'), make_scenario('s2') + g = ScenarioGroup(s1, s2) + results = g.run() + assert results[0].scenario is s1 + assert results[1].scenario is s2 + + def test_run_warmup_different_numbers(self) -> None: + s1 = Scenario(lambda: None, name='a', number=3) + s2 = Scenario(lambda: None, name='b', number=7) + g = ScenarioGroup(s1, s2) + results 
= g.run(warmup=2) + assert len(results[0].durations) == 3 + assert len(results[1].durations) == 7 + + def test_run_propagates_exception_from_scenario(self) -> None: + def bad() -> None: + raise RuntimeError('scenario failed') + + s1 = make_scenario('s1') + s2 = Scenario(bad, name='s2', number=1) + g = ScenarioGroup(s1, s2) + with pytest.raises(RuntimeError, match='scenario failed'): + g.run()