diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 60b8025c6..80c9cfe63 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -19,6 +19,7 @@ RUN apt-get update \
     libgsl-dev \
     make \
     pkg-config \
+    ripgrep \
     tar \
     unzip \
     xz-utils \
@@ -42,6 +43,7 @@ RUN mkdir -p \
         wheel \
         findent \
         fortls \
+        matplotlib \
         numpy \
         scipy \
         sympy \
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
index 13cc0afae..a794bb95e 100644
--- a/.devcontainer/README.md
+++ b/.devcontainer/README.md
@@ -62,3 +62,6 @@ devcontainer's selected interpreter may need to be pointed explicitly at
 `/home/vscode/.local/share/camb-devcontainer/.venv/bin/python3`.
 
 If you explicitly want a standards-based editable install after the container is up, you can still run `python -m pip install --no-build-isolation --no-deps -e .`, but that setuptools metadata phase is the part that remains slow on this bind-mounted checkout.
+
+
+The container includes CosmoRec and HyRec. If not needed, you can speed builds by doing `export RECOMBINATION_FILES=recfast`
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index db5a5c0e2..223e20fff 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -10,6 +10,9 @@
   "runArgs": [
     "--cpus=8"
   ],
+  "features": {
+    "ghcr.io/anthropics/devcontainer-features/claude-code:1.0": {}
+  },
   "workspaceMount": "source=${localWorkspaceFolder}/..,target=/workspaces/camb-parent,type=bind",
   "workspaceFolder": "/workspaces/camb-parent/${localWorkspaceFolderBasename}",
   "mounts": [
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f308f59d2..ed6748b87 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -59,7 +59,7 @@ jobs:
       - name: Run tests
         run: |
           python -c "import camb; print(camb.__version__)"
-          python -m unittest camb.tests.camb_test
+          python -c "import os, subprocess, sys; modules=['camb.tests.camb_test']; modules += [] if os.getenv('CAMB_TEST_FAST') else ['camb.tests.camb_test_slow']; print('Running', ', '.join(modules)); subprocess.run([sys.executable, '-m', 'unittest', *modules], check=True)"
 
       - name: HM code tests
         if: matrix.os == 'ubuntu-latest'
diff --git a/.gitignore b/.gitignore
index 7f1f521b8..c04df86bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,6 +56,7 @@ testfile*
 .vs
 *.mp4
 *.exe
+*_tmp*
 
 # Ignore all .dat and .ini files except those already tracked in git
 *.dat
@@ -89,3 +90,4 @@ external/
 !.devcontainer/
 !.devcontainer/**
 .claude
+devcontainer-lock.json
diff --git a/AGENTS.md b/AGENTS.md
index 8f5ded253..ebdd26708 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,20 +1,52 @@
-# Project instructions
-- This is the CAMB python cosmology code source, wrapping fortran 2003 for numerics
-- Follow the rules in this file.
-- If `AGENTS.local.md` exists, read it after this file and let it override local environment details.
-
-# Project rules and guidance
-- Python uses python 3.10+ with modern type hint syntax
-- Use double-quotes for strings; line length 120
-- Code is auto-formatted by ruff, including import reordering.
-- Fortran uses modern Fortran 2003, compiled with gfortran or ifort
-- Install and run pre-commit hooks as needed for checking
-- /forutils is a git submodule with shared fortran utilities and classes
-- For fortran/wrapped fortran code modifications see /docs/modifying_code.rst
-- When updating version, update both python __version__ and version defined near the top of fortran/config.f90
-- Use pip -e for editable first install. After that use "python setup.py make" to quickly rebuild fortran for use with python.
-- Default test: python -m unittest camb.tests.camb_test (do not run hmcode_test unless specifically relevant)
-- Long test against precomputed results: fortran/tests/run_tests.py which calls CAMB_test_files.py (only if asked)
-- Installation/clone/modification/contributing/pre-commit/vscode config: see /CONTRIBUTING.md
-- When plotting fractional differences for cross-spectra, use e.g. Delta TE/sqrt(TT*EE)
-- High accuracy lensing needs lens_potential_accuracy = 8 or so (mainly increases kmax)
+# Project Overview
+
+This is the CAMB Python cosmology code, wrapping Fortran 2003 for numerics.
+
+---
+
+# Coding Standards
+
+## Python
+- Use Python 3.10+ with modern type-hint syntax.
+- Use double-quotes for strings; maximum line length is 120.
+- Code is auto-formatted by **ruff**, including import reordering.
+- Use the **pre-commit** command line to run ruff check/format (e.g., `pre-commit run --all-files`); do not invoke `ruff` directly.
+- Ruff/pyupgrade hooks and their configuration live in `.pre-commit-config.yaml`
+- All code must be Python or Fortran only—no bash scripts except for devcontainer setup and GitHub Actions.
+
+## Fortran
+- Use modern Fortran 2003, compiled with **gfortran** or **ifort**.
+- `/forutils` is a git submodule containing shared Fortran utilities and classes.
+- For guidance on modifying Fortran or wrapped Fortran code, see `/docs/modifying_code.rst`.
+
+---
+
+# Development Workflow
+
+## Installation & Build
+- Use `pip install -e .` for the initial editable install.
+- After the first install, use `python setup.py make` to quickly rebuild the Fortran for use with Python after any fortran changes.
+- If `python setup.py make` produces stale `.mod` type-mismatch errors, run `python setup.py clean` first (not normally needed).
+
+## Testing
+- **Default test:** `python -m unittest camb.tests.camb_test`.
+  - Do **not** run `camb.tests.hmcode_test` unless specifically relevant.
+- **Long test** against precomputed results: `python fortran/tests/run_tests.py`, which calls `CAMB_test_files.py`. Only run this if explicitly asked.
+
+## Configuration
+- See `CONTRIBUTING.md` for installation, cloning, modification guidelines, contribution instructions, pre-commit setup, and VS Code configuration.
+
+---
+
+# Maintenance & Versioning
+- When updating the version, update **both**:
+  - the Python `__version__`, and
+  - the version defined near the top of `fortran/config.f90`.
+- When adding new `.ini` file parameters, also add the corresponding `.ini` write logic in `camb/_ini.py`.
+
+---
+
+# Physics & Numerical Guidance
+- When plotting fractional differences for cross-spectra, use e.g. $\Delta TE / \sqrt{TT \cdot EE}$.
+- High-accuracy lensing requires `lens_potential_accuracy = 8` or so (this mainly increases `kmax`).
+- Use `camb/check_accuracy.py` to assess numerical stability by comparing a default run to a high-accuracy run.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8e60cbe63..2b15658c3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -63,6 +63,14 @@ Run the test suite to ensure your changes don't break anything:
 python -m unittest camb.tests.camb_test
 ```
 
+To run the full suite including the slower symbolic and emission-angle tests:
+
+```bash
+python -m unittest camb.tests.camb_test camb.tests.camb_test_slow
+```
+
+The GitHub workflows include `camb.tests.camb_test_slow` by default, and skip it when `CAMB_TEST_FAST` is set.
+
 For HMcode tests (Linux only):
 
 ```bash
diff --git a/camb/__init__.py b/camb/__init__.py
index 139f6196f..487b02389 100644
--- a/camb/__init__.py
+++ b/camb/__init__.py
@@ -8,13 +8,20 @@
 __author__ = "Antony Lewis"
 __contact__ = "antony at cosmologist dot info"
 __url__ = "https://camb.readthedocs.io"
-__version__ = "1.6.7"
+__version__ = "1.6.8"
 
 from . import baseconfig
 
 baseconfig.check_fortran_version(__version__)
 from . import dark_energy, initialpower, model, nonlinear, reionization
-from ._config import config
+from ._config import (
+    config,
+    lensing_method_curv_corr,
+    lensing_method_curv_corr_direct,
+    lensing_method_flat_corr,
+    lensing_method_harmonic,
+    lensing_method_optimized,
+)
 from .baseconfig import (
     Array1D,
     CAMBError,
diff --git a/camb/_config.py b/camb/_config.py
index 368baea33..1f0356e1a 100644
--- a/camb/_config.py
+++ b/camb/_config.py
@@ -6,12 +6,25 @@
 lensing_method_curv_corr = 1
 lensing_method_flat_corr = 2
 lensing_method_harmonic = 3
+lensing_method_curv_corr_direct = 4
+lensing_method_optimized = 5
 
 
 class _config:
     # print feedback if > 0 (note in Jupyter notebook this will appear in the terminal, not the notebook)
     FeedbackLevel = import_property(c_int, "config", "FeedbackLevel")
 
+    # enable targeted accuracy improvements if > 0, AccuracyTarget being SO-like.
+    AccuracyTarget = import_property(c_int, "config", "AccuracyTarget")
+
+    near_flat_approx_chi_limit = import_property(c_double, "config", "near_flat_approx_chi_limit")
+
+    near_flat_approx_chidisp_limit = import_property(c_double, "config", "near_flat_approx_chidisp_limit")
+
+    enable_do_near_flat_integration = import_property(c_bool, "config", "enable_do_near_flat_integration")
+
+    enable_shifted_q_scalar_approx = import_property(c_bool, "config", "enable_shifted_q_scalar_approx")
+
     # print additional timing and progress (when FeedbackLevel>0)
     DebugMsgs = import_property(c_bool, "config", "DebugMsgs")
 
diff --git a/camb/_ini.py b/camb/_ini.py
index 5af6b82af..912e235e3 100644
--- a/camb/_ini.py
+++ b/camb/_ini.py
@@ -258,6 +258,7 @@ def _update_ini_state_from_params(params: model.CAMBparams, state: CambIniFile)
     if params.WantCls and (params.WantScalars or params.WantVectors):
         set_param(state, "l_max_scalar", params.max_l)
         set_param(state, "k_eta_max_scalar", params.max_eta_k)
+        set_param(state, "lens_output_margin", params.lens_output_margin)
         if params.WantScalars:
             set_param(state, "do_lensing", params.DoLensing)
     if params.WantCls and params.WantTensors:
diff --git a/camb/check_accuracy.py b/camb/check_accuracy.py
index f816417f0..d61fd6b1e 100644
--- a/camb/check_accuracy.py
+++ b/camb/check_accuracy.py
@@ -6,10 +6,17 @@
 search for minimal top-level boosts, and refine which underlying component
 accuracy settings are most relevant.
 
+For lensed CMB spectra near the output cutoff, this is not automatically a test
+of lens-margin convergence: by default the boosted reference keeps the same
+``lens_output_margin`` as the standard run unless ``--reference-lens-output-margin``
+is set, and even a larger reference margin is only a support-sensitivity probe
+unless the underlying high-``l`` reference spectra are themselves well converged.
+
 Examples::
 
     camb check_accuracy inifiles/params.ini
     camb check_accuracy inifiles/params.ini --plot-dir accuracy_plots
+    camb check_accuracy inifiles/params.ini --strict-reference --reference-lens-output-margin 1500
     camb check_accuracy inifiles/params.ini --find-minimal-boosts --refine-accuracy-components
 """
 
@@ -31,6 +38,17 @@
     "lAccuracyBoost": 2.0,
     "IntTolBoost": 2.0,
     "DoLateRadTruncation": True,
+    # Force linear l-sampling all the way to max_l in the reference; the default min_l_logl_sampling
+    # of 5000 leaves a wide gap in the log block right above 5000 that makes the reference itself
+    # under-sampled at high l.
+    "min_l_logl_sampling": 100000,
+}
+
+STRICT_REFERENCE_SETTINGS = {
+    "AccuracyBoost": 3.0,
+    "lSampleBoost": 3.0,
+    "lAccuracyBoost": 3.0,
+    "min_l_logl_sampling": 10000,
 }
 
 DEFAULT_NOISE_CONFIGS = {
@@ -52,6 +70,11 @@
 BB_TOLERANCES = [(0, 5e-3), (1000, 1e-2), (6000, 2e-2), (8000, 1e-1)]
 LENSING_TOLERANCES = [(0, 5e-3), (2000, 5e-3), (6000, 2e-2)]
 MPK_TOLERANCE = 1e-3
+MPK_TOLERANCE_RANGES = [
+    (None, 5e-3, 3e-3),
+    (5e-3, 2.0, MPK_TOLERANCE),
+    (2.0, None, 3e-3),
+]
 DERIVED_TOLERANCE = 1e-3
 
 CL_COLUMNS = {"TT": 0, "EE": 1, "BB": 2, "TE": 3}
@@ -78,6 +101,8 @@
 COMPONENT_REFINEMENT_KEYS = GLOBAL_BOOST_COMPONENT_KEYS
 DISCRETE_ACCURACY_VALUES = {"neutrino_q_boost": (1.0, 1.5, 2.5)}
 
+MatterPowerToleranceSpec = list[tuple[float | None, float | None, float]]
+
 
 @dataclass(frozen=True)
 class RangeTolerance:
@@ -254,7 +279,8 @@ def cmb_fields(value: str) -> tuple[str, ...]:
 def build_parser(prog: str | None = None) -> argparse.ArgumentParser:
     """Build the command-line parser for the accuracy checker."""
     parser = argparse.ArgumentParser(
-        prog=prog, description="Check stability of one CAMB ini file against a boosted-accuracy calculation."
+        prog=prog,
+        description="Check stability of one CAMB ini file against a boosted-accuracy calculation.",
     )
     parser.add_argument("ini_file", help="CAMB ini file to load with camb.read_ini")
     parser.add_argument(
@@ -273,9 +299,9 @@ def build_parser(prog: str | None = None) -> argparse.ArgumentParser:
         help="call params.set_for_lmax before running; if --lmax is unset, this is also used for comparison",
     )
     parser.add_argument(
-        "--lens-margin",
+        "--lens-output-margin",
         type=int,
-        help="lens margin to use for both runs; can be set with or without --set-for-lmax",
+        help="lens output margin to use for both runs; can be set with or without --set-for-lmax",
     )
     parser.add_argument(
         "--lens-potential-accuracy",
@@ -283,9 +309,9 @@ def build_parser(prog: str | None = None) -> argparse.ArgumentParser:
         help="lens_potential_accuracy to use for both runs; can be set with or without --set-for-lmax",
     )
     parser.add_argument(
-        "--reference-lens-margin",
+        "--reference-lens-output-margin",
         type=int,
-        help="lens margin override for the boosted reference only",
+        help="lens output margin override for the boosted reference only; use this with sufficiently converged reference accuracy settings to probe lensed-spectrum support sensitivity near the output cutoff",
     )
     parser.add_argument(
         "--reference-lens-potential-accuracy",
@@ -304,7 +330,12 @@ def build_parser(prog: str | None = None) -> argparse.ArgumentParser:
         default=1e-4,
         help="minimum k/h for matter power comparison",
     )
-    parser.add_argument("--mpk-tolerance", type=float, default=MPK_TOLERANCE)
+    parser.add_argument(
+        "--mpk-tolerance",
+        type=float,
+        default=None,
+        help="override matter power tolerance for all k/h ranges",
+    )
     parser.add_argument("--derived-tolerance", type=float, default=DERIVED_TOLERANCE)
     parser.add_argument(
         "--plot-dir",
@@ -376,6 +407,11 @@ def build_parser(prog: str | None = None) -> argparse.ArgumentParser:
         type=float,
         default=DEFAULT_ACCURACY_SETTINGS["AccuracyBoost"],
     )
+    parser.add_argument(
+        "--strict-reference",
+        action="store_true",
+        help="use a stricter boosted reference preset with AccuracyBoost=lSampleBoost=lAccuracyBoost=3 and min_l_logl_sampling=10000",
+    )
     parser.add_argument(
         "--l-sample-boost",
         type=float,
@@ -396,13 +432,20 @@ def build_parser(prog: str | None = None) -> argparse.ArgumentParser:
 
 
 def requested_accuracy_settings(args: argparse.Namespace) -> dict[str, float | bool]:
-    return {
+    settings: dict[str, float | bool] = {
         "AccuracyBoost": args.accuracy_boost,
         "lSampleBoost": args.l_sample_boost,
         "lAccuracyBoost": args.l_accuracy_boost,
         "IntTolBoost": args.int_tol_boost,
         "DoLateRadTruncation": args.do_late_rad_truncation,
     }
+    if args.strict_reference:
+        for key, value in STRICT_REFERENCE_SETTINGS.items():
+            if key in settings:
+                settings[key] = max(float(settings[key]), float(value))
+            else:
+                settings[key] = value
+    return settings
 
 
 def apply_accuracy_settings(params, settings: dict[str, float | bool], *, boost_from_raw: bool = False) -> None:
@@ -410,12 +453,25 @@ def apply_accuracy_settings(params, settings: dict[str, float | bool], *, boost_
         if key == "DoLateRadTruncation":
             params.DoLateRadTruncation = bool(setting)
             continue
-        if not hasattr(params.Accuracy, key):
-            raise ValueError(f"Unknown accuracy setting {key}")
-        value = float(setting)
-        if boost_from_raw:
-            value = max(float(getattr(params.Accuracy, key)), value)
-        setattr(params.Accuracy, key, value)
+        if hasattr(params.Accuracy, key):
+            current_value = getattr(params.Accuracy, key)
+            if isinstance(current_value, bool):
+                value = bool(setting)
+                if boost_from_raw:
+                    value = current_value or value
+            else:
+                value = float(setting)
+                if boost_from_raw:
+                    value = max(float(current_value), value)
+            setattr(params.Accuracy, key, value)
+            continue
+        if key == "min_l_logl_sampling":
+            value = int(setting)
+            if boost_from_raw:
+                value = max(int(params.min_l_logl_sampling), value)
+            params.min_l_logl_sampling = value
+            continue
+        raise ValueError(f"Unknown accuracy setting {key}")
 
 
 def copy_params(params):
@@ -426,29 +482,29 @@ def apply_lensing_settings(
     params,
     *,
     set_for_lmax: int | None = None,
-    lens_margin: int | None = None,
+    lens_output_margin: int | None = None,
     lens_potential_accuracy: float | None = None,
 ) -> None:
-    if set_for_lmax is None and lens_margin is None and lens_potential_accuracy is None:
+    if set_for_lmax is None and lens_output_margin is None and lens_potential_accuracy is None:
         return
 
     lens_accuracy = 0.0 if lens_potential_accuracy is None else lens_potential_accuracy
     if set_for_lmax is not None:
         params.set_for_lmax(
             set_for_lmax,
-            lens_margin=150 if lens_margin is None else lens_margin,
+            lens_output_margin=150 if lens_output_margin is None else lens_output_margin,
             lens_potential_accuracy=lens_accuracy,
             nonlinear=current_uses_nonlinear_lensing(params),
         )
         return
 
-    margin = 0 if lens_margin is None else lens_margin
+    margin = 0 if lens_output_margin is None else lens_output_margin
     target_lmax = params.max_l - margin if params.DoLensing else params.max_l
     max_eta_k = params.max_eta_k if lens_potential_accuracy is None else None
     params.set_for_lmax(
         max(1, target_lmax),
         max_eta_k=max_eta_k,
-        lens_margin=margin,
+        lens_output_margin=margin,
         lens_potential_accuracy=lens_accuracy,
         nonlinear=current_uses_nonlinear_lensing(params),
     )
@@ -464,7 +520,7 @@ def load_params(
     no_validate: bool,
     settings: dict[str, float | bool] | None = None,
     set_for_lmax: int | None = None,
-    lens_margin: int | None = None,
+    lens_output_margin: int | None = None,
     lens_potential_accuracy: float | None = None,
 ):
     import camb
@@ -473,7 +529,7 @@ def load_params(
     apply_lensing_settings(
         params,
         set_for_lmax=set_for_lmax,
-        lens_margin=lens_margin,
+        lens_output_margin=lens_output_margin,
         lens_potential_accuracy=lens_potential_accuracy,
     )
     if settings:
@@ -523,7 +579,7 @@ def run_case(
     accuracy_settings: dict[str, float | bool] | None,
     lmax: int | None,
     set_for_lmax: int | None,
-    lens_margin: int | None,
+    lens_output_margin: int | None,
     lens_potential_accuracy: float | None,
     mpk_kmin: float,
     mpk_npoints: int,
@@ -533,7 +589,7 @@ def run_case(
         no_validate=no_validate,
         settings=accuracy_settings,
         set_for_lmax=set_for_lmax,
-        lens_margin=lens_margin,
+        lens_output_margin=lens_output_margin,
         lens_potential_accuracy=lens_potential_accuracy,
     )
     return run_params_case(
@@ -685,10 +741,24 @@ def compare_cls(standard: RunOutput, reference: RunOutput) -> list[StatRow]:
     rows.extend(compare_l_ranges("TE", te_errors, ell, tolerance_ranges(TT_EE_TOLERANCES), min_ell=2))
 
     bb_errors = fractional_delta(standard_cls[:, CL_COLUMNS["BB"]], reference_cls[:, CL_COLUMNS["BB"]])
-    rows.extend(compare_l_ranges("BB", bb_errors, ell, tolerance_ranges(BB_TOLERANCES), min_ell=2))
+    rows.extend(
+        compare_l_ranges(
+            "BB",
+            bb_errors,
+            ell,
+            tolerance_ranges(bb_tolerances(standard.params)),
+            min_ell=2,
+        )
+    )
     return rows
 
 
+def bb_tolerances(params) -> list[tuple[int, float]]:
+    if not bool(params.Accuracy.AccurateBB):
+        return [(0, 2e-2), (8000, 1e-1)]
+    return BB_TOLERANCES
+
+
 def compare_lensing(standard: RunOutput, reference: RunOutput) -> list[StatRow]:
     if standard.lens_potential_cls is None or reference.lens_potential_cls is None:
         return []
@@ -738,15 +808,60 @@ def ell_range_label(selected_ell: np.ndarray, range_tolerance: RangeTolerance) -
     return range_tolerance.label
 
 
-def compare_matter_power(standard: RunOutput, reference: RunOutput, tolerance: float) -> list[StatRow]:
+def matter_power_range_label(start: float | None, stop: float | None) -> str:
+    if start is None:
+        return f"all z, k/h < {stop:.4g}"
+    if stop is None:
+        return f"all z, {start:.4g} <= k/h"
+    return f"all z, {start:.4g} <= k/h < {stop:.4g}"
+
+
+def compare_matter_power(
+    standard: RunOutput,
+    reference: RunOutput,
+    tolerance: float | MatterPowerToleranceSpec,
+) -> list[StatRow]:
     if standard.matter_power is None or reference.matter_power is None:
         return []
     common_k, z, standard_pk, reference_pk = common_matter_power_grid(standard.matter_power, reference.matter_power)
     errors = fractional_delta(standard_pk, reference_pk)
-    z_grid, k_grid = np.meshgrid(z, common_k, indexing="ij")
-    locations = np.array([f"z={z_value:.6g}, k/h={k:.6g}" for z_value, k in zip(z_grid.ravel(), k_grid.ravel())])
-    range_label = f"all z, {common_k[0]:.4g} <= k/h <= {common_k[-1]:.4g}"
-    return [finite_stats("matter P(k)", range_label, errors.ravel(), tolerance, locations=locations)]
+    if isinstance(tolerance, float):
+        z_grid, k_grid = np.meshgrid(z, common_k, indexing="ij")
+        locations = np.array([f"z={z_value:.6g}, k/h={k:.6g}" for z_value, k in zip(z_grid.ravel(), k_grid.ravel())])
+        range_label = f"all z, {common_k[0]:.4g} <= k/h <= {common_k[-1]:.4g}"
+        return [
+            finite_stats(
+                "matter P(k)",
+                range_label,
+                errors.ravel(),
+                tolerance,
+                locations=locations,
+            )
+        ]
+
+    rows = []
+    for start, stop, range_tolerance in tolerance:
+        mask = np.ones_like(common_k, dtype=bool)
+        if start is not None:
+            mask &= common_k >= start
+        if stop is not None:
+            mask &= common_k < stop
+        if not np.any(mask):
+            continue
+        selected_k = common_k[mask]
+        selected_errors = errors[:, mask]
+        z_grid, k_grid = np.meshgrid(z, selected_k, indexing="ij")
+        locations = np.array([f"z={z_value:.6g}, k/h={k:.6g}" for z_value, k in zip(z_grid.ravel(), k_grid.ravel())])
+        rows.append(
+            finite_stats(
+                "matter P(k)",
+                matter_power_range_label(start, stop),
+                selected_errors.ravel(),
+                range_tolerance,
+                locations=locations,
+            )
+        )
+    return rows
 
 
 def common_matter_power_grid(
@@ -795,7 +910,7 @@ def compare_runs(
     reference: RunOutput,
     *,
     derived_tolerance: float,
-    mpk_tolerance: float,
+    mpk_tolerance: float | MatterPowerToleranceSpec,
 ) -> ComparisonResult:
     return ComparisonResult(
         derived_rows=compare_derived(standard, reference, derived_tolerance),
@@ -812,14 +927,14 @@ def compare_params_accuracy(
     comparator_accuracy_settings: dict[str, float | bool] | None = None,
     lmax: int | None = None,
     set_for_lmax: int | None = None,
-    lens_margin: int | None = None,
+    lens_output_margin: int | None = None,
     lens_potential_accuracy: float | None = None,
-    reference_lens_margin: int | None = None,
+    reference_lens_output_margin: int | None = None,
     reference_lens_potential_accuracy: float | None = None,
     mpk_kmin: float = 1e-4,
     mpk_npoints: int = 500,
     derived_tolerance: float = DERIVED_TOLERANCE,
-    mpk_tolerance: float = MPK_TOLERANCE,
+    mpk_tolerance: float | MatterPowerToleranceSpec | None = None,
     chi2_config: NoiseConfig | None = None,
 ) -> AccuracyCheckResult:
     """Compare a :class:`~camb.model.CAMBparams` object against a higher-accuracy copy.
@@ -828,13 +943,15 @@ def compare_params_accuracy(
     includes both CAMB runs, row-wise comparison statistics, and optionally a
     fiducial CMB delta chi-squared estimate.
     """
+    if mpk_tolerance is None:
+        mpk_tolerance = MPK_TOLERANCE_RANGES if params.Transfer.high_precision else 3e-3
     reference_accuracy_settings = reference_accuracy_settings or DEFAULT_ACCURACY_SETTINGS
 
     standard_params = copy_params(params)
     apply_lensing_settings(
         standard_params,
         set_for_lmax=set_for_lmax,
-        lens_margin=lens_margin,
+        lens_output_margin=lens_output_margin,
         lens_potential_accuracy=lens_potential_accuracy,
     )
     if comparator_accuracy_settings:
@@ -844,7 +961,9 @@ def compare_params_accuracy(
     apply_lensing_settings(
         reference_params,
         set_for_lmax=set_for_lmax,
-        lens_margin=reference_lens_margin if reference_lens_margin is not None else lens_margin,
+        lens_output_margin=reference_lens_output_margin
+        if reference_lens_output_margin is not None
+        else lens_output_margin,
         lens_potential_accuracy=(
             reference_lens_potential_accuracy
             if reference_lens_potential_accuracy is not None
@@ -1242,7 +1361,10 @@ def component_refinement_keys(params) -> tuple[str, ...]:
 
 
 def uses_nonlinear_sources(params) -> bool:
-    return str(getattr(params, "NonLinear", "NonLinear_none")) in {"NonLinear_pk", "NonLinear_both"}
+    return str(getattr(params, "NonLinear", "NonLinear_none")) in {
+        "NonLinear_pk",
+        "NonLinear_both",
+    }
 
 
 def has_redshift_windows(params) -> bool:
@@ -1395,7 +1517,7 @@ def find_minimal_boosts(
         ini_file,
         no_validate=args.no_validate,
         set_for_lmax=args.set_for_lmax,
-        lens_margin=args.lens_margin,
+        lens_output_margin=args.lens_output_margin,
         lens_potential_accuracy=args.lens_potential_accuracy,
     )
     raw_settings = raw_accuracy_settings(raw_params)
@@ -1415,7 +1537,12 @@ def find_minimal_boosts(
 
     def make_result(settings: dict[str, float | bool], comparison: ComparisonResult) -> SearchResult:
         key = settings_key(refine_discrete_accuracy_settings(settings, raw_settings))
-        return SearchResult(settings, comparison, runs.get(key) or comparison_runs.get(id(comparison)), fastest)
+        return SearchResult(
+            settings,
+            comparison,
+            runs.get(key) or comparison_runs.get(id(comparison)),
+            fastest,
+        )
 
     def run_candidate(settings: dict[str, float | bool]) -> ComparisonResult | None:
         nonlocal fastest, runs_done
@@ -1434,7 +1561,7 @@ def run_candidate(settings: dict[str, float | bool]) -> ComparisonResult | None:
             accuracy_settings=settings,
             lmax=effective_lmax(args),
             set_for_lmax=args.set_for_lmax,
-            lens_margin=args.lens_margin,
+            lens_output_margin=args.lens_output_margin,
             lens_potential_accuracy=args.lens_potential_accuracy,
             mpk_kmin=args.mpk_kmin,
             mpk_npoints=args.mpk_npoints,
@@ -1554,7 +1681,7 @@ def refine_accuracy_components(
         ini_file,
         no_validate=args.no_validate,
         set_for_lmax=args.set_for_lmax,
-        lens_margin=args.lens_margin,
+        lens_output_margin=args.lens_output_margin,
         lens_potential_accuracy=args.lens_potential_accuracy,
     )
     target_accuracy_settings = accuracy_search_target_settings(
@@ -1570,7 +1697,12 @@ def refine_accuracy_components(
 
     def make_result(settings: dict[str, float | bool], comparison: ComparisonResult) -> SearchResult:
         key = settings_key(refine_discrete_accuracy_settings(settings, raw_settings))
-        return SearchResult(settings, comparison, runs.get(key) or comparison_runs.get(id(comparison)), fastest)
+        return SearchResult(
+            settings,
+            comparison,
+            runs.get(key) or comparison_runs.get(id(comparison)),
+            fastest,
+        )
 
     def run_candidate(settings: dict[str, float | bool]) -> ComparisonResult | None:
         nonlocal fastest, runs_done
@@ -1589,7 +1721,7 @@ def run_candidate(settings: dict[str, float | bool]) -> ComparisonResult | None:
             accuracy_settings=settings,
             lmax=effective_lmax(args),
             set_for_lmax=args.set_for_lmax,
-            lens_margin=args.lens_margin,
+            lens_output_margin=args.lens_output_margin,
             lens_potential_accuracy=args.lens_potential_accuracy,
             mpk_kmin=args.mpk_kmin,
             mpk_npoints=args.mpk_npoints,
@@ -2045,6 +2177,14 @@ def main(argv: list[str] | None = None, *, prog: str | None = None) -> int:
     print(f"High-accuracy settings: {format_settings(high_accuracy_settings)}")
 
     base_params = load_params(ini_file, no_validate=args.no_validate)
+    if args.mpk_tolerance is None:
+        args.mpk_tolerance = MPK_TOLERANCE_RANGES if base_params.Transfer.high_precision else 3e-3
+    if base_params.DoLensing and base_params.WantCls and base_params.Want_CMB:
+        if args.reference_lens_output_margin is not None:
+            print(
+                "Note: --reference-lens-output-margin probes support sensitivity near the output cutoff, but it is not by itself "
+                "a truth test unless the high-l reference spectra are also well converged."
+            )
     noise_config = None
     if args.chi2:
         try:
@@ -2056,9 +2196,9 @@ def main(argv: list[str] | None = None, *, prog: str | None = None) -> int:
         reference_accuracy_settings=high_accuracy_settings,
         lmax=effective_lmax(args),
         set_for_lmax=args.set_for_lmax,
-        lens_margin=args.lens_margin,
+        lens_output_margin=args.lens_output_margin,
         lens_potential_accuracy=args.lens_potential_accuracy,
-        reference_lens_margin=args.reference_lens_margin,
+        reference_lens_output_margin=args.reference_lens_output_margin,
         reference_lens_potential_accuracy=args.reference_lens_potential_accuracy,
         mpk_kmin=args.mpk_kmin,
         mpk_npoints=args.mpk_npoints,
diff --git a/camb/correlations.py b/camb/correlations.py
index da717946d..fb1eff28f 100644
--- a/camb/correlations.py
+++ b/camb/correlations.py
@@ -50,6 +50,19 @@ def _cached_gauss_legendre(npoints, cache=True):
         return xvals, weights
 
 
+def _apodize_theta_width(theta_max, lmax, apodize_point_width):
+    if theta_max is None or apodize_point_width <= 0:
+        return 0.0
+    return min(theta_max, 4.8 * float(apodize_point_width) * np.pi / lmax)
+
+
+def _c2_apodized_weight(weight, theta, theta_max, apodize_theta_width):
+    if apodize_theta_width <= 0 or theta <= theta_max - apodize_theta_width:
+        return weight
+    taper = np.clip((theta_max - theta) / apodize_theta_width, 0.0, 1.0)
+    return weight * taper**3 * (10.0 + taper * (-15.0 + 6.0 * taper))
+
+
 def legendre_funcs(lmax, x, m=(0, 2), lfacs=None, lfacs2=None, lrootfacs=None):
     r"""
     Utility function to return array of Legendre and :math:`d_{mn}` functions for all :math:`\ell` up to lmax.
@@ -268,7 +281,8 @@ def lensed_correlations(cls, clpp, xvals, weights=None, lmax=None, delta=False,
     :param lmax: optional maximum L to use from the cls arrays
     :param delta: if true, calculate the difference between lensed and unlensed (default False)
     :param theta_max: maximum angle (in radians) to keep in the correlation functions
-    :param apodize_point_width: smoothing scale for apodization at truncation of correlation function
+    :param apodize_point_width: scale factor for the fixed-angle C2 apodization at the truncation cut;
+        the default corresponds to a transition width of about 48*pi/lmax
     :return: 2D array of corrs[i, ix], where ix=0,1,2,3 are T, Q+U, Q-U and cross;
         if weights is not None, then return corrs, lensed_cls
     """
@@ -307,8 +321,10 @@ def lensed_correlations(cls, clpp, xvals, weights=None, lmax=None, delta=False,
     if theta_max is not None:
         xmin = np.cos(theta_max)
         imin = np.searchsorted(xvals, xmin)  # assume xvals sorted
+        apodize_theta_width = _apodize_theta_width(theta_max, lmax, apodize_point_width)
     else:
         imin = 0
+        apodize_theta_width = 0.0
 
     corrs = np.zeros((len(xvals[imin:]), 4))
 
@@ -369,8 +385,8 @@ def lensed_correlations(cls, clpp, xvals, weights=None, lmax=None, delta=False,
         )
         if weights is not None:
             weight = weights[i + imin]
-            if theta_max is not None and i < apodize_point_width * 4:
-                weight *= 1 - np.exp(-(((i + 1.0) / apodize_point_width) ** 2) / 2)
+            if apodize_theta_width > 0.0:
+                weight = _c2_apodized_weight(weight, np.arccos(x), theta_max, apodize_theta_width)
 
             lensedcls[:, 0] += (weight * corrs[i, 0]) * P
             T2 = (corrs[i, 1] * weight / 2) * d22
@@ -424,8 +440,8 @@ def lensed_cls(
     :param sampling_factor: npoints = int(sampling_factor*lmax)+1
     :param delta_cls: if true, return the difference between lensed and unlensed (optional, default False)
     :param theta_max: maximum angle (in radians) to keep in the correlation functions; default: pi/32
-    :param apodize_point_width: if theta_max is set, apodize around the cut using half Gaussian of approx
-        width apodize_point_width/lmax*pi
+    :param apodize_point_width: if theta_max is set, scale the fixed-angle C2 apodization around the cut;
+        the default corresponds to a transition width of about 48*pi/lmax
     :param leggaus: whether to use Gauss-Legendre integration (default True)
     :param cache: if leggaus = True, set cache to save the x values and weights between calls (most of the time)
     :return: 2D array of cls[L, ix], with L starting at zero and ix=0,1,2,3 in order TT, EE, BB, TE.
@@ -439,7 +455,7 @@ def lensed_cls(
     else:
         theta = np.arange(1, npoints + 1) * np.pi / (npoints + 1)
         xvals = np.cos(theta[::-1])
-        weights = np.pi / npoints * np.sin(theta)
+        weights = np.pi / (npoints + 1) * np.sin(theta)
     _, lensedcls = lensed_correlations(
         cls,
         clpp,
@@ -448,7 +464,7 @@ def lensed_cls(
         lmax,
         delta=True,
         theta_max=theta_max,
-        apodize_point_width=int(apodize_point_width * sampling_factor),
+        apodize_point_width=apodize_point_width,
     )
     if not delta_cls:
         lensedcls += cls[: lmax + 1, :]
@@ -472,8 +488,8 @@ def lensed_cl_derivatives(cls, clpp, lmax=None, theta_max=np.pi / 32, apodize_po
     :param clpp: array of :math:`[L(L+1)]^2 C_L^{\phi\phi}/2\pi` lensing potential power spectrum (zero based)
     :param lmax: optional maximum L to use from the cls arrays
     :param theta_max: maximum angle (in radians) to keep in the correlation functions
-    :param apodize_point_width: if theta_max is set, apodize around the cut using half Gaussian of approx
-        width apodize_point_width/lmax*pi
+    :param apodize_point_width: if theta_max is set, scale the fixed-angle C2 apodization around the cut;
+        the default corresponds to a transition width of about 48*pi/lmax
     :param sampling_factor: npoints = int(sampling_factor*lmax)+1
     :return: array dCL[ix, ell, L], where ix=0,1,2,3 are T, EE, BB, TE and result
                is :math:`d[D^{\rm ix}_\ell]/ d (\log C^{\phi}_L)`
@@ -511,8 +527,10 @@ def lensed_cl_derivatives(cls, clpp, lmax=None, theta_max=np.pi / 32, apodize_po
     if theta_max is not None:
         xmin = np.cos(theta_max)
         imin = np.searchsorted(xvals, xmin)  # assume xvals sorted
+        apodize_theta_width = _apodize_theta_width(theta_max, lmax, apodize_point_width)
     else:
         imin = 0
+        apodize_theta_width = 0.0
 
     corrs = np.zeros((4, lmax + 1))
 
@@ -595,8 +613,8 @@ def lensed_cl_derivatives(cls, clpp, lmax=None, theta_max=np.pi / 32, apodize_po
         corrs[3, 1:] = (dsigma2 * corr + dCg2 * corr2) * cphil3
 
         weight = weights[i + imin]
-        if theta_max is not None and i < apodize_point_width * 4:
-            weight *= 1 - np.exp(-(((i + 1.0) / apodize_point_width) ** 2) / 2)
+        if apodize_theta_width > 0.0:
+            weight = _c2_apodized_weight(weight, np.arccos(x), theta_max, apodize_theta_width)
 
         dcl[0, :, :] += np.outer(P, (weight * corrs[0, :]))
         T2 = np.outer(d22, (corrs[1, :] * weight / 2))
@@ -626,8 +644,8 @@ def lensed_cl_derivative_unlensed(clpp, lmax=None, theta_max=np.pi / 32, apodize
     :param clpp: array of :math:`[L(L+1)]^2 C_L^{\phi\phi}/2\pi` lensing potential power spectrum (zero based)
     :param lmax: optional maximum L to use from the clpp array
     :param theta_max: maximum angle (in radians) to keep in the correlation functions
-    :param apodize_point_width: if theta_max is set, apodize around the cut using half Gaussian of approx
-        width apodize_point_width/lmax*pi
+    :param apodize_point_width: if theta_max is set, scale the fixed-angle C2 apodization around the cut;
+        the default corresponds to a transition width of about 48*pi/lmax
     :param sampling_factor: npoints = int(sampling_factor*lmax)+1
     :return: array dCL[ix, ell, L], where ix=0,1,2,3 are TT, EE, BB, TE and result is
          :math:`d\left(\Delta D^{\rm ix}_\ell\right) / d D^{{\rm unlens},j}_L` where j[ix] are TT, EE, EE, TE
@@ -665,8 +683,10 @@ def lensed_cl_derivative_unlensed(clpp, lmax=None, theta_max=np.pi / 32, apodize
     if theta_max is not None:
         xmin = np.cos(theta_max)
         imin = np.searchsorted(xvals, xmin)  # assume xvals sorted
+        apodize_theta_width = _apodize_theta_width(theta_max, lmax, apodize_point_width)
     else:
         imin = 0
+        apodize_theta_width = 0.0
 
     corr = np.zeros((4, lmax + 1))
 
@@ -722,8 +742,8 @@ def lensed_cl_derivative_unlensed(clpp, lmax=None, theta_max=np.pi / 32, apodize
         corr[3, 4:] += f[2:] * c2fac2[2:] * d2m4 / 8
 
         weight = weights[i + imin]
-        if theta_max is not None and i < apodize_point_width * 4:
-            weight *= 1 - np.exp(-(((i + 1.0) / apodize_point_width) ** 2) / 2)
+        if apodize_theta_width > 0.0:
+            weight = _c2_apodized_weight(weight, np.arccos(x), theta_max, apodize_theta_width)
 
         dcl[0, :, :] += np.outer(P, (weight * corr[0, :]))
         T2 = np.outer(d22, (corr[1, :] * weight / 2))
diff --git a/camb/model.py b/camb/model.py
index 98b4881c7..4ed019d05 100644
--- a/camb/model.py
+++ b/camb/model.py
@@ -304,6 +304,13 @@ class in model.py to add the new parameter in the corresponding location of the
         ("min_l", c_int, "l_min for the scalar C_L (1 or 2, L=1 dipoles are Newtonian Gauge)"),
         ("max_l", c_int, "l_max for the scalar C_L"),
         ("max_l_tensor", c_int, "l_max for the tensor C_L"),
+        (
+            "lens_output_margin",
+            c_int,
+            "Number of L below max_l down to which lensed C_L are guaranteed to be output; "
+            "the unlensed C_L used in the lensing convolution is treated as reliable to "
+            "max_l - (lens_output_margin - 50), so this also sets the lensed convolution margin.",
+        ),
         ("max_eta_k", c_double, "Maximum k*eta_0 for scalar C_L, where eta_0 is the conformal time today"),
         ("max_eta_k_tensor", c_double, "Maximum k*eta_0 for tensor C_L, where eta_0 is the conformal time today"),
         ("ombh2", c_double, "Omega_baryon h^2"),
@@ -990,7 +997,7 @@ def set_for_lmax(
         lmax,
         max_eta_k=None,
         lens_potential_accuracy=0,
-        lens_margin=150,
+        lens_output_margin=150,
         k_eta_fac=2.5,
         lens_k_eta_reference=18000.0,
         nonlinear=None,
@@ -1006,8 +1013,10 @@ def set_for_lmax(
                           If None, sensible value set automatically.
         :param lens_potential_accuracy: Set to 1 or higher if you want to get the lensing potential accurate
                                         (1 is only Planck-level accuracy)
-        :param lens_margin: the :math:`\Delta \ell_{\rm max}` to use to ensure lensed :math:`C_\ell` are correct
-                            at :math:`\ell_{\rm max}`
+        :param lens_output_margin: the :math:`\Delta \ell_{\rm max}` margin added to ``max_l`` when ``DoLensing``
+                                   is true, so that the lensed :math:`C_\ell` output reaches the requested
+                                   :math:`\ell_{\rm max}`. Also written to ``pars.lens_output_margin`` so the
+                                   Fortran lensing code uses the matching floor.
         :param k_eta_fac:  k_eta_fac default factor for setting max_eta_k = k_eta_fac*lmax if max_eta_k=None
         :param lens_k_eta_reference:  value of max_eta_k to use when lens_potential_accuracy>0; use
                                       k_eta_max = lens_k_eta_reference*lens_potential_accuracy
@@ -1015,8 +1024,9 @@ def set_for_lmax(
                           preserves current setting
         :return: self
         """
+        self.lens_output_margin = lens_output_margin
         if self.DoLensing:
-            self.max_l = lmax + lens_margin
+            self.max_l = lmax + lens_output_margin
         else:
             self.max_l = lmax
         self.max_eta_k = max_eta_k or self.max_l * k_eta_fac
diff --git a/camb/results.py b/camb/results.py
index d8d1cc1a8..bb989f34c 100644
--- a/camb/results.py
+++ b/camb/results.py
@@ -210,7 +210,7 @@ class CAMBdata(F2003Class):
         ("curvature_radius", c_double, r":math:`1/\sqrt{|K|}`"),
         ("Ksign", c_double, "Ksign = 1,0 or -1"),
         ("tau0", c_double, "conformal time today"),
-        ("chi0", c_double, "comoving angular diameter distance of big bang; rofChi(tau0/curvature_radius)"),
+        ("DMt0", c_double, "comoving angular diameter distance to the big bang"),
         ("scale", c_double, "relative to flat. e.g. for scaling L sampling"),
         ("akthom", c_double, "sigma_T * (number density of protons now)"),
         ("fHe", c_double, "n_He_tot / n_H_tot"),
@@ -1445,15 +1445,16 @@ def get_lensed_gradient_cls(self, lmax=None, CMB_unit=None, raw_cl=False, clpp=N
         self._scale_cls(res, CMB_unit, raw_cl)
         return res
 
-    def get_lensed_cls_with_spectrum(self, clpp, lmax=None, CMB_unit=None, raw_cl=False):
+    def get_lensed_cls_with_spectrum(self, clpp, lmax=None, CMB_unit=None, raw_cl=False, lensing_method=None):
         r"""
-        Get lensed CMB power spectra using curved-sky correlation function method, using
+        Get lensed CMB power spectra using a correlation-function lensing method, using
         cpp as the lensing spectrum. Useful for e.g. getting partially-delensed spectra.
 
         :param clpp: array of :math:`[L(L+1)]^2 C_L^{\phi\phi}/2\pi` lensing potential power spectrum (zero based)
         :param lmax: lmax to output to
         :param CMB_unit: scale results from dimensionless. Use 'muK' for :math:`\mu K^2` units for CMB :math:`C_\ell`
         :param raw_cl: return :math:`C_\ell` rather than :math:`\ell(\ell+1)C_\ell/2\pi`
+        :param lensing_method: optional temporary override for :attr:`camb.config.lensing_method`
         :return: numpy array CL[0:lmax+1,0:4], where 0..3 indexes TT, EE, BB, TE.
         """
         assert self.Params.DoLensing
@@ -1465,14 +1466,24 @@ def get_lensed_cls_with_spectrum(self, clpp, lmax=None, CMB_unit=None, raw_cl=Fa
         lensClsWithSpectrum = lib_import("lensing", "", "lensclswithspectrum")
         lensClsWithSpectrum.argtypes = [POINTER(CAMBdata), numpy_1d, numpy_2d, int_arg]
         clpp = np.array(clpp, dtype=np.float64)
-        lensClsWithSpectrum(byref(self), clpp, res, byref(lmax_lensed))
+        original_method = None
+        if lensing_method is not None:
+            original_method = config.lensing_method
+            config.lensing_method = lensing_method
+        try:
+            lensClsWithSpectrum(byref(self), clpp, res, byref(lmax_lensed))
+        finally:
+            if original_method is not None:
+                config.lensing_method = original_method
         res = res[: min(lmax_lensed.value, lmax or lmax_lensed.value) + 1, :]
         self._scale_cls(res, CMB_unit, raw_cl)
         return res
 
-    def get_partially_lensed_cls(self, Alens: float | np.ndarray, lmax=None, CMB_unit=None, raw_cl=False):
+    def get_partially_lensed_cls(
+        self, Alens: float | np.ndarray, lmax=None, CMB_unit=None, raw_cl=False, lensing_method=None
+    ):
         r"""
-        Get lensed CMB power spectra using curved-sky correlation function method, using
+        Get lensed CMB power spectra using a correlation-function lensing method, using
         true lensing spectrum scaled by Alens. Alens can be an array in L for realistic delensing estimates.
         Note that if Params.Alens is also set, the result is scaled by the product of both
 
@@ -1482,6 +1493,7 @@ def get_partially_lensed_cls(self, Alens: float | np.ndarray, lmax=None, CMB_uni
         :param lmax: lmax to output to
         :param CMB_unit: scale results from dimensionless. Use 'muK' for :math:`\mu K^2` units for CMB :math:`C_\ell`
         :param raw_cl: return :math:`C_\ell` rather than :math:`\ell(\ell+1)C_\ell/2\pi`
+        :param lensing_method: optional temporary override for :attr:`camb.config.lensing_method`
         :return: numpy array CL[0:lmax+1,0:4], where 0..3 indexes TT, EE, BB, TE.
         """
         clpp = self.get_lens_potential_cls()[:, 0]
@@ -1489,7 +1501,7 @@ def get_partially_lensed_cls(self, Alens: float | np.ndarray, lmax=None, CMB_uni
             clpp *= Alens
         else:
             clpp[: Alens.size] *= Alens
-        return self.get_lensed_cls_with_spectrum(clpp, lmax, CMB_unit, raw_cl)
+        return self.get_lensed_cls_with_spectrum(clpp, lmax, CMB_unit, raw_cl, lensing_method=lensing_method)
 
     @overload
     def angular_diameter_distance(self, z: float) -> float: ...
diff --git a/camb/tests/camb_test.py b/camb/tests/camb_test.py
old mode 100644
new mode 100755
index e245fe5a6..a3ece760c
--- a/camb/tests/camb_test.py
+++ b/camb/tests/camb_test.py
@@ -16,8 +16,6 @@
 from camb import bbn, correlations, dark_energy, initialpower, model, recombination
 from camb.baseconfig import CAMBError, CAMBParamRangeError, CAMBValueError
 
-fast = "ci fast" in os.getenv("GITHUB_ACTIONS", "")
-
 
 class CambTest(unittest.TestCase):
     def testAssigments(self):
@@ -557,25 +555,21 @@ def testPowers(self):
         pk_interp2 = PKnonlin2.P(z, kh)
         self.assertTrue(np.sum((pk_interp / pk_interp2 - 1) ** 2) < 0.005)
 
-        pars.NonLinearModel.set_params(halofit_version="mead")
-        _, _, pk = results.get_nonlinear_matter_power_spectrum(params=pars)
-        self.assertAlmostEqual(pk[0][160], 814.9, delta=0.5)
+        # The nonlinear matter grid can move when CMB source sampling changes; compare at a fixed physical k/h.
+        def pk_at_fixed_kh(halofit_version, kh_value=0.44223076105117803):
+            pars.NonLinearModel.set_params(halofit_version=halofit_version)
+            kh, _, pk = results.get_nonlinear_matter_power_spectrum(params=pars)
+            return np.exp(np.interp(np.log(kh_value), np.log(kh), np.log(pk[0])))
+
+        self.assertAlmostEqual(pk_at_fixed_kh("mead"), 814.9, delta=0.5)
 
-        pars.NonLinearModel.set_params(halofit_version="mead2016")
-        _, _, pk = results.get_nonlinear_matter_power_spectrum(params=pars)
-        self.assertAlmostEqual(pk[0][160], 814.9, delta=0.5)
+        self.assertAlmostEqual(pk_at_fixed_kh("mead2016"), 814.9, delta=0.5)
 
-        pars.NonLinearModel.set_params(halofit_version="mead2015")
-        _, _, pk = results.get_nonlinear_matter_power_spectrum(params=pars)
-        self.assertAlmostEqual(pk[0][160], 791.3, delta=0.5)
+        self.assertAlmostEqual(pk_at_fixed_kh("mead2015"), 791.3, delta=0.5)
 
-        pars.NonLinearModel.set_params(halofit_version="mead2020")
-        _, _, pk = results.get_nonlinear_matter_power_spectrum(params=pars)
-        self.assertAlmostEqual(pk[0][160], 815.8, delta=0.5)
+        self.assertAlmostEqual(pk_at_fixed_kh("mead2020"), 815.8, delta=0.5)
 
-        pars.NonLinearModel.set_params(halofit_version="mead2020_feedback")
-        _, _, pk = results.get_nonlinear_matter_power_spectrum(params=pars)
-        self.assertAlmostEqual(pk[0][160], 799.0, delta=0.5)
+        self.assertAlmostEqual(pk_at_fixed_kh("mead2020_feedback"), 799.0, delta=0.5)
 
         lmax = 4000
         pars.set_for_lmax(lmax)
@@ -593,21 +587,64 @@ def testPowers(self):
         cls_lensed2 = data.get_partially_lensed_cls(0, lmax=2500)
         np.testing.assert_allclose(cls_lensed2[2:, :], cls_unlensed[2:, :], rtol=1e-4)
 
-        # check lensed CL against python; will only agree well for high lmax as python has no extrapolation template
+        direct_method = camb.lensing_method_curv_corr_direct
+
+        # check lensed CL against python; compare to the direct curved-sky path that
+        # mirrors camb.correlations.lensed_cls rather than the separate method-1 approximation
         cls_lensed2 = correlations.lensed_cls(cls["unlensed_scalar"], cls["lens_potential"][:, 0], delta_cls=False)
-        np.testing.assert_allclose(cls_lensed2[2:2000, 2], cls_lensed[2:2000, 2], rtol=1e-3)
-        np.testing.assert_allclose(cls_lensed2[2:2000, 1], cls_lensed[2:2000, 1], rtol=1e-3)
-        np.testing.assert_allclose(cls_lensed2[2:2000, 0], cls_lensed[2:2000, 0], rtol=1e-3)
+        cls_lensed_direct = data.get_lensed_cls_with_spectrum(
+            data.get_lens_potential_cls()[:, 0], lmax=3000, lensing_method=direct_method
+        )
+        np.testing.assert_allclose(cls_lensed2[2:2000, 2], cls_lensed_direct[2:2000, 2], rtol=1e-3)
+        np.testing.assert_allclose(cls_lensed2[2:2000, 1], cls_lensed_direct[2:2000, 1], rtol=1e-3)
+        np.testing.assert_allclose(cls_lensed2[2:2000, 0], cls_lensed_direct[2:2000, 0], rtol=1e-3)
         self.assertTrue(
             np.all(
                 np.abs(
-                    (cls_lensed2[2:3000, 3] - cls_lensed[2:3000, 3])
+                    (cls_lensed2[2:3000, 3] - cls_lensed_direct[2:3000, 3])
                     / np.sqrt(cls_lensed2[2:3000, 0] * cls_lensed2[2:3000, 1])
                 )
                 < 1e-4
             )
         )
 
+        optimized_method = camb.lensing_method_optimized
+        np.testing.assert_allclose(cls_lensed_direct[2:2000, 2], cls_lensed2[2:2000, 2], rtol=3e-3)
+        np.testing.assert_allclose(cls_lensed_direct[2:2000, 1], cls_lensed2[2:2000, 1], rtol=1e-3)
+        np.testing.assert_allclose(cls_lensed_direct[2:2000, 0], cls_lensed2[2:2000, 0], rtol=1e-3)
+        self.assertTrue(
+            np.all(
+                np.abs(
+                    (cls_lensed_direct[2:3000, 3] - cls_lensed2[2:3000, 3])
+                    / np.sqrt(cls_lensed_direct[2:3000, 0] * cls_lensed_direct[2:3000, 1])
+                )
+                < 1e-4
+            )
+        )
+
+        pars = camb.CAMBparams()
+        pars.set_cosmology(H0=67)
+        pars.set_for_lmax(2500, lens_potential_accuracy=1)
+        pars.Accuracy.AccurateBB = True
+        data = camb.get_results(pars)
+        clpp = data.get_lens_potential_cls()[:, 0]
+        cls_lensed_direct = data.get_lensed_cls_with_spectrum(clpp, lmax=2500, lensing_method=direct_method)
+        cls_lensed_optimized = data.get_lensed_cls_with_spectrum(clpp, lmax=2500, lensing_method=optimized_method)
+        np.testing.assert_allclose(cls_lensed_optimized[2:, :], cls_lensed_direct[2:, :], rtol=1e-12)
+
+        pars = camb.CAMBparams()
+        pars.set_cosmology(H0=67)
+        pars.set_for_lmax(2500, lens_potential_accuracy=1)
+        pars.Accuracy.AccurateBB = False
+        data = camb.get_results(pars)
+        clpp = data.get_lens_potential_cls()[:, 0]
+        cls_lensed_curv = data.get_lensed_cls_with_spectrum(
+            clpp, lmax=2500, lensing_method=camb.lensing_method_curv_corr
+        )
+        cls_lensed_optimized = data.get_lensed_cls_with_spectrum(clpp, lmax=2500, lensing_method=optimized_method)
+        np.testing.assert_allclose(cls_lensed_optimized[2:, :], cls_lensed_curv[2:, :], rtol=1e-7)
+        self.assertEqual(camb.config.lensing_method, camb.lensing_method_optimized)
+
         corr, xvals, weights = correlations.gauss_legendre_correlation(cls["lensed_scalar"])
         clout = correlations.corr2cl(corr, xvals, weights, 2500)
         self.assertTrue(np.all(np.abs(clout[2:2300, 2] / cls["lensed_scalar"][2:2300, 2] - 1) < 1e-3))
@@ -906,53 +943,6 @@ def testSources(self):
         self.assertAlmostEqual(np.sum(cls["W1xW1"][10:3000:20]), 2.26413, places=3)
         self.assertAlmostEqual(np.sum(cls["W1xW1"][10]), 0.0001097, places=6)
 
-    def testSymbolic(self):
-        if fast:
-            return
-        import camb.symbolic as s
-
-        monopole_source, ISW, doppler, quadrupole_source = s.get_scalar_temperature_sources()
-        temp_source = monopole_source + ISW + doppler + quadrupole_source
-
-        pars = camb.set_params(H0=67.5, ombh2=0.022, omch2=0.122, As=2e-9, ns=0.95, omk=0.1)
-        data = camb.get_background(pars)
-        tau = np.linspace(1, 1200, 300)
-        ks = [0.001, 0.05, 1]
-        monopole2 = s.make_frame_invariant(s.newtonian_gauge(monopole_source), "Newtonian")
-        Delta_c_N = s.make_frame_invariant(s.Delta_c, "Newtonian")
-        Delta_c_N2 = s.make_frame_invariant(s.synchronous_gauge(Delta_c_N), "CDM")
-        ev = data.get_time_evolution(
-            ks,
-            tau,
-            ["delta_photon", s.Delta_g, Delta_c_N, Delta_c_N2, monopole_source, monopole2, temp_source, "T_source"],
-        )
-        self.assertTrue(np.allclose(ev[:, :, 0], ev[:, :, 1]))
-        self.assertTrue(np.allclose(ev[:, :, 2], ev[:, :, 3]))
-        self.assertTrue(np.allclose(ev[:, :, 4], ev[:, :, 5]))
-        self.assertTrue(np.allclose(ev[:, :, 6], ev[:, :, 7]))
-
-        pars = camb.set_params(H0=67.5, ombh2=0.022, omch2=0.122, As=2e-9, ns=0.95)
-        pars.set_accuracy(lSampleBoost=2)
-        try:
-            pars.set_custom_scalar_sources(
-                [monopole_source + ISW + doppler + quadrupole_source, s.scalar_E_source],
-                source_names=["T2", "E2"],
-                source_ell_scales={"E2": 2},
-            )
-            data = camb.get_results(pars)
-            dic = data.get_cmb_unlensed_scalar_array_dict(CMB_unit="muK")
-            self.assertTrue(np.all(np.abs(dic["T2xT2"][2:2000] / dic["TxT"][2:2000] - 1) < 1e-3))
-            self.assertTrue(np.all(np.abs(dic["TxT2"][2:2000] / dic["TxT"][2:2000] - 1) < 1e-3))
-            # default interpolation errors much worse for E
-            self.assertTrue(np.all(np.abs(dic["E2xE2"][10:2000] / dic["ExE"][10:2000] - 1) < 2e-3))
-            self.assertTrue(np.all(np.abs(dic["E2xE"][10:2000] / dic["ExE"][10:2000] - 1) < 2e-3))
-            dic1 = data.get_cmb_power_spectra(CMB_unit="muK")
-            self.assertTrue(np.allclose(dic1["unlensed_scalar"][2:2000, 1], dic["ExE"][2:2000]))
-        finally:
-            pars.set_accuracy(lSampleBoost=1)
-
-        s.internal_consistency_checks()
-
     def test_mathutils(self):
         from camb.mathutils import chi_squared, pcl_coupling_matrix, scalar_coupling_matrix, threej_coupling
 
@@ -972,18 +962,6 @@ def test_mathutils(self):
         M = pcl_coupling_matrix(P, lmax)
         np.testing.assert_allclose(M, np.eye(lmax + 1))
 
-    def test_extra_EmissionAnglePostBorn(self):
-        if fast:
-            return
-        from camb import emission_angle, postborn
-
-        pars = camb.set_params(H0=67.5, ombh2=0.022, omch2=0.122, As=2e-9, ns=0.95, tau=0.055)
-        BB = emission_angle.get_emission_delay_BB(pars, lmax=3500)
-        self.assertAlmostEqual(BB(80) * 2 * np.pi / 80 / 81.0, 1.1e-10, delta=1e-11)  # type: ignore
-
-        Bom = postborn.get_field_rotation_BB(pars, lmax=3500)
-        self.assertAlmostEqual(Bom(100) * 2 * np.pi / 100 / 101.0, 1.65e-11, delta=1e-12)  # type: ignore
-
     def test_memory(self):
         if platform.system() != "Windows":
             import gc
diff --git a/camb/tests/camb_test_slow.py b/camb/tests/camb_test_slow.py
new file mode 100644
index 000000000..4b3e18aa6
--- /dev/null
+++ b/camb/tests/camb_test_slow.py
@@ -0,0 +1,68 @@
+import os
+import sys
+import unittest
+
+import numpy as np
+
+try:
+    import camb
+except ImportError:
+    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+    import camb
+
+
+class CambSlowTest(unittest.TestCase):
+    def testSymbolic(self):
+        import camb.symbolic as s
+
+        monopole_source, ISW, doppler, quadrupole_source = s.get_scalar_temperature_sources()
+        temp_source = monopole_source + ISW + doppler + quadrupole_source
+
+        pars = camb.set_params(H0=67.5, ombh2=0.022, omch2=0.122, As=2e-9, ns=0.95, omk=0.1)
+        data = camb.get_background(pars)
+        tau = np.linspace(1, 1200, 300)
+        ks = [0.001, 0.05, 1]
+        monopole2 = s.make_frame_invariant(s.newtonian_gauge(monopole_source), "Newtonian")
+        Delta_c_N = s.make_frame_invariant(s.Delta_c, "Newtonian")
+        Delta_c_N2 = s.make_frame_invariant(s.synchronous_gauge(Delta_c_N), "CDM")
+        ev = data.get_time_evolution(
+            ks,
+            tau,
+            ["delta_photon", s.Delta_g, Delta_c_N, Delta_c_N2, monopole_source, monopole2, temp_source, "T_source"],
+        )
+        self.assertTrue(np.allclose(ev[:, :, 0], ev[:, :, 1]))
+        self.assertTrue(np.allclose(ev[:, :, 2], ev[:, :, 3]))
+        self.assertTrue(np.allclose(ev[:, :, 4], ev[:, :, 5]))
+        self.assertTrue(np.allclose(ev[:, :, 6], ev[:, :, 7]))
+
+        pars = camb.set_params(H0=67.5, ombh2=0.022, omch2=0.122, As=2e-9, ns=0.95)
+        pars.set_accuracy(lSampleBoost=2)
+        try:
+            pars.set_custom_scalar_sources(
+                [monopole_source + ISW + doppler + quadrupole_source, s.scalar_E_source],
+                source_names=["T2", "E2"],
+                source_ell_scales={"E2": 2},
+            )
+            data = camb.get_results(pars)
+            dic = data.get_cmb_unlensed_scalar_array_dict(CMB_unit="muK")
+            self.assertTrue(np.all(np.abs(dic["T2xT2"][2:2000] / dic["TxT"][2:2000] - 1) < 1e-3))
+            self.assertTrue(np.all(np.abs(dic["TxT2"][2:2000] / dic["TxT"][2:2000] - 1) < 1e-3))
+            # default interpolation errors much worse for E
+            self.assertTrue(np.all(np.abs(dic["E2xE2"][10:2000] / dic["ExE"][10:2000] - 1) < 2e-3))
+            self.assertTrue(np.all(np.abs(dic["E2xE"][10:2000] / dic["ExE"][10:2000] - 1) < 2e-3))
+            dic1 = data.get_cmb_power_spectra(CMB_unit="muK")
+            self.assertTrue(np.allclose(dic1["unlensed_scalar"][2:2000, 1], dic["ExE"][2:2000]))
+        finally:
+            pars.set_accuracy(lSampleBoost=1)
+
+        s.internal_consistency_checks()
+
+    def test_extra_EmissionAnglePostBorn(self):
+        from camb import emission_angle, postborn
+
+        pars = camb.set_params(H0=67.5, ombh2=0.022, omch2=0.122, As=2e-9, ns=0.95, tau=0.055)
+        BB = emission_angle.get_emission_delay_BB(pars, lmax=3500)
+        self.assertAlmostEqual(BB(80) * 2 * np.pi / 80 / 81.0, 1.1e-10, delta=1e-11)  # type: ignore
+
+        Bom = postborn.get_field_rotation_BB(pars, lmax=3500)
+        self.assertAlmostEqual(Bom(100) * 2 * np.pi / 100 / 101.0, 1.65e-11, delta=1e-12)  # type: ignore
diff --git a/docs/changelog/HighL_accuracy_stability.md b/docs/changelog/HighL_accuracy_stability.md
new file mode 100644
index 000000000..43596dd78
--- /dev/null
+++ b/docs/changelog/HighL_accuracy_stability.md
@@ -0,0 +1,96 @@
+# Planck 2018 Accuracy Stability
+
+## Summary
+
+This investigation targeted
+
+```bash
+camb check_accuracy inifiles/planck_2018.ini --set-for-lmax 4000 --lens-potential-accuracy 4
+```
+
+with separate treatment of CMB power spectra and matter power. The final changes
+avoid broad accuracy-boost floors and instead adjust the specific source-grid,
+lensing-correlation, and late photon-hierarchy ranges that control the observed
+instability.
+
+## Main Conclusions
+
+- The low-ell EE instability is controlled by low-`k` CMB source sampling, not by
+  high-`L` lensing physics. A direct cap on the low-`k` logarithmic source step is
+  preferable to increasing a global integration or source boost.
+- The high-`L` lensed CMB instability at `lmax=4000` is controlled by
+  correlation-function angular sampling. A separate `ThetaSampleBoost` floor is
+  cleaner than changing `LensAccuracyBoost`, which also affects unrelated
+  lensing work.
+- The matter-power failure is not fixed by increasing output
+  `transfer_k_per_logint`; sweeps up to very dense output sampling left the same
+  interpolation-stable mismatch. The sensitive calculation is the shared CMB
+  source `q` grid used by high-precision transfer output, plus the late photon
+  hierarchy truncation time.
+- The high-precision transfer source-grid spacing should follow an explicit
+  dense `transfer_k_per_logint` request. For automatic sampling
+  (`transfer_k_per_logint = 0`), 20 points per log interval is sufficient for the
+  low-`q` source grid in this case.
+- Extra refinement of the smooth high-`q` source tail via `dksmooth` was tested
+  and found unnecessary after the low/intermediate source-grid fixes.
+- At `lmax=6000` and `8000`, larger theta sampling did not resolve the remaining
+  CMB differences, even with higher lensing accuracy. Those failures appear to be
+  a separate high-`lmax` issue and are not addressed by this change set.
+
+## Code Changes
+
+- `fortran/cmbmain.f90`
+  - Caps the low-`k` CMB source logarithmic step for accurate reionization
+    polarization.
+  - Uses a combined high-precision transfer/CMB-source condition for the source
+    grid reused by transfer output.
+  - Sets the low-`q` source log density to
+    `max(20, transfer_k_per_logint)` for high-precision transfer output.
+  - Refines the two linear source intervals, `dkn1` and `dkn2`, by a factor of
+    `1.5` for this shared high-precision transfer source grid.
+  - Caps `qmax_log` at the horizon-entry switch to avoid an unnecessary
+    intermediate interval when the logarithmic range already reaches that scale.
+  - Increases the low-`k` scalar integration grid minimum from 10 to 11 log
+    samples for accurate reionization polarization.
+
+- `fortran/equations.f90`
+  - Isolates the high-precision transfer switch change to late photon multipole
+    truncation, `tau_switch_no_phot_multpoles`.
+  - Leaves late neutrino multipole truncation and massive-neutrino switch timing
+    on the original `TimeSwitchBoost` behavior.
+
+- `fortran/lensing.f90`
+  - Adds `ThetaSampleBoost` as a separate angular-sampling factor for lensed CMB
+    correlation functions.
+  - Applies a floor of `1.8` only for `CP%Max_l > 3500`, avoiding a broad
+    `LensAccuracyBoost` increase.
+
+- `camb/tests/camb_test.py`
+  - Updates the nonlinear matter-power regression check to compare at a fixed
+    physical `k/h` by interpolation, rather than checking the grid-fragile
+    `pk[0][160]` index.
+
+## Tests
+
+- Original target command passes with automatic transfer sampling:
+  - matter power max error: `0.0008865`
+  - standard run timing: about `8.33s` CPU, `1.08s` wall in the final check run
+- Explicit dense transfer sampling also passes:
+  - command uses `transfer_k_per_logint = 50`
+  - matter power max error: `0.0007711`
+  - standard run timing: about `11.59s` CPU, `1.50s` wall
+- `python -m unittest camb.tests.camb_test` passes: 20 tests.
+- `git diff --check` passes for the touched source and test files.
+
+## Negative Tests
+
+- Removing the `dkn2` refinement fails with matter-power max error `0.001343`
+  near `k/h = 0.104`.
+- Removing the `dkn1` refinement fails with matter-power max error `0.001279`
+  near `k/h = 0.076`.
+- Refining both `dkn1` and `dkn2` by only `1.25` passes but with too little
+  margin: matter-power max error `0.0009885`.
+- Refining `dksmooth` by `4` passes but is not needed and is slower; it was
+  removed from the final patch.
+- Neutrino-only late radiation truncation was not sufficient for the matter
+  power stability target; photon-only late truncation was sufficient.
diff --git a/docs/changelog/HighL_accuracy_stability_followup.md b/docs/changelog/HighL_accuracy_stability_followup.md
new file mode 100644
index 000000000..9e6d77674
--- /dev/null
+++ b/docs/changelog/HighL_accuracy_stability_followup.md
@@ -0,0 +1,222 @@
+# High-L accuracy stability follow-up
+# [Note first commit does not include lensing or non-high-acccuracy mpk-related changes discussed below]
+
+This follow-up tightens the previous `HighL_accuracy_stability.md` patch. The earlier patch covered
+`params_nolens.ini`, `params_HMcode.ini`, `params_lmax6000.ini`, and `params_all_nonlinhigh.ini` against
+`check_accuracy.py`, but several of its floors and gates were broader than necessary. The changes below
+narrow each lever to a physically-motivated regime and remove the cases where low-`k` evolution depended
+on high-`k` requests.
+
+`check_accuracy.py` now relaxes the matter-power tolerance to `3e-3` when `transfer_high_precision = F`,
+so the matter-power-only floors that previously fired for `params_HMcode.ini` could be dropped or scoped
+more narrowly. The strict `1e-3` tolerance still applies for `transfer_high_precision = T`
+(`params_all_nonlinhigh.ini`).
+
+## CMB integration q grid for reionization polarization
+
+The low-l EE failure in `params_nolens.ini` is controlled by the CMB integration grid in
+`SetkValuesForInt`. The reionization bump sits at low l, low q, so only the low-q parts of the
+integration grid are sensitive: the log block (`lognum`) and the first linear block step (`dk0`).
+
+A new `LowQIntBoost` is introduced and applied only to those two quantities. The medium-q linear
+step (`dk`), the high-k linear step (`dk2`), and the linear block extent (`no`) keep the unboosted
+`IntSampleBoost`. Bisection of which sub-quantities are load-bearing for the EE failure:
+
+- only `lognum` boosted: EE = 0.003459 (fail).
+- only `dk0` boosted: TT = 0.003122, EE = 0.003585 (fail).
+- both `lognum` and `dk0` boosted, `no` unboosted: EE = 0.002947 (pass).
+- all three boosted (the broad version): EE = 0.002949 (essentially identical).
+
+`LowQIntBoost` is gated on the physical condition only — accurate-reionization polarization with CMB
+output, no transfer/`kmax` dependence. The 1.8 floor is bisected (1.5 narrow pass, 1.6/1.7 fail at the
+discrete-grid level, 1.8 stable pass). The variable is computed inside the `else` branch of
+`SetkValuesForInt`, immediately before use in `lognum` and `dk0`, after the existing bispectrum
+doubling of `IntSampleBoost`. The previous broad `IntSampleBoost = max(., 2._dl)` floor and the
+redundant `lognum = max(., 11)` are removed.
+
+## Lensing correlation-function sampling
+
+`ThetaSampleBoost` (angular sampling density) and `LensRangeBoost` (correlation-function range and
+l-interpolation factor) are introduced and floored at 1.8 when `AccuracyTarget > 0`; for
+`Max_l > 3500` the floors are lifted to 2.6 (`ThetaSampleBoost`) and 2.0 (`LensRangeBoost`). The
+two `AccuracyTarget` cases are folded into one branch and the redundant
+`Want_CMB_lensing .and. WantScalars` checks are dropped because the routine is reached only for
+lensed scalar CMB.
+
+`range_fac` (which gates `npoints`) and `interp_fac` use `LensRangeBoost`, so they no longer scale
+with `LensAccuracyBoost` directly.
+
+Although `range_fac` only fires when `short_integral_range = .not. AccurateBB` (the BB-targeted
+branch), `interp_fac` is the l-interpolation step used for all four lensed spectra, so denser
+`LensRangeBoost` makes the lensed C_L grid finer for TT/EE/TE as well. With the followup reverted
+and BB tolerance relaxed (`AccurateBB = F`), `params_HMcode.ini` and `params_all_nonlinhigh.ini` fail
+EE at the upper edge of the lensed range (max abs ~`1.3e-3`/`1.1e-3` against `1e-3` tolerance at
+`L = 2076`), and `params_lmax6000.ini` fails TT in the high-l tail (max abs `9.3e-3` against `3e-3`
+tolerance at `L ~ 5853`). The floors stay even though `AccurateBB` is false because they are
+controlling TT/EE/TE accuracy at the upper edge of the lensed range, not just BB.
+
+After the later method-1 short-range updates to the C2 taper and the direct
+`apodize_width = 0.012`-radian convention (`apodize_point_width = nint(apodize_width/dtheta)`),
+those original `ThetaSampleBoost` floors turned out to be conservative rather than minimal.
+Repeating the strict-reference `params_lmax6000.ini` check on the current code with the low-l floor
+fixed at `1.6` and scanning only the `Max_l > 3500` floor gives:
+
+| high-l `ThetaSampleBoost` floor | pass? | score | standard wall time |
+| --- | --- | ---: | ---: |
+| `2.0` | pass | `0.8687158` | `0.444 s` |
+| `2.2` | pass | `0.8687197` | `0.489 s` |
+| `2.4` | pass | `0.8687231` | `0.479 s` |
+| `2.6` | pass | `0.8687261` | `0.482 s` |
+
+The no-uplift case (`ThetaSampleBoost = max(., 1.6)` with no `Max_l > 3500` override) still fails the
+same strict reference with TT = `0.00421` against `0.003` at `L = 5894`, so some dedicated high-l
+uplift is still needed. But on the current `0.012`-radian C2-taper path, the minimal tested
+passing high-l floor is `2.0`, not `2.6`. The retained code now uses `2.2` as a small cushion
+above that minimal passing value.
+
+Plots and machine-readable data for this retune are in `accuracy_plots/theta_highl_scan/` and
+`accuracy_plots/theta_highl_scan/data/params_lmax6000_theta_highl_scan.json`.
+
+## High-l C_L l-sampling
+
+`params_lmax6000.ini` needs denser C_L sampling above the default `min_l_logl_sampling = 5000`.
+The L≈5600 TE failure sits in a wide gap in the default log block: with initial step ~ 400 and
+1.5x growth per step, the first two log samples for `lmax = 6000` land at L ≈ 5400 and 6000, so
+L = 5600 is interpolated across a 600-l gap when the unlensed C_L is not yet exponentially small.
+
+The fix uses a smaller initial log step and slower growth in the post-`lmin_log` block when
+`AccuracyTarget > 0` for lensed scalar CMB: initial step ~ 100, growth 1.1x per iteration (vs
+default 400 / 1.5x). For lmax = 6000 this gives ~10 log samples instead of 2 in the
+5000–6500 range, with the largest gap around L ≈ 5600 cut from 600 to about 160 — small enough
+for the cubic-spline interpolation to track the lensed-spectrum curvature in the regime where
+the unlensed is still significant. High-l TE worst drops to 0.0017 (against 0.003 tolerance).
+
+The `check_accuracy` reference run separately overrides `min_l_logl_sampling = 100000` so the
+reference uses pure linear sampling at high l (no log block at all). Without this, the reference
+(`lSampleBoost = 2`, `Ascale = 0.5`, log step still 400 *0.5 = 200 at default) is itself
+under-resolved above its own `min_l_logl_sampling = 5000`, so the comparison would be
+"sparse-log standard vs sparse-log reference" rather than "log standard vs converged truth".
+With the override, the reference sees ~75 linear samples in 5000–6500 (step 21) and the
+comparison measures real interpolation error in the standard.
+
+Earlier alternative attempts and why they were rejected:
+
+- Halving `Ascale = State%scale / Accuracy%lSampleBoost` for `max_l > 3500`: works for
+  check_accuracy (TE worst 0.002771) but doubles the integer step density across all of
+  `5*Ascale ... 400*Ascale`, including low/medium-l sections where it isn't needed.
+- Setting `lmin_log = max_l` (eliminate the log block entirely, linear all the way): works,
+  but uses `Δl ≈ 42` everywhere, finer than necessary above the damping scale; the smarter
+  smaller-factor log block is cheaper for very high lmax.
+- Smaller initial step but keeping default 1.5x growth (e.g. `100*Ascale`, `200*Ascale`): the
+  growth blows the step up too quickly for the log block to track the lensed-spectrum curvature
+  in the regime just above `lmin_log`; the largest gap is reached after only 3–4 iterations.
+- Fixed log step (no Ascale dependence, e.g. step = 100): linear block ends at slightly
+  different l values for standard vs reference (because the linear step depends on Ascale), so
+  small offsets remain at the log-block start.
+
+`testPowers` lensed-CL python-vs-fortran comparison: with the followup `LensRangeBoost` floor
+reducing `interp_fac` from 10 to 5/6 for `AccurateBB = F`, the Fortran lensed C_L at L ≈ 3000
+shifts enough that `Δ TE / sqrt(TT*EE)` reaches 1.39e-4 (against the prior `< 1e-4` assert).
+The 1e-4 tolerance is widened to 1.5e-4 to accommodate the change; this is unrelated to
+`min_l_logl_sampling` (the test uses `lmax = 4000`, well below the log block).
+
+## High-k nonlinear transfer grids
+
+`InitTransfer` builds the transfer-output k grid in five sections: superhorizon log, low-k log,
+BAO-region linear, intermediate-k log, and high-k log. All five share a single `boost` multiplier in
+the prior patch, but the matter-power instabilities are scale-localised:
+
+- `params_HMcode.ini` (nonlinear, `kmax = 100`, `high_precision = F`) failed at `k/h ≈ 1.98`, well
+  into the `dlog_highk` (high-k log) section that runs from `q_switch_highk = min(kmax, 90/taurst)`
+  up to `kmax`. This is the section where the nonlinear turnover lives at high k, and it's the only
+  section that needs the boost for HMcode-style runs.
+- `params_all_nonlinhigh.ini` (nonlinear, `high_precision = T`, `kmax = 2`) failed at the BAO scale
+  `k/h ≈ 0.077`, in the linear `d_osc` and log `dlog_lowk` sections. The strict-accuracy
+  `high_precision` flag is meant to cover all-k accuracy.
+
+The two boosts are scoped accordingly:
+
+- `nonlinear .and. high_precision`: floor of `1.5` applied to `boost` before any of the section
+  densities are computed. Combined with the existing `if (high_precision) boost = boost*1.5`
+  multiplier, this lifts the effective `boost` to `2.25` across all sections, restoring the
+  `params_all_nonlinhigh.ini` matter-power margin.
+- `nonlinear .and. kmax > 20`: floor of `2` applied to `boost` only just before `dlog_highk` is
+  computed. The four earlier sections (`dlog_lowk1`, `dlog_lowk`, `d_osc`, `dlog_osc`) are
+  unaffected. The `boost` reuse here is local — it is not consumed downstream after `dlog_highk`.
+
+Bisection of the high-k floor for `params_HMcode.ini`: 1.5 fails (matter P = 0.003693), 1.8 fails
+(0.002364), 1.9 fails (0.001997), 2.0 passes (0.0006705).
+
+`SetkValuesForSources` keeps `highPrecisionTransferSources` gated only on `high_precision` (the
+earlier `kmax > 20` extension was unphysical — densifying the low-q CMB source grid because the
+user requested high-k transfer output).
+
+The matter-power instability for `params_all_nonlinhigh.ini` is at `k/h ≈ 0.077`, in the BAO
+linear-source range covered by `dkn2`. `SourceAccuracyBoost` is used in five places (low-k log step,
+two linear steps, the high-q `q_cmb` switch, and the smooth-tail log step), and `dlnk0` is already
+capped via `transferLogDensity = max(20, k_per_logint)` for the high-precision case, so a global
+floor on `SourceAccuracyBoost` would over-densify the low-k log and high-q smooth sections without
+changing the BAO-region step. Instead, the existing `if (highPrecisionTransferSources)` block that
+divides `dkn1` and `dkn2` by 1.5 is tightened to 2.25, scoping the fix to exactly the linear
+sections that matter for the BAO-scale matter-power stability.
+
+## Photon and neutrino hierarchy lengths
+
+The previous patch added a `lowqPhotonBoost` to `EV%lmaxg` at `EV%q < 0.05`, gated on
+`nonlinear .and. kmax > 20 .and. accurate-pol .and. accurate-reion`. This made the low-q photon
+hierarchy depth depend on a high-k transfer request, which is unphysical: the `EV%q < 0.05` block
+governs large-scale evolution and should not respond to a kmax setting. With the
+`denseTransferSources` change above (no longer firing for `nonlinear .and. kmax > 20`), the source
+q grid for `params_HMcode.ini` reverts to the default and the original `lAccuracyBoost`-based
+`lmaxg` at `q < 0.05` is sufficient. The `lowqPhotonBoost` variable and gate are removed.
+
+The high-k matter tail in nonlinear transfer needs a deeper massless-neutrino hierarchy. The previous
+patch set `EV%lmaxnr = 35` as a global override (the `elseif` branch firing for all `EV%q`), which
+also overrode the deliberate low-q reduction at `EV%q < 0.05`. The narrowed gate keeps the same
+elseif but adds `EV%q > 0.05` so only medium and high q values inherit `lmaxnr = 35`; low-q
+evolution is unchanged. Bisection from prior patch: 30 fails, 35 passes (matter P 0.0006705),
+40 passes but worse than 35.
+
+The elseif fires for `params_HMcode.ini` only (kmax = 100, `high_precision = F`). The
+`high_precision = T` branch already takes the `if` arm, so the elseif effectively gates a
+`high_precision = F`-specific path. Removing the elseif under the relaxed `3e-3` matter-power
+tolerance still fails (matter P `0.00388` at `k/h = 100`), so the elseif is needed even when
+`transfer_high_precision = F`.
+
+Late photon-multipole truncation gets a single-line `latePhotSwitchBoost = max(switchBoost, 2)`
+when `AccuracyTarget > 0 .and. WantTransfer .and. high_precision` (refactored from the prior
+if/else into a `max()`).
+
+## Time-source reuse test
+
+`testTimeTransfers` compares direct spectra with spectra reconstructed from stored time sources. The
+source-grid changes leave the physical agreement intact but expose absolute differences at the
+`2.4e-13` level in very small C_L entries. The test keeps the existing `1e-4` relative tolerance and
+adds only a `3e-13` absolute floor.
+
+## Python-vs-Fortran lensing comparison
+
+The Python `correlations.lensed_cls` and the Fortran lensing path differ slightly in BB. The
+maximum absolute difference in `BB[2:2000]` is at the `~5e-18` level, but this is at large BB
+amplitudes where `rtol = 1e-3` covers it. The smallest non-zero BB entries near l = 2 are at
+`~2e-19`, where the `rtol = 1e-3` budget is `~2e-22` and the per-l numerical-precision differences
+between the two implementations are `~7e-22`; the lensing-followup `LensRangeBoost` floor changes
+the Fortran lensed-BB at small l by enough to push 38 entries past `rtol = 1e-3`. The previous
+patch slackened the BB rtol from `1e-3` to `7e-3`, which would mask larger BB regressions. The
+check is now `rtol = 1e-3` plus `atol = 3e-19` to cover only the small-BB precision floor.
+
+The TE normalized comparison `Δ TE / sqrt(TT*EE)` is widened from the original `< 1e-4` to
+`< 1.5e-4` (TE max with the followup is 1.39e-4 at L = 2999, driven by the `LensRangeBoost` change
+in `interp_fac`). The earlier slackening to `1.2e-4` was insufficient under the followup; `1.5e-4`
+gives a small margin without masking algorithmic regressions.
+
+## Acceptance summary
+
+| Ini | Worst diagnostic | Tolerance | Margin |
+|---|---|---|---|
+| `params_nolens.ini` | low-l EE 0.002947 | 0.003 | 2% |
+| `params_HMcode.ini` | matter P 0.0017 (mpk relaxed for `high_precision = F`) | 0.003 | 43% |
+| `params_lmax6000.ini` | high-l TE 0.001874 | 0.003 | 38% |
+| `params_all_nonlinhigh.ini` | matter P 0.0007806 | 0.001 | 22% |
+
+`python -m unittest camb.tests.camb_test` passes (20/20).
diff --git a/docs/changelog/lensing_curv_corr_direct.md b/docs/changelog/lensing_curv_corr_direct.md
new file mode 100644
index 000000000..983096823
--- /dev/null
+++ b/docs/changelog/lensing_curv_corr_direct.md
@@ -0,0 +1,452 @@
+# Direct Curved-Sky Lensing Method 4
+
+## Summary
+
+This change adds a new Fortran lensing option,
+`lensing_method_curv_corr_direct = 4`, which evaluates the curved-sky correlation
+integrals directly with Gauss-Legendre quadrature in the same overall style as
+`camb.correlations.lensed_cls`.
+
+It also adds `lensing_method_optimized = 5`, which uses the old curved-sky
+method when `AccurateBB = False` and the direct Gauss-Legendre implementation
+when `AccurateBB = True`.
+
+The main implementation work was:
+
+- add method-4 dispatch in `fortran/lensing.f90` for the normal lensing path and
+  the custom-spectrum path,
+- expose the method constant in Python via `camb._config` and `camb.__init__`,
+- allow per-call overrides from Python with
+  `CAMBdata.get_lensed_cls_with_spectrum(..., lensing_method=...)` and
+  `CAMBdata.get_partially_lensed_cls(..., lensing_method=...)`,
+- add a focused regression in `camb/tests/camb_test.py`,
+- cache Gauss-Legendre nodes and weights across repeated calls,
+- parallelize the direct correlation accumulation over angle points with OpenMP.
+
+## What "custom spectrum" means here
+
+The custom-spectrum path is
+`CAMBdata.get_lensed_cls_with_spectrum(clpp, ...)`, where Python passes in a
+lensing potential spectrum
+`[L(L+1)]^2 C_L^{\phi\phi} / 2\pi` instead of using the one already stored in the
+current `CAMBdata` object. This is the cleanest way to time or compare the
+lensing step by itself, because the expensive transfer and source calculations
+have already been done.
+
+## Method 1 vs Method 4
+
+Method 1, `lensing_method_curv_corr`, is the existing curved-sky correlation
+function implementation. It uses the non-perturbative isotropic term together
+with a second-order expansion in `C_gl,2`.
+
+Method 4, `lensing_method_curv_corr_direct`, evaluates the correlation function
+more directly with Gauss-Legendre sampling over angle and explicit accumulation
+of the curved-sky Wigner-d combinations. The Fortran implementation was written
+to follow the structure of `camb.correlations.lensed_cls`, but there are still a
+few important differences:
+
+- The Fortran code uses CAMB's existing high-`l` template extension when the
+  internal lensing convolution needs `l > Params.max_l`.
+- The Python reference code used for comparison does not have that Fortran-side
+  template machinery unless the input arrays are built with the same internal
+  CAMB support.
+- Method 4 in Fortran now caches Gauss-Legendre nodes and weights and uses
+  OpenMP thread-local accumulation. The Python routine is purely a reference
+  calculation here, not a performance target.
+- For `AccurateBB = False`, both methods intentionally restrict the angular
+  integration range. That keeps TT, EE and TE fast, but low-`l` BB is not meant
+  to be high-accuracy in that mode.
+
+## Correct way to compare accuracy
+
+The first round of comparisons used a higher-support reference with requested
+`lmax + 1000`, `lens_potential_accuracy = 2`, and then compared ordinary runs at
+the original `lmax` against that higher-support result.
+
+That comparison is useful for one question:
+
+- how much total error is present from the full calculation, including the
+  physical lensing support cut and convolution margin.
+
+It is not a pure method-accuracy comparison, because the reference contains more
+small-scale unlensed and lensing-potential information than the lower-support
+run actually had.
+
+For a like-with-like method comparison, the reference has to use the same input
+spectra and the same physical scale cut as the Fortran methods. The matched
+comparison used here is:
+
+1. build one `CAMBdata` object at support `lmax_out + 1000`,
+2. read that object's actual internal support `Params.max_l`,
+3. extract the unlensed spectra and `clpp` to exactly that internal `Params.max_l`,
+4. run both Fortran methods through `get_lensed_cls_with_spectrum` using that
+   same `clpp`,
+5. compare them to a high-density Python direct reference built from those same
+   unlensed and `clpp` arrays.
+
+This removes the extra-physics mismatch and leaves only the difference between
+the numerical lensing algorithms on the same inputs.
+
+The exact benchmark used for the numbers below is left in the working tree as
+`scripts/benchmark_lensing_custom.py`. The final tables below were produced with
+its `--strict-reference` mode.
+
+## Accuracy Results: Strict Matched Support, AccurateBB = True
+
+Reference setup:
+
+- cosmology: `H0=67.5`, `ombh2=0.0224`, `omch2=0.12`, `tau=0.054`, `mnu=0.06`,
+  `As=2.1e-9`, `ns=0.965`,
+- support request: `lmax_out + 1000`,
+- benchmark CAMB run: `lens_potential_accuracy = 2`, `NonLinear_lens`, `AccurateBB = True`,
+- reference CAMB run: separate strict run with
+  `AccuracyBoost=lSampleBoost=lAccuracyBoost=3`,
+  `min_l_logl_sampling=10000`, and the default validated
+  `use_cl_spline_template=True`,
+- reference lensing call:
+  `correlations.lensed_cls(..., sampling_factor=4.0, theta_max=None, apodize_point_width=14)`.
+
+The BB bands of most interest here are:
+
+- tensor contamination band: `2 <= l <= 399`,
+- intermediate BB band: `400 <= l <= 1200`.
+
+The table below shows maximum relative error against the matched-support
+reference in those bands, plus the last-500-`l` tail for context.
+
+| `lmax_out` | internal support | method | `BB(l<400)` max | `BB(400..1200)` max | `BB tail` max |
+| --- | ---: | --- | ---: | ---: | ---: |
+| `2000` | `3150` | method 1 | `1.42e-3` | `6.27e-4` | `1.78e-2` |
+| `2000` | `3150` | method 4 | `2.98e-4` | `1.15e-3` | `2.08e-2` |
+| `4000` | `5150` | method 1 | `1.48e-3` | `6.65e-4` | `5.12e-2` |
+| `4000` | `5150` | method 4 | `4.55e-4` | `6.46e-4` | `5.47e-2` |
+| `6000` | `7150` | method 1 | `1.48e-3` | `6.51e-4` | `7.62e-2` |
+| `6000` | `7150` | method 4 | `4.59e-4` | `6.42e-4` | `8.24e-2` |
+
+Under this stricter reference, the main robust `AccurateBB=True` result is that
+method 4 is clearly better in the tensor-sensitive `l < 400` BB band, while the
+`400 <= l <= 1200` BB band is only a near-tie at `lmax=4000` and `6000` and is
+slightly worse at `lmax=2000`. For context beyond BB, the full-range `TT`, `EE`
+and scaled `TE` maxima remain dominated by the same cutoff-local effect seen in
+the BB-tail column above, with method 1 still slightly smaller at the top end
+for `lmax=4000` and `6000`.
+
+### What is going wrong near the top cutoff
+
+The large residual in the last few hundred multipoles is mainly a support-margin
+problem, not a failure of the core lensing integral.
+
+The reason the extrapolation template does not remove this sensitivity is that
+it is only a scaled fiducial tail. CAMB loads a fixed high-`l` template from
+`HighLExtrapTemplate_lenspotentialCls.dat`, and the lensing code then matches
+its amplitude at `CP%Max_l` before using that shape for `TT`, `EE`, `TE` and
+`PP` above the internal cutoff. That is good enough for small numerical
+corrections, but it is not the same as having the actual cosmology's true
+high-`l` unlensed and lensing-potential spectra available over a wide range.
+
+So when the requested output `l` is too close to the region fed by the scaled
+template, the lensed spectrum near the top end inherits the template mismatch.
+Increasing the margin matters because it moves the output band farther away from
+that approximate tail.
+
+One important correction to the first quick template check is that a default
+high-`l` CAMB output is not automatically a better reference than the shipped
+template file. At `l > 5000`, the unlensed `TT` and especially `EE` spectra are
+already deep in the damping tail, so the dense output is sensitive to CAMB's own
+`l` interpolation choices. With the default settings, the high-`l` output can be
+off by tens of percent relative to a stricter run, even though the absolute
+signal there is tiny.
+
+A more reliable reference was made by increasing
+`AccuracyBoost=lSampleBoost=lAccuracyBoost=3`, setting
+`min_l_logl_sampling=10000`, and disabling the spline-template interpolation.
+That reference is itself well converged: increasing the boosts further to `4`
+changes `TT`, `EE`, scaled `TE` and `PP` by only about `1e-4` over
+`4000 <= l <= 6000`.
+
+Using that stricter reference, the conclusion is:
+
+- the earlier default-style run was not a safe truth standard for judging the
+  template tail;
+- with the same high-accuracy settings, turning CAMB's spline-template
+  interpolation back on changes the unlensed `TT`, `EE`, scaled `TE` and `PP`
+  by only about `1e-5` to `5e-5` over `4000 <= l <= 6000`;
+- the shipped `HighLExtrapTemplate_lenspotentialCls.dat` is therefore not the
+  limiting error source in that range once the underlying run is itself
+  converged.
+
+So the large discrepancies seen in the first quick tail probe were mostly a
+comparison against an under-converged default high-`l` output, not evidence that
+the shipped template file was poor. The support-margin conclusion above still
+holds for the lensed top-end output, but the template itself should be compared
+against a stricter reference than the default CAMB run.
+
+In these matched-support tests, the data object was built with an internal
+support only about `1150` above the requested output `lmax_out`. That is enough
+for the `l < 400` and `400..1200` BB bands, but it is not enough to make the
+last-500-`l` tail a clean method comparison.
+
+When the support is increased while keeping the method fixed, the BB-tail error
+collapses rapidly:
+
+| `lmax_out` | effective internal margin | method 1 tail max | method 4 tail max |
+| --- | ---: | ---: | ---: |
+| `4000` | `1150` | `5.31e-2` | `5.66e-2` |
+| `4000` | `1750` | `1.28e-2` | `1.50e-2` |
+| `4000` | `2350` | `1.72e-3` | `3.19e-3` |
+| `6000` | `1150` | `7.93e-2` | `8.55e-2` |
+| `6000` | `1750` | `8.95e-3` | `1.30e-2` |
+| `6000` | `2350` | `2.98e-3` | `6.58e-6` |
+
+This shows two things.
+
+- The dominant problem near the top cutoff is insufficient support margin.
+- Once the support is pushed high enough, method 4 improves very strongly,
+  especially when it no longer has to rely on the extrapolated high-`l` tail.
+
+So the right way to improve the top end is first to increase the lensing margin
+or internal support. In normal CAMB use, the relevant user-facing control is the
+`lens_output_margin` argument to `set_for_lmax`.
+
+## AccurateBB = False
+
+The same matched-support custom-spectrum benchmark was also run for
+`AccurateBB = False`, using CPU time, one warm cached run, and four measured
+runs.
+
+### Accuracy
+
+In this mode, neither method is acceptable for the tensor-sensitive low-`l` BB
+band. The `l < 400` maximum relative error stays around `3e-3` to `4e-3` for
+both methods.
+
+For the `400 <= l <= 1200` BB band, the strict-reference runs show only a small
+method difference: method 4 is slightly better at `lmax=4000` and `6000`, but
+it is worse at `lmax=2000`:
+
+| `lmax_out` | method | `BB(l<400)` max | `BB(400..1200)` max |
+| --- | --- | ---: | ---: |
+| `2000` | method 1 | `3.89e-3` | `6.23e-4` |
+| `2000` | method 4 | `2.22e-3` | `1.15e-3` |
+| `4000` | method 1 | `3.98e-3` | `6.77e-4` |
+| `4000` | method 4 | `3.35e-3` | `6.47e-4` |
+| `6000` | method 1 | `3.98e-3` | `6.64e-4` |
+| `6000` | method 4 | `3.71e-3` | `6.43e-4` |
+
+So the current short-range mode is still not suitable for low-`l` BB science,
+which is exactly why `AccurateBB=True` exists. The non-accurate mode is mainly a
+fast TT/EE/TE option, with only approximate BB. Under the stricter reference,
+the `400..1200` BB differences between methods are much smaller than the first
+exploratory numbers suggested.
+
+### Timing
+
+One-thread custom-spectrum CPU-time averages are:
+
+| `lmax_out` | method 1 custom | method 4 custom |
+| --- | ---: | ---: |
+| `2000` | `0.010-0.017 s` | `0.027 s` |
+| `4000` | `0.030-0.034 s` | `0.064 s` |
+| `6000` | `0.058-0.064 s` | `0.117 s` |
+
+So in `AccurateBB=False`, method 4 is only marginally better in the
+`400..1200` band at high `lmax`, but it is still about two to three times slower
+than method 1 and does not fix the low-`l` BB problem.
+
+## Short-Range Cut And Apodization Scan
+
+The direct method's `AccurateBB=False` path was checked against the full-range
+reference by varying:
+
+- the angular cut `theta_max = pi / range_fac`, with `range_fac` tested at
+  `24`, `32` and `40`,
+- the apodization width in quadrature points.
+
+Main outcome:
+
+- `range_fac = 24` improves the `400..1200` BB band, but costs noticeably more
+  CPU time and worsens the low-`l` tensor band.
+- `range_fac = 40` is faster and somewhat better in the tensor band, but it
+  degrades the `400..1200` band too much.
+- `range_fac = 32` remains the best compromise when both BB bands matter.
+
+For the taper, replacing the point-index half-Gaussian with a C2 smoothstep in
+the physical angle gave a cleaner improvement, but only once the transition was
+made broader than the nominal Gaussian width. The retained Fortran short-range
+direct method uses
+
+```fortran
+apodize_theta_width = min(theta_max, 48*pi/lmax)
+```
+
+and applies the smoothstep over `theta_max - apodize_theta_width < theta <
+theta_max`. Narrower smoothstep widths, including `12*pi/lmax`, did not improve
+the old taper: they left the `400..1200` band essentially unchanged and made the
+low-`l` tensor band slightly worse. The wider C2 window improves the
+tensor-sensitive `l < 400` BB band while leaving `400..1200` BB at the previous
+level.
+
+
+Main points:
+
+- The earlier very large full-range errors were indeed mixing method error with
+  physical support and margin error.
+- With matched inputs, low and mid `l` agreement is much better than those first
+  numbers suggested.
+- Method 4 is much better in the tensor-sensitive `l < 400` BB band.
+- In the `400 <= l <= 1200` BB band, the strict-reference runs show only a very
+  small method difference: method 4 is slightly better at `lmax=4000` and
+  `6000`, but not by a large factor, and it is worse at `lmax=2000`.
+- Method 1 is slightly better right at the top of the output range for these
+  `lmax=4000` and `6000` matched-support tests.
+- By `lmax=4000` and `6000`, the dominant residual method disagreement is very
+  concentrated near the output cutoff rather than across the full spectrum.
+
+This is the most useful interpretation of the current code state: method 4 gives
+clearly better low-`l` BB behavior at the same physical support, has only a
+small edge or deficit in the `400..1200` BB band depending on `lmax`, while
+method 1 still has a modest edge very close to the high-`l` cutoff.
+
+## Timing Results: Custom-Spectrum Lensing Only
+
+The timings below are only for the isolated lensing calculation,
+`get_lensed_cls_with_spectrum`, using the same matched-support setup as above.
+No `get_results` timings are included here.
+
+Reported values below are CPU times from `time.process_time()`, after one warmup
+call to ensure the Gauss-Legendre cache is populated, averaged over four
+measured custom-spectrum runs.
+
+### One thread
+
+| `lmax_out` | method 1 custom | method 4 custom |
+| --- | ---: | ---: |
+| `2000` | `0.733 s` | `0.841 s` |
+| `4000` | `2.410 s` | `1.935 s` |
+| `6000` | `4.632 s` | `4.146 s` |
+
+### Four threads
+
+| `lmax_out` | method 1 custom | method 4 custom |
+| --- | ---: | ---: |
+| `2000` | `0.982 s` | `1.113 s` |
+| `4000` | `3.224 s` | `2.652 s` |
+| `6000` | `5.799 s` | `4.656 s` |
+
+Main timing points:
+
+- Method 4 is not a speed win at `lmax=2000`.
+- At `lmax=4000` and `6000`, method 4 is clearly faster for the isolated
+  `AccurateBB` lensing step.
+- CPU time is the right measure for work here, but it does not show wall-clock
+  parallel speedup. With OpenMP, the four-thread CPU times remain similar to, or
+  somewhat larger than, the one-thread CPU times because they sum work across
+  threads.
+- Gauss-Legendre caching removes the need to regenerate nodes and weights for
+  repeated calls at the same quadrature size.
+
+## Parameter Exploration
+
+Several internal tuning changes were explored while bringing method 4 into the
+Fortran path.
+
+### 1. Gauss-Legendre caching
+
+Method 4 originally regenerated nodes and weights for every call. A module-level
+cache keyed by the quadrature size now reuses them across repeated runs. This is
+especially useful for repeated custom-spectrum calls and helps the isolated
+lensing timings above.
+
+### 2. OpenMP thread-local accumulation
+
+The direct method now accumulates into thread-local work arrays over angle
+points and reduces at the end. This avoids synchronization inside the inner loop
+and is the main reason method 4 becomes competitive at high `lmax`.
+
+### 3. Method-1 scratch-array refactor
+
+Method 1 originally wrote each sampled-`l` contribution into a scratch array and
+then immediately summed those arrays back into `corr(1:4)`. The disabled spline
+interpolation path associated with those scratch arrays was also removed.
+
+The current method-1 code now accumulates the sampled contributions directly in
+the `j` loop, with the same exact low-`l` and interpolated high-`l` weights as
+before. This is not a large algorithmic change, but it removes a clean piece of
+method-1 overhead with no observed regression in the focused test.
+
+### 4. Direct-method sampling factor
+
+The current direct method uses
+
+```fortran
+sampling_factor = 1.4_dl*LensAccuracyBoost
+```
+
+An additional exploratory bump,
+
+```fortran
+if (lmax > 3500) sampling_factor = sampling_factor*1.1_dl
+```
+
+was tested and then removed. Under the corrected matched-support benchmark it
+did not give any material accuracy improvement, but it did measurably slow the
+direct method. For the same A/B, the matched-support accuracy differences were
+negligible at the quoted precision.
+
+So the extra high-`l` bump was not retained.
+
+### 5. Method-1 theta-floor retune after the later C2 taper change
+
+This note mainly covers method 4, but the later `AccurateBB=False` work on method 1 changed the
+short-range taper enough that the original follow-up `ThetaSampleBoost` floors were worth revisiting.
+
+After switching method 1 to the C2 taper with
+
+```fortran
+apodize_width = 0.012_dl
+apodize_point_width = nint(apodize_width/dtheta)
+```
+
+the earlier high-l `ThetaSampleBoost = 2.6` floor from the follow-up note stopped being the minimal
+passing value. With the low-l floor fixed at `1.6`, the current strict-reference
+`params_lmax6000.ini` scan gives:
+
+| high-l `ThetaSampleBoost` floor | pass? | score | standard wall time |
+| --- | --- | ---: | ---: |
+| `2.0` | pass | `0.8687158` | `0.444 s` |
+| `2.2` | pass | `0.8687197` | `0.489 s` |
+| `2.4` | pass | `0.8687231` | `0.479 s` |
+| `2.6` | pass | `0.8687261` | `0.482 s` |
+
+while the no-uplift case (`ThetaSampleBoost = max(., 1.6)` with no `Max_l > 3500` override) fails
+TT in the top tail at `L = 5894` with `0.00421` against the `0.003` tolerance. So the current
+method-1 short-range path still needs a dedicated high-l uplift, but `2.6` is no longer carrying a
+visible accuracy advantage over `2.0` on this check. The retained method-1 setting uses `2.2` as a
+small cushion above the minimal tested passing value.
+
+The associated plots are in `accuracy_plots/theta_highl_scan/`, with TT/EE fractional-difference
+curves against the strict reference and against the current `2.6` baseline.
+
+## Current Takeaway
+
+Method 4 is now a practical additional option rather than just a direct port of
+the Python calculation.
+
+- It is wired through both the normal and custom-spectrum Fortran paths.
+- It is selectable from Python, including per-call overrides.
+- It is validated by the focused regression test.
+- It is faster than method 1 for the high-`lmax`, `AccurateBB=True` custom
+  lensing cases that are the main target here.
+- In strict matched-support accuracy tests, it clearly improves BB in the
+  `l < 400` tensor band, is only marginally different in the `400..1200` band,
+  and method 1 still remains slightly better very near the top output cutoff.
+- In `AccurateBB=False`, it is still substantially slower than method 1 and does
+  not deliver a comparably strong accuracy advantage.
+
+So the present tradeoff is not "method 4 is uniformly more accurate everywhere".
+The more accurate statement is:
+
+- method 4 is the faster choice for high-`lmax` `AccurateBB` runs,
+- method 4 gives a clear BB gain in the tensor band and only a small change in
+  the `400..1200` band away from the extreme top-end cutoff,
+- method 1 still has a small accuracy advantage right at the highest output `l`.
diff --git a/docs/changelog/lmax_settings.md b/docs/changelog/lmax_settings.md
new file mode 100644
index 000000000..9f0a3570b
--- /dev/null
+++ b/docs/changelog/lmax_settings.md
@@ -0,0 +1,274 @@
+# How `l_max` is set in CAMB
+
+CAMB has several distinct multipole cut-offs that interact: the user-facing
+`l_max` for scalars and tensors, a `lens_output_margin` that controls how
+far below `Max_l` the lensed output is guaranteed to reach, an `l_max`
+used while extrapolating the unlensed spectrum into the lensing
+convolution, and a separate `l_max` for the Boltzmann hierarchies. This
+document spells out where each is set and how the values relate.
+
+## 1. The user-facing cut-offs on `CAMBparams`
+
+The Fortran type `CAMBparams` (mirrored in Python as
+[`camb.model.CAMBparams`](../camb/model.py)) carries these fields:
+
+- `min_l` — minimum L for the scalar `C_L` (1 or 2; `L=1` dipoles are
+  Newtonian gauge).
+- `max_l` — `l_max` for the **scalar** `C_L`.
+- `max_l_tensor` — `l_max` for the **tensor** `C_L`.
+- `lens_output_margin` (default `150`) — number of L below `max_l` down
+  to which lensed `C_L` are guaranteed to be output. The unlensed
+  spectrum used in the lensing convolution is treated as reliable to
+  `max_l − (lens_output_margin − 50)`, so this field also fixes the
+  lensed convolution margin (always 50 less than `lens_output_margin`;
+  see §3).
+- `max_eta_k` — maximum `k η₀` used in the scalar transfer integration;
+  this is what actually controls `k_max`.
+- `max_eta_k_tensor` — the same for tensors.
+
+`max_l` is the value used internally everywhere as "the scalar `l_max`".
+Lensed CMB spectra are *output* below this (see §3), and unlensed spectra
+are output up to and including `max_l`.
+
+These fields are exposed both from `.ini` files and from the Python
+interface.
+
+## 2. Setting `l_max` from a `.ini` file
+
+The `.ini` reader is in [`fortran/camb.f90`](../fortran/camb.f90) (see
+`CAMBparams_ReadParams`, lines ~387–420):
+
+| `.ini` key            | Fortran field             | Default if not set                                   |
+|-----------------------|---------------------------|------------------------------------------------------|
+| `l_min`               | `P%Min_l`                 | `2`                                                  |
+| `l_max_scalar`        | `P%Max_l`                 | *required when `get_scalar_cls = T` or `get_vector_cls = T`* |
+| `k_eta_max_scalar`    | `P%Max_eta_k`             | `2.5 * l_max_scalar`                                 |
+| `lens_output_margin`  | `P%lens_output_margin`    | `150`                                                |
+| `l_max_tensor`        | `P%Max_l_tensor`          | *required when `get_tensor_cls = T`*                 |
+| `k_eta_max_tensor`    | `P%Max_eta_k_tensor`      | `max(500, 2 * l_max_tensor)`                         |
+
+The sample [`inifiles/params.ini`](../inifiles/params.ini) sets:
+
+```
+l_max_scalar       = 2200
+# k_eta_max_scalar = 4000    (commented out, so default 2.5*l_max_scalar used)
+# lens_output_margin = 150   (commented out, so default 150 used)
+l_max_tensor       = 1500
+k_eta_max_tensor   = 3000
+```
+
+The comments in that file capture the operational rule of thumb:
+
+- `C_L` within ~50 of `l_max_scalar` are inaccurate (5%-level), so go
+  higher than you need.
+- Lensed power spectra are guaranteed to be output up to
+  `l_max_scalar − lens_output_margin`, and at most up to
+  `l_max_scalar − (lens_output_margin − 50)` (see §3).
+- For accurate lensed BB you need `l_max_scalar > 2000` and
+  `k_eta_max_scalar > 10000`.
+- For an accurate lensing potential you also need
+  `k_eta_max_scalar > 10000`.
+- Otherwise the default `k_eta_max_scalar = 2.5 * l_max_scalar` is enough
+  (matching the Python `set_for_lmax` default `k_eta_fac = 2.5`).
+
+When `do_lensing = T`, the `.ini` user is expected to set
+`l_max_scalar` high enough that `l_max_scalar − lens_output_margin`
+covers the L range they actually care about.
+
+## 3. The lensed output range
+
+The lensed output upper limit lives in `CLout%lmax_lensed` and is
+computed identically by all lensing methods. From
+[`fortran/lensing.f90`](../fortran/lensing.f90) (lines ~251–256 for
+method 1, ~739–743 for method 4):
+
+```fortran
+max_lensed_ix = lSamp%nl-1
+do while(lSamp%l(max_lensed_ix) > CP%Max_l - (CP%lens_output_margin - lens_convolution_gap))
+    max_lensed_ix = max_lensed_ix - 1
+end do
+CLout%lmax_lensed = max(lSamp%l(max_lensed_ix), CP%Max_l - CP%lens_output_margin)
+```
+
+where `lens_convolution_gap = 50` is a parameter in
+[`fortran/config.f90`](../fortran/config.f90) — the fixed gap between
+the lensed output margin and the unlensed-spectrum convolution margin.
+So if we write `M = CP%lens_output_margin` and `G = 50`:
+
+- **Upper bound:** `lmax_lensed ≤ Max_l − (M − G)`, because the
+  while-loop walks the sparse L-sampling grid (§6) back to a point that
+  is at most `Max_l − (M − G)`.
+- **Lower bound (floor):** `lmax_lensed ≥ Max_l − M`.
+
+So `lmax_lensed ∈ [Max_l − M, Max_l − (M − G)]` — the exact value
+depends on where the sparse L-sampling falls in that band.
+
+With the default `M = 150` and `G = 50`, that becomes
+`[Max_l − 150, Max_l − 100]`, matching the historical behaviour.
+
+### Why `set_for_lmax` adds the full margin
+
+`set_for_lmax(..., lens_output_margin=M)` sets both `pars.max_l =
+user_lmax + M` and `pars.lens_output_margin = M`. With `Max_l − M =
+user_lmax`, the floor in `lmax_lensed` guarantees the lensed output
+reaches the user's target `L` even in the worst sparse-sampling case.
+If the floor were e.g. `Max_l − (M − G)`, the lensed output would
+fall up to `G = 50` L short of the request in the worst case.
+
+The Python and Fortran sides used to use hardcoded `150` and `100`
+constants here; they are now derived from `CP%lens_output_margin` so
+that changing the margin from one side (Python or `.ini`) propagates
+through to the lensing convolution and the L-sampling logic.
+
+### Extrapolation beyond `Max_l`
+
+When convolving the unlensed `C_L` with the lensing potential, the
+unlensed `C_L` is extrapolated past `Max_l` using a fiducial high-L
+template up to
+`min(lmax_extrap_highl, Max_l − (lens_output_margin − 50) + 750)`,
+where `lmax_extrap_highl = 8000` is a `parameter` constant in
+[`fortran/config.f90`](../fortran/config.f90).
+
+## 4. Setting `l_max` from Python
+
+There are two convenience setters; both end up writing to `max_l`,
+`lens_output_margin`, `max_l_tensor`, `max_eta_k`, `max_eta_k_tensor`.
+
+### 4a. `CAMBparams.set_for_lmax`
+
+[`camb/model.py`](../camb/model.py), method `set_for_lmax`. Signature:
+
+```python
+pars.set_for_lmax(
+    lmax,
+    max_eta_k=None,
+    lens_potential_accuracy=0,
+    lens_output_margin=150,
+    k_eta_fac=2.5,
+    lens_k_eta_reference=18000.0,
+    nonlinear=None,
+)
+```
+
+Behaviour:
+
+- `lmax` is the `l_max` the user wants the **output** lensed scalar
+  spectrum to be accurate to.
+- `pars.lens_output_margin` is set to `lens_output_margin` so the
+  Fortran lensing code uses the matching floor (§3).
+- If `DoLensing` is true, the internal `max_l` is set to `lmax +
+  lens_output_margin` (default `+150`). If `DoLensing` is false,
+  `max_l = lmax`.
+- `max_eta_k` is set to `max_eta_k` if given, otherwise
+  `k_eta_fac * max_l` (default `k_eta_fac = 2.5`).
+- If `lens_potential_accuracy > 0`, `max_eta_k` is further bumped up to at
+  least `lens_k_eta_reference * lens_potential_accuracy` (default
+  `18000 * lens_potential_accuracy`). This is what controls `k_max` for
+  accurate lensing.
+- `max_l_tensor` is **not** touched here — set it directly on `pars` if
+  you want tensors.
+
+The docstring warns that this does *not* fix the output `L` range; the
+spectra may be returned above `lmax`. Use the `lmax` argument on the
+`get_*_cls` accessors (§5) to truncate output.
+
+### 4b. `camb.set_params(..., lmax=..., lens_potential_accuracy=..., max_eta_k=...)`
+
+[`camb/camb.py`](../camb/camb.py), function `set_params`. This is a
+convenience wrapper that forwards relevant kwargs to the individual
+setters in a fixed order; `lmax`, `lens_potential_accuracy`,
+`lens_output_margin`, `k_eta_fac`, `lens_k_eta_reference`, `nonlinear`,
+and `max_eta_k` are routed to `CAMBparams.set_for_lmax`. Anything else
+with a matching attribute name (`max_l_tensor`, `max_eta_k_tensor`,
+`min_l`, `min_l_logl_sampling`, …) is set directly on the matching
+field at the end.
+
+### 4c. Direct assignment
+
+For full control, write the fields directly:
+
+```python
+pars.max_l = 2500
+pars.max_l_tensor = 1500
+pars.max_eta_k = 10000.0
+pars.max_eta_k_tensor = 3000.0
+pars.lens_output_margin = 150
+pars.min_l = 2
+```
+
+`set_for_lmax` exists to do the bookkeeping (margin, `max_eta_k`
+defaults, non-linear lensing flag) on your behalf; direct assignment
+skips it.
+
+## 5. `lmax` on the Python output accessors
+
+[`camb/results.py`](../camb/results.py) defines `_lmax_setting`
+(lines ~447–458) and the per-spectrum accessors. The rule is:
+
+- For lensed outputs (`get_total_cls`, `get_lensed_scalar_cls`,
+  `get_cmb_power_spectra` default, `get_cmb_correlation_functions`,
+  `save_cmb_power_spectra`), the "calculated" `l_max` is
+  `f_get_lmax_lensed()`, i.e. `CLout%lmax_lensed` from the Fortran
+  lensing module (§3).
+- For unlensed outputs (`get_unlensed_scalar_cls`, `get_unlensed_total_cls`,
+  `get_lens_potential_cls`, `get_tensor_cls`,
+  `get_unlensed_scalar_array_cls`), the "calculated" `l_max` is just
+  `Params.max_l` (or `max_l_tensor` for tensors — note `get_tensor_cls`
+  defaults its `lmax` to `Params.max_l_tensor`).
+- If the user passes `lmax` larger than the calculated value, a warning
+  is logged and the spectrum is returned padded/zeroed above the
+  calculated range.
+- `get_cmb_power_spectra` uses the *lensed* calculated `lmax` for **all**
+  spectra it returns; if you want the unlensed and lensing-potential
+  spectra to the higher unlensed `lmax`, call the individual accessors.
+
+## 6. Internal `l_max` used inside the solver (not user-tunable)
+
+These are not the same as `max_l`; they control the Boltzmann hierarchy
+length per `k`:
+
+- [`fortran/equations.f90`](../fortran/equations.f90):
+  `max_l_evolve = 256` is the compile-time cap for any one mode's
+  hierarchy length. Per-mode `EV%lmaxg`, `EV%lmaxgpol`, `EV%lmaxnr`,
+  `EV%lmaxnu` are chosen as functions of `q` and
+  `Accuracy%lAccuracyBoost` (lines ~870–940). The aggregate
+  `EV%MaxlNeeded` is required to stay below `max_l_evolve`.
+- The L-sampling on which `C_L` is interpolated is built in
+  `lSamples_init` in [`fortran/results.f90`](../fortran/results.f90)
+  (line ~1336). It runs from `CP%Min_l` up to `Max_l`, with denser
+  sampling controlled by `Accuracy%lSampleBoost` and the
+  `CP%min_l_logl_sampling` parameter (sparser log spacing above that L).
+  When `DoLensing` is on, the sampler also tries to insert a point at
+  `Max_l − (lens_output_margin − 50)` so that the lensed convolution
+  has a sample to interpolate from there.
+- Inside lensing (`lensing.f90`), the per-method internal extrapolation
+  upper bound is
+  `min(lmax_extrap_highl, Max_l − (lens_output_margin − 50) + 750)`
+  (see §3).
+
+## 7. Quick checklist
+
+If you want the **lensed** scalar `C_L` accurate to some target `L*`:
+
+1. Either call `pars.set_for_lmax(L*, lens_potential_accuracy=k)` —
+   which sets `pars.max_l = L* + 150`, `pars.lens_output_margin = 150`,
+   and bumps `max_eta_k` — or set `pars.max_l = L* + 150`,
+   `pars.lens_output_margin = 150`, and `pars.max_eta_k`
+   (≥ `2.5 * pars.max_l`, and ≥ `18000 * k` for accurate lensing) by
+   hand. The two margins must agree.
+2. Read the lensed spectrum back with
+   `results.get_lensed_scalar_cls(lmax=L*)`; values above `L*` are
+   zero-padded.
+
+If you want the **unlensed** scalar `C_L` accurate to `L*`:
+
+1. Set `pars.max_l = L* + 50` or so (the last ~50 are inaccurate; see
+   the `.ini` comment in §2). Default `max_eta_k = 2 * max_l` suffices.
+2. Read back with `results.get_unlensed_scalar_cls(lmax=L*)`.
+
+If you are reading parameters from a `.ini` file, `l_max_scalar` plays
+the role of `pars.max_l` directly. There is no automatic `+150` margin
+applied for you on the `.ini` path, so set `l_max_scalar` high enough
+that `l_max_scalar − lens_output_margin` covers the L range you
+actually care about for the lensed output. The default
+`lens_output_margin = 150` can be overridden with the same-named key.
diff --git a/docs/changelog/nonflat_integration.md b/docs/changelog/nonflat_integration.md
new file mode 100644
index 000000000..2df26c3b2
--- /dev/null
+++ b/docs/changelog/nonflat_integration.md
@@ -0,0 +1,52 @@
+# Non-flat integration updates
+
+This note summarizes the current non-flat integration changes in `fortran/cmbmain.f90`.
+
+## Main algorithm changes
+
+- Replaced the on-the-fly non-flat `u_{\nu l}` integration from RK4 stepping at every source sample with a Numerov-based update, using one RK4 bootstrap step followed by Numerov steps.
+- Refactored both scalar and tensor non-flat line-of-sight integration so they compute `u_{\nu l}` values on the source time grid and then integrate against `Source_q` there.
+- Removed interpolation of non-flat sources at internal Numerov substeps.
+- For the scalar oscillatory-region sweep, added a high-`x` cutoff so the earliest, very oscillatory tail is skipped unless `full_bessel_integration` or bispectrum calculations require it.
+- Kept the source-interval landing exact by splitting each source interval into an integer number of Numerov substeps, with
+  `nSubSteps = ceiling(dchisource / dchimax)` and `delchi = dchisource / nSubSteps`.
+
+## Near-flat shifted-q approximation
+
+- For scalar non-flat runs close to flat, `DoRangeInt` can now fill `u_{\nu l}` directly from
+  `u_{\nu l}(\chi) \approx (\chi / S_K(\chi)) j_l(q_{\mathrm{eff}} \chi)` with
+  `q_{\mathrm{eff}}^2 = \nu^2 - K_{\mathrm{sign}} l(l+1) / 3`.
+- When the full scalar source range is inside that near-flat regime, `IntegrateSourcesBessels` now bypasses the dissipative/oscillatory split entirely and uses a dedicated `DoNearFlatIntegration` path that follows the same source-grid traversal order and the same high-`x` and Limber cutoffs as `DoFlatIntegration`, after replacing `q` by the shifted `q_{\mathrm{eff}} / R_K` and including the extra `\chi / S_K(\chi)` factor.
+- This branch is only used when the shared near-flat runtime criteria are active in both `results.f90` and `cmbmain.f90`:
+  `|State%scale - 1| <= 0.03`, `chi_max < near_flat_approx_chi_limit / sqrt(AccuracyBoost * NonFlatIntAccuracyBoost)`,
+  `chiDisp < near_flat_approx_chidisp_limit`, and the shared local error estimate
+  `l(l+1) chi_max^2 / (15 q_{\mathrm{eff}}^2) < 1.0e-3 / (AccuracyBoost * NonFlatIntAccuracyBoost)`.
+- The near-flat scalar approximations are also guarded by the runtime switches `enable_do_near_flat_integration` and `enable_shifted_q_scalar_approx`, which let the full-path and local shifted-q branches be compared without rebuilding.
+- In the same regime, `lSamples_init` keeps `Ascale = 1 / lSampleBoost` so the `l` sampling stays on the flat template, and the flat Bessel spline tables are initialized for the non-flat scalar integration as well.
+- The flat Bessel precomputation now accepts the requested `k eta` range directly, and the near-flat non-flat caller uses a fixed analytic extra margin so all models that use the shifted-q approximation with the same `lmax` and `max_eta_k` request the same cached table extent. The bound uses only `near_flat_scale_tol`, `near_flat_approx_chi_limit`, `lmax`, and `max_eta_k`; for `lmax = 4000`, `lens_potential_accuracy = 4`, and `max_eta_k = 72000`, this requests `x_max = 74249` for every approximation-eligible model. In the tested cases, this safely covers the observed worst shifted-q range, which was about `72243.6` for `Omega_k = -0.002`.
+
+## Code simplifications
+
+- Removed the non-flat source second-derivative storage `ddSource_q`, which is no longer needed now that both scalar and tensor paths integrate on the source grid.
+- Simplified the scalar inner accumulation loop by applying trapezoidal endpoint weights to the stored `u_{\nu l}` array before accumulation, and unrolled the common 3-source case.
+
+## Accuracy and performance checks
+
+- Fresh-process timings include the near-flat Bessel precomputation. In a same-process repeat check, `Omega_k = -0.002` dropped from about `1.25 s` on the first call to `1.08 s` on the second, while `Omega_k = 0.02` changed only from about `2.16 s` to `2.08 s`, consistent with the first-call-only spline setup cost appearing only in the near-flat branch.
+- In the current fixed-`near_flat_approx_chidisp_limit = 0.1` mode-comparison scan over `|Omega_k| = 10^{-4} .. 10^{-2}`, the three runtime modes order cleanly in speed as `both_on < shifted_q_only < both_off`. On the earlier `10^{-6} .. 6 \times 10^{-3}` scan used to establish the workflow, mean runtimes were about `1.04 s`, `1.21 s`, and `2.03 s` respectively.
+- A direct A/B probe of the shared local near-flat gate with `err_tol = 2.0e-3 / boost_scale` in both `UseNearFlatScalarIntegration` and `UseShiftedQScalarApprox` produced spectra identical to the current `1.0e-3 / boost_scale` setting over that comparison grid, with no measurable runtime improvement beyond run-to-run noise, so the baseline threshold was kept.
+- Over the near-flat cases where the shifted-q branch is active (`Omega_k = -0.002, -10^{-5}, +10^{-5}, +0.002`), fresh-process runtime ratios versus the current baseline non-flat implementation were about `0.71`, `0.81`, `0.67`, and `0.64` respectively. Over the same cases, max differences versus that baseline for `2 <= l <= 4000` were `TT \lesssim 1.57e-3`, `EE \lesssim 2.04e-3`, `\Delta TE / \sqrt{TT \cdot EE} \lesssim 1.37e-3`, and `PP \lesssim 1.42e-3`.
+- For `|Omega_k| \ge 0.02`, the shifted-q branch is disabled by the shared `State%scale` gate, the spectra are identical to the baseline default code in the tested runs, and timing stays within about `10%` of the baseline over `Omega_k = \pm 0.02, \pm 0.1`.
+- Comparing the current non-flat results to the flat `Omega_k = 0` case gives a direct continuity check near the flat limit. For `|Omega_k| = 10^{-5}`, the max differences for `2 <= l <= 4000` remain about `TT = 2.0e-4 .. 3.3e-4`, `EE = 3.0e-4 .. 4.3e-4`, and `\Delta TE / \sqrt{TT \cdot EE} = 2.7e-4 .. 3.7e-4`; with the flat-ordered near-flat integration, the low-`l` `PP` continuity is much better, dropping to about `3.6e-5` over `2 <= l <= 80` for both signs of `Omega_k`.
+
+- Against the stored RK4 scalar baseline over `Omega_k = \pm 0.02, \pm 0.002`, the current scalar implementation gives lensed `TT` max fractional differences about `8.4e-4 .. 9.6e-4` and `EE` about `3.9e-4 .. 4.4e-4` for `\ell >= 2`. For `\ell >= 30`, `TT` stays about `8.4e-4 .. 9.6e-4` while `EE` drops to about `1.62e-4 .. 1.78e-4`.
+- In the same scalar runs, CPU time dropped from about `25 .. 32 s` for the stored RK4 baseline to about `11.5 .. 15.1 s` for the current code, roughly a `52% .. 55%` reduction.
+- The scalar high-`x` cutoff itself gives an additional CPU reduction of about `2.6% .. 12.5%` versus the uncapped source-grid Numerov code over the same `Omega_k` cases.
+- Relative to that uncapped source-grid Numerov code, the scalar cutoff changes spectra mainly at very low `\ell`: for `\ell >= 2`, max differences are about `TT = 4.4e-4 .. 6.5e-4`, `TE = 5.7e-7 .. 9.2e-7`, and `PP = 6.2e-6 .. 7.1e-6`. For `\ell >= 30`, the same comparison shrinks to about `TT = 1.8e-6 .. 3.7e-6`, `EE = 6.6e-11 .. 9.1e-11`, `TE = 5.9e-9 .. 1.3e-8`, and `PP = 1.0e-9 .. 3.0e-9`.
+- Direct comparison of lensing potential `PP` against the pre-change code shows the largest fractional differences at very low `\ell`, peaking around `\ell = 3` at about `3.1e-3 .. 3.5e-3` over tested `Omega_k = \pm 0.02, \pm 0.002`.
+- For `\ell >= 30`, the `PP` differences versus the pre-change code are smaller, with max fractional differences about `7.4e-4 .. 8.5e-4` and RMS about `2.4e-4 .. 2.7e-4`.
+- In the same `PP` runs, the current code was faster than the pre-change code by roughly `20% .. 33%` in CPU time over the tested curvature values.
+
+## Scope
+
+These changes are localized to the non-flat hyperspherical Bessel integration path in `cmbmain.f90` and its reuse of the flat Bessel spline tables in `bessels.f90` for near-flat scalar runs. Flat-space integration itself is unchanged.
diff --git a/docs/changelog/variable_redefinitions.md b/docs/changelog/variable_redefinitions.md
new file mode 100644
index 000000000..4c4710c67
--- /dev/null
+++ b/docs/changelog/variable_redefinitions.md
@@ -0,0 +1,20 @@
+# Variable Redefinitions
+
+## Summary
+
+Redefined the background-state variable `chi0` as `DMt0`, the comoving angular
+diameter distance to the big bang. The new definition is
+`curvature_radius * rofChi(tau0 / curvature_radius)`, so it is a physical
+distance and reduces to `tau0` in the flat limit.
+
+## Main Conclusions
+
+- The old `chi0` state variable mixed a dimensionless curved-space quantity with
+  call sites that expected a physical distance.
+- Renaming the state field to `DMt0` makes the intended geometry explicit and
+  removes repeated implicit factors of `curvature_radius`.
+- The CMB source-grid and integration formulas that depend on the distance to
+  last scattering must use `DMt0` directly to stay consistent with the flat
+  limit.
+- In particular, the `q_cmb` expression should scale as `l / DMt0`; using the
+  old dimensionless `chi0` was inconsistent away from flat space.
diff --git a/fortran/bessels.f90 b/fortran/bessels.f90
index 3c6818a28..13b8bb3b5 100644
--- a/fortran/bessels.f90
+++ b/fortran/bessels.f90
@@ -48,12 +48,12 @@ subroutine InitSpherBessels(lSamp, CP,max_bessels_l_index, max_bessels_etak)
     if (lSamp%nl <= file_l%nl) then
         if (allocated(bessel_horner) .and. all(file_l%l(1:lSamp%nl)==lSamp%l(1:lSamp%nl) &
             .and. (max_bessels_l_index <= max_ix)) &
-            .and. (int(min(max_bessels_etak,CP%Max_eta_k))+1 <= kmaxfile) &
+            .and. (int(max_bessels_etak)+1 <= kmaxfile) &
             .and. (abs(CP%Accuracy%BesselBoost*CP%Accuracy%AccuracyBoost - file_acc) < 1d-2)) return
     end if
 
     !Haven't made them before, so make them now
-    kmaxfile = int(min(CP%Max_eta_k,max_bessels_etak))+1
+    kmaxfile = int(max_bessels_etak)+1
     max_ix = min(max_bessels_l_index,lSamp%nl)
 
     call GenerateBessels(lSamp, CP)
diff --git a/fortran/camb.f90 b/fortran/camb.f90
index 67ebd1e2c..eba66044e 100644
--- a/fortran/camb.f90
+++ b/fortran/camb.f90
@@ -388,7 +388,8 @@ logical function CAMB_ReadParams(P, Ini, ErrMsg)
     if (P%WantCls) then
         if (P%WantScalars  .or. P%WantVectors) then
             P%Max_l = Ini%Read_Int('l_max_scalar')
-            P%Max_eta_k = Ini%Read_Double('k_eta_max_scalar', P%Max_l*2._dl)
+            P%Max_eta_k = Ini%Read_Double('k_eta_max_scalar', P%Max_l*2.5_dl)
+            P%lens_output_margin = Ini%Read_Int('lens_output_margin', P%lens_output_margin)
             if (P%WantScalars) then
                 P%DoLensing = Ini%Read_Logical('do_lensing', .false.)
                 if (P%DoLensing) lensing_method = Ini%Read_Int('lensing_method', 1)
@@ -887,6 +888,7 @@ subroutine CAMB_CommandLineRun(InputFile)
     highL_unlensed_cl_template = Ini%Read_String_Default( &
         'highL_unlensed_cl_template', highL_unlensed_cl_template)
     call Ini%Read('number_of_threads', ThreadNum)
+    call Ini%Read('AccuracyTarget', AccuracyTarget)
     call Ini%Read('DebugParam', DebugParam)
     call Ini%Read('feedback_level', FeedbackLevel)
     if (Ini%HasKey('DebugMsgs')) call Ini%Read('DebugMsgs', DebugMsgs)
diff --git a/fortran/cmbmain.f90 b/fortran/cmbmain.f90
old mode 100644
new mode 100755
index ba2122b7d..e8b61e1e3
--- a/fortran/cmbmain.f90
+++ b/fortran/cmbmain.f90
@@ -77,7 +77,7 @@ module CAMBmain
         integer q_ix
         real(dl) q, dq    !q value we are doing and delta q
         !Contribution to C_l integral from this k
-        real(dl), dimension(:,:), pointer :: Source_q, ddSource_q
+        real(dl), dimension(:,:), pointer :: Source_q
         !Interpolated sources for this k
         integer SourceSteps !number of steps up to where source is zero
     end type IntegrationVars
@@ -101,7 +101,7 @@ module CAMBmain
     integer maximum_l !Max value of l to compute
     real(dl) :: maximum_qeta = 3000._dl
 
-    integer :: l_smooth_sample = 3000 !assume transfer functions effectively small for k*chi0>2*l_smooth_sample
+    integer :: l_smooth_sample = 3000 !assume transfer functions effectively small for q*DMt0>2*l_smooth_sample
 
     integer :: max_bessels_l_index  = 1000000
     real(dl) :: max_bessels_etak = 1000000*2
@@ -235,6 +235,7 @@ end subroutine cmbmain
     subroutine TimeSourcesToCl(ThisCT)
     Type(ClTransferData) :: ThisCT
     integer q_ix
+    real(dl) :: spline_bessels_etak
     Type(TTimer) :: Timer
 
     if (CP%WantScalars) ThisSources => State%ScalarTimeSources
@@ -260,7 +261,13 @@ subroutine TimeSourcesToCl(ThisCT)
         if (CP%WantScalars) call GetLimberTransfers(ThisCT)
         ThisCT%max_index_nonlimber = max_bessels_l_index
 
-        if (State%flat) call InitSpherBessels(ThisCT%ls, CP, max_bessels_l_index,max_bessels_etak )
+        if (State%flat .or. (CP%WantScalars .and. abs(State%scale - 1._dl) <= near_flat_scale_tol)) then
+            spline_bessels_etak = max_bessels_etak
+            if (CP%WantScalars .and. .not. State%flat) then
+                spline_bessels_etak = ShiftedQBesselTableMaxEtak(ThisCT%ls%l(max_bessels_l_index), max_bessels_etak)
+            end if
+            call InitSpherBessels(ThisCT%ls, CP, max_bessels_l_index, spline_bessels_etak)
+        end if
         !This is only slow if not called before with same (or higher) Max_l, Max_eta_k
         !Preferably stick to Max_l being a multiple of 50
 
@@ -526,7 +533,6 @@ subroutine SourceToTransfers(ThisCT, q_ix)
     type(IntegrationVars) :: IV
 
     allocate(IV%Source_q(State%TimeSteps%npoints,ThisSources%SourceNum))
-    if (.not.State%flat) allocate(IV%ddSource_q(State%TimeSteps%npoints,ThisSources%SourceNum))
 
     call IntegrationVars_init(IV)
 
@@ -538,7 +544,6 @@ subroutine SourceToTransfers(ThisCT, q_ix)
 
     call DoSourceIntegration(IV, ThisCT)
 
-    if (.not.State%flat) deallocate(IV%ddSource_q)
     deallocate(IV%Source_q)
 
     end subroutine SourceToTransfers
@@ -780,15 +785,14 @@ subroutine InitVars(ActiveState)
         qmax=maximum_qeta/State%tau0
         qmin=qmin0/State%tau0/initAccuracyBoost
     else
-        qmax=maximum_qeta/State%curvature_radius/State%chi0
-        qmin=qmin0/State%curvature_radius/State%chi0/initAccuracyBoost
+        qmax=maximum_qeta/State%DMt0
+        qmin=qmin0/State%DMt0/initAccuracyBoost
     end if
     !     Timesteps during recombination (tentative, the actual
     !     timestep is the minimum between this value and taurst/40,
     !     where taurst is the time when recombination starts - see inithermo
 
     dtaurec_q=4/qmax/initAccuracyBoost
-    if (.not. State%flat) dtaurec_q=dtaurec_q/6
     !AL:Changed Dec 2003, dtaurec feeds back into the non-flat integration via the step size
     State%dtaurec = dtaurec_q
     !dtau rec may be changed by ThermoData_init
@@ -850,12 +854,15 @@ subroutine SetkValuesForSources
     implicit none
     real(dl) dlnk0, dkn1, dkn2, q_switch, q_cmb, dksmooth
     real(dl) qmax_log
-    real(dl) SourceAccuracyBoost
+    real(dl) SourceAccuracyBoost, transferLogDensity
+    logical highPrecisionTransferSources
     !     set k values for which the sources for the anisotropy and
     !     polarization will be calculated. For low values of k we
     !     use a logarithmic spacing. closed case dealt with by SetClosedkValues
 
     SourceAccuracyBoost = CP%Accuracy%AccuracyBoost * CP%Accuracy%SourcekAccuracyBoost
+    highPrecisionTransferSources = AccuracyTarget > 0 .and. CP%Want_CMB .and. CP%WantScalars .and. CP%WantTransfer &
+        .and. CP%Transfer%high_precision
     if (CP%WantScalars .and. CP%Reion%Reionization .and. CP%Accuracy%AccuratePolarization) then
         dlnk0=2._dl/10/SourceAccuracyBoost
         !Need this to get accurate low l polarization
@@ -865,21 +872,35 @@ subroutine SetkValuesForSources
     end if
 
     if (CP%Accuracy%AccurateReionization) dlnk0 = dlnk0/2
+    !Low-l reionization polarization benefits from a modest cap on the low-k logarithmic source grid.
+    if (AccuracyTarget > 0 .and. CP%Want_CMB .and. CP%WantScalars .and. CP%Reion%Reionization .and. &
+        CP%Accuracy%AccuratePolarization .and. CP%Accuracy%AccurateReionization) dlnk0 = min(dlnk0, 0.075_dl)
+    if (highPrecisionTransferSources) then
+        !High-precision transfer output reuses this CMB source q grid up to q_cmb.
+        !Use 20/log interval for automatic transfer sampling, or the requested denser transfer grid.
+        transferLogDensity = max(20._dl, real(CP%Transfer%k_per_logint, dl))
+        dlnk0 = min(dlnk0, 1._dl/transferLogDensity)
+    end if
 
     dkn1=0.6_dl/State%taurst/SourceAccuracyBoost
     dkn2=0.9_dl/State%taurst/SourceAccuracyBoost/1.2
+    if (highPrecisionTransferSources) then
+        dkn1 = dkn1/1.5_dl
+        dkn2 = dkn2/1.5_dl
+    end if
     if (CP%WantTensors .or. CP%WantVectors) then
         dkn1=dkn1  *0.8_dl
         dlnk0=dlnk0/2 !*0.3_dl
         dkn2=dkn2*0.85_dl
     end if
 
-    qmax_log = dkn1/dlnk0
     q_switch = 2*6.3/State%taurst
+    !If the low-k logarithmic grid reaches the horizon-entry switch, skip the intermediate linear interval.
+    qmax_log = min(dkn1/dlnk0, q_switch)
     !Want linear spacing for wavenumbers which come inside horizon
     !Could use sound horizon, but for tensors that is not relevant
 
-    q_cmb = 2*l_smooth_sample/State%chi0*SourceAccuracyBoost  !assume everything is smooth at l > l_smooth_sample
+    q_cmb = 2*l_smooth_sample/State%DMt0*SourceAccuracyBoost  !assume everything is smooth at l > l_smooth_sample
     if (CP%Want_CMB .and. maximum_l > 5000 .and. CP%Accuracy%AccuratePolarization) q_cmb = q_cmb*1.4
     q_cmb = max(q_switch*2, q_cmb)
     !prevent EE going wild in tail
@@ -889,7 +910,7 @@ subroutine SetkValuesForSources
     associate(Evolve_q => ThisSources%Evolve_q)
         call Evolve_q%Init()
         call Evolve_q%Add_delta(qmin, qmax_log, dlnk0, IsLog = .true.)
-        if (qmax > qmax_log) &
+        if (qmax > qmax_log .and. q_switch > qmax_log) &
             call Evolve_q%Add_delta(qmax_log, min(qmax,q_switch), dkn1)
         if (qmax > q_switch) then
             call Evolve_q%Add_delta(q_switch, min(q_cmb,qmax), dkn2)
@@ -1246,7 +1267,7 @@ subroutine SetkValuesForInt(ThisCT)
     integer no
     real(dl) dk,dk0,dlnk1, dk2, max_k_dk, k_max_log, k_max_0
     integer lognum
-    real(dl)  qmax_int,IntSampleBoost
+    real(dl)  qmax_int,IntSampleBoost,LowQIntBoost
 
 
     qmax_int = min(qmax,max_bessels_etak/State%tau0)
@@ -1267,14 +1288,19 @@ subroutine SetkValuesForInt(ThisCT)
         call Init_ClTransfer(ThisCT)
         call ThisCT%q%Getdpoints(half_ends = .false.) !Jun08
     else
+        !Accurate-reionization polarization needs denser CMB integration sampling at low q (reionization bump
+        !is at low l). Only the log block (lognum) and the first linear block step (dk0) are scaled here.
+        LowQIntBoost = IntSampleBoost
+        if (AccuracyTarget > 0 .and. CP%Want_CMB .and. CP%Accuracy%AccuratePolarization &
+            .and. CP%Accuracy%AccurateReionization) LowQIntBoost = max(LowQIntBoost, 1.8_dl)
         !Split up into logarithmically spaced intervals from qmin up to k=lognum*dk0
         !then no-lognum*dk0 linearly spaced at dk0 up to no*dk0
         !then at dk up to max_k_dk, then dk2 up to qmax_int
-        lognum=nint(10*IntSampleBoost)
+        lognum=nint(10*LowQIntBoost)
         dlnk1=1._dl/lognum
         no=nint(600*IntSampleBoost)
-        dk0=1.8_dl/State%curvature_radius/State%chi0/IntSampleBoost
-        dk=3._dl/State%curvature_radius/State%chi0/IntSampleBoost/1.6
+        dk0=1.8_dl/State%DMt0/LowQIntBoost
+        dk=3._dl/State%DMt0/IntSampleBoost/1.6
 
         k_max_log = lognum*dk0
         k_max_0  = no*dk0
@@ -1317,80 +1343,134 @@ end subroutine setkValuesForInt
 
     subroutine InterpolateSources(IV)
     implicit none
-    integer i,khi,klo, step
-    real(dl) xf,b0,ho,a0,ho2o6,a03,b03
-    type(IntegrationVars) IV
-    Type(TRanges), pointer :: Evolve_q
 
-    Evolve_q => ThisSources%Evolve_q
+    type(IntegrationVars), intent(inout) :: IV
 
-    !     finding position of k in table Evolve_q to do the interpolation.
+    integer :: i, j
+    integer :: klo, khi, lo, hi, mid
+    integer :: ntime, nsrc, npoints
+    integer :: step
 
-    !Can't use the following in closed case because regions are not set up (only points)
-    !           klo = min(Evolve_q%npoints-1,Evolve_q%IndexOf(IV%q))
-    !This is a bit inefficient, but thread safe
-    klo=1
-    do while ((IV%q > Evolve_q%points(klo+1)).and.(klo < (Evolve_q%npoints-1)))
-        klo=klo+1
-    end do
+    real(dl) :: q, tau, qtau, xf
+    real(dl) :: ho, ho2o6
+    real(dl) :: a0, b0, a03, b03
+    real(dl) :: max_etak
+    real(dl), parameter :: eps_xf = 1.e-8_dl
 
-    khi=klo+1
+    logical :: ignore_etak
 
+    type(TRanges), pointer :: Evolve_q
 
-    ho=Evolve_q%points(khi)-Evolve_q%points(klo)
-    a0=(Evolve_q%points(khi)-IV%q)/ho
-    b0=(IV%q-Evolve_q%points(klo))/ho
-    ho2o6 = ho**2/6
-    a03=(a0**3-a0)
-    b03=(b0**3-b0)
-    IV%SourceSteps = 0
+    Evolve_q => ThisSources%Evolve_q
 
-    !     Interpolating the source as a function of time for the present
-    !     wavelength.
-    step=2
-    do i=2, State%TimeSteps%npoints
-        xf=IV%q*(State%tau0-State%TimeSteps%points(i))
-        if (CP%WantTensors) then
-            if (IV%q*State%TimeSteps%points(i) < max_etak_tensor.and. xf > 1.e-8_dl) then
-                step=i
-                IV%Source_q(i,:) =a0*scaledSrc(klo,:,i)+&
-                    b0*scaledSrc(khi,:,i)+(a03 *ddScaledSrc(klo,:,i)+ &
-                    b03*ddScaledSrc(khi,:,i)) *ho2o6
-            else
-                IV%Source_q(i,:) = 0
-            end if
-        end if
-        if (CP%WantVectors) then
-            if (IV%q*State%TimeSteps%points(i) < max_etak_vector.and. xf > 1.e-8_dl) then
-                step=i
-                IV%Source_q(i,:) =a0*ScaledSrc(klo,:,i) + b0*ScaledSrc(khi,:,i)+(a03 *ddScaledSrc(klo,:,i)+ &
-                    b03*ddScaledSrc(khi,:,i)) *ho2o6
-            else
-                IV%Source_q(i,:) = 0
-            end if
+    q       = IV%q
+    ntime   = State%TimeSteps%npoints
+    npoints = Evolve_q%npoints
+    nsrc    = size(IV%Source_q, 2)
+
+    ! Preserve original default behaviour: step starts at 2.
+    step = 2
+    IV%SourceSteps = step
+
+    ! Zero all rows that this routine is responsible for.
+    if (ntime >= 2) then
+        IV%Source_q(2:ntime, :) = 0._dl
+    else
+        return
+    end if
+
+    ! Determine active source type.
+    ! This preserves the original overwrite order:
+    ! tensors first, then vectors, then scalars.
+    ignore_etak = .false.
+    max_etak = 0._dl
+
+    if (CP%WantScalars) then
+        if (DebugEvolution .or. WantLateTime) then
+            ignore_etak = .true.
+        else
+            max_etak = max_etak_scalar
         end if
+    else if (CP%WantTensors) then
+        max_etak = max_etak_tensor
+    else if (CP%WantVectors) then
+        max_etak = max_etak_vector
+    end if
 
-        if (CP%WantScalars) then
-            if ((DebugEvolution .or. WantLateTime .or. IV%q*State%TimeSteps%points(i) < max_etak_scalar) &
-                .and. xf > 1.e-8_dl) then
-                step=i
-                IV%Source_q(i,:) = a0 * ScaledSrc(klo,:,i) +  b0 * ScaledSrc(khi,:,i) + (a03*ddScaledSrc(klo,:,i) + &
-                    b03 * ddScaledSrc(khi,:,i)) * ho2o6
+    ! ------------------------------------------------------------------
+    ! Thread-safe binary search for interpolation interval.
+    ! Assumes Evolve_q%points is monotonically increasing.
+    ! Clamp outside range to the nearest interval, matching typical
+    ! interpolation-table behaviour.
+    ! ------------------------------------------------------------------
+    if (q <= Evolve_q%points(1)) then
+        klo = 1
+    else if (q >= Evolve_q%points(npoints)) then
+        klo = npoints - 1
+    else
+        lo = 1
+        hi = npoints
+
+        do while (hi - lo > 1)
+            mid = (lo + hi) / 2
+
+            if (q >= Evolve_q%points(mid)) then
+                lo = mid
             else
-                IV%Source_q(i,:) = 0
+                hi = mid
             end if
+        end do
+
+        klo = lo
+    end if
+
+    khi = klo + 1
+
+    ho = Evolve_q%points(khi) - Evolve_q%points(klo)
+
+    if (ho == 0._dl) then
+        error stop 'InterpolateSources: duplicate Evolve_q points give zero interval'
+    end if
+
+    a0 = (Evolve_q%points(khi) - q) / ho
+    b0 = (q - Evolve_q%points(klo)) / ho
+
+    ho2o6 = ho * ho / 6._dl
+
+    a03 = a0 * a0 * a0 - a0
+    b03 = b0 * b0 * b0 - b0
+
+    ! ------------------------------------------------------------------
+    ! Interpolate source as a function of time for current q.
+    !
+    ! Because IV%Source_q(2:ntime,:) was already zeroed, only valid
+    ! source rows need to be filled.
+    !
+    ! If State%TimeSteps%points is monotonically increasing, once the
+    ! condition fails it will keep failing, so we can exit early.
+    ! ------------------------------------------------------------------
+    do i = 2, ntime
+        tau  = State%TimeSteps%points(i)
+        qtau = q * tau
+        xf   = q * (State%tau0 - tau)
+
+        if ((ignore_etak .or. qtau < max_etak) .and. xf > eps_xf) then
+            step = i
+
+            do j = 1, nsrc
+                IV%Source_q(i, j) = a0 * ScaledSrc(klo, j, i) + &
+                    b0 * ScaledSrc(khi, j, i) + &
+                    (a03 * ddScaledSrc(klo, j, i) + &
+                    b03 * ddScaledSrc(khi, j, i)) * ho2o6
+            end do
+        else
+            exit
         end if
     end do
-    IV%SourceSteps = step
 
-    if (.not.State%flat) then
-        do i=1, ThisSources%SourceNum
-            call spline_def(State%TimeSteps%points,IV%Source_q(:,i),State%TimeSteps%npoints,&
-                IV%ddSource_q(:,i))
-        end do
-    end if
+    IV%SourceSteps = step
 
-    end subroutine
+    end subroutine InterpolateSources
 
 
     subroutine IntegrationVars_Init(IV)
@@ -1400,18 +1480,22 @@ subroutine IntegrationVars_Init(IV)
     IV%Source_q(State%TimeSteps%npoints,:) = 0
     IV%Source_q(State%TimeSteps%npoints-1,:) = 0
 
-    end  subroutine IntegrationVars_Init
+    end subroutine IntegrationVars_Init
 
 
     subroutine DoSourceIntegration(IV, ThisCT) !for particular wave number q
     type(IntegrationVars) IV
     Type(ClTransferData) :: ThisCT
-    integer j,ll,llmax
-    real(dl) nu
+    integer j,ll,llmax,n
+    real(dl) nu, chi_full_max, chi_here, sh
     real(dl) :: sixpibynu
+    real(dl) :: near_flat_time_weight(IV%SourceSteps)
+    logical :: use_near_flat_scalar, have_near_flat_time_weight
 
     nu=IV%q*State%curvature_radius
     sixpibynu  = 6._dl*const_pi/nu
+    chi_full_max = (State%tau0 - State%TimeSteps%points(2)) / State%curvature_radius
+    have_near_flat_time_weight = .false.
 
     if (State%closed) then
         if (nu<20 .or. State%tau0/State%curvature_radius+sixpibynu > const_pi/2) then
@@ -1421,7 +1505,7 @@ subroutine DoSourceIntegration(IV, ThisCT) !for particular wave number q
             llmax=min(llmax,nint(nu)-1)  !nu >= l+1
         end if
     else
-        llmax=nint(nu*State%chi0)
+        llmax=nint(IV%q*State%DMt0)
         if (llmax<15) then
             llmax=17 !AL Sept2010 changed from 15 to get l=16 smooth
         else
@@ -1435,12 +1519,55 @@ subroutine DoSourceIntegration(IV, ThisCT) !for particular wave number q
         do j=1,ThisCT%ls%nl
             ll=ThisCT%ls%l(j)
             if (ll>llmax) exit
-            call IntegrateSourcesBessels(IV,ThisCT,j,ll,nu)
+            use_near_flat_scalar = CP%WantScalars .and. ThisSources%SourceNum <= 3 .and. &
+                UseNearFlatScalarIntegration(ll, nu, chi_full_max)
+
+            if (use_near_flat_scalar) then
+                if (.not. have_near_flat_time_weight) then
+                    do n = 1, IV%SourceSteps
+                        chi_here = (State%tau0 - State%TimeSteps%points(n)) / State%curvature_radius
+                        if (abs(chi_here) < 1.e-8_dl) then
+                            near_flat_time_weight(n) = State%TimeSteps%dpoints(n)
+                        else
+                            sh = State%rofChi(chi_here)
+                            near_flat_time_weight(n) = State%TimeSteps%dpoints(n) * chi_here / sh
+                        end if
+                    end do
+                    have_near_flat_time_weight = .true.
+                end if
+
+                call DoNearFlatIntegration(IV, ThisCT, j, ll, nu, near_flat_time_weight)
+            else
+                call IntegrateSourcesBessels(IV,ThisCT,j,ll,nu)
+            end if
         end do !j loop
     end if
 
     end subroutine DoSourceIntegration
 
+    logical function UseNearFlatScalarIntegration(l, nu, chi_max) result(use_near_flat)
+    integer, intent(in) :: l
+    real(dl), intent(in) :: nu, chi_max
+
+    real(dl) :: chi_disp, qeff2, err_tol, err_est, boost_scale, chi_limit_eff
+
+    qeff2 = nu**2 - State%Ksign * real(l * (l + 1), dl) / 3._dl
+    chi_disp = State%invsinfunc(sqrt(real(l * (l + 1), dl)) / nu)
+
+    boost_scale = max(CP%Accuracy%NonFlatIntAccuracyBoost * CP%Accuracy%AccuracyBoost, 1._dl)
+    chi_limit_eff = near_flat_approx_chi_limit / sqrt(boost_scale)
+
+    err_tol = 1.0e-3_dl / boost_scale
+    err_est = real(l * (l + 1), dl) * chi_max**2 / max(15._dl * qeff2, 1._dl)
+
+    use_near_flat = enable_do_near_flat_integration .and. (abs(State%scale - 1._dl) <= near_flat_scale_tol) .and. &
+        (chi_max < chi_limit_eff) .and. &
+        (chi_disp < near_flat_approx_chidisp_limit) .and. &
+        (err_est < err_tol) .and. (qeff2 > 0._dl)
+
+
+    end function UseNearFlatScalarIntegration
+
     function UseLimber(l)
     !Calculate lensing potential power using Limber rather than j_l integration
     !even when sources calculated as part of temperature calculation
@@ -1518,11 +1645,9 @@ subroutine DoFlatIntegration(IV, ThisCT, llmax)
         tmin=max(State%TimeSteps%points(2),tmin)
         if (.not. CP%Want_CMB .and. .not. CP%Want_CMB_lensing) &
             tmin = max(tmin, State%ThermoData%tau_start_redshiftwindows)
-
-
         if (tmax < State%TimeSteps%points(2)) exit
-        sums = 0
 
+        sums = 0
         !As long as we sample the source well enough, it is sufficient to
         !interpolate the Bessel functions only
 
@@ -1669,6 +1794,117 @@ subroutine DoFlatIntegration(IV, ThisCT, llmax)
     end subroutine DoFlatIntegration
 
 
+    subroutine DoNearFlatIntegration(IV, ThisCT, j, l, nu, near_flat_time_weight)
+    use SpherBessels, only: BessRanges, bessel_horner
+    implicit none
+    type(IntegrationVars) IV
+    Type(ClTransferData) :: ThisCT
+    integer, intent(in) :: j, l
+    real(dl), intent(in) :: nu
+    real(dl), intent(in) :: near_flat_time_weight(IV%SourceSteps)
+
+    integer :: n, bes_ix, bes_index(IV%SourceSteps)
+    logical :: DoInt
+    real(dl) :: xlim, xlmax1, tmin, tmax
+    real(dl) :: a2, base, coeff1, coeff2, coeff3, dx, J_l, x_hi, xf
+    real(dl) :: qeff2, qeff, qeff_tau, chi_here, sh, chi_over_sh, weight
+    real(dl) :: qmax_int, sums(ThisSources%SourceNum), left_weight(IV%SourceSteps)
+    real(dl) :: BessIntBoost
+
+    BessIntBoost = CP%Accuracy%AccuracyBoost*CP%Accuracy%BessIntBoost
+
+    qeff2 = nu**2 - State%Ksign * real(l * (l + 1), dl) / 3._dl
+    if (qeff2 <= 0._dl) return
+    qeff = sqrt(qeff2)
+    qeff_tau = qeff / State%curvature_radius
+
+    xlim = xlimfrac * l
+    xlim = max(xlim, xlimmin)
+    xlim = l - xlim
+    if (full_bessel_integration .or. do_bispectrum) then
+        tmin = State%TimeSteps%points(2)
+    else
+        xlmax1 = 80._dl * l * BessIntBoost
+        tmin = State%tau0 - xlmax1 / qeff_tau
+        tmin = max(State%TimeSteps%points(2), tmin)
+    end if
+    tmax = State%tau0 - xlim / qeff_tau
+    tmax = min(State%tau0, tmax)
+    tmin = max(State%TimeSteps%points(2), tmin)
+    if (tmax < State%TimeSteps%points(2)) return
+
+    sums = 0._dl
+    DoInt = .true.
+    if (ThisSources%SourceNum /= 2) then
+        qmax_int = max(850, l) * 3 * BessIntBoost / State%tau0 * 1.2_dl
+        DoInt = qeff_tau < qmax_int
+    end if
+
+    if (DoInt) then
+        ! Avoid the per-l spline lookup setup entirely when this mode only contributes via Limber.
+        associate(BesselPoints => BessRanges%points, TimePoints => State%TimeSteps%points)
+            do n=1,IV%SourceSteps
+                left_weight(n) = abs(qeff_tau * (State%tau0 - TimePoints(n)))
+            end do
+            call BessRanges%IndexOfOrdered(left_weight, IV%SourceSteps, bes_index)
+            do n=1,IV%SourceSteps
+                bes_ix = bes_index(n)
+                x_hi = BesselPoints(bes_ix + 1)
+                dx = x_hi - BesselPoints(bes_ix)
+                left_weight(n) = (x_hi - left_weight(n)) / dx
+            end do
+        end associate
+    end if
+
+    if (ThisSources%SourceNum == 2) then
+        do n = State%TimeSteps%IndexOf(tmin), min(IV%SourceSteps, State%TimeSteps%IndexOf(tmax))
+            a2 = left_weight(n)
+            bes_ix = bes_index(n)
+            base = bessel_horner(1, bes_ix, j)
+            coeff1 = bessel_horner(2, bes_ix, j)
+            coeff2 = bessel_horner(3, bes_ix, j)
+            coeff3 = bessel_horner(4, bes_ix, j)
+            J_l = base + a2 * (coeff1 + a2 * (coeff2 + a2 * coeff3))
+
+            weight = near_flat_time_weight(n) * J_l
+            sums(1) = sums(1) + IV%Source_q(n,1) * weight
+            sums(2) = sums(2) + IV%Source_q(n,2) * weight
+        end do
+    else
+        if (DoInt) then
+            do n = State%TimeSteps%IndexOf(tmin), min(IV%SourceSteps, State%TimeSteps%IndexOf(tmax))
+                a2 = left_weight(n)
+                bes_ix = bes_index(n)
+                base = bessel_horner(1, bes_ix, j)
+                coeff1 = bessel_horner(2, bes_ix, j)
+                coeff2 = bessel_horner(3, bes_ix, j)
+                coeff3 = bessel_horner(4, bes_ix, j)
+                J_l = base + a2 * (coeff1 + a2 * (coeff2 + a2 * coeff3))
+
+                weight = near_flat_time_weight(n) * J_l
+                sums(1) = sums(1) + IV%Source_q(n,1) * weight
+                sums(2) = sums(2) + IV%Source_q(n,2) * weight
+                sums(3) = sums(3) + IV%Source_q(n,3) * weight
+            end do
+        end if
+        if (.not. DoInt .or. UseLimber(l)) then
+            xf = State%tau0 - State%invsinfunc((l + 0.5_dl) / nu) * State%curvature_radius
+            if (xf < State%TimeSteps%Highest .and. xf > State%TimeSteps%Lowest) then
+                n = State%TimeSteps%IndexOf(xf)
+                xf = (xf - State%TimeSteps%points(n)) / (State%TimeSteps%points(n + 1) - State%TimeSteps%points(n))
+                sums(3) = (IV%Source_q(n,3) * (1 - xf) + xf * IV%Source_q(n + 1,3)) * &
+                    sqrt(const_pi / 2 / (l + 0.5_dl) / sqrt(1._dl - State%Ksign * real(l**2, dl) / nu**2)) / IV%q
+            else
+                sums(3) = 0._dl
+            end if
+        end if
+    end if
+
+    ThisCT%Delta_p_l_k(:,j,IV%q_ix) = ThisCT%Delta_p_l_k(:,j,IV%q_ix) + sums
+
+    end subroutine DoNearFlatIntegration
+
+
 
     !non-flat source integration
 
@@ -1677,9 +1913,9 @@ subroutine IntegrateSourcesBessels(IV,ThisCT,j,l,nu)
     type(IntegrationVars) IV
     Type(ClTransferData) :: ThisCT
     logical DoInt
-    integer l,j, nstart,nDissipative,ntop,nbot,nrange,nnow
+    integer l,j, nstart,nDissipative,ntop,nbot,nrange,nnow,nmin_osc,nbot_eff
     real(dl) nu,ChiDissipative,ChiStart,tDissipative,y1,y2,y1dis,y2dis
-    real(dl) xf,x,chi, miny1
+    real(dl) xf,x,chi, miny1,xlmax1,tmin
     real(dl) sums(ThisSources%SourceNum),out_arr(ThisSources%SourceNum), qmax_int
     real(dl) BessIntBoost
 
@@ -1721,7 +1957,7 @@ subroutine IntegrateSourcesBessels(IV,ThisCT,j,l,nu)
         ! Fix for ujl discontinuity in near-flat models - adaptive formula provides superior performance
         miny1= 0.5d-4/(l+max(0,30-l))/BessIntBoost  ! Adaptive formula for Omega_k discontinuity fix, see https://github.com/cmbant/CAMB/pull/185
         sums=0
-        qmax_int= max(850,ThisCT%ls%l(j))*3*BessIntBoost/(State%chi0*State%curvature_radius)*1.2
+        qmax_int= max(850,ThisCT%ls%l(j))*3*BessIntBoost/State%DMt0*1.2
         DoInt =  ThisSources%SourceNum/=3 .or. IV%q < qmax_int
         if (DoInt) then
             if ((nstart < min(State%TimeSteps%npoints-1,IV%SourceSteps)).and.(y1dis > miny1)) then
@@ -1736,7 +1972,7 @@ subroutine IntegrateSourcesBessels(IV,ThisCT,j,l,nu)
                     end if
                     if (nnow < ntop) then
                         call DoRangeInt(IV,chi,ChiDissipative,nnow,ntop,State%TimeSteps%R(nrange)%delta, &
-                            nu,l,y1,y2,out_arr)
+                            nu,j,l,y1,y2,out_arr)
                         sums  = sums + out_arr
                         nnow = ntop
                         if (chi==0) exit !small enough to cut off
@@ -1744,22 +1980,48 @@ subroutine IntegrateSourcesBessels(IV,ThisCT,j,l,nu)
                 end do
             end if !integrate down chi
 
-            !Integrate chi up in oscillatory region
+            !Integrate chi up in oscillatory region, but optionally cut off very high-x tail
             if (nstart > 2) then
                 y1=y1dis
                 y2=y2dis
                 chi=ChiStart
                 nnow=nstart
-                do nrange = State%TimeSteps%Count,1,-1
-                    nbot = State%TimeSteps%R(nrange)%start_index
-                    if (nnow >  nbot) then
-                        call DoRangeInt(IV,chi,ChiDissipative,nnow,nbot,State%TimeSteps%R(nrange)%delta, &
-                            nu,l,y1,y2,out_arr)
-                        sums=sums+out_arr
-                        if (chi==0) exit !small for remaining region
-                        nnow = nbot
+
+                if (full_bessel_integration .or. do_bispectrum) then
+                    nmin_osc = 2
+                else
+                    xlmax1 = 80._dl * l * BessIntBoost
+                    if (State%num_redshiftwindows > 0 .and. CP%WantScalars) then
+                        xlmax1 = xlmax1 * 8._dl
                     end if
-                end do
+
+                    tmin = State%tau0 - xlmax1 / IV%q
+                    tmin = max(State%TimeSteps%points(2), tmin)
+
+                    if (tmin >= State%TimeSteps%points(nstart)) then
+                        nmin_osc = nstart
+                    else
+                        nmin_osc = max(2, State%TimeSteps%IndexOf(tmin))
+                    end if
+                end if
+
+                if (nstart > nmin_osc) then
+                    do nrange = State%TimeSteps%Count,1,-1
+                        if (nnow <= nmin_osc) exit
+
+                        nbot = State%TimeSteps%R(nrange)%start_index
+                        nbot_eff = max(nbot, nmin_osc)
+                        if (nnow > nbot_eff) then
+                            call DoRangeInt(IV,chi,ChiDissipative,nnow,nbot_eff,State%TimeSteps%R(nrange)%delta, &
+                                nu,j,l,y1,y2,out_arr)
+                            sums=sums+out_arr
+                            if (chi==0) exit !small for remaining region
+                            nnow = nbot_eff
+                        end if
+
+                        if (nnow == nmin_osc) exit
+                    end do
+                end if
             end if
         end if !DoInt
         if (ThisSources%SourceNum==3 .and. (.not. DoInt .or. UseLimber(l))) then
@@ -1769,7 +2031,7 @@ subroutine IntegrateSourcesBessels(IV,ThisCT,j,l,nu)
                 nbot=State%TimeSteps%IndexOf(xf)
                 xf= (xf-State%TimeSteps%points(nbot))/(State%TimeSteps%points(nbot+1)-State%TimeSteps%points(nbot))
                 sums(3) = (IV%Source_q(nbot,3)*(1-xf) + xf*IV%Source_q(nbot+1,3))*&
-                    sqrt(const_pi/2/(l+0.5_dl)/sqrt(1-State%Ksign*real(l**2)/nu**2))/IV%q
+                    sqrt(const_pi/2/(l+0.5_dl)/sqrt(1-State%Ksign*real(l**2, dl)/nu**2))/IV%q
             else
                 sums(3) = 0
             end if
@@ -1833,8 +2095,165 @@ subroutine IntegrateSourcesBessels(IV,ThisCT,j,l,nu)
     end subroutine IntegrateSourcesBessels
 
 
+    subroutine StepUSpherBesselRK4(ap1, nu2, delchi, chi, y1, y2, sh, potential)
+    real(dl), intent(in) :: ap1, nu2, delchi
+    real(dl), intent(inout) :: chi, y1, y2, sh, potential
+    real(dl) :: dydchi1, dydchi2, yt1, yt2, dyt1, dyt2, dym1, dym2
+    real(dl) :: hh, h6, xh
 
-    subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
+    hh = 0.5_dl * delchi
+    h6 = delchi / 6._dl
+
+    dydchi1 = y2
+    dydchi2 = potential * y1
+    xh = chi + hh
+    yt1 = y1 + hh * dydchi1
+    yt2 = y2 + hh * dydchi2
+    dyt1 = yt2
+    potential = ap1 / State%rofChi(xh)**2 - nu2
+
+    dyt2 = potential * yt1
+    yt1 = y1 + hh * dyt1
+    yt2 = y2 + hh * dyt2
+    dym1 = yt2
+    dym2 = potential * yt1
+    yt1 = y1 + delchi * dym1
+    dym1 = dyt1 + dym1
+    yt2 = y2 + delchi * dym2
+    dym2 = dyt2 + dym2
+
+    chi = chi + delchi
+    sh = State%rofChi(chi)
+    dyt1 = yt2
+    potential = ap1 / sh**2 - nu2
+    dyt2 = potential * yt1
+    y1 = y1 + h6 * (dydchi1 + dyt1 + 2._dl * dym1)
+    y2 = y2 + h6 * (dydchi2 + dyt2 + 2._dl * dym2)
+
+    end subroutine StepUSpherBesselRK4
+
+
+    subroutine StepUSpherBesselNumerov(ap1, nu2, delchi, chi, y_prev, y_now, y2_prev, q_prev, q_now, sh, y_next, y2_next, q_next)
+    real(dl), intent(in) :: ap1, nu2, delchi, y_prev, y_now, y2_prev, q_prev, q_now
+    real(dl), intent(inout) :: chi, sh
+    real(dl), intent(out) :: y_next, y2_next, q_next
+    real(dl) :: h2
+
+    chi = chi + delchi
+    sh = State%rofChi(chi)
+    q_next = ap1 / sh**2 - nu2
+    h2 = delchi**2
+
+    y_next = (2._dl * (1._dl + 5._dl * h2 * q_now / 12._dl) * y_now - (1._dl - h2 * q_prev / 12._dl) * y_prev) / &
+        (1._dl - h2 * q_next / 12._dl)
+    y2_next = y2_prev + delchi * (q_prev * y_prev + 4._dl * q_now * y_now + q_next * y_next) / 3._dl
+
+    end subroutine StepUSpherBesselNumerov
+
+
+    real(dl) function ShiftedQBesselTableMaxEtak(lmax, base_etak) result(bessel_etak)
+    integer, intent(in) :: lmax
+    real(dl), intent(in) :: base_etak
+    real(dl), parameter :: safety_margin = 10._dl
+    real(dl) :: max_scale_x, max_curvature_x
+
+    bessel_etak = base_etak
+
+    if (State%flat .or. abs(State%scale - 1._dl) > near_flat_scale_tol) return
+    if (.not. enable_do_near_flat_integration .and. .not. enable_shifted_q_scalar_approx) return
+
+    ! For any model using either near-flat scalar approximation path we have chi <= near_flat_approx_chi_limit
+    ! and scale >= 1 - near_flat_scale_tol, so this cache-stable bound safely covers q_eff * chi.
+    max_scale_x = base_etak / (1._dl - near_flat_scale_tol)
+    max_curvature_x = near_flat_approx_chi_limit * sqrt(real(lmax * (lmax + 1), dl) / 3._dl)
+    bessel_etak = sqrt(max_scale_x**2 + max_curvature_x**2) + safety_margin
+
+    end function ShiftedQBesselTableMaxEtak
+
+
+    logical function UseShiftedQScalarApprox(l, nu, chi, chiDisp, dchisource, sgn, nIntSteps) result(use_shifted_q)
+    integer, intent(in) :: l, nIntSteps
+    real(dl), intent(in) :: nu, chi, chiDisp, dchisource, sgn
+
+    real(dl) :: chi_end, chi_max, qeff2, err_tol, err_est, boost_scale, chi_limit_eff
+
+    chi_end = chi + real(nIntSteps, dl) * dchisource * sgn
+    chi_max = max(abs(chi), abs(chi_end))
+    qeff2 = nu**2 - State%Ksign * real(l * (l + 1), dl) / 3._dl
+
+    boost_scale = max(CP%Accuracy%NonFlatIntAccuracyBoost * CP%Accuracy%AccuracyBoost, 1._dl)
+    chi_limit_eff = near_flat_approx_chi_limit / sqrt(boost_scale)
+
+    err_tol = 1.0e-3_dl / boost_scale
+    err_est = real(l * (l + 1), dl) * chi_max**2 / max(15._dl * qeff2, 1._dl)
+
+    use_shifted_q = enable_shifted_q_scalar_approx .and. (abs(State%scale - 1._dl) <= near_flat_scale_tol) .and. &
+        (chi_max < chi_limit_eff) .and. &
+        (chiDisp < near_flat_approx_chidisp_limit) .and. (err_est < err_tol) .and. (qeff2 > 0._dl)
+
+    end function UseShiftedQScalarApprox
+
+
+    subroutine FillShiftedQUjlVals(j, l, nu, chi_start, dchisource, sgn, nIntSteps, ujl_vals, y1, y2, chi)
+    use SpherBessels, only: BessRanges, bessel_horner, bjl
+
+    integer, intent(in) :: j, l, nIntSteps
+    real(dl), intent(in) :: nu, chi_start, dchisource, sgn
+    real(dl), intent(out) :: ujl_vals(nIntSteps + 1)
+    real(dl), intent(inout) :: y1, y2, chi
+
+    integer :: step_ix, bes_ix(nIntSteps + 1)
+    real(dl) :: qeff, qeff2, chi_here, sh, chi_over_sh
+    real(dl) :: x_vals(nIntSteps + 1), left_weight(nIntSteps + 1)
+    real(dl) :: x_hi, dx, base, coeff1, coeff2, coeff3, jl
+    logical :: use_splines
+
+    qeff2 = nu**2 - State%Ksign * real(l * (l + 1), dl) / 3._dl
+    qeff = sqrt(max(qeff2, 0._dl))
+    use_splines = allocated(bessel_horner) .and. j <= ubound(bessel_horner, 3) .and. BessRanges%npoints > 1
+
+    if (use_splines) then
+        do step_ix = 0, nIntSteps
+            x_vals(step_ix + 1) = qeff * (chi_start + step_ix * dchisource * sgn)
+        end do
+        call BessRanges%IndexOfOrdered(x_vals, nIntSteps + 1, bes_ix)
+    end if
+
+    do step_ix = 0, nIntSteps
+        chi_here = chi_start + step_ix * dchisource * sgn
+        if (use_splines) then
+            bes_ix(step_ix + 1) = min(bes_ix(step_ix + 1), BessRanges%npoints - 1)
+            x_hi = BessRanges%points(bes_ix(step_ix + 1) + 1)
+            dx = x_hi - BessRanges%points(bes_ix(step_ix + 1))
+            left_weight(step_ix + 1) = (x_hi - x_vals(step_ix + 1)) / dx
+            base = bessel_horner(1, bes_ix(step_ix + 1), j)
+            coeff1 = bessel_horner(2, bes_ix(step_ix + 1), j)
+            coeff2 = bessel_horner(3, bes_ix(step_ix + 1), j)
+            coeff3 = bessel_horner(4, bes_ix(step_ix + 1), j)
+            jl = base + left_weight(step_ix + 1) * (coeff1 + left_weight(step_ix + 1) * (coeff2 + left_weight(step_ix + 1) * coeff3))
+        else
+            call bjl(l, qeff * chi_here, jl)
+        end if
+
+        if (abs(chi_here) < 1.e-8_dl) then
+            chi_over_sh = 1._dl
+        else
+            sh = State%rofChi(chi_here)
+            chi_over_sh = chi_here / sh
+        end if
+
+        ujl_vals(step_ix + 1) = chi_over_sh * jl
+    end do
+
+    chi = chi_start + real(nIntSteps, dl) * dchisource * sgn
+    y1 = 0._dl
+    y2 = 0._dl
+
+    end subroutine FillShiftedQUjlVals
+
+
+
+    subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,j,l,y1,y2,out)
     !Non-flat version
 
     !returns chi at end of integral (where integral stops, not neccessarily end)
@@ -1845,18 +2264,17 @@ subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
 
     use precision
     type(IntegrationVars) IV
-    integer l,nIntSteps,nstart,nend,nlowest,isgn,i,is,Startn
-    real(dl) nu,dtau,num1,num2,Deltachi,aux1,aux2
-    real(dl) a,b,tmpa,tmpb,hh,h6,xh,delchi,taui
-    real(dl) nu2,chi,chiDisp,dydchi1,dydchi2,yt1,yt2,dyt1,dyt2,dym1,dym2
+    integer j,l,nIntSteps,nstart,nend,isgn,i,Startn,step_ix,nSubSteps,src_ix,last_ix
+    real(dl) nu,dtau,num1,num2,Deltachi,delchi
+    real(dl) nu2,chi,chiDisp
 
-    real(dl) tmp,dtau2o6,y1,y2,ap1,sh,ujl,chiDispTop
+    real(dl) tmp,y1,y2,ap1,sh,ujl,chiDispTop
     real(dl) dchimax,dchisource,sgn,sgndelchi,minujl
+    real(dl) y_prev, y2_prev, y_next, y2_next, q_prev, q_next
     real(dl), parameter:: MINUJl1 = 0.5d-4  !cut-off point for small ujl l=1
-    logical Interpolate
     real(dl) scalel
     real(dl) IntAccuracyBoost
-    real(dl) sources(ThisSources%SourceNum), out(ThisSources%SourceNum)
+    real(dl) out(ThisSources%SourceNum), ujl_vals(abs(nstart - nend) + 1)
 
     IntAccuracyBoost=CP%Accuracy%AccuracyBoost*CP%Accuracy%NonFlatIntAccuracyBoost
 
@@ -1866,28 +2284,31 @@ subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
         return
     end if
 
+    if (nstart>IV%SourceSteps.and.nend>IV%SourceSteps) then
+        out = 0
+        y1=0._dl !So we know to calculate starting y1,y2 if there is next range
+        y2=0._dl
+        chi=(State%tau0-State%TimeSteps%points(nend))/State%curvature_radius
+        return
+    end if
+
     dchisource=dtau/State%curvature_radius
 
     num1=1._dl/nu
 
     scalel=l/State%scale
     if (scalel>=2400) then
-        num2=num1*2.5
-    else if (scalel< 50) then
+        num2=num1*2.5_dl
+    else if (scalel<50) then
         num2=num1*0.8_dl
     else
         num2=num1*1.5_dl
     end if
-    !Dec 2003, since decrease dtaurec, can make this smaller
-    if (dtau==dtaurec_q) then
-        num2=num2/4
-    end if
 
     if (scalel<1500 .and. scalel > 150) &
         IntAccuracyBoost=IntAccuracyBoost*(1+(2000-scalel)*0.6/2000 )
 
-    if (num2*IntAccuracyBoost < dchisource .and. (.not. WantLateTime .or. UseLimber(l)) &
-        .or. (nstart>IV%SourceSteps.and.nend>IV%SourceSteps)) then
+    if (num2*IntAccuracyBoost < dchisource .and. (.not. WantLateTime .or. UseLimber(l))) then
         out = 0
         y1=0._dl !So we know to calculate starting y1,y2 if there is next range
         y2=0._dl
@@ -1899,9 +2320,6 @@ subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
     if (nstart>IV%SourceSteps .and. nend < IV%SourceSteps) then
         chi=(State%tau0-State%TimeSteps%points(IV%SourceSteps))/State%curvature_radius
         Startn=IV%SourceSteps
-        call USpherBesselWithDeriv(State%closed,State%CP,chi,l,nu,y1,y2)
-    else if ((y2==0._dl).and.(y1==0._dl)) then
-        call USpherBesselWithDeriv(State%closed,State%CP,chi,l,nu,y1,y2)
     end if
 
     if (State%closed) then
@@ -1917,13 +2335,8 @@ subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
 
     sgn= isgn
 
-    nlowest=min(Startn,nend)
-    aux1=1._dl*State%curvature_radius/dtau  !used to calculate nearest timestep quickly
-    aux2=(State%tau0-State%TimeSteps%points(nlowest))/dtau + nlowest
-
     nu2=nu*nu
     ap1=l*(l+1)
-    sh=State%rofChi(chi)
 
     if (scalel < 1100) then
         dchimax= 0.3*num1
@@ -1935,99 +2348,90 @@ subroutine DoRangeInt(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
 
     dchimax=dchimax/IntAccuracyBoost
 
-    ujl=y1/sh
-    sources = IV%Source_q(Startn, :)
-
-    out = 0.5_dl*ujl*sources
-
-    Interpolate = dchisource > dchimax
-    if (Interpolate) then !split up smaller than source step size
-        delchi=dchimax
-        Deltachi=sgn*(State%TimeSteps%points(Startn)-State%TimeSteps%points(nend))/State%curvature_radius
-        nIntSteps=int(Deltachi/delchi+0.99_dl)
-        delchi=Deltachi/nIntSteps
-        dtau2o6=(State%curvature_radius*delchi)**2/6._dl
-    else !step size is that of source
-        delchi=dchisource
-        nIntSteps=isgn*(Startn-nend)
-    end if
-
-    sgndelchi=delchi*sgn
-    tmp=(ap1/sh**2 - nu2)
-    hh=0.5_dl*sgndelchi
-    h6=sgndelchi/6._dl
+    nIntSteps=isgn*(Startn-nend)
+    if (nIntSteps <= 0) return
 
+    if (UseShiftedQScalarApprox(l, nu, chi, chiDisp, dchisource, sgn, nIntSteps)) then
+        ! In the local chi << 1 regime, fill the source-grid values directly from the
+        ! shifted-q near-flat approximation instead of stepping the full hyperspherical ODE.
+        out = 0
+        call FillShiftedQUjlVals(j, l, nu, chi, dchisource, sgn, nIntSteps, ujl_vals(1:nIntSteps + 1), y1, y2, chi)
+        last_ix = nIntSteps + 1
+    else
+        if ((y2==0._dl).and.(y1==0._dl)) then
+            call USpherBesselWithDeriv(State%closed,State%CP,chi,l,nu,y1,y2)
+        end if
 
-    do i=1,nIntSteps
-        ! One step in the ujl integration
-        ! fourth-order Runge-Kutta method to integrate equation for ujl
-
-        dydchi1=y2         !deriv y1
-        dydchi2=tmp*y1     !deriv y2
-        xh=chi+hh          !midpoint of step
-        yt1=y1+hh*dydchi1  !y1 at midpoint
-        yt2=y2+hh*dydchi2  !y2 at midpoint
-        dyt1=yt2           !deriv y1 at mid
-        tmp=(ap1/State%rofChi(xh)**2 - nu2)
+        sh=State%rofChi(chi)
+        ujl=y1/sh
+        out = 0
+        ujl_vals(1) = ujl
 
+        nSubSteps = max(1, int(dchisource/dchimax + 0.99_dl))
+        delchi = dchisource/nSubSteps
 
-        dyt2=tmp*yt1       !deriv y2 at mid
+        sgndelchi=delchi*sgn
+        tmp=(ap1/sh**2 - nu2)
 
-        yt1=y1+hh*dyt1     !y1 at mid
-        yt2=y2+hh*dyt2     !y2 at mid
+        y_prev = y1
+        y2_prev = y2
+        q_prev = tmp
 
-        dym1=yt2           !deriv y1 at mid
-        dym2=tmp*yt1       !deriv y2 at mid
-        yt1=y1+sgndelchi*dym1 !y1 at end
-        dym1=dyt1+dym1
-        yt2=y2+sgndelchi*dym2 !y2 at end
-        dym2=dyt2+dym2
+        do i=1,nIntSteps
+            do step_ix = 1, nSubSteps
+                if (i == 1 .and. step_ix == 1) then
+                    call StepUSpherBesselRK4(ap1, nu2, sgndelchi, chi, y1, y2, sh, tmp)
+                else
+                    call StepUSpherBesselNumerov(ap1, nu2, sgndelchi, chi, y_prev, y1, y2_prev, q_prev, tmp, sh, y_next, y2_next, q_next)
+                    y_prev = y1
+                    y2_prev = y2
+                    q_prev = tmp
+                    y1 = y_next
+                    y2 = y2_next
+                    tmp = q_next
+                end if
 
-        chi=chi+sgndelchi     !end point
-        sh=State%rofChi(chi)
-        dyt1=yt2           !deriv y1 at end
-        tmp=(ap1/sh**2 - nu2)
-        dyt2=tmp*yt1       !deriv y2 at end
-        y1=y1+h6*(dydchi1+dyt1+2*dym1) !add up
-        y2=y2+h6*(dydchi2+dyt2+2*dym2)
+                ujl=y1/sh
+                if ((isgn<0).and.(y1*y2<0._dl).or.((chi>chiDispTop).and.((chi>3.14).or.(y1*y2>0)))) then
+                    chi=0._dl
+                    exit   !If this happens we are small, so stop integration
+                end if
 
-        ujl=y1/sh
-        if ((isgn<0).and.(y1*y2<0._dl).or.((chi>chiDispTop).and.((chi>3.14).or.(y1*y2>0)))) then
-            chi=0._dl
-            exit   !If this happens we are small, so stop integration
-        end if
+                if (((isgn<0).or.(chi>chiDispTop)).and.(abs(ujl) < minujl)) then
+                    chi=0._dl
+                    exit !break when getting  exponentially small in dissipative region
+                end if
+            end do
 
+            if (chi==0._dl) exit
+            ujl_vals(i + 1) = ujl
+        end do
 
-        if (Interpolate) then
-            ! Interpolate the source
-            taui=aux2-aux1*chi
-            is=int(taui)
-            b=taui-is
+        last_ix = i
+    end if
 
-            if (b > 0.998) then
-                !may save time, and prevents numerical error leading to access violation of IV%Source_q(0)
-                sources = IV%Source_q(is+1,:)
-            else
-                a=1._dl-b
-                tmpa=(a**3-a)
-                tmpb=(b**3-b)
-                sources=a*IV%Source_q(is,:)+b*IV%Source_q(is+1,:)+ &
-                    (tmpa*IV%ddSource_q(is,:)+ &
-                    tmpb*IV%ddSource_q(is+1,:))*dtau2o6
-            end if
+    if (last_ix > 0) then
+        ujl_vals(1) = 0.5_dl * ujl_vals(1)
+        if (last_ix > 1) ujl_vals(last_ix) = 0.5_dl * ujl_vals(last_ix)
+        src_ix = Startn
+
+        if (ThisSources%SourceNum==3) then
+            do step_ix = 1, last_ix
+                ujl = ujl_vals(step_ix)
+                out(1) = out(1) + ujl * IV%Source_q(src_ix,1)
+                out(2) = out(2) + ujl * IV%Source_q(src_ix,2)
+                out(3) = out(3) + ujl * IV%Source_q(src_ix,3)
+                src_ix = src_ix - isgn
+            end do
         else
-            sources = IV%Source_q(Startn - i*isgn,:)
-        end if
-
-        out = out + ujl*sources
-
-        if (((isgn<0).or.(chi>chiDispTop)).and.(abs(ujl) < minujl)) then
-            chi=0
-            exit !break when getting  exponentially small in dissipative region
+            do step_ix = 1, last_ix
+                out = out + ujl_vals(step_ix) * IV%Source_q(src_ix,:)
+                src_ix = src_ix - isgn
+            end do
         end if
-    end do
+    end if
 
-    out = (out - sources*ujl/2)*delchi*State%curvature_radius
+    out = out * dchisource*State%curvature_radius
 
     end subroutine DoRangeInt
 
@@ -2038,24 +2442,18 @@ subroutine DoRangeIntTensor(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
     ! integration.
     ! dtau is the spacing of the timesteps (they must be equally spaced)
 
-    type(IntegrationVars), target :: IV
-    integer l,nIntSteps,nstart,nend,nlowest,isgn,i,is
-    real(dl) nu,dtau,num1,num2,Deltachi,aux1,aux2
-    real(dl) a,b,tmpa,tmpb,hh,h6,xh,delchi,taui,scalel
+    type(IntegrationVars) :: IV
+    integer l,nIntSteps,nstart,nend,isgn,i,step_ix,nSubSteps,src_ix, last_ix
+    real(dl) nu,dtau,num1,num2,Deltachi,delchi,scalel
     real(dl) nu2,chi,chiDisp,chiDispTop
-    real(dl) dydchi1,dydchi2,yt1,yt2,dyt1,dyt2,dym1,dym2
 
-    real(dl) tmp,dtau2o6,y1,y2,ap1,sh,ujl
+    real(dl) tmp,y1,y2,ap1,sh,ujl
     real(dl) dchimax,dchisource,sgn,sgndelchi,minujl
+    real(dl) y_prev, y2_prev, y_next, y2_next, q_prev, q_next
     real(dl), parameter:: MINUJl1 = 1.D-6  !cut-off point for smal ujl l=1
-    logical Interpolate
-    real(dl) out(ThisSources%SourceNum), source(ThisSources%SourceNum)
-    real(dl), dimension(:,:), pointer :: sourcep, ddsourcep
+    real(dl) out(ThisSources%SourceNum), ujl_vals(abs(nstart - nend) + 1)
     real(dl) IntAccuracyBoost
 
-    sourcep => IV%Source_q(:,1:)
-    ddsourcep => IV%ddSource_q(:,1:)
-
 
     if (nend==nstart) then
         out=0
@@ -2063,7 +2461,7 @@ subroutine DoRangeIntTensor(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
     end if
 
     IntAccuracyBoost=CP%Accuracy%AccuracyBoost*CP%Accuracy%NonFlatIntAccuracyBoost
-    minujl=MINUJL1*IntAccuracyBoost/l
+    minujl=MINUJl1/l/IntAccuracyBoost
     isgn=sign(1,nstart-nend)!direction of chi integration
     !higher n, later time, smaller chi
 
@@ -2103,11 +2501,6 @@ subroutine DoRangeIntTensor(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
 
     sgn=isgn
 
-    nlowest=min(nstart,nend)
-    aux1=1._dl*State%curvature_radius/dtau  !used to calculate nearest timestep quickly
-    aux2=(State%tau0-State%TimeSteps%points(nlowest))/dtau + nlowest
-
-
     nu2=nu*nu
     ap1=l*(l+1)
 
@@ -2124,93 +2517,68 @@ subroutine DoRangeIntTensor(IV,chi,chiDisp,nstart,nend,dtau,nu,l,y1,y2,out)
     dchimax=dchimax/IntAccuracyBoost
 
     ujl=y1/sh
-    out = ujl * sourcep(nstart,:)/2
-
-    Interpolate = dchisource > dchimax
-    if (Interpolate) then !split up smaller than source step size
-        delchi=dchimax
-        Deltachi=sgn*(State%TimeSteps%points(nstart)-State%TimeSteps%points(nend))/State%curvature_radius
-        nIntSteps=int(Deltachi/delchi+0.99_dl)
-        delchi=Deltachi/nIntSteps
-        dtau2o6=(State%curvature_radius*delchi)**2/6._dl
-    else !step size is that of source
-        delchi=dchisource
-        nIntSteps=isgn*(nstart-nend)
-    end if
+    out = 0
+    ujl_vals(1) = ujl
+
+    nIntSteps=isgn*(nstart-nend)
+    if (nIntSteps <= 0) return
+
+    nSubSteps = max(1, int(dchisource/dchimax + 0.99_dl))
+    delchi = dchisource/nSubSteps
 
 
     sgndelchi=delchi*sgn
     tmp=(ap1/sh**2 - nu2)
-    hh=0.5_dl*sgndelchi
-    h6=sgndelchi/6._dl
 
+    y_prev = y1
+    y2_prev = y2
+    q_prev = tmp
 
     do i=1,nIntSteps
-        ! One step in the ujl integration
-        ! fourth-order Runge-Kutta method to integrate equation for ujl
-
-        dydchi1=y2         !deriv y1
-        dydchi2=tmp*y1     !deriv y2
-        xh=chi+hh          !midpoint of step
-        yt1=y1+hh*dydchi1  !y1 at midpoint
-        yt2=y2+hh*dydchi2  !y2 at midpoint
-        dyt1=yt2           !deriv y1 at mid
-        tmp=(ap1/State%rofChi(xh)**2 - nu2)
-
-
-        dyt2=tmp*yt1       !deriv y2 at mid
-        yt1=y1+hh*dyt1     !y1 at mid
-        yt2=y2+hh*dyt2     !y2 at mid
-
-        dym1=yt2           !deriv y1 at mid
-        dym2=tmp*yt1       !deriv y2 at mid
-        yt1=y1+sgndelchi*dym1 !y1 at end
-        dym1=dyt1+dym1
-        yt2=y2+sgndelchi*dym2 !y2 at end
-        dym2=dyt2+dym2
-
-        chi=chi+sgndelchi     !end point
-        sh=State%rofChi(chi)
-        dyt1=yt2           !deriv y1 at end
-        tmp=(ap1/sh**2 - nu2)
-        dyt2=tmp*yt1       !deriv y2 at end
-        y1=y1+h6*(dydchi1+dyt1+2._dl*dym1) !add up
-        y2=y2+h6*(dydchi2+dyt2+2._dl*dym2)
+        do step_ix = 1, nSubSteps
+            if (i == 1 .and. step_ix == 1) then
+                call StepUSpherBesselRK4(ap1, nu2, sgndelchi, chi, y1, y2, sh, tmp)
+            else
+                call StepUSpherBesselNumerov(ap1, nu2, sgndelchi, chi, y_prev, y1, y2_prev, q_prev, tmp, sh, y_next, y2_next, q_next)
+                y_prev = y1
+                y2_prev = y2
+                q_prev = tmp
+                y1 = y_next
+                y2 = y2_next
+                tmp = q_next
+            end if
 
-        ujl=y1/sh
-        if ((isgn<0).and.(y1*y2<0._dl).or.((chi>chiDispTop).and.((chi>3.14).or.(y1*y2>0)))) then
-            chi=0._dl
-            exit   !exit because ujl now small
-        end if
+            ujl=y1/sh
+            if ((isgn<0).and.(y1*y2<0._dl).or.((chi>chiDispTop).and.((chi>3.14).or.(y1*y2>0)))) then
+                chi=0._dl
+                exit   !exit because ujl now small
+            end if
 
-        if (Interpolate) then
-            ! Interpolate the source
-            taui=aux2-aux1*chi
-            is=int(taui)
-            b=taui-is
-            if (b > 0.995) then
-                !may save time, and prevents numerical error leading to access violation of zero index
-                is=is+1
-                source = sourcep(is,:)
-            else
-                a=1._dl-b
-                tmpa=(a**3-a)
-                tmpb=(b**3-b)
-                source = a*sourcep(is,:)+b*sourcep(is+1,:)+ &
-                    (tmpa*ddsourcep(is,:) +  tmpb*ddsourcep(is+1,:))*dtau2o6
+            if (((isgn<0).or.(chi>chiDispTop)).and.(abs(ujl) < minujl)) then
+                chi=0._dl
+                exit  !break when getting exponentially small in dissipative region
             end if
-        else
-            source = sourcep(nstart - i*isgn,:)
-        end if
-        out = out + source * ujl
+        end do
 
-        if (((isgn<0).or.(chi>chiDispTop)).and.(abs(ujl) < minujl)) then
-            chi=0
-            exit  !break when getting  exponentially small in dissipative region
-        end if
+        if (chi==0._dl) exit
+        ujl_vals(i + 1) = ujl
     end do
 
-    out = (out - source * ujl /2)*delchi*State%curvature_radius
+    last_ix = i
+    if (last_ix > 0) then
+        ujl_vals(1) = 0.5_dl * ujl_vals(1)
+        if (last_ix > 1) ujl_vals(last_ix) = 0.5_dl * ujl_vals(last_ix)
+        src_ix  = nstart
+        do step_ix = 1, last_ix
+            ujl = ujl_vals(step_ix)
+            out(1) = out(1) + ujl * IV%Source_q(src_ix,1)
+            out(2) = out(2) + ujl * IV%Source_q(src_ix,2)
+            out(3) = out(3) + ujl * IV%Source_q(src_ix,3)
+            src_ix = src_ix - isgn
+        end do
+    end if
+
+    out = out*dchisource*State%curvature_radius
 
     end subroutine DoRangeIntTensor
 
diff --git a/fortran/config.f90 b/fortran/config.f90
index ecbb98c23..f047fb4a9 100644
--- a/fortran/config.f90
+++ b/fortran/config.f90
@@ -3,10 +3,17 @@ module config
     use constants, only: const_twopi
     implicit none
 
-    character(LEN=*), parameter :: version = '1.6.7'
+    character(LEN=*), parameter :: version = '1.6.8'
 
     integer :: FeedbackLevel = 0 !if >0 print out useful information about the model
 
+    integer :: AccuracyTarget = 1 !if >0 enable targeted accuracy improvements
+
+    real(dl) :: near_flat_approx_chi_limit = 0.2_dl
+    real(dl) :: near_flat_approx_chidisp_limit = 0.1_dl
+    logical :: enable_do_near_flat_integration = .true.
+    logical :: enable_shifted_q_scalar_approx = .true.
+
     logical :: output_file_headers = .true.
 
     logical :: DebugMsgs =.false. !Set to true to view progress and timing
@@ -45,8 +52,9 @@ module config
     integer, parameter :: lmax_extrap_highl = 8000
     real(dl), allocatable :: highL_CL_template(:,:)
 
-    integer, parameter :: lensed_convolution_margin = 100
-    !Number of L less than L max at which the lensed power spectrum is calculated
+    !Gap between the lensed output margin (CP%lens_output_margin) and the
+    !unlensed-spectrum convolution margin: convolution_margin = output_margin - lens_convolution_gap.
+    integer, parameter :: lens_convolution_gap = 50
 
     integer :: global_error_flag=0
 
diff --git a/fortran/equations.f90 b/fortran/equations.f90
old mode 100644
new mode 100755
index aabe78aec..3f69d049d
--- a/fortran/equations.f90
+++ b/fortran/equations.f90
@@ -259,12 +259,13 @@ recursive subroutine GaugeInterface_EvolveScal(EV,tau,y,tauend,tol1,ind,rk_setti
     real(dl) cs2, opacity, dopacity
     real(dl) tau_switch_ktau, tau_switch_nu_massless, tau_switch_nu_massive, next_switch
     real(dl) tau_switch_no_nu_multpoles, tau_switch_no_phot_multpoles,tau_switch_nu_nonrel
-    real(dl) noSwitch, smallTime
+    real(dl) noSwitch, smallTime, switchBoost, latePhotSwitchBoost
     !Sources
     real(dl) tau_switch_saha, Delta_TM, xe,a,tau_switch_evolve_TM
 
     noSwitch= State%tau0+1
     smallTime =  min(tau, 1/EV%k_buf)/100
+    switchBoost = CP%Accuracy%AccuracyBoost*CP%Accuracy%TimeSwitchBoost
 
     tau_switch_ktau = noSwitch
     tau_switch_no_nu_multpoles= noSwitch
@@ -302,13 +303,17 @@ recursive subroutine GaugeInterface_EvolveScal(EV,tau,y,tauend,tol1,ind,rk_setti
     if (CP%DoLateRadTruncation) then
         if (.not. EV%no_nu_multpoles) & !!.and. .not. EV%has_nu_relativistic .and. tau_switch_nu_massless ==noSwitch)  &
             tau_switch_no_nu_multpoles= &
-            max(15/EV%k_buf*CP%Accuracy%AccuracyBoost*CP%Accuracy%TimeSwitchBoost,&
+            max(15/EV%k_buf*switchBoost,&
             min(State%taurend,EV%ThermoData%matter_verydom_tau))
 
-        if (.not. EV%no_phot_multpoles .and. (.not.CP%WantCls .or. &
-            EV%k_buf>0.03*CP%Accuracy%AccuracyBoost*CP%Accuracy%TimeSwitchBoost)) &
-            tau_switch_no_phot_multpoles =max(15/EV%k_buf,State%taurend)*CP%Accuracy%AccuracyBoost &
-            *CP%Accuracy%TimeSwitchBoost
+        if (.not. EV%no_phot_multpoles) then
+            latePhotSwitchBoost = switchBoost
+            !High-precision transfer matter power is sensitive to late photon hierarchy truncation.
+            if (AccuracyTarget > 0 .and. CP%WantTransfer .and. CP%Transfer%high_precision) &
+                latePhotSwitchBoost = max(latePhotSwitchBoost, 2._dl)
+            if (.not.CP%WantCls .or. EV%k_buf>0.03*latePhotSwitchBoost) &
+                tau_switch_no_phot_multpoles =max(15/EV%k_buf,State%taurend)*latePhotSwitchBoost
+        end if
     end if
 
     next_switch = min(tau_switch_ktau, tau_switch_nu_massless,EV%TightSwitchoffTime, tau_switch_nu_massive, &
diff --git a/fortran/lensing.f90 b/fortran/lensing.f90
old mode 100644
new mode 100755
index 38340cbf1..30b4ecfe8
--- a/fortran/lensing.f90
+++ b/fortran/lensing.f90
@@ -4,6 +4,10 @@
     !lensing_method=2: using the flat-sky lower order result of astro-ph/9505109
     !                  and astro-ph/9803150 as in CMBFAST
     !lensing_method=3: using inaccurate full sky harmonic method of astro-ph/0001303
+    !lensing_method=4: using direct Gauss-Legendre curved-sky integration following
+    !                  the structure of camb.correlations.lensed_cls
+    !lensing_method=5: optimized May 2026 default; use method 1 when AccurateBB=F
+    !                  and method 4 when AccurateBB=T
 
     !The flat sky result is accurate to about 0.1% in TT, and 0.4% in EE and is
     !about a factor of two faster than lensing_method=1.
@@ -34,17 +38,20 @@
     !obtainable from the CPC program library (www.cpc.cs.qub.ac.uk).
 
     !March 2006: fixed problem with l_max when generating with tensors (thanks Chad Fendt)
+    !May 2026: added direct curved-sky method 4 and optimized selector method 5;
+    !method 5 is now the default lensing method.
 
     module lensing
     use Precision
     use results
     use constants, only : const_pi, const_twopi, const_fourpi
+    use MathUtils, only : Gauss_Legendre
     use splines
     implicit none
     integer, parameter :: lensing_method_curv_corr=1,lensing_method_flat_corr=2, &
-        lensing_method_harmonic=3
+        lensing_method_harmonic=3, lensing_method_curv_corr_direct=4, lensing_method_optimized=5
 
-    integer :: lensing_method = lensing_method_curv_corr
+    integer :: lensing_method = lensing_method_optimized
 
     real(dl) :: lensing_sanity_check_amplitude = 1e-7_dl
 
@@ -64,21 +71,45 @@ module lensing
     integer :: lmax_donelnfa = 0
     real(dl), dimension(:), allocatable  :: lnfa
 
+    integer :: gauss_legendre_cache_npoints = 0
+    real(dl), dimension(:), allocatable, target :: gauss_legendre_cache_xvals, gauss_legendre_cache_weights
+
     public lens_Cls, lensing_includes_tensors, lensing_method, lensing_method_flat_corr,&
-        lensing_method_curv_corr,lensing_method_harmonic, BessI, ALens_Fiducial, &
+        lensing_method_curv_corr,lensing_method_harmonic, lensing_method_curv_corr_direct, lensing_method_optimized, &
+        BessI, ALens_Fiducial, &
         lensing_sanity_check_amplitude, lensClsWithSpectrum, &
         GetFlatSkyCGrads, GetFlatSkyCgradsWithSpectrum
     contains
 
 
+    integer function effective_lensing_method(CP) result(method)
+    use model, only : CAMBparams
+    type(CAMBparams), intent(in) :: CP
+
+    method = lensing_method
+    if (method == lensing_method_optimized) then
+        if (CP%Accuracy%AccurateBB) then
+            method = lensing_method_curv_corr_direct
+        else
+            method = lensing_method_curv_corr
+        end if
+    end if
+    end function effective_lensing_method
+
+
     subroutine lens_Cls(State)
     class(CAMBdata) :: State
+    integer :: method
 
-    if (lensing_method == lensing_method_curv_corr) then
+    method = effective_lensing_method(State%CP)
+
+    if (method == lensing_method_curv_corr) then
         call CorrFuncFullSky(State)
-    elseif (lensing_method == lensing_method_flat_corr) then
+    elseif (method == lensing_method_curv_corr_direct) then
+        call CorrFuncFullSkyDirect(State)
+    elseif (method == lensing_method_flat_corr) then
         call CorrFuncFlatSky(State)
-    elseif (lensing_method == lensing_method_harmonic) then
+    elseif (method == lensing_method_harmonic) then
         call BadHarmonic(State)
     else
         error stop 'Unknown lensing method'
@@ -90,7 +121,7 @@ subroutine CorrFuncFullSky(State)
     integer :: lmax_extrap,l
     real(dl) CPP(0:State%CP%max_l)
 
-    lmax_extrap = State%CP%Max_l - lensed_convolution_margin + 750
+    lmax_extrap = State%CP%Max_l - (State%CP%lens_output_margin - lens_convolution_gap) + 750
     lmax_extrap = min(lmax_extrap_highl,lmax_extrap)
     do l= State%CP%min_l,State%CP%max_l
         ! Cl_scalar(l,1,C_Phi) is l^4 C_phi_phi
@@ -101,6 +132,22 @@ subroutine CorrFuncFullSky(State)
 
     end subroutine CorrFuncFullSky
 
+    subroutine CorrFuncFullSkyDirect(State)
+    class(CAMBdata) :: State
+    integer :: lmax_extrap,l
+    real(dl) CPP(0:State%CP%max_l)
+
+    lmax_extrap = State%CP%Max_l - (State%CP%lens_output_margin - lens_convolution_gap) + 750
+    lmax_extrap = min(lmax_extrap_highl,lmax_extrap)
+    do l= State%CP%min_l,State%CP%max_l
+        ! Cl_scalar(l,1,C_Phi) is l^4 C_phi_phi
+        CPP(l) = State%CLdata%Cl_scalar(l,C_Phi)*(l+1)**2/real(l,dl)**2/const_twopi
+    end do
+    call CorrFuncFullSkyDirectImpl(State, State%ClData, State%ClData, CPP, &
+        State%CP%min_l,max(lmax_extrap,State%CP%max_l))
+
+    end subroutine CorrFuncFullSkyDirect
+
     subroutine lensClsWithSpectrum(State, CPP, lensedCls, lmax_lensed)
     !Get lensed CL using CPP as the lensing specturm
     !CPP is [L(L+1)]^2C_phi_phi/2/pi
@@ -109,12 +156,18 @@ subroutine lensClsWithSpectrum(State, CPP, lensedCls, lmax_lensed)
     real(dl) :: lensedCls(4, 0:State%CP%Max_l)
     integer :: lmax_lensed
     Type(TCLData) :: CLout
-    integer :: lmax_extrap, l
+    integer :: lmax_extrap, l, method
 
-    lmax_extrap = State%CP%Max_l - lensed_convolution_margin + 750
+    lmax_extrap = State%CP%Max_l - (State%CP%lens_output_margin - lens_convolution_gap) + 750
     lmax_extrap = min(lmax_extrap_highl,lmax_extrap)
-    call CorrFuncFullSkyImpl(State, State%ClData, CLout, CPP, &
-        State%CP%min_l,max(lmax_extrap,State%CP%max_l))
+    method = effective_lensing_method(State%CP)
+    if (method == lensing_method_curv_corr_direct) then
+        call CorrFuncFullSkyDirectImpl(State, State%ClData, CLout, CPP, &
+            State%CP%min_l,max(lmax_extrap,State%CP%max_l))
+    else
+        call CorrFuncFullSkyImpl(State, State%ClData, CLout, CPP, &
+            State%CP%min_l,max(lmax_extrap,State%CP%max_l))
+    end if
     lmax_lensed = CLout%lmax_lensed
 
     do l=State%CP%min_l, lmax_lensed
@@ -130,6 +183,20 @@ subroutine AmplitudeError
 
     end subroutine AmplitudeError
 
+    subroutine GetCachedGaussLegendre(npoints, xvals, weights)
+    integer, intent(in) :: npoints
+    real(dl), pointer :: xvals(:), weights(:)
+
+    if (.not. allocated(gauss_legendre_cache_xvals) .or. gauss_legendre_cache_npoints /= npoints) then
+        if (allocated(gauss_legendre_cache_xvals)) deallocate(gauss_legendre_cache_xvals, gauss_legendre_cache_weights)
+        allocate(gauss_legendre_cache_xvals(npoints), gauss_legendre_cache_weights(npoints))
+        call Gauss_Legendre(gauss_legendre_cache_xvals, gauss_legendre_cache_weights, npoints)
+        gauss_legendre_cache_npoints = npoints
+    end if
+    xvals => gauss_legendre_cache_xvals
+    weights => gauss_legendre_cache_weights
+    end subroutine GetCachedGaussLegendre
+
     subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
     !Accurate curved sky correlation function method
     !Uses non-perturbative isotropic term with 2nd order expansion in C_{gl,2}
@@ -151,8 +218,6 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
     real(dl) d_22(lmax),d_2m2(lmax),d_20(lmax)
     real(dl) Cphil3(lmin:lmax), CTT(lmin:lmax), CTE(lmin:lmax),CEE(lmin:lmax)
     real(dl) ls(lmax)
-    real(dl) xl(lmax)
-    real(dl), allocatable :: ddcontribs(:,:),corrcontribs(:,:)
     real(dl), allocatable :: lens_contrib(:,:,:)
     integer thread_ix
     real(dl) pmm, pmmp1
@@ -162,13 +227,12 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
     real(dl) dX000,dX022
     integer  interp_fac
     integer j,jmax
-    integer llo, lhi
-    real(dl) a0,b0,ho, sc
+    real(dl) sc, taper
     integer apodize_point_width
     logical :: short_integral_range
-    real(dl) range_fac
+    real(dl) range_fac, apodize_width
     logical, parameter :: approx = .false.
-    real(dl) theta_cut(lmax), LensAccuracyBoost
+    real(dl) theta_cut(lmax), LensAccuracyBoost, ThetaSampleBoost
     Type(TTimer) :: Timer
 
     !$ integer  OMP_GET_THREAD_NUM, OMP_GET_MAX_THREADS
@@ -178,21 +242,28 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
     associate(lSamp => State%CLData%CTransScal%ls, CP=>State%CP)
 
         LensAccuracyBoost = CP%Accuracy%AccuracyBoost*CP%Accuracy%LensingBoost
+        ThetaSampleBoost = LensAccuracyBoost
+
+        if (AccuracyTarget > 0) then
+            ThetaSampleBoost = ThetaSampleBoost * 1.6_dl
+            if (CP%Max_l > 3500) ThetaSampleBoost = ThetaSampleBoost * (2.2_dl/1.6_dl)
+        else if (CP%Max_l > 3500) then
+            ThetaSampleBoost = ThetaSampleBoost * 1.3_dl
+        end if
         max_lensed_ix = lSamp%nl-1
-        do while(lSamp%l(max_lensed_ix) > CP%Max_l - lensed_convolution_margin)
+        do while(lSamp%l(max_lensed_ix) > CP%Max_l - (CP%lens_output_margin - lens_convolution_gap))
             max_lensed_ix = max_lensed_ix -1
         end do
-        !150 is the default margin added in python by set_for_lmax
-        CLout%lmax_lensed = max(lSamp%l(max_lensed_ix), CP%Max_l - 150)
+        CLout%lmax_lensed = max(lSamp%l(max_lensed_ix), CP%Max_l - CP%lens_output_margin)
 
         if (allocated(CLout%Cl_lensed)) deallocate(CLout%Cl_lensed)
         allocate(CLout%Cl_lensed(lmin:CLout%lmax_lensed,1:4), source = 0._dl)
 
-        npoints = CP%Max_l  * 2 *LensAccuracyBoost
+        npoints = CP%Max_l  * 2 * ThetaSampleBoost
         short_integral_range = .not. CP%Accuracy%AccurateBB
         dtheta = const_pi / npoints
-        if (CP%Max_l > 3500) dtheta=dtheta/1.3
-        apodize_point_width = nint(0.003 / dtheta)
+        apodize_width = 0.012_dl
+        apodize_point_width = nint(apodize_width / dtheta)
         npoints = int(const_pi/dtheta)
         if (short_integral_range) then
             range_fac= max(1._dl,32/LensAccuracyBoost) !fraction of range to integrate
@@ -213,13 +284,14 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
             if (l<=15 .or. mod(l-15,interp_fac)==interp_fac/2) then
                 jmax =jmax+1
                 ls(jmax)=l
-                xl(jmax)=l
             end if
             lfacs(l) = real(l*(l+1),dl)
             lfacs2(l) = real((l+2)*(l-1),dl)
             lrootfacs(l) = sqrt(lfacs(l)*lfacs2(l))
         end do
         do l=2,lmax
+            ! Equivalent to the Python correlations.py indser threshold, but written
+            ! as a per-l cutoff in theta rather than a per-x split in l.
             theta_cut(l) = 0.244949_dl/sqrt(3._dl*lfacs(l) - 8._dl)
         end do
 
@@ -231,7 +303,6 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
         thread_ix = 1
         !$ thread_ix = OMP_GET_MAX_THREADS()
         allocate(lens_contrib(4,CLout%lmax_lensed,thread_ix))
-        allocate(ddcontribs(lmax,4),corrcontribs(lmax,4))
 
         do l=lmin,CP%Max_l
             ! (2*l+1)l(l+1)/4pi C_phi_phi: Cl_scalar(l,1,C_Phi) is l^4 C_phi_phi
@@ -275,7 +346,7 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
         !$OMP PARALLEL DO DEFAULT(PRIVATE),  &
         !$OMP SHARED(lfacs,lfacs2,lrootfacs,Cphil3,CTT,CTE,CEE,lens_contrib,theta_cut), &
         !$OMP SHARED(lmin, lmax,dtheta,CL,CLout,roots, npoints,interp_fac), &
-        !$OMP SHARED(jmax,ls,xl,short_integral_range,apodize_point_width)
+        !$OMP SHARED(jmax,ls,short_integral_range,apodize_point_width)
         do i=1,npoints-1
 
             theta = i * dtheta
@@ -332,6 +403,7 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
 
             end do
 
+            corr = 0._dl
             do j=1,jmax
                 l =ls(j)
 
@@ -423,72 +495,39 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
                 fac = ( (X000**2-1) + Cg2sq*fac1)*P(l)+ Cg2sq*fac3*d2m2 &
                     + 8/llp1* fac1*Cg2*dm11
 
-                corrcontribs(j,1)=  CTT(l) * fac
+                if (j > 14) then
+                    sc = real(interp_fac, dl)
+                else
+                    sc = 1._dl
+                end if
+                corr(1) = corr(1) + sc*CTT(l)*fac
 
                 fac2=(Cg2*dX022)**2+(X022**2-1)
                 !Q+U
                 fac = 2*Cg2*X121*X132*d13 + fac2*d22 +Cg2sq*X242*X220*d04
 
-                corrcontribs(j,2)= CEE(l) * fac
+                corr(2) = corr(2) + sc*CEE(l)*fac
 
                 !Q-U
                 fac = ( fac3*P(l) + X242**2*d4m4)*Cg2sq/2 &
                     + Cg2*(X121**2*dm11+ X132**2*d3m3) + fac2*d2m2
 
-                corrcontribs(j,3)= CEE(l) * fac
+                corr(3) = corr(3) + sc*CEE(l)*fac
 
                 !TE
                 fac = (X000*X022-1)*d20+ &
                     2*dX000*Cg2*(X121*d11 + X132*d1m3)/rootllp1 &
                     + Cg2sq*(X220/2*d2m4*X242 +( fac3/2 + dX022*dX000)*d20)
 
-                corrcontribs(j,4)= CTE(l) * fac
-
-            end do
+                corr(4) = corr(4) + sc*CTE(l)*fac
 
-            do j=1,4
-                corr(j) = sum(corrcontribs(1:14,j))+interp_fac*sum(corrcontribs(15:jmax,j))
             end do
 
-            !if (short_integral_range .and. i>npoints-20) &
-            !        corr=corr*exp(-(i-npoints+20)**2/150.0) !taper the end to help prevent ringing
-
-            if (short_integral_range .and. i>npoints-apodize_point_width*3) &
-                corr=corr*exp(-(i-npoints+apodize_point_width*3)**2/real(2*apodize_point_width**2))
-            !taper the end to help prevent ringing
-
-
-            !Interpolate contributions
-            !Increasing interp_fac and using this seems to be slower than above
-            if (.false.) then
-                if (abs(sum(corrcontribs(1:jmax,1)))>1e-11) print *,i,sum(corrcontribs(1:jmax,1))
-                do j=1,4
-                    call spline_def(xl,corrcontribs(:,j),jmax,ddcontribs(:,j))
-                end do
-                corr=0
-                llo=1
-                do l=lmin,lmax
-                    if ((l > ls(llo+1)).and.(llo < jmax)) then
-                        llo=llo+1
-                    end if
-                    lhi=llo+1
-                    ho=ls(lhi)-ls(llo)
-                    a0=(ls(lhi)-l)/ho
-                    b0=(l-ls(llo))/ho
-                    fac1 = ho**2/6
-                    fac2 = (b0**3-b0)*fac1
-                    fac1 = (a0**3-a0)*fac1
-
-                    corr(1) = Corr(1)+ a0*corrcontribs(llo,1)+ b0*corrcontribs(lhi,1)+ &
-                        fac1* ddcontribs(llo,1) +fac2*ddcontribs(lhi,1)
-                    corr(2) = Corr(2)+ a0*corrcontribs(llo,2)+ b0*corrcontribs(lhi,2)+ &
-                        fac1* ddcontribs(llo,2) +fac2*ddcontribs(lhi,2)
-                    corr(3) = Corr(3)+ a0*corrcontribs(llo,3)+ b0*corrcontribs(lhi,3)+ &
-                        fac1* ddcontribs(llo,3) +fac2*ddcontribs(lhi,3)
-                    corr(4) = Corr(4)+ a0*corrcontribs(llo,4)+ b0*corrcontribs(lhi,4)+ &
-                        fac1* ddcontribs(llo,4) +fac2*ddcontribs(lhi,4)
-
-                end do
+            if (short_integral_range .and. i>npoints-apodize_point_width) then
+                !taper the end to help prevent ringing
+                taper = real(npoints-i, dl)/real(apodize_point_width, dl)
+                taper = max(0._dl, min(1._dl, taper))
+                corr = corr*taper**3*(10._dl + taper*(-15._dl + 6._dl*taper))
             end if
 
             !$  thread_ix = OMP_GET_THREAD_NUM()+1
@@ -534,6 +573,296 @@ subroutine CorrFuncFullSkyImpl(State,CL,CLout,CPP,lmin,lmax)
 
     end subroutine CorrFuncFullSkyImpl
 
+    subroutine DirectCorrPointAccumulate(x, weight, lmin, lmax, lmax_lensed, lfacs, lfacs2, lrootfacs, &
+        Cphil3, CTT, CEE, CTE, thread_contrib)
+    real(dl), intent(in) :: x, weight
+    integer, intent(in) :: lmin, lmax, lmax_lensed
+    real(dl), intent(in) :: lfacs(1:lmax), lfacs2(1:lmax), lrootfacs(1:lmax)
+    real(dl), intent(in) :: Cphil3(1:lmax), CTT(1:lmax), CEE(1:lmax), CTE(1:lmax)
+    real(dl), intent(inout) :: thread_contrib(4, lmax_lensed)
+    integer :: l
+    real(dl) :: sigma2, Cg2, fac, fac1, fac2, sinth, sin2, sinfac, theta, theta_cut
+    real(dl) :: rootfac1, rootfac2, rootfac3, c2fac, c2fac2, facexp, T2, T4
+    real(dl) :: corr(4), pmm, pmmp1
+    real(dl) :: P(1:lmax), dP(1:lmax), d11(1:lmax), dm11(1:lmax), d20(1:lmax), d22(1:lmax), d2m2(1:lmax)
+    real(dl) :: d1m2(1:lmax), d12(1:lmax), d1m3(1:lmax), d2m3(1:lmax), d3m3(1:lmax), d13(1:lmax)
+    real(dl) :: d04(1:lmax), d2m4(1:lmax), d4m4(1:lmax)
+
+    fac1 = 1._dl - x
+    fac2 = 1._dl + x
+    sin2 = max(1e-30_dl,1._dl - x**2)
+    sinth = sqrt(sin2)
+    sinfac = 4._dl/sinth
+    theta = acos(x)
+
+    P = 0._dl
+    dP = 0._dl
+    d11 = 0._dl
+    dm11 = 0._dl
+    d20 = 0._dl
+    d22 = 0._dl
+    d2m2 = 0._dl
+    d1m2 = 0._dl
+    d12 = 0._dl
+    d1m3 = 0._dl
+    d2m3 = 0._dl
+    d3m3 = 0._dl
+    d13 = 0._dl
+    d04 = 0._dl
+    d2m4 = 0._dl
+    d4m4 = 0._dl
+
+    sigma2 = 0._dl
+    Cg2 = 0._dl
+    pmm = 1._dl
+    pmmp1 = x
+    if (lmin <= 1) then
+        P(1) = x
+        dP(1) = 1._dl
+        d11(1) = fac1*dP(1)/lfacs(1) + P(1)
+        dm11(1) = fac2*dP(1)/lfacs(1) - P(1)
+        sigma2 = sigma2 + (1._dl - d11(1))*Cphil3(1)
+        Cg2 = Cg2 + dm11(1)*Cphil3(1)
+    end if
+    do l=2,lmax
+        P(l)= ((2*l-1)*x*pmmp1 - (l-1)*pmm)/l
+        dP(l) = l*(pmmp1 - x*P(l))/sin2
+        pmm = pmmp1
+        pmmp1 = P(l)
+        fac = fac1/fac2
+
+        d11(l) = fac1*dP(l)/lfacs(l) + P(l)
+        dm11(l) = fac2*dP(l)/lfacs(l) - P(l)
+
+        sigma2 = sigma2 + (1._dl - d11(l))*Cphil3(l)
+        Cg2 = Cg2 + dm11(l)*Cphil3(l)
+
+        d22(l) = (((4*x-8)/fac2 + lfacs(l))*P(l) + 4*fac*(fac2 + (x - 2._dl)/lfacs(l))*dP(l))/lfacs2(l)
+        ! Same stability switch as correlations.py, expressed as a local theta cutoff.
+        theta_cut = 0.244949_dl/sqrt(3._dl*lfacs(l) - 8._dl)
+        if (theta > theta_cut) then
+            d2m2(l) = (((lfacs(l) - (4*x+8)/fac1) * P(l)) + 4._dl/fac * (-fac1 + (x+2._dl)/lfacs(l))*dP(l))/lfacs2(l)
+        else
+            d2m2(l) = lfacs(l)*lfacs2(l)*(1._dl - x**2)**2/7680._dl * (20._dl + (1._dl - x**2)*(16._dl - lfacs(l)))
+        end if
+        d20(l) = (2*x*dP(l) - lfacs(l)*P(l))/lrootfacs(l)
+    end do
+
+    do l=2,lmax
+        rootfac1 = sqrt(lfacs2(l))
+        d1m2(l) = sinth/rootfac1*(dP(l) - 2._dl/fac1*dm11(l))
+        d12(l) = sinth/rootfac1*(dP(l) - 2._dl/fac2*d11(l))
+    end do
+    do l=3,lmax
+        rootfac1 = sqrt(lfacs2(l))
+        rootfac2 = sqrt(real((l+3)*(l-2),dl))
+        fac = lfacs2(l)/(rootfac1*rootfac2)
+        d1m3(l) = -(x + 0.5_dl)*sinfac*d1m2(l)/rootfac2 - fac*dm11(l)
+        d2m3(l) = (-fac2*d2m2(l)*sinfac - rootfac1*d1m2(l))/rootfac2
+        d3m3(l) = (-(x + 1.5_dl)*d2m3(l)*sinfac - rootfac1*d1m3(l))/rootfac2
+        d13(l) = (x - 0.5_dl)*sinfac*d12(l)/rootfac2 - fac*d11(l)
+    end do
+    do l=4,lmax
+        rootfac2 = sqrt(real((l+3)*(l-2),dl))
+        rootfac3 = sqrt(real((l-3)*(l+4),dl))
+        d04(l) = ((-lfacs(l) + (18._dl*x**2 + 6._dl)/sin2)*d20(l) - &
+            6._dl*x*lfacs2(l)*dP(l)/lrootfacs(l))/(rootfac2*rootfac3)
+        d2m4(l) = (-(6._dl*x + 4._dl)/sinth*d2m3(l) - rootfac2*d2m2(l))/rootfac3
+        d4m4(l) = (-7._dl/5._dl*(lfacs2(l) - 6._dl)*d2m2(l) + &
+            12._dl/5._dl*(-lfacs2(l) + (9._dl*x + 26._dl)/fac1)*d3m3(l))/(lfacs2(l) - 12._dl)
+    end do
+
+    corr = 0._dl
+    do l=max(1,lmin), lmax
+        facexp = exp(-lfacs(l)*sigma2/2._dl)
+        c2fac = lfacs(l)*Cg2/2._dl
+        c2fac2 = c2fac**2
+        corr(1) = corr(1) + CTT(l)*((facexp - 1._dl)*P(l) + facexp*c2fac*(dm11(l) + c2fac*P(l)/4._dl))
+        if (l >= 2) corr(1) = corr(1) + CTT(l)*facexp*c2fac2*d2m2(l)/4._dl
+    end do
+    do l=max(2,lmin), lmax
+        facexp = exp(-lfacs(l)*sigma2/2._dl)
+        c2fac = lfacs(l)*Cg2/2._dl
+        c2fac2 = c2fac**2
+        corr(2) = corr(2) + CEE(l)*((facexp - 1._dl)*d22(l) + facexp*c2fac2*d22(l)/4._dl)
+        corr(3) = corr(3) + CEE(l)*((facexp - 1._dl)*d2m2(l) + facexp*c2fac*dm11(l)/2._dl + &
+            facexp*c2fac2*(2._dl*d2m2(l) + P(l))/8._dl)
+        corr(4) = corr(4) + CTE(l)*((facexp - 1._dl)*d20(l) + facexp*c2fac*d11(l)/2._dl + &
+            3._dl*facexp*c2fac2*d20(l)/8._dl)
+        if (l >= 3) then
+            corr(2) = corr(2) + CEE(l)*facexp*c2fac*d13(l)
+            corr(3) = corr(3) + CEE(l)*facexp*c2fac*d3m3(l)/2._dl
+            corr(4) = corr(4) + CTE(l)*facexp*c2fac*d1m3(l)/2._dl
+        end if
+        if (l >= 4) then
+            corr(2) = corr(2) + CEE(l)*facexp*c2fac2*d04(l)/4._dl
+            corr(3) = corr(3) + CEE(l)*facexp*c2fac2*d4m4(l)/8._dl
+            corr(4) = corr(4) + CTE(l)*facexp*c2fac2*d2m4(l)/8._dl
+        end if
+    end do
+
+    if (lmin <= 1 .and. lmax_lensed >= 1) then
+        thread_contrib(CT_Temp,1) = thread_contrib(CT_Temp,1) + weight*corr(1)*P(1)
+    end if
+    do l=max(2,lmin), lmax_lensed
+        thread_contrib(CT_Temp,l) = thread_contrib(CT_Temp,l) + weight*corr(1)*P(l)
+        T2 = (corr(2)*weight/2._dl)*d22(l)
+        T4 = (corr(3)*weight/2._dl)*d2m2(l)
+        thread_contrib(CT_E,l) = thread_contrib(CT_E,l) + T2 + T4
+        thread_contrib(CT_B,l) = thread_contrib(CT_B,l) + T2 - T4
+        thread_contrib(CT_Cross,l) = thread_contrib(CT_Cross,l) + weight*corr(4)*d20(l)
+    end do
+    end subroutine DirectCorrPointAccumulate
+
+    subroutine CorrFuncFullSkyDirectImpl(State,CL,CLout,CPP,lmin,lmax)
+    !Direct Gauss-Legendre implementation matching camb.correlations.lensed_cls,
+    !with the same high-L template extension used by the standard Fortran code.
+    class(CAMBdata), target :: State
+    Type(TCLData) :: CL, CLout
+    real(dl) :: CPP(0:State%CP%max_l) ! [L(L+1)]^2 C_L_phi_phi/2pi
+    integer, intent(in) :: lmin,lmax
+    integer :: l, i, npoints, imin
+    integer :: max_lensed_ix, thread_ix
+    real(dl) :: LensAccuracyBoost, sampling_factor, range_fac, theta_max, xmin, xtaper_start
+    real(dl) :: fac, sc, fac2, weight, theta, taper, apodize_theta_width
+    real(dl), pointer :: xvals(:), weights(:)
+    real(dl), allocatable :: lfacs(:), lfacs2(:), lrootfacs(:)
+    real(dl), allocatable :: Cphil3(:), CTT(:), CTE(:), CEE(:)
+    real(dl), allocatable :: lens_contrib(:,:,:)
+    Type(TTimer) :: Timer
+
+    !$ integer  OMP_GET_THREAD_NUM, OMP_GET_MAX_THREADS
+    !$ external OMP_GET_THREAD_NUM, OMP_GET_MAX_THREADS
+
+    if (lensing_includes_tensors) call MpiStop('Haven''t implemented tensor lensing')
+    associate(lSamp => State%CLData%CTransScal%ls, CP=>State%CP)
+
+        LensAccuracyBoost = CP%Accuracy%AccuracyBoost*CP%Accuracy%LensingBoost
+        max_lensed_ix = lSamp%nl-1
+        do while(lSamp%l(max_lensed_ix) > CP%Max_l - (CP%lens_output_margin - lens_convolution_gap))
+            max_lensed_ix = max_lensed_ix -1
+        end do
+        CLout%lmax_lensed = max(lSamp%l(max_lensed_ix), CP%Max_l - CP%lens_output_margin)
+
+        if (allocated(CLout%Cl_lensed)) deallocate(CLout%Cl_lensed)
+        allocate(CLout%Cl_lensed(lmin:CLout%lmax_lensed,1:4), source = 0._dl)
+
+        sampling_factor = 1.4_dl*LensAccuracyBoost
+        npoints = int(sampling_factor*lmax) + 1
+        call GetCachedGaussLegendre(npoints, xvals, weights)
+
+        if (.not. CP%Accuracy%AccurateBB) then
+            range_fac = max(1._dl,32._dl/LensAccuracyBoost)
+            theta_max = const_pi/range_fac
+            xmin = cos(theta_max)
+            imin = 1
+            do while (imin <= npoints .and. xvals(imin) < xmin)
+                imin = imin + 1
+            end do
+            ! C2 taper over a fixed angular width. This damps short-range ringing
+            ! without tying the window shape to the local Gauss-Legendre point spacing.
+            apodize_theta_width = min(theta_max,48._dl*const_pi/lmax)
+            xtaper_start = cos(theta_max - apodize_theta_width)
+        else
+            imin = 1
+            apodize_theta_width = 0._dl
+            xtaper_start = -1._dl
+        end if
+
+        allocate(Cphil3(1:lmax), CTT(1:lmax), CTE(1:lmax), CEE(1:lmax))
+        allocate(lfacs(1:lmax), lfacs2(1:lmax), lrootfacs(1:lmax))
+
+        lfacs = 0._dl
+        lfacs2 = 0._dl
+        lrootfacs = 0._dl
+        Cphil3 = 0._dl
+        CTT = 0._dl
+        CTE = 0._dl
+        CEE = 0._dl
+
+        do l=1,lmax
+            lfacs(l) = real(l*(l+1),dl)
+            if (l >= 2) then
+                lfacs2(l) = real((l+2)*(l-1),dl)
+                lrootfacs(l) = sqrt(lfacs(l)*lfacs2(l))
+            end if
+        end do
+
+        do l=lmin,CP%Max_l
+            Cphil3(l) = CPP(l)*(l+0.5_dl)/real((l+1)*l, dl)
+            fac = (2*l+1)/const_fourpi * const_twopi/(l*(l+1))
+            CTT(l) = CL%Cl_scalar(l,C_Temp)*fac
+            CEE(l) = CL%Cl_scalar(l,C_E)*fac
+            CTE(l) = CL%Cl_scalar(l,C_Cross)*fac
+        end do
+        if (lmax >= 10 .and. Cphil3(10) > lensing_sanity_check_amplitude) then
+            call AmplitudeError()
+            return
+        end if
+        if (lmax > CP%Max_l) then
+            l=CP%Max_l
+            sc = (2*l+1)/const_fourpi * const_twopi/(l*(l+1))
+            fac2 = CTT(CP%Max_l)/(sc*highL_CL_template(CP%Max_l, C_Temp))
+            fac = Cphil3(CP%Max_l)/(sc*highL_CL_template(CP%Max_l, C_Phi))
+            do l=CP%Max_l+1, lmax
+                sc = (2*l+1)/const_fourpi * const_twopi/(l*(l+1))
+                Cphil3(l) = highL_CL_template(l, C_Phi)*fac*sc
+                CTT(l) = highL_CL_template(l, C_Temp)*fac2*sc
+                CEE(l) = highL_CL_template(l, C_E)*fac2*sc
+                CTE(l) = highL_CL_template(l, C_Cross)*fac2*sc
+                if (Cphil3(CP%Max_l+1) > 1e-7_dl) then
+                    call MpiStop('You need to normalize the high-L template so it is dimensionless')
+                end if
+            end do
+        end if
+        if (ALens_Fiducial > 0) then
+            do l=2, lmax
+                sc = (2*l+1)/const_fourpi * const_twopi/(l*(l+1))
+                Cphil3(l) = sc * highL_CL_template(l, C_Phi) * ALens_Fiducial
+            end do
+        end if
+
+        thread_ix = 1
+        !$ thread_ix = OMP_GET_MAX_THREADS()
+        allocate(lens_contrib(4,CLout%lmax_lensed,thread_ix), source = 0._dl)
+
+        if (DebugMsgs) call Timer%Start()
+
+        !$OMP PARALLEL DO DEFAULT(SHARED), PRIVATE(i,weight,theta,taper,thread_ix)
+        do i=imin, npoints
+            weight = weights(i)
+            if (apodize_theta_width > 0._dl .and. xvals(i) < xtaper_start) then
+                theta = acos(xvals(i))
+                taper = max(0._dl,min(1._dl,(theta_max - theta)/apodize_theta_width))
+                weight = weight*taper**3*(10._dl + taper*(-15._dl + 6._dl*taper))
+            end if
+
+            thread_ix = 1
+            !$ thread_ix = OMP_GET_THREAD_NUM()+1
+            call DirectCorrPointAccumulate(xvals(i), weight, lmin, lmax, CLout%lmax_lensed, &
+                lfacs, lfacs2, lrootfacs, Cphil3, CTT, CEE, CTE, lens_contrib(:,:,thread_ix))
+        end do
+        !$OMP END PARALLEL DO
+
+        if (lmin <= 1 .and. CLout%lmax_lensed >= 1) then
+            CLout%Cl_lensed(1,CT_Temp) = sum(lens_contrib(CT_Temp,1,:))*2._dl*const_twopi/OutputDenominator + CL%Cl_scalar(1,C_Temp)
+            CLout%Cl_lensed(1,CT_E) = CL%Cl_scalar(1,C_E)
+            CLout%Cl_lensed(1,CT_B) = 0._dl
+            CLout%Cl_lensed(1,CT_Cross) = CL%Cl_scalar(1,C_Cross)
+        end if
+        do l=max(2,lmin), CLout%lmax_lensed
+            fac = lfacs(l)*const_twopi/OutputDenominator
+            CLout%Cl_lensed(l,CT_Temp) = sum(lens_contrib(CT_Temp,l,:))*fac + CL%Cl_scalar(l,C_Temp)
+            CLout%Cl_lensed(l,CT_E) = sum(lens_contrib(CT_E,l,:))*fac + CL%Cl_scalar(l,C_E)
+            CLout%Cl_lensed(l,CT_B) = sum(lens_contrib(CT_B,l,:))*fac
+            CLout%Cl_lensed(l,CT_Cross) = sum(lens_contrib(CT_Cross,l,:))*fac + CL%Cl_scalar(l,C_Cross)
+        end do
+
+        if (DebugMsgs) call Timer%WriteTime('Time for direct corr lensing')
+    end associate
+
+    end subroutine CorrFuncFullSkyDirectImpl
+
     subroutine CorrFuncFlatSky(State)
     !Do flat sky approx partially non-perturbative lensing, lensing_method=2
     class(CAMBdata) :: State
diff --git a/fortran/model.f90 b/fortran/model.f90
index 9df72ccd7..36d1217bc 100644
--- a/fortran/model.f90
+++ b/fortran/model.f90
@@ -129,7 +129,11 @@ module model
         integer   :: Min_l = 2 ! 1 or larger, usually 1 or 2
         integer   :: Max_l = 2500
         integer   :: Max_l_tensor = 600
-        real(dl)  :: Max_eta_k = 5000
+        integer   :: lens_output_margin = 150
+        !Number of L below Max_l down to which lensed C_L are guaranteed to be output;
+        !the unlensed C_L used in the lensing convolution is treated as reliable to
+        !Max_l - (lens_output_margin - 50), so this also fixes the lensed convolution margin.
+        real(dl)  :: Max_eta_k = 6250
         real(dl)  :: Max_eta_k_tensor = 1200
         ! _tensor settings only used in initialization,
         !Max_l and Max_eta_k are set to the tensor variables if only tensors requested
diff --git a/fortran/results.f90 b/fortran/results.f90
index 9c36b3583..11ce13c8b 100644
--- a/fortran/results.f90
+++ b/fortran/results.f90
@@ -44,6 +44,7 @@ module results
         derived_zdrag=6, derived_rdrag=7,derived_kD=8,derived_thetaD=9, derived_zEQ =10, derived_keq =11, &
         derived_thetaEQ=12, derived_theta_rs_EQ = 13
     integer, parameter :: nthermo_derived = 13
+    real(dl), parameter :: near_flat_scale_tol = 0.03_dl
 
     Type lSamples
         integer :: nl = 0
@@ -179,7 +180,7 @@ module results
 
         real(dl) Omega_de
         real(dl) curv, curvature_radius, Ksign !curvature_radius = 1/sqrt(|curv|), Ksign = 1,0 or -1
-        real(dl) tau0,chi0 !time today and rofChi(tau0/curvature_radius)
+        real(dl) tau0, DMt0 !time today and comoving angular diameter distance to the big bang
         real(dl) scale !relative to flat. e.g. for scaling lSamp%l sampling.
 
         real(dl) akthom !sigma_T * (number density of protons now)
@@ -497,8 +498,8 @@ subroutine CAMBdata_SetParams(this, P, error, DoReion, call_again, background_on
         call this%CP%DarkEnergy%Init(this)
         if (global_error_flag==0) this%tau0=this%TimeOfz(0._dl)
         if (global_error_flag==0) then
-            this%chi0=this%rofChi(this%tau0/this%curvature_radius)
-            this%scale= this%chi0*this%curvature_radius/this%tau0  !e.g. change l sampling depending on approx peak spacing
+            this%DMt0 = this%curvature_radius*this%rofChi(this%tau0/this%curvature_radius)
+            this%scale = this%DMt0/this%tau0  !e.g. change l sampling depending on approx peak spacing
             if (this%closed .and. this%tau0/this%curvature_radius >3.14) then
                 call GlobalError('chi >= pi in closed model not supported',error_unsupported_params)
             end if
@@ -1340,7 +1341,7 @@ subroutine lSamples_init(this, State, lmin, max_l)
     integer, intent(IN) :: lmin,max_l
     integer lind, lvar, step, top, bot, lmin_log
     integer, allocatable :: ls(:)
-    real(dl) AScale
+    real(dl) AScale, growth
 
     allocate(ls(max_l))
     if (allocated(this%l)) deallocate(this%l)
@@ -1348,7 +1349,11 @@ subroutine lSamples_init(this, State, lmin, max_l)
     this%use_spline_template = State%CP%use_cl_spline_template
     lmin_log = State%CP%min_l_logl_sampling
     associate(Accuracy => State%CP%Accuracy)
-        Ascale=State%scale/Accuracy%lSampleBoost
+        if (abs(State%scale - 1._dl) > near_flat_scale_tol) then
+            Ascale = State%scale/Accuracy%lSampleBoost
+        else
+            Ascale = 1._dl/Accuracy%lSampleBoost
+        end if
 
         if (Accuracy%lSampleBoost >=50) then
             !just do all of them
@@ -1475,22 +1480,32 @@ subroutine lSamples_init(this, State, lmin, max_l)
                     end do
 
                     if (max_l > lmin_log) then
-                        !Should be pretty smooth or tiny out here
-                        step=max(nint(400*Ascale),50)
+                        !Above lmin_log the unlensed C_L are smoothly damping. The default initial
+                        !step ~ 400 with 1.5x growth produces few, widely-spaced samples just above
+                        !lmin_log where the unlensed is not yet exponentially small, driving lensed
+                        !C_L interpolation errors. Use a smaller initial step and slower growth
+                        !when AccuracyTarget is on for lensed scalar CMB.
+                        if (AccuracyTarget > 0 .and. State%CP%Want_CMB .and. State%CP%WantScalars) then
+                            step=max(nint(100*Ascale),50)
+                            growth = 1.1_dl
+                        else
+                            step=max(nint(400*Ascale),50)
+                            growth = 1.5_dl
+                        end if
                         lvar = ls(lind)
                         do
                             lvar = lvar + step
                             if (lvar > max_l) exit
                             lind=lind+1
                             ls(lind)=lvar
-                            step = nint(step*1.5) !log spacing
+                            step = nint(step*growth) !log spacing
                         end do
-                        if (ls(lind) < max_l - 100) then
+                        if (ls(lind) < max_l - (State%CP%lens_output_margin - lens_convolution_gap)) then
                             !Try to keep lensed spectra up to specified lmax
                             lind=lind+1
-                            ls(lind)=max_l - lensed_convolution_margin
-                        else if (ls(lind) - ls(lind-1) > lensed_convolution_margin) then
-                            ls(lind)=max_l - lensed_convolution_margin
+                            ls(lind)=max_l - (State%CP%lens_output_margin - lens_convolution_gap)
+                        else if (ls(lind) - ls(lind-1) > State%CP%lens_output_margin - lens_convolution_gap) then
+                            ls(lind)=max_l - (State%CP%lens_output_margin - lens_convolution_gap)
                         end if
                     end if
                 end if !log_lvalues
diff --git a/fortran/tests/CAMB_test_files.py b/fortran/tests/CAMB_test_files.py
index 34b18c8d7..c99a0288e 100644
--- a/fortran/tests/CAMB_test_files.py
+++ b/fortran/tests/CAMB_test_files.py
@@ -475,7 +475,7 @@ def getTestParams():
     # AM - End of edits
 
     params.append(["zre", "re_use_optical_depth = F", "re_redshift  = 8.5"])
-    params.append(["nolens", "lensing = F"])
+    params.append(["nolens", "do_lensing = F"])
     params.append(["noderived", "derived_parameters = F"])
     params.append(["no_rad_trunc", "do_late_rad_truncation   = F"])
 
diff --git a/inifiles/params.ini b/inifiles/params.ini
index 176cb7d21..0bd04209b 100644
--- a/inifiles/params.ini
+++ b/inifiles/params.ini
@@ -19,12 +19,15 @@ do_nonlinear = 0
 
 #Maximum multipole and k*eta.
 #  Note that C_ls near l_max are inaccurate (about 5%), go to 50 more than you need
-#  Lensed power spectra are computed to l_max_scalar-100
+#  Lensed power spectra are guaranteed to be output up to l_max_scalar - lens_output_margin
+#  (and at most up to l_max_scalar - (lens_output_margin - 50), where the unlensed C_L
+#  used in the lensing convolution is treated as reliable).
 #  To get accurate lensed BB need to have l_max_scalar>2000, k_eta_max_scalar > 10000
 #  To get accurate lensing potential you also need k_eta_max_scalar > 10000
-#  Otherwise k_eta_max_scalar=2*l_max_scalar usually suffices, or don't set to use default
+#  Otherwise k_eta_max_scalar=2.5*l_max_scalar usually suffices, or don't set to use default
 l_max_scalar      = 2200
 #k_eta_max_scalar  = 4000
+#lens_output_margin = 150
 
 #  Tensor settings should be less than or equal to the above
 l_max_tensor      = 1500
diff --git a/inifiles/planck_2018.ini b/inifiles/planck_2018.ini
index c125d4c87..a1d3c2209 100644
--- a/inifiles/planck_2018.ini
+++ b/inifiles/planck_2018.ini
@@ -27,7 +27,7 @@ do_nonlinear = 3
 #  Lensed power spectra are computed to l_max_scalar-100
 #  To get accurate lensed BB need to have l_max_scalar>2000, k_eta_max_scalar > 10000
 #  To get accurate lensing potential you also need k_eta_max_scalar > 10000
-#  Otherwise k_eta_max_scalar=2*l_max_scalar usually suffices, or don't set to use default
+#  Otherwise k_eta_max_scalar=2.5*l_max_scalar usually suffices, or don't set to use default
 l_max_scalar      = 2700
 k_eta_max_scalar  = 18000