From d91fb675b0b8c13ab1b05e522163441df86232b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Val=C3=A9rian=20Rey?= Date: Sat, 9 May 2026 02:35:29 +0200 Subject: [PATCH 1/2] chore: Fix typos in docs and source files Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 4 ++-- README.md | 2 +- docs/source/examples/basic_usage.rst | 2 +- docs/source/examples/grouping.rst | 10 +++++----- src/torchjd/_linalg/_matrix.py | 4 ++-- src/torchjd/aggregation/_graddrop.py | 2 +- src/torchjd/aggregation/_imtl_g.py | 2 +- src/torchjd/aggregation/_pcgrad.py | 2 +- src/torchjd/aggregation/_random.py | 2 +- src/torchjd/autogram/_jacobian_computer.py | 2 +- src/torchjd/autogram/_module_hook_manager.py | 2 +- src/torchjd/autojac/_mtl_backward.py | 2 +- src/torchjd/autojac/_transform/_grad.py | 3 +-- tests/unit/autojac/test_mtl_backward.py | 2 +- 14 files changed, 20 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6cec74a4..43ccaeb2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,7 +91,7 @@ changelog does not include internal changes that do not affect the user. Suggested change: `mtl_backward(losses=losses, features=features)` => `mtl_backward(losses, features=features)`. The `features` parameter remains usable as positional or keyword. All other parameters are now keyword-only. - - `Aggregator.__call__`: The `matrix` parameter is now positonal-only. Suggested change: + - `Aggregator.__call__`: The `matrix` parameter is now positional-only. Suggested change: `aggregator(matrix=matrix)` => `aggregator(matrix)`. - `Weighting.__call__`: The `stat` parameter is now positional-only. Suggested change: `weighting(stat=gramian)` => `weighting(gramian)`. @@ -177,7 +177,7 @@ changelog does not include internal changes that do not affect the user. - Made some aggregators (`CAGrad`, `ConFIG`, `DualProj`, `GradDrop`, `IMTLG`, `NashMTL`, `PCGrad` and `UPGrad`) raise a `NonDifferentiableError` whenever one tries to differentiate through them. - Before this change, trying to differentiate through them leaded to wrong gradients or unclear + Before this change, trying to differentiate through them led to wrong gradients or unclear errors. ### Added diff --git a/README.md b/README.md index 5a68971ba..1c3a6b0ff 100644 --- a/README.md +++ b/README.md @@ -294,7 +294,7 @@ TorchJD provides many existing aggregators from the literature, listed in the fo ## Release Methodology -We try to make a release whenever have something worth sharing to users (bug fix, minor or large +We try to make a release whenever we have something worth sharing to users (bug fix, minor or large feature, etc.). TorchJD follows [semantic versioning](https://semver.org/). Since the library is still in beta (`0.x.y`), we sometimes make interface changes in minor versions. We prioritize the long-term quality of the library, which occasionally means introducing breaking changes. Whenever a diff --git a/docs/source/examples/basic_usage.rst b/docs/source/examples/basic_usage.rst index 58b435e4b..1d82d1d63 100644 --- a/docs/source/examples/basic_usage.rst +++ b/docs/source/examples/basic_usage.rst @@ -69,7 +69,7 @@ Perform the Jacobian descent backward pass: The first function will populate the ``.jac`` field of each model parameter with the corresponding Jacobian, and the second one will aggregate these Jacobians and store the result in the ``.grad`` -field of the parameters. It also deletes the ``.jac`` fields save some memory. +field of the parameters. It also deletes the ``.jac`` fields to save some memory. Update each parameter based on its ``.grad`` field, using the ``optimizer``: diff --git a/docs/source/examples/grouping.rst b/docs/source/examples/grouping.rst index a04e50a04..dff8d1279 100644 --- a/docs/source/examples/grouping.rst +++ b/docs/source/examples/grouping.rst @@ -6,21 +6,21 @@ The aggregation can be made independently on groups of parameters, at different the parameters: 1. **Together** (baseline): one group covering all parameters. Corresponds to the `whole_model` - stategy in the paper. + strategy in the paper. 2. **Per network**: one group per top-level sub-network (e.g. encoder and decoder separately). - Corresponds to the `enc_dec` stategy in the paper. + Corresponds to the `enc_dec` strategy in the paper. -3. **Per layer**: one group per leaf module of the network. Corresponds to the `all_layer` stategy +3. **Per layer**: one group per leaf module of the network. Corresponds to the `all_layer` strategy in the paper. 4. **Per tensor**: one group per individual parameter tensor. Corresponds to the `all_matrix` - stategy in the paper. + strategy in the paper. In TorchJD, grouping is achieved by calling :func:`~torchjd.autojac.jac_to_grad` once per group after :func:`~torchjd.autojac.backward` or :func:`~torchjd.autojac.mtl_backward`, with a dedicated aggregator instance per group. For :class:`~torchjd.aggregation.Stateful` aggregators, each instance -should independently maintains its own state (e.g. the EMA :math:`\hat{\phi}` state in +should independently maintain its own state (e.g. the EMA :math:`\hat{\phi}` state in :class:`~torchjd.aggregation.GradVac`, matching the per-block targets from the original paper). .. note:: diff --git a/src/torchjd/_linalg/_matrix.py b/src/torchjd/_linalg/_matrix.py index 7a4960ca7..a7b5ce614 100644 --- a/src/torchjd/_linalg/_matrix.py +++ b/src/torchjd/_linalg/_matrix.py @@ -2,8 +2,8 @@ from torch import Tensor -# Note: we're using classes and inherittance instead of NewType because it's possible to have -# multiple inherittance but there is no type intersection. However, these classes should never be +# Note: we're using classes and inheritance instead of NewType because it's possible to have +# multiple inheritance but there is no type intersection. However, these classes should never be # instantiated: they're only used for static type checking. diff --git a/src/torchjd/aggregation/_graddrop.py b/src/torchjd/aggregation/_graddrop.py index 81ebf8176..c693b8041 100644 --- a/src/torchjd/aggregation/_graddrop.py +++ b/src/torchjd/aggregation/_graddrop.py @@ -20,7 +20,7 @@ class GradDrop(Aggregator): Optimizing Deep Multitask Models with Gradient Sign Dropout `_. - :param f: The function to apply to the Gradient Positive Sign Purity. It should be monotically + :param f: The function to apply to the Gradient Positive Sign Purity. It should be monotonically increasing. Defaults to identity. :param leak: The tensor of leak values, determining how much each row is allowed to leak through. Defaults to None, which means no leak. diff --git a/src/torchjd/aggregation/_imtl_g.py b/src/torchjd/aggregation/_imtl_g.py index a53085be7..b1dd0cbff 100644 --- a/src/torchjd/aggregation/_imtl_g.py +++ b/src/torchjd/aggregation/_imtl_g.py @@ -29,7 +29,7 @@ class IMTLG(GramianWeightedAggregator): :class:`~torchjd.aggregation.GramianWeightedAggregator` generalizing the method described in `Towards Impartial Multi-task Learning `_. This generalization, defined formally in `Jacobian Descent For Multi-Objective Optimization - `_, supports matrices with some linearly dependant rows. + `_, supports matrices with some linearly dependent rows. """ gramian_weighting: IMTLGWeighting diff --git a/src/torchjd/aggregation/_pcgrad.py b/src/torchjd/aggregation/_pcgrad.py index 25e244522..fce7af24d 100644 --- a/src/torchjd/aggregation/_pcgrad.py +++ b/src/torchjd/aggregation/_pcgrad.py @@ -48,7 +48,7 @@ def forward(self, gramian: PSDMatrix, /) -> Tensor: class PCGrad(GramianWeightedAggregator): """ - :class:`~torchjd.aggregation.GramianWeightedAggregator` as defined in algorithm 1 of + :class:`~torchjd.aggregation.GramianWeightedAggregator` as defined in Algorithm 1 of `Gradient Surgery for Multi-Task Learning `_. """ diff --git a/src/torchjd/aggregation/_random.py b/src/torchjd/aggregation/_random.py index d20d54db1..f1e4010c1 100644 --- a/src/torchjd/aggregation/_random.py +++ b/src/torchjd/aggregation/_random.py @@ -21,7 +21,7 @@ def forward(self, matrix: Tensor, /) -> Tensor: class Random(WeightedAggregator): """ :class:`~torchjd.aggregation.WeightedAggregator` that computes a random combination of - the rows of the provided matrices, as defined in algorithm 2 of `Reasonable Effectiveness of + the rows of the provided matrices, as defined in Algorithm 2 of `Reasonable Effectiveness of Random Weighting: A Litmus Test for Multi-Task Learning `_. """ diff --git a/src/torchjd/autogram/_jacobian_computer.py b/src/torchjd/autogram/_jacobian_computer.py index adc90b06a..c5d7ad4c6 100644 --- a/src/torchjd/autogram/_jacobian_computer.py +++ b/src/torchjd/autogram/_jacobian_computer.py @@ -120,7 +120,7 @@ def functional_model_call(rg_params: dict[str, Parameter]) -> tuple[Tensor, ...] vjp_func = torch.func.vjp(functional_model_call, self.rg_params)[1] - # vjp_func is a function that computes the vjp w.r.t. to the primals (tuple). Here the + # vjp_func is a function that computes the vjp w.r.t. the primals (tuple). Here the # functional has a single primal which is dict(module.named_parameters()). We therefore take # the 0'th element to obtain the dict of gradients w.r.t. the module's named_parameters. gradients = vjp_func(grad_outputs_j_)[0] diff --git a/src/torchjd/autogram/_module_hook_manager.py b/src/torchjd/autogram/_module_hook_manager.py index f85693dac..d04372a92 100644 --- a/src/torchjd/autogram/_module_hook_manager.py +++ b/src/torchjd/autogram/_module_hook_manager.py @@ -23,7 +23,7 @@ class ModuleHookManager: """ - Class responsible for handling hooks and Nodes that computes the Gramian reverse accumulation. + Class responsible for handling hooks and Nodes that compute the Gramian reverse accumulation. :param target_edges: Registry for tracking gradient edges that serve as targets for the first differentiation. diff --git a/src/torchjd/autojac/_mtl_backward.py b/src/torchjd/autojac/_mtl_backward.py index 281ddfed2..cb34bc526 100644 --- a/src/torchjd/autojac/_mtl_backward.py +++ b/src/torchjd/autojac/_mtl_backward.py @@ -204,7 +204,7 @@ def _create_task_transform( backpropagate = Select(features) # Transform that accumulates the gradient of the tensor w.r.t. the task-specific parameters into - # their .grad fields and backpropagates the gradient of the tensor w.r.t. to the features. + # their .grad fields and backpropagates the gradient of the tensor w.r.t. the features. backward_task = (backpropagate | accumulate) << grad << Select(tensor) return backward_task diff --git a/src/torchjd/autojac/_transform/_grad.py b/src/torchjd/autojac/_transform/_grad.py index a4bd4ff3a..38f24a00d 100644 --- a/src/torchjd/autojac/_transform/_grad.py +++ b/src/torchjd/autojac/_transform/_grad.py @@ -10,8 +10,7 @@ class Grad(Differentiate): """ Transform from Gradients to Gradients, computing the gradient of each output element with - respect to each input tensor, and applying the linear transformations represented by provided - the grad_outputs to the results. + respect to each input tensor, and applying the linear transformations represented by the grad_outputs to the results. :param outputs: Tensors to differentiate. :param inputs: Tensors with respect to which we differentiate. diff --git a/tests/unit/autojac/test_mtl_backward.py b/tests/unit/autojac/test_mtl_backward.py index 2e8ec2f50..06dca9ed1 100644 --- a/tests/unit/autojac/test_mtl_backward.py +++ b/tests/unit/autojac/test_mtl_backward.py @@ -691,7 +691,7 @@ def test_repeated_task_params() -> None: def test_grad_tensors_value_is_correct() -> None: """ - Tests that mtl_ackward correctly computes the element-wise product of grad_tensors and the + Tests that mtl_backward correctly computes the element-wise product of grad_tensors and the tensors. """ From 13d86c99ccf49b610ecb2c9cab19a9a7c47f8417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Val=C3=A9rian=20Rey?= Date: Sat, 9 May 2026 02:45:13 +0200 Subject: [PATCH 2/2] Add newline --- src/torchjd/autojac/_transform/_grad.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/torchjd/autojac/_transform/_grad.py b/src/torchjd/autojac/_transform/_grad.py index 38f24a00d..33ab747ff 100644 --- a/src/torchjd/autojac/_transform/_grad.py +++ b/src/torchjd/autojac/_transform/_grad.py @@ -10,7 +10,8 @@ class Grad(Differentiate): """ Transform from Gradients to Gradients, computing the gradient of each output element with - respect to each input tensor, and applying the linear transformations represented by the grad_outputs to the results. + respect to each input tensor, and applying the linear transformations represented by the + grad_outputs to the results. :param outputs: Tensors to differentiate. :param inputs: Tensors with respect to which we differentiate.