Merged
23 commits
2d66897  Add ANN ruff rule (ValerianRey, Feb 19, 2026)
f27a630  Add missing -> None to __init__ (ValerianRey, Feb 19, 2026)
0cb67ca  Improve ANN rule configuration (ValerianRey, Feb 19, 2026)
f67f8d2  Fix untyped ctx (ValerianRey, Feb 19, 2026)
905c658  Add missing -> None in tests (ValerianRey, Feb 19, 2026)
f0330c1  Add missing -> str (ValerianRey, Feb 19, 2026)
ecc011b  Annotate obj as object in docs/source/conf.py (ValerianRey, Feb 19, 2026)
3c022e7  Add missing type annotations in conftest.py (ValerianRey, Feb 19, 2026)
0d7ea2e  Add type annotations in _make_tensors (ValerianRey, Feb 19, 2026)
0d88143  Add type annotations to CloneParams (ValerianRey, Feb 19, 2026)
eb182c4  Add type annotation to InterModuleParamReuse.forward (ValerianRey, Feb 19, 2026)
e11de01  Add -> None to __init_subclass__ (ValerianRey, Feb 19, 2026)
aacf671  Add missing type annotation for chunk_size (ValerianRey, Feb 19, 2026)
4c2b730  Add type annotations to time_call (ValerianRey, Feb 19, 2026)
2f53b64  Add type annotations in run_profiler.py (ValerianRey, Feb 19, 2026)
02f4980  Add -> None to main in plot_memory_timeline.py (ValerianRey, Feb 19, 2026)
040570d  Add return type annotation in MemoryFrame.from_event (ValerianRey, Feb 19, 2026)
7837c10  Add type annotations in static_plotter and rename a variable (ValerianRey, Feb 19, 2026)
0fe3b9b  Add noqa comment in test_lightning_integration (ValerianRey, Feb 19, 2026)
693db5f  Add type annotation to update_gradient_coordinate (ValerianRey, Feb 19, 2026)
802b777  Add annotation as Any for args and kwargs (ValerianRey, Feb 19, 2026)
bab8995  Add -> to test_noncontiguous_jac (ValerianRey, Feb 19, 2026)
bd1af4e  Merge branch 'main' into add-missing-annotations (ValerianRey, Feb 20, 2026)
6 changes: 3 additions & 3 deletions docs/source/conf.py
@@ -100,7 +100,7 @@ def linkcode_resolve(domain: str, info: dict[str, str]) -> str | None:
return link


def _get_obj(_info: dict[str, str]):
def _get_obj(_info: dict[str, str]) -> object:
module_name = _info["module"]
full_name = _info["fullname"]
sub_module = sys.modules.get(module_name)
@@ -112,7 +112,7 @@ def _get_obj(_info: dict[str, str]):
return obj


def _get_file_name(obj) -> str | None:
def _get_file_name(obj: object) -> str | None:
try:
file_name = inspect.getsourcefile(obj)
file_name = os.path.relpath(file_name, start=_PATH_ROOT)
@@ -121,7 +121,7 @@ def _get_file_name(obj) -> str | None:
return file_name


def _get_line_str(obj) -> str:
def _get_line_str(obj: object) -> str:
source, start = inspect.getsourcelines(obj)
end = start + len(source) - 1
line_str = f"#L{start}-L{end}"
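A side note on the obj: object annotations above: object is the precise type for "any value" while keeping type checking meaningful, whereas Any switches checking off for that value (which is also why ANN401 is merely ignored in the pyproject.toml diff below rather than relied upon). A hypothetical illustration, with helpers that are not part of conf.py:

from typing import Any


def describe(obj: object) -> str:
    # `object` only exposes what every Python value supports; anything more
    # specific must be narrowed first, so type checkers still catch mistakes.
    return f"{type(obj).__name__}: {obj!r}"


def describe_any(obj: Any) -> str:
    # `Any` silences the checker: this attribute access type-checks fine even
    # though it fails at runtime for most inputs.
    return str(obj.made_up_attribute)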
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -134,6 +134,7 @@ select = [
"I", # isort
"UP", # pyupgrade
"ARG", # flake8-unused-arguments
"ANN", # flake-8-annotations
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"FIX", # flake8-fixme
@@ -156,12 +157,16 @@ ignore = [
"RUF012", # Mutable default value for class attribute (a bit tedious to fix)
"RET504", # Unnecessary assignment return statement
"COM812", # Trailing comma missing (conflicts with formatter, see https://github.com/astral-sh/ruff/issues/9216)
"ANN401", # Prevent annotating as Any (we rarely do that, and when we do it's hard to find an alternative)
]

[tool.ruff.lint.per-file-ignores]
"**/conftest.py" = ["ARG"] # Can't change argument names in the functions pytest expects
"tests/doc/test_rst.py" = ["ARG"] # For the lightning example

[tool.ruff.lint.flake8-annotations]
suppress-dummy-args = true

[tool.ruff.lint.isort]
combine-as-imports = true

6 changes: 3 additions & 3 deletions src/torchjd/aggregation/_aggregator_bases.py
@@ -13,7 +13,7 @@ class Aggregator(nn.Module, ABC):
:math:`m \times n` into row vectors of dimension :math:`n`.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__()

@staticmethod
@@ -48,7 +48,7 @@ class WeightedAggregator(Aggregator):
:param weighting: The object responsible for extracting the vector of weights from the matrix.
"""

def __init__(self, weighting: Weighting[Matrix]):
def __init__(self, weighting: Weighting[Matrix]) -> None:
super().__init__()
self.weighting = weighting

@@ -77,6 +77,6 @@ class GramianWeightedAggregator(WeightedAggregator):
gramian.
"""

def __init__(self, gramian_weighting: Weighting[PSDMatrix]):
def __init__(self, gramian_weighting: Weighting[PSDMatrix]) -> None:
super().__init__(gramian_weighting << compute_gramian)
self.gramian_weighting = gramian_weighting
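For context on the classes annotated above, a minimal usage sketch. It assumes Mean and UPGrad are exposed by torchjd.aggregation as in the project's documentation and are callable on a 2-D tensor; the matrix values are made up:

import torch

from torchjd.aggregation import Mean, UPGrad

# Two rows of gradients (m = 2 objectives) over n = 3 parameters.
matrix = torch.tensor([[1.0, 2.0, -1.0], [0.5, -1.0, 3.0]])

# An Aggregator maps the m x n matrix to a single row vector of dimension n.
print(Mean()(matrix))    # plain average of the rows
print(UPGrad()(matrix))  # combination designed to avoid conflict between rows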
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_aligned_mtl.py
@@ -61,7 +61,7 @@ def __init__(
self,
pref_vector: Tensor | None = None,
scale_mode: SUPPORTED_SCALE_MODE = "min",
):
) -> None:
self._pref_vector = pref_vector
self._scale_mode: SUPPORTED_SCALE_MODE = scale_mode
super().__init__(AlignedMTLWeighting(pref_vector, scale_mode=scale_mode))
@@ -92,7 +92,7 @@ def __init__(
self,
pref_vector: Tensor | None = None,
scale_mode: SUPPORTED_SCALE_MODE = "min",
):
) -> None:
super().__init__()
self._pref_vector = pref_vector
self._scale_mode: SUPPORTED_SCALE_MODE = scale_mode
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_cagrad.py
@@ -34,7 +34,7 @@ class CAGrad(GramianWeightedAggregator):
To install it, use ``pip install "torchjd[cagrad]"``.
"""

def __init__(self, c: float, norm_eps: float = 0.0001):
def __init__(self, c: float, norm_eps: float = 0.0001) -> None:
super().__init__(CAGradWeighting(c=c, norm_eps=norm_eps))
self._c = c
self._norm_eps = norm_eps
@@ -67,7 +67,7 @@ class CAGradWeighting(Weighting[PSDMatrix]):
function.
"""

def __init__(self, c: float, norm_eps: float = 0.0001):
def __init__(self, c: float, norm_eps: float = 0.0001) -> None:
super().__init__()

if c < 0.0:
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_config.py
@@ -50,7 +50,7 @@ class ConFIG(Aggregator):
<https://github.com/tum-pbs/ConFIG/tree/main/conflictfree>`_.
"""

def __init__(self, pref_vector: Tensor | None = None):
def __init__(self, pref_vector: Tensor | None = None) -> None:
super().__init__()
self.weighting = pref_vector_to_weighting(pref_vector, default=SumWeighting())
self._pref_vector = pref_vector
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_constant.py
@@ -15,7 +15,7 @@ class Constant(WeightedAggregator):
:param weights: The weights associated to the rows of the input matrices.
"""

def __init__(self, weights: Tensor):
def __init__(self, weights: Tensor) -> None:
super().__init__(weighting=ConstantWeighting(weights=weights))
self._weights = weights

@@ -35,7 +35,7 @@ class ConstantWeighting(Weighting[Matrix]):
:param weights: The weights to return at each call.
"""

def __init__(self, weights: Tensor):
def __init__(self, weights: Tensor) -> None:
if weights.dim() != 1:
raise ValueError(
"Parameter `weights` should be a 1-dimensional tensor. Found `weights.shape = "
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_dualproj.py
@@ -33,7 +33,7 @@ def __init__(
norm_eps: float = 0.0001,
reg_eps: float = 0.0001,
solver: SUPPORTED_SOLVER = "quadprog",
):
) -> None:
self._pref_vector = pref_vector
self._norm_eps = norm_eps
self._reg_eps = reg_eps
@@ -77,7 +77,7 @@ def __init__(
norm_eps: float = 0.0001,
reg_eps: float = 0.0001,
solver: SUPPORTED_SOLVER = "quadprog",
):
) -> None:
super().__init__()
self._pref_vector = pref_vector
self.weighting = pref_vector_to_weighting(pref_vector, default=MeanWeighting())
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_flattening.py
@@ -20,7 +20,7 @@ class Flattening(GeneralizedWeighting):
:param weighting: The weighting to apply to the Gramian matrix.
"""

def __init__(self, weighting: Weighting):
def __init__(self, weighting: Weighting) -> None:
super().__init__()
self.weighting = weighting

2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_graddrop.py
@@ -26,7 +26,7 @@ class GradDrop(Aggregator):
through. Defaults to None, which means no leak.
"""

def __init__(self, f: Callable = _identity, leak: Tensor | None = None):
def __init__(self, f: Callable = _identity, leak: Tensor | None = None) -> None:
if leak is not None and leak.dim() != 1:
raise ValueError(
"Parameter `leak` should be a 1-dimensional tensor. Found `leak.shape = "
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_imtl_g.py
@@ -16,7 +16,7 @@ class IMTLG(GramianWeightedAggregator):
<https://arxiv.org/pdf/2406.16232>`_, supports matrices with some linearly dependant rows.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__(IMTLGWeighting())

# This prevents computing gradients that can be very wrong.
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_krum.py
@@ -19,7 +19,7 @@ class Krum(GramianWeightedAggregator):
:param n_selected: The number of selected rows in the context of Multi-Krum. Defaults to 1.
"""

def __init__(self, n_byzantine: int, n_selected: int = 1):
def __init__(self, n_byzantine: int, n_selected: int = 1) -> None:
self._n_byzantine = n_byzantine
self._n_selected = n_selected
super().__init__(KrumWeighting(n_byzantine=n_byzantine, n_selected=n_selected))
@@ -44,7 +44,7 @@ class KrumWeighting(Weighting[PSDMatrix]):
:param n_selected: The number of selected rows in the context of Multi-Krum. Defaults to 1.
"""

def __init__(self, n_byzantine: int, n_selected: int = 1):
def __init__(self, n_byzantine: int, n_selected: int = 1) -> None:
super().__init__()
if n_byzantine < 0:
raise ValueError(
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_mean.py
@@ -13,7 +13,7 @@ class Mean(WeightedAggregator):
matrices.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__(weighting=MeanWeighting())


4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_mgda.py
@@ -20,7 +20,7 @@ class MGDA(GramianWeightedAggregator):
:param max_iters: The maximum number of iterations of the optimization loop.
"""

def __init__(self, epsilon: float = 0.001, max_iters: int = 100):
def __init__(self, epsilon: float = 0.001, max_iters: int = 100) -> None:
super().__init__(MGDAWeighting(epsilon=epsilon, max_iters=max_iters))
self._epsilon = epsilon
self._max_iters = max_iters
@@ -38,7 +38,7 @@ class MGDAWeighting(Weighting[PSDMatrix]):
:param max_iters: The maximum number of iterations of the optimization loop.
"""

def __init__(self, epsilon: float = 0.001, max_iters: int = 100):
def __init__(self, epsilon: float = 0.001, max_iters: int = 100) -> None:
super().__init__()
self.epsilon = epsilon
self.max_iters = max_iters
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_nash_mtl.py
@@ -77,7 +77,7 @@ def __init__(
max_norm: float = 1.0,
update_weights_every: int = 1,
optim_niter: int = 20,
):
) -> None:
super().__init__(
weighting=_NashMTLWeighting(
n_tasks=n_tasks,
@@ -126,7 +126,7 @@ def __init__(
max_norm: float,
update_weights_every: int,
optim_niter: int,
):
) -> None:
super().__init__()

self.n_tasks = n_tasks
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_pcgrad.py
@@ -16,7 +16,7 @@ class PCGrad(GramianWeightedAggregator):
`Gradient Surgery for Multi-Task Learning <https://arxiv.org/pdf/2001.06782.pdf>`_.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__(PCGradWeighting())

# This prevents running into a RuntimeError due to modifying stored tensors in place.
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_random.py
@@ -16,7 +16,7 @@ class Random(WeightedAggregator):
<https://arxiv.org/pdf/2111.10603.pdf>`_.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__(RandomWeighting())


2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_sum.py
@@ -13,7 +13,7 @@ class Sum(WeightedAggregator):
matrices.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__(weighting=SumWeighting())


2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_trimmed_mean.py
@@ -15,7 +15,7 @@ class TrimmedMean(Aggregator):
input matrix (note that ``2 * trim_number`` values are removed from each column).
"""

def __init__(self, trim_number: int):
def __init__(self, trim_number: int) -> None:
super().__init__()
if trim_number < 0:
raise ValueError(
4 changes: 2 additions & 2 deletions src/torchjd/aggregation/_upgrad.py
@@ -34,7 +34,7 @@ def __init__(
norm_eps: float = 0.0001,
reg_eps: float = 0.0001,
solver: SUPPORTED_SOLVER = "quadprog",
):
) -> None:
self._pref_vector = pref_vector
self._norm_eps = norm_eps
self._reg_eps = reg_eps
@@ -78,7 +78,7 @@ def __init__(
norm_eps: float = 0.0001,
reg_eps: float = 0.0001,
solver: SUPPORTED_SOLVER = "quadprog",
):
) -> None:
super().__init__()
self._pref_vector = pref_vector
self.weighting = pref_vector_to_weighting(pref_vector, default=MeanWeighting())
2 changes: 1 addition & 1 deletion src/torchjd/aggregation/_utils/non_differentiable.py
@@ -2,7 +2,7 @@


class NonDifferentiableError(RuntimeError):
def __init__(self, module: nn.Module):
def __init__(self, module: nn.Module) -> None:
super().__init__(f"Trying to differentiate through {module}, which is not differentiable.")


6 changes: 3 additions & 3 deletions src/torchjd/aggregation/_weighting_bases.py
@@ -20,7 +20,7 @@ class Weighting(nn.Module, ABC, Generic[_T]):
generally its Gramian, of dimension :math:`m \times m`.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__()

@abstractmethod
@@ -46,7 +46,7 @@ class _Composition(Weighting[_T]):
output of the function.
"""

def __init__(self, weighting: Weighting[_FnOutputT], fn: Callable[[_T], _FnOutputT]):
def __init__(self, weighting: Weighting[_FnOutputT], fn: Callable[[_T], _FnOutputT]) -> None:
super().__init__()
self.fn = fn
self.weighting = weighting
@@ -63,7 +63,7 @@ class GeneralizedWeighting(nn.Module, ABC):
:math:`m_1 \times \dots \times m_k \times m_k \times \dots \times m_1`.
"""

def __init__(self):
def __init__(self) -> None:
super().__init__()

@abstractmethod
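The split between Weighting and _Composition above is what makes the earlier gramian_weighting << compute_gramian call in GramianWeightedAggregator work: a weighting defined on Gramians is pre-composed with a function that turns a matrix into its Gramian. A rough, self-contained sketch of that pattern, not the library's actual implementation:

from collections.abc import Callable

import torch
from torch import Tensor


def compute_gramian(matrix: Tensor) -> Tensor:
    # m x n matrix -> m x m positive semi-definite Gramian.
    return matrix @ matrix.T


def uniform_weights(gramian: Tensor) -> Tensor:
    # Stand-in for a Gramian-based weighting: returns m equal weights.
    m = gramian.shape[0]
    return torch.full((m,), 1.0 / m)


class Composition:
    """Applies `fn` to the input, then the wrapped weighting to the result."""

    def __init__(self, weighting: Callable[[Tensor], Tensor], fn: Callable[[Tensor], Tensor]) -> None:
        self.weighting = weighting
        self.fn = fn

    def __call__(self, matrix: Tensor) -> Tensor:
        return self.weighting(self.fn(matrix))


# A weighting of Gramians becomes a weighting of matrices.
matrix_weighting = Composition(uniform_weights, compute_gramian)
print(matrix_weighting(torch.randn(3, 5)))  # tensor([0.3333, 0.3333, 0.3333])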
2 changes: 1 addition & 1 deletion src/torchjd/autogram/_engine.py
@@ -183,7 +183,7 @@ def __init__(
self,
*modules: nn.Module,
batch_dim: int | None,
):
) -> None:
self._gramian_accumulator = GramianAccumulator()
self._target_edges = EdgeRegistry()
self._batch_dim = batch_dim
4 changes: 2 additions & 2 deletions src/torchjd/autogram/_gramian_computer.py
@@ -29,7 +29,7 @@ def reset(self) -> None:


class JacobianBasedGramianComputer(GramianComputer, ABC):
def __init__(self, jacobian_computer: JacobianComputer):
def __init__(self, jacobian_computer: JacobianComputer) -> None:
self.jacobian_computer = jacobian_computer


@@ -39,7 +39,7 @@ class JacobianBasedGramianComputerWithCrossTerms(JacobianBasedGramianComputer):
the gramian.
"""

def __init__(self, jacobian_computer: JacobianComputer):
def __init__(self, jacobian_computer: JacobianComputer) -> None:
super().__init__(jacobian_computer)
self.remaining_counter = 0
self.summed_jacobian: Matrix | None = None
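As background for the Gramian computers annotated here, the identity below is what allows a Gramian to be assembled from per-block Jacobians; the snippet is a hypothetical illustration of the math, not code from this module:

import torch

# If the full Jacobian is a column-wise concatenation J = [J_1 | J_2 | J_3]
# (one block per parameter tensor), then J @ J.T == sum_i J_i @ J_i.T,
# so the Gramian can be built block by block without materializing J.
m = 4
blocks = [torch.randn(m, n) for n in (3, 5, 2)]
full_jacobian = torch.cat(blocks, dim=1)

gramian_direct = full_jacobian @ full_jacobian.T
gramian_accumulated = sum(block @ block.T for block in blocks)

assert torch.allclose(gramian_direct, gramian_accumulated)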
2 changes: 1 addition & 1 deletion src/torchjd/autogram/_jacobian_computer.py
@@ -26,7 +26,7 @@ class JacobianComputer(ABC):
:params module: The module to differentiate.
"""

def __init__(self, module: nn.Module):
def __init__(self, module: nn.Module) -> None:
self.module = module

self.rg_params = dict[str, Parameter]()