2 changes: 1 addition & 1 deletion PackageInfo.g
@@ -10,7 +10,7 @@ SetPackageInfo( rec(

PackageName := "GradientBasedLearningForCAP",
Subtitle := "Gradient Based Learning via Category Theory",
Version := "2026.01-01",
Version := "2026.01-02",
Date := (function ( ) if IsBound( GAPInfo.SystemEnvironment.GAP_PKG_RELEASE_DATE ) then return GAPInfo.SystemEnvironment.GAP_PKG_RELEASE_DATE; else return Concatenation( ~.Version{[ 1 .. 4 ]}, "-", ~.Version{[ 6, 7 ]}, "-01" ); fi; end)( ),
License := "GPL-2.0-or-later",

32 changes: 16 additions & 16 deletions README.md
@@ -112,18 +112,18 @@ where the activation map applied on the output layer is the identity function _I
```julia
gap> input_dim := 1;; hidden_dims := [ ];; output_dim := 1;;

gap> f := PredictionMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "IdFunc" );;
gap> f := NeuralNetworkPredictionMorphism( Para, input_dim, hidden_dims, output_dim, "IdFunc" );;
```
As a parametrized map this neural network is defined as:

<img src="pictures/eq-1.png" alt="Image Description" width="1000" height="120">

Note that $(\theta_1,\theta_2)$ represents the parameter vector while $(x)$ represents the input vector. Hence, the above output is an affine transformation of $(x)\in \mathbb{R}^1$.
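Concretely, this is a map of the form (a sketch assuming the usual weight-bias layout of the two parameters; the exact formula is the one displayed in the image above)

$$f\big((\theta_1,\theta_2),(x)\big) = \theta_1 \cdot x + \theta_2.$$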
```julia
gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "x" ] );
gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "x" ] );
[ theta_1, theta_2, x ]

gap> Display( f : dummy_input := input );
gap> Display( f : dummy_input := dummy_input );
ℝ^1 -> ℝ^1 defined by:

Underlying Object:
@@ -156,12 +156,12 @@ Note that $(\theta_1,\theta_2)$ represents the parameters-vector while $(x,y)$ r
In the following we construct the aforementioned loss-map:

```julia
gap> ell := LossMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "IdFunc" );;
gap> ell := NeuralNetworkLossMorphism( Para, input_dim, hidden_dims, output_dim, "IdFunc" );;

gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "x", "y" ] );
gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "x", "y" ] );
[ theta_1, theta_2, x, y ]

gap> Display( ell : dummy_input := input );
gap> Display( ell : dummy_input := dummy_input );
ℝ^2 -> ℝ^1 defined by:

Underlying Object:
@@ -209,7 +209,7 @@ gap> theta := [ 0.1, -0.1 ];;

To perform _nr_epochs_ = 15 updates on $\theta\in\mathbb{R}^2$ we can use the _Fit_ operation:
```julia
gap> nr_epochs := 10;;
gap> nr_epochs := 15;;

gap> theta := Fit( one_epoch_update, nr_epochs, theta );
Epoch 0/15 - loss = 26.777499999999993
@@ -321,7 +321,7 @@ Its input dimension is 2 and output dimension is 3 and has no hidden layers.
```julia
gap> input_dim := 2;; hidden_dims := [ ];; output_dim := 3;;

gap> f := PredictionMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "Softmax" );;
gap> f := NeuralNetworkPredictionMorphism( Para, input_dim, hidden_dims, output_dim, "Softmax" );;
```

As a parametrized map this neural network is defined as:
@@ -330,10 +330,10 @@ As a parametrized map this neural network is defined as:

Note that $(\theta_1,\dots,\theta_9)$ represents the parameter vector while $(x_{1},x_{2})$ represents the input vector. Hence, the above output is the _Softmax_ of an affine transformation of $(x_{1},x_{2})$.
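For reference, and assuming the standard convention for _Softmax_, the activation applied to the affine output $(z_{1},z_{2},z_{3})$ is

$$\text{Softmax}(z_1,z_2,z_3)_i := \frac{e^{z_i}}{e^{z_1}+e^{z_2}+e^{z_3}}, \quad i=1,2,3.$$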
```julia
gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2" ] );
gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2" ] );
[ theta_1, theta_2, theta_3, theta_4, theta_5, theta_6, theta_7, theta_8, theta_9, x1, x2 ]

gap> Display( f : dummy_input := input );
gap> Display( f : dummy_input := dummy_input );
ℝ^2 -> ℝ^3 defined by:

Underlying Object:
@@ -380,11 +380,11 @@ $$\text{Cross-Entropy}((z_1,z_2,z_3),(y_{1},y_{2},y_{3})) := -\frac{1}{3}\left(y
In the following we construct the aforementioned loss-map:

```julia
gap> ell := LossMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "Softmax" );;
gap> ell := NeuralNetworkLossMorphism( Para, input_dim, hidden_dims, output_dim, "Softmax" );;

gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2", "y1", "y2", "y3" ] );
gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2", "y1", "y2", "y3" ] );

gap> Display( ell : dummy_input := input );
gap> Display( ell : dummy_input := dummy_input );
ℝ^5 -> ℝ^1 defined by:

Underlying Object:
@@ -416,7 +416,7 @@ CategoryOfLenses( SkeletalSmoothMaps )

gap> optimizer := Lenses.AdamOptimizer( : learning_rate := 0.01, beta_1 := 0.9, beta_2 := 0.999 );;

gap> optimizer( 9 )
gap> optimizer( 9 );
(ℝ^28, ℝ^28) -> (ℝ^9, ℝ^9) defined by:

Get Morphism:
@@ -433,7 +433,7 @@ Now we compute the One-Epoch-Update-Lens using the _batch size_ = 1:
```julia
gap> batch_size := 1;;

gap> one_epoch_update := OneEpochUpdateLens( ell, optimizer, D, batch_size );;
gap> one_epoch_update := OneEpochUpdateLens( ell, optimizer, D, batch_size );
(ℝ^28, ℝ^28) -> (ℝ^1, ℝ^0) defined by:

Get Morphism:
@@ -494,7 +494,7 @@ Epoch 4/4 - loss = 0.0030655216725219204
Now let us use the updated theta (the last $9$ entries of $w$) to predict the label $\in$ {_class-1_, _class-2_, _class-3_} of the point $[1,-1]\in\mathbb{R}^2$.

```julia
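gap> # The full state w lies in ℝ^28. Assuming the same layout as the Adam
gap> # state (t, m_1, m_2, v_1, v_2, theta_1, theta_2) ∈ ℝ^7 used for two
gap> # parameters in the next-local-minimum example, it splits as 1 + 9 + 9 + 9
gap> # (time step, first moments, second moments, parameters), so the trained
gap> # parameters are the last 9 entries, w{ [ 20 .. 28 ] }.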
gap> theta := SplitDenseList( w, [ 19, 9 ] )[2];
gap> theta := w{ [ 20 .. 28 ] };
[ 5.09137, -4.83379, 3.06257, -5.70976, 0.837175, -4.23622, -1.71171, 5.54301, -4.80856 ]

gap> theta := SkeletalSmoothMaps.Constant( theta );
18 changes: 11 additions & 7 deletions doc/Doc.autodoc
@@ -1,13 +1,16 @@
@Chapter Introduction

This package provides tools for exploring categorical machine learning using the CAP (Categories, Algorithms, Programming) system.
It implements automatic differentiation using the lens pattern and provides constructs for building and training neural networks.
The GradientBasedLearningForCAP package is a computational tool for categorical machine learning within the CAP (Categories, Algorithms, Programming) framework.
It provides a categorical foundation for neural networks by modelling them as parametrised morphisms and performing computation in the category of smooth maps.
The system supports symbolic expressions and automatic differentiation via the lens pattern, enabling the bidirectional data flow required for backpropagation.
Included examples demonstrate practical applications such as finding a local minimum and training models for binary classification, multi-class classification, and linear regression, using various loss functions and optimizers including gradient descent and Adam.
This implementation is based on the paper $\href{https://arxiv.org/abs/2404.00408}{Deep~Learning~with~Parametric~Lenses}$.

@Section Overview

The package implements the following main concepts:

* **Examples**: Examples for creating and training neural networks.
* **Examples**: Examples for creating and training neural networks and computing local minima.

* **Expressions**: A symbolic expression system for representing mathematical formulas.

Expand All @@ -26,11 +29,12 @@ The package implements the following main concepts:
* **Tools**: A few GAP operations and helper functions.


@Chapter Examples for neural networks
@Chapter Examples

@Section Binary-class neural network with binary cross-entropy loss function
@Section Multi-class neural network with cross-entropy loss function
@Section Neural network with quadratic loss function
@Section Binary-Class Neural Network with Binary Cross-Entropy Loss Function
@Section Multi-Class Neural Network with Cross-Entropy Loss Function
@Section Neural Network with Quadratic Loss Function
@Section Next Local Minima

@Chapter Expressions

157 changes: 138 additions & 19 deletions examples/ComputingTheNextLocalMimima/next_local_minima.g
@@ -1,36 +1,155 @@
LoadPackage( "GradientBasedLearningForCAP" );
#! @Chapter Examples

#! @Section Next Local Minima

#! In this example we demonstrate how to use the fitting machinery of
#! $\texttt{GradientBasedLearningForCAP}$ to find a nearby local minimum of a smooth
#! function by gradient-based optimisation.
#!
#! We consider the function
#! @BeginLatexOnly
#! \[
#! f(\theta_1,\theta_2) = \sin(\theta_1)^2 + \log(\theta_2)^2,
#! \]
#! @EndLatexOnly
#! which has local minima at the points $(\pi k, 1)$ for $k \in \mathbb{Z}$.
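#! Indeed, at any such point the function vanishes:
#! @BeginLatexOnly
#! \[
#! f(\pi k, 1) = \sin(\pi k)^2 + \log(1)^2 = 0 + 0 = 0 .
#! \]
#! @EndLatexOnly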
#! We use the Adam optimiser to find a local minimum starting from an initial point.
#! Hence, the parameter vector is of the form
#! @BeginLatexOnly
#! \[
#! w = (t, m_1, m_2, v_1, v_2, \theta_1, \theta_2),
#! \]
#! @EndLatexOnly
#! where $t$ is the time step, $m_1$ and $m_2$ are the first moment estimates for
#! $\theta_1$ and $\theta_2$ respectively, and $v_1$ and $v_2$ are the second moment
#! estimates for $\theta_1$ and $\theta_2$ respectively.
#! We start from the initial point
#! @BeginLatexOnly
#! \[
#! w = (1, 0, 0, 0, 0, 1.58, 0.1),
#! \]
#! @EndLatexOnly
#! which is close to the local minimum at $(\pi, 1)$.
#! After running the optimisation for $500$ epochs, we reach the point
#! @BeginLatexOnly
#! \[
#! w = (501, -9.35215 \times 10^{-12}, 0.041779, 0.00821802, 1.5526, 3.14159, 0.980292),
#! \]
#! @EndLatexOnly
#! where the last two components correspond to the parameters $\theta_1$ and $\theta_2$.
#! Evaluating the function $f$ at this point gives us the value
#! @BeginLatexOnly
#! \[
#! f(3.14159, 0.980292) = 0.000396202,
#! \]
#! @EndLatexOnly
#! which is very close to $0$, the value of the function at the local minima.
#! Thus, we have successfully found a local minimum using gradient-based optimisation.
#! Note that during the optimisation process,
#! the $\theta_1$ parameter moved from approximately $1.58$ to approximately $\pi$,
#! while the $\theta_2$ parameter moved from $0.1$ to approximately $1$.
#!
#! @BeginLatexOnly
#! \begin{center}
#! \includegraphics[width=0.6\textwidth]{../examples/ComputingTheNextLocalMimima/plot-with-3-local-minimas.png}
#! \end{center}
#! @EndLatexOnly

# the function f(x1,x2) = sin(x1)^2 + log(x2)^2 has local miminima at the points (πk, 1) where k ∈ ℤ
LoadPackage( "GradientBasedLearningForCAP" );

#! @Example
Smooth := SkeletalCategoryOfSmoothMaps( );
#! SkeletalSmoothMaps
Lenses := CategoryOfLenses( Smooth );
#! CategoryOfLenses( SkeletalSmoothMaps )
Para := CategoryOfParametrisedMorphisms( Smooth );
#! CategoryOfParametrisedMorphisms( SkeletalSmoothMaps )

f := PreCompose( Smooth,
f_smooth := PreCompose( Smooth,
DirectProductFunctorial( Smooth, [ Smooth.Sin ^ 2, Smooth.Log ^ 2 ] ),
Smooth.Sum( 2 ) );
#! ℝ^2 -> ℝ^1
dummy_input := CreateContextualVariables( [ "theta_1", "theta_2" ] );
#! [ theta_1, theta_2 ]
Display( f_smooth : dummy_input := dummy_input );
#! ℝ^2 -> ℝ^1
#!
#! ‣ Sin( theta_1 ) * Sin( theta_1 ) + Log( theta_2 ) * Log( theta_2 )

f := MorphismConstructor( Para,
ObjectConstructor( Para, Smooth.( 0 ) ),
Pair( Smooth.( 2 ), f ),
Pair( Smooth.( 2 ), f_smooth ),
ObjectConstructor( Para, Smooth.( 1 ) ) );

#! ℝ^0 -> ℝ^1 defined by:
#!
#! Underlying Object:
#! -----------------
#! ℝ^2
#!
#! Underlying Morphism:
#! -------------------
#! ℝ^2 -> ℝ^1
Display( f : dummy_input := dummy_input );
#! ℝ^0 -> ℝ^1 defined by:
#!
#! Underlying Object:
#! -----------------
#! ℝ^2
#!
#! Underlying Morphism:
#! -------------------
#! ℝ^2 -> ℝ^1
#!
#! ‣ Sin( theta_1 ) * Sin( theta_1 ) + Log( theta_2 ) * Log( theta_2 )
optimizer := Lenses.AdamOptimizer( );

# there is only one training example in R^0 which is the trivial vector []
training_examples := [ [] ];

# what else :)
#! function( n ) ... end
training_examples := [ [ ] ];
#! [ [ ] ]
batch_size := 1;

#! 1
one_epoch_update := OneEpochUpdateLens( f, optimizer, training_examples, batch_size );

# initial value for w
#! (ℝ^7, ℝ^7) -> (ℝ^1, ℝ^0) defined by:
#!
#! Get Morphism:
#! ------------
#! ℝ^7 -> ℝ^1
#!
#! Put Morphism:
#! ------------
#! ℝ^7 -> ℝ^7
dummy_input := CreateContextualVariables(
[ "t", "m_1", "m_2", "v_1", "v_2", "theta_1", "theta_2" ] );
#! [ t, m_1, m_2, v_1, v_2, theta_1, theta_2 ]
Display( one_epoch_update : dummy_input := dummy_input );
#! (ℝ^7, ℝ^7) -> (ℝ^1, ℝ^0) defined by:
#!
#! Get Morphism:
#! ------------
#! ℝ^7 -> ℝ^1
#!
#! ‣ (Sin( theta_1 ) * Sin( theta_1 ) + Log( theta_2 ) * Log( theta_2 )) / 1 / 1
#!
#! Put Morphism:
#! ------------
#! ℝ^7 -> ℝ^7
#!
#! ‣ t + 1
#! ‣ 0.9 * m_1 + 0.1 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0))
#! ‣ 0.9 * m_2 + 0.1 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1))
#! ‣ 0.999 * v_1 + 0.001 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0)) ^ 2
#! ‣ 0.999 * v_2 + 0.001 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1)) ^ 2
#! ‣ theta_1 + 0.001 / (1 - 0.999 ^ t) * ((0.9 * m_1 + 0.1 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0))) / (1.e-0\
#! 7 + Sqrt( (0.999 * v_1 + 0.001 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0)) ^ 2) / (1 - 0.999 ^ t) )))
#! ‣ theta_2 + 0.001 / (1 - 0.999 ^ t) * ((0.9 * m_2 + 0.1 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1))) / (1.e-07 \
#! + Sqrt( (0.999 * v_2 + 0.001 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1)) ^ 2) / (1 - 0.999 ^ t) )))
w := [ 1, 0, 0, 0, 0, 1.58, 0.1 ];

nr_epochs := 5000;

w := Fit( one_epoch_update, nr_epochs, w );

# after 5000 epoch the found point is [ bla bla, 3.14159, 1 ]
#! [ 1, 0, 0, 0, 0, 1.58, 0.1 ]
nr_epochs := 500;
#! 500
w := Fit( one_epoch_update, nr_epochs, w : verbose := false );
#! [ 501, -9.35215e-12, 0.041779, 0.00821802, 1.5526, 3.14159, 0.980292 ]
theta := w{ [ 6, 7 ] };
#! [ 3.14159, 0.980292 ]
Map( f_smooth )( theta );
#! [ 0.000396202 ]
#! @EndExample
4 changes: 2 additions & 2 deletions examples/NeuralNetwork_BinaryCrossEntropy/neural_network.g
@@ -1,6 +1,6 @@
#! @Chapter Examples for neural networks
#! @Chapter Examples

#! @Section Binary-class neural network with binary cross-entropy loss function
#! @Section Binary-Class Neural Network with Binary Cross-Entropy Loss Function

LoadPackage( "GradientBasedLearningForCAP" );

4 changes: 2 additions & 2 deletions examples/NeuralNetwork_CrossEntropy/neural_network.g
@@ -1,6 +1,6 @@
#! @Chapter Examples for neural networks
#! @Chapter Examples

#! @Section Multi-class neural network with cross-entropy loss function
#! @Section Multi-Class Neural Network with Cross-Entropy Loss Function

LoadPackage( "GradientBasedLearningForCAP" );

4 changes: 2 additions & 2 deletions examples/NeuralNetwork_QuadraticLoss/neural_network.g
@@ -1,8 +1,8 @@
LoadPackage( "GradientBasedLearningForCAP" );

#! @Chapter Examples for neural networks
#! @Chapter Examples

#! @Section Neural network with quadratic loss function
#! @Section Neural Network with Quadratic Loss Function

#! This example demonstrates how to train a small feed-forward neural network
#! for a regression task using the $\texttt{GradientBasedLearningForCAP}$ package. We employ
8 changes: 4 additions & 4 deletions gap/FitParameters.gd
@@ -73,7 +73,7 @@
#! \]
#! @EndLatexOnly
#!
#! For example, if we chose the optimizer to be the gradient descent optimizer with learning rate $\eta=0.01$:
#! Suppose we choose the optimizer lens to be the gradient descent optimizer with learning rate $\eta = 0.01 > 0$,
#! @BeginLatexOnly
#! \[
#! \begin{tikzpicture}
@@ -83,13 +83,13 @@
#! \node (Ap) at (-3,-1) {$\mathbb{R}^p$};
#! \node (Bp) at ( 3,-1) {$\mathbb{R}^p$};
#! \draw (-1.5,-1.8) rectangle (1.5,1.8);
#! \draw[->] (A) -- node[above] {$\Theta \mapsto f_i(\Theta)$} (B);
#! \draw[->] (A) -- node[above] {$\Theta \mapsto \Theta$} (B);
#! \draw[->] (Bp) -- node[midway, below] {$\Theta + \eta g \mapsfrom (\Theta, g)$} (Ap);
#! \draw[-] (-1,1) to[out=-90, in=90] (1,-1);
#! \end{tikzpicture}
#! \]
#! @EndLatexOnly
#! The resulting One-Epoch update lens for the example $X_i$ is given by:
#! then the resulting One-Epoch update lens for the example $X_i$ is given by
#! @BeginLatexOnly
#! \[
#! \begin{tikzpicture}
@@ -99,7 +99,7 @@
#! \node (Ap) at (-3,-1) {$\mathbb{R}^p$};
#! \node (Bp) at ( 3,-1) {$\mathbb{R}^0$};
#! \draw (-1.5,-1.8) rectangle (1.5,1.8);
#! \draw[->] (A) -- node[above] {$\Theta \mapsto \Theta$} (B);
#! \draw[->] (A) -- node[above] {$\Theta \mapsto f_i(\Theta)$} (B);
#! \draw[->] (Bp) -- node[midway, below] {$\Theta - \eta J_{f_i}(\Theta) \mapsfrom \Theta$} (Ap);
#! \draw[-] (-1,1) to[out=-90, in=90] (1,-1);
#! \end{tikzpicture}