diff --git a/PackageInfo.g b/PackageInfo.g index b97a933..7f917ea 100644 --- a/PackageInfo.g +++ b/PackageInfo.g @@ -10,7 +10,7 @@ SetPackageInfo( rec( PackageName := "GradientBasedLearningForCAP", Subtitle := "Gradient Based Learning via Category Theory", -Version := "2026.01-01", +Version := "2026.01-02", Date := (function ( ) if IsBound( GAPInfo.SystemEnvironment.GAP_PKG_RELEASE_DATE ) then return GAPInfo.SystemEnvironment.GAP_PKG_RELEASE_DATE; else return Concatenation( ~.Version{[ 1 .. 4 ]}, "-", ~.Version{[ 6, 7 ]}, "-01" ); fi; end)( ), License := "GPL-2.0-or-later", diff --git a/README.md b/README.md index 17dece7..84be530 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ where the activation map applied on the output layer is the identity function _I ```julia gap> input_dim := 1;; hidden_dims := [ ];; output_dim := 1;; -gap> f := PredictionMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "IdFunc" );; +gap> f := NeuralNetworkPredictionMorphism( Para, input_dim, hidden_dims, output_dim, "IdFunc" );; ``` As a parametrized map this neural network is defined as: @@ -120,10 +120,10 @@ As a parametrized map this neural network is defined as: Note that $(\theta_1,\theta_2)$ represents the parameters-vector while $(x)$ represents the input-vector. Hence, the above output is an affine transformation of $(x)\in \mathbb{R}^1$. 
```julia -gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "x" ] ); +gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "x" ] ); [ theta_1, theta_2, x ] -gap> Display( f : dummy_input := input ); +gap> Display( f : dummy_input := dummy_input ); ℝ^1 -> ℝ^1 defined by: Underlying Object: @@ -156,12 +156,12 @@ Note that $(\theta_1,\theta_2)$ represents the parameters-vector while $(x,y)$ r In the following we construct the aforementioned loss-map: ```julia -gap> ell := LossMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "IdFunc" );; +gap> ell := NeuralNetworkLossMorphism( Para, input_dim, hidden_dims, output_dim, "IdFunc" );; -gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "x", "y" ] ); +gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "x", "y" ] ); [ theta_1, theta_2, x, y ] -gap> Display( ell : dummy_input := input ); +gap> Display( ell : dummy_input := dummy_input ); ℝ^2 -> ℝ^1 defined by: Underlying Object: @@ -209,7 +209,7 @@ gap> theta := [ 0.1, -0.1 ];; To perform _nr_epochs_ = 15 updates on $\theta\in\mathbb{R}^2$ we can use the _Fit_ operation: ```julia -gap> nr_epochs := 10;; +gap> nr_epochs := 15;; gap> theta := Fit( one_epoch_update, nr_epochs, theta ); Epoch 0/15 - loss = 26.777499999999993 @@ -321,7 +321,7 @@ Its input dimension is 2 and output dimension is 3 and has no hidden layers. ```julia gap> input_dim := 2;; hidden_dims := [ ];; output_dim := 3;; -gap> f := PredictionMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "Softmax" );; +gap> f := NeuralNetworkPredictionMorphism( Para, input_dim, hidden_dims, output_dim, "Softmax" );; ``` As a parametrized map this neural network is defined as: @@ -330,10 +330,10 @@ As a parametrized map this neural network is defined as: Note that $(\theta_1,\dots,\theta_9)$ represents the parameters-vector while $(x_{1},x_{2})$ represents the input-vector. 
Hence, the above output is the _Softmax_ of an affine transformation of $(x_{1},x_{2})$. ```julia -gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2" ] ); +gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2" ] ); [ theta_1, theta_2, theta_3, theta_4, theta_5, theta_6, theta_7, theta_8, theta_9, x1, x2 ] -gap> Display( f : dummy_input := input ); +gap> Display( f : dummy_input := dummy_input ); ℝ^2 -> ℝ^3 defined by: Underlying Object: @@ -380,11 +380,11 @@ $$\text{Cross-Entropy}((z_1,z_2,z_3),(y_{1},y_{2},y_{3})) := -\frac{1}{3}\left(y In the following we construct the aforementioned loss-map: ```julia -gap> ell := LossMorphismOfNeuralNetwork( Para, input_dim, hidden_dims, output_dim, "Softmax" );; +gap> ell := NeuralNetworkLossMorphism( Para, input_dim, hidden_dims, output_dim, "Softmax" );; -gap> input := ConvertToExpressions( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2", "y1", "y2", "y3" ] ); +gap> dummy_input := CreateContextualVariables( [ "theta_1", "theta_2", "theta_3", "theta_4", "theta_5", "theta_6", "theta_7", "theta_8", "theta_9", "x1", "x2", "y1", "y2", "y3" ] ); -gap> Display( ell : dummy_input := input ); +gap> Display( ell : dummy_input := dummy_input ); ℝ^5 -> ℝ^1 defined by: Underlying Object: @@ -416,7 +416,7 @@ CategoryOfLenses( SkeletalSmoothMaps ) gap> optimizer := Lenses.AdamOptimizer( : learning_rate := 0.01, beta_1 := 0.9, beta_2 := 0.999 );; -gap> optimizer( 9 ) +gap> optimizer( 9 ); (ℝ^28, ℝ^28) -> (ℝ^9, ℝ^9) defined by: Get Morphism: @@ -433,7 +433,7 @@ Now we compute the One-Epoch-Update-Lens using the _batch size_ = 1: ```julia gap> batch_size := 1;; -gap> one_epoch_update := OneEpochUpdateLens( ell, optimizer, D, batch_size );; +gap> one_epoch_update := 
OneEpochUpdateLens( ell, optimizer, D, batch_size ); (ℝ^28, ℝ^28) -> (ℝ^1, ℝ^0) defined by: Get Morphism: @@ -494,7 +494,7 @@ Epoch 4/4 - loss = 0.0030655216725219204 Now let us use the updated theta (is the last $9$ entries) to predict the label $\in$ {_class-1_, _class-2_, _class-3_} of the point $[1,-1]\in\mathbb{R}^2$. ```julia -gap> theta := SplitDenseList( w, [ 19, 9 ] )[2]; +gap> theta := w{ [ 20 .. 28 ] }; [ 5.09137, -4.83379, 3.06257, -5.70976, 0.837175, -4.23622, -1.71171, 5.54301, -4.80856 ] gap> theta := SkeletalSmoothMaps.Constant( theta ); diff --git a/doc/Doc.autodoc b/doc/Doc.autodoc index f3ea502..df23a48 100644 --- a/doc/Doc.autodoc +++ b/doc/Doc.autodoc @@ -1,13 +1,16 @@ @Chapter Introduction -This package provides tools for exploring categorical machine learning using the CAP (Categories, Algorithms, Programming) system. -It implements automatic differentiation using the lens pattern and provides constructs for building and training neural networks. +The GradientBasedLearningForCAP package is a computational tool for categorical machine learning within the CAP (Categories, Algorithms, Programming) framework. +It provides a categorical foundation for neural networks by modelling them as parametrised morphisms and performing computation in the category of smooth maps. +The system supports symbolic expressions and automatic differentiation via the lens pattern, enabling the bidirectional data flow required for backpropagation. +Included examples demonstrate practical applications such as finding a local minimum and training models for binary classification, multi-class classification, and linear regression, using various loss functions and optimizers including gradient descent and Adam. +This implementation is based on the paper $\href{https://arxiv.org/abs/2404.00408}{Deep~Learning~with~Parametric~Lenses}$. @Section Overview The package implements the following main concepts: -* **Examples**: Examples for creating and training neural networks. 
+* **Examples**: Examples for creating and training neural networks and computing local minima. * **Expressions**: A symbolic expression system for representing mathematical formulas. @@ -26,11 +29,12 @@ The package implements the following main concepts: * **Tools**: Few GAP operations and helper functions. -@Chapter Examples for neural networks +@Chapter Examples -@Section Binary-class neural network with binary cross-entropy loss function -@Section Multi-class neural network with cross-entropy loss function -@Section Neural network with quadratic loss function +@Section Binary-Class Neural Network with Binary Cross-Entropy Loss Function +@Section Multi-Class Neural Network with Cross-Entropy Loss Function +@Section Neural Network with Quadratic Loss Function +@Section Next Local Minima @Chapter Expressions diff --git a/examples/ComputingTheNextLocalMimima/next_local_minima.g b/examples/ComputingTheNextLocalMimima/next_local_minima.g index 6f45a40..f83525e 100644 --- a/examples/ComputingTheNextLocalMimima/next_local_minima.g +++ b/examples/ComputingTheNextLocalMimima/next_local_minima.g @@ -1,36 +1,155 @@ -LoadPackage( "GradientBasedLearningForCAP" ); +#! @Chapter Examples + +#! @Section Next Local Minima +#! In this example we demonstrate how to use the fitting machinery of +#! $\texttt{GradientBasedLearningForCAP}$ to find a nearby local minimum of a smooth +#! function by gradient-based optimisation. +#! +#! We consider the function +#! @BeginLatexOnly +#! \[ +#! f(\theta_1,\theta_2) = \sin(\theta_1)^2 + \log(\theta_2)^2, +#! \] +#! @EndLatexOnly +#! which has local minima at the points $(\pi k, 1)$ for $k \in \mathbb{Z}$. +#! We use the Adam optimiser to find a local minimum starting from an initial point. +#! Hence, the parameter vector is of the form +#! @BeginLatexOnly +#! \[ +#! w = (t, m_1, m_2, v_1, v_2, \theta_1, \theta_2), +#! \] +#! @EndLatexOnly +#! where $t$ is the time step, $m_1$ and $m_2$ are the first moment estimates for +#! 
$\theta_1$ and $\theta_2$ respectively, and $v_1$ and $v_2$ are the second moment +#! estimates for $\theta_1$ and $\theta_2$ respectively. +#! We start from the initial point +#! @BeginLatexOnly +#! \[ +#! w = (1, 0, 0, 0, 0, 1.58, 0.1), +#! \] +#! @EndLatexOnly +#! from which gradient descent leads to the local minimum at $(\pi, 1)$, because $\theta_1 = 1.58$ lies just beyond $\pi/2 \approx 1.5708$, where $\sin^2$ has a local maximum. +#! After running the optimisation for $500$ epochs, we reach the point +#! @BeginLatexOnly +#! \[ +#! w = (501, -9.35215 \times 10^{-12}, 0.041779, 0.00821802, 1.5526, 3.14159, 0.980292), +#! \] +#! @EndLatexOnly +#! where the last two components correspond to the parameters $\theta_1$ and $\theta_2$. +#! Evaluating the function $f$ at this point gives us the value +#! @BeginLatexOnly +#! \[ +#! f(3.14159, 0.980292) = 0.000396202, +#! \] +#! @EndLatexOnly +#! which is very close to $0$, the value of the function at each of its local minima. +#! Thus, we have successfully found a local minimum using gradient-based optimisation. +#! Note that during the optimisation process, +#! the $\theta_1$ parameter moved from approximately $1.58$ to approximately $\pi$, +#! while the $\theta_2$ parameter moved from $0.1$ to approximately $1$. +#! +#! @BeginLatexOnly +#! \begin{center} +#! \includegraphics[width=0.6\textwidth]{../examples/ComputingTheNextLocalMimima/plot-with-3-local-minimas.png} +#! \end{center} +#! @EndLatexOnly -# the function f(x1,x2) = sin(x1)^2 + log(x2)^2 has local miminima at the points (πk, 1) where k ∈ ℤ +LoadPackage( "GradientBasedLearningForCAP" ); +#! @Example Smooth := SkeletalCategoryOfSmoothMaps( ); +#! SkeletalSmoothMaps Lenses := CategoryOfLenses( Smooth ); +#! CategoryOfLenses( SkeletalSmoothMaps ) Para := CategoryOfParametrisedMorphisms( Smooth ); +#! CategoryOfParametrisedMorphisms( SkeletalSmoothMaps ) -f := PreCompose( Smooth, +f_smooth := PreCompose( Smooth, DirectProductFunctorial( Smooth, [ Smooth.Sin ^ 2, Smooth.Log ^ 2 ] ), Smooth.Sum( 2 ) ); +#! 
ℝ^2 -> ℝ^1 +dummy_input := CreateContextualVariables( [ "theta_1", "theta_2" ] ); +#! [ theta_1, theta_2 ] +Display( f_smooth : dummy_input := dummy_input ); +#! ℝ^2 -> ℝ^1 +#! +#! ‣ Sin( theta_1 ) * Sin( theta_1 ) + Log( theta_2 ) * Log( theta_2 ) f := MorphismConstructor( Para, ObjectConstructor( Para, Smooth.( 0 ) ), - Pair( Smooth.( 2 ), f ), + Pair( Smooth.( 2 ), f_smooth ), ObjectConstructor( Para, Smooth.( 1 ) ) ); - +#! ℝ^0 -> ℝ^1 defined by: +#! +#! Underlying Object: +#! ----------------- +#! ℝ^2 +#! +#! Underlying Morphism: +#! ------------------- +#! ℝ^2 -> ℝ^1 +Display( f : dummy_input := dummy_input ); +#! ℝ^0 -> ℝ^1 defined by: +#! +#! Underlying Object: +#! ----------------- +#! ℝ^2 +#! +#! Underlying Morphism: +#! ------------------- +#! ℝ^2 -> ℝ^1 +#! +#! ‣ Sin( theta_1 ) * Sin( theta_1 ) + Log( theta_2 ) * Log( theta_2 ) optimizer := Lenses.AdamOptimizer( ); - -# there is only one training example in R^0 which is the trivial vector [] -training_examples := [ [] ]; - -# what else :) +#! function( n ) ... end +training_examples := [ [ ] ]; +#! [ [ ] ] batch_size := 1; - +#! 1 one_epoch_update := OneEpochUpdateLens( f, optimizer, training_examples, batch_size ); - -# initial value for w +#! (ℝ^7, ℝ^7) -> (ℝ^1, ℝ^0) defined by: +#! +#! Get Morphism: +#! ------------ +#! ℝ^7 -> ℝ^1 +#! +#! Put Morphism: +#! ------------ +#! ℝ^7 -> ℝ^7 +dummy_input := CreateContextualVariables( + [ "t", "m_1", "m_2", "v_1", "v_2", "theta_1", "theta_2" ] ); +#! [ t, m_1, m_2, v_1, v_2, theta_1, theta_2 ] +Display( one_epoch_update : dummy_input := dummy_input ); +#! (ℝ^7, ℝ^7) -> (ℝ^1, ℝ^0) defined by: +#! +#! Get Morphism: +#! ------------ +#! ℝ^7 -> ℝ^1 +#! +#! ‣ (Sin( theta_1 ) * Sin( theta_1 ) + Log( theta_2 ) * Log( theta_2 )) / 1 / 1 +#! +#! Put Morphism: +#! ------------ +#! ℝ^7 -> ℝ^7 +#! +#! ‣ t + 1 +#! ‣ 0.9 * m_1 + 0.1 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0)) +#! 
‣ 0.9 * m_2 + 0.1 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1)) +#! ‣ 0.999 * v_1 + 0.001 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0)) ^ 2 +#! ‣ 0.999 * v_2 + 0.001 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1)) ^ 2 +#! ‣ theta_1 + 0.001 / (1 - 0.999 ^ t) * ((0.9 * m_1 + 0.1 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0))) / (1.e-0\ +#! 7 + Sqrt( (0.999 * v_1 + 0.001 * (-1 * ((1 * ((1 * (Sin( theta_1 ) * Cos( theta_1 ) + Sin( theta_1 ) * Cos( theta_1 )) + 0) * 1 + 0) * 1 + 0) * 1 + 0)) ^ 2) / (1 - 0.999 ^ t) ))) +#! ‣ theta_2 + 0.001 / (1 - 0.999 ^ t) * ((0.9 * m_2 + 0.1 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1))) / (1.e-07 \ +#! + Sqrt( (0.999 * v_2 + 0.001 * (-1 * (0 + (0 + 1 * (0 + (0 + 1 * (Log( theta_2 ) * (1 / theta_2) + Log( theta_2 ) * (1 / theta_2))) * 1) * 1) * 1)) ^ 2) / (1 - 0.999 ^ t) ))) w := [ 1, 0, 0, 0, 0, 1.58, 0.1 ]; - -nr_epochs := 5000; - -w := Fit( one_epoch_update, nr_epochs, w ); - -# after 5000 epoch the found point is [ bla bla, 3.14159, 1 ] +#! [ 1, 0, 0, 0, 0, 1.58, 0.1 ] +nr_epochs := 500; +#! 500 +w := Fit( one_epoch_update, nr_epochs, w : verbose := false ); +#! [ 501, -9.35215e-12, 0.041779, 0.00821802, 1.5526, 3.14159, 0.980292 ] +theta := w{ [ 6, 7 ] }; +#! [ 3.14159, 0.980292 ] +Map( f_smooth )( theta ); +#! [ 0.000396202 ] +#! @EndExample diff --git a/examples/NeuralNetwork_BinaryCrossEntropy/neural_network.g b/examples/NeuralNetwork_BinaryCrossEntropy/neural_network.g index a6d2f1d..1bbea08 100644 --- a/examples/NeuralNetwork_BinaryCrossEntropy/neural_network.g +++ b/examples/NeuralNetwork_BinaryCrossEntropy/neural_network.g @@ -1,6 +1,6 @@ -#! 
@Chapter Examples for neural networks +#! @Chapter Examples -#! @Section Binary-class neural network with binary cross-entropy loss function +#! @Section Binary-Class Neural Network with Binary Cross-Entropy Loss Function LoadPackage( "GradientBasedLearningForCAP" ); diff --git a/examples/NeuralNetwork_CrossEntropy/neural_network.g b/examples/NeuralNetwork_CrossEntropy/neural_network.g index f280f0e..61a3280 100644 --- a/examples/NeuralNetwork_CrossEntropy/neural_network.g +++ b/examples/NeuralNetwork_CrossEntropy/neural_network.g @@ -1,6 +1,6 @@ -#! @Chapter Examples for neural networks +#! @Chapter Examples -#! @Section Multi-class neural network with cross-entropy loss function +#! @Section Multi-Class Neural Network with Cross-Entropy Loss Function LoadPackage( "GradientBasedLearningForCAP" ); diff --git a/examples/NeuralNetwork_QuadraticLoss/neural_network.g b/examples/NeuralNetwork_QuadraticLoss/neural_network.g index f37746a..923cae3 100644 --- a/examples/NeuralNetwork_QuadraticLoss/neural_network.g +++ b/examples/NeuralNetwork_QuadraticLoss/neural_network.g @@ -1,8 +1,8 @@ LoadPackage( "GradientBasedLearningForCAP" ); -#! @Chapter Examples for neural networks +#! @Chapter Examples -#! @Section Neural network with quadratic loss function +#! @Section Neural Network with Quadratic Loss Function #! This example demonstrates how to train a small feed-forward neural network #! for a regression task using the $\texttt{GradientBasedLearningForCAP}$ package. We employ diff --git a/gap/FitParameters.gd b/gap/FitParameters.gd index 903f926..638d1b0 100644 --- a/gap/FitParameters.gd +++ b/gap/FitParameters.gd @@ -73,7 +73,7 @@ #! \] #! @EndLatexOnly #! -#! For example, if we chose the optimizer to be the gradient descent optimizer with learning rate $\eta=0.01$: +#! Suppose we choose the optimizer lens to be the gradient descent optimizer with learning rate $\eta = 0.01 > 0$, #! @BeginLatexOnly #! \[ #! \begin{tikzpicture} @@ -83,13 +83,13 @@ #! 
\node (Ap) at (-3,-1) {$\mathbb{R}^p$}; #! \node (Bp) at ( 3,-1) {$\mathbb{R}^p$}; #! \draw (-1.5,-1.8) rectangle (1.5,1.8); -#! \draw[->] (A) -- node[above] {$\Theta \mapsto f_i(\Theta)$} (B); +#! \draw[->] (A) -- node[above] {$\Theta \mapsto \Theta$} (B); #! \draw[->] (Bp) -- node[midway, below] {$\Theta + \eta g \mapsfrom (\Theta, g)$} (Ap); #! \draw[-] (-1,1) to[out=-90, in=90] (1,-1); #! \end{tikzpicture} #! \] #! @EndLatexOnly -#! The resulting One-Epoch update lens for the example $X_i$ is given by: +#! then the resulting One-Epoch update lens for the example $X_i$ is given by #! @BeginLatexOnly #! \[ #! \begin{tikzpicture} @@ -99,7 +99,7 @@ #! \node (Ap) at (-3,-1) {$\mathbb{R}^p$}; #! \node (Bp) at ( 3,-1) {$\mathbb{R}^0$}; #! \draw (-1.5,-1.8) rectangle (1.5,1.8); -#! \draw[->] (A) -- node[above] {$\Theta \mapsto \Theta$} (B); +#! \draw[->] (A) -- node[above] {$\Theta \mapsto f_i(\Theta)$} (B); #! \draw[->] (Bp) -- node[midway, below] {$\Theta - \eta J_{f_i}(\Theta) \mapsfrom \Theta$} (Ap); #! \draw[-] (-1,1) to[out=-90, in=90] (1,-1); #! \end{tikzpicture} diff --git a/makedoc.g b/makedoc.g index f9a6ca0..ea63831 100644 --- a/makedoc.g +++ b/makedoc.g @@ -16,7 +16,8 @@ AutoDoc( rec( "examples/NeuralNetwork_BinaryCrossEntropy", "examples/NeuralNetwork_CrossEntropy", "examples/NeuralNetwork_QuadraticLoss", - ], + "examples/ComputingTheNextLocalMimima", + ], ), extract_examples := rec( units := "Single", @@ -28,6 +29,9 @@ AutoDoc( rec( \usetikzlibrary{positioning} \usepackage{mathtools} \usepackage{stmaryrd} + \usepackage{fancyvrb} + \usepackage{fvextra} + \fvset{breaklines=true} \DeclareUnicodeCharacter{211D}{\ensuremath{\mathbb{R}}} \DeclareUnicodeCharacter{2023}{\ensuremath{\blacktriangleright}} """,