diff --git a/tex/bibliography.bib b/tex/bibliography.bib index 2ff39bf..be871eb 100644 --- a/tex/bibliography.bib +++ b/tex/bibliography.bib @@ -419,4 +419,68 @@ @book{ascher2008numerical author={Ascher, Uri M}, year={2008}, publisher={SIAM} +} + +@book{ascher2008-numerical-methods, + author = {Ascher, Uri M. and Greif, Chen}, + title = {A First Course in Numerical Methods}, + publisher = {Society for Industrial and Applied Mathematics}, + year = {2011}, + doi = {10.1137/9780898719987}, + address = {Philadelphia, PA}, + edition = {} +} + +@inproceedings{sandu2006properties, + title={On the properties of Runge-Kutta discrete adjoints}, + author={Sandu, Adrian}, + booktitle={Computational Science--ICCS 2006: 6th International Conference, Reading, UK, May 28-31, 2006, Proceedings, Part IV 6}, + pages={550--557}, + year={2006}, + organization={Springer} +} + +@article{sandu2011solution, + title={Solution of inverse problems using discrete ODE adjoints}, + author={Sandu, Adrian}, + journal={Large-Scale Inverse Problems and Quantification of Uncertainty}, + pages={345--365}, + year={2011}, + publisher={Wiley Online Library} +} + +@article{Hager_2000, + title={Runge-Kutta methods in optimal control and the transformed adjoint system}, + volume={87}, + ISSN={0029-599X}, + DOI={10.1007/s002110000178}, + number={2}, + journal={Numerische Mathematik}, + author={Hager, William W.}, + year={2000}, + pages={247–282} +} + +@article{Jensen_Nakshatrala_Tortorelli_2014, + title={On the consistency of adjoint sensitivity analysis for structural optimization of linear dynamic problems}, + volume={49}, + ISSN={1615-147X}, + DOI={10.1007/s00158-013-1024-4}, + number={5}, + journal={Structural and Multidisciplinary Optimization}, + author={Jensen, Jakob S. and Nakshatrala, Praveen B. and Tortorelli, Daniel A.}, + year={2014}, + pages={831–837} +} + +@article{Giering_Kaminski_1998, + title={Recipes for adjoint code construction}, + volume={24}, + ISSN={0098-3500}, + DOI={10.1145/293686.293695}, + number={4}, + journal={ACM Transactions on Mathematical Software (TOMS)}, + author={Giering, Ralf and Kaminski, Thomas}, + year={1998}, + pages={437–474} } \ No newline at end of file diff --git a/tex/main.tex b/tex/main.tex index 04b91fe..212f243 100644 --- a/tex/main.tex +++ b/tex/main.tex @@ -67,15 +67,21 @@ \tableofcontents \clearpage +\section*{Plain language summary} +\input{sections/plain} + \section{Introduction} \input{sections/introduction} -\section{Preliminaries} -\input{sections/preliminaries} +\section{Scientific motivation} +\input{sections/motivation} \section{Methods} \input{sections/methods-intro} +\subsection{Preliminaries} +\input{sections/preliminaries} + \subsection{Finite differences} \input{sections/finite-differences} @@ -85,28 +91,25 @@ \subsection{Complex step differentiation} \subsection{Automatic differentiation} \input{sections/automatic-differentiation} -% \subsection{Symbolic differentiation} -% \input{sections/symbolic} - - -% \subsection{Further remarks} -% \input{sections/further-remarks-1} -% Comparision between these. Are they the same? 
+\subsection{Symbolic differentiation} +\input{sections/symbolic} \subsection{Sensitivity equations} \input{sections/sensitivity-equation} -\section{Adjoint methods} +\subsection{Adjoint methods} \label{section:adjoint-methods} +\input{sections/adjoint-intro} -\subsection{Discrete adjoint method} +\subsubsection{Discrete adjoint method} \input{sections/adjoint-state} -\subsection{Continuous adjoint method} +\subsubsection{Continuous adjoint method} \input{sections/adjoint-continuous} -\section{Comparison between methods} -\input{sections/comparison} +\section{Computational implementation} +\label{sec:computational-implementation} +\input{sections/software} % \section{Do we need full gradients?} @@ -114,8 +117,8 @@ \section{Comparison between methods} % \section{Open science from scratch} -\section{Notation} -\input{sections/notation} +% \section{Notation} +% \input{sections/notation} \newpage \appendix diff --git a/tex/sections/abstract.tex b/tex/sections/abstract.tex index 553576c..436cde4 100644 --- a/tex/sections/abstract.tex +++ b/tex/sections/abstract.tex @@ -3,9 +3,9 @@ % \centering \textbf{To the community, by the community.} \textit{This manuscript was conceived with the goal of shortening the gap between developers and practitioners of differential programming applied to modern scientific machine learning. - With the advent of new tools and new software, it is important to create pedagogical content that allows the broader community to understand and integrate these methods. + With the advent of new tools and new software, it is important to create pedagogical content that allows the broader community to understand and integrate these methods into their workflows. We hope this encourages new people to be an active part of the ecosystem, by using and developing open-source tools. - This work was done under the premise \textbf{open-science from scratch}, meaning all the contents of this work, both code and text, have been in the open from the beginning and that any interested person can jump in and contribute to the project. + This work was done under the premise \textbf{open-science from scratch}, meaning all the contents of this work, both code and text, have been in the open from the beginning and that any interested person can contribute to the project. You can contribute directly to the GitHub repository \url{https://github.com/ODINN-SciML/DiffEqSensitivity-Review} } \end{quote} \ No newline at end of file diff --git a/tex/sections/adjoint-continuous.tex b/tex/sections/adjoint-continuous.tex index d1142e0..1532dab 100644 --- a/tex/sections/adjoint-continuous.tex +++ b/tex/sections/adjoint-continuous.tex @@ -16,7 +16,7 @@ \inttime \left( \frac{\partial h}{\partial \theta} + \frac{\partial h}{\partial u} s(t) \right) dt. \label{eq:casa-loss} \end{equation} -As explained in previous section, the complicated term to evaluate in the last expression is the sensitivity (Equation \eqref{eq:sensitivity-definition}). +Once again, the complicated term to evaluate in the last expression is the sensitivity (Equation \eqref{eq:sensitivity-definition}). Just as in the case of the discrete adjoint method, the trick is to evaluate the VJP $\frac{\partial h}{\partial u} \frac{\partial u}{\partial \theta}$ by defining an intermediate adjoint variable. The continuous adjoint equation now is obtained by finding the dual/adjoint equation of the sensitivity equation using the weak formulation of Equation \eqref{eq:sensitivity_equations}.
The adjoint equation is obtained by writing the sensitivity equation in the form @@ -85,37 +85,3 @@ \item Solve the backwards adjoint differential equation \eqref{eq:casa-adjoint-equation}; \item Compute the gradient using Equation \eqref{eq:casa-final-loss-gradient}. \end{enumerate} -The bottleneck on this method is the calculation of the adjoint, since in order to solve the adjoint equation we need to know $u(t)$ at any given time. -Effectively, notice that the adjoint equation involves the term $f(u, \theta, t)$ and $\frac{\partial h}{\partial u}$ which are both functions of $u(t)$. -There are different ways of addressing the evaluation of $u(t)$ during the backward step: -\begin{enumerate}[label=(\roman*)] - \item Solve for $u(t)$ again backwards. - \item Store $u(t)$ in memory during the forward step. - \item Store reference values in memory and interpolate in between. - This technique is known as \textit{checkpoiting} or windowing that tradeoffs computational running time and storage. - This is implemented in \texttt{Checkpoiting.jl} \cite{Checkpoiting_2023}. -\end{enumerate} -Computing the ODE backwards can be unstable and lead to exponential errors \cite{kim_stiff_2021}. -In \cite{chen_neural_2019}, the solution is recalculated backwards together with the adjoint simulating an augmented dynamics: -\begin{equation} - \frac{d}{dt} - \begin{bmatrix} - u \\ - \lambda \\ - \frac{dL}{d\theta} - \end{bmatrix} - = - \begin{bmatrix} - -f \\ - - \lambda^T \frac{\partial f}{\partial u} \\ - - \lambda^T \frac{\partial f}{\partial \theta} - \end{bmatrix} - = - - [ 1, \lambda^T, \lambda^T ] - \begin{bmatrix} - f & \frac{\partial f}{\partial u} & \frac{\partial f}{\partial \theta} \\ - 0 & 0 & 0 \\ - 0 & 0 & 0 - \end{bmatrix}, -\end{equation} -with initial condition $[u(t_1), \frac{\partial L}{\partial u(t_1)}, 0]$. One way of solving this system of equations that ensures stability is by using implicit methods. However, this requires cubic time in the total number of ordinary differential equations, leading to a total complexity of $\mathcal O((n+p)^3)$ for the adjoint method. Two alternatives are proposed in \cite{kim_stiff_2021}, the first one called \textit{Quadrature Adjoint} produces a high order interpolation of the solution $u(t)$ as we move forward, then solve for $\lambda$ backwards using an implicit solver and finally integrating $\frac{dL}{d\theta}$ in a forward step. This reduces the complexity to $\mathcal O (n^3 + p)$, where the cubic cost in the number of ODEs comes from the fact that we still need to solve the original stiff differential equation in the forward step. A second but similar approach is to use a implicit-explicit (IMEX) solver, where we use the implicit part for the original equation and the explicit for the adjoint. This method also will have complexity $\mathcal O (n^3 + p)$. \ No newline at end of file diff --git a/tex/sections/adjoint-intro.tex b/tex/sections/adjoint-intro.tex new file mode 100644 index 0000000..ab6acb0 --- /dev/null +++ b/tex/sections/adjoint-intro.tex @@ -0,0 +1,11 @@ +The adjoint method is a very popular approach to compute the gradient of a loss function by first computing an intermediate variable (the adjoint) that serves as a bridge between the solution of the ODE and the final sensitivity. +There is a large family of adjoint methods which, to a first approximation, can be classified into discrete and continuous adjoints.
+The former usually arises as the numerical discretization of the latter, and whether the discrete adjoint method is a consistent estimator of the continuous adjoint depends on both the ODE and the numerical scheme. +Proofs of the consistency of discrete adjoint methods for Runge-Kutta methods have been provided in \cite{sandu2006properties, sandu2011solution}. +Depending on the choice of the Runge-Kutta coefficients, we can have a numerical scheme that is consistent for the original equation but either consistent or inconsistent for the adjoint \cite{Hager_2000}. + +An equally important consideration when working with adjoints is whether they are numerically stable. +Some works have shown that continuous adjoints can lead to unstable sensitivities \cite{Jensen_Nakshatrala_Tortorelli_2014}. +Implicit forward schemes can give rise to explicit backwards schemes, leading to unstable solutions for the gradient. +The key computational consideration when working with adjoint methods is how they handle the VJPs involved in the adjoint equations. +This calculation is carried out by another sensitivity method (finite differences, AD), which also plays a central role when analyzing the accuracy and stability of the adjoint method. \ No newline at end of file diff --git a/tex/sections/adjoint-state.tex b/tex/sections/adjoint-state.tex index a0f3fd7..ad255a2 100644 --- a/tex/sections/adjoint-state.tex +++ b/tex/sections/adjoint-state.tex @@ -1,5 +1,5 @@ Also know as the adjoint state method, it is another example of a discrete method that aims to find the gradient by solving an alternative system of linear equations, known as the \textit{adjoint equations}, at the same time that we solve the original system of linear equations defined by the numerical solver. -These methods are extremely popular in optimal control theory in fluid dynamics, for example for the design of geometries for vehicles and airplanes that optimize performance \cite{Elliott_Peraire_1996, Giles_Pierce_2000} +These methods are extremely popular in optimal control theory in fluid dynamics, for example for the design of geometries for vehicles and airplanes that optimize performance \cite{Elliott_Peraire_1996, Giles_Pierce_2000}. This approach follows the discretize-optimize approach, meaning that we first discretize the system of continuous ODEs and then solve on top of these linear equations \cite{Giles_Pierce_2000}. Just as in the case of automatic differentiation, the set of adjoint equations can be solved in both forward and backward mode. @@ -7,7 +7,9 @@ % Mathematically, reverse mode AD is related to the adjoint differential equations \cite{Griewack-on-AD} -\subsubsection{Discrete differential equation} +\vspace*{10px} +\noindent \textbf{\textit{Discrete differential equation}} +\vspace*{5px} % reference: Sensitivity theory of non-linear systems @@ -59,7 +61,9 @@ \subsubsection{Discrete differential equation} It is important to notice that in most cases, the matrix $A(\theta)$ is quite large and mostly sparse. If well this representation of the discrete differential equation is quite convenient for mathematical manipulations, at the moment of solving the system we will rely in iterative solvers that save memory and computation. -\subsubsection{Adjoint state equations} +\vspace*{10px} +\noindent \textbf{\textit{Adjoint state equations}} +\vspace*{5px} We are interested in differentiating a function $L(U, \theta)$ with respect to the parameter $\theta$.
Since here $U$ is the discrete set of evaluations of the solution, examples of loss functions now include @@ -186,12 +190,13 @@ \subsubsection{Adjoint state equations} \end{equation} with initial condition $\lambda_N$. This means that we can solve the adjoint equation in backwards mode, starting from the final state $\lambda_N$ and computing the values of $\lambda_i$ in decreasing index order. +In principle, notice that in order to do this we need to know the value of $u_i$ at any given timestep. % When do we solve this backwards and when forward? %In order to compute the gradient of the full solution of the differential equation, we apply this method sequentially using the chain rule. One single step of the state method can be understood as the chain of operations $\theta \mapsto g \mapsto u \mapsto L$. This allows us to create adjoints for any primitive function $g$ (i.e. the numerical solver scheme) we want, and then incorporated it as a unit of any AD program. -\subsubsection{Further remarks} +% \subsubsection{Further remarks} % Conection with duality % Conection with the adjoint operator diff --git a/tex/sections/automatic-differentiation.tex b/tex/sections/automatic-differentiation.tex index 63acf6f..6b66585 100644 --- a/tex/sections/automatic-differentiation.tex +++ b/tex/sections/automatic-differentiation.tex @@ -14,10 +14,10 @@ \subsubsection{Forward mode} \noindent \textbf{\textit{Dual numbers}} \vspace*{5px} -Let's first consider the case of dual numbers. -The idea is to extend the definition of a variable that takes a certain value to also carry information about the derivative with respect to certain scalar parameter $\theta \in \R$. +Let us first consider the case of dual numbers. +The idea is to extend the definition of a numerical variable that takes a certain value to also carry information about its derivative with respect to a certain scalar parameter $\theta \in \R$. We can define an abstract type, defined as a dual number, composed of two elements: a \textit{value} coordinate $x_1$ that carries the value of the variable and a \textit{derivative} coordinate $x_2$ with the value of the derivative $\frac{\partial x_1}{\partial \theta}$. -Just as complex number do, we can represent dual numbers in the vectorial form $(x_1, x_2)$ or in the rectangular form +Just as with complex numbers, we can represent dual numbers in the vectorial form $(x_1, x_2)$ or in the rectangular form \begin{equation} x_\epsilon = x_1 + \epsilon \, x_2 \end{equation} @@ -51,47 +51,7 @@ \subsubsection{Forward mode} \end{align} When computing first order derivatives, we can ignore everything of order $\epsilon^2$ or larger, which is represented in the condition $\epsilon^2 = 0$. This implies that we can use dual numbers to implement forward AD through a numerical algorithm. -Implementing such a type in a programming language implies defining what it means to perform basic operations and then combine them using the chain rule provided by the dual number properties.
-In Julia we can create directly create a dual number by defining an object with two values, and then extend the definition of algebraic operations and functions, a process known as \textit{operator overloading} \cite{Neuenhofen_2018}, by relying in multiple dispatch -\begin{jllisting} -using Base: @kwdef - -@kwdef struct DualNumber{F <: AbstractFloat} - value::F - derivative::F -end - -# Binary sum -Base.:(+)(a::DualNumber, b::DualNumber) = DualNumber(value = a.value + b.value, derivative = a.derivative + b.derivative) - -# Binary product -Base.:(*)(a::DualNumber, b::DualNumber) = DualNumber(value = a.value * b.value, derivative = a.value*b.derivative + a.derivative*b.value) -\end{jllisting} -and then we can simply evaluate derivatives by evaluating the derivative component of the dual number that results from the combination of operations -\begin{jllisting} -a = DualNumber(value=1.0, derivative=1.0) - -b = DualNumber(value=2.0, derivative=0.0) -c = DualNumber(value=3.0, derivative=0.0) - -result = a * b * c -println("The derivative of a*b*c with respect to a is: ", result.derivative) -\end{jllisting} -Notice that in this last example the dual numbers \texttt{b} and \texttt{c} where initialized with derivative value equals to zero, while \texttt{a} with value equals to one. -This is because we were interested in computing the derivative with respect to \texttt{a}, and then $\frac{\partial a}{\partial a} = 1$, while $\frac{\partial b}{\partial a} = \frac{\partial c}{\partial a} = 0$. - -We can also extend the definition of standard functions by simply applying the chain rule and storing the derivative in the dual variable following Equation \eqref{eq:dual-number-function}: -\begin{jllisting} -function Base.:(sin)(a::DualNumber) - value = sin(a.value) - derivative = a.derivative * cos(a.value) - return DualNumber(value=value, derivative=derivative) -end -\end{jllisting} -With all these pieces together, we are able to propagate forward the value of a single-valued derivative though a series of algebraic operations. - -In the Julia ecosystem, \texttt{ForwardDiff.jl} implements forward mode AD with multidimensional dual numbers \cite{RevelsLubinPapamarkou2016}. -Notice that a major limitation of the dual number approach is that we need a dual variable for each variable we want to differentiate. +We will explore how this is carried in Section \ref{sec:computational-implementation}. \vspace*{10px} \noindent \textbf{\textit{Computational graph}} @@ -106,33 +66,34 @@ \subsubsection{Forward mode} \begin{equation} \frac{\partial v_j}{\partial v_i} = - \sum_{\substack{ \text{paths }u_0 \rightarrow u_1 \rightarrow \ldots \rightarrow u_m \\ - \text{with } u_0=v_i, u_m = v_j}} - \prod_{k=0}^{m-1} \frac{\partial u_{k+1}}{\partial u_{k}}, + \sum_{\substack{ \text{paths }w_0 \rightarrow w_1 \rightarrow \ldots \rightarrow w_K \\ + \text{with } w_0=v_i, w_K = v_j}} + \prod_{k=0}^{K-1} \frac{\partial w_{k+1}}{\partial w_{k}}, \end{equation} where the sum is calculated with respect to all the directed paths in the graph connecting the input and target node. 
Instead of evaluating the last expression for all possible path, a simplification is to increasingly evaluate $j=p+1, \ldots, m$ using the recursion \begin{equation} \frac{\partial v_j}{\partial v_i} = - \sum_\text{$u$\text{ such that} $u \rightarrow v_j$} - \frac{\partial v_j}{\partial u} - \frac{\partial u}{\partial v_i} + \sum_{w \text{ such that } w \rightarrow v_j} + \frac{\partial v_j}{\partial w} + \frac{\partial w}{\partial v_i} \label{eq:AD-graph-recursion} \end{equation} -Since every variable node $u$ such that $u \rightarrow v_j$ is an edge of the computational graph have index less than $j$, we can iterate this procedure as we run the computer program and solve for both the function and its gradient: the term $\frac{\partial u}{\partial v_i}$ has been computed in a previous iteration, while $\frac{\partial v_j}{\partial u}$ can be evaluated at the same time the node $v_j$ is computed based on only the value of the parent variable nodes. - -Both implementations of forward AD require a number of operations that increases with the number of variables to differentiate, since each computed quantity is accompanied by the corresponding gradient calculations \cite{Griewack-on-AD}. +Since every variable node $w$ such that $w \rightarrow v_j$ is an edge of the computational graph has an index smaller than $j$, we can iterate this procedure as we run the computer program and solve for both the function and its gradient. +This is possible because in forward mode the term $\frac{\partial w}{\partial v_i}$ has been computed in a previous iteration, while $\frac{\partial v_j}{\partial w}$ can be evaluated at the same time the node $v_j$ is computed, based only on the values of the parent variable nodes. \subsubsection{Backward mode} Backward mode AD is also known as the adjoint of cotangent linear mode, or backpropagation in the field of machine learning. The reverse mode of automatic differentiation has been introduced in different contexts \cite{griewank2012invented} and materializes the observation made by Phil Wolfe that if the chain rule is implemented in reverse mode, then the ratio between the computation of the gradient of a function and the function itself can be bounded by a constant that do not depend of the number of parameters to differentiate \cite{Griewack-on-AD, Wolfe_1982}, a point known as \textit{cheap gradient principle} \cite{griewank2012invented}. - Given a directional graph of operations defined by a Wengert list \cite{Wengert_1964}, we can compute gradients of any given function in the same fashion as Equation \eqref{eq:AD-graph-recursion} but in backwards mode as \begin{equation} - \bar v = \frac{\partial \ell}{\partial v_i}= \sum_{w : v \rightarrow w \in G} \frac{\partial w}{\partial v} \bar{w}. + \bar v_i = \frac{\partial \ell}{\partial v_i}= \sum_{w : v \rightarrow w \in G} \frac{\partial w}{\partial v} \bar{w}. \end{equation} +Here we have introduced the notation $\bar{w} = \frac{\partial \ell}{\partial w}$ to indicate that the partial derivative is always of the final loss function with respect to the different program variables. +Since in backwards AD the values of $\bar w$ are updated in reverse order, evaluating the terms $\frac{\partial w}{\partial v}$ requires knowing the state value of all the argument variables $v$ of $w$, which therefore need to be stored in memory during the forward evaluation of the function.
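+As a minimal sketch of this reverse accumulation (the toy program and variable names below are ours, chosen only for illustration), consider the computation $\ell = \sin(x y)$: the forward pass stores the intermediate value, and the backward pass reuses it to propagate the adjoints $\bar v$ in reverse order.
+\begin{jllisting}
+# Program: v1 = x, v2 = y, v3 = v1 * v2, l = sin(v3)
+x, y = 2.0, 3.0
+
+# Forward pass: evaluate the function and store the intermediate values.
+v3 = x * y
+l  = sin(v3)
+
+# Backward pass: propagate the adjoints v_bar = dl/dv in reverse order,
+# reusing the values stored during the forward pass (here v3, x and y).
+v3_bar = cos(v3)       # dl/dv3
+x_bar  = v3_bar * y    # dl/dx = dl/dv3 * dv3/dx
+y_bar  = v3_bar * x    # dl/dy = dl/dv3 * dv3/dy
+\end{jllisting}
+Notice that the backward pass cannot start until the forward pass has finished, and that it needs the stored value of the intermediate variable, which is precisely the memory requirement mentioned above.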
+ Another way of implementing backwards AD is by defining a \textit{pullback} function \cite{Innes_2018}, a method also known as \textit{continuation-passing style} \cite{Wang_Zheng_Decker_Wu_Essertel_Rompf_2019}. In the backward step, this executes a series of functions calls, one for each elementary operation. If one of the nodes in the graph $w$ is the output of an operation involving the nodes $v_1, \ldots, v_m$, where $v_i \rightarrow w$ are all nodes in the graph, then the pullback $\bar v_1, \ldots, \bar v_m = \mathcal B_w(\bar w)$ is a function that accepts gradients with respect to $w$ (defined as $\bar w$) and returns gradients with respect to each $v_i$ ($\bar v_i$) by applying the chain rule. @@ -146,18 +107,14 @@ \subsubsection{Backward mode} \frac{\partial \ell}{\partial v_1} = \frac{\partial}{\partial v_1}(v_1 \times v_2) \frac{\partial \ell}{\partial w}. \end{equation} -The libraries \texttt{ReverseDiff.jl} and \texttt{Zygote.jl} use callbacks to compute gradients. When gradients are being computed with less than $\sim 100$ parameters, the former is faster (see documentation). - -% However, this improve comes at the cost of more storage, since in order to compute gradient in reverse order one needs to evalaute intermediate variables \subsubsection{AD connection with JVPs and VJPs} -When working with unit operations that involve matrix operations dealing with vectors of different dimensions, the order in which we apply the chain rule matters. +When working with unit operations that involve matrix operations dealing with vectors of different dimensions, the order in which we apply the chain rule matters \cite{Giering_Kaminski_1998}. When computing a gradient using AD, we can encounter vector-Jacobian products (VJPs) or Jacobian-vector products (JVP). -As their name indicate, the difference between them regards the fact if the quantity we are interested in computing is described by the product of a Jacobian (the two dimensional matrix with the gradients as rows) by a vector on the left -side (VJP) or the right (JVP). +As their names indicate, the difference between them regards whether the quantity we are interested in computing is described by the product of a Jacobian by a vector on the left side (VJP) or on the right (JVP). -For the examples we care here, the Jacobian is described as the product of multiple Jacobian using the chain rule. +For nested functions, the Jacobian is described as the product of multiple Jacobians using the chain rule. In this case, the full gradient is computed as the chain product of vectors and Jacobians. Let us consider for example the case of a loss function $L : \mathbb R^p \mapsto \mathbb R$ that can be decomposed as $L(\theta) = \ell \circ g_{k} \circ \ldots \circ g_2 \circ g_1(\theta)$, with $\ell : \mathbb R^{d_k} \mapsto \mathbb R$ the final evaluation of the loss function after we apply in order a sequence of intermediate functions $g_i : \mathbb R^{d_{i-1}} \mapsto \mathbb R^{d_i}$, $d_0 = p$. Now, using the chain rule, we can calculate the gradient of the final loss function as \begin{equation} \nabla_\theta L = \nabla \ell \cdot Dg_{k} \cdot Dg_{k-1} \cdot \ldots \cdot Dg_2 \cdot Dg_1, \end{equation} Notice that in the last equation, $\nabla \ell \in \mathbb R^{d_k}$ is a vector. In order to compute $\nabla_\theta L$, we can solve the multiplication starting from the right side, which will correspond to multiple the Jacobians forward from $Dg_1$ to $Dg_k$, or from the left side, moving backwards.
The important aspect of this last case is that we will always been computing VJPs, since $\nabla \ell \in \mathbb R^{d_k}$ is a vector. -Since VJP are easier to evaluate than full Jacobians, the backward mode will be in general faster (see Figure \ref{fig:vjp-jvp}). -For general rectangular matrices $A\in \mathbb R^{d_1 \times d_2}$ and $B \in \mathbb R^{d_2 \times d_3}$, the cost of the matrix multiplication $AB$ is $\mathcal O (d_1 d_2 d_3)$ (more efficient algorithms exist but this does not impact these results ). This implies that forward AD requires a total of +Since VJPs are easier to evaluate than full Jacobians, the backward mode will be in general faster (see Figure \ref{fig:vjp-jvp}). +For general rectangular matrices $A\in \mathbb R^{d_1 \times d_2}$ and $B \in \mathbb R^{d_2 \times d_3}$, the cost of the matrix multiplication $AB$ is $\mathcal O (d_1 d_2 d_3)$ (more efficient algorithms exist but this does not impact these results). This implies that forward AD requires a total of \begin{equation} d_2 d_1 n + d_3 d_2 p + \ldots + d_k d_{k-1} p + d_k p = \mathcal O (p) \end{equation} @@ -188,6 +145,6 @@ \subsubsection{AD connection with JVPs and VJPs} \begin{figure} \centering \includegraphics[width=0.95\textwidth]{figures/VJP-JVP.png} - \caption{Comparison between forward and backward AD. Changing the order how we multiply the Jacobians change the total number of floating-point operations, which leads to different computational complexities between forward and backward mode.} + \caption{Comparison between forward and backward AD. Changing the order in which we multiply the Jacobians changes the total number of floating-point operations, which leads to different computational complexities between forward and backward mode. However, backward mode requires storing in memory information about the forward execution of the program, while forward mode can update the gradient at run time.} \label{fig:vjp-jvp} \end{figure} diff --git a/tex/sections/comparison.tex b/tex/sections/comparison.tex deleted file mode 100644 index de8f3c1..0000000 --- a/tex/sections/comparison.tex +++ /dev/null @@ -1,2 +0,0 @@ -% Giles (2000) has a good discussion on this. -% \cite{ma_comparison_2021}. diff --git a/tex/sections/complex-step.tex b/tex/sections/complex-step.tex index 57d92ef..c8145a5 100644 --- a/tex/sections/complex-step.tex +++ b/tex/sections/complex-step.tex @@ -1,10 +1,10 @@ An alternative to finite differences that avoids rounding errors is based complex variable analysis. -The first proposals originated in 1967 using the Cauchy integral theorem involving the numerical evaluation of a complex valued integral \cite{Lyness_1967, Lyness_Moler_1967} . +The first proposals originated in 1967 using the Cauchy integral theorem involving the numerical evaluation of a complex valued integral \cite{Lyness_1967, Lyness_Moler_1967}. A new approach recently emerged that uses the Taylor expansion of a function to define its complex generalization \cite{Squire_Trapp_1998_complex_diff, Martins_Sturdza_Alonso_2003_complex_differentiation}. Assuming that we have one single scalar parameter $\theta \in \R$, then the function $L(\theta)$ can be expanded as the Taylor expansion \begin{equation} - L(\theta + i \, \varepsilon) + L(\theta + i \varepsilon) = L(\theta) + i \varepsilon L'(\theta) - @@ -13,7 +13,7 @@ + \mathcal O (\varepsilon^3), \end{equation} -where $i$ is the imaginary unit with the property $i^2 = -1$. +where $i$ is the imaginary unit satisfying $i^2 = -1$.
From this equation we can observed that many factors vanish when we compute the imaginary part $\text{Im}(L(\theta + i \varepsilon))$, which leads to \begin{equation} L'(\theta) diff --git a/tex/sections/finite-differences.tex b/tex/sections/finite-differences.tex index 5625d03..7403295 100644 --- a/tex/sections/finite-differences.tex +++ b/tex/sections/finite-differences.tex @@ -11,20 +11,23 @@ \label{eq:finite_diff2} \end{equation} leads also to more precise estimation of the derivative. -While Equation \eqref{eq:finite_diff} gives to an error of magnitude $\mathcal O (\varepsilon)$, the centered differences schemes improves to $\mathcal O (\varepsilon^2)$. +While Equation \eqref{eq:finite_diff} gives an error of magnitude $\mathcal O (\varepsilon)$, the centered differences scheme improves this to $\mathcal O (\varepsilon^2)$ \cite{ascher2008-numerical-methods}. However, there are a series of problems associated to this approach. The first one is due to how this scales with the number of parameters $p$. Each directional derivative requires the evaluation of the loss function $L$ twice. For the centered differences approach in Equation \eqref{eq:finite_diff2}, this requires a total of $2p$ function evaluations, which at the same time demands to solve the differential equation in forward mode each time for a new set of parameters. -A second problem is due to rounding errors due to the fact that every computer ultimately stores and manipulate numbers using floating points arithmetic \cite{Goldberg_1991_floatingpoint}. +A second problem is due to rounding errors. +Every computer ultimately stores and manipulates numbers using floating-point arithmetic \cite{Goldberg_1991_floatingpoint}. Equations \eqref{eq:finite_diff} and \eqref{eq:finite_diff2} involve the subtraction of two numbers that are very close to each other, which leads to large cancellation errors for small values of $\varepsilon$ than are amplified by the division by $\varepsilon$. On the other hand, large values of the stepsize give inaccurate estimations of the gradient. Finding the optimal value of $\varepsilon$ that trade-offs these two effects is sometimes called the \textit{stepsize dilemma} \cite{mathur2012stepsize-finitediff}. Due to this, some heuristics and algorithms had been introduced in order to pick the value of $\varepsilon$ that will minimize the error \cite{BARTON_1992_finite_diff, mathur2012stepsize-finitediff}, but all these methods require some a priori knowledge about the function to be differentiated. If well many analytical functions, like polynomials and trigonometric functions, can be computed with machine precision, numerical solutions of differential equations have errors larger than machine precision, which leads to inaccurate estimations of the gradient when $\varepsilon$ is too small. -This is illustrated in Figure \ref{fig:finite-diff}, where the error in computing the gradient is shown for both true analytical solution and numerical solution of a system of ODEs as a function of the stepsize. +We will further emphasize this point in Section \ref{sec:computational-implementation}. + + % Condition error %specifically picking $\varepsilon^* = \sqrt{\varepsilon_\text{machine}} \| \theta \|$, with $\varepsilon_\text{machine}$ the machine precision (e.g. \texttt{Float64}). @@ -40,7 +43,7 @@ % which depending on the output size of the function $L(\theta)$, could be a reasonable idea.
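+As a minimal sketch of the centered scheme in Equation \eqref{eq:finite_diff2} (the toy loss function and the choice of stepsize below are ours, for illustration only), each coordinate of the gradient is approximated with one pair of loss evaluations:
+\begin{jllisting}
+# Centered finite differences: 2p evaluations of L for p parameters.
+L(θ) = sum(abs2, θ)                     # toy loss function
+
+function centered_gradient(L, θ; ε = sqrt(eps(eltype(θ))))
+    g = similar(θ)
+    for i in eachindex(θ)
+        e = zero(θ)
+        e[i] = ε                        # perturbation along the i-th coordinate
+        g[i] = (L(θ .+ e) - L(θ .- e)) / (2ε)
+    end
+    return g
+end
+
+centered_gradient(L, [1.0, 2.0, 3.0])   # ≈ [2.0, 4.0, 6.0]
+\end{jllisting}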
Replacing derivatives by finite differences is also a common practice when solving partial differential equations (PDEs), a technique known as the \textit{method of lines} \cite{ascher2008numerical}. -To illustrate this point, let's consider the case of the one-dimensional heat equation +To illustrate this point, let us consider the case of the one-dimensional heat equation \begin{equation} \frac{\partial u}{\partial t} = @@ -65,9 +68,3 @@ This can be solved directly using an ODE solver. Further improvements can be made by exploiting the fact that the coupling between the different functions $u_m$ is sparse, that is, the temporal derivative of $u_m$ just depends of the values of the function in the neighbour points in the grid. -\begin{figure}[tbh] - \centering - \includegraphics[width=0.85\textwidth]{../code/finite_differences/finite_difference_derivative.pdf} - \caption{Absolute relative error when computing the gradient of the function $u(t) = \sin (\omega t)/\omega$ with respect to $\omega$ at $t=10.0$ as a function of the stepsize $\varepsilon$. Here $u(t)$ corresponds to the solution of the differential equation $u'' + \omega^2 u = 0$ with initial condition $u(0)=0$ and $u'(0)=1$. The blue dots correspond to the case where this is computed finite differences. The red and orange lines are for the case where $u(t)$ is numerically computed using the default Tsitouras solver \cite{Tsitouras_2011} from \texttt{OrdinaryDiffEq.jl} using different tolerances. The error when using a numerical solver are larger and these are also affercted by the precision of the numerical solver. } - \label{fig:finite-diff} -\end{figure} \ No newline at end of file diff --git a/tex/sections/further-remarks-1.tex b/tex/sections/further-remarks-1.tex deleted file mode 100644 index f77e9ad..0000000 --- a/tex/sections/further-remarks-1.tex +++ /dev/null @@ -1,29 +0,0 @@ -%Finite differences is less accurate and as costly as forward AD \cite{Griewack-on-AD}. - -Sometimes AD is compared against symbolic differentiation. -According to \cite{Laue2020}, these two are the same and the only difference is in the data structures used to implement them, while \cite{Elliott_2018} suggests that AD is symbolic differentiation performed by a compiler. - -Another important point of comparison is the one existing between forward mode AD implemented with dual numbers and complex step differentiation. -Both methods introduce an abstract unit ($\epsilon$ and $i$, respectively) associated to the imaginary part of the extender value that carries forward the numerical value of the gradient. -This resemblance between the methods makes them to be susceptible to the same advantages and disadvantages: easiness to implement with operator overloading; inefficient scaling with respect to the number of variables to differentiate. -However, although these methods seems very similar, it is important to remark that AD gives the exact gradient, while complex step differentiation relies in numerical approximations that are valid just when the stepsize $\varepsilon$ is small. 
-The next example shows how the calculation of the gradient of $\sin (x^2)$ is performed by these two methods: -\begin{equation} -\renewcommand{\arraystretch}{1.5} -\begin{tabular}{@{} l @{\qquad} l @{\qquad} l @{}} -Operation & AD with Dual Numbers & Complex Step Differentiation \\ -$x$ & $x + \epsilon$ & $x + i \varepsilon$ \\ -$x^2$ & $x^2 + \epsilon \, (2x)$ & $x^2 - \varepsilon^2 + 2i\varepsilon x$\\ -$\sin(x^2)$ & $\sin(x^2) + \epsilon \, \cos(x^2) (2x)$ & -$\sin(x^2 - \varepsilon^2) \cosh (2i\varepsilon) + i \, \cos(x^2 - \varepsilon^2) \sinh (2i\varepsilon)$ -\end{tabular} -\label{eq:AD-complex-comparision} -\end{equation} -While the second component of the dual number has the exact derivative of $\sin(x^2)$, it is not until we take $\varepsilon \rightarrow 0$ than we obtain the derivative in the imaginary component for the complex step method -\begin{equation} - \lim_{\varepsilon \rightarrow 0} \, \frac{1}{\varepsilon} \, \cos(x^2 - \varepsilon^2) \sinh (2i\varepsilon) - = - \, \cos(x^2) (2x). -\end{equation} -The stepsize dependence of the complex step differentiation method makes it resemble more to finite differences than AD with dual numbers. -This difference between the methods also makes the complex step method sometimes more efficient than both finite differences and AD \cite{Lantoine_Russell_Dargent_2012}, an effect that can be counterbalanced by the number of extra unnecessary operation that complex arithmetic requires (see last column of \eqref{eq:AD-complex-comparision}) \cite{Martins_Sturdza_Alonso_2003_complex_differentiation}. \ No newline at end of file diff --git a/tex/sections/introduction.tex b/tex/sections/introduction.tex index 229db6e..dde58c4 100644 --- a/tex/sections/introduction.tex +++ b/tex/sections/introduction.tex @@ -3,7 +3,7 @@ % General statement: why gradients are important? -Evaluating how the value of a function changes with respect to its arguments/parameters plays a central role in optimization, sensitivity analysis, Bayesian inference, uncertainty quantification, among many. +Evaluating how the value of a function changes with respect to its arguments and parameters plays a central role in optimization, sensitivity analysis, Bayesian inference, uncertainty quantification, among many. Modern machine learning applications require the use of gradients to explore and exploit more efficiently the space of parameters. When optimizing a loss function, gradient-based methods (for example, gradient descent and its many variants \cite{ruder2016overview-gradient-descent}) are more efficient at finding a minimum and converge faster to them than than gradient-free methods. When numerically computing the posterior of a probabilistic model, gradient-based sampling strategies converge faster to the posterior distribution than gradient-free methods. @@ -11,10 +11,16 @@ \textit{A gradient serves as a compass in modern data science: it tells us in which direction in the open wide ocean of parameters we should move towards in order to increase our chances of success}. % Differential Programming -Dynamical systems, where the goal is to model observations governed by differential equations, is not an exception to the rule. +Dynamical systems governed by differential equations are not an exception to the rule. Differential equations play a central role in describing the behaviour of systems in natural and social sciences. 
-In order to bridge the gap between modern machine learning methods and traditional scientific methods, some authors have recently suggested that differentiable programming is the solution to this problem \cite{Ramsundar_Krishnamurthy_Viswanathan_2021, Shen_diff_modelling}. +In order to bridge the gap between modern machine learning methods and traditional scientific methods, some authors have recently suggested differentiable programming as the solution to this problem \cite{Ramsundar_Krishnamurthy_Viswanathan_2021, Shen_diff_modelling}. Being able to compute gradients and sensitivities of dynamical systems opens the door to more complex models. +This is very appealing for geophysical models, where there is a broad literature on physical models and a long tradition in numerical methods. +The first goal of this work is to introduce some of the applications of this emerging technology and to motivate its incorporation in the modelling of complex systems in the natural and social sciences. +\begin{quote} + \textbf{Question 1. } + \textit{What are the scientific applications of differential programming for complex dynamical systems?} +\end{quote} % Some examples There are different approaches to calculate the gradient of dynamical systems depending on the traditions of each field. @@ -23,6 +29,7 @@ This is particularly useful in optimal control theory \cite{Giles_Pierce_2000}, where the goal is to find the optimal value of some control (e.g. the shape of a wing) that minimizes a given loss function. In recent years, there has been an increasing interest in designing machine learning workflows that include constraints in the form of differential equations. Examples of this include Physics-Informed Neural Networks (PINNs) \cite{PINNs_2019} and Universal Differential Equations (UDEs) \cite{rackauckas2020universal}. + % soft / hard constrains % Differentiation @@ -30,13 +37,13 @@ Except for a small set of particular cases, most differential equations require numerical methods to calculate their solution and cannot be differentiated analytically. This means that solutions cannot be directly differentiated and require special treatment if, besides the numerical solution, we also want to compute first or second order derivatives. Furthermore, numerical solutions introduce approximation errors. -These errors can be propagated and even amplified during the computation of the gradient. +These errors can be propagated and amplified during the computation of the gradient. Alternatively, there is a broad literature in numerical methods for solving differential equations. Although each method provides different guarantees and advantages depending on the use case, this means that the tools developed to compute gradients when using a solver need to be universal enough in order to be applied to all or at least to a large set of them. -The first goal of this article is making a review of the different methods that exists to achieve this goal. +The second goal of this article is to review the different methods that exist to achieve this goal. \begin{quote} - \textbf{Question 1. } + \textbf{Question 2. } \textit{How can I compute the gradient of a function that depends on the numerical solution of a differential equation?} \end{quote} %Notice here the phrase \textit{the gradient of a function that depends}, emphasizing the fact that in many cases we may be interested in computing the gradient of a function that depends on the solution of the differential equation.
@@ % AD The broader set of tools known as Automatic Differentiation (AD) aims at computing derivatives by sequentially applying the chain rule to the sequence of unit operations that constitute a computer program. -The premise is simple: every computer program, including a numerical solver, is ultimately an algorithm described by a chain of simple algebraic operations (addition, multiplication) that are i) easy to differentiate and ii) their combination is easy to differentiate by using the chain rule. +The premise is simple: every computer program, including a numerical solver, is ultimately an algorithm described by a chain of simple algebraic operations (addition, multiplication) that are easy to differentiate and whose combination is easy to differentiate by using the chain rule. Although many modern differentiation tools use AD to some extend, there is also a family of methods that compute the gradient by relying on a auxiliary set of differential equations. We are going to refer to this family of methods as \textit{continuous}, and we will dedicate them a special treatment in future sections to distinguish them from the discrete algorithms that resemble more to pure AD. The differences between methods arise both from their mathematical formulation and their computational implementations. The first provides different guarantees on the method returning the actual gradient or a good approximation of it. -The second involves how the theory is translated to software, and what are the data structures and algorithms used to implement it. +The second involves how the theory is translated into software, and which data structures and algorithms are used to implement it. Different methods have different computational complexities depending on the number of parameters and differential equations, and these complexities are also balanced between total execution time and required memory. -The second goal of this work is then to illustrate the different strengths and weaknesses of these methods, and how to use them in modern scientific software. +The third goal of this work is then to illustrate the different strengths and weaknesses of these methods, and how to use them in modern scientific software. \begin{quote} - \textbf{Question 2. } + \textbf{Question 3. } \textit{What are the advantages and disadvantages of different differentiation methods and how can I incorporate them in my research?} \end{quote} Despite the fact that these methods can be (in principle) implemented in different programming languages, here we decided to use the Julia programming language for the different examples. Julia is a recently new but mature programming language that has already a large tradition in implementing packages aiming to advance differential programming \cite{Julialang_2017}. % The need to introduce all this methods in a common framework -Without aiming at making an extensive and specialized review of the field, we bielieve such a study will be useful to other researchers working on problems that combine optimization and sensitivity analysis with differential equations. +Without aiming at making an extensive and specialized review of the field, we believe this study will be useful to other researchers working on problems that combine optimization and sensitivity analysis with differential equations. Differential programming is opening new ways of doing research across sciences. As we make progress in the use of these tools, new methodological questions start to emerge.
How do these methods compare? How can their been improved? We also hope this paper serves as a gateway to new questions regarding advances in these methods. -\begin{quote} - \textbf{Question 3. } - \textit{Are there opportunities for developing new sensitivity methods?} -\end{quote} - - -%\subsection{Use cases} \ No newline at end of file +% \begin{quote} +% \textbf{Question 3. } +% \textit{Are there opportunities for developing new sensitivity methods?} +% \end{quote} \ No newline at end of file diff --git a/tex/sections/methods-intro.tex b/tex/sections/methods-intro.tex index 5df8ed5..87f6342 100644 --- a/tex/sections/methods-intro.tex +++ b/tex/sections/methods-intro.tex @@ -4,11 +4,11 @@ \caption{Schematic representation of the different methods available for differentiation involving differential equation solutions. These can be classified depending if they find the gradient by solving a new system of differential equations (\textit{continuous}) or if instead they manipulate unit algebraic operations (\textit{discrete}). Furthermore, depending if these methods run in the same direction than the numerical solver, we are going to be talking about \textit{backward} and \textit{forward} methods.} \label{fig:diff} \end{figure} -Depending on the number of parameters and the complexity of the differential equation we are trying to solve, there are different methods to compute gradients with different numerical and computational advantages and that also scale differently depending of the number of differential equations $n$ and number of parameters $p$. +Depending on the number of parameters and the complexity of the differential equation we are trying to solve, there are different methods to compute gradients with different numerical and computational advantages. These methods can be roughly classified as: \begin{itemize} - \item \textit{Discrete} vs \textit{continuous} methods. - \item \textit{Forward} vs \textit{backwards} methods. + \item \textit{Discrete} vs \textit{continuous} methods + \item \textit{Forward} vs \textit{backwards} methods \end{itemize} The first difference regards the fact that the method for computing the gradient can be either based on the manipulation of atomic operations that are easy to differentiate using the chain rule several times (discrete), in opposition to the approach of approximating the gradient as the numerical solution of a new set of differential equations (continuous). Another way of conceptualizing this difference is by comparing them with the discretize-optimize and optimize-discretize approaches \cite{bradley2013pde, Onken_Ruthotto_2020}. @@ -17,4 +17,6 @@ The second distinction is related to the fact that some methods compute gradients by resolving a new sequential problem that may move in the same direction of the original numerical solver - i.e. moving forward in time - or, instead, they solve a new system that goes backwards in time. Figure \ref{fig:diff} displays a classification of some methods under this two-fold classification. In the following section we are going to explore more in detail these methods. -% It is important to remark that all the \ No newline at end of file +It is important to remark that even though all the methods we explore in this section are mathematically correct, \textit{that does not imply they are numerically stable}. +This statement applies to methods based on pure automatic differentiation as well as to adjoint methods.
+We are going to explore this consideration in more detail in Section \ref{sec:computational-implementation}. \ No newline at end of file diff --git a/tex/sections/plain.tex b/tex/sections/plain.tex new file mode 100644 index 0000000..701df75 --- /dev/null +++ b/tex/sections/plain.tex @@ -0,0 +1 @@ +Plain language summary \ No newline at end of file diff --git a/tex/sections/preliminaries.tex b/tex/sections/preliminaries.tex index dac5941..fae259c 100644 --- a/tex/sections/preliminaries.tex +++ b/tex/sections/preliminaries.tex @@ -3,34 +3,35 @@ \frac{du}{dt} = f(u, \theta, t), \label{eq:original_ODE} \end{equation} -where $u \in \mathbb{R}^n$ is the unknown solution; $\theta \in \mathbb R^p$ is a parameter vector; and with initial condition $u(t_0) = u_0$. +where $u \in \mathbb{R}^n$ is the unknown solution; $f$ is a function that depends on the state $u$, some parameter $\theta \in \mathbb R^p$, and potentially time $t$; and with initial condition $u(t_0) = u_0$. Here $n$ denotes the total number of ODEs and $p$ the size of a parameter embedded in the functional form of the differential equation. Although here we consider the case of ODEs, that is, when the derivatives are just with respect to the time variable $t$, this also includes the case of partial differential equations (PDEs). Furthermore, the fact that both $u$ and $\theta$ are one-dimensional vectors does not prevent the use of higher-dimension objects (e.g. when $u$ is a matrix or a tensor). We are interested in computing the gradient of a given function $L(u(\cdot, \theta))$ with respect to the parameter $\theta$. -Here we are using the letter $L$ to emphasize that in many cases this will be a loss function, but without loss of generality this include a broader class of functions: +Here we are using the letter $L$ to emphasize that in many cases this will be a loss function, but without loss of generality this includes a broader class of functions. \begin{itemize} - \item \textbf{Loss functions}. This is usually a real valued function that quantifies the prediction power of a given model. Examples of loss functions include + \item \textbf{Loss functions}. This is usually a real valued function that quantifies the accuracy or prediction power of a given model. Examples of loss functions include the squared error \begin{equation} L(u(\cdot, \theta)) = \frac{1}{2} \| u(t_1, \theta) - u^{\text{target}(t_1)} \|_2^2, \label{eq:quadratic-loss-function} \end{equation} where $u^{\text{target}(t_1)}$ is the desired target observation at some later time $t_1$; and \begin{equation} - L(u(\cdot, \theta)) = \frac{1}{2} \int_{t_0}^{t_1} h( u(t;\theta), \theta) ) dt, + L(u(\cdot, \theta)) = \int_{t_0}^{t_1} h( u(t;\theta), \theta) \, dt, \end{equation} - with $h$ being a function that quantifies the contribution of the error term at time $t \in [t_0, t_1]$. This also includes the case of likelihood functions. + with $h$ being a function that quantifies the contribution of the error term at every time $t \in [t_0, t_1]$. + \item \textbf{Likelihood profiles.} This also includes the case of likelihood functions used for statistical inference. \item \textbf{Summary of the solution.} Another important example is when $L$ returns the value of the solution at one or many points, which is useful when we want to know how the solution itself changes as we move the parameter values. - \item \textbf{Diagnosis of the solution.} In many cases we are interested in maximizing/minimizing the value of some interest quantity that is a function of the solution of a differential equation.
This is particularly the case in design control theory, a very popular approach in aerodynamics modelling where the goal could be to maximize the speed of an airplane given the solution of the flow equation for a given geometry profile \cite{Jameson_1988}. + \item \textbf{Diagnosis of the solution.} In many cases we are interested in optimizing the value of some quantity of interest that is a function of the solution of a differential equation. This is the case in design control theory, a popular approach in aerodynamics modelling where goals include maximizing the speed of an airplane given the solution of the flow equation for a given geometry profile \cite{Jameson_1988}. \end{itemize} We are interested in computing the gradient of the loss function with respect to the parameter $\theta$, which can be written using the chain rule as -\begin{equation} +\begin{equation} \frac{dL}{d\theta} = \frac{dL}{du} \frac{\partial u}{\partial \theta}. \label{eq:dLdtheta_VJP} \end{equation} -The first term on the right hand side is usually easy to evaluate, since it just involves the partial derivative of the scalar loss function to the solution. +The first term on the right hand side is usually easy to evaluate, since it just involves the partial derivative of the scalar loss function with respect to the solution. For example, for the loss function in Equation \eqref{eq:quadratic-loss-function} this is simply \begin{equation} \frac{dL}{du} = u - u^{\text{target}(t_1)}. @@ -53,8 +54,8 @@ Notice here the distinction between the total derivative (indicated with the $d$) and partial derivative symbols ($\partial$). When a function depends on more than one argument, we are going to use the partial derivative symbol to emphasize this distinction (e.g., Equation \eqref{eq:sensitivity-definition}). On the other side, when this is not the case, we will use the total derivative symbol (e.g., Equation \eqref{eq:dLdu}). -Also notice that the sensitivity $s$ defined in Equation \eqref{eq:sensitivity-definition} is what is called a \textit{Jacobian}, that is, a matrix of first derivatives. +Also notice that the sensitivity $s$ defined in Equation \eqref{eq:sensitivity-definition} is what is called a \textit{Jacobian}, that is, a matrix of first derivatives for general vector-valued functions. -In this article we are going to use the word gradient or derivative to refer to the first order derivatives of a given function. -Although the names adjoint and tangent are sometime used to refer to the same object, we are going to skip the use of these to avoid confusion. -The same nature of the adjoint methods deserves to be treated entirely in Section \ref{section:adjoint-methods}. \ No newline at end of file +% In this article we are going to use the word gradient or derivative to refer to the first order derivatives of a given function. +% Although the names adjoint and tangent are sometime used to refer to the same object, we are going to skip the use of these to avoid confusion. +% The same nature of the adjoint methods deserves to be treated entirely in Section \ref{section:adjoint-methods}. \ No newline at end of file diff --git a/tex/sections/software.tex b/tex/sections/software.tex new file mode 100644 index 0000000..c22b34c --- /dev/null +++ b/tex/sections/software.tex @@ -0,0 +1,128 @@ +% Giles (2000) has a good discussion on this. +% \cite{ma_comparison_2021}. +In this section we are going to address how these tools are computationally implemented and how to decide which method to use depending on the problem.
+
+\subsection{Forward discrete methods}
+
+Let us start by comparing how finite differences and complex-step differentiation perform for a simple problem with one single parameter.
+Figure \ref{fig:finite-diff} illustrates the error in computing the gradient of a simple loss function, for both the true analytical solution and the numerical solution of a system of ODEs, as a function of the stepsize $\varepsilon$.
+
+\begin{figure}[tbh]
+    \centering
+    \includegraphics[width=0.85\textwidth]{../code/finite_differences/finite_difference_derivative.pdf}
+    \caption{Absolute relative error when computing the gradient of the function $u(t) = \sin (\omega t)/\omega$ with respect to $\omega$ at $t=10.0$ as a function of the stepsize $\varepsilon$. Here $u(t)$ corresponds to the solution of the differential equation $u'' + \omega^2 u = 0$ with initial conditions $u(0)=0$ and $u'(0)=1$. The blue dots correspond to the case where the derivative is computed with finite differences using the analytical solution. The red and orange lines correspond to the case where $u(t)$ is numerically computed using the default Tsitouras solver \cite{Tsitouras_2011} from \texttt{OrdinaryDiffEq.jl} with different tolerances. The error when using a numerical solver is larger and depends on the numerical precision of the solver.}
+    \label{fig:finite-diff}
+\end{figure}
+
+In general, finite differences are very easy to implement since they do not require any special software support, but they are less accurate than, and as costly as, forward AD \cite{Griewack-on-AD} and complex-step differentiation.
+Implementing these last two methods in a programming language implies defining what it means to perform basic operations and then combining them using the chain rule provided by the dual number properties.
+In Julia we can create a dual number by defining an object with two values, and then extend the definition of algebraic operations and functions, a process known as \textit{operator overloading} \cite{Neuenhofen_2018}, relying on multiple dispatch:
+\begin{jllisting}
+using Base: @kwdef
+
+@kwdef struct DualNumber{F <: AbstractFloat}
+    value::F
+    derivative::F
+end
+
+# Binary sum
+Base.:(+)(a::DualNumber, b::DualNumber) = DualNumber(value = a.value + b.value, derivative = a.derivative + b.derivative)
+
+# Binary product
+Base.:(*)(a::DualNumber, b::DualNumber) = DualNumber(value = a.value * b.value, derivative = a.value*b.derivative + a.derivative*b.value)
+\end{jllisting}
+and then we can simply evaluate derivatives by reading the derivative component of the dual number that results from the combination of operations:
+\begin{jllisting}
+a = DualNumber(value=1.0, derivative=1.0)
+
+b = DualNumber(value=2.0, derivative=0.0)
+c = DualNumber(value=3.0, derivative=0.0)
+
+result = a * b * c
+println("The derivative of a*b*c with respect to a is: ", result.derivative)
+\end{jllisting}
+Notice that in this last example the dual numbers \texttt{b} and \texttt{c} were initialized with derivative equal to zero, while \texttt{a} was initialized with derivative equal to one.
+This is because we are interested in computing the derivative with respect to \texttt{a}, so that $\frac{\partial a}{\partial a} = 1$, while $\frac{\partial b}{\partial a} = \frac{\partial c}{\partial a} = 0$.
+A quick finite-difference cross-check of this result is sketched below.
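+As a sanity check (this snippet is ours and not part of the manuscript's code base), we can compare the dual number result against a centered finite difference approximation of the same derivative; the helper \texttt{finite\_diff} is a hypothetical one-liner rather than a library function:
+\begin{jllisting}
+# Hypothetical helper: centered finite differences with a heuristic stepsize
+finite_diff(f, x; ε = cbrt(eps(Float64))) = (f(x + ε) - f(x - ε)) / (2ε)
+
+# Derivative of a*b*c with respect to a at a = 1.0, with b = 2.0 and c = 3.0 held fixed
+println(finite_diff(a -> a * 2.0 * 3.0, 1.0))  # ≈ 6.0, matching result.derivative
+\end{jllisting}
+For this simple example the two values agree closely; Figure \ref{fig:finite-diff} illustrates how the quality of the finite difference approximation depends on the stepsize $\varepsilon$.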
+
+We can also extend the definition of standard functions by simply applying the chain rule and storing the derivative in the dual variable following Equation \eqref{eq:dual-number-function}:
+\begin{jllisting}
+function Base.:(sin)(a::DualNumber)
+    value = sin(a.value)
+    derivative = a.derivative * cos(a.value)
+    return DualNumber(value=value, derivative=derivative)
+end
+\end{jllisting}
+With all these pieces together, we are able to propagate forward the value of a single-valued derivative through a series of algebraic operations.
+
+In the Julia ecosystem, \texttt{ForwardDiff.jl} implements forward mode AD with multidimensional dual numbers \cite{RevelsLubinPapamarkou2016}.
+Notice that a major limitation of the dual number approach is that we need a dual variable for each variable we want to differentiate.
+Implementations of forward AD using dual numbers and computational graphs require a number of operations that increases with the number of variables to differentiate, since each computed quantity is accompanied by the corresponding gradient calculations \cite{Griewack-on-AD}.
+This consideration applies to most forward methods.
+
+Notice that both AD based on dual numbers and complex-step differentiation rely on defining an extended variable that carries information about the gradient.
+% Another important point of comparison is the one existing between forward mode AD implemented with dual numbers and complex step differentiation.
+Both methods introduce an abstract unit ($\epsilon$ and $i$, respectively), associated with the extra component of the extended value, that carries forward the numerical value of the gradient.
+This resemblance makes the two methods share the same advantages and disadvantages: they are easy to implement with operator overloading, but they scale inefficiently with the number of variables to differentiate.
+However, although these methods seem very similar, it is important to remark that AD gives the exact gradient, while complex step differentiation relies on a numerical approximation that is valid only when the stepsize $\varepsilon$ is small.
+The next example shows how the calculation of the gradient of $\sin (x^2)$ is performed by these two methods:
+\begin{equation}
+\renewcommand{\arraystretch}{1.5}
+\begin{tabular}{@{} l @{\qquad} l @{\qquad} l @{}}
+Operation & AD with Dual Numbers & Complex Step Differentiation \\
+$x$ & $x + \epsilon$ & $x + i \varepsilon$ \\
+$x^2$ & $x^2 + \epsilon \, (2x)$ & $x^2 - \varepsilon^2 + 2i\varepsilon x$\\
+$\sin(x^2)$ & $\sin(x^2) + \epsilon \, \cos(x^2) (2x)$ &
+$\sin(x^2 - \varepsilon^2) \cosh (2\varepsilon x) + i \, \cos(x^2 - \varepsilon^2) \sinh (2\varepsilon x)$
+\end{tabular}
+\label{eq:AD-complex-comparision}
+\end{equation}
+While the second component of the dual number carries the exact derivative of $\sin(x^2)$, it is not until we take the limit $\varepsilon \rightarrow 0$ that we obtain the derivative in the imaginary component for the complex step method:
+\begin{equation}
+    \lim_{\varepsilon \rightarrow 0} \, \frac{1}{\varepsilon} \, \cos(x^2 - \varepsilon^2) \sinh (2\varepsilon x)
+    =
+    \cos(x^2) \, (2x).
+\end{equation}
+The stepsize dependence of the complex step differentiation method makes it resemble finite differences more than AD with dual numbers; a minimal code sketch of the method is shown below.
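+As an illustration (this snippet is ours, and the name \texttt{complex\_step} is not taken from any library), complex-step differentiation can be written in a couple of lines by exploiting Julia's built-in complex arithmetic:
+\begin{jllisting}
+# Complex-step derivative: evaluate f at x + iε and keep the imaginary part.
+# Because no difference of nearby values is taken, ε can be chosen extremely small
+# without suffering from the cancellation errors of finite differences.
+complex_step(f, x; ε = 1e-200) = imag(f(x + im * ε)) / ε
+
+complex_step(x -> sin(x^2), 2.0)   # ≈ 2x * cos(x^2) evaluated at x = 2.0
+\end{jllisting}
+Note that, in contrast with the dual number implementation above, the result is exact only in the limit $\varepsilon \rightarrow 0$, which is why in practice $\varepsilon$ is chosen extremely small.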
+This difference between the methods can also make the complex step method sometimes more efficient than both finite differences and AD \cite{Lantoine_Russell_Dargent_2012}, an effect that can be counterbalanced by the extra, unnecessary operations that complex arithmetic requires (see the last column of Equation \eqref{eq:AD-complex-comparision}) \cite{Martins_Sturdza_Alonso_2003_complex_differentiation}.
+
+\subsection{Backwards methods}
+
+The libraries \texttt{ReverseDiff.jl} and \texttt{Zygote.jl} compute gradients by propagating pullbacks (vector-Jacobian products) backwards through the program. When gradients are computed for fewer than $\sim 100$ parameters, the former tends to be faster (see the package documentation).
+
+\subsection{Solving the adjoint}
+
+The bottleneck of this method is the calculation of the adjoint, since in order to solve the adjoint equation we need to know $u(t)$ at any given time.
+Indeed, notice that the adjoint equation involves the terms $f(u, \theta, t)$ and $\frac{\partial h}{\partial u}$, which are both functions of $u(t)$.
+There are different ways of addressing the evaluation of $u(t)$ during the backward step:
+\begin{enumerate}[label=(\roman*)]
+    \item Solve for $u(t)$ again backwards in time.
+    \item Store $u(t)$ in memory during the forward step.
+    \item Store reference values in memory and interpolate in between.
+    This technique is known as \textit{checkpointing} or windowing, and it trades off computational running time against storage.
+    It is implemented in \texttt{Checkpointing.jl} \cite{Checkpoiting_2023}.
+\end{enumerate}
+Solving the ODE backwards can be unstable and lead to exponentially growing errors \cite{kim_stiff_2021}.
+In \cite{chen_neural_2019}, the solution is recomputed backwards in time together with the adjoint by simulating the augmented dynamics:
+\begin{equation}
+    \frac{d}{dt}
+    \begin{bmatrix}
+       u \\
+       \lambda \\
+       \frac{dL}{d\theta}
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+       -f \\
+       - \lambda^T \frac{\partial f}{\partial u} \\
+       - \lambda^T \frac{\partial f}{\partial \theta}
+    \end{bmatrix}
+    =
+    - [ 1, \lambda^T, \lambda^T ]
+    \begin{bmatrix}
+       f & \frac{\partial f}{\partial u} & \frac{\partial f}{\partial \theta} \\
+       0 & 0 & 0 \\
+       0 & 0 & 0
+    \end{bmatrix},
+\end{equation}
+with initial condition $[u(t_1), \frac{\partial L}{\partial u(t_1)}, 0]$.
+One way of solving this system of equations that ensures stability is to use implicit methods.
+However, this requires cubic time in the total number of ordinary differential equations, leading to a total complexity of $\mathcal{O}((n+p)^3)$ for the adjoint method.
+Two alternatives are proposed in \cite{kim_stiff_2021}.
+The first one, called \textit{Quadrature Adjoint}, produces a high order interpolation of the solution $u(t)$ during the forward step, then solves for $\lambda$ backwards using an implicit solver, and finally integrates $\frac{dL}{d\theta}$ in a forward step.
+This reduces the complexity to $\mathcal{O}(n^3 + p)$, where the cubic cost in the number of ODEs comes from the fact that we still need to solve the original stiff differential equation in the forward step.
+A second, similar approach is to use an implicit-explicit (IMEX) solver, where the implicit part is used for the original equation and the explicit part for the adjoint.
+This method also has complexity $\mathcal{O}(n^3 + p)$.
\ No newline at end of file
diff --git a/tex/sections/symbolic.tex b/tex/sections/symbolic.tex
index e69de29..912b4fd 100644
--- a/tex/sections/symbolic.tex
+++ b/tex/sections/symbolic.tex
@@ -0,0 +1,2 @@
+Sometimes AD is compared against symbolic differentiation.
+According to \cite{Laue2020}, the two are equivalent and differ only in the data structures used to implement them, while \cite{Elliott_2018} suggests that AD is symbolic differentiation performed by a compiler.
+A short symbolic example is sketched below.
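+To make the comparison concrete, the derivative of $\sin(x^2)$ computed with dual numbers above can also be obtained symbolically. The sketch below uses \texttt{Symbolics.jl}, which is our choice for illustration and is not prescribed by the text:
+\begin{jllisting}
+using Symbolics
+
+@variables x                    # declare a symbolic variable
+Dx = Differential(x)            # differential operator d/dx
+
+# Symbolically apply the chain rule; the result is the expression 2x*cos(x^2)
+expr = expand_derivatives(Dx(sin(x^2)))
+\end{jllisting}
+In contrast with AD, the output here is an explicit symbolic expression that can be inspected and simplified before being compiled into numerical code.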