SGD
Algorithm
\[\begin{split} \begin{array}{l}
\; \\ \hline
%
\mathbf{\mbox{Inputs}} \color{white}{ \Large A} \\
% ---
\begin{array}{llll}
\mbox{OptProb} & : & \mbox{Routines associated with an optimization problem} &
\left\{ \begin{array}{l} \bullet \; \displaystyle \sum_{n=1}^N f_n(\mathbf{W}) \\ \bullet \; \nabla f_n(\mathbf{W}) \\ \bullet \; \mbox{etc.} \end{array} \right. \\
\alpha_0 & : & \mbox{Initial }l.r.\mbox{ (learning rate)} & \quad \\
\mbox{nEpoch} & : & \mbox{Maximum number of epochs} & \quad \\
\mbox{blkSize} & : & \mbox{Cardinality of }{\cal{I}}_b,\mbox{ where }\mathbf{g} = \frac{1}{\# {\cal{I}}_b} \sum_{l \in {\cal{I}}_b} \nabla f_l(\mathbf{W}) \approx \nabla F(\mathbf{W}) & \quad \\
\mbox{hyperP} & : & \mbox{Routines associated with hyper-parameters selection} &
\end{array} \\ \hline
%
% ---
%
\color{white}{ \Large A} \\
% ---
%
\mathbf{W}_0 = \mbox{OptProb}.initSol() \phantom{\huge A} \\
k = -1 \\
%
\mathbf{\mbox{for }} \varepsilon=1,2,\ldots, \mbox{nEpoch}\\
%
\begin{array}{llllll}
%
& {\cal{I}} = randperm( N ) & \\
& \mathbf{\mbox{for }} b=1,2,\ldots, \frac{N}{ \mbox{blkSize}} &
\end{array} \\
\begin{array}{llllll}
& & ++k & \\
& & \mathbf{g} & = & \frac{1}{\# {\cal{I}}_b} \displaystyle \sum_{l \in {\cal{I}}_b}
\mbox{OptProb}.grad(l, \mathbf{W}_{k}) & \qquad \color{darkgray} \displaystyle \frac{1}{\# {\cal{I}}_b} \sum_{l \in {\cal{I}}_b} \nabla f_l(\mathbf{W}_k) \\
& & \alpha & = & \mbox{hyperP}.lr(k, \alpha_0, \mathbf{W}_{k}, \mathbf{g}) & \qquad \color{darkgray} \mbox{Given a policy, computes the }l.r. \\
& & \mathbf{W}_{k+1} & = & \mathbf{W}_k - \alpha \cdot \mathbf{g} & \qquad \color{darkgray} \mathbf{W}_k - \alpha \cdot \nabla F(\mathbf{W}_k)
%
\end{array} \\
\; \\ \hline
% ---
\end{array}
\; \\
\end{split}\]