Generalized SGD
Algorithm
\[\begin{split} \begin{array}{l}
\; \\ \hline
%
\textbf{Inputs} \color{white}{ \Large A} \\
% ---
\begin{array}{llll}
\mbox{OptProb} & : & \mbox{Routines associated with an optimization problem} &
\left\{ \begin{array}{l} \bullet \; \displaystyle \sum_{n=1}^N f_n(\mathbf{W}) \\ \bullet \; \nabla f_n(\mathbf{W}) \\ \bullet \; \mbox{etc.} \end{array} \right. \\
\alpha_0 & : & \mbox{Initial l.r. (learning rate)} & \quad \\
\mbox{nEpoch} & : & \mbox{Maximum number of epochs} & \quad \\
\mbox{blkSize} & : & \mbox{Cardinality of }{\cal{I}}_b,\mbox{ where }\mathbf{g} = \sum_{l \in {\cal{I}}_b} \nabla f_l(\mathbf{W}) \approx \nabla F(\mathbf{W}) & \quad \\
\mbox{hyperP} & : & \mbox{Routines associated with hyper-parameter selection} &
\end{array} \\ \hline
%
% ---
%
\color{white}{ \Large A} \\
% ---
%
\mathbf{W}_0 = \mbox{OptProb}.initSol() \phantom{\huge A} \\
k = -1 \\
%
\textbf{for } \varepsilon=1,2,\ldots, \mbox{nEpoch}\\
%
\begin{array}{llllll}
%
& {\cal{I}} = randperm( N ) & \\
& \textbf{for } b=1,2,\ldots, \frac{N}{ \mbox{blkSize}} &
\end{array} \\
\begin{array}{llllll}
& & ++k & \\
& & \mathbf{g} & = & \frac{1}{\# {\cal{I}}_b} \displaystyle \sum_{l \in {\cal{I}}_b}
\mbox{OptProb}.grad(l, \mathbf{W}_{k}) & \qquad \color{darkgray} \displaystyle \sum_{l \in {\cal{I}}_b} \nabla f_l(\mathbf{W}) \\
& & \alpha & = & \mbox{hyperP}.lr(k, \alpha_0, \mathbf{W}_{k}, \mathbf{g}) & \qquad \color{darkgray} \mbox{Given a policy, computes the l.r.} \\
& & \mathbf{v} & = & Q(\mathbf{g}, \gamma_2) & \qquad \color{darkgray} \mbox{e.g. EMA } \mathbf{v} = \gamma_2\cdot\mathbf{v} + (1-\gamma_2)\cdot\mathbf{g}\odot\mathbf{g} \\
& & \mathbf{w} & = & \frac{1}{\sqrt{\epsilon + \mathbf{v}}} & \qquad \color{darkgray} \mbox{Variance correction} \\
& & \nabla & = & {\cal{A}}(\mathbf{g}, \gamma_1, \beta_k) & \qquad \color{darkgray} \mbox{e.g. EMA } \nabla = \gamma_1\cdot\nabla + (1-\gamma_1)\cdot\mathbf{g} \\
& & \mathbf{W}_{k+1} & = & \mathbf{W}_k - \alpha \cdot \mathbf{w} \odot \nabla + \beta_k\cdot(\mathbf{W}_{k}-\mathbf{W}_{k-1}) & \qquad
%
\end{array} \\
\; \\ \hline
% ---
\end{array}
\; \\
\end{split}\]