\documentclass{beamer}
\usepackage{beamerthemesplit}
\usepackage{url}
\usepackage{xcolor}
\usepackage{tabu}
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usepackage{amsmath, amsthm, amssymb}
\usepackage[normalem]{ulem}
\usepackage{cancel}
\DeclareMathOperator*{\argmax}{arg\,max}
\useoutertheme{infolines}
\setbeamertemplate{navigation symbols}{}
%\AtBeginSection{\frame{\sectionpage}}
\newenvironment{titledslide}[1]{\begin{frame}\frametitle{#1}}{\end{frame}}
\title[Variational]{Variational Methods}
\author{James Cussens, University of York}
%\date{PGM'18}
\begin{document}
\frame{\titlepage}
%\begin{frame}\frametitle{Outline}\tableofcontents[hideallsubsections]\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{Variational Inference}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{Variational Inference (pros)}
From \cite{NIPS2015_5758}:
\begin{itemize}
\item For machine learning models, calculating the posterior is often
difficult; we resort to approximation.
\item Variational inference (VI) approximates the posterior with a
simpler density.
\item We search
over a family of simple densities and find the member closest to the
posterior.
\item This turns approximate inference into optimization.
\item VI
has had a tremendous impact on machine learning; it is typically
faster than Markov chain Monte Carlo (MCMC) sampling (as we show
here too) and has recently scaled up to massive data.
\end{itemize}
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{Variational Inference (cons)}
\begin{itemize}
\item Which family of approximating densities to choose?
\item How to solve the resulting optimisation problem?
\end{itemize}
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{Automatic Differentiation Variational Inference}
\begin{itemize}
\item Automatic Differentiation Variational Inference
(ADVI)
\begin{itemize}
\item Given a (Stan) model,
\item Automatically determine an appropriate \emph{variational
family} (family of candidate approximating densities), and
\item Automatically work out how to solve the optimisation problem
\end{itemize}
\item It uses Automatic Differentiation ;-)
\end{itemize}
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{VI by minimising KL divergence}
\begin{itemize}
\item Given some posterior density $p(\theta | \mathbf{X})$,
\item and some family of approximating densities $q(\theta ; \phi)$
\item find the $\phi$ which gives the smallest KL divergence
\end{itemize}
\[
\min_{\phi} \mathrm{KL}(q(\theta ; \phi) \; || \; p(\theta | \mathbf{X}))
\]
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{The evidence lower bound}
The evidence lower bound (ELBO) is:
\[
{\cal L}(\phi) = \mathbb{E}_{q(\theta ; \phi)}[ \log p(\theta,\mathbf{X}) ] -
\mathbb{E}_{q(\theta ; \phi)}[ \log q(\theta ; \phi) ]
\]
Maximising the ELBO minimises the KL divergence (and so that's what we do).
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{A transformation-based approach}
\begin{itemize}
\item The \emph{support} of a density (e.g.\ $p(\theta |
\mathbf{X})$) is where it is non-zero.
\item We often have variables in $\theta$ which have to be positive
(e.g.\ a variance), so have no negative numbers in the support.
\item The Stan approach to VI is transformation-based, step one is
to define a one-to-one differentiable function:
\end{itemize}
\[
T: \mathrm{supp}(p(\theta)) \rightarrow \mathbb{R}^{K}
\]
where $K$ is the dimension of $\theta$.
\begin{itemize}
\item Cue \cite[Fig~3]{NIPS2015_5758}
\end{itemize}
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{Mean field approximation}
\begin{itemize}
\item Let $\zeta$ be the transformed variables ($\zeta$ lives in
$\mathbb{R}^{K}$).
\item In the \emph{mean field} approach to VI we choose the family of
approximating distributions to be products of independent Gaussians:
\end{itemize}
\[
q(\zeta ; \phi) = {\cal N}(\zeta ; \mu, \sigma^{2}) = \prod_{k=1}^{K}
{\cal N}(\zeta_{k} ; \mu_{k}, \sigma_{k}^{2})
\]
where $\phi = (\mu_{1}, \dots, \mu_{K}, \sigma^{2}_{1}, \dots, \sigma^{2}_{K})$
\begin{itemize}
\item Note that this mean field approach would have been a weird
choice had, say, some of the $\zeta_k$ been only allowed to take
positive values.
\end{itemize}
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{Maximising ELBO in real co-ordinate space}
\begin{itemize}
\item Let ${\cal L}(\mu,\sigma^{2})$ be the ELBO in the real
co-ordinate space (i.e.\ the transformed space).
\end{itemize}
``We now seek to maximize the ELBO in real coordinate space
\[
\mu^{*}, \sigma^{2*} = \argmax_{\mu, \sigma^{2}}
{\cal L}(\mu,\sigma^{2}) \mbox{ such that $\sigma^{2} \succ 0$}
\]
We can use gradient ascent to reach a local maximum of the ELBO''
\cite{NIPS2015_5758}:
\begin{itemize}
\item This is doable (after one further reparameterisation
\cite[Fig~3]{NIPS2015_5758}) using AD.
\end{itemize}
\end{titledslide}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{titledslide}{So does it actually work?}
\begin{itemize}
\item Let's have a look \dots
\end{itemize}
\end{titledslide}
\bibliographystyle{alpha}
%\bibliography{jc,jc_all}
\bibliography{jc_foo}
\end{document}