% % file: presentation.tex % created: pasha mar 29 2007 % \documentclass{beamer} %\usepackage[T1]{fontenc} \setbeamertemplate{headline} { \vskip2pt\hskip1pt\insertframenumber } \title{Maximum likelihood and EM algorithm \\ (after the Chapter 8)} \author{Pasha Zusmanovich, deCODE} \date{Statistics Colloquium \\ March 30, 2007 } \begin{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \titlepage \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} Likelihood is just a conditional probability. \begin{block}{Formal definition} Given random events $A$ and $B$, the \textbf{likelihood function} of $A$ relative to $B$ is: \begin{align*} \{ \text{set of states of } B \} &\to [0,1] \\ x &\mapsto Pr(A \,|\, B = x). \end{align*} \end{block} Nothing fancy so far. Consider an ... \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Example: alleles and genotypes} \medskip \begin{tabular}{llll} \begin{minipage}{3.5cm} frequencies of alleles: $a$: $\theta$ $A$: $1 - \theta$ \end{minipage} \uncover<2->{ & $\Longrightarrow$ & \begin{minipage}{4cm} frequencies of genotypes: $aa$: $\theta^2$ $aA$: $2\theta (1 - \theta)$ $AA$: $(1 - \theta)^2$ \end{minipage} } & \uncover<3->{ \begin{minipage}{4cm} numbers: $n_{aa}$ $n_{aA}$ $n_{AA}$ \end{minipage} } \end{tabular} \medskip \uncover<3->{ The probability that numbers of genotypes would be exactly $(n_{aa}, n_{aA}, n_{AA})$: $$ f(\theta) = \frac{(n_{aa} + n_{aA} + n_{AA})!}{n_{aa}! n_{aA}! n_{AA}!} \theta^{2n_{aa}} (2\theta(1 - \theta))^{n_{aA}} (1 - \theta)^{2n_{AA}} $$ $f$ is a likelihood function: \{ probability of alleles \} $\to$ \{ conditional probability of genotypes assuming given probability of alleles \}. } \end{block} \uncover<4->{ This is a model with parameter $\theta$. \textbf{Question}: Which parameter makes model the ``best''? \textbf{Answer} ... } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Example: alleles and genotypes (continued)} \textbf{Question}: Which parameter makes model the ``best''? \textbf{Answer}: Those which makes the observed data more likely, i.e. which maximizes $$ f(\theta) = \frac{(n_{aa} + n_{aA} + n_{AA})!}{n_{aa}! n_{aA}! n_{AA}!} \theta^{2n_{aa}} (2\theta(1 - \theta))^{n_{aA}} (1 - \theta)^{2n_{AA}} $$ on $[0,1]$. \textbf{Solution}: $$ \hat{\theta} = \frac{2n_{aa} + n_{aA}}{2(n_{aa} + n_{aA} + n_{AA})}. $$ \bigskip \uncover<2->{ \textbf{But this is exactly the Hardy-Weinberg equilibrium!} } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Another example: linear regression} Fitting a line to the set of points on the plane $\{(x_1,y_1), \dots, (x_n,y_n)\}$, assuming observations are independent, and errors are normally distributed. The model is: $$ Y = \beta_1 X + \beta_0 + \varepsilon, \quad \varepsilon \sim N(0, \sigma^2). $$ What is the ``probability'' to have the observed data under the given model? \uncover<2->{ $$ P(Y \text{ lies in } \delta\text{-neighbourhood of } y_i | X = x_i) \approx \text{density} (Y)|_{X=x_i,Y=y_i} \cdot 2\delta, $$ so ``probability'' is replaced by density. If $X$ is fixed, $$ Y - \beta_1 X - \beta_0 \sim N(0, \sigma^2) \Rightarrow Y \sim N(\beta_1 X + \beta_0 ,\sigma^2). $$ } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Another example: linear regression (continued)} Maximizing \begin{multline*} \text{density} (Y)|_{X=x_i,Y=y_i} = \prod_{i=1}^n \frac{1}{\sqrt{2\pi} \sigma} exp \big( -\frac{(\beta_1 x_i + \beta_0 - y_i)^2}{2\sigma^2} \big) \\ = \big( \frac{1}{\sqrt{2\pi}\sigma} \big)^n exp\big( - \frac{1}{2\sigma^2} \sum_{i=1}^n (\beta_1 x_i + \beta_0 - y_i)^2 \big) \end{multline*} is equivalent to minimizing $$ \sum_{i=1}^n (\beta_1 x_i + \beta_0 - y_i)^2. $$ \uncover<2->{ \textbf{But this is exactly the least squares!} } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Refined formal definition} Assuming a random variable $X$ has a density function $f(x,\theta)$ parametrized by $\theta$, the likelihood function is: $$ \theta \mapsto f(x,\theta). $$ \end{block} \begin{block}{``Conceptual'' definition} Likelihood is the probability of observed data under the given model. \end{block} \medskip Thus, the maximum likelihood correspond to the model (in the given parametrized class of models) which makes the observerd data ``most likely''. One usually maximize $log\,f(x,\theta)$ instead of $f(x,\theta)$ (\textbf{log-likelihood function}). Ok, since $log$ is monotonic. But ... \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{Why logarithm?} \begin{itemize} \item Turns multiplicative things to additive. \uncover<2->{ In most cases on practice, the likelihood function is the product of several functions. E.g., if $X_1, \dots, X_n$ are independent random variables, then their likelihood function: $$ f(x_1, \dots, x_n, \theta) = f(x_1, \theta) \dots f(x_n, \theta), $$ so logarithm turns multiplicative things to additive and easier to deal with. (And logarithm is the \textbf{only} ``good'' function taking multiplication to addition). } \item Diminishes the ``long tail''. \uncover<3->{ A random variable with values in $\mathbb R^+$ (say, results of a measurement) tends to have a skewed distribution to the right because there is lower limit but not upper limit. Passing to log diminishes this skewness. } \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Maximum likelihood behaves nicely asymtotically} Taylor series: $$ \ell (\theta) = \ell (\hat{\theta}) + \frac{1}{2}(\theta - \hat{\theta})^2 \ell^{\prime\prime} (\hat{\theta}) + \dots $$ $i(\theta) = E(-\ell^{\prime\prime} (\theta))$ -- \textbf{Fisher information}. \medskip $\hat{\theta} \sim N(\theta_0, i(\theta_0)^{-1})$ as number of samples $\to \infty$. \medskip Could be used to assess the precision of $\hat{\theta}$. \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Connection with some fancy areas of Mathematics} Back to alleles and genotypes example: model with \textbf{inbreeding coefficient} $\lambda$: \medskip \begin{tabular}{lll} \begin{minipage}{3.4cm} frequencies of alleles: $a$: $\theta$ $A$: $1 - \theta$ \end{minipage} & \begin{minipage}{4cm} frequencies of genotypes: $aa$: $\theta^2 + \theta(1 - \theta)\lambda$ $aA$: $2\theta (1 - \theta)(1 - \lambda)$ $AA$: $(1 - \theta)^2 + \theta(1 - \theta)\lambda$ \end{minipage} & \begin{minipage}{4cm} numbers: 38 95 53 \end{minipage} \end{tabular} \begin{flushright}{\tiny (some real blood groups data from UK, 1947)}\end{flushright} \textbf{Scoring equations} are equivalent to: \uncover<2->{ \begin{multline*} 372 \theta^3 \lambda^2 - 744 \theta^3 \lambda-558 \theta^2 \lambda^2 + 372 \theta^3 + 1131 \theta^2 \lambda + 186 \theta \lambda^2 - 573 \theta^2 \\ -668 \theta \lambda\ + 201 \theta + 148 \lambda = 0; \\ 186 \theta^2 \lambda^2-372 \theta^2 \lambda - 186 \theta \lambda^2 + 186 \theta^2 + 387 \theta \lambda - 201 \theta - 148 \lambda + 53 = 0. \end{multline*} } \uncover<3->{ Statistics + Algebraic Geometry = \textbf{Algebraic Statistics}. } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{Advantages (to summarize)} \begin{itemize} \item Agrees with intuition. \item Confirmed by other methods. \item ``Nice'' asymptotic behavior. \item Very good practical results. \item Universal. \item Connection with other areas of Mathematics. \end{itemize} \end{block} \uncover<2->{ \begin{block}{Disadvantages} \begin{itemize} \item No ``theoretical'' justification. \item Could be bad for small samples. \item No way to compare ``disjoint'' models. \item ``Bayesian'' issue ... \end{itemize} \end{block} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{What is likelihood and what it is good for?} \begin{block}{``Bayesian'' issue:} $$ Pr(data|model) = \frac{Pr(model|data) Pr(data)}{Pr(model)}. $$ \end{block} \uncover<2->{ \begin{block}{Philosophical mumbo-jumbo:} \begin{itemize} \item M. Forster and E. Sober, \emph{Why likelihood?}, The Nature of Scientific Evidence (ed. M. Taper and S. Lele), Univ. of Chicago Press, 2004, 153--165 {\small\texttt{http://philosophy.wisc.edu/forster/Likelihood/default.htm}} \item B. Fitelson, \emph{Likelihoodism, bayesianism, and relational confirmation}, Synthese, to appear {\small\texttt{http://fitelson.org/research.htm}} \end{itemize} \end{block} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{EM algorithm} Finding the maximum of likelihood function could be difficult. \begin{block}{Example: alleles and phenotypes} Assume $A$ is \textbf{dominant}, and we observe only \textbf{phenotypes}: \medskip \begin{tabular}{llll} \begin{minipage}{3.5cm} frequencies of alleles: $a$: $\theta$ $A$: $1 - \theta$ \end{minipage} & \begin{minipage}{3.5cm} frequencies of genotypes: $aa$: $\theta^2$ $aA$: $2\theta (1 - \theta)$ $AA$: $(1 - \theta)^2$ \end{minipage} & \begin{minipage}{3.5cm} numbers of phenotypes: $a$: 38 $A$: 148 \end{minipage} \end{tabular} \medskip \uncover<2->{ Scoring equation amounts to: $38/\theta^2 - 148/(1 - \theta^2) = 0$, i.e. is biquadratic. Suppose we don't know how/don't want to solve it. What to do? } \uncover<3->{ Introduce back missing numbers $n_{aA}$ and $n_{AA}$ (\textbf{hidden parameters}) and iterate. } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{EM algorithm} \begin{block}{Example: alleles and phenotypes (continued)} \small % 5 - too small, 10 - too big \begin{tabular}{ll} \uncover<2->{ \textbf{E} & \begin{minipage}{8cm} Step 1: initial genotype numbers: $n_{aA} = n_{AA} = 148/2 = 74.00$ \medskip \end{minipage} } \\ \uncover<3->{ \textbf{M} & \begin{minipage}{8cm} Step 2: find MLE for those numbers: $\theta = (2 \cdot 38 + 74.00)/(2 \cdot 186) = 0.40$ \medskip \end{minipage} } \\ \uncover<4->{ \textbf{E} & \begin{minipage}{8cm} Step 3: for $\theta = 0.40$, find genotype frequencies: for $aA$: $2 \cdot 0.40 \cdot (1 - 0.40) = 0.48$ and for $AA$: $(1-0.40)^2=0.36$, and for them, genotype numbers: $n_{aA} = 186 \cdot 0.48 = 89.28$, $n_{AA} = 148 - 89.28 = 58.72$ \medskip \end{minipage} } \\ \uncover<5->{ \textbf{M} & \begin{minipage}{8cm} Step 4: find MLE for those numbers: $\theta = (2 \cdot 38 + 89.28)/(2 \cdot 186) =0.44$ \medskip \end{minipage} } \\ \uncover<6->{ \textbf{E} & \begin{minipage}{8cm} Step 5: for $\theta = 0.44$, find genotype frequencies: for $aA$: $2 \cdot 0.44 \cdot (1-0.44) = 0.49$ and for $AA$: $(1-0.44)^2 = 0.31$ and genotype numbers: $n_{aA} = 186 \cdot 0.49 = 91.14$, $n_{AA} = 148 - 91.14 = 56.86$ \medskip \end{minipage} } \\ \uncover<7->{ \textbf{M} & \begin{minipage}{8cm} Step 6: find MLE for those numbers: $\theta = (2 \cdot 38 + 91.14)/(2 \cdot 186) = 0.44$ \medskip \end{minipage} } \\ \uncover<8->{ & Stop! } \end{tabular} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{EM algorithm} \begin{block}{Advantages} \begin{itemize} \item Reduces MLE problem to another more manageable (MLE) problem. \item Agrees with results obtained by other means. \item Works on practice. \end{itemize} \end{block} \uncover<2->{ \begin{block}{Disadvantages} \begin{itemize} \item No theoretical justification. \end{itemize} \end{block} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{Maximum likelihood and ME algorithm at deCODE} \begin{block}{Associations studies} \texttt{nemo} by Dan\'iel Gudbjartsson. %Gu{\dh}bjartsson Typical input data: list of affected and unaffected individuals, list of markers (e.g. SNPs), list of genotypes (per marker and per individual). \end{block} \uncover<2->{ \begin{block}{Haplotypes inference from genotypes} \textbf{Maximum parsimony} vs. maximum likelihood. } \uncover<3->{ Example (0,1 -- homozygote, 2 -- heterozygote): \begin{tabular}{lll} \begin{minipage}{1.5cm} genotypes: 2120 2102 1221 \end{minipage} } \uncover<4->{ & $\Longleftarrow$ & \begin{minipage}{4cm} parsimonial solution: 0100 + 1110 0100 + 1101 1011 + 1101 \end{minipage} \end{tabular} } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \begin{center} {\Huge That's all.} \end{center} \vskip30pt Slides at \texttt{http://justpasha.org/tmp/presentation.pdf} . \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \end{document} % end of presentation.tex