--- title: "Supervised Learning" author: "Dan Lizotte" date: '`r Sys.Date()`' output: ioslides_presentation: css: ../my_ioslides.css html_document: default pdf_document: null --- ```{r echo=F,message=F,warning=F} library(dplyr) library(gridExtra) ``` ## Relationships between variables {.smaller} ```{r message=F,warning=F,echo=F} library(ggplot2) fbase <- ggplot(data=faithful,aes(x=eruptions,y=waiting)) + xlim(1,5.75) + ylim(38,100) + scale_fill_distiller(palette="Greys",direction=1) + theme(legend.position="none") fbase + geom_point() ``` - ***Random vectors*** or ***vector-valued*** random variables. - Variables that occur ***together*** in some meaningful sense. ## Joint distribution {.smaller}
```{r} library(knitr); kable(head(faithful,10)) ``` ## Correlation (JWHT 2.3,3.1.3)
```{r message=F,warning=F,echo=F} library(ggplot2) fbase <- ggplot(data=faithful,aes(x=eruptions,y=waiting)) + xlim(1,5.75) + ylim(38,100) + scale_fill_distiller(palette="Greys",direction=1) + theme(legend.position="none") fbase + geom_point() ``` ## Pearson Correlation



\[ \rho_{X,Y} = \frac{E[(X - \mu_X)(Y - \mu_Y)]}{\sigma_X\sigma_Y} \] ## Pearson Correlation: "Plugin" Estimate



\[ r_{X,Y} = \frac{\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^n (x_i - \bar{x})^2}\sqrt{\sum_{i=1}^n (y_i - \bar{y})^2}} \] ## Sample Correlation ```{r message=F,warning=F,echo=F} library(ggplot2) fbase <- ggplot(data=faithful,aes(x=eruptions,y=waiting)) + xlim(1,5.75) + ylim(38,100) + scale_fill_distiller(palette="Greys",direction=1) + theme(legend.position="none") fbase + geom_point() cor(faithful) ``` ## Correlation Gotchas
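A few standard cautions: Pearson correlation measures only *linear* association, a single summary number can hide very different relationships, and correlation says nothing about causation. As a quick illustration (a sketch using R's built-in `anscombe` data, not part of the original Old Faithful example), all four of Anscombe's $x$–$y$ pairs have nearly identical correlations but very different scatterplots:

```{r message=F,warning=F}
# Anscombe's quartet: four x-y pairs with (almost) identical correlation...
sapply(1:4, function(i) cor(anscombe[[paste0("x", i)]], anscombe[[paste0("y", i)]]))
# ...but very different relationships (ggplot2 and gridExtra are loaded above)
p1 <- ggplot(anscombe, aes(x = x1, y = y1)) + geom_point()
p2 <- ggplot(anscombe, aes(x = x2, y = y2)) + geom_point()
p3 <- ggplot(anscombe, aes(x = x3, y = y3)) + geom_point()
p4 <- ggplot(anscombe, aes(x = x4, y = y4)) + geom_point()
grid.arrange(p1, p2, p3, p4, nrow = 2)
```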
## Joint distribution - Density {.smaller}
```{r echo=F} fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) ``` ## Marginal distributions - Densities
```{r echo=F} mhist <- fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) ggExtra::ggMarginal(mhist, type = "density") ``` ## Marginal distributions - Rug Plot
```{r echo=F} library(ggExtra) fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) + geom_rug(alpha=I(1/4),size=2) ``` ## Conditional distributions
```{r echo=F} cond1 = 60; fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) + geom_rug(alpha=I(1/4),size=2) + geom_hline(yintercept = I(cond1),color="blue") ``` ## Conditional distributions
```{r echo=F} ggplot(faithful %>% filter(waiting==cond1),aes(x=eruptions)) + geom_density() + xlim(1,5.75) + ggtitle(paste0("f(eruptions|waiting=",cond1,")")) ``` ## Conditional distributions
```{r echo=F} cond2=80 fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) + geom_rug(alpha=I(1/4),size=2) + geom_hline(yintercept = I(cond2),color="blue") ``` ## Conditional distributions
```{r echo=F} ggplot(faithful %>% filter(waiting==cond2),aes(x=eruptions)) + geom_density() + xlim(1,5.75) + ggtitle(paste0("f(eruptions|waiting=",cond2,")")) ``` ## Conditional distributions
```{r echo=F} cond3=c(2.0,2.1) fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) + geom_rug(alpha=I(1/4),size=2) + annotate("rect",xmin=cond3[1],xmax=cond3[2],ymin=-Inf,ymax=Inf,alpha=1/4,fill="blue") ``` ## Conditional distributions
```{r echo=F} ggplot(faithful %>% filter(cond3[1] <= eruptions, eruptions <= cond3[2]),aes(x=waiting)) + geom_density() + xlim(40,100) + ggtitle(paste0("f( waiting | ", cond3[1], " <= eruptions <= ",cond3[2],")")) + coord_flip() ``` ## Conditional distributions
```{r echo=F} cond4=c(4.4,4.5) fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) + geom_rug(alpha=I(1/4),size=2) + annotate("rect",xmin=cond4[1],xmax=cond4[2],ymin=-Inf,ymax=Inf,alpha=1/4,fill="blue") ``` ## Conditional distributions
```{r echo=F} ggplot(faithful %>% filter(cond4[1] <= eruptions, eruptions <= cond4[2]),aes(x=waiting)) + geom_density() + xlim(40,100) + ggtitle(paste0("f( waiting | ", cond4[1], " <= eruptions <= ",cond4[2],")")) + coord_flip() ``` ## Independence
- Two random variables $X$ and $Y$ that are part of a random vector are **independent** iff: \[ F_{X,Y}(x,y) = F_X(x)F_Y(y) \] - Consequences: \[ \Pr(X=x|Y=y) = \Pr(X=x) \] \[ \Pr(Y=y|X=x) = \Pr(Y=y) \] - Any conditional distribution of $X$ is the same as the marginal distribution of $X$ - Knowing about $Y$ provides no information about $X$. ## Independence vs. Correlation ```{r echo=F,message=F,warning=F} library(MASS);n=10000;mu <- c(0,0);Sig <- matrix(c(1,0.7,0.7,1),2,2); xx <- mvrnorm(n,mu,Sig) d <- data.frame(x=xx[,1],y=xx[,2]) ggplot(d,aes(x=x,y=y)) + geom_point(alpha=I(1/4)) + geom_rug() cor(d) ``` ## Independence vs. Correlation ```{r echo=F} n=10000 x <- 2*pi*runif(n) - 2*pi d <- data.frame(x=x,y=cos(x)+0.25*rnorm(n)) ggplot(d,aes(x=x,y=y)) + geom_point(alpha=I(1/4)) + geom_rug() cor(d) ``` ## Independence vs. Correlation ```{r echo=F} n=10000 r <- runif(n) theta <- runif(n)*2*pi d <- data.frame(x=sqrt(r)*cos(theta),y=sqrt(r)*sin(theta)) ggplot(d,aes(x=x,y=y)) + geom_point(alpha=I(1/4)) + geom_rug() cor(d) ``` ## Independence vs. Correlation ```{r echo=F} n=10000 d <- data.frame(x=rnorm(n),y=rnorm(n)) ggplot(d,aes(x=x,y=y)) + geom_point(alpha=I(1/4)) + geom_rug() cor(d) ``` ## Predicting Waiting Time
```{r echo=F} pred <- mean(faithful$waiting) ggplot(faithful,aes(x=waiting)) + geom_density() + xlim(40,100) + ggtitle("f(waiting)") + geom_vline(xintercept=pred,color="red") + coord_flip() message(sprintf("Mean: %.2f",pred)) ``` ## Conditional predictions - If I know eruption time, can I do better? ```{r echo=F} pred <- mean(subset(faithful,cond3[1] <= eruptions & eruptions <= cond3[2])$waiting) ggplot(faithful %>% filter(cond3[1] <= eruptions, eruptions <= cond3[2]),aes(x=waiting)) + geom_density() + xlim(40,100) + ggtitle(paste0("f( waiting | ", cond3[1], " <= eruptions <= ",cond3[2],")")) + geom_vline(xintercept=pred,color="red") + coord_flip() message(sprintf("Mean: %.2f",pred)) ``` ## Conditional predictions - If I know eruption time, can I do better? ```{r echo=F} pred <- mean(subset(faithful,cond4[1] <= eruptions & eruptions <= cond4[2])$waiting) ggplot(faithful %>% filter(cond4[1] <= eruptions, eruptions <= cond4[2]),aes(x=waiting)) + geom_density() + xlim(40,100) + ggtitle(paste0("f( waiting | ", cond4[1], " <= eruptions <= ",cond4[2],")")) + geom_vline(xintercept=pred,color="red") + coord_flip() message(sprintf("Mean: %.2f",pred)) ``` ## Conditional predictions? - If I know eruption time, can I do better? ```{r echo=F} cond5=c(3.15,3.25) fbase + stat_density_2d(geom="polygon",aes(fill = ..level..)) + geom_point(alpha=I(1/4)) + geom_rug(alpha=I(1/4),size=2) + annotate("rect",xmin=cond5[1],xmax=cond5[2],ymin=-Inf,ymax=Inf,alpha=1/4,fill="blue") ``` ## Supervised Learning Framework (HTF 2, JWHT 2) Training experience: a set of ***labeled examples*** of the form \[\langle x_1,\,x_2,\,\dots x_p,y\rangle,\] where $x_j$ are ***feature values*** and $y$ is the ***output*** * Task: ***Given a new $x_1,\,x_2,\,\dots x_p$, predict $y$*** What to learn: A ***function*** $f:\mathcal{X}_1 \times \mathcal{X}_2 \times \cdots \times \mathcal{X}_p \rightarrow \mathcal{Y}$, which maps the features into the output domain * Goal: Make accurate ***future predictions*** (on unseen data) * Plan: Learn to make accurate predictions on the training data ## [Wisconsin Breast Cancer Prognostic Data](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Prognostic)) {.smaller} ```{r echo=F} #setwd("/Users/dlizotte/Seafile/My Library/Teaching/cs4437/Lectures/5_Supervised Learning") wpbc_featurenames = c("Radius","Texture","Perimeter","Area","Smoothness","Compactness","Concavity","Concave Points","Symmetry","Fractal Dim") wpbc_names = c("ID","Outcome","Time",paste(wpbc_featurenames,"Mean"),paste(wpbc_featurenames,"SE"),paste(wpbc_featurenames,"Worst"),"Tumor Diameter","Lymph Status") bc <- read.csv("data/wpbc.data.txt",header=FALSE,col.names=wpbc_names) ``` Cell samples were taken from tumors in breast cancer patients before surgery and imaged; the tumors were then excised, and patients were followed to determine whether or not the cancer recurred, as well as the time to recurrence or the length of disease-free follow-up. ![image](images/xcyt1.gif) ## Wisconsin data (continued) {.smaller} - 198 instances, 32 features for prediction - Outcome (R=recurrence, N=non-recurrence) - Time (time to recurrence for R; disease-free time for N). ```{r results='asis',echo=F,warning=F,message=F,eval=F} kable(bc %>% select(Radius.Mean,Texture.Mean,Perimeter.Mean,Area.Mean,Outcome,Time),format="markdown") ``` | Radius.Mean| Texture.Mean| Perimeter.Mean| ...
|Outcome | Time| |-----------:|------------:|--------------:|---------:|:-------|----:| | 18.02| 27.60| 117.50| |N | 31| | 17.99| 10.38| 122.80| |N | 61| | 21.37| 17.44| 137.50| |N | 116| | 11.42| 20.38| 77.58| |N | 123| | 20.29| 14.34| 135.10| |R | 27| | 12.75| 15.29| 84.60| |R | 77| | ...| ...| ...| |... | ...| ## Terminology {.smaller} | Radius.Mean| Texture.Mean| Perimeter.Mean| ... |Outcome | Time| |-----------:|------------:|--------------:|---------:|:-------|----:| | 18.02| 27.60| 117.50| |N | 31| | 17.99| 10.38| 122.80| |N | 61| | 21.37| 17.44| 137.50| |N | 116| | 11.42| 20.38| 77.58| |N | 123| | 20.29| 14.34| 135.10| |R | 27| | 12.75| 15.29| 84.60| |R | 77| | ...| ...| ...| |... | ...|
- Columns are called *input variables* or ***features*** or *attributes* - The outcome and time (which we are trying to predict) are called ***labels*** or *output variables* or *targets* - A row in the table is called a ***training example*** or *instance* - The whole table is called the ***(training) data set***. ## Prediction problems {.smaller} | Radius.Mean| Texture.Mean| Perimeter.Mean| ... |Outcome | Time| |-----------:|------------:|--------------:|---------:|:-------|----:| | 18.02| 27.60| 117.50| |N | 31| | 17.99| 10.38| 122.80| |N | 61| | 21.37| 17.44| 137.50| |N | 116| | 11.42| 20.38| 77.58| |N | 123| | 20.29| 14.34| 135.10| |R | 27| | 12.75| 15.29| 84.60| |R | 77| | ...| ...| ...| |... | ...|
- The problem of predicting the recurrence is called ***(binary) classification*** - The problem of predicting the time is called ***regression*** ## More formally - The $i$th training example has the form: $\langle x_{1,i}, \dots x_{p,i}, y_i\rangle$ where $p$ is the number of features (32 in our case). - Notation ${\bf x}_i$ denotes a **column** vector with elements $x_{1,i},\dots x_{p,i}$. - The training set $D$ consists of $n$ training examples - We denote the $n\times p$ matrix of features by $X$ and the size-$n$ column vector of outputs from the data set by ${\mathbf{y}}$. - In statistics, $X$ is called the data matrix or the *design matrix.* - ${{\cal X}}$ denotes space of input values - ${{\cal Y}}$ denotes space of output values ## Supervised learning problem - Given a data set $D \subset ({{\cal X}}\times {{\cal Y}})^n$, find a function: $$h : {{\cal X}}\rightarrow {{\cal Y}}$$ such that $h({\bf x})$ is a *“good predictor”* for the value of $y$. - $h$ is called a *predictive model* or *hypothesis* - Problems are categorized by the type of output domain - If ${{\cal Y}}=\mathbb{R}$, this problem is called *regression* - If ${{\cal Y}}$ is a finite discrete set, the problem is called *classification* - If ${{\cal Y}}$ has 2 elements, the problem is called *binary classification* ## Steps to solving a supervised learning problem 1. Decide what the input-output pairs are. 2. Decide how to encode inputs and outputs. This defines the input space ${{\cal X}}$, and the output space ${{\cal Y}}$. (We will discuss this in detail later) 3. Choose model space/hypothesis class ${{\cal H}}$ . 4. ... ## Example: Choosing a model space ```{r echo=F} library(ggplot2) ex <- data.frame(x0=rep(1,10),x=c(0.86,0.09,-0.85,0.87,-0.44,-0.43,-1.10,0.40,-0.96,0.17), y=c(2.49,0.83,-0.25,3.10,0.87,0.02,-0.12,1.81,-0.83,0.43)) ggplot(ex,aes(x=x,y=y)) + geom_point(size=4) ``` ## Linear hypothesis (HTF 3, JWHT 3) \newcommand{\T}{\mathsf{T}} \newcommand{\x}{\mathbf{x}} \newcommand{\w}{\mathbf{w}} - Suppose $y$ was a linear function of ${\bf x}$: $$h_{\bf w}({\bf x}) = w_0 + w_1 x_1 + w_2 x_2 + \cdots$$ - $w_i$ are called *parameters* or *weights* (often $\beta_i$ in stats books) - Typically include an attribute $x_0=1$ (also called *bias term* or *intercept term*) so that the number of weights is $p+1$. We then write: $$h_{\bf w}({\bf x}) = \sum_{i=0}^p w_i x_i = \x^\T \w$$ where ${\bf w}$ and ${\bf x}$ are column vectors of size $p+1$. - The design matrix $X$ is now $n$ by $p+1$. ## Example: Design matrix with bias term
```{r echo=F,results='asis'} library(knitr) exb <- ex names(exb) <- c("x0","x1","y") kable(exb) ``` Models will be of the form $$ \begin{align} h_\w(\x) & = x_0 w_0 + x_1 w_1\\ & = w_0 + x_1 w_1 \end{align} $$ *How should we pick $\w$?*
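One way to make the question concrete (a small sketch reusing the `exb` data frame built above; the helper `sse` is ours, not from the original slides): score candidate weight vectors by their sum of squared errors on the training data. This is exactly the criterion developed on the next slides.

```{r}
# Sum of squared errors of the line w0 + w1*x1 on the example data
sse <- function(w) sum((exb$y - (w[1] + w[2] * exb$x1))^2)
sse(c(0.9, -0.4))   # an arbitrary guess, revisited on a later slide
sse(c(1.06, 1.61))  # the least-squares fit found later in these notes
```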
## Error minimization - Intuitively, $\bf w$ should make the predictions of $h_{\bf w}$ close to the true values $y_i$ on the training data - Define an *error function* or *cost function* to measure how much our prediction differs from the “true” answer on the training data - Pick $\bf w$ such that the error function is minimized - Hopefully, new examples are somehow “similar” to the training examples, and will also have small error. *How should we choose the error function?* ## Least mean squares (LMS) - Main idea: try to make $h_{\bf w}({\bf x})$ close to $y$ on the examples in the training set - We define a *sum-of-squares* error function $$J({\bf w}) = \frac{1}{2}\sum_{i=1}^n (h_{\bf w}({\bf x}_i)-y_i)^2$$ (the $1/2$ is just for convenience) - We will choose $\bf w$ so as to minimize $J(\bf w)$ - One way to do it: compute $\bf w$ such that: $$\frac{\partial}{\partial w_j}J({\bf w}) = 0,\,\, \forall j=0\dots p$$ ## Example: $w_0=0.9,w_1=-0.4$ {.smaller} ```{r echo=F} mod <- lm(y ~ x1, data=exb) mod$coefficients <- c(0.9, -0.4) #Put in example coefficients exb$d1pred <- predict(mod,exb) message(sprintf("SSE: %.3f",sum((exb$y - exb$d1pred)^2))) ggplot(exb,aes(x=x1,y=y,xend=x1,yend=d1pred)) + geom_point(size=4) + geom_abline(intercept=0.9,slope=-0.4,color="blue",size=1) + geom_segment(color="red") ``` ## OLS Fit to Example Data {.smaller} ```{r} mod <- lm(y ~ x1, data=exb); print(mod$coefficients) ``` ```{r echo=F} exb$d1pred <- predict(mod,exb) message(sprintf("SSE: %.3f",sum((exb$y - exb$d1pred)^2))) ggplot(exb,aes(x=x1,y=y,xend=x1,yend=d1pred)) + geom_point(size=4) + geom_smooth(method="lm",formula=y ~ x,se=F,n=200,na.rm=F) + geom_segment(color="red") ``` ## Solving a supervised learning problem 1. Decide what the input-output pairs are. 2. Decide how to encode inputs and outputs. This defines the input space ${{\cal X}}$, and the output space ${{\cal Y}}$. 3. Choose a class of models/hypotheses ${{\cal H}}$. 4. Choose an error function (cost function) to define the best model in the class 5. Choose an algorithm for searching efficiently through the space of models to find the best. ## Recurrence Time from Tumor Radius {.smaller} ```{r message=F} mod <- lm(Time ~ Radius.Mean, data=bc %>% filter(Outcome == 'R')); print(mod$coefficients) ``` ```{r echo=F} ggplot(bc %>% filter(Outcome == 'R'),aes(x=Radius.Mean,y=Time)) + geom_point(size=4) + geom_smooth(method="lm",formula=y ~ x,se=F,n=200,na.rm=F) ``` ## Notation reminder - Consider a function $J(u_1,u_2,\ldots,u_p):\mathbb{R}^p\mapsto\mathbb{R}$ (for us, this will usually be an error function) - The *gradient* $\nabla J(u_1,u_2,\ldots,u_p):\mathbb{R}^p\mapsto\mathbb{R}^p$ is a function which outputs a vector containing the partial derivatives.\ That is: $$\nabla J = \left\langle{\frac{\partial}{\partial u_1}}J,{\frac{\partial}{\partial u_2}}J,\ldots,{\frac{\partial}{\partial u_p}}J\right\rangle$$ - If $J$ is differentiable and convex, we can find the global minimum of $J$ by solving $\nabla J = \mathbf{0}$. - The partial derivative is the derivative along the $u_i$ axis, keeping all other variables fixed.
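For example (a small worked case, not from the original slides), take $J(u_1,u_2) = (u_1-1)^2 + 2(u_2+3)^2$. Then

\[ \nabla J = \left\langle 2(u_1 - 1),\; 4(u_2 + 3) \right\rangle, \]

and since $J$ is convex and differentiable, solving $\nabla J = \mathbf{0}$ gives the global minimum at $(u_1,u_2) = (1,-3)$.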
## The Least Squares Solution (HTF 2.6, 3.2, JWHT 3.1) \newcommand{\y}{\mathbf{y}} - Recalling some multivariate calculus, and dropping the constant factor of $\frac{1}{2}$ from $J$ (it does not change the minimizer): $$\begin{aligned} \nabla_\w J & = & \nabla_\w(X\w-{\y})^{\T}(X\w-{\y}{}) \\ & = & \nabla_\w(\w^\T X^\T-{\y^\T})(X\w-{\y}{}) \\ & = & \nabla_\w(\w^{\T}X^{\T}X\w-{\y}^{\T}X\w-\w^{\T}X^{\T}{\y}{}+{\y}{}^{\T}{\y}{}) \\ & = & \nabla_\w(\w^{\T}X^{\T}X\w-2{\y}^{\T}X\w+{\y}{}^{\T}{\y}{}) \\ & = & 2 X^{\T}X \w- 2 X^{\T}{\y} \end{aligned}$$ - Setting gradient equal to zero: $$\begin{aligned} 2 X^{\T}X \w- 2 X^{\T}{\y}{} & = & 0 \\ \Rightarrow X^{\T}X \w& = & X^{\T}{\y}{} \\ \Rightarrow \w= (X^{\T}X)^{-1}X^{\T}{\y}{}\end{aligned}$$ - The inverse exists if the columns of $X$ are linearly independent. ## Example of linear regression
```{r echo=F,results='asis'} library(knitr) exb <- ex names(exb) <- c("x0","x1","y") kable(exb) ``` ```{r fig.width=6,echo=F,message=F} ggplot(exb,aes(x=x1,y=y)) + geom_point(size=4) + geom_smooth(method="lm",formula=y ~ x,se=F,n=200,na.rm=F) #mod <- lm(y ~ x1, data=exb); print(mod$coefficients) ``` $h_{\bf w} ({\bf x}) = 1.06 + 1.61 x_1$
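As a sanity check (a sketch, not part of the original slides), the closed-form solution $(X^{\T}X)^{-1}X^{\T}{\y}$ can be computed directly from the `ex` data frame and compared with `lm`:

```{r}
# Build the design matrix with a bias column and solve the normal equations
Xmat <- cbind(1, ex$x)                              # n x (p+1), first column is x0 = 1
yvec <- ex$y
w_hat <- solve(t(Xmat) %*% Xmat, t(Xmat) %*% yvec)  # solves (X'X) w = X'y
drop(w_hat)
coef(lm(y ~ x, data = ex))                          # lm() gives the same coefficients
```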
## Data matrices $$X=\left[\begin{array}{rr} 1 & 0.86 \\ 1 & 0.09 \\ 1 & -0.85 \\ 1 & 0.87 \\ 1 & -0.44 \\ 1 & -0.43 \\ 1 & -1.10 \\ 1 & 0.40 \\ 1 & -0.96 \\ 1 & 0.17 \end{array}\right]~~~~~{\y}{}=\left[\begin{array}{r} 2.49 \\ 0.83 \\ -0.25 \\ 3.10 \\ 0.87 \\ 0.02 \\ -0.12 \\ 1.81 \\ -0.83 \\ 0.43 \end{array}\right]$$ ## $X^{\T}X$ $$X^{\T}X =$$ $${\tiny \left[\begin{array}{rrrrrrrrrr} 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0.86 & 0.09 & -0.85 & 0.87 & -0.44 & -0.43 & -1.10 & 0.40 & -0.96 & 0.17 \end{array}\right]\times\left[\begin{array}{cc} 1 & 0.86 \\ 1 & 0.09 \\ 1 & -0.85 \\ 1 & 0.87 \\ 1 & -0.44 \\ 1 & -0.43 \\ 1 & -1.10 \\ 1 & 0.40 \\ 1 & -0.96 \\ 1 & 0.17 \end{array}\right]}$$ $$=\left[\begin{array}{rr} 10 & -1.39 \\ -1.39 & 4.95 \end{array}\right]$$ ## $X^{\T}{\y}{}$ $$X^{\T}{\y}{}=$$ $${\tiny \left[\begin{array}{rrrrrrrrrr} 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0.86 & 0.09 & -0.85 & 0.87 & -0.44 & -0.43 & -1.10 & 0.40 & -0.96 & 0.17 \end{array}\right]\times\left[\begin{array}{c} 2.49 \\ 0.83 \\ -0.25 \\ 3.10 \\ 0.87 \\ 0.02 \\ -0.12 \\ 1.81 \\ -0.83 \\ 0.43 \end{array}\right]}$$ $$=\left[\begin{array}{r} 8.34 \\ 6.49 \end{array}\right]$$ ## Solving for ${\bf w}$ $${\bf w}=(X^{\T}X)^{-1}X^{\T}{\y}{} = \left[\begin{array}{rr} 10 & -1.39 \\ -1.39 & 4.95 \end{array}\right]^{-1}\left[\begin{array}{r} 8.34 \\ 6.49 \end{array}\right] = \left[\begin{array}{r} 1.06 \\ 1.61 \end{array}\right]$$ So the best fit line is $y=1.06 + 1.61x$. ## Linear regression summary - The optimal solution (minimizing sum-squared-error) can be computed in polynomial time in the size of the data set. - The solution is ${\bf w}=(X^{\T}X)^{-1}X^{\T}{\y}{}$, where $X$ is the data matrix augmented with a column of ones, and ${\y}{}$ is the column vector of target outputs. - A very rare case in which an analytical, exact solution is possible ## Is linear regression enough? - Linear regression should be the **first thing** you try for real-valued outputs! - ...but it is sometimes not expressive enough. - Two possible solutions: 1. Explicitly transform the data, i.e. create additional features - Add cross-terms, higher-order terms - More generally, apply a transformation of the inputs from ${\cal X}$ to some other space ${\cal X}'$, then do linear regression in the transformed space 2. Use a different model space/hypothesis class - Idea (1) and idea (2) are two views of the strategy. Today we focus on the first approach ## Polynomial fits (HTF 2.6, JWHT 7.1) - Suppose we want to fit a higher-degree polynomial to the data.\ (E.g., $y=w_0 + w_1x_1+w_2 x_1^2$.) - Suppose for now that there is a single input variable $x_{i,1}$ per training sample. - How do we do it? ## Answer: Polynomial regression - Given data: $(x_{1,1},y_1), (x_{1,2},y_2), \ldots, (x_{1,n},y_n)$. - Suppose we want a degree-$d$ polynomial fit. - Let ${\y}{}$ be as before and let $$X=\left[\begin{array}{rrrrr} 1 & x_{1,1} & x_{1,1}^2 & \ldots & x_{1,1}^d \\ 1 & x_{1,2} & x_{1,2}^2 & \ldots & x_{1,2}^d \\ \vdots & & \vdots & \vdots & \vdots \\ 1 & x_{1,n} & x_{1,n}^2 & \ldots & x_{1,n}^d \\ \end{array}\right]$$ - We are **making up features** to add to our design matrix - Solve the linear regression $X{\bf w}\approx {\y}{}$. 
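In R, this is just a matter of adding columns to the design matrix. A short sketch (using the `ex` data from earlier; it reproduces the order-2 example worked through by hand on the next few slides):

```{r}
# Degree-2 polynomial features: columns 1, x, x^2
Xq <- cbind(1, ex$x, ex$x^2)
wq <- solve(t(Xq) %*% Xq, t(Xq) %*% ex$y)
drop(wq)                                  # approximately (0.74, 1.75, 0.69)
coef(lm(y ~ x + I(x^2), data = ex))       # the same fit via lm()
```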
## Example of quadratic regression: Data matrices $$X=\left[\begin{array}{rrr} 1 & 0.86 & 0.75 \\ 1 & 0.09 & 0.01 \\ 1 & -0.85 & 0.73 \\ 1 & 0.87 & 0.76 \\ 1 & -0.44 & 0.19 \\ 1 & -0.43 & 0.18 \\ 1 & -1.10 & 1.22 \\ 1 & 0.40 & 0.16 \\ 1 & -0.96 & 0.93 \\ 1 & 0.17 & 0.03 \end{array}\right]~~~~~{\y}{}=\left[\begin{array}{r} 2.49 \\ 0.83 \\ -0.25 \\ 3.10 \\ 0.87 \\ 0.02 \\ -0.12 \\ 1.81 \\ -0.83 \\ 0.43 \end{array}\right]$$ ## $X^{\T}X$ $$X^{\T}X =$$ $${\tiny \hspace{-0.3in}\left[\begin{array}{rrrrrrrrrr} 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0.86 & 0.09 & -0.85 & 0.87 & -0.44 & -0.43 & -1.10 & 0.40 & -0.96 & 0.17 \\ 0.75 & 0.01 & 0.73 & 0.76 & 0.19 & 0.18 & 1.22 & 0.16 & 0.93 & 0.03 \end{array}\right]\times\left[\begin{array}{rrr} 1 & 0.86 & 0.75 \\ 1 & 0.09 & 0.01 \\ 1 & -0.85 & 0.73 \\ 1 & 0.87 & 0.76 \\ 1 & -0.44 & 0.19 \\ 1 & -0.43 & 0.18 \\ 1 & -1.10 & 1.22 \\ 1 & 0.40 & 0.16 \\ 1 & -0.96 & 0.93 \\ 1 & 0.17 & 0.03 \end{array}\right]}$$ $$=\left[\begin{array}{rrr} 10 & -1.39 & 4.95 \\ -1.39 & 4.95 & -1.64 \\ 4.95 & -1.64 & 4.11 \end{array}\right]$$ ## $X^{\T}{\y}{}$ $$X^{\T}{\y}{}=$$ $${\tiny \hspace{-0.3in}\left[\begin{array}{rrrrrrrrrr} 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0.86 & 0.09 & -0.85 & 0.87 & -0.44 & -0.43 & -1.10 & 0.40 & -0.96 & 0.17 \\ 0.75 & 0.01 & 0.73 & 0.76 & 0.19 & 0.18 & 1.22 & 0.16 & 0.93 & 0.03\\ \end{array}\right]\times\left[\begin{array}{r} 2.49 \\ 0.83 \\ -0.25 \\ 3.10 \\ 0.87 \\ 0.02 \\ -0.12 \\ 1.81 \\ -0.83 \\ 0.43 \end{array}\right]}$$ $$=\left[\begin{array}{r} 8.34 \\ 6.49 \\ 3.60 \end{array}\right]$$ ## Solving for ${\bf w}$ $${\bf w}=(X^{\T}X)^{-1}X^{\T}{\y}{} = {\tiny \left[\begin{array}{rrr} 10 & -1.39 & 4.95 \\ -1.39 & 4.95 & -1.64 \\ 4.95 & -1.64 & 4.11 \\ \end{array}\right]^{-1}\left[\begin{array}{r} 8.34 \\ 6.49 \\ 3.60 \end{array}\right] = \left[\begin{array}{r} 0.74 \\ 1.75 \\ 0.69 \end{array}\right]}$$ So the best order-2 polynomial is $y=0.74 + 1.75 x + 0.69x^2$. ## Data and linear fit ```{r echo=F} options(digits=2,width=120) fitplots_aes <- aes(x=x,y=y,xend=x,yend=pred) pymin=-1;pymax=3.3 ``` ```{r echo=F} form <- y ~ x; mod <- lm(form, data=ex); print(mod$coefficients); ex$pred <- predict(mod,ex) f1 <- ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") f1 ``` ## Data and quadratic fit ```{r echo=F} form <- y ~ x + I(x^2); mod <- lm(form, data=ex); print(mod$coefficients); ex$pred <- predict(mod,ex) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data? ## Order-3 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3); mod <- lm(form, data=ex); print(mod$coefficients); ex$pred <- predict(mod,ex) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data? ## Order-4 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3) + I(x^4); mod <- lm(form, data=ex); print(mod$coefficients); ex$pred <- predict(mod,ex) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data?
## Order-5 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5); mod <- lm(form, data=ex); ex$pred <- predict(mod,ex); print(mod$coefficients) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data? ## Order-6 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) mod <- lm(form, data=ex); ex$pred <- predict(mod,ex); print(mod$coefficients) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data? ## Order-7 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) mod <- lm(form, data=ex);print(mod$coefficients); ex$pred <- predict(mod,ex) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data? ## Order-8 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8); mod <- lm(form, data=ex);print(mod$coefficients); ex$pred <- predict(mod,ex) ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") ``` Is this a better fit to the data? ## Order-9 fit ```{r echo=F} form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8) + I(x^9) mod <- lm(form, data=ex);print(mod$coefficients); ex$pred <- predict(mod,ex) f9 <- ggplot(ex,fitplots_aes) + geom_point(size=4) + geom_smooth(method="lm",formula=form,se=F,n=200,na.rm=F) + coord_cartesian(ylim=c(pymin,pymax)) + geom_segment(color="red") f9 ``` Is this a better fit to the data? ## Evaluating Performance ```{r echo=F,fig.width=8.5} library(gridExtra) grid.arrange(f1,f9,nrow=1) ``` Which do you prefer and why? ## Performance of a Fixed Hypothesis
(HTF 7.1–7.4, JWHT 2.2, 5)
- Assume data $({{\x}},y)$ are drawn from some fixed distribution - Given a model $h$ (which could have come from anywhere), its *generalization error* is: $$J^*_h = E[L(h({{\bf X}}),Y)]$$ - Given a set of $n$ data points **from the same distribution**, we can compute the *empirical error* $$\hat{J}^*_h = \frac{1}{n}\sum_{i=1}^n L(h({{\x}}_i),y_i)$$ - $\hat{J}^*_h$ is an *unbiased* estimate of $J^*_h$ **so long as the data did not influence the choice of $h$.** - Can use $\hat{J}^*_h$ with CLT or bootstrap to get a C.I. for $J^*_h$. ## Test Error: The Gold Standard {.smaller} $$\hat{J}^*_h = \frac{1}{n}\sum_{i=1}^n L(h({{\x}}_i),y_i)$$ - $\hat{J}^*_h$ is an *unbiased* estimate of $J^*_h$ **so long as the $({{\x}}_i,y_i)$ do not influence $h$.** Can use $\hat{J}^*_h$ to get a confidence interval for $J^*_h$. - Gives a strong statistical guarantee about the true performance of our system, *if we didn’t use the test data to choose $h$.* - We can write "training error" for model class $\mathcal{H}$ on a given data set as $$\hat{J}_\mathcal{H} = \min_{h' \in \mathcal{H}} \frac{1}{n}\sum_{i=1}^n L(h'({{\x}}_i),y_i)$$ - Let the corresponding learned hypothesis be $$h^* = \arg\min_{h' \in \mathcal{H}} \frac{1}{n}\sum_{i=1}^n L(h'({{\x}}_i),y_i)$$ - Obviously, for any data set and any $h \in \mathcal{H}$, $\hat{J}_\mathcal{H} \le \hat{J}^*_h$. ## Model Selection and Performance 1. We would like to estimate the **generalization error** of our resulting predictor. 2. We would like to choose the best model space (e.g. linear, quadratic, ...) ## Problem 1: Estimating Generalization Error Training error $\hat{J}_\mathcal{H}$ systematically underestimates generalization error $J^*_{h^*}$ for the learned hypothesis $h^*$. ## Problem 2: Model Selection {.smaller} The more complex the model, the smaller the training error.
```{r fig.width=4,echo=F} f9 ``` - Training error of the degree-9 polynomial is 0. - Training error of the degree-9 polynomial *on any set of 10 points* (with distinct $x$ values) is 0.
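We can check this directly (a small sketch using the `ex` data and `lm`, as in the earlier fits): a degree-9 polynomial has 10 coefficients, so it interpolates the 10 training points and its residuals are numerically zero.

```{r}
# A degree-9 polynomial through 10 points: the largest residual is ~0
mod9 <- lm(y ~ poly(x, 9), data = ex)
max(abs(resid(mod9)))
```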
## Problem 2: Model Selection Smaller training error does not mean smaller generalization error. - Suppose $\mathcal{H}_1$ is the space of all linear functions, $\mathcal{H}_2$ is the space of all quadratic functions. Note $\mathcal{H}_1 \subset \mathcal{H}_2$. - Fix a data set. - Let $h^*_1 = \arg\min_{h' \in \mathcal{H}_1} \hat{J}^*_{h'}$ and $h^*_2 = \arg\min_{h' \in \mathcal{H}_2} \hat{J}^*_{h'}$, both computed using the same dataset. - We **must** have $\hat{J}^*_{h^*_2} \le \hat{J}^*_{h^*_1}$, *but we may have $J^*_{h_2} > J^*_{h_1}$*. ## Problem 2: Model Selection ```{r echo=F} set.seed(1) ntrain=10; ntest = 1000; dmax=9 test <- data.frame(x = rnorm(ntest)) test$y <- 1 - test$x + 2*(test$x)^2 - 4*(test$x)^3 + runif(ntest) train <- head(test,ntrain) test <- tail(test,ntest-ntrain) trainerr <- NULL; testerr <- NULL; for(deg in 1:dmax) { mod <- lm(y~poly(x,deg),data=train) trainerr[deg] <- mean((mod$residuals)^2) testerr[deg] <- mean((test$y - predict(mod,test))^2) } errs = data.frame(trainerr=trainerr,testerr=testerr,deg=1:dmax) ggplot(errs,aes(x=deg,y=log(trainerr))) + geom_point(colour="blue") + geom_point(aes(y=log(testerr)),colour="red") + ylab("log(error)") + xlab("degree") + scale_x_continuous(breaks=1:dmax) ``` Training error is no good for choosing the model space. ## Fixing Problem 1: Generalization Error Training error $\hat{J}_\mathcal{H}$ underestimates generalization error $J^*_h$ - If you really want a good estimate of $J^*_h$, you need a separate **test set** - (But new stat methods can produce a CI using training error) - Could report test error, then deploy whatever you train on the whole data. (Probably won’t be worse.) ## Fixing Problem 2: Model Selection Smaller training error does not mean smaller generalization error. - Small training error, large generalization error is known as **overfitting** - A separate **validation set** can be used for model selection. - Train on the training set using each proposed model space - evaluate each on the validation set, choose the one with lowest *validation* error ## Training, Model Selection, and Error Estimation - A general procedure for estimating the true error of a specific learned model using model selection - The data is randomly partitioned into three disjoint subsets: - A *training set* used only to find the parameters ${\bf w}$ - A *validation set* used to find the right model space (e.g., the degree of the polynomial) - A *test set* used to estimate the generalization error of the resulting model - Can generate standard confidence intervals for the generalization error of the learned model ## Problems with the Single-Partition Approach - Pros: - Measures what we want: Performance of the actual learned model. - Cons: - Smaller effective training sets make performance more variable. - Small validation sets can give poor model selection - Small test sets can give poor estimates of performance - For a test set of size 100, with 60 correct classifications, 95% C.I. for actual accuracy is $(0.497,0.698)$. ## k-fold cross-validation
(HTF 7.10, JWHT 5.1) - Divide the instances into $k$ disjoint partitions or folds of size $n/k$ - Loop through the partitions $i = 1 ... k$: - Partition $i$ is for evaluation (i.e., estimating the performance of the algorithm after learning is done) - The rest are used for training (i.e., choosing the specific model within the space) - *"Cross-Validation Error" is the average error on the evaluation partitions*. It has lower variance than the error on a single partition. - This is the main CV **idea**; CV is used for several different purposes, though. (A from-scratch code sketch of this idea appears on the final slide.) ## Misuse of CV (HTF 7.10.2) - $n=50$ examples, binary classification, balanced classes - $p = 5000$ features, all statistically independent of $y$ - Use feature selection to find the best $100$ features by correlation with $y$ on the entire dataset. - Use cross-validation with these $p = 100$ features to estimate the error. - CV-based error rate was 3%. - But the true error rate of *any* classifier here is 50%: the features were selected using the labels of **all** the data, so information about the held-out folds leaked into the model before CV began. ## k-fold cross-validation model selection
(HTF 7.10, JWHT 5.1) - Divide the instances into $k$ folds of size $n/k$. - Loop over $m$ model spaces $1 ... m$ - Loop over the $k$ folds $i = 1 ... k$: - Fold $i$ is for validation (i.e., estimating the performance of the algorithm after learning is done) - The rest are used for training (i.e., choosing the specific model within the space) - For each model space, report average error over folds, and standard error. ## CV for Model Selection ```{r echo=F,message=F,fig.height=6,fig.width=8.5} library(cvTools);library(gridExtra);library(grid) set.seed(1) ntrain=24; ntest = 10000; dmax=9 #test <- data.frame(x = rnorm(ntest)) test <- data.frame(x = 5*(runif(ntest)-0.5)) test$y <- 10*test$x - 4*(test$x)^3 + 10*rnorm(ntest) train <- head(test,ntrain) test <- tail(test,ntest-ntrain) trainerr <- NULL; testerr <- NULL; valerr <- NULL; ptplots <- list(); dstar <- NULL folds <- 6 fsize=ntrain/folds showdeg <- function(deg) { processfold <- function(i,deg) { vfold <- i vrange <- ((vfold-1)*fsize + 1):(vfold*fsize) vf <- train[vrange,] #Validation fold tr <- train[-vrange,] #Learn on training data mod <- lm(y~poly(x,deg),data=tr) #Test on validation data valerr <- mean((vf$y - predict(mod,vf))^2) plt <- ggplot(tr,aes(x=x,y=y)) + geom_point(color="red",size=2) + geom_point(data=vf,color="blue",size=3) + geom_smooth(data=tr,method="lm",formula=y ~ poly(x,deg),se=F,n=200,na.rm=F,fullrange=T) + ggtitle(sprintf("MSE: %.2f",valerr)) plt$valerr <- valerr plt } ptplots <- mapply(processfold,1:folds,rep(deg,folds),SIMPLIFY = F) serrors <- sapply(ptplots, function(e) e$valerr) grid.arrange(grobs=ptplots,top=textGrob(sprintf("Degree: %d, Mean MSE: %.2f, SE: %.2f", deg,mean(serrors),sd(serrors)/sqrt(folds)),gp=gpar(fontsize=20))) } showdeg(1) ``` ## CV for Model Selection ```{r echo=F,fig.height=6,fig.width=8.5} showdeg(2) ``` ## CV for Model Selection ```{r echo=F,fig.height=6,fig.width=8.5} showdeg(3) ``` ## CV for Model Selection ```{r echo=F,fig.height=6,fig.width=8.5} showdeg(4) ``` ## CV for Model Selection ```{r echo=F,fig.height=6,fig.width=8.5} showdeg(5) ``` ## CV for Model Selection ```{r echo=F,fig.height=6,fig.width=8.5} showdeg(9) ``` ## CV for Model Selection - Typically select "most parsimonious model whose error is no more than one standard error above the error of the best model." (HTF) ## Estimating "which is best"
vs. "performance of best" [Estimated errors using 290 model spaces.](http://bioinformatics.oxfordjournals.org/content/30/22/3152.long) ## Nested CV for Model Evaluation {.smaller} - Divide the instances into $k$ "outer" folds of size $n/k$. - Loop over the $k$ outer folds $i = 1 ... k$: - Fold $i$ is for testing; all others for training. - Divide the training instances into $k'$ "inner" folds of size $(n - n/k)/k'$. - Loop over $m$ model spaces $1 ... m$ - Loop over the $k'$ inner folds $j = 1 ... k'$: - Fold $j$ is for validation - The rest are used for training - Use average error over folds and SE to choose model space. - Train on all inner folds. - Test the model on outer test fold ## Nested CV for Model Evaluation ```{r echo=F,fig.height=6,fig.width=8.5} library(cvTools) set.seed(1) processtestfold <- function(i) { trange <- ((i-1)*fsize + 1):(i*fsize) tf <- train[trange,] #Test fold tr <- train[c(-trange),] thefolds <- cvFolds(nrow(tr), K = folds, R = 1) valerr <- NULL for (deg in 1:min(dmax,nrow(tr)-1)) { thecall <- call("lm",formula=y~poly(x,deg)) valerr[deg] <- cvTool(call=thecall,data=tr,y=tr$y,folds=thefolds) } dstar <- which.min(valerr) modstar <- lm(y~poly(x,dstar),data=tr) testerr <- mean((tf$y - predict(modstar,tf))^2) plt <- ggplot(tr,aes(x=x,y=y)) + geom_point(color="red",size=2) + geom_point(data=tf,color="green3",size=3) + geom_smooth(data=tr,method="lm",formula=y ~ poly(x,dstar),se=F,n=200,na.rm=F,fullrange=T) + ggtitle(sprintf("Degree: %d, Error: %.2f",dstar,testerr)) plt$testerr <- testerr plt } ptplots <- lapply(1:folds,processtestfold) terrors <- sapply(ptplots, function(e) e$testerr) grid.arrange(grobs=ptplots,top=textGrob(sprintf("Mean MSE: %.2f, SE: %.2f", mean(terrors),sd(terrors)/sqrt(folds)),gp=gpar(fontsize=20))) ``` ## Generalization Error for degree 3 model ```{r echo=F} dstar <- 3 bestmod <- lm(y~poly(x,dstar),data=train) testerr <- mean((test$y - predict(bestmod,test))^2) ggplot(train,aes(x=x,y=y)) + geom_point(color="red",size=2) + geom_smooth(data=train,method="lm",formula=y ~ poly(x,dstar),se=F,n=200,na.rm=F,fullrange=T) + ggtitle(sprintf("Degree: %d, Error: %.2f on test set of size 10000",dstar,testerr)) ``` Minimum-CV Estimate: 128.48, Nested CV Estimate: 149.91 ## Bias-correction for the CV Procedure [Cawley, Talbot. On Over-fitting in Model Selection and Subsequent Selection Bias in Performance Evaluation. JMLR v.11, 2010.](http://jmlr.org/papers/volume11/cawley10a/cawley10a.pdf) [Tibshirani, Tibshirani. A bias correction for the minimum error rate in cross-validation. arXiv. 2009.](http://arxiv.org/abs/0908.2904) [Ding et al. Bias correction for selecting the minimal-error classifier from many machine learning models. Bioinformatics 30 (22). 2014.](http://bioinformatics.oxfordjournals.org/content/30/22/3152.long) ## Summary - The training error decreases with the degree of the polynomial $M$, i.e. *the complexity (size) of the model space* - Generalization error decreases at first, then starts increasing - Set aside a *validation set* helps us find a good model space - We then can report unbiased error estimate, using a *test set*, **untouched during both parameter training and validation** - Cross-validation is a lower-variance but possibly biased version of this approach. It is standard. - ***If you have lots of data, just use held-out validation and test sets.***