---
title: "Introduction to Statistical Learning"
author: "Dan Lizotte"
date: "`r Sys.Date()`"
output:
  pdf_document:
    df_print: tibble
  html_document:
    df_print: paged
  ioslides_presentation:
    css: ../my_ioslides.css
    df_print: paged
---
```{r echo=FALSE,warning=FALSE,message=FALSE}
library(ISLR)
library(dplyr)
library(tibble)
```
## Carseats Data {.smaller}
A simulated dataset containing sales of child car seats at 400 different stores
```{r rows.print=15}
# Print the Carseats data frame (provided by the ISLR package loaded above).
Carseats
```
# Simple questions: Summaries of one variable
## Data summaries
A "statistic" is the result of applying a function (summary) to the data: `statistic <- function(data)`
E.g.\ ranks: Min, Quantiles, Median, Mean, Max
```{r}
# Min, quartiles, median, mean, and max of the Sales column.
summary(Carseats[["Sales"]])
```
*Roughly*, a quantile for a proportion $p$ is a value $x$ for which $p$ of the data are less than or equal to $x$. The first quartile, median, and third quartile are the quantiles for $p=0.25$, $p=0.5$, and $p=0.75$, respectively.
## Visual Summary 1: Box Plot {.smaller}
```{r warning=FALSE,message=FALSE}
library(ggplot2)

# Numeric summary of Sales, followed by the same distribution as a box plot.
summary(Carseats[["Sales"]])

# The constant x value "All" places every store in a single box;
# coord_flip() turns the box on its side so Sales runs horizontally.
ggplot(Carseats, aes(x = "All", y = Sales)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = NULL)
```
## Visual Summary 2: Jitter Plot {.smaller}
```{r warning=FALSE,message=FALSE}
library(ggplot2)
library(gridExtra)

# Jitter plot, a relative of the box plot: one point per store.
# height = 0 leaves the Sales values unperturbed; width = 0.25 spreads the
# points along the constant "All" axis so they do not overlap.
ggplot(Carseats, aes(x = "All", y = Sales)) +
  geom_jitter(position = position_jitter(width = 0.25, height = 0)) +
  coord_flip() +
  labs(x = NULL)
```
## Visual Summary 3: Histogram {.smaller}
```{r warning=FALSE,message=FALSE}
# Histogram of Sales; bar height is the number of observations in each bin.
# after_stat(count) replaces the deprecated `..count..` notation.
ggplot(Carseats, aes(x = Sales)) +
  labs(y = "Count") +
  geom_histogram(aes(y = after_stat(count)))
```
# Complex questions: Relationships
## Relationships between variables {.smaller}
```{r message=F,warning=F,echo=F}
library(ggplot2)

# Scatter plot of Sales (vertical axis) against Price (horizontal axis).
ggplot(Carseats, aes(Price, Sales)) +
  geom_point()
```
## All of Supervised Learning
Proposal:
$$
Y = f(X) + \epsilon
$$
1. Here is some data
2. Tell me what $f$ is
## Example: linear fit {.smaller}
```{r}
# Least-squares fit of Sales as a linear function of Price; print the
# estimated intercept and slope.
csform <- Sales ~ Price
csmod <- lm(csform, data = Carseats)
print(coef(csmod))

# Scatter plot with the fitted line overlaid. The smoothing formula is given
# explicitly so geom_smooth() does not emit its "using formula" message into
# the rendered slide.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
```
## Fitting by Minimizing Error
```{r echo=F}
# Store the fitted value for every row so the residuals can be drawn.
Carseats$pred <- predict(csmod, Carseats)

# Points, the least-squares line (no confidence band), and a red vertical
# segment from each observation (Price, Sales) to its fitted value
# (Price, pred). TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(Carseats, aes(x = Price, y = Sales, xend = Price, yend = pred)) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(0, 17)) +
  geom_segment(color = "red")
```
## What kind of $f$ are you looking for?
```{r echo=F}
library(ggplot2)

# Ten-point toy dataset used by the polynomial-fit slides that follow.
# x0 is a constant column of ones.
ex <- data.frame(
  x0 = rep(1, times = 10),
  x = c(0.86, 0.09, -0.85, 0.87, -0.44, -0.43, -1.10, 0.40, -0.96, 0.17),
  y = c(2.49, 0.83, -0.25, 3.10, 0.87, 0.02, -0.12, 1.81, -0.83, 0.43)
)

# Raw data only, before any model is fitted.
ggplot(ex, aes(x, y)) +
  geom_point(size = 4)

# Disabled alternative: run the same slides on the Carseats data instead.
# ex <- Carseats; ex$y <- ex$Sales; ex$x <- ex$Price
```
## Data and linear fit
```{r echo=F}
# Print settings for the coefficient output on the following slides.
options(digits = 2, width = 120)

# Aesthetics shared by every fit plot: a point at (x, y) and a segment from
# that point to its prediction at (x, pred).
fitplots_aes <- aes(x = x, y = y, xend = x, yend = pred)

# Common y-axis window so all fit plots are directly comparable.
pymin <- -1
pymax <- 3.3
# Limits for the disabled Carseats alternative: pymin = 0; pymax = 17.5
```
```{r echo=F}
# Degree-1 (linear) least-squares fit to the toy data: print the coefficients
# and store the fitted values for the residual segments.
form <- y ~ x
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# The plot is kept as f1 for the side-by-side comparison later in the deck.
# TRUE/FALSE are spelled out because T/F can be reassigned.
f1 <- ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
f1
```
## Data and quadratic fit
```{r echo=F}
# Degree-2 (quadratic) least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-3 fit
```{r echo=F}
# Degree-3 polynomial least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-4 fit
```{r echo=F}
# Degree-4 polynomial least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-5 fit
```{r echo=F}
# Degree-5 polynomial least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-6 fit
```{r echo=F}
# Degree-6 polynomial least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-7 fit
```{r echo=F}
# Degree-7 polynomial least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-8 fit
```{r echo=F}
# Degree-8 polynomial least-squares fit: print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) +
  I(x^6) + I(x^7) + I(x^8)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# TRUE/FALSE are spelled out because T/F can be reassigned.
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```
Is this a better fit to the data?
## Order-9 fit
```{r echo=F}
# Degree-9 polynomial least-squares fit: the model has ten coefficients
# (intercept plus nine powers of x). Print the coefficients and store the
# fitted values for the residual segments.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) +
  I(x^6) + I(x^7) + I(x^8) + I(x^9)
mod <- lm(form, data = ex)
print(coef(mod))
ex$pred <- predict(mod, ex)

# Points, fitted curve (no confidence band), and red residual segments.
# The plot is kept as f9 for the side-by-side comparison later in the deck.
# TRUE/FALSE are spelled out because T/F can be reassigned.
f9 <- ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form,
    se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
f9
```
Is this a better fit to the data?
## Evaluating Performance
```{r echo=F,fig.width=8.5}
library(gridExtra)

# Show the linear fit (f1) and the order-9 fit (f9) next to each other in a
# single row.
grid.arrange(grobs = list(f1, f9), nrow = 1)
```
Which do you prefer and why?
## Recommended exercises
- JWHT 2.3 Lab: Introduction to R
(Or, follow along and see if you can do it in Python.)