---
title: "Introduction to Statistical Learning"
author: "Dan Lizotte"
date: "`r Sys.Date()`"
output:
  pdf_document:
    df_print: tibble
  html_document:
    df_print: paged
  ioslides_presentation:
    css: ../my_ioslides.css
    df_print: paged
---

```{r echo=FALSE, warning=FALSE, message=FALSE}
# Packages used throughout the deck; ISLR provides the Carseats data.
library(ISLR)
library(dplyr)
library(tibble)
```

## Carseats Data {.smaller}

A simulated dataset containing sales of child car seats at 400 different stores.

```{r rows.print=15}
Carseats
```

# Simple questions: Summaries of one variable

## Data summaries

A "statistic" is the result of applying a function (summary) to the data: `statistic <- function(data)`

E.g.\ ranks: Min, Quantiles, Median, Mean, Max

```{r}
summary(Carseats$Sales)
```

*Roughly*, a quantile for a proportion $p$ is a value $x$ for which $p$ of the data are less than or equal to $x$. The first quartile, median, and third quartile are the quantiles for $p=0.25$, $p=0.5$, and $p=0.75$, respectively.

## Visual Summary 1: Box Plot {.smaller}

```{r warning=FALSE, message=FALSE}
library(ggplot2)
summary(Carseats$Sales)
# Single horizontal box plot of Sales; x is a constant label, so its axis
# title is dropped.
ggplot(Carseats, aes(x = "All", y = Sales)) +
  labs(x = NULL) +
  geom_boxplot() +
  coord_flip()
```

## Visual Summary 2: Jitter Plot {.smaller}

```{r warning=FALSE, message=FALSE}
library(ggplot2)
library(gridExtra)
# Box plot relative: jitter plot. Points are jittered only along the
# constant "All" axis (height = 0), so Sales values are shown exactly.
ggplot(Carseats, aes(x = "All", y = Sales)) +
  labs(x = NULL) +
  geom_jitter(position = position_jitter(height = 0, width = 0.25)) +
  coord_flip()
```

## Visual Summary 3: Histogram {.smaller}

```{r warning=FALSE, message=FALSE}
# Histogram of Sales. Bar height is the per-bin count (the default for
# geom_histogram), and bins = 30 spells out the default bin count.
ggplot(Carseats, aes(x = Sales)) +
  labs(y = "Count") +
  geom_histogram(bins = 30)
```

# Complex questions: Relationships

## Relationships between variables {.smaller}

```{r message=FALSE, warning=FALSE, echo=FALSE}
library(ggplot2)
# Scatter plot of Sales against Price.
ggplot(data = Carseats, aes(x = Price, y = Sales)) +
  geom_point()
```

## All of Supervised Learning

Proposal: $$ Y = f(X) + \epsilon $$

1. Here is some data
2. 
Tell me what $f$ is

## Example: linear fit {.smaller}

```{r}
csform <- Sales ~ Price
csmod <- lm(csform, data = Carseats)
print(csmod$coefficients)
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
```

## Fitting by Minimizing Error

```{r echo=FALSE}
# Store the fitted value for every row so it can be used as a plot aesthetic.
Carseats$pred <- predict(csmod, Carseats)
# Red segments join each observation to its fitted value at the same Price.
ggplot(Carseats, aes(x = Price, y = Sales, xend = Price, yend = pred)) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(0, 17)) +
  geom_segment(color = "red")
```

## What kind of $f$ are you looking for?

```{r echo=FALSE}
library(ggplot2)
# Small hand-entered example with ten observations; x0 is a constant
# column of ones.
ex <- data.frame(
  x0 = rep(1, 10),
  x = c(0.86, 0.09, -0.85, 0.87, -0.44, -0.43, -1.10, 0.40, -0.96, 0.17),
  y = c(2.49, 0.83, -0.25, 3.10, 0.87, 0.02, -0.12, 1.81, -0.83, 0.43)
)
ggplot(ex, aes(x = x, y = y)) +
  geom_point(size = 4)
# Disabled alternative: use the Carseats data as the example instead.
# ex <- Carseats; ex$y <- ex$Sales; ex$x <- ex$Price
```

## Data and linear fit

```{r echo=FALSE}
options(digits = 2, width = 120)
# Aesthetics shared by all fit plots: xend/yend make geom_segment draw a
# vertical line from each point to its prediction.
fitplots_aes <- aes(x = x, y = y, xend = x, yend = pred)
# Common visible y-range for every fit plot.
pymin <- -1
pymax <- 3.3
# Disabled alternative limits (presumably for the Carseats variant of `ex`).
# pymin <- 0; pymax <- 17.5
```

```{r echo=FALSE}
form <- y ~ x
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
# Kept as f1 so a later slide can show it next to another fit.
f1 <- ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
f1
```

## Data and quadratic fit

```{r echo=FALSE}
form <- y ~ x + I(x^2)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?


## Order-3 fit

```{r echo=FALSE}
# Each fit slide follows the same recipe: fit the polynomial, print its
# coefficients, store the predictions in ex$pred, then plot the data, the
# fitted curve, and red segments from each point to its prediction.
form <- y ~ x + I(x^2) + I(x^3)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?

## Order-4 fit

```{r echo=FALSE}
form <- y ~ x + I(x^2) + I(x^3) + I(x^4)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?

## Order-5 fit

```{r echo=FALSE}
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?

## Order-6 fit

```{r echo=FALSE}
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?

## Order-7 fit

```{r echo=FALSE}
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?


## Order-8 fit

```{r echo=FALSE}
# Same recipe as the other fit slides: fit, print coefficients, store the
# predictions in ex$pred, then plot data, fitted curve, and red segments
# from each point to its prediction.
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) + I(x^8)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
```

Is this a better fit to the data?

## Order-9 fit

```{r echo=FALSE}
form <- y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) +
  I(x^8) + I(x^9)
mod <- lm(form, data = ex)
print(mod$coefficients)
ex$pred <- predict(mod, ex)
# Kept as f9 so the next slide can show it beside the linear fit.
f9 <- ggplot(ex, fitplots_aes) +
  geom_point(size = 4) +
  geom_smooth(
    method = "lm", formula = form, se = FALSE, n = 200, na.rm = FALSE
  ) +
  coord_cartesian(ylim = c(pymin, pymax)) +
  geom_segment(color = "red")
f9
```

Is this a better fit to the data?

## Evaluating Performance

```{r echo=FALSE, fig.width=8.5}
library(gridExtra)
# Linear fit (f1) and order-9 fit (f9) side by side in one row.
grid.arrange(f1, f9, nrow = 1)
```

Which do you prefer and why?

## Recommended exercises

- JWHT 2.3 Lab: Introduction to R
  (Or, follow along and see if you can do it in Python.)