# DATA SCIENCE — presentation 20.11.18
#
# How R works: R is an object-based language; everything is an object.

# Assignment: use `<-`, the established R convention (`=` also works but
# is discouraged for top-level assignment).
a <- 10
b <- 5

# Now we can work with our objects
a + b
sqrt(a + b)

# R is powerful for vectors; define them with c(...)
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6)

# Calculations with vectors are element-wise
v1 * 5
v1 * v2

# For the scalar (dot) product, use the matrix-multiplication operator
v1 %*% v2

#################################################
# LET'S USE REAL DATA
#################################################

# We want to import the Advertising.csv file.
# NOTE: read.csv() is part of base R — no extra package is needed for CSV.
# The "foreign" package is only required for Stata/SPSS/etc. file formats;
# it is loaded here only because later examples may use it.
library(foreign)

# Prefer a path relative to the working directory over a machine-specific
# absolute path, so the script runs on any computer.
adv <- read.csv("Advertising.csv")
#adv <- read.csv("C:/Users/FSR WIWI 18/Downloads/Advertising.csv")

# Avoid attach(): refer to columns explicitly with `$` (or `data =` in
# modeling functions) so it is always clear where a variable comes from.

# Make the three scatterplots, just like in the first lab
par(mfrow = c(1, 3))
plot(adv$TV, adv$sales, col = "red", pch = 16)
plot(adv$radio, adv$sales, col = "darkblue", pch = 3)
plot(adv$newspaper, adv$sales, col = "darkgreen")

# Simple linear regression of sales on TV.
# lm(Y ~ X, data = ...) produces a linear model for Y with predictor X;
# we store the fitted model in the object model.tv.
model.tv <- lm(sales ~ TV, data = adv)
summary(model.tv)  # interpret the summary!

# Now include the regression line in the scatter plot
par(mfrow = c(1, 1))
plot(adv$TV, adv$sales, col = "red")
abline(model.tv, lty = 3)
abline(model.tv, lwd = 5)

# Multiple regression: add more regressors/predictors with `+`
# lm(Y ~ X1 + X2 + X3, data = ...)
model <- lm(sales ~ TV + radio + newspaper, data = adv)
summary(model)  # interpret the model!
###########################################################
# Correlation matrix for all variables of Advertising.csv
cor(adv)

# Scatterplot matrix
pairs(adv)

#########################################################
# Customizable plots: histogram of 100,000 draws from a
# standard normal distribution
hist(rnorm(100000), breaks = 100)

# A bit more fancy:
hist(rnorm(100000), main = "YEAH", breaks = 100, col = rainbow(50))

################################################
# If time allows: KNN
#
# For a demonstration of the KNN approach we use the stock market data
# set (example as in the book): daily returns of the S&P 500 between
# 2001 and 2005. Lag1 to Lag5 are the percentage returns of the 5
# previous trading days; Direction records whether the market went
# Up or Down.
library(class)
library(ISLR)
#names(Smarket)
#plot(Smarket$Volume)

# Avoid attach(): reference Smarket columns explicitly with `$`.
# First, we divide the data into training and test sets:
# the years 2001-2004 serve as training data, 2005 as test data.
# Predictor matrices use only Lag1 and Lag2.
train.X <- cbind(Smarket$Lag1, Smarket$Lag2)[Smarket$Year < 2005, ]
test.X  <- cbind(Smarket$Lag1, Smarket$Lag2)[Smarket$Year >= 2005, ]
dir.training <- Smarket$Direction[Smarket$Year < 2005]
dir.test     <- Smarket$Direction[Smarket$Year >= 2005]

# knn() needs four arguments:
# 1) predictors for the training period: train.X (matrix)
# 2) predictors for the test period:     test.X (matrix)
# 3) training observations of Direction: dir.training (vector)
# 4) the number of neighbors k; we choose 3
set.seed(1)  # knn() breaks ties at random, so set a seed for reproducibility
knn.pred <- knn(train.X, test.X, dir.training, k = 3)

# Confusion table: KNN predictions vs actual values of dir.test
table(knn.pred, dir.test)

# Test-set accuracy. Compute it from the prediction vector itself rather
# than hard-coding cell counts like (48+86)/length(knn.pred), which are
# specific to one particular run of the table above.
mean(knn.pred == dir.test)
# roughly 53 percent of the predicted values are correct

# END