# DATA SCIENCE — presentation 20.11.18
#
# How R works: R is an object-based language; everything is an object.

# Assignment: use `<-`, the established R convention (`=` also works but
# is discouraged for top-level assignment).
a <- 10
b <- 5

# Now we can work with our objects
a + b
sqrt(a + b)

# R is powerful for vectors; define them with c(...)
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6)

# Calculations with vectors are element-wise
v1 * 5
v1 * v2

# For the scalar (dot) product, use the matrix-multiplication operator
v1 %*% v2

#################################################
# LET'S USE REAL DATA
#################################################

# We want to import the Advertising.csv file.
# NOTE: read.csv() is part of base R — no extra package is needed for CSV.
# The "foreign" package is only required for Stata/SPSS/etc. file formats;
# it is loaded here only because later examples may use it.
library(foreign)

# Prefer a path relative to the working directory over a machine-specific
# absolute path, so the script runs on any computer.
adv <- read.csv("Advertising.csv")
#adv <- read.csv("C:/Users/FSR WIWI 18/Downloads/Advertising.csv")

# Avoid attach(): refer to columns explicitly with `$` (or `data =` in
# modeling functions) so it is always clear where a variable comes from.

# Make the three scatterplots, just like in the first lab
par(mfrow = c(1, 3))
plot(adv$TV, adv$sales, col = "red", pch = 16)
plot(adv$radio, adv$sales, col = "darkblue", pch = 3)
plot(adv$newspaper, adv$sales, col = "darkgreen")

# Simple linear regression of sales on TV.
# lm(Y ~ X, data = ...) produces a linear model for Y with predictor X;
# we store the fitted model in the object model.tv.
model.tv <- lm(sales ~ TV, data = adv)
summary(model.tv)  # interpret the summary!

# Now include the regression line in the scatter plot
par(mfrow = c(1, 1))
plot(adv$TV, adv$sales, col = "red")
abline(model.tv, lty = 3)
abline(model.tv, lwd = 5)

# Multiple regression: add more regressors/predictors with `+`
# lm(Y ~ X1 + X2 + X3, data = ...)
model <- lm(sales ~ TV + radio + newspaper, data = adv)
summary(model)  # interpret the model!
###########################################################
# Correlation matrix for all variables of Advertising.csv
cor(adv)

# Scatterplot matrix
pairs(adv)

#########################################################
# Customizable plots: histogram of 100,000 draws from a
# standard normal distribution
hist(rnorm(100000), breaks = 100)

# A bit more fancy:
hist(rnorm(100000), main = "YEAH", breaks = 100, col = rainbow(50))

################################################
# If time allows: KNN
#
# For a demonstration of the KNN approach we use the stock market data
# set (example as in the book): daily returns of the S&P 500 between
# 2001 and 2005. Lag1 to Lag5 are the percentage returns of the 5
# previous trading days; Direction records whether the market went
# Up or Down.
library(class)
library(ISLR)
#names(Smarket)
#plot(Smarket$Volume)

# Avoid attach(): reference Smarket columns explicitly with `$`.
# First, we divide the data into training and test sets:
# the years 2001-2004 serve as training data, 2005 as test data.
# Predictor matrices use only Lag1 and Lag2.
train.X <- cbind(Smarket$Lag1, Smarket$Lag2)[Smarket$Year < 2005, ]
test.X  <- cbind(Smarket$Lag1, Smarket$Lag2)[Smarket$Year >= 2005, ]
dir.training <- Smarket$Direction[Smarket$Year < 2005]
dir.test     <- Smarket$Direction[Smarket$Year >= 2005]

# knn() needs four arguments:
# 1) predictors for the training period: train.X (matrix)
# 2) predictors for the test period:     test.X (matrix)
# 3) training observations of Direction: dir.training (vector)
# 4) the number of neighbors k; we choose 3
set.seed(1)  # knn() breaks ties at random, so set a seed for reproducibility
knn.pred <- knn(train.X, test.X, dir.training, k = 3)

# Confusion table: KNN predictions vs actual values of dir.test
table(knn.pred, dir.test)

# Test-set accuracy. Compute it from the prediction vector itself rather
# than hard-coding cell counts like (48+86)/length(knn.pred), which are
# specific to one particular run of the table above.
mean(knn.pred == dir.test)
# roughly 53 percent of the predicted values are correct

# END