# IML 2016
# Exercise session II
# Model solutions
# Ville H. & Teemu R.

###########################################################
# Exercise 3
# (a)
# Read the MNIST data set.
source('loadmnist.R')  # contains code downloaded from https://gist.github.com/brendano/39760
load_mnist()
show_digit(train$x[5, ])  # draw the 5th digit. Seems to work correctly...

##########################################################
# (b)
# Restrict to the first 5000 training and 1000 test observations.
train$n = 5000  # training set size 5000
train$x = train$x[1:train$n, ]
train$y = train$y[1:train$n]
test$n = 1000   # test set size 1000
test$x = test$x[1:test$n, ]
test$y = test$y[1:test$n]

# install.packages('proxy')
library(proxy)

# Compute the cross-distance matrix: proxy::dist(x, y) returns an
# nrow(x) x nrow(y) matrix, so ddd[, i] holds the distances from test
# point i to every training point.
ddd = dist(train$x, test$x)
ddd[1, 1]  # looks fine

##########################################################
# (c)
err = rep(NA, 50)  # initialize error rate table, one entry per k

for (k in 1:50) {
  # Initialize the set of neighbors and the selected class.
  # BUG FIX: the original code used an undefined variable `n` here;
  # the loop classifies each TEST point, so the correct size is test$n.
  nearest = matrix(0, ncol = k, nrow = test$n)
  knn.class = rep(-1, test$n)
  for (i in 1:test$n) {
    nearest[i, ] = order(ddd[, i])[1:k]  # these are the nearest k training points
    # Next use the given hint to sort the classes by decreasing frequency
    # and pick the first (most frequent): this is our classification.
    knn.class[i] = names(sort(table(train$y[nearest[i, ]]), decreasing = TRUE))[1]
  }
  # Count the number of misclassifications and divide by the test set size.
  err[k] = sum(knn.class != test$y) / test$n
}

# Plot the test error rate as a function of k.
plot(err, type = 'b', pch = 18, col = 'red', xlab = 'k', ylab = 'test error rate')

###########################################################
# Exercise 4
# (a)
# install.packages('ISLR')
library(ISLR)
data("Weekly")

# Correlations between the numeric variables.
round(cor(Weekly[ ,1:8]), 3)
pairs(Weekly, pch = 18, col = 'blue', cex = .7)
# There seem to be no strong relationships between the variables we are interested in.
# Only strong correlation is between Year and Volume: it seems that the volume of
# trade has grown over time.
# (b)
# Logistic regression of Direction on all five lag variables plus Volume.
logistic_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family = binomial, data = Weekly)
summary(logistic_fit)
# The only regression coefficient that is statistically significant at the 0.05 level is Lag2,
# so let's use only it in our next models.
# (Note: not a very foolproof method for model selection...)

# (c)
logistic_prob <- predict(logistic_fit, type = 'response') # predicted probabilities P(Up)
contrasts(Weekly$Direction) # check which class the modeled probability refers to (Up)
logistic_pred <- ifelse(logistic_prob > 0.5, "Up", "Down") # predicted classes
table(logistic_pred, Weekly$Direction) # confusion matrix
mean(logistic_pred == Weekly$Direction) # training ACCURACY (fraction correct), not the error rate
# It seems that logistic regression with all variables as predictors predicts mostly that market goes up:
# it predicts Up for 430 + 557 = 987 days, and Down only for 54 + 48 = 102 days.
# However, in reality market goes up 48 + 557 = 605 days, and down 430 + 54 = 484 days.

# (d)
# Split by year: fit on 1990-2008, evaluate on 2009-2010.
train <- Weekly[Weekly$Year <= 2008, ]
test <- Weekly[Weekly$Year >= 2009, ]

# Fit logistic regression with only Lag2 as a predictor.
logistic_single_fit <- glm(Direction ~ Lag2, family = binomial, data = train)
summary(logistic_single_fit)
logistic_prob <- predict(logistic_single_fit, type = 'response', newdata = test) # predicted probabilities on the test set
logistic_pred <- ifelse(logistic_prob > 0.5, "Up", "Down") # predicted classes
table(logistic_pred, test$Direction) # confusion matrix on the test set
mean(logistic_pred == test$Direction) # TEST accuracy (fraction correct on held-out years)

# (e)
# Linear discriminant analysis with the same single predictor.
# install.packages('MASS')
library('MASS')
lda_fit <- lda(Direction ~ Lag2, data = train)
lda_pred <- predict(lda_fit, newdata = test)
table(lda_pred$class, test$Direction)
mean(lda_pred$class == test$Direction) # test accuracy
# Same predictions as logistic regression, and as you can see, in this case also
# practically the same predicted probabilities:
head(lda_pred$posterior)
head(logistic_prob)

# (f)
# Quadratic discriminant analysis with the same single predictor.
qda_fit <- qda(Direction ~ Lag2, data = train)
qda_pred <- predict(qda_fit, newdata = test)
# QDA confusion matrix and test accuracy.
table(qda_pred$class, test$Direction)
mean(qda_pred$class == test$Direction)
# QDA predicts "Up" for all cases of the test set, so it gives the same results as
# the mean model (just predicting the most common class for all the observations):
mean(test$Direction == "Up")

# (g)
# k-nearest-neighbor classifier; class::knn defaults to k = 1 when k is not given.
# install.packages('class')
library(class)
set.seed(1) # knn breaks distance ties at random, so fix the seed for reproducibility
knn_pred <- knn(train = as.matrix(train$Lag2), test = as.matrix(test$Lag2), cl = train$Direction)
table(knn_pred, test$Direction)
mean(knn_pred != test$Direction) # NOTE: this is the test ERROR rate (earlier parts report accuracy)
# K-nn classifier with K = 1 seems to predict almost as many "Downs" as "Ups".
# The accuracy is the same as with random guessing, and worse than the mean model.

# (h)
# So logistic regression and LDA were the only models that gave higher accuracy than the mean model.
# However, we should be a bit sceptical about these results because of the smallish test set.