Preamble. Deep learning is hard. It is much harder than MNIST would lead you to believe. MNIST for all practical purposes is "solved" and as such, it isn't as interesting as it once was. Nonetheless, we know models can be trained to perform very well against the data. This provides an opportunity for me to test out some of my generic coding based on tfruns.
This blog post presents my newly created "tunable" convolutional neural network (CNN) that runs within RStudio using their tfruns package. The tfruns package addresses one of the large hassles of training a CNN--how does one systematically search over the numerous tunable parameters? A large tuning array crossed with significant training time means either you stay glued to the computer to minimize computational downtime or you write your own wrappers to do the grid search. The latter is a good approach. However, the good folks at RStudio have written something that works extremely well.
The following is the code for a three part program. First, is the control program that defines the grid search. The second program is the flexible CNN architecture that is tunable. The final program analyzes the results and shows how a model can be extracted from the results and utilized for prediction.
This video details the programs and walks through the results that were obtained.
Program 01: prog01_tfruns_control.R
library(tfruns) library(keras) library(tidyverse) library(keras) library(reticulate) ## load the mnist data # Input image dimensions num_classes <- 10 img_rows <- 28 img_cols <- 28 # The data, shuffled and split between train and test sets mnist <- dataset_mnist() x_train <- mnist$train$x y_train <- mnist$train$y x_test <- mnist$test$x y_test <- mnist$test$y # Redefine dimension of train/test inputs x_train <- array_reshape(x_train, c(nrow(x_train), img_rows, img_cols, 1)) x_test <- array_reshape(x_test, c(nrow(x_test), img_rows, img_cols, 1)) input_shape <- c(img_rows, img_cols, 1) # Transform RGB values into [0,1] range and drop underscore for skeleton program xtrain <- x_train / 255 xtest <- x_test / 255 cat('x_train_shape:', dim(xtrain), '\n') cat(nrow(xtrain), 'train samples\n') cat(nrow(xtest), 'test samples\n') # Convert class vectors to binary class matrices y_train <- to_categorical(y_train, num_classes) y_test <- to_categorical(y_test, num_classes) # initial tuning run runs <- tuning_run("prog02_keras_skeleton.R", sample=.005, flags=list( batchsize=c(128,256), nepochs=20, lrate=c(0.001), augment=c(T, F), flip=c(T,F), valsplit=.1, nconvolutions = 2, drop1 = .5, activationc= c("leaky","relu"), meanpool = c(T,F), pad1="same", pool1=c(2), pool2=c(2), kernel1=c(3), kernel2=c(3), filter1 = c(32, 64, 96), filter2 = c(32, 64, 96), filter3 = c(32, 64, 96), ndense1 = c(1,2), ndense2 = c(1,2,4), activationd= c("relu","leaky"), hidden1=c(100,500), hidden2=c(100,500), drop2= .5 ) ) View(runs)
Program 02: prog02_keras_skeleton.R
FLAGS <- flags( flag_integer("batchsize", 128, "Batch size"), flag_numeric("nepochs", 10, "Number of epochs"), flag_numeric("lrate",0.01,"Learning rate"), flag_boolean("augment",T, "Use data augmentation"), flag_boolean("flip",T,"Horizontal flip of images"), flag_numeric("valsplit",0.2, "Validation sample split (non-augmented only)"), ## convolution turning parameters flag_numeric("nconvolutions",1,"Number of extra convolution cycles"), flag_numeric("drop1",0.5, "Dropout for the pooling"), flag_string("activationc","relu","Activation for the convolutions"), flag_boolean("meanpool",T, "Use mean or max pooling"), flag_string("pad1","same", "Padding"), flag_integer("pool1",2, "Size of first pooling"), flag_integer("pool2",2, "Size of subsequent pooling"), flag_integer("kernel1",3, "Kernel size for first set of convs / pooling"), flag_integer("kernel2",3, "Kernel size for extra set of convolutions"), flag_integer("filter1",16, "Number of filters for first conv within set"), flag_integer("filter2",32, "Number of filters for second conv within set"), flag_integer("filter3",16, "Number of filters for third conv within set"), ## dense layer tuning flag_numeric("ndense1",1,"Number of dense layer 1s"), flag_numeric("ndense2",1, "Number of dense Layer 2s"), flag_string("activationd","relu","Activation for the dense layers"), flag_numeric("hidden1",100, "Size of first dense layer"), flag_numeric("hidden2",100, "Size of subsequent dense layers"), flag_numeric("drop2",0.5, "Dropout for the dense layers") ) ydim <- ncol(y_train) numsamples <- nrow(xtrain) numtest <- nrow(xtest) batch_size <- FLAGS$batchsize n_epoch <- FLAGS$nepochs meanpooling <- FLAGS$meanpool activation <- FLAGS$activationc denseactivation <- FLAGS$activationd model <- keras_model_sequential() ############################# CONVOLUTION SET 1 ########################## # three set of filters, then a pooling # note: using several model %>% commands to make the whole model more programatic model %>% layer_conv_2d(filter = FLAGS$filter1, kernel_size = c(FLAGS$kernel1,FLAGS$kernel1), padding = FLAGS$pad1, input_shape = c(img_rows, img_cols, 1) ) model %>% layer_batch_normalization() # add switch for the activation function if (activation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } model %>% layer_conv_2d(filter = FLAGS$filter2, kernel_size = c(FLAGS$kernel1,FLAGS$kernel1), padding = FLAGS$pad1) model %>% layer_batch_normalization() if (activation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } model %>% layer_conv_2d(filter = FLAGS$filter3, kernel_size = c(FLAGS$kernel1,FLAGS$kernel1), padding = FLAGS$pad1) model %>% layer_batch_normalization() # add switch for the activation function if (activation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } if ( meanpooling) { # Use mean pooling model %>% layer_average_pooling_2d(pool_size = c(FLAGS$pool1, FLAGS$pool1)) } else { model %>% layer_max_pooling_2d(pool_size = c(FLAGS$pool1, FLAGS$pool1)) } model %>% layer_dropout(FLAGS$drop1) ############################## extra convolutions for (i in seq(1, FLAGS$nconvolutions)){ model %>% layer_conv_2d(filter = FLAGS$filter1, kernel_size = c(FLAGS$kernel2,FLAGS$kernel2), padding = FLAGS$pad1) model %>% layer_batch_normalization() # add switch for the activation function if (activation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } model %>% layer_conv_2d(filter = FLAGS$filter2, kernel_size = c(FLAGS$kernel2,FLAGS$kernel2), padding = FLAGS$pad1) model %>% layer_batch_normalization() # add switch for the activation function if (activation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } model %>% layer_conv_2d(filter = FLAGS$filter3, kernel_size = c(FLAGS$kernel2,FLAGS$kernel2), padding = FLAGS$pad1) model %>% layer_batch_normalization() # add switch for the activation function if (activation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } if ( meanpooling) { # Use mean pooling model %>% layer_average_pooling_2d(pool_size = c(FLAGS$pool2, FLAGS$pool2), padding=FLAGS$pad1) } else { model %>% layer_max_pooling_2d(pool_size = c(FLAGS$pool2, FLAGS$pool2), padding=FLAGS$pad1) } model %>% layer_dropout(FLAGS$drop1) } ################# end of extra convolutions # Flatten max filtered output into feature vector # and feed into dense layer model %>% layer_flatten() ## now loop over the number of dense layer 1s for (i in seq(1,FLAGS$ndense1)){ model %>% layer_dense(FLAGS$hidden1) if (denseactivation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } model %>% layer_dropout(FLAGS$drop2) } for (i in seq(1,FLAGS$ndense2)){ model %>% layer_dense(FLAGS$hidden2) if (denseactivation == "leaky"){ model %>% layer_activation_leaky_relu() } else { model %>% layer_activation("relu") } model %>% layer_dropout(FLAGS$drop2) } model %>% layer_dense(ydim) model %>% layer_activation("softmax") opt <- optimizer_adam(lr=FLAGS$lrate) model %>% compile( loss = "categorical_crossentropy", optimizer = opt, metrics = "accuracy" ) summary(model) # Training ---------------------------------------------------------------- callbacks_list <- list( callback_model_checkpoint(filepath = "bestepoch.h5", save_best_only = TRUE, save_weights_only = FALSE) ) data_augmentation <- FLAGS$augment #data_augmentation <- F if(! data_augmentation){ model %>% fit( xtrain, y_train, batch_size = FLAGS$batchsize, epochs = FLAGS$nepochs, validation_split = FLAGS$valsplit, shuffle = TRUE, callbacks = callbacks_list ) } else { datagen <- image_data_generator( featurewise_center = TRUE, featurewise_std_normalization = TRUE, rotation_range = 20, width_shift_range = 0.2, height_shift_range = 0.2, horizontal_flip = TRUE ) valgen <- image_data_generator( featurewise_center = TRUE, featurewise_std_normalization = TRUE, rotation_range = 20, width_shift_range = 0.2, height_shift_range = 0.2, horizontal_flip = TRUE ) datagen %>% fit_image_data_generator(xtrain) valgen %>% fit_image_data_generator(xtest) model %>% fit_generator( flow_images_from_data(xtrain, y_train, datagen, batch_size = FLAGS$batchsize), steps_per_epoch = as.integer(floor(numsamples/FLAGS$batchsize)), epochs = FLAGS$nepochs, callbacks = callbacks_list, validation_data = flow_images_from_data(xtest,y_test, valgen, batch_size=FLAGS$batchsize), validation_steps = as.integer(floor(numtest/FLAGS$batchsize)) ) } # --------- save_model_hdf5(model, "model.h5") scores <- model %>% evaluate(xtest, y_test, verbose=0) cat('Test loss', scores[[1]], '\n') cat('Test accuracy:', scores[[2]], '\n')
Program 03: program03_analyzeruns.R
## analyze Runs library(tidyverse) library(tfruns) library(keras) load("finishedruns.RData") myplot_cat <- function(varname){ p<-ggplot(finishedruns, aes_string(y="metric_val_acc", x=varname)) + geom_jitter(height=0, width=.2) + theme_bw() return(p) } flagnames <- names(finishedruns) flagtrue <- stringr::str_detect(flagnames, "flag") subflagnames <- flagnames[flagtrue] for (i in subflagnames){ print(myplot_cat(i)) } ## now look at the poor performance poormodels <- dplyr::filter(finishedruns, metric_val_acc < .9) View(poormodels) ## load best model bestrun <- finishedruns %>% arrange(-metric_val_acc) run_info(bestrun$run_dir[1]) bestrun<-run_info(bestrun$run_dir[1]) names(bestrun) bestrun$run_dir modelpath<- paste0(bestrun$run_dir, "/model.h5") mod1 <- load_model_hdf5(modelpath) ############# test model on original images x_train <- mnist$train$x y_train <- mnist$train$y x_test <- mnist$test$x y_test <- mnist$test$y # Redefine dimension of train/test inputs x_train <- array_reshape(x_train, c(nrow(x_train), img_rows, img_cols, 1)) x_test <- array_reshape(x_test, c(nrow(x_test), img_rows, img_cols, 1)) input_shape <- c(img_rows, img_cols, 1) # Transform RGB values into [0,1] range and drop underscore for skeleton program xtrain <- x_train / 255 xtest <- x_test / 255 pred_test <- predict_classes(mod1, xtest) head(pred_test) cm<-table(pred_test, y_test) caret::confusionMatrix(cm)