keras

tfruns applied to MNIST data

Preamble. Deep learning is hard. It is much harder than MNIST would lead you to believe. MNIST is, for all practical purposes, "solved," and as such it isn't as interesting as it once was. Nonetheless, we know models can be trained to perform very well on this data, and that provides an opportunity for me to test out some of my generic code built around tfruns.

 

This blog post presents my newly created "tunable" convolutional neural network (CNN) that runs within RStudio using their tfruns package. The tfruns package addresses one of the big hassles of training a CNN: how do you systematically search over the numerous tunable parameters? A large tuning grid crossed with significant training time means you either stay glued to the computer to minimize computational downtime or you write your own wrappers to do the grid search. The latter is a good approach; however, the good folks at RStudio have already written something that works extremely well.
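
Before the full programs, here is a minimal sketch of how the two pieces fit together. This is illustrative only; the file name train.R and the two flags are placeholders, not part of the actual programs below. The training script declares its tunable values with flags(), and tuning_run() launches that script once per combination of values, recording the flags and metrics for every run.

# Illustrative sketch only -- "train.R" and these flags are placeholders
library(tfruns)

# train.R would declare its tunable values with flags(), e.g.:
#   FLAGS <- flags(
#     flag_numeric("lrate", 0.01),
#     flag_integer("batchsize", 128)
#   )
# and then build/fit a model using FLAGS$lrate and FLAGS$batchsize

# tuning_run() launches train.R once per combination of flag values
runs <- tuning_run("train.R", flags = list(
  lrate     = c(0.001, 0.01),
  batchsize = c(64, 128)
))

# each run's flags and metrics are collected into a single data frame
ls_runs()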

 

The following is the code for a three-part program. The first part is the control program that defines the grid search. The second is the flexible, tunable CNN architecture (the skeleton program). The final part analyzes the results and shows how a model can be extracted from a run and used for prediction.

 

This video details the programs and walks through the results that were obtained.


Program 01: prog01_tfruns_control.R

library(tfruns)
library(keras)
library(tidyverse)
library(reticulate)


## load the mnist data
# Input image dimensions
num_classes <- 10
img_rows <- 28
img_cols <- 28

# The data, shuffled and split between train and test sets
mnist <- dataset_mnist()
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

# Redefine the dimensions of the train/test inputs (samples, rows, cols, channels)
x_train <- array_reshape(x_train, c(nrow(x_train), img_rows, img_cols, 1))
x_test <- array_reshape(x_test, c(nrow(x_test), img_rows, img_cols, 1))
input_shape <- c(img_rows, img_cols, 1)

# Scale grayscale pixel values (0-255) into the [0,1] range; drop the underscore so the skeleton program uses xtrain/xtest
xtrain <- x_train / 255
xtest <- x_test / 255

cat('x_train_shape:', dim(xtrain), '\n')
cat(nrow(xtrain), 'train samples\n')
cat(nrow(xtest), 'test samples\n')

# Convert class vectors to binary class matrices
y_train <- to_categorical(y_train, num_classes)
y_test <- to_categorical(y_test, num_classes)


# initial tuning run
runs <- tuning_run("prog02_keras_skeleton.R", sample=.005, flags=list(
        batchsize=c(128,256),
        nepochs=20,
        lrate=c(0.001),
        augment=c(T, F),
        flip=c(T,F),
        valsplit=.1,
        nconvolutions = 2,
        drop1 = .5,                                                       
        activationc= c("leaky","relu"),
        meanpool = c(T,F),
        
        pad1="same",
        pool1=c(2),
        pool2=c(2),
        kernel1=c(3),
        kernel2=c(3),
        filter1 = c(32, 64, 96),
        filter2 = c(32, 64, 96),
        filter3 = c(32, 64, 96),
        
        ndense1 = c(1,2),
        ndense2 = c(1,2,4),
        activationd= c("relu","leaky"),
        hidden1=c(100,500), 
        hidden2=c(100,500),
        drop2= .5
        ) 
)


View(runs)
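
Note that sample = .005 runs only a random half-percent of the full flag grid, which keeps the initial sweep manageable. Program 03 below loads a data frame called finishedruns from finishedruns.RData; that saving step is not shown in the original listings, but a plausible bridge (an assumption on my part) is to collect the completed runs with ls_runs() and save them once the tuning finishes:

# Assumed bridging step, not part of the original listings: gather the
# completed runs into a data frame and save it for the analysis program
finishedruns <- ls_runs()
save(finishedruns, file = "finishedruns.RData")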

Program 02: prog02_keras_skeleton.R

FLAGS <- flags(
  flag_integer("batchsize", 128, "Batch size"),
  flag_numeric("nepochs", 10, "Number of epochs"),
  flag_numeric("lrate",0.01,"Learning rate"),
  flag_boolean("augment",T, "Use data augmentation"),
  flag_boolean("flip",T,"Horizontal flip of images"),
  flag_numeric("valsplit",0.2, "Validation sample split (non-augmented only)"),
  
  
  ## convolution tuning parameters
  flag_numeric("nconvolutions",1,"Number of extra convolution cycles"),
  flag_numeric("drop1",0.5, "Dropout for the pooling"),
  flag_string("activationc","relu","Activation for the convolutions"),
  flag_boolean("meanpool",T, "Use mean or max pooling"),
  flag_string("pad1","same", "Padding"),
  flag_integer("pool1",2, "Size of first pooling"),
  flag_integer("pool2",2, "Size of subsequent pooling"),
  flag_integer("kernel1",3, "Kernel size for first set of convs / pooling"),
  flag_integer("kernel2",3, "Kernel size for extra set of convolutions"),

  flag_integer("filter1",16, "Number of filters for first conv within set"),
  flag_integer("filter2",32, "Number of filters for second conv within set"),
  flag_integer("filter3",16, "Number of filters for third conv within set"),
  
  ## dense layer tuning
  flag_numeric("ndense1",1,"Number of dense layer 1s"),
  flag_numeric("ndense2",1, "Number of dense Layer 2s"),
  
  flag_string("activationd","relu","Activation for the dense layers"),
  flag_numeric("hidden1",100, "Size of first dense layer"),
  flag_numeric("hidden2",100, "Size of subsequent dense layers"),
  flag_numeric("drop2",0.5, "Dropout for the dense layers")
)


ydim <- ncol(y_train)
numsamples <- nrow(xtrain)
numtest <- nrow(xtest)

batch_size <- FLAGS$batchsize
n_epoch <- FLAGS$nepochs
meanpooling <- FLAGS$meanpool
activation <- FLAGS$activationc
denseactivation <- FLAGS$activationd

model <- keras_model_sequential() 


  ############################# CONVOLUTION SET 1 ##########################
  # three sets of filters, then a pooling layer
  # note: using several separate model %>% statements to make the whole model more programmatic
model %>%  layer_conv_2d(filter = FLAGS$filter1, kernel_size = c(FLAGS$kernel1,FLAGS$kernel1), padding = FLAGS$pad1, input_shape = c(img_rows, img_cols, 1) )
model %>% layer_batch_normalization()   
# add switch for the activation function
if (activation == "leaky"){
model %>% layer_activation_leaky_relu() 
} else {
model %>% layer_activation("relu")  
}

model %>% layer_conv_2d(filter = FLAGS$filter2, kernel_size = c(FLAGS$kernel1,FLAGS$kernel1), padding = FLAGS$pad1)
model %>%  layer_batch_normalization()   
if (activation == "leaky"){
  model %>% layer_activation_leaky_relu() 
} else {
  model %>% layer_activation("relu")  
}

model %>% layer_conv_2d(filter = FLAGS$filter3, kernel_size = c(FLAGS$kernel1,FLAGS$kernel1), padding = FLAGS$pad1) 
model %>%  layer_batch_normalization() 
# add switch for the activation function
if (activation == "leaky"){
  model %>% layer_activation_leaky_relu() 
} else {
  model %>% layer_activation("relu")  
}

if ( meanpooling) {
# Use mean pooling
  model %>% layer_average_pooling_2d(pool_size = c(FLAGS$pool1, FLAGS$pool1)) 
} else {
  model %>% layer_max_pooling_2d(pool_size = c(FLAGS$pool1, FLAGS$pool1)) 
}
model %>%  layer_dropout(FLAGS$drop1) 
  
  
  ############################## extra convolutions
for (i in seq_len(FLAGS$nconvolutions)){   # seq_len() gives an empty loop if nconvolutions is 0
  
model %>%  layer_conv_2d(filter = FLAGS$filter1, kernel_size = c(FLAGS$kernel2,FLAGS$kernel2), padding = FLAGS$pad1) 
model %>%  layer_batch_normalization()   
# add switch for the activation function
if (activation == "leaky"){
  model %>% layer_activation_leaky_relu() 
} else {
  model %>% layer_activation("relu")  
}

  
model %>%  layer_conv_2d(filter = FLAGS$filter2, kernel_size = c(FLAGS$kernel2,FLAGS$kernel2), padding = FLAGS$pad1) 
model %>%  layer_batch_normalization() 
# add switch for the activation function
if (activation == "leaky"){
  model %>% layer_activation_leaky_relu() 
} else {
  model %>% layer_activation("relu")  
}

  
  
  
model %>%  layer_conv_2d(filter = FLAGS$filter3, kernel_size = c(FLAGS$kernel2,FLAGS$kernel2), padding = FLAGS$pad1) 
model %>%  layer_batch_normalization() 
# add switch for the activation function
if (activation == "leaky"){
  model %>% layer_activation_leaky_relu() 
} else {
  model %>% layer_activation("relu")  
}
 
 
  if ( meanpooling) {
    # Use mean pooling
model %>% layer_average_pooling_2d(pool_size = c(FLAGS$pool2, FLAGS$pool2), padding=FLAGS$pad1) 
  } else {
model %>% layer_max_pooling_2d(pool_size = c(FLAGS$pool2, FLAGS$pool2), padding=FLAGS$pad1) 
  }
model %>%  layer_dropout(FLAGS$drop1)
  
}  
  ################# end of extra convolutions
    
    
  # Flatten the convolutional output into a feature vector
  # and feed it into the dense layers
model %>%  layer_flatten()
  

## now loop over the number of dense layer 1s

for (i in seq_len(FLAGS$ndense1)){
model %>%    layer_dense(FLAGS$hidden1) 
  if (denseactivation == "leaky"){
    model %>% layer_activation_leaky_relu() 
  } else {
    model %>% layer_activation("relu")  
  }
model %>%    layer_dropout(FLAGS$drop2)
}  
 
for (i in seq_len(FLAGS$ndense2)){
model %>%    layer_dense(FLAGS$hidden2) 
    if (denseactivation == "leaky"){
      model %>% layer_activation_leaky_relu() 
    } else {
      model %>% layer_activation("relu")  
    }
model %>%   layer_dropout(FLAGS$drop2) 
}  
  

model %>%   layer_dense(ydim) 
model %>%    layer_activation("softmax")

opt <- optimizer_adam(lr=FLAGS$lrate)

model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = opt,
  metrics = "accuracy"
)

summary(model)

# Training ----------------------------------------------------------------

callbacks_list <- list(
 callback_model_checkpoint(filepath = "bestepoch.h5", save_best_only = TRUE, save_weights_only = FALSE) 
  
)


data_augmentation <- FLAGS$augment
#data_augmentation <- F
if(! data_augmentation){
  
  model %>% fit(
    xtrain, y_train,
    batch_size = FLAGS$batchsize,
    epochs = FLAGS$nepochs,
    validation_split = FLAGS$valsplit,
    shuffle = TRUE,
    callbacks = callbacks_list
  )
  
} else {
  
  # training-data generator; the flip flag from the tuning grid controls horizontal flipping
  datagen <- image_data_generator(
    featurewise_center = TRUE,
    featurewise_std_normalization = TRUE,
    rotation_range = 20,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    horizontal_flip = FLAGS$flip
  )
  # validation generator uses the same augmentation settings
  valgen <- image_data_generator(
    featurewise_center = TRUE,
    featurewise_std_normalization = TRUE,
    rotation_range = 20,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    horizontal_flip = FLAGS$flip
  )
  
  
  datagen %>% fit_image_data_generator(xtrain)
  valgen %>% fit_image_data_generator(xtest)
  
  model %>% fit_generator(
    flow_images_from_data(xtrain, y_train, datagen, batch_size = FLAGS$batchsize),
    steps_per_epoch = as.integer(floor(numsamples/FLAGS$batchsize)), 
    epochs = FLAGS$nepochs,
    callbacks = callbacks_list,
    validation_data = flow_images_from_data(xtest,y_test, valgen, batch_size=FLAGS$batchsize),
    validation_steps = as.integer(floor(numtest/FLAGS$batchsize))
  )
  
}


# ---------

save_model_hdf5(model, "model.h5")
scores <- model %>% evaluate(xtest, y_test, verbose=0)

cat('Test loss:', scores[[1]], '\n')
cat('Test accuracy:', scores[[2]], '\n')
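
One repeated pattern in the skeleton above is the leaky/ReLU switch after each batch-normalization layer. A small helper like the following (my own sketch, not part of the original program) would cut that repetition; keras layer functions modify a sequential model in place, so the helper can simply be called wherever an if/else block currently sits.

# Hypothetical helper, not used in the listings above: append the chosen
# activation layer to an existing sequential model
add_activation <- function(model, act) {
  if (act == "leaky") {
    model %>% layer_activation_leaky_relu()
  } else {
    model %>% layer_activation("relu")
  }
}

# usage inside the skeleton, replacing each if/else block:
#   add_activation(model, activation)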

Program 03: program03_analyzeruns.R

## analyze Runs
library(tidyverse)
library(tfruns)
library(keras)
load("finishedruns.RData")

## plot validation accuracy against a single flag
myplot_cat <- function(varname){
  p <- ggplot(finishedruns, aes_string(y = "metric_val_acc", x = varname)) +
    geom_jitter(height = 0, width = .2) +
    theme_bw()
  return(p)
}
flagnames <- names(finishedruns)
flagtrue <- stringr::str_detect(flagnames, "flag")
subflagnames <- flagnames[flagtrue]

for (i in subflagnames){
print(myplot_cat(i))
}


## now look at the poor performance
poormodels <- dplyr::filter(finishedruns, metric_val_acc < .9)

View(poormodels)


## load the best model
bestrun <- finishedruns %>% arrange(-metric_val_acc)
run_info(bestrun$run_dir[1])       # print the details of the top run
bestrun <- run_info(bestrun$run_dir[1])

names(bestrun)


bestrun$run_dir
modelpath<- paste0(bestrun$run_dir, "/model.h5")
mod1 <- load_model_hdf5(modelpath)

############# test model on original images
# re-create the data and dimensions here (this program runs standalone,
# so nothing is carried over from the control program)
img_rows <- 28
img_cols <- 28
mnist <- dataset_mnist()

x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

# Redefine the dimensions of the train/test inputs
x_train <- array_reshape(x_train, c(nrow(x_train), img_rows, img_cols, 1))
x_test <- array_reshape(x_test, c(nrow(x_test), img_rows, img_cols, 1))
input_shape <- c(img_rows, img_cols, 1)

# Scale pixel values into the [0,1] range; use xtrain/xtest to match the skeleton's naming
xtrain <- x_train / 255
xtest <- x_test / 255

pred_test <- predict_classes(mod1, xtest)
head(pred_test)

cm <- table(pred_test, y_test)   # rows: predicted digit, columns: true digit

caret::confusionMatrix(cm)
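
A small caveat on the last step: predict_classes() belongs to the older keras R interface and is not available in recent releases. If it is missing, an equivalent substitute (mine, not the original code) is to predict class probabilities and take the arg-max:

# Alternative to predict_classes() for newer keras releases: predict class
# probabilities and take the most probable column (minus 1, since MNIST
# labels run from 0 to 9 while R columns are indexed from 1)
probs <- mod1 %>% predict(xtest)
pred_test <- apply(probs, 1, which.max) - 1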