Character-level Language Model using RNN

This tutorial will demonstrate creating a language model using a character level RNN model using MXNet-R package. You will need the following R packages to run this tutorial -

  • readr
  • stringr
  • stringi
  • mxnet

We will use the tinyshakespeare dataset to build this model.

library("readr")
library("stringr")
library("stringi")
library("mxnet")

Preprocess and prepare the data

Download the data:

download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    if (!file.exists(paste0(data_dir,'input.txt'))) {
        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile=paste0(data_dir,'input.txt'), method='wget')
    }
}

Next we transform the test into feature vectors that is fed into the RNN model. The make_data function reads the dataset, cleans it of any non-alphanumeric characters, splits it into individual characters and groups it into sequences of length seq.len.

make_data <- function(path, seq.len = 32, dic=NULL) {
  
  text_vec <- read_file(file = path)
  text_vec <- stri_enc_toascii(str = text_vec)
  text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
  text_vec <- strsplit(text_vec, '') %>% unlist
  
  if (is.null(dic)) {
    char_keep <- sort(unique(text_vec))
  } else char_keep <- names(dic)[!dic == 0]
  
  # Remove terms not part of dictionary
  text_vec <- text_vec[text_vec %in% char_keep]
  
  # Build dictionary
  dic <- 1:length(char_keep)
  names(dic) <- char_keep
  
  # reverse dictionary
  rev_dic <- names(dic)
  names(rev_dic) <- dic
  
  # Adjust by -1 to have a 1-lag for labels
  num.seq <- (length(text_vec) - 1) %/% seq.len
  
  features <- dic[text_vec[1:(seq.len * num.seq)]] 
  labels <- dic[text_vec[1:(seq.len*num.seq) + 1]]
  
  features_array <- array(features, dim = c(seq.len, num.seq))
  labels_array <- array(labels, dim = c(seq.len, num.seq))
  
  return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
}


seq.len <- 100
data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic=NULL)

Fetch the features and labels for training the model, and split the data into training and evaluation in 9:1 ratio.

X <- data_prep$features_array
Y <- data_prep$labels_array
dic <- data_prep$dic
rev_dic <- data_prep$rev_dic
vocab <- length(dic)

samples <- tail(dim(X), 1)
train.val.fraction <- 0.9

X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]

X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]

train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))

train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)

Create iterators for training and evaluation datasets.

vocab <- length(eval_buckets$dic)

batch.size <- 32

train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size, 
                                data.mask.element = 0, shuffle = TRUE)

eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
                               data.mask.element = 0, shuffle = FALSE)

Train the Model

This model is a multi-layer RNN for sampling from character-level language models. It has a one-to-one model configuration since for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding the same sequence of characters but offset by a position of +1. The parameters output_last_state is set to TRUE in order to access the state of the RNN cells when performing inference.

rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3, 
                               num_hidden = 96,
                               input_size = vocab,
                               num_embed = 64, 
                               num_decode = vocab,
                               dropout = 0.2, 
                               ignore_label = 0,
                               cell_type = "lstm",
                               masking = F,
                               output_last_state = T,
                               loss_output = "softmax",
                               config = "one-to-one")

graph.viz(rnn_graph_one_one, type = "graph", direction = "LR", 
          graph.height.px = 180, shape=c(100, 64))

devices <- mx.cpu()

initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)

optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
                           clip_gradient = 5, rescale.grad = 1/batch.size)

logger <- mx.metric.logger()
epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
batch.end.callback <- mx.callback.log.train.metric(period = 50)

mx.metric.custom_nd <- function(name, feval) {
  init <- function() {
    c(0, 0)
  }
  update <- function(label, pred, state) {
    m <- feval(label, pred)
    state <- c(state[[1]] + 1, state[[2]] + m)
    return(state)
  }
  get <- function(state) {
    list(name=name, value = (state[[2]] / state[[1]]))
  }
  ret <- (list(init = init, update = update, get = get))
  class(ret) <- "mx.metric"
  return(ret)
}

mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
  label <- mx.nd.reshape(label, shape = -1)
  label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
  batch <- length(label_probs)
  NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
  Perplexity <- exp(NLL)
  return(Perplexity)
})

model <- mx.model.buckets(symbol = rnn_graph_one_one,
                          train.data = train.data, eval.data = eval.data, 
                          num.round = 20, ctx = devices, verbose = TRUE,
                          metric = mx.metric.Perplexity, 
                          initializer = initializer, optimizer = optimizer, 
                          batch.end.callback = NULL, 
                          epoch.end.callback = epoch.end.callback)

mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
Start training with 1 devices
[1] Train-Perplexity=13.7040474322178
[1] Validation-Perplexity=7.94617194460922
[2] Train-Perplexity=6.57039815554525
[2] Validation-Perplexity=6.60806110658011
[3] Train-Perplexity=5.65360504501481
[3] Validation-Perplexity=6.18932770630876
[4] Train-Perplexity=5.32547285727298
[4] Validation-Perplexity=6.02198756798859
[5] Train-Perplexity=5.14373631472579
[5] Validation-Perplexity=5.8095658243407
[6] Train-Perplexity=5.03077673487379
[6] Validation-Perplexity=5.72582993567431
[7] Train-Perplexity=4.94453383291536
[7] Validation-Perplexity=5.6445258528126
[8] Train-Perplexity=4.88635290100261
[8] Validation-Perplexity=5.6730024536433
[9] Train-Perplexity=4.84205646230548
[9] Validation-Perplexity=5.50960780230982
[10] Train-Perplexity=4.80441673535513
[10] Validation-Perplexity=5.57002263750006
[11] Train-Perplexity=4.77763413242626
[11] Validation-Perplexity=5.55152143269169
[12] Train-Perplexity=4.74937775290777
[12] Validation-Perplexity=5.44968305351486
[13] Train-Perplexity=4.72824849541467
[13] Validation-Perplexity=5.50889348298234
[14] Train-Perplexity=4.70980846981694
[14] Validation-Perplexity=5.51473225859859
[15] Train-Perplexity=4.69685776886122
[15] Validation-Perplexity=5.45391985233811
[16] Train-Perplexity=4.67837107034824
[16] Validation-Perplexity=5.46636764997829
[17] Train-Perplexity=4.66866961934873
[17] Validation-Perplexity=5.44267086113492
[18] Train-Perplexity=4.65611469144194
[18] Validation-Perplexity=5.4290169469462
[19] Train-Perplexity=4.64614689879405
[19] Validation-Perplexity=5.44221549833917
[20] Train-Perplexity=4.63764001963654
[20] Validation-Perplexity=5.42114250842862

Inference on the Model

We now use the saved model to do inference and sample text character by character that will look like the original training data.

set.seed(0)
model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)

internals <- model$symbol$get.internals()
sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)

infer_raw <- c("Thou ")
infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
infer_length <- length(infer_split)

infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),  
                              batch.size = 1, shuffle = FALSE)

infer <- mx.infer.rnn.one(infer.data = infer.data, 
                          symbol = symbol,
                          arg.params = model$arg.params,
                          aux.params = model$aux.params,
                          input.params = NULL, 
                          ctx = devices)

pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
    infer$loss_output, axis = 0, begin = infer_length-1, end = infer_length)))
pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
predict <- c(predict, pred)

for (i in 1:200) {
  
  infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),  
                                batch.size = 1, shuffle = FALSE)
  
  infer <- mx.infer.rnn.one(infer.data = infer.data, 
                            symbol = symbol,
                            arg.params = model$arg.params,
                            aux.params = model$aux.params,
                            input.params = list(rnn.state = infer[[2]], 
                                                rnn.state.cell = infer[[3]]), 
                            ctx = devices)
  
  pred_prob <- as.numeric(as.array(infer$loss_output))
  pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
  predict <- c(predict, pred)
}

predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
print(predict_txt_tot)
[1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "