Customized loss function

This tutorial provides guidelines for using a customized loss function when constructing a network.

Model Training Example

Let's begin with a small regression example. We can build and train a regression model with the following code:

# Load the Boston housing data and convert every factor column to numeric,
# one column at a time, so the whole data frame can be standardised by scale().
data(BostonHousing, package = "mlbench")
factor.cols <- vapply(BostonHousing, is.factor, logical(1))
BostonHousing[factor.cols] <- lapply(
  BostonHousing[factor.cols],
  function(col) as.numeric(as.character(col))
)
BostonHousing <- data.frame(scale(BostonHousing))

# Hold out every fifth row for testing; column 14 is used as the target.
test.ind <- seq(1, nrow(BostonHousing), by = 5)
train.x <- data.matrix(BostonHousing[-test.ind, -14])
train.y <- BostonHousing[-test.ind, 14]
# Positive indices select the held-out rows (the negated form above drops them).
test.x <- data.matrix(BostonHousing[test.ind, -14])
test.y <- BostonHousing[test.ind, 14]

# library() stops with a clear error when mxnet is not installed, whereas
# require() only warns and returns FALSE, deferring the failure to a later call.
library(mxnet)

# Network: input -> 14-unit fully connected layer -> tanh -> 1-unit fully
# connected layer -> linear regression output.
data <- mx.symbol.Variable("data")
# `label` is declared here but lro below is built from fc2 only.
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro")

# Fix the seed so the reported numbers are reproducible, then train on the CPU.
mx.set.seed(0)
model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
                                     ctx = mx.cpu(),
                                     num.round = 5,
                                     array.batch.size = 60,
                                     optimizer = "rmsprop",
                                     verbose = TRUE,
                                     array.layout = "rowmajor",
                                     batch.end.callback = NULL,
                                     epoch.end.callback = NULL)
## Start training with 1 devices

# Mean squared error on the held-out rows; the first row of pred is compared
# against test.y.
pred <- predict(model, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
sum((test.y - pred[1, ])^2) / length(test.y)
## [1] 0.2485236

Besides LinearRegressionOutput, MXNet also provides LogisticRegressionOutput and MAERegressionOutput. However, these built-in outputs might not be enough for real-world models. You can provide your own loss function by using mx.symbol.MakeLoss when constructing the network.

How to Use Your Own Loss Function

We reuse the previous example, but this time we use mx.symbol.MakeLoss to minimize the squared error $(\text{pred} - \text{label})^2$ directly.

# Same layers as the regression network above, but the output is a hand-written
# squared-error expression wrapped in mx.symbol.MakeLoss.
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")

# Reshape fc2 before subtracting the label, then square the difference.
fc2.reshaped <- mx.symbol.Reshape(fc2, shape = 0)
squared.error <- mx.symbol.square(fc2.reshaped - label)
lro2 <- mx.symbol.MakeLoss(squared.error, name = "lro2")

Then we can train the network just as usual.

# Train the MakeLoss network with the same seed and hyper-parameters as the
# LinearRegressionOutput model so the two runs are comparable.
mx.set.seed(0)
model2 <- mx.model.FeedForward.create(
  lro2,
  X = train.x,
  y = train.y,
  array.layout = "rowmajor",
  array.batch.size = 60,
  num.round = 5,
  optimizer = "rmsprop",
  ctx = mx.cpu(),
  verbose = TRUE,
  epoch.end.callback = NULL,
  batch.end.callback = NULL
)
## Start training with 1 devices

We would expect very similar results, because we are minimizing the same loss function. However, the error computed from the new model's output is quite different.

# Score the raw output of the MakeLoss network against the held-out targets.
pred2 <- predict(model2, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
squared.diff2 <- (test.y - pred2)^2
sum(squared.diff2) / length(test.y)
## [1] 1.234584

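This is because the output of mx.symbol.MakeLoss is the gradient of the loss with respect to the input data, not the prediction of the network. We can get the real prediction by extracting the fc2 output from the trained network and predicting with it, as shown below.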

# Look up the "fc2_output" symbol among the internals of the trained network.
net.internals <- internals(model2$symbol)
fc2.position <- match("fc2_output", outputs(net.internals))
fc_symbol <- net.internals[[fc2.position]]

# Build a model that ends at fc2 but reuses the parameters learned by model2.
model3 <- structure(
  list(symbol = fc_symbol,
       arg.params = model2$arg.params,
       aux.params = model2$aux.params),
  class = "MXFeedForwardModel"
)

# Mean squared error of the fc2 output on the held-out rows.
pred3 <- predict(model3, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
squared.diff3 <- (test.y - pred3[1, ])^2
sum(squared.diff3) / length(test.y)
## [1] 0.248294

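MXNet provides many operations on symbols, so other loss functions can be built in the same way. An example using the absolute error $|\text{pred} - \text{label}|$ is shown below, followed by the same model trained with the built-in MAERegressionOutput for comparison.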

# Absolute-error loss built from symbol operations and wrapped in MakeLoss.
abs.error <- mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label)
lro_abs <- mx.symbol.MakeLoss(abs.error)

# Train with plain SGD for 20 rounds from a fixed seed.
mx.set.seed(0)
model4 <- mx.model.FeedForward.create(
  lro_abs,
  X = train.x,
  y = train.y,
  array.layout = "rowmajor",
  array.batch.size = 60,
  num.round = 20,
  optimizer = "sgd",
  learning.rate = 0.001,
  ctx = mx.cpu(),
  verbose = TRUE,
  epoch.end.callback = NULL,
  batch.end.callback = NULL
)
## Start training with 1 devices

# Look up the "fc2_output" symbol among the internals of the trained network.
net.internals <- internals(model4$symbol)
fc2.position <- match("fc2_output", outputs(net.internals))
fc_symbol <- net.internals[[fc2.position]]

# Build a model that ends at fc2 but reuses the parameters learned by model4.
model5 <- structure(
  list(symbol = fc_symbol,
       arg.params = model4$arg.params,
       aux.params = model4$aux.params),
  class = "MXFeedForwardModel"
)

# Mean absolute error of the fc2 output on the held-out rows.
pred5 <- predict(model5, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
abs.diff5 <- abs(test.y - pred5[1, ])
sum(abs.diff5) / length(test.y)
## [1] 0.7056902
# Same network trained with the built-in MAE regression output, using the same
# seed and hyper-parameters as the MakeLoss version for comparison.
lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro")
mx.set.seed(0)
model6 <- mx.model.FeedForward.create(
  lro_mae,
  X = train.x,
  y = train.y,
  array.layout = "rowmajor",
  array.batch.size = 60,
  num.round = 20,
  optimizer = "sgd",
  learning.rate = 0.001,
  ctx = mx.cpu(),
  verbose = TRUE,
  epoch.end.callback = NULL,
  batch.end.callback = NULL
)
## Start training with 1 devices

# Mean absolute error on the held-out rows.
pred6 <- predict(model6, test.x)
## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
abs.diff6 <- abs(test.y - pred6[1, ])
sum(abs.diff6) / length(test.y)
## [1] 0.7056902

Next Steps