Source code for mxnet.gluon.parameter

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# coding: utf-8
# pylint: disable=
"""Neural network parameter."""

from collections import OrderedDict
import warnings
import numpy as np


from ..base import mx_real_t, MXNetError
from .. import symbol, ndarray, initializer, context
from ..context import Context
from .. import autograd
from .utils import _indent

# pylint: disable= invalid-name
tensor_types = (symbol.Symbol, ndarray.NDArray)
# pylint: enable= invalid-name

class DeferredInitializationError(MXNetError):
    """Error for unfinished deferred initialization."""
    pass

[docs]class Parameter(object):
    """A Container holding parameters (weights) of `Block`s.

    `Parameter` holds a copy of the parameter on each `Context` after
    it is initialized with `Parameter.initialize(...)`. If `grad_req` is
    not `null`, it will also hold a gradient array on each `Context`::

        ctx = mx.gpu(0)
        x = mx.nd.zeros((16, 100), ctx=ctx)
        w = mx.gluon.Parameter('fc_weight', shape=(64, 100), init=mx.init.Xavier())
        b = mx.gluon.Parameter('fc_bias', shape=(64,), init=mx.init.Zero())
        w.initialize(ctx=ctx)
        b.initialize(ctx=ctx)
        out = mx.nd.FullyConnected(x, w.data(ctx), b.data(ctx), num_hidden=64)

    Parameters
    ----------
    name : str
        Name of this parameter.
    grad_req : {'write', 'add', 'null'}, default 'write'
        Specifies how to update gradient to grad arrays.

        - 'write' means everytime gradient is written to grad `NDArray`.
        - 'add' means everytime gradient is added to the grad `NDArray`. You need
          to manually call `zero_grad()` to clear the gradient buffer before each
          iteration when using this option.
        - 'null' means gradient is not requested for this parameter. gradient arrays
          will not be allocated.
    shape : tuple of int, default None
        Shape of this parameter. By default shape is not specified. Parameter with
        unknown shape can be used for `Symbol` API, but `init` will throw an error
        when using `NDArray` API.
    dtype : numpy.dtype or str, default 'float32'
        Data type of this parameter. For example, numpy.float32 or 'float32'.
    lr_mult : float, default 1.0
        Learning rate multiplier. Learning rate will be multiplied by lr_mult
        when updating this parameter with optimizer.
    wd_mult : float, default 1.0
        Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult.
    init : Initializer, default None
        Initializer of this parameter. Will use the global initializer by default.

    Attributes
    ----------
    grad_req : {'write', 'add', 'null'}
        This can be set before or after initialization. Setting grad_req to null
        with `x.grad_req = 'null'` saves memory and computation when you don't
        need gradient w.r.t x.
    """
    def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t,
                 lr_mult=1.0, wd_mult=1.0, init=None, allow_deferred_init=False,
                 differentiable=True):
        self._var = None
        self._data = None
        self._grad = None
        self._deferred_init = ()
        self._differentiable = differentiable
        self._grad_req = None
        self.name = name
        self.shape = shape
        self.dtype = dtype
        self.lr_mult = lr_mult
        self.wd_mult = wd_mult
        self.grad_req = grad_req
        self.init = init
        self.allow_deferred_init = allow_deferred_init

    def __repr__(self):
        s = 'Parameter {name} (shape={shape}, dtype={dtype})'
        return s.format(**self.__dict__)

    @property
    def grad_req(self):
        return self._grad_req

    @grad_req.setter
    def grad_req(self, req):
        assert req in ['write', 'add', 'null'], \
            "grad_req must be one of write, add, or null, but got %s"%req
        if not self._differentiable:
            req = 'null'
        if self._grad_req == req:
            return
        self._grad_req = req
        if req == 'null' and self._grad is not None:
            self._grad = None
            for ctx in self._data:
                self._data[ctx] = self._data[ctx].detach()
        elif self._data is not None:
            self._init_grad()

    def _check_initialized(self, ctx=None):
        if self._data is not None:
            if ctx is not None and ctx not in self._data:
                raise RuntimeError(
                    "Parameter %s was not initialized on context %s. "
                    "It was only initialized on %s."%(
                        self.name, str(ctx), str(self.list_ctx())))
            return
        if self._deferred_init:
            raise DeferredInitializationError
        raise RuntimeError(
            "Parameter %s has not been initialized. Note that " \
            "you should initialize parameters and create Trainer " \
            "with Block.collect_params() instead of Block.params " \
            "because the later does not include Parameters of " \
            "nested child Blocks"%(self.name))

    def _load_init(self, data, ctx):
        """(Re)initializes by loading from data."""
        if self.shape:
            for i, j in zip(self.shape, data.shape):
                assert i == 0 or i == j, \
                    "Failed loading Parameter %s from saved params: " \
                    "shape incompatible expacted %s vs saved %s"%(
                        self.name, str(self.shape), str(data.shape))
        if self.dtype:
            assert np.dtype(self.dtype).type == data.dtype, \
                "Failed loading Parameter %s from saved params: " \
                "dtype incompatible expacted %s vs saved %s"%(
                    self.name, str(self.dtype), str(data.dtype))
        if isinstance(ctx, Context):
            ctx = [ctx]
        if self._data is None:
            if self._deferred_init:
                assert set(ctx) == set(self._deferred_init[1]), \
                    "Failed to load Parameter %s on %s because it was " \
                    "previous initialized on %s."%(
                        self.name, str(ctx), str(self.list_ctx()))
            self._init_impl(data, ctx)
        else:
            assert set(ctx) == set(self.list_ctx()), \
                "Failed to load Parameter %s on %s because it was " \
                "previous initialized on %s."%(
                    self.name, str(ctx), str(self.list_ctx()))
            self.set_data(data)
        self._deferred_init = ()

    def _finish_deferred_init(self):
        """Finishes deferred initialization."""
        if not self._deferred_init:
            return
        init, ctx, default_init = self._deferred_init
        self._deferred_init = ()
        assert self.shape is not None and np.prod(self.shape) > 0, \
            "Cannot initialize Parameter %s because it has " \
            "invalid shape: %s. Please specify in_units, " \
            "in_channels, etc for `Block`s."%(
                self.name, str(self.shape))

        with autograd.pause():
            data = ndarray.zeros(shape=self.shape, dtype=self.dtype,
                                 ctx=context.cpu())
            initializer.create(default_init)(
                initializer.InitDesc(self.name, {'__init__': init}), data)

            self._init_impl(data, ctx)

    def _init_impl(self, data, ctx):
        """Sets data and grad."""
        self._data = OrderedDict()
        for i in ctx:
            self._data[i] = data.copyto(i)
        self._init_grad()

    def _init_grad(self):
        """Initialize grad buffers."""
        if self.grad_req == 'null':
            self._grad = None
            return

        self._grad = OrderedDict()
        for i in self._data:
            self._grad[i] = ndarray.zeros_like(self._data[i])

        autograd.mark_variables(self.list_data(), self.list_grad(), self.grad_req)

    def _reduce(self):
        """Reduce data from multiple context."""
        block = self.list_data()
        data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block)
        return data

[docs]    def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
                   force_reinit=False):
        """Initializes parameter and gradient arrays. Only used for `NDArray` API.

        Parameters
        ----------
        init : Initializer
            The initializer to use. Overrides `Parameter.init` and default_init.
        ctx : Context or list of Context, defaults to `context.current_context()`.
            Initialize Parameter on given context. If ctx is a list of Context, a
            copy will be made for each context.

            .. note:: Copies are independent arrays. User is responsible for keeping
            their values consistent when updating. Normally `gluon.Trainer` does this for you.
        default_init : Initializer
            Default initializer is used when both `init` and `Parameter.init` are `None`.
        force_reinit : bool, default False
            Whether to force re-initialization if parameter is already initialized.

        Examples
        --------
        >>> weight = mx.gluon.Parameter('weight', shape=(2, 2))
        >>> weight.initialize(ctx=mx.cpu(0))
        >>> weight.data()
        [[-0.01068833  0.01729892]
         [ 0.02042518 -0.01618656]]
        
        >>> weight.grad()
        [[ 0.  0.]
         [ 0.  0.]]
        
        >>> weight.initialize(ctx=[mx.gpu(0), mx.gpu(1)])
        >>> weight.data(mx.gpu(0))
        [[-0.00873779 -0.02834515]
         [ 0.05484822 -0.06206018]]
        
        >>> weight.data(mx.gpu(1))
        [[-0.00873779 -0.02834515]
         [ 0.05484822 -0.06206018]]
        
        """
        if self._data is not None and not force_reinit:
            warnings.warn("Parameter %s is already initialized, ignoring. " \
                          "Set force_reinit=True to re-initialize."%self.name)
            return
        self._data = self._grad = None

        if ctx is None:
            ctx = [context.current_context()]
        if isinstance(ctx, Context):
            ctx = [ctx]
        if init is None:
            init = default_init if self.init is None else self.init
        if not self.shape or np.prod(self.shape) <= 0:
            if self.allow_deferred_init:
                self._deferred_init = (init, ctx, default_init)
                return
            raise ValueError("Cannot initialize Parameter %s because it has " \
                             "invalid shape: %s."%(self.name, str(self.shape)))

        self._deferred_init = (init, ctx, default_init)
        self._finish_deferred_init()

[docs]    def reset_ctx(self, ctx):
        """Re-assign Parameter to other contexts.

        ctx : Context or list of Context, default `context.current_context()`.
            Assign Parameter to given context. If ctx is a list of Context, a
            copy will be made for each context.
        """
        if ctx is None:
            ctx = [context.current_context()]
        if isinstance(ctx, Context):
            ctx = [ctx]
        if self._data:
            data = self._reduce()
            with autograd.pause():
                self._init_impl(data, ctx)
        elif self._deferred_init:
            init, _, default_init = self._deferred_init
            self._deferred_init = (init, ctx, default_init)
        else:
            raise ValueError("Cannot reset context for Parameter %s because it "
                             "has not been initialized."%self.name)


[docs]    def set_data(self, data):
        """Sets this parameter's value on all contexts to data."""
        assert self._data is not None, \
            "Parameter %s has not been initialized"%self.name
        for arr in self.list_data():
            arr[:] = data

[docs]    def data(self, ctx=None):
        """Returns a copy of this parameter on one context. Must have been
        initialized on this context before.

        Parameters
        ----------
        ctx : Context
            Desired context.

        Returns
        -------
        NDArray on ctx
        """
        if ctx is None:
            list_ctx = self.list_ctx()
            if len(list_ctx) == 1:
                ctx = list_ctx[0]
            else:
                ctx = context.current_context()
        self._check_initialized(ctx)
        return self._data[ctx]

[docs]    def list_data(self):
        """Returns copies of this parameter on all contexts, in the same order
        as creation."""
        self._check_initialized()
        return list(self._data.values())

[docs]    def grad(self, ctx=None):
        """Returns a gradient buffer for this parameter on one context.

        Parameters
        ----------
        ctx : Context
            Desired context.
        """
        if ctx is None:
            list_ctx = self.list_ctx()
            if len(list_ctx) == 1:
                ctx = list_ctx[0]
            else:
                ctx = context.current_context()
        self._check_initialized(ctx)
        if self._grad is None:
            raise RuntimeError(
                "Cannot get gradient array for Parameter %s " \
                "because grad_req='null'"%(self.name))
        return self._grad[ctx]

[docs]    def list_grad(self):
        """Returns gradient buffers on all contexts, in the same order
        as `values`."""
        self._check_initialized()
        assert self._grad is not None, \
            "Parameter %s does not have gradients because grad_req='null'"%self.name
        return list(self._grad.values())

[docs]    def list_ctx(self):
        """Returns a list of contexts this parameter is initialized on."""
        if self._data is None:
            if self._deferred_init:
                return self._deferred_init[1]
            raise RuntimeError("Parameter %s has not been initialized"%self.name)
        return list(self._data.keys())

[docs]    def zero_grad(self):
        """Sets gradient buffer on all contexts to 0. No action is taken if
        parameter is uninitialized or doesn't require gradient."""
        if self._grad is None:
            return
        for i in self._grad.values():
            i[:] = 0

[docs]    def var(self):
        """Returns a symbol representing this parameter."""
        if self._var is None:
            self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype,
                                   lr_mult=self.lr_mult, wd_mult=self.wd_mult,
                                   init=self.init)
        return self._var


[docs]class ParameterDict(object):
    """A dictionary managing a set of parameters.

    Parameters
    ----------
    prefix : str, default ''
        The prefix to be prepended to all Parameters' names created by this dict.
    shared : ParameterDict or None
        If not `None`, when this dict's `get` method creates a new parameter, will
        first try to retrieve it from `shared` dict. Usually used for sharing
        parameters with another `Block`.
    """
    def __init__(self, prefix='', shared=None):
        self._prefix = prefix
        self._params = OrderedDict()
        self._shared = shared

    def __repr__(self):
        s = '{name}(\n{content}\n)'
        name = self._prefix+' ' if self._prefix else ''
        return s.format(name=name,
                        content='\n'.join([_indent('  {0}'.format(v), 2)
                                           for v in self.values()]))

    def __getitem__(self, key):
        return self._params[key]

    def __iter__(self):
        return iter(self._params)

    def items(self):
        return self._params.items()

    def keys(self):
        return self._params.keys()

    def values(self):
        return self._params.values()

    @property
    def prefix(self):
        """Prefix of this dict. It will be prepended to Parameters' name created
        with `get`."""
        return self._prefix

    def _get_impl(self, name):
        if name in self._params:
            return self._params[name]
        if self._shared is not None and name in self._shared._params:
            self._params[name] = self._shared._params[name]
            return self._shared._params[name]
        return None

[docs]    def get(self, name, **kwargs):
        """Retrieves a `Parameter` with name `self.prefix+name`. If not found,
        `get` will first try to retrieve it from `shared` dict. If still not
        found, `get` will create a new `Parameter` with key-word arguments and
        insert it to self.

        Parameters
        ----------
        name : str
            Name of the desired Parameter. It will be prepended with this dictionary's
            prefix.
        **kwargs : dict
            The rest of key-word arguments for the created `Parameter`.

        Returns
        -------
        Parameter
            The created or retrieved `Parameter`.
        """
        name = self.prefix + name
        param = self._get_impl(name)
        if param is None:
            param = Parameter(name, **kwargs)
            self._params[name] = param
        else:
            for k, v in kwargs.items():
                if hasattr(param, k) and getattr(param, k) is not None:
                    assert v is None or v == getattr(param, k), \
                        "Cannot retrieve Parameter %s because desired attribute " \
                        "does not match with stored for attribute %s: " \
                        "desired %s vs stored %s."%(
                            name, k, str(v), str(getattr(param, k)))
                else:
                    setattr(param, k, v)
        return param

[docs]    def update(self, other):
        """Copies all Parameters in `other` to self."""
        for k, v in other.items():
            if k in self._params:
                assert self._params[k] is v, \
                    "Cannot update self with other because they have different " \
                    "Parameters with the same name %s"%k
            else:
                self._params[k] = v

[docs]    def initialize(self, init=initializer.Uniform(), ctx=None, verbose=False,
                   force_reinit=False):
        """Initializes all Parameters managed by this dictionary to be used for `NDArray`
        API. It has no effect when using `Symbol` API.

        Parameters
        ----------
        init : Initializer
            Global default Initializer to be used when `Parameter.init` is `None`.
            Otherwise, `Parameter.init` takes precedence.
        ctx : Context or list of Context
            Keeps a copy of Parameters on one or many context(s).
        force_reinit : bool, default False
            Whether to force re-initialization if parameter is already initialized.
        """
        if verbose:
            init.set_verbosity(verbose=verbose)
        for _, v in self.items():
            v.initialize(None, ctx, init, force_reinit=force_reinit)

[docs]    def zero_grad(self):
        """Sets all Parameters' gradient buffer to 0."""
        for i in self.values():
            i.zero_grad()

[docs]    def reset_ctx(self, ctx):
        """Re-assign all Parameters to other contexts.

        ctx : Context or list of Context, default `context.current_context()`.
            Assign Parameter to given context. If ctx is a list of Context, a
            copy will be made for each context.
        """
        for i in self.values():
            i.reset_ctx(ctx)

[docs]    def setattr(self, name, value):
        """Set an attribute to a new value for all Parameters.

        For example, set grad_req to null if you don't need gradient w.r.t a
        model's Parameters::

            model.collect_params().setattr('grad_req', 'null')

        or change the learning rate multiplier::

            model.collect_params().setattr('lr_mult', 0.5)

        Parameters
        ----------
        name : str
            Name of the attribute.
        value : valid type for attribute name
            The new value for the attribute.
        """
        for i in self.values():
            setattr(i, name, value)

[docs]    def save(self, filename, strip_prefix=''):
        """Save parameters to file.

        filename : str
            Path to parameter file.
        strip_prefix : str, default ''
            Strip prefix from parameter names before saving.
        """
        arg_dict = {}
        for param in self.values():
            weight = param._reduce()
            if not param.name.startswith(strip_prefix):
                raise ValueError(
                    "Prefix %s is to be striped before saving, but Parameter " \
                    "%s does not start with %s. If you are using Block.save_params, " \
                    "This may be due to your Block shares parameters from other " \
                    "Blocks or you forgot to use `with name_scope()`` during init. " \
                    "Consider switching to Block.collect_params.save and " \
                    "Block.collect_params.load instead."%(
                        strip_prefix, param.name, strip_prefix))
            arg_dict[param.name[len(strip_prefix):]] = weight
        ndarray.save(filename, arg_dict)

[docs]    def load(self, filename, ctx, allow_missing=False,
             ignore_extra=False, restore_prefix=''):
        """Load parameters from file.

        filename : str
            Path to parameter file.
        ctx : Context or list of Context
            Context(s) initialize loaded parameters on.
        allow_missing : bool, default False
            Whether to silently skip loading parameters not represents in the file.
        ignore_extra : bool, default False
            Whether to silently ignore parameters from the file that are not
            present in this ParameterDict.
        restore_prefix : str, default ''
            prepend prefix to names of stored parameters before loading.
        """
        if restore_prefix:
            for name in self.keys():
                assert name.startswith(restore_prefix), \
                    "restore_prefix is %s but Parameters name %s does not start " \
                    "with %s"%(restore_prefix, name, restore_prefix)
        lprefix = len(restore_prefix)
        arg_dict = {restore_prefix+k: v for k, v in ndarray.load(filename).items()}
        if not allow_missing:
            for name in self.keys():
                assert name in arg_dict, \
                    "Parameter %s is missing in file %s"%(name[lprefix:], filename)
        for name in arg_dict:
            if name not in self._params:
                assert ignore_extra, \
                    "Parameter %s loaded from file %s is not present in ParameterDict"%(
                        name[lprefix:], filename)
                continue
            self[name]._load_init(arg_dict[name], ctx)