Source code for mxnet.gluon.data.vision.transforms

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# coding: utf-8
# pylint: disable= arguments-differ
"Image transforms."

import random
import numpy as np

from ...block import Block, HybridBlock
from ...nn import Sequential, HybridSequential
from .... import image
from ....base import numeric_types
from ....util import is_np_array


class Compose(Sequential):
    """Sequentially composes multiple transforms.

    Consecutive ``HybridBlock`` transforms are grouped together: a run of two
    or more is wrapped in a single hybridized ``HybridSequential``, while a
    run of exactly one is added as-is. Non-hybrid transforms are added
    individually in their original position.

    Parameters
    ----------
    transforms : list of transform Blocks.
        The list of transforms to be composed. The sequence passed in is
        not modified.


    Inputs:
        - **data**: input tensor with shape of the first transform Block requires.

    Outputs:
        - **out**: output tensor with shape of the last transform Block produces.

    Examples
    --------
    >>> transformer = transforms.Compose([transforms.Resize(300),
    ...                                   transforms.CenterCrop(256),
    ...                                   transforms.ToTensor()])
    >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 3x256x256 @cpu(0)>
    """
    def __init__(self, transforms):
        super(Compose, self).__init__()
        hybrid = []
        # A trailing ``None`` sentinel flushes the last pending run of hybrid
        # blocks. It is appended to a copy so that the caller's list is never
        # mutated (and so that tuples are accepted as well).
        for i in list(transforms) + [None]:
            if isinstance(i, HybridBlock):
                hybrid.append(i)
                continue
            elif len(hybrid) == 1:
                self.add(hybrid[0])
                hybrid = []
            elif len(hybrid) > 1:
                # Fuse the run into one hybridized container.
                hblock = HybridSequential()
                for j in hybrid:
                    hblock.add(j)
                hblock.hybridize()
                self.add(hblock)
                hybrid = []

            if i is not None:
                self.add(i)
class Cast(HybridBlock):
    """Cast the input to a given data type.

    Parameters
    ----------
    dtype : str, default 'float32'
        The target data type, in string or `numpy.dtype`.


    Inputs:
        - **data**: input tensor with arbitrary shape and dtype.

    Outputs:
        - **out**: output tensor with the same shape as `data` and data type as dtype.
    """
    def __init__(self, dtype='float32'):
        super(Cast, self).__init__()
        self._dtype = dtype

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.cast(x, self._dtype)
class ToTensor(HybridBlock):
    """Converts an image NDArray or batch of image NDArray to a tensor NDArray.

    An image NDArray of shape (H x W x C) in the range [0, 255] becomes a
    float32 tensor NDArray of shape (C x H x W) in the range [0, 1].

    For batch input, an NDArray of shape (N x H x W x C) in the range
    [0, 255] becomes a float32 tensor NDArray of shape (N x C x H x W).

    Inputs:
        - **data**: input tensor with (H x W x C) or (N x H x W x C) shape and uint8 type.

    Outputs:
        - **out**: output tensor with (C x H x W) or (N x C x H x W) shape and float32 type.

    Examples
    --------
    >>> transformer = vision.transforms.ToTensor()
    >>> image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 3x4x2 @cpu(0)>
    """
    def __init__(self):
        super(ToTensor, self).__init__()

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.to_tensor(x)
class Normalize(HybridBlock):
    """Normalize a tensor of shape (C x H x W) or (N x C x H x W) with mean and
    standard deviation.

    Given mean `(m1, ..., mn)` and std `(s1, ..., sn)` for `n` channels,
    this transform normalizes each channel of the input tensor with::

        output[i] = (input[i] - mi) / si

    If mean or std is scalar, the same value will be applied to all channels.

    Parameters
    ----------
    mean : float or tuple of floats
        The mean values.
    std : float or tuple of floats
        The standard deviation values.


    Inputs:
        - **data**: input tensor with (C x H x W) or (N x C x H x W) shape.

    Outputs:
        - **out**: output tensor with the shape as `data`.

    Examples
    --------
    >>> transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))
    >>> image = mx.nd.random.uniform(0, 1, (3, 4, 2))
    >>> transformer(image)
    <NDArray 3x4x2 @cpu(0)>
    """
    def __init__(self, mean=0.0, std=1.0):
        super(Normalize, self).__init__()
        self._mean = mean
        self._std = std

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.normalize(x, self._mean, self._std)
class Rotate(Block):
    """Rotate the input image by a given angle. Keeps the original image shape.

    Parameters
    ----------
    rotation_degrees : float32
        Desired rotation angle in degrees.
    zoom_in : bool
        Zoom in image so that no padding is present in final output.
    zoom_out : bool
        Zoom out image so that the entire original image is present in final output.


    Inputs:
        - **data**: input tensor with (C x H x W) or (N x C x H x W) shape
          and float32 type.

    Outputs:
        - **out**: output tensor with (C x H x W) or (N x C x H x W) shape.

    Raises
    ------
    TypeError
        If the input dtype is not float32.
    """
    def __init__(self, rotation_degrees, zoom_in=False, zoom_out=False):
        super(Rotate, self).__init__()
        self._args = (rotation_degrees, zoom_in, zoom_out)

    def forward(self, x):
        # Compare by equality rather than identity so that an equivalent
        # ``np.dtype('float32')`` instance is accepted as well as the
        # ``np.float32`` type object itself.
        if x.dtype != np.float32:
            raise TypeError("This transformation only supports float32. "
                            "Consider calling it after ToTensor")
        return image.imrotate(x, *self._args)
class RandomRotation(Block):
    """Rotate the input image by a random angle.
       Keeps the original image shape and aspect ratio.

    Parameters
    ----------
    angle_limits: tuple
        Tuple of 2 elements ``(lower, upper)`` containing the lower and upper
        limit for rotation angles in degree. ``lower`` must be strictly
        smaller than ``upper``.
    zoom_in : bool
        Zoom in image so that no padding is present in final output.
    zoom_out : bool
        Zoom out image so that the entire original image is present in final output.
    rotate_with_proba : float32
        Probability, in ``[0, 1]``, of rotating the image on a given call.
        When the rotation is skipped the input is returned unchanged.


    Inputs:
        - **data**: input tensor with (C x H x W) or (N x C x H x W) shape
          and float32 type.

    Outputs:
        - **out**: output tensor with (C x H x W) or (N x C x H x W) shape.

    Raises
    ------
    ValueError
        If `angle_limits` is not ordered or `rotate_with_proba` is outside
        ``[0, 1]`` (at construction time).
    TypeError
        If a rotation is attempted on an input whose dtype is not float32.
    """
    def __init__(self, angle_limits, zoom_in=False, zoom_out=False, rotate_with_proba=1.0):
        super(RandomRotation, self).__init__()
        lower, upper = angle_limits
        if lower >= upper:
            raise ValueError("`angle_limits` must be an ordered tuple")
        if rotate_with_proba < 0 or rotate_with_proba > 1:
            raise ValueError("Probability of rotating the image should be between 0 and 1")
        self._args = (angle_limits, zoom_in, zoom_out)
        self._rotate_with_proba = rotate_with_proba

    def forward(self, x):
        # The probability draw comes first: a skipped call returns the input
        # untouched, whatever its dtype.
        if np.random.random() > self._rotate_with_proba:
            return x
        # Compare by equality rather than identity so that an equivalent
        # ``np.dtype('float32')`` instance is accepted as well as the
        # ``np.float32`` type object itself.
        if x.dtype != np.float32:
            raise TypeError("This transformation only supports float32. "
                            "Consider calling it after ToTensor")
        return image.random_rotate(x, *self._args)
class RandomResizedCrop(Block):
    """Crop the input image with random scale and aspect ratio.

    A crop of the original image is taken with random size (default: 0.08
    to 1.0 of the original image size) and random aspect ratio (default:
    3/4 to 4/3), and is then resized to the specified size.

    Parameters
    ----------
    size : int or tuple of (W, H)
        Size of the final output. A single number is used for both width
        and height.
    scale : tuple of two floats
        If scale is `(min_area, max_area)`, the cropped image's area will
        range from min_area to max_area of the original image's area
    ratio : tuple of two floats
        Range of aspect ratio of the cropped image before resizing.
    interpolation : int
        Interpolation method for resizing. By default uses bilinear
        interpolation. See OpenCV's resize function for available choices.


    Inputs:
        - **data**: input tensor with (Hi x Wi x C) shape.

    Outputs:
        - **out**: output tensor with (H x W x C) shape.
    """
    def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0/4.0, 4.0/3.0),
                 interpolation=1):
        super(RandomResizedCrop, self).__init__()
        # Expand a scalar size into a square (W, H) pair.
        size = (size, size) if isinstance(size, numeric_types) else size
        self._args = (size, scale, ratio, interpolation)

    def forward(self, x):
        # Only the first element of the helper's result is returned.
        result = image.random_size_crop(x, *self._args)
        return result[0]
class CropResize(HybridBlock):
    r"""Crop the input image and optionally resize it.

    Makes a crop of the original image then optionally resizes it to the
    specified size.

    Parameters
    ----------
    x : int
        Left boundary of the cropping area
    y : int
        Top boundary of the cropping area
    width : int
        Width of the cropping area
    height : int
        Height of the cropping area
    size : int or tuple of (w, h)
        Optional, resize to new size after cropping. When left as ``None``
        (or any falsy value) no resize is performed.
    interpolation : int, optional
        Interpolation method for resizing, forwarded unchanged to the resize
        operator. See OpenCV's resize function for available choices.
        https://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html?highlight=resize#resize
        Note that the Resize on gpu use contrib.bilinearResize2D operator
        which only support bilinear interpolation(1).


    Inputs:
        - **data**: input tensor with (H x W x C) or (N x H x W x C) shape.

    Outputs:
        - **out**: output tensor with (H x W x C) or (N x H x W x C) shape.

    Examples
    --------
    >>> transformer = vision.transforms.CropResize(x=0, y=0, width=100, height=100)
    >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 100x100x3 @cpu(0)>
    >>> image = mx.nd.random.uniform(0, 255, (3, 224, 224, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 3x100x100x3 @cpu(0)>
    >>> transformer = vision.transforms.CropResize(x=0, y=0, width=100, height=100, size=(50, 50), interpolation=1)
    >>> transformer(image)
    <NDArray 3x50x50x3 @cpu(0)>
    """
    def __init__(self, x, y, width, height, size=None, interpolation=None):
        super(CropResize, self).__init__()
        self._x = x
        self._y = y
        self._width = width
        self._height = height
        self._size = size
        self._interpolation = interpolation

    def hybrid_forward(self, F, x):
        # NOTE(review): unlike the other hybrid transforms in this module, this
        # one does not switch to ``F.npx`` when ``is_np_array()`` is true --
        # confirm whether that is intentional before relying on numpy mode.
        out = F.image.crop(x, self._x, self._y, self._width, self._height)
        if self._size:
            # keep_ratio is fixed to False: resize to exactly ``size``.
            out = F.image.resize(out, self._size, False, self._interpolation)
        return out
class CenterCrop(Block):
    """Crops the image `src` to the given `size` by trimming on all four
    sides and preserving the center of the image. Upsamples if `src` is
    smaller than `size`.

    Parameters
    ----------
    size : int or tuple of (W, H)
        Size of output image. A single number is used for both width and
        height.
    interpolation : int
        Interpolation method for resizing. By default uses bilinear
        interpolation. See OpenCV's resize function for available choices.


    Inputs:
        - **data**: input tensor with (Hi x Wi x C) shape.

    Outputs:
        - **out**: output tensor with (H x W x C) shape.

    Examples
    --------
    >>> transformer = vision.transforms.CenterCrop(size=(1000, 500))
    >>> image = mx.nd.random.uniform(0, 255, (2321, 3482, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 500x1000x3 @cpu(0)>
    """
    def __init__(self, size, interpolation=1):
        super(CenterCrop, self).__init__()
        # Expand a scalar size into a square (W, H) pair.
        size = (size, size) if isinstance(size, numeric_types) else size
        self._args = (size, interpolation)

    def forward(self, x):
        # Only the first element of the helper's result is returned.
        result = image.center_crop(x, *self._args)
        return result[0]
class Resize(HybridBlock):
    """Resize an image or a batch of image NDArray to the given size.
    Should be applied before `mxnet.gluon.data.vision.transforms.ToTensor`.

    Parameters
    ----------
    size : int or tuple of (W, H)
        Size of output image.
    keep_ratio : bool
        Whether to resize the short edge or both edges to `size`,
        if size is given as an integer.
    interpolation : int
        Interpolation method for resizing. By default uses bilinear
        interpolation. See OpenCV's resize function for available choices.
        Note that the Resize on gpu use contrib.bilinearResize2D operator
        which only support bilinear interpolation(1).


    Inputs:
        - **data**: input tensor with (H x W x C) or (N x H x W x C) shape.

    Outputs:
        - **out**: output tensor with (H x W x C) or (N x H x W x C) shape.

    Examples
    --------
    >>> transformer = vision.transforms.Resize(size=(1000, 500))
    >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 500x1000x3 @cpu(0)>
    >>> image = mx.nd.random.uniform(0, 255, (3, 224, 224, 3)).astype(dtype=np.uint8)
    >>> transformer(image)
    <NDArray 3x500x1000x3 @cpu(0)>
    """
    def __init__(self, size, keep_ratio=False, interpolation=1):
        super(Resize, self).__init__()
        self._keep = keep_ratio
        self._size = size
        self._interpolation = interpolation

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.resize(x, self._size, self._keep, self._interpolation)
class RandomFlipLeftRight(HybridBlock):
    """Randomly flip the input image left to right with a probability
    of 0.5.

    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self):
        super(RandomFlipLeftRight, self).__init__()

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_flip_left_right(x)
class RandomFlipTopBottom(HybridBlock):
    """Randomly flip the input image top to bottom with a probability
    of 0.5.

    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self):
        super(RandomFlipTopBottom, self).__init__()

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_flip_top_bottom(x)
class RandomBrightness(HybridBlock):
    """Randomly jitters image brightness with a factor
    chosen from `[max(0, 1 - brightness), 1 + brightness]`.

    Parameters
    ----------
    brightness: float
        How much to jitter brightness. brightness factor is randomly
        chosen from `[max(0, 1 - brightness), 1 + brightness]`.


    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self, brightness):
        super(RandomBrightness, self).__init__()
        # Bounds of the jitter factor; the lower bound is clamped at zero.
        lower = max(0, 1-brightness)
        upper = 1+brightness
        self._args = (lower, upper)

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_brightness(x, *self._args)
class RandomContrast(HybridBlock):
    """Randomly jitters image contrast with a factor
    chosen from `[max(0, 1 - contrast), 1 + contrast]`.

    Parameters
    ----------
    contrast: float
        How much to jitter contrast. contrast factor is randomly
        chosen from `[max(0, 1 - contrast), 1 + contrast]`.


    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self, contrast):
        super(RandomContrast, self).__init__()
        # Bounds of the jitter factor; the lower bound is clamped at zero.
        lower = max(0, 1-contrast)
        upper = 1+contrast
        self._args = (lower, upper)

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_contrast(x, *self._args)
class RandomSaturation(HybridBlock):
    """Randomly jitters image saturation with a factor
    chosen from `[max(0, 1 - saturation), 1 + saturation]`.

    Parameters
    ----------
    saturation: float
        How much to jitter saturation. saturation factor is randomly
        chosen from `[max(0, 1 - saturation), 1 + saturation]`.


    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self, saturation):
        super(RandomSaturation, self).__init__()
        # Bounds of the jitter factor; the lower bound is clamped at zero.
        lower = max(0, 1-saturation)
        upper = 1+saturation
        self._args = (lower, upper)

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_saturation(x, *self._args)
class RandomHue(HybridBlock):
    """Randomly jitters image hue with a factor
    chosen from `[max(0, 1 - hue), 1 + hue]`.

    Parameters
    ----------
    hue: float
        How much to jitter hue. hue factor is randomly
        chosen from `[max(0, 1 - hue), 1 + hue]`.


    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self, hue):
        super(RandomHue, self).__init__()
        # Bounds of the jitter factor; the lower bound is clamped at zero.
        lower = max(0, 1-hue)
        upper = 1+hue
        self._args = (lower, upper)

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_hue(x, *self._args)
class RandomColorJitter(HybridBlock):
    """Randomly jitters the brightness, contrast, saturation, and hue
    of an image.

    Parameters
    ----------
    brightness : float
        How much to jitter brightness. brightness factor is randomly
        chosen from `[max(0, 1 - brightness), 1 + brightness]`.
    contrast : float
        How much to jitter contrast. contrast factor is randomly
        chosen from `[max(0, 1 - contrast), 1 + contrast]`.
    saturation : float
        How much to jitter saturation. saturation factor is randomly
        chosen from `[max(0, 1 - saturation), 1 + saturation]`.
    hue : float
        How much to jitter hue. hue factor is randomly
        chosen from `[max(0, 1 - hue), 1 + hue]`.


    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
        super(RandomColorJitter, self).__init__()
        # The raw jitter amounts are stored as given and forwarded to the
        # operator in this order.
        self._args = (brightness, contrast, saturation, hue)

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_color_jitter(x, *self._args)
class RandomLighting(HybridBlock):
    """Add AlexNet-style PCA-based noise to an image.

    Parameters
    ----------
    alpha : float
        Intensity of the image.


    Inputs:
        - **data**: input tensor with (H x W x C) shape.

    Outputs:
        - **out**: output tensor with same shape as `data`.
    """
    def __init__(self, alpha):
        super(RandomLighting, self).__init__()
        self._alpha = alpha

    def hybrid_forward(self, F, x):
        # Use the ``npx`` operator namespace when numpy-array mode is active.
        ops = F.npx if is_np_array() else F
        return ops.image.random_lighting(x, self._alpha)
class RandomApply(Sequential):
    """Apply a list of transformations randomly given probability

    Parameters
    ----------
    transforms
        List of transformations, or a single callable transform Block.
        A list or tuple is wrapped in a `Compose` so that it can be applied
        as one unit.
    p : float
        Probability of applying the transformations.


    Inputs:
        - **data**: input tensor.

    Outputs:
        - **out**: transformed image, or the unmodified input when the
          transformations are skipped.
    """

    def __init__(self, transforms, p=0.5):
        super(RandomApply, self).__init__()
        # A plain list/tuple is not callable; compose it into a single block.
        # A fresh list is handed over so the caller's sequence stays untouched.
        if isinstance(transforms, (list, tuple)):
            transforms = Compose(list(transforms))
        self.transforms = transforms
        self.p = p

    def forward(self, x):
        # random.random() lies in [0.0, 1.0), so with `>=` a probability of 0
        # never applies the transforms and a probability of 1 always does.
        if random.random() >= self.p:
            return x
        return self.transforms(x)