#!/usr/bin/env python
# ******************************************************************************
# MIT License
#
# Copyright (c) 2017 Ngoc Anh Huynh
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# ******************************************************************************
"""
Data generator for YOLO training.
"""
import copy
import cv2
import numpy as np
from imgaug import augmenters as iaa
from imgaug.augmentables.bbs import BoundingBoxesOnImage, BoundingBox as BoundingBoxAug
from keras.utils.data_utils import Sequence
from .processing import BoundingBox
class BatchGenerator(Sequence):
""" Data generator used for the training process.
"""
def __init__(self,
input_shape,
data,
grid_size,
labels,
batch_size,
shuffle=True,
jitter=True):
self._input_shape = input_shape
self._data = data
self._data_length = len(data)
self._grid_size = grid_size
self._labels = labels
self._batch_size = batch_size
self._shuffle = shuffle
self._jitter = jitter
self._aug_pipe = self.build_aug_pipeline()
if shuffle:
np.random.shuffle(self._data)
@staticmethod
def build_aug_pipeline():
""" Define a sequence of augmentation steps that will be applied to every image.
Returns:
            iaa.Sequential: the sequence of augmentations.
"""
# augmentors by https://github.com/aleju/imgaug
        def sometimes(aug):
            return iaa.Sometimes(0.5, aug)
# All augmenters with per_channel=0.5 will sample one value per
# image in 50% of all cases. In all other cases they will sample new
# values per channel.
return iaa.Sequential(
[
                # affine transform applied to half of the images; configured
                # here as a no-op placeholder (no geometric parameters set)
                sometimes(iaa.Affine()),
# execute 0 to 5 of the following (less important) augmenters
# per image. Don't execute all of them, as that would often be
# way too strong
iaa.SomeOf(
(0, 5),
[
iaa.OneOf([
# blur images with a sigma between 0 and 3.0
iaa.GaussianBlur((0, 3.0)),
# blur image using local means (kernel sizes between
# 2 and 7)
iaa.AverageBlur(k=(2, 7)),
# blur image using local medians (kernel sizes
# between 3 and 11)
iaa.MedianBlur(k=(3, 11)),
]),
# sharpen images
iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),
# add gaussian noise
iaa.AdditiveGaussianNoise(
loc=0, scale=(0.0, 0.05 * 255), per_channel=0.5),
                        # randomly remove up to 10% of the pixels
                        iaa.Dropout((0.01, 0.1), per_channel=0.5),
                        # shift brightness of images (additive)
                        iaa.Add((-10, 10), per_channel=0.5),
                        # scale brightness of images (multiplicative)
                        iaa.Multiply((0.5, 1.5), per_channel=0.5),
# improve or worsen the contrast
iaa.LinearContrast((0.5, 2.0), per_channel=0.5),
],
random_order=True)
],
random_order=True)
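    # A minimal sketch of how the pipeline above is applied (it mirrors the
    # call in aug_image below); img is a hypothetical HxWxC uint8 array and
    # bbs a BoundingBoxesOnImage instance:
    #
    #     pipe = BatchGenerator.build_aug_pipeline()
    #     img_aug, bbs_aug = pipe(image=img, bounding_boxes=bbs)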
def list2bbox(self, boxes, image_shape):
""" Transforms bounding boxes from numpy array to BoundingBox format from imgaug library.
"""
        formatted_boxes = [BoundingBoxAug(x1=box["x1"], y1=box["y1"], x2=box["x2"],
                                          y2=box["y2"], label=box['label'])
                           for box in boxes]
        return BoundingBoxesOnImage(formatted_boxes, shape=image_shape)
def bbox2list(self, boxes):
""" Extracts bounding boxes from imgaug object to numpy array.
"""
return [{"label": box.label, "x1": box.x1, "y1": box.y1, "x2": box.x2, "y2": box.y2}
for box in boxes]
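    # Example round-trip through the two helpers above, using hypothetical
    # values in the dict format the generator expects:
    #
    #     boxes = [{"label": "car", "x1": 10, "y1": 20, "x2": 50, "y2": 60}]
    #     bbs = generator.list2bbox(boxes, image_shape=(416, 416, 3))
    #     boxes_back = generator.bbox2list(bbs)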
def __len__(self):
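        # number of batches per epoch, e.g. 103 samples with a batch size of 8
        # give ceil(103 / 8) = 13 batches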
return int(np.ceil(float(self._data_length) / self._batch_size))
def __getitem__(self, idx):
        raise NotImplementedError("This function must be defined by the child class.")
def on_epoch_end(self):
if self._shuffle:
np.random.shuffle(self._data)
def aug_image(self, train_instance, jitter):
""" Performs data augmentation on an image.
Args:
            train_instance (dict): the dictionary of input data (image and
                boxes)
            jitter (bool): enable jitter in the augmentation pipeline
        Raises:
            ValueError: if the number of channels in the input shape is wrong
                (i.e. different from 1 or 3).
RuntimeError: if the image cannot be found
Returns:
tuple: the augmented image and the associated boxes
"""
image_path = train_instance['image_path']
if self._input_shape[2] == 1:
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
elif self._input_shape[2] == 3:
image = cv2.imread(image_path)
else:
raise ValueError("Invalid number of image channels.")
if image is None:
raise RuntimeError(f'Cannot find {image_path}')
h = image.shape[0]
w = image.shape[1]
all_objs = copy.deepcopy(train_instance['boxes'])
if jitter:
            # scale the image up by a random factor in [1.0, 1.1)
            scale = np.random.uniform() / 10. + 1.
image = cv2.resize(image, (0, 0), fx=scale, fy=scale)
# translate the image
max_offx = (scale - 1.) * w
max_offy = (scale - 1.) * h
offx = int(np.random.uniform() * max_offx)
offy = int(np.random.uniform() * max_offy)
image = image[offy:(offy + h), offx:(offx + w)]
# flip the image
flip = np.random.binomial(1, .5)
if flip > 0.5:
image = cv2.flip(image, 1)
# augment both image and boxes
bbs = self.list2bbox(all_objs, image.shape)
image, bbs = self._aug_pipe(image=image, bounding_boxes=bbs)
            bbs = bbs.remove_out_of_image().clip_out_of_image()
all_objs = self.bbox2list(bbs)
# resize the image to standard size
image = cv2.resize(image, (self._input_shape[0], self._input_shape[1]))
if self._input_shape[2] == 1:
image = image[..., np.newaxis]
        # reverse channel order (OpenCV loads BGR); a no-op for grayscale
        image = image[..., ::-1]
# fix object's position and size
for obj in all_objs:
for attr in ['x1', 'x2']:
if jitter:
obj[attr] = int(obj[attr] * scale - offx)
obj[attr] = int(obj[attr] * float(self._input_shape[0]) / w)
obj[attr] = max(min(obj[attr], self._input_shape[0]), 0)
for attr in ['y1', 'y2']:
if jitter:
obj[attr] = int(obj[attr] * scale - offy)
obj[attr] = int(obj[attr] * float(self._input_shape[1]) / h)
obj[attr] = max(min(obj[attr], self._input_shape[1]), 0)
if jitter and flip > 0.5:
xmin = obj['x1']
obj['x1'] = self._input_shape[0] - obj['x2']
obj['x2'] = self._input_shape[0] - xmin
return image, all_objs
class BatchYoloGenerator(BatchGenerator):
    """ Data generator that encodes ground truth into the YOLO output tensor.
    """
def __init__(self,
input_shape,
data,
grid_size,
labels,
anchors,
batch_size,
shuffle=True,
jitter=True):
super().__init__(input_shape, data, grid_size, labels, batch_size, shuffle, jitter)
        self._anchors = [BoundingBox(0, 0, anchor_w, anchor_h)
                         for anchor_w, anchor_h in anchors]
def __getitem__(self, idx):
lower_bound = idx * self._batch_size
upper_bound = (idx + 1) * self._batch_size
        if upper_bound > self._data_length:
            # shift the final window back so that every batch stays full-size
            upper_bound = self._data_length
            lower_bound = upper_bound - self._batch_size
instance_count = 0
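        # note: input_shape is (width, height, channels) while the batch
        # tensor below is laid out as (batch, height, width, channels)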
x_batch = np.zeros((upper_bound - lower_bound, self._input_shape[1],
self._input_shape[0], self._input_shape[2]))
y_batch = np.zeros(
(upper_bound - lower_bound, self._grid_size[1], self._grid_size[0],
len(self._anchors),
4 + 1 + len(self._labels))) # desired network output
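        # last-axis layout per anchor: 4 box coordinates (x, y, w, h in grid
        # units), 1 objectness flag, then one-hot class scores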
for train_instance in self._data[lower_bound:upper_bound]:
# augment input image and fix object's position and size
img, all_objs = self.aug_image(train_instance, jitter=self._jitter)
            for obj in all_objs:
                if (obj['x2'] > obj['x1'] and obj['y2'] > obj['y1']
                        and obj['label'] in self._labels):
center_x = .5 * (obj['x1'] + obj['x2'])
center_x = center_x / (float(self._input_shape[0]) /
self._grid_size[0])
                    center_y = .5 * (obj['y1'] + obj['y2'])
                    center_y = center_y / (float(self._input_shape[1]) /
                                           self._grid_size[1])
grid_x = int(np.floor(center_x))
grid_y = int(np.floor(center_y))
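                    # e.g. with a 416-pixel-wide input and a 13-cell grid, a
                    # box centered at x = 200 px falls in cell floor(200 / 32) = 6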
                    if (grid_x < self._grid_size[0]
                            and grid_y < self._grid_size[1]):
obj_indx = self._labels.index(obj['label'])
center_w = (obj['x2'] - obj['x1']) / (
float(self._input_shape[0]) / self._grid_size[0])
                        center_h = (obj['y2'] - obj['y1']) / (
                            float(self._input_shape[1]) / self._grid_size[1])
box = [center_x, center_y, center_w, center_h]
# find the anchor that best predicts this box
best_anchor = -1
max_iou = -1
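                        # the candidate box and every anchor are pinned to the
                        # origin, so the IoU comparison depends only on (w, h)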
shifted_box = BoundingBox(0, 0, center_w, center_h)
for anchor_id, anchor in enumerate(self._anchors):
iou = shifted_box.iou(anchor)
if max_iou < iou:
best_anchor = anchor_id
max_iou = iou
# assign ground truth x, y, w, h, confidence and class
# probs to y_batch
y_batch[instance_count, grid_y, grid_x, best_anchor,
0:4] = box
y_batch[instance_count, grid_y, grid_x, best_anchor,
4] = 1.
y_batch[instance_count, grid_y, grid_x, best_anchor,
5 + obj_indx] = 1
# assign input image to x_batch
x_batch[instance_count] = img
# increase instance counter in current batch
instance_count += 1
return x_batch, y_batch
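
# A minimal usage sketch (hypothetical paths and values; anchors are
# (width, height) pairs expressed in grid-cell units):
#
#     train_data = [
#         {"image_path": "images/000001.jpg",
#          "boxes": [{"label": "car", "x1": 10, "y1": 20, "x2": 50, "y2": 60}]},
#         # ... one dict per training image
#     ]
#     generator = BatchYoloGenerator(input_shape=(416, 416, 3),
#                                    data=train_data,
#                                    grid_size=(13, 13),
#                                    labels=["car"],
#                                    anchors=[(1.3, 1.7), (4.2, 5.3)],
#                                    batch_size=16)
#     x_batch, y_batch = generator[0]
#     # x_batch shape: (16, 416, 416, 3); y_batch shape: (16, 13, 13, 2, 6)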