Source code for akida_models.detection.processing

#!/usr/bin/env python
# ******************************************************************************
# Copyright 2020 Brainchip Holdings Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
"""
Processing tools for YOLO data handling.
"""

__all__ = ["BoundingBox", "load_image", "preprocess_image", "decode_output",
           "parse_voc_annotations", "parse_widerface_annotations"]

import os

import xml.etree.ElementTree as et
import numpy as np
import tensorflow as tf


class BoundingBox:
    """ Utility class to represent a bounding box.

    The box is defined by its top left corner (x1, y1), bottom right corner
    (x2, y2), label, score and classes.
    """

    def __init__(self, x1, y1, x2, y2, score=-1, classes=None):
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.label = -1
        self.score = score
        self.classes = classes

    def __repr__(self):
        return "<BoundingBox({}, {}, {}, {}, {}, {}, {})>\n".format(
            self.x1, self.x2, self.y1, self.y2, self.get_label(),
            self.get_score(), self.classes)
    def get_label(self):
        """ Returns the label for this bounding box.

        Returns:
            Index of the label as an integer.
        """
        if self.label == -1:
            self.label = np.argmax(self.classes)
        return self.label
    def get_score(self):
        """ Returns the score for this bounding box.

        Returns:
            Confidence as a float.
        """
        if self.score == -1:
            self.score = self.classes[self.get_label()]
        return self.score
    def iou(self, other):
        """ Computes intersection over union ratio between this bounding box
        and another one.

        Args:
            other (BoundingBox): the other bounding box for IOU computation

        Returns:
            IOU value as a float
        """

        def _interval_overlap(interval_1, interval_2):
            x1, x2 = interval_1
            x3, x4 = interval_2
            if x3 < x1:
                if x4 < x1:
                    return 0
                return min(x2, x4) - x1
            if x2 < x3:
                return 0
            return min(x2, x4) - x3

        intersect_w = _interval_overlap([self.x1, self.x2],
                                        [other.x1, other.x2])
        intersect_h = _interval_overlap([self.y1, self.y2],
                                        [other.y1, other.y2])
        intersect = intersect_w * intersect_h

        w1, h1 = self.x2 - self.x1, self.y2 - self.y1
        w2, h2 = other.x2 - other.x1, other.y2 - other.y1
        union = w1 * h1 + w2 * h2 - intersect

        return float(intersect) / union
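
# A minimal usage sketch of the BoundingBox helpers above. The two boxes and
# their class scores are made-up values for illustration, not the output of
# any model.
def _example_bounding_box_iou():
    box_a = BoundingBox(0.0, 0.0, 0.5, 1.0, classes=np.array([0.1, 0.9]))
    box_b = BoundingBox(0.25, 0.0, 0.75, 1.0, classes=np.array([0.8, 0.2]))
    # label and score are derived lazily from `classes`
    print(box_a.get_label(), box_a.get_score())  # -> 1 0.9
    # intersection is 0.25, union is 0.5 + 0.5 - 0.25 = 0.75 -> IoU = 1/3
    print(box_a.iou(box_b))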
def load_image(image_path):
    """ Loads an image from a path.

    Args:
        image_path (string): full path of the image to load

    Returns:
        a TensorFlow image Tensor
    """
    raw_image = tf.io.read_file(image_path)
    return tf.image.decode_jpeg(raw_image, channels=3)
def preprocess_image(image_buffer, output_size):
    """ Preprocesses an image for YOLO inference.

    Args:
        image_buffer (tf.Tensor): image to preprocess
        output_size (tuple): shape of the image after preprocessing

    Returns:
        A resized image as a Numpy array.
    """
    # Resize
    width = tf.constant(output_size[0])
    height = tf.constant(output_size[1])
    image = tf.compat.v1.image.resize(image_buffer, [height, width],
                                      method=tf.image.ResizeMethod.BILINEAR,
                                      align_corners=False)
    return image.numpy()
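
# A short sketch chaining load_image and preprocess_image. "sample.jpg" is a
# hypothetical path that must point to an existing JPEG, and (224, 224) is an
# arbitrary target size.
def _example_load_and_preprocess(image_path="sample.jpg"):
    image = load_image(image_path)  # uint8 Tensor of shape (h, w, 3)
    # bilinear resize to the target size, returned as a float Numpy array
    resized = preprocess_image(image, (224, 224))
    return resized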
def decode_output(output, anchors, nb_classes, obj_threshold=0.5,
                  nms_threshold=0.5):
    """ Decodes a YOLO model output.

    Args:
        output (numpy.ndarray): model output to decode, modified in place
        anchors (list): list of anchor boxes
        nb_classes (int): number of classes
        obj_threshold (float): confidence threshold for a box
        nms_threshold (float): non-maximal suppression threshold

    Returns:
        List of `BoundingBox` objects
    """

    def _sigmoid(x):
        return 1. / (1. + np.exp(-x))

    def _softmax(x, axis=-1, t=-100.):
        x = x - np.max(x)
        if np.min(x) < t:
            x = x / np.min(x) * t
        e_x = np.exp(x)
        return e_x / e_x.sum(axis, keepdims=True)

    grid_h, grid_w, nb_box = output.shape[:3]
    boxes = []

    # decode the output of the network
    output[..., 4] = _sigmoid(output[..., 4])
    output[..., 5:] = output[..., 4][..., np.newaxis] * _softmax(output[..., 5:])
    output[..., 5:] *= output[..., 5:] > obj_threshold

    for row in range(grid_h):
        for col in range(grid_w):
            for b in range(nb_box):
                # from the 5th element onwards are the class scores
                classes = output[row, col, b, 5:]

                if np.sum(classes) > 0:
                    # the first 4 elements are x, y, w and h
                    x, y, w, h = output[row, col, b, :4]

                    x = (col + _sigmoid(x)) / grid_w  # center position, unit: image width
                    y = (row + _sigmoid(y)) / grid_h  # center position, unit: image height
                    w = anchors[b][0] * np.exp(w) / grid_w  # unit: image width
                    h = anchors[b][1] * np.exp(h) / grid_h  # unit: image height
                    confidence = output[row, col, b, 4]

                    # clamp the normalized coordinates to the image bounds
                    x1 = max(x - w / 2, 0)
                    y1 = max(y - h / 2, 0)
                    x2 = min(x + w / 2, 1)
                    y2 = min(y + h / 2, 1)

                    boxes.append(BoundingBox(x1, y1, x2, y2, confidence, classes))

    # suppress non-maximal boxes
    for c in range(nb_classes):
        sorted_indices = list(
            reversed(np.argsort([box.classes[c] for box in boxes])))

        for ind, index_i in enumerate(sorted_indices):
            if boxes[index_i].classes[c] == 0:
                continue
            for j in range(ind + 1, len(sorted_indices)):
                index_j = sorted_indices[j]

                # filter out redundant boxes (same class and overlapping too
                # much)
                if (boxes[index_i].iou(boxes[index_j]) >= nms_threshold) and (
                        c == boxes[index_i].get_label()) and (
                            c == boxes[index_j].get_label()):
                    boxes[index_j].score = 0

    # remove the boxes whose score is below the obj_threshold
    boxes = [box for box in boxes if box.get_score() > obj_threshold]

    return boxes
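
# A hedged usage sketch for decode_output. The grid size, anchors and random
# scores below are placeholders rather than values from an actual YOLO model;
# with real predictions the returned boxes carry normalized coordinates.
def _example_decode_output():
    nb_classes = 2
    anchors = [[0.56, 0.69], [1.98, 2.53]]  # made-up (width, height) pairs
    # fake raw output of shape (grid_h, grid_w, nb_box, 4 + 1 + nb_classes)
    raw = np.random.rand(7, 7, len(anchors), 5 + nb_classes).astype(np.float32)
    boxes = decode_output(raw, anchors, nb_classes)
    for box in boxes:
        # multiply the normalized coordinates by the image size before drawing
        print(box.get_label(), box.get_score(), box.x1, box.y1, box.x2, box.y2)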
def parse_voc_annotations(gt_folder, image_folder, file_path, labels):
    """ Loads PASCAL-VOC data.

    Data is loaded using the ground truth information and stored in a
    dictionary.

    Args:
        gt_folder (str): path to the folder containing ground truth files
        image_folder (str): path to the folder containing the images
        file_path (str): file containing the list of files to parse
        labels (list): list of labels of interest

    Returns:
        list: a list of dictionaries containing all data present in the
        ground truth files
    """

    def _build_list(file_path):
        """ Builds a list of file names from a text file path.

        Args:
            file_path (string): path of the file containing the names

        Returns:
            list: list of file names
        """
        file_list = []
        with open(file_path, "r", encoding='UTF-8') as f:
            for line in f:
                file_list.append(line.rstrip())
        return file_list

    file_list = _build_list(file_path)

    all_data = []
    for f in file_list:
        tree = et.parse(os.path.join(gt_folder, f) + ".xml")
        data = {"boxes": []}

        for elem in tree.iter():
            if 'filename' in elem.tag:
                data['image_path'] = os.path.join(image_folder, elem.text)
            if 'size' in elem.tag:
                for attr in list(elem):
                    if 'width' in attr.tag:
                        width = int(attr.text)
                    if 'height' in attr.tag:
                        height = int(attr.text)
                    if 'depth' in attr.tag:
                        depth = int(attr.text)
                data['image_shape'] = (width, height, depth)
            if 'object' in elem.tag:
                box = {}
                for attr in list(elem):
                    if 'difficult' in attr.tag:
                        if int(attr.text) != 0:
                            box.clear()
                            break
                    if 'name' in attr.tag:
                        box['label'] = attr.text
                        if box['label'] not in labels:
                            box.clear()
                            break
                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            if 'xmin' in dim.tag:
                                box['x1'] = int(round(float(dim.text)))
                            if 'ymin' in dim.tag:
                                box['y1'] = int(round(float(dim.text)))
                            if 'xmax' in dim.tag:
                                box['x2'] = int(round(float(dim.text)))
                            if 'ymax' in dim.tag:
                                box['y2'] = int(round(float(dim.text)))
                if len(box) == 5:
                    data["boxes"].append(box)

        if len(data["boxes"]) != 0:
            all_data.append(data)
    return all_data
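
# A hypothetical call to parse_voc_annotations. The folder layout mirrors a
# standard PASCAL-VOC download, but the paths and labels are placeholders.
def _example_parse_voc():
    data = parse_voc_annotations(
        gt_folder="VOCdevkit/VOC2012/Annotations",
        image_folder="VOCdevkit/VOC2012/JPEGImages",
        file_path="VOCdevkit/VOC2012/ImageSets/Main/train.txt",
        labels=["car", "person"])
    # each entry holds 'image_path', 'image_shape' and a non-empty 'boxes' list
    print(len(data))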
def parse_widerface_annotations(gt_file, image_folder):
    """ Loads WiderFace data.

    Data is loaded using the ground truth information and stored in a
    dictionary.

    Args:
        gt_file (str): path to the ground truth file
        image_folder (str): path to the directory containing the images

    Returns:
        list: a list of dictionaries containing all data present in the
        ground truth file
    """

    def _is_valid_box(w_box, h_box, w_img, h_img):
        box_area = w_box * h_box
        img_area = w_img * h_img
        return box_area >= img_area / 60.0

    all_data = []
    with open(gt_file, "r", encoding='UTF-8') as f:
        for line in f:
            file_path = line.rstrip()
            full_file_path = os.path.join(image_folder, str(file_path))
            if not os.path.isfile(full_file_path):
                raise RuntimeError(f"Not a file path: {full_file_path}")
            image = load_image(full_file_path)
            shape = (image.shape[1], image.shape[0], image.shape[2])

            data = {
                "image_path": full_file_path,
                "image_shape": shape,
                "boxes": []
            }

            nb_boxes = int(f.readline())
            if nb_boxes == 0:
                f.readline()
            for _ in range(nb_boxes):
                annotation = f.readline()
                values = annotation.split(' ')
                x1 = int(values[0])
                y1 = int(values[1])
                w = int(values[2])
                h = int(values[3])
                if _is_valid_box(w, h, shape[0], shape[1]):
                    box = {
                        "x1": x1,
                        "y1": y1,
                        "x2": x1 + w,
                        "y2": y1 + h,
                        "label": "face"
                    }
                    data["boxes"].append(box)
            if len(data["boxes"]) != 0:
                all_data.append(data)
    return all_data
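
# A hypothetical call to parse_widerface_annotations. The file names follow
# the official WiderFace layout but are placeholders here; boxes smaller than
# 1/60th of the image area are discarded by the parser.
def _example_parse_widerface():
    data = parse_widerface_annotations(
        gt_file="wider_face_split/wider_face_train_bbx_gt.txt",
        image_folder="WIDER_train/images")
    print(len(data))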