#!/usr/bin/env python
# ******************************************************************************
# Copyright 2022 Brainchip Holdings Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
__all__ = ["quantize", "dump_config"]
import warnings
from keras import Model, layers
from onnx import ModelProto
from .utils import deep_clone_model
from .transforms import insert_layer, sanitize
from .calibrate import calibrate
from ..layers import (OutputQuantizer, WeightQuantizer, Dequantizer, quantization,
QuantizationParams, get_quantization_params, Attention, QuantizedConv2D,
QuantizedDepthwiseConv2D, QuantizedSeparableConv2D, QuantizedDense,
StatefulRecurrent)
from ..layers.layers_base import (_GLOBAL_LAYER_TO_QLAYER, _GLOBAL_NO_OUTPUT_QUANTIZER,
_GLOBAL_ALIGNED_INPUTS)
from ..onnx_support.quantization.quantize import quantize as quantize_onnx
# List of quantizer layers that do not have a float layer representation
NO_FLOAT_CUSTOM_QLAYERS = [Dequantizer, OutputQuantizer, WeightQuantizer]
def get_quantized_layer(layer):
""" Returns the quantized version of the layer.
Args:
layer (keras.layers.Layer): layer of interest
Returns:
keras.layer: quantized version of the layer if it exists, None otherwise.
"""
return _GLOBAL_LAYER_TO_QLAYER.get(layer.__class__.__name__, None)
def is_quantized_layer(layer):
""" Returns True when the layer is a quantized layer.
Args:
layer (keras.layers.Layer): layer of interest
Returns:
bool: True when the layer is a quantized layer, False otherwise.
"""
return layer.__class__ in _GLOBAL_LAYER_TO_QLAYER.values()
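
# Illustrative sketch (assumption: QuantizedDense is the registered quantized
# version of keras.layers.Dense, as imported above). The two helpers behave as:
#
#   get_quantized_layer(layers.Dense(10))   # -> QuantizedDense (the class)
#   is_quantized_layer(layers.Dense(10))    # -> False (float layer)
#   is_quantized_layer(QuantizedDense(10))  # -> True
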
def _handle_not_quantizable_layers(model):
""" Checks if the model has not quantizable layers and adds a Dequantizer before.
Args:
model (keras.Model): model to check
Returns:
keras.Model: the updated model
"""
def is_quantizable(layer):
return (get_quantized_layer(layer)
or is_quantized_layer(layer)
or layer.__class__ in NO_FLOAT_CUSTOM_QLAYERS)
# Find layers that cannot be quantized
for layer in model.layers:
if not is_quantizable(layer):
# This layer cannot be quantized, check its inbounds
inbound = layer.inbound_nodes[0]
if not inbound.inbound_layers:
# Skip input layers
continue
elif isinstance(inbound.inbound_layers, list):
raise RuntimeError(f"'{layer.name}' is not quantizable and has multiple inbounds "
"which is not supported.")
            # Check that the inbound layer will be quantized and is not already a
            # Dequantizer, to avoid adding an extra Dequantizer.
if (is_quantizable(inbound.inbound_layers) and
not isinstance(inbound.inbound_layers, Dequantizer)):
# Inbound will be quantized, add a Dequantizer after it and return the model
inbound_name = inbound.inbound_layers.name
model = insert_layer(model, inbound_name, Dequantizer())
warnings.warn(f"'{layer.name}' of type {layer.__class__} is not supported to "
"quantize, a Dequantizer is added before it and quantization will "
"stop at this layer.")
return model
return model
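
# Illustrative sketch of the transform above: given a graph such as
# Conv2D -> <unsupported layer>, the model is rewritten as
# Conv2D -> Dequantizer -> <unsupported layer>, so the quantized branch is
# dequantized to float before reaching the layer where quantization stops.
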
def _prepare_output_quantizers(model):
""" Parse the model and prepare OutputQuantizer configurations for layers requiring them.
To ensure that an OutputQuantizer will be added to the latest possible layer in a 'block', the
model is parsed in reverse order. If a layer requires aligned inputs, the function will find the
preceding layer that can accept an OutputQuantizer and set it in the returned dictionary.
Args:
model (keras.Model): the model to parse
Returns:
dict: dictionary mapping layer names to an OutputQuantizer config.
"""
# Dictionary that will contain layers and their OutputQuantizer configurations
out_quantizer_configs = {}
# Get quantization parameters
qparams = get_quantization_params()
def set_output_quantizer(layer_names, next_layer):
""" Populates `out_quantizer_configs` with layer names and their OutputQuantizer. """
for name in layer_names:
current_layer = model.get_layer(name)
            # Handle special cases where the OutputQuantizer must be per-tensor:
# - when current_layer has vector outputs,
# - when next_layer is an Attention layer and the layer is Query or Key
# (first and second inputs)
# - when the layer is a StatefulRecurrent layer
if isinstance(current_layer, Attention):
output_shape = current_layer.output_shape[0]
else:
output_shape = current_layer.output_shape
vector_outputs = len(output_shape) < 3
query_or_key = (isinstance(current_layer, layers.Dense)
and isinstance(next_layer, Attention)
and next_layer.inbound_nodes[0].inbound_layers.
index(current_layer) in [0, 1])
is_stateful_rec = isinstance(current_layer, StatefulRecurrent)
per_tensor = (query_or_key or vector_outputs or is_stateful_rec or
qparams.per_tensor_activations)
# If this is a new entry, set a default configuration
if name not in out_quantizer_configs:
axis = "per-tensor" if per_tensor else "per-axis"
if isinstance(current_layer, layers.ReLU):
params = dict(bitwidth=qparams.activation_bits,
signed=qparams.activation_bits >= 8,
axis=axis)
else:
                    # StatefulRecurrent special case: both its own OutputQuantizer and the
                    # preceding one should be 16 bits
if is_stateful_rec or isinstance(next_layer, StatefulRecurrent):
bitwidth = 16
else:
bitwidth = qparams.output_bits
params = dict(bitwidth=bitwidth, axis=axis)
out_quantizer_configs[name] = dict(output_quantizer=params)
# If the layer OutputQuantizer configuration is already set, simply check the axis:
# override the config if the outputs must be per-tensor
else:
current_axis = out_quantizer_configs[name]["output_quantizer"]["axis"]
per_tensor = per_tensor or current_axis == "per-tensor"
axis = "per-tensor" if per_tensor else "per-axis"
out_quantizer_configs[name]["output_quantizer"]["axis"] = axis
def cannot_have_output_quantizer(layer):
""" Returns True when the layer cannot have an OutputQuantizer. """
qlayer = get_quantized_layer(layer)
return (isinstance(layer, Dequantizer)
or qlayer is None
or qlayer in _GLOBAL_NO_OUTPUT_QUANTIZER)
def get_preceding_layer_names(layer):
""" Retrieve inbounds layers names where an OutputQuantizer can be set. """
previous_layers = []
inbounds = layer.inbound_nodes[0].inbound_layers
if not isinstance(inbounds, list):
inbounds = [inbounds]
for inbound in inbounds:
# Skip input layers
if isinstance(inbound, layers.InputLayer):
continue
            # When the inbound layer cannot have an OutputQuantizer, recursively call
            # the function on it
if cannot_have_output_quantizer(inbound):
previous_layers.extend(get_preceding_layer_names(inbound))
else:
previous_layers.append(inbound.name)
return previous_layers
# Parse the layers in reverse order
for layer in model.layers[::-1]:
# Find layers that will need aligned inputs
if get_quantized_layer(layer) in _GLOBAL_ALIGNED_INPUTS:
# Retrieve the inbounds that can have an OutputQuantizer
previous_layers = get_preceding_layer_names(layer)
# Set an OutputQuantizer in their inbounds
set_output_quantizer(previous_layers, layer)
return out_quantizer_configs
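
# Illustrative example of the mapping returned above (hypothetical layer names,
# assuming default 8-bit quantization parameters; actual axis values depend on
# the output shapes and the consuming layers):
#
#   {
#       "conv_1": {"output_quantizer": {"bitwidth": 8, "axis": "per-axis"}},
#       "re_lu_1": {"output_quantizer": {"bitwidth": 8, "signed": True,
#                                        "axis": "per-tensor"}},
#   }
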
def quantize_keras(model, q_config=None, qparams=QuantizationParams(), samples=None,
num_samples=1024, batch_size=None, epochs=1):
"""Quantizes a Keras model using the provided configuration or parameters.
Details on how this function behaves:
- `q_config` has priority over `qparams`, meaning that when a match is found in `q_config` the
given configuration will be used instead of `qparams`. This is useful to handle specific cases
      (e.g. per-tensor output quantizer).
- when no configuration is given, quantization parameters are deduced from `qparams` and
OutputQuantizers are automatically set on appropriate layers.
- `qparams` are only applied to 'float' Keras layers when they are first quantized. As a result,
when re-quantizing a model, one must provide a complete `q_config`. This is made easy with the
`dump_config` helper.
If not already present, a final Dequantizer will be added at the end of the Model.
    The model will also be calibrated using the provided (or randomly generated) samples.
Args:
model (keras.Model): the model to quantize
q_config (dict, optional): quantization configuration as a dictionary mapping layer names to
their quantization configuration. Defaults to None.
qparams (QuantizationParams, optional): global quantization parameters. Defaults to
QuantizationParams().
        samples (tf.data.Dataset, np.ndarray or generator, optional): calibration samples.
            When no samples are provided, random samples are generated. Defaults to None.
num_samples (int, optional): number of samples to use in the provided samples or number of
samples to generate. Defaults to 1024.
batch_size (int, optional): the batch size. Defaults to None.
epochs (int, optional): the number of epochs. Defaults to 1.
Returns:
keras.Model: the quantized model
"""
q_config = q_config or dict()
# Layers will no longer be quantized as soon as a Dequantizer is found
quantization_stopped = False
    # Handle input_weight_bits with a dedicated QuantizationParams where
    # weight_bits = qparams.input_weight_bits; input_qparams is set to False once the
    # input layer has been quantized.
input_qparams = QuantizationParams(activation_bits=qparams.activation_bits,
per_tensor_activations=qparams.per_tensor_activations,
weight_bits=qparams.input_weight_bits,
output_bits=qparams.output_bits,
buffer_bits=qparams.buffer_bits)
def _replace_layer(layer):
nonlocal quantization_stopped, out_quantizer_configs, input_qparams
config = layer.get_config()
# Function to handle unsupported arguments in config
def pop_unsupported_args(class_type):
for arg, default_value in getattr(class_type, "unsupported_args", {}).items():
if (arg in config and config[arg] != default_value):
raise RuntimeError(
f"Argument '{arg}' in layer '{layer.name}' is only "
f"supported with default value '{default_value}'. "
f"Receives '{config[arg]}'.")
config.pop(arg, None)
# Function to handle arguments that should be ignored, i.e. dropped,
# from config
def pop_ignored_args(class_type):
for arg in getattr(class_type, "ignored_args", []):
config.pop(arg, None)
        # Function that returns a quantized layer given its float version
def get_quantize_layer(layer, quantize_config=None):
"""Quantize float layer in three steps:
- first, we get its quantized version,
- second, remove unsupported arguments,
- then, we return the quantized layer with config updated
"""
nonlocal quantization_stopped, out_quantizer_configs
# 1. Check if qlayer exists in custom layers and return the float version of the layer
# if not
qlayer = get_quantized_layer(layer)
if qlayer is None:
qlayer = layer
if not (is_quantized_layer(qlayer) or qlayer.__class__ in NO_FLOAT_CUSTOM_QLAYERS):
warnings.warn(f"'{ layer.__class__.__name__}' is not supported to quantize. "
"It will be ignored.")
# If a Dequantizer is found, quantization must be stopped
quantization_stopped = isinstance(layer, Dequantizer)
# 2.1 Remove ignored arguments
pop_ignored_args(qlayer)
# 2.2 Remove unsupported arguments
pop_unsupported_args(qlayer)
# 3. Instantiate quantized layer
# Instantiate from configuration if there is one
if quantize_config:
config['quant_config'] = quantize_config
# Set the preset default configuration otherwise
elif layer.name in out_quantizer_configs:
config['quant_config'] = out_quantizer_configs[layer.name]
return qlayer.from_config(config)
        # When a non-quantizable layer has been found, stop quantization and return the initial layer
if quantization_stopped:
return layer.from_config(config)
match_conf = q_config.get(layer.name, None)
try:
# Overwrite quantization context with input_qparams
if input_qparams:
with quantization(input_qparams):
qlayer = get_quantize_layer(layer, match_conf)
else:
qlayer = get_quantize_layer(layer, match_conf)
except Exception as e:
raise type(e)(f"Layer '{layer.name}': {str(e)}") from e
# When the qlayer is an input layer that has been quantized, disable input_qparams
input_layers = (QuantizedConv2D, QuantizedDepthwiseConv2D, QuantizedSeparableConv2D,
QuantizedDense)
if input_qparams and isinstance(qlayer, input_layers):
input_qparams = False
return qlayer
# Sanitize the model and make it quantization ready
model = sanitize(model)
    # Check if the model has non-quantizable layers and add a Dequantizer before them
model = _handle_not_quantizable_layers(model)
# Quantize the model replacing layers with their quantized version
with quantization(qparams):
# Determine where to set OutputQuantizers, the return dict will be used as a non-local
# variable in the _replace_layer function.
out_quantizer_configs = _prepare_output_quantizers(model)
new_model = deep_clone_model(model, clone_function=_replace_layer)
out = new_model.outputs
# Append Dequantizer at the end of the model to convert the output to float value
if not isinstance(new_model.layers[-1], Dequantizer) and not quantization_stopped:
out = Dequantizer()(out)
# Build the model
qmodel = Model(new_model.input, out, name=model.name)
# Now that the model is quantized, proceed to calibration
calibrate(model, qmodel, samples=samples, num_samples=num_samples, batch_size=batch_size,
epochs=epochs)
return qmodel
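
# Minimal usage sketch for quantize_keras (hypothetical `model` and `samples`
# objects, not defined in this module):
#
#   qparams = QuantizationParams(weight_bits=8, activation_bits=8)
#   qmodel = quantize_keras(model, qparams=qparams, samples=samples,
#                           num_samples=1024, batch_size=32)
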
def quantize(model, q_config=None, qparams=QuantizationParams(), samples=None, num_samples=1024,
batch_size=None, epochs=1):
"""Quantizes a Keras or ONNX model using the provided configuration or parameters.
Details on how this function behaves:
- `q_config` has priority over `qparams`, meaning that when a match is found in `q_config` the
given configuration will be used instead of `qparams`. This is useful to handle specific cases
      (e.g. per-tensor output quantizer). This is only used when quantizing Keras models.
- when no configuration is given, quantization parameters are deduced from `qparams` and
OutputQuantizers are automatically set on appropriate layers.
- `qparams` are only applied to 'float' Keras layers when they are first quantized. As a result,
when re-quantizing a model, one must provide a complete `q_config`. This is made easy with the
      `dump_config` helper. Note that the only configuration supported when quantizing ONNX
      models is 8-bit for both weights and activations, but the per_tensor_activations
      parameter will be taken into account.
If not already present, a final Dequantizer will be added at the end of the Model.
The model will also be calibrated using the provided (or randomly generated inputs).
Args:
model (keras.Model or ModelProto): the model to quantize
q_config (dict, optional): quantization configuration as a dictionary mapping layer names to
their quantization configuration. Defaults to None.
qparams (QuantizationParams, optional): global quantization parameters. Defaults to
QuantizationParams().
        samples (tf.data.Dataset, np.ndarray or generator, optional): calibration samples.
            When no samples are provided, random samples are generated. Defaults to None.
num_samples (int, optional): number of samples to use in the provided samples or number of
samples to generate. Defaults to 1024.
batch_size (int, optional): the batch size. Defaults to None.
epochs (int, optional): the number of epochs. This parameter must be 1 for ONNX models.
Defaults to 1.
Returns:
keras.Model or ModelProto: the quantized model
"""
# Calibration with random samples will only provide meaningful results when quantizing
# per-tensor
if samples is None and not qparams.per_tensor_activations:
warnings.warn("Quantizing per-axis with random calibration samples is not accurate.\
Set QuantizationParams.per_tensor_activations=True when calibrating with \
random samples.")
    if not isinstance(model, ModelProto):
return quantize_keras(model, q_config, qparams, samples, num_samples, batch_size, epochs)
    elif q_config:
        raise ValueError("The q_config parameter is not supported for ONNX model quantization.")
    elif epochs != 1:
        raise ValueError("epochs != 1 is not supported for ONNX model quantization.")
return quantize_onnx(model, qparams, samples, num_samples, batch_size)
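
# Minimal usage sketch for the dispatch above (hypothetical `keras_model` and
# `onnx_proto` objects):
#
#   qmodel = quantize(keras_model, qparams=QuantizationParams())  # Keras path
#   qmodel = quantize(onnx_proto, qparams=QuantizationParams())   # ONNX path
#                                                                 # (epochs must stay 1)
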
def dump_config(model):
"""Dump the quantization configuration of a quantized model, exporting the configuration for
each quantized layer.
Args:
model (keras.Model): a quantized model.
Returns:
dict: the configuration of the model.
"""
    # Get the configuration of the model, iterating over each layer and updating the config.
config = {}
for layer in model.layers:
# Try to take the current quantized configuration
ly_config = layer.get_config().get('quant_config')
# Only append quantized configuration
if is_quantized_layer(layer) and ly_config:
config[layer.name] = ly_config
return config
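
# Round-trip sketch: dump a quantized model's configuration and reuse it to
# re-quantize the float model with identical settings (hypothetical `model`):
#
#   qmodel = quantize(model)
#   config = dump_config(qmodel)
#   qmodel_again = quantize(model, q_config=config)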