fierval
2019-07-02 17:23:56 -07:00
parent 14ecfb0bf3
commit c75e820107
38 changed files with 6262 additions and 0 deletions

@@ -0,0 +1,5 @@
{
"subscription_id": "93177b32-3f08-4530-a61e-d1775d2480ad",
"resource_group": "MSRBrainwave",
"workspace_name": "brainwave"
}
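
A minimal sketch of how a config like this is typically consumed, assuming it is saved as aml_config/config.json (or .azureml/config.json) and azureml-core is installed:

from azureml.core import Workspace

# Workspace.from_config() walks up from the current directory looking for a
# config.json holding subscription_id, resource_group and workspace_name.
ws = Workspace.from_config()
print(ws.name, ws.location)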

Binary image file added (not shown; 83 KiB).

@@ -0,0 +1,26 @@
<annotation>
<folder>runeightft1</folder>
<filename>1555394321.8154433.jpg</filename>
<path>E:/Image grocerydemostills/runeightft1/1555394321.8154433.jpg</path>
<source>
<database>Unknown</database>
</source>
<size>
<width>852</width>
<height>506</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>stockout</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>660</xmin>
<ymin>201</ymin>
<xmax>712</xmax>
<ymax>294</ymax>
</bndbox>
</object>
</annotation>
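
A minimal sketch of reading one of these Pascal VOC annotations with ElementTree; the .xml file name is assumed to mirror the image stem:

import xml.etree.ElementTree as ET

root = ET.parse('1555394321.8154433.xml').getroot()
for obj in root.findall('object'):
    box = obj.find('bndbox')
    coords = [int(box.find(k).text) for k in ('xmin', 'ymin', 'xmax', 'ymax')]
    print(obj.find('name').text, coords)  # e.g. stockout [660, 201, 712, 294]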

@@ -0,0 +1,92 @@
import numpy as np
import math
from model.ssd_vgg_300 import SSDNet, SSDParams
_R_MEAN = 123.
_G_MEAN = 117.
_B_MEAN = 104.
EVAL_SIZE = (300, 300)
defaults = SSDNet.default_params
img_shape = defaults.img_shape
num_classes = defaults.num_classes
feat_layers = defaults.feat_layers
feat_shapes = defaults.feat_shapes
anchor_size_bounds = defaults.anchor_size_bounds
anchor_sizes = defaults.anchor_sizes
anchor_ratios = defaults.anchor_ratios
anchor_steps = defaults.anchor_steps
anchor_offset = defaults.anchor_offset
normalizations = defaults.normalizations
prior_scaling = defaults.prior_scaling
def ssd_anchor_one_layer(img_shape,
feat_shape,
sizes,
ratios,
step,
offset=0.5,
dtype=np.float32):
"""Computer SSD default anchor boxes for one feature layer.
Determine the relative position grid of the centers, and the relative
width and height.
Arguments:
feat_shape: Feature shape, used for computing relative position grids;
size: Absolute reference sizes;
ratios: Ratios to use on these features;
img_shape: Image shape, used for computing height, width relatively to the
former;
offset: Grid offset.
Return:
y, x, h, w: Relative x and y grids, and height and width.
"""
# Compute the position grid: simple way.
# y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
# y = (y.astype(dtype) + offset) / feat_shape[0]
# x = (x.astype(dtype) + offset) / feat_shape[1]
# Weird SSD-Caffe computation using steps values...
y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
y = (y.astype(dtype) + offset) * step / img_shape[0]
x = (x.astype(dtype) + offset) * step / img_shape[1]
# Expand dims to support easy broadcasting.
y = np.expand_dims(y, axis=-1)
x = np.expand_dims(x, axis=-1)
# Compute relative height and width.
# Tries to follow the original implementation of SSD for the order.
num_anchors = len(sizes) + len(ratios)
h = np.zeros((num_anchors, ), dtype=dtype)
w = np.zeros((num_anchors, ), dtype=dtype)
# Add first anchor boxes with ratio=1.
h[0] = sizes[0] / img_shape[0]
w[0] = sizes[0] / img_shape[1]
di = 1
if len(sizes) > 1:
h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
di += 1
for i, r in enumerate(ratios):
h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
return y, x, h, w
def ssd_anchors_all_layers(img_shape=img_shape,
offset=0.5,
dtype=np.float32):
"""Compute anchor boxes for all feature layers.
"""
layers_anchors = []
for i, s in enumerate(feat_shapes):
anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
anchor_sizes[i],
anchor_ratios[i],
anchor_steps[i],
offset=offset, dtype=dtype)
layers_anchors.append(anchor_bboxes)
return layers_anchors
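
A minimal usage sketch, relying only on this module and the defaults imported above from model.ssd_vgg_300:

anchors = ssd_anchors_all_layers()
for layer, (y, x, h, w) in zip(feat_layers, anchors):
    # y, x: center grids of shape (H, W, 1); h, w: one entry per anchor
    print(layer, y.shape, x.shape, h.shape, w.shape)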

@@ -0,0 +1,114 @@
import os
import sys
import tarfile
from six.moves import urllib
import tensorflow as tf
LABELS_FILENAME = 'labels.txt'
import shutil
from os import path
def check_labelmatch(images, annotations):
data_dir_images = os.path.split(images[0])[0]
data_dir_annot = os.path.split(annotations[0])[0]
im_files = {os.path.splitext(os.path.split(f)[1])[0] for f in images}
annot_files = {os.path.splitext(os.path.split(f)[1])[0] for f in annotations}
extra_ims = im_files.difference(annot_files)
extra_annots = annot_files.difference(im_files)
mismatch = len(extra_ims) > 0 or len(extra_annots) > 0
if mismatch:
print(f"The following files will be removed from the training process:")
if len(extra_ims) > 0:
print(f"images without annotations: {extra_ims}")
if len(extra_annots) > 0:
print(f"annotations without images: {extra_annots}")
if not mismatch:
        print(f"{len(images)} images found and {len(annotations)} matching annotations found.")
return (images, annotations)
im_files = im_files.difference(extra_ims)
annot_files = annot_files.difference(extra_annots)
im_files = [os.path.join(data_dir_images, f+".jpg") for f in im_files]
annot_files = [os.path.join(data_dir_annot, f+".xml") for f in annot_files]
    return (im_files, annot_files)
def create_dir(path):
try:
path_annotations = path + '/Annotations'
path_images = path + '/JPEGImages'
os.makedirs(path_annotations)
os.makedirs(path_images)
except OSError:
print("Creation of folders in directory %s failed. Folder may already exist." % path)
else:
print("Successfully created images and annotations folders at %s" % path)
def move_images(data_dir, train_images, train_annotations,
                test_images, test_annotations):
    for image in train_images:
        src = os.path.join(data_dir, image)
        if path.exists(src):
            shutil.copy(src, os.path.join(data_dir, 'train', 'JPEGImages'))
    for image in test_images:
        src = os.path.join(data_dir, image)
        if path.exists(src):
            shutil.copy(src, os.path.join(data_dir, 'test', 'JPEGImages'))
    for annot in train_annotations:
        src = os.path.join(data_dir, annot)
        if path.exists(src):
            shutil.copy(src, os.path.join(data_dir, 'train', 'Annotations'))
    for annot in test_annotations:
        src = os.path.join(data_dir, annot)
        if path.exists(src):
            shutil.copy(src, os.path.join(data_dir, 'test', 'Annotations'))
    print('Images and annotations have been copied to directories: ' +
          os.path.join(data_dir, 'train') + ' and ' + os.path.join(data_dir, 'test'))
def int64_feature(value):
"""Wrapper for inserting int64 features into Example proto.
"""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def float_feature(value):
"""Wrapper for inserting float features into Example proto.
"""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def bytes_feature(value):
"""Wrapper for inserting bytes features into Example proto.
"""
if not isinstance(value, list):
value = [value]
return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
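
A minimal sketch of how these wrappers compose into a serialized Example proto (TF 1.x; image_data, height and width are assumed to be in scope):

example = tf.train.Example(features=tf.train.Features(feature={
    'image/encoded': bytes_feature(image_data),
    'image/height': int64_feature(height),
    'image/width': int64_feature(width),
}))
record = example.SerializeToString()  # ready for tf.python_io.TFRecordWriter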

@@ -0,0 +1,112 @@
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides data for the Pascal VOC Dataset (images + annotations).
"""
import os
import tensorflow as tf
from dataprep import dataset_utils
slim = tf.contrib.slim
VOC_LABELS = {
'none': (0, 'Background'),
'aeroplane': (1, 'Vehicle'),
'bicycle': (2, 'Vehicle'),
'bird': (3, 'Animal'),
'boat': (4, 'Vehicle'),
'bottle': (5, 'Indoor'),
'bus': (6, 'Vehicle'),
'car': (7, 'Vehicle'),
'cat': (8, 'Animal'),
'chair': (9, 'Indoor'),
'cow': (10, 'Animal'),
'diningtable': (11, 'Indoor'),
'dog': (12, 'Animal'),
'horse': (13, 'Animal'),
'motorbike': (14, 'Vehicle'),
'person': (15, 'Person'),
'pottedplant': (16, 'Indoor'),
'sheep': (17, 'Animal'),
'sofa': (18, 'Indoor'),
'train': (19, 'Vehicle'),
'tvmonitor': (20, 'Indoor'),
}
def get_split(split_name, dataset_dir, file_pattern, reader,
split_to_sizes, items_to_descriptions, num_classes):
"""Gets a dataset tuple with instructions for reading Pascal VOC dataset.
    Args:
      split_name: A train/test split name.
      dataset_dir: The base directory of the dataset sources.
      file_pattern: The file pattern to use when matching the dataset sources.
        It is assumed that the pattern contains a '%s' string so that the split
        name can be inserted.
      reader: The TensorFlow reader type; defaults to tf.TFRecordReader.
      split_to_sizes: Dict mapping each split name to its number of samples.
      items_to_descriptions: Dict describing the items the dataset provides.
      num_classes: Number of classes in the dataset.
    Returns:
      A `Dataset` namedtuple.
    Raises:
      ValueError: if `split_name` is not a valid train/test split.
    """
if split_name not in split_to_sizes:
raise ValueError('split name %s was not recognized.' % split_name)
file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
# Allowing None in the signature so that dataset_factory can use the default.
if reader is None:
reader = tf.TFRecordReader
# Features in Pascal VOC TFRecords.
keys_to_features = {
'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
'image/height': tf.FixedLenFeature([1], tf.int64),
'image/width': tf.FixedLenFeature([1], tf.int64),
'image/channels': tf.FixedLenFeature([1], tf.int64),
'image/shape': tf.FixedLenFeature([3], tf.int64),
'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
}
items_to_handlers = {
'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
'shape': slim.tfexample_decoder.Tensor('image/shape'),
'object/bbox': slim.tfexample_decoder.BoundingBox(
['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'),
'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'),
'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'),
}
decoder = slim.tfexample_decoder.TFExampleDecoder(
keys_to_features, items_to_handlers)
labels_to_names = None
if dataset_utils.has_labels(dataset_dir):
labels_to_names = dataset_utils.read_label_file(dataset_dir)
return slim.dataset.Dataset(
data_sources=file_pattern,
reader=reader,
decoder=decoder,
num_samples=split_to_sizes[split_name],
items_to_descriptions=items_to_descriptions,
num_classes=num_classes,
labels_to_names=labels_to_names)
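
A minimal usage sketch with slim's dataset provider; the file pattern, split sizes and descriptions below are illustrative assumptions:

dataset = get_split('train', '/data/voc', 'voc_%s_*.tfrecord', None,
                    split_to_sizes={'train': 5011, 'test': 4952},
                    items_to_descriptions={'image': 'A color image.'},
                    num_classes=21)
provider = slim.dataset_data_provider.DatasetDataProvider(dataset, shuffle=True)
image, shape, bboxes, labels = provider.get(
    ['image', 'shape', 'object/bbox', 'object/label'])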

@@ -0,0 +1,223 @@
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Converts Pascal VOC data to TFRecords file format with Example protos.
The raw Pascal VOC data set is expected to reside in JPEG files located in the
'JPEGImages' directory. Similarly, bounding box annotations are supposed to be
stored in the 'Annotations' directory.
This TensorFlow script converts the data into a sharded data set of TFRecord
files, each holding up to SAMPLES_PER_FILES (100) records. Each record within
a TFRecord file is a serialized Example proto. The Example proto contains the
following fields:
image/encoded: string containing JPEG encoded image in RGB colorspace
image/height: integer, image height in pixels
image/width: integer, image width in pixels
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always 'JPEG'
image/object/bbox/xmin: list of float specifying the 0+ human annotated
bounding boxes
image/object/bbox/xmax: list of float specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymin: list of float specifying the 0+ human annotated
bounding boxes
image/object/bbox/ymax: list of float specifying the 0+ human annotated
bounding boxes
image/object/bbox/label: list of integer specifying the classification index.
image/object/bbox/label_text: list of string descriptions.
Note that the length of xmin is identical to the length of xmax, ymin and ymax
for each example.
"""
import os
import sys
import random
import numpy as np
import tensorflow as tf
import xml.etree.ElementTree as ET
from dataprep.dataset_utils import int64_feature, float_feature, bytes_feature
# TFRecords conversion parameters.
RANDOM_SEED = 4242
SAMPLES_PER_FILES = 100
def _set_voc_labels_map(class_list):
return dict(**{'none': 0}, **{cl: i + 1 for i, cl in enumerate(class_list)})
def _process_image(img_name, annot_name, class_list):
"""Process a image and annotation file.
Args:
img_name: string, path to an image file e.g., '/path/to/example.JPG'.
Returns:
image_buffer: string, JPEG encoding of RGB image.
height: integer, image height in pixels.
width: integer, image width in pixels.
"""
# Read the image file.
image_data = tf.gfile.FastGFile(img_name, 'rb').read()
class_dict = _set_voc_labels_map(class_list)
# Read the XML annotation file.
filename = annot_name
tree = ET.parse(filename)
root = tree.getroot()
# Image shape.
size = root.find('size')
shape = [int(size.find('height').text),
int(size.find('width').text),
int(size.find('depth').text)]
# Find annotations.
bboxes = []
labels = []
labels_text = []
difficult = []
truncated = []
for obj in root.findall('object'):
label = obj.find('name').text
labels.append(class_dict[label])
labels_text.append(label.encode('ascii'))
        # ElementTree elements with no children are falsy, so test against None.
        if obj.find('difficult') is not None:
difficult.append(int(obj.find('difficult').text))
else:
difficult.append(0)
        if obj.find('truncated') is not None:
truncated.append(int(obj.find('truncated').text))
else:
truncated.append(0)
bbox = obj.find('bndbox')
bboxes.append((to_valid_range(float(bbox.find('ymin').text) / shape[0]),
to_valid_range(float(bbox.find('xmin').text) / shape[1]),
to_valid_range(float(bbox.find('ymax').text) / shape[0]),
to_valid_range(float(bbox.find('xmax').text) / shape[1])
))
return image_data, shape, np.clip(bboxes, a_min=0., a_max=1.), labels, labels_text, difficult, truncated
def to_valid_range(v):
if v < 0.0:
return 0.0
if v > 1.0:
return 1.0
return v
def _convert_to_example(image_data, labels, labels_text, bboxes, shape,
difficult, truncated):
"""Build an Example proto for an image example.
Args:
image_data: string, JPEG encoding of RGB image;
labels: list of integers, identifier for the ground truth;
labels_text: list of strings, human-readable labels;
      bboxes: list of bounding boxes; each box is a tuple of floats
        specifying [ymin, xmin, ymax, xmax] in relative coordinates. All boxes
        are assumed to belong to the same label as the image label.
shape: 3 integers, image shapes in pixels.
Returns:
Example proto
"""
xmin = []
ymin = []
xmax = []
ymax = []
for b in bboxes:
assert len(b) == 4
# pylint: disable=expression-not-assigned
[l.append(point) for l, point in zip([ymin, xmin, ymax, xmax], b)]
# pylint: enable=expression-not-assigned
image_format = b'JPEG'
example = tf.train.Example(features=tf.train.Features(feature={
'image/height': int64_feature(shape[0]),
'image/width': int64_feature(shape[1]),
'image/channels': int64_feature(shape[2]),
'image/shape': int64_feature(shape),
'image/object/bbox/xmin': float_feature(xmin),
'image/object/bbox/xmax': float_feature(xmax),
'image/object/bbox/ymin': float_feature(ymin),
'image/object/bbox/ymax': float_feature(ymax),
'image/object/bbox/label': int64_feature(labels),
'image/object/bbox/label_text': bytes_feature(labels_text),
'image/object/bbox/difficult': int64_feature(difficult),
'image/object/bbox/truncated': int64_feature(truncated),
'image/format': bytes_feature(image_format),
'image/encoded': bytes_feature(image_data)}))
return example
def _add_to_tfrecord(img_name, annot_name, class_list, tfrecord_writer):
"""Loads data from image and annotations files and add them to a TFRecord.
Args:
dataset_dir: Dataset directory;
name: Image name to add to the TFRecord;
tfrecord_writer: The TFRecord writer to use for writing.
"""
image_data, shape, bboxes, labels, labels_text, difficult, truncated = \
_process_image(img_name, annot_name, class_list)
example = _convert_to_example(image_data, labels, labels_text,
bboxes, shape, difficult, truncated)
tfrecord_writer.write(example.SerializeToString())
def _get_output_filename(output_dir, name, idx):
return os.path.join(output_dir, f"{name}_{idx:04d}.tfrecord")
def run(output_dir, classes_list, images_list, annotations_list, output_name):
"""Runs the conversion operation.
Args:
output_dir: Output directory.
"""
if not tf.gfile.Exists(output_dir):
tf.gfile.MakeDirs(output_dir)
    if len(images_list) != len(annotations_list):
        raise ValueError("Images and annotations lists are of different lengths!")
# Process dataset files.
fidx = 0
i = 0
im_annot = list(zip(images_list, annotations_list))
while i < len(im_annot):
# Open new TFRecord file.
tf_filename = _get_output_filename(output_dir, output_name, fidx)
with tf.python_io.TFRecordWriter(tf_filename) as tfrecord_writer:
j = 0
while i < len(im_annot) and j < SAMPLES_PER_FILES:
sys.stdout.write('\r>> Converting image %d/%d' % (i+1, len(im_annot)))
sys.stdout.flush()
img_name, annot_name = im_annot[i]
_add_to_tfrecord(img_name, annot_name, classes_list, tfrecord_writer)
i += 1
j += 1
fidx += 1
print('\nFinished converting the Pascal VOC dataset!')
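
A minimal end-to-end sketch pairing this with check_labelmatch from dataset_utils above; the paths and the single-class list are illustrative assumptions:

import glob
from dataprep.dataset_utils import check_labelmatch

images = sorted(glob.glob('/data/shelf/JPEGImages/*.jpg'))
annotations = sorted(glob.glob('/data/shelf/Annotations/*.xml'))
images, annotations = check_labelmatch(images, annotations)
run(output_dir='/data/shelf/tfrecords', classes_list=['stockout'],
    images_list=images, annotations_list=annotations, output_name='voc_train')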

@@ -0,0 +1,65 @@
import tensorflow as tf
import numpy as np
import os
from datautil.ssd_vgg_preprocessing import preprocess_for_train, preprocess_for_eval
from model import ssd_common
from tfutil import tf_utils
features = {
'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
'image/height': tf.FixedLenFeature([1], tf.int64),
'image/width': tf.FixedLenFeature([1], tf.int64),
'image/channels': tf.FixedLenFeature([1], tf.int64),
'image/shape': tf.FixedLenFeature([3], tf.int64),
'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64),
}
def get_parser_func(anchors, num_classes, is_training, var_scope):
    '''
    Build a dataset parser function for training or evaluation.
    Arguments:
    anchors - SSD default anchor boxes for all feature layers
    num_classes - number of classes, including background
    is_training - selects preprocess_for_train vs. preprocess_for_eval
    var_scope - variable scope the parsing ops are created under
    '''
preprocess_fn = preprocess_for_train if is_training else preprocess_for_eval
def parse_tfrec_data(example_proto):
with tf.variable_scope(var_scope):
parsed_features = tf.parse_single_example(example_proto, features)
image_string = parsed_features['image/encoded']
image_decoded = tf.image.decode_jpeg(image_string)
labels = tf.sparse.to_dense(parsed_features['image/object/bbox/label'])
xmin = tf.sparse.to_dense(parsed_features['image/object/bbox/xmin'])
xmax = tf.sparse.to_dense(parsed_features['image/object/bbox/xmax'])
ymin = tf.sparse.to_dense(parsed_features['image/object/bbox/ymin'])
ymax = tf.sparse.to_dense(parsed_features['image/object/bbox/ymax'])
bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=1)
if is_training:
image, labels, bboxes = preprocess_fn(image_decoded, labels, bboxes)
else:
image, labels, bboxes, _ = preprocess_fn(image_decoded, labels, bboxes)
            # ground truth encoding
            # each of the returns is a list of tensors
if is_training:
classes, localisations, scores = \
ssd_common.tf_ssd_bboxes_encode(labels, bboxes, anchors, num_classes)
return tf_utils.reshape_list([image, classes, localisations, scores])
else:
return tf_utils.reshape_list([image, labels, bboxes])
return parse_tfrec_data
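
A minimal TF 1.x input-pipeline sketch built on this parser; anchors come from the generate_anchors module above, and the record file name is an assumption:

from anchors import generate_anchors

anchors = generate_anchors.ssd_anchors_all_layers()
parse_fn = get_parser_func(anchors, num_classes=21, is_training=True,
                           var_scope='parse_train')
dataset = tf.data.TFRecordDataset(['voc_train_0000.tfrecord'])
dataset = dataset.map(parse_fn).batch(2).prefetch(1)
iterator = dataset.make_initializable_iterator()
batch = iterator.get_next()  # flat list: image, then per-layer targets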

@@ -0,0 +1,397 @@
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Pre-processing images for SSD-type networks.
"""
from enum import Enum, IntEnum
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
import tfextended as tfe
from datautil import tf_image
# Resizing strategies.
Resize = IntEnum('Resize', ('NONE', # Nothing!
'CENTRAL_CROP', # Crop (and pad if necessary).
'PAD_AND_RESIZE', # Pad, and resize to output shape.
'WARP_RESIZE')) # Warp resize.
# VGG mean parameters.
_R_MEAN = 123.
_G_MEAN = 117.
_B_MEAN = 104.
# Some training pre-processing parameters.
BBOX_CROP_OVERLAP = 0.5 # Minimum overlap to keep a bbox after cropping.
MIN_OBJECT_COVERED = 0.25
CROP_RATIO_RANGE = (0.6, 1.67) # Distortion ratio during cropping.
EVAL_SIZE = (300, 300)
def tf_image_whitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN]):
"""Subtracts the given means from each image channel.
Returns:
the centered image.
"""
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
mean = tf.constant(means, dtype=image.dtype)
image = image - mean
return image
def tf_image_unwhitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN], to_int=True):
"""Re-convert to original image distribution, and convert to int if
necessary.
Returns:
Centered image.
"""
mean = tf.constant(means, dtype=image.dtype)
image = image + mean
if to_int:
image = tf.cast(image, tf.int32)
return image
def np_image_unwhitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN], to_int=True):
"""Re-convert to original image distribution, and convert to int if
necessary. Numpy version.
Returns:
Centered image.
"""
img = np.copy(image)
img += np.array(means, dtype=img.dtype)
if to_int:
img = img.astype(np.uint8)
return img
def tf_summary_image(image, bboxes, name='image', unwhitened=False):
"""Add image with bounding boxes to summary.
"""
if unwhitened:
image = tf_image_unwhitened(image)
image = tf.expand_dims(image, 0)
bboxes = tf.expand_dims(bboxes, 0)
image_with_box = tf.image.draw_bounding_boxes(image, bboxes)
tf.summary.image(name, image_with_box)
def apply_with_random_selector(x, func, num_cases):
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
Args:
x: input Tensor.
func: Python function to apply.
num_cases: Python int32, number of cases to sample sel from.
Returns:
The result of func(x, sel), where func receives the value of the
selector as a python integer, but sel is sampled dynamically.
"""
sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
# Pass the real x only to one of the func calls.
return control_flow_ops.merge([
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
for case in range(num_cases)])[0]
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
"""Distort the color of a Tensor image.
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
    Rather than adding that level of complication, we select a distinct ordering
of color ops for each preprocessing thread.
Args:
image: 3-D Tensor containing single image in [0, 1].
color_ordering: Python int, a type of distortion (valid values: 0-3).
fast_mode: Avoids slower ops (random_hue and random_contrast)
scope: Optional scope for name_scope.
Returns:
3-D Tensor color-distorted image on range [0, 1]
Raises:
ValueError: if color_ordering not in [0, 3]
"""
with tf.name_scope(scope, 'distort_color', [image]):
if fast_mode:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
elif color_ordering == 1:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
elif color_ordering == 2:
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
elif color_ordering == 3:
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
raise ValueError('color_ordering must be in [0, 3]')
# The random_* ops do not necessarily clamp.
return tf.clip_by_value(image, 0.0, 1.0)
def distorted_bounding_box_crop(image,
labels,
bboxes,
min_object_covered=0.3,
aspect_ratio_range=(0.9, 1.1),
area_range=(0.1, 1.0),
max_attempts=200,
clip_bboxes=True,
scope=None):
"""Generates cropped_image using a one of the bboxes randomly distorted.
See `tf.image.sample_distorted_bounding_box` for more documentation.
Args:
image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
      bboxes: 2-D float Tensor of bounding boxes arranged [num_boxes, coords],
        where each coordinate is in [0, 1) and the coordinates are arranged
        as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then the whole image
        is used.
min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
area of the image must contain at least this fraction of any bounding box
supplied.
aspect_ratio_range: An optional list of `floats`. The cropped area of the
image must have an aspect ratio = width / height within this range.
area_range: An optional list of `floats`. The cropped area of the image
        must contain a fraction of the supplied image within this range.
max_attempts: An optional `int`. Number of attempts at generating a cropped
region of the image of the specified constraints. After `max_attempts`
failures, return the entire image.
scope: Optional scope for name_scope.
Returns:
A tuple, a 3-D Tensor cropped_image and the distorted bbox
"""
with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
tf.shape(image),
bounding_boxes=tf.expand_dims(bboxes, 0),
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
distort_bbox = distort_bbox[0, 0]
# Crop the image to the specified bounding box.
cropped_image = tf.slice(image, bbox_begin, bbox_size)
# Restore the shape since the dynamic slice loses 3rd dimension.
cropped_image.set_shape([None, None, 3])
# Update bounding boxes: resize and filter out.
bboxes = tfe.bboxes_resize(distort_bbox, bboxes)
labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
threshold=BBOX_CROP_OVERLAP,
assign_negative=False)
return cropped_image, labels, bboxes, distort_bbox
def preprocess_for_train(image, labels, bboxes,
out_shape = (300, 300), data_format='NHWC',
scope='ssd_preprocessing_train'):
"""Preprocesses the given image for training.
Note that the actual resizing scale is sampled from
[`resize_size_min`, `resize_size_max`].
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
resize_side_min: The lower bound for the smallest side of the image for
aspect-preserving resizing.
resize_side_max: The upper bound for the smallest side of the image for
aspect-preserving resizing.
Returns:
A preprocessed image.
"""
fast_mode = False
with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
# Convert to float scaled [0, 1].
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
tf_summary_image(image, bboxes, 'image_with_bboxes')
# # Remove DontCare labels.
# labels, bboxes = ssd_common.tf_bboxes_filter_labels(out_label,
# labels,
# bboxes)
# Distort image and bounding boxes.
dst_image = image
dst_image, labels, bboxes, distort_bbox = \
distorted_bounding_box_crop(image, labels, bboxes,
min_object_covered=MIN_OBJECT_COVERED,
aspect_ratio_range=CROP_RATIO_RANGE)
# Resize image to output size.
dst_image = tf_image.resize_image(dst_image, out_shape,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
tf_summary_image(dst_image, bboxes, 'image_shape_distorted')
# Randomly flip the image horizontally.
dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)
# Randomly distort the colors. There are 4 ways to do it.
dst_image = apply_with_random_selector(
dst_image,
lambda x, ordering: distort_color(x, ordering, fast_mode),
num_cases=4)
tf_summary_image(dst_image, bboxes, 'image_color_distorted')
# Rescale to VGG input scale.
image = dst_image * 255.
image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
# Image data format.
if data_format == 'NCHW':
image = tf.transpose(image, perm=(2, 0, 1))
return image, labels, bboxes
def preprocess_for_eval(image, labels, bboxes,
out_shape=EVAL_SIZE, data_format='NHWC',
difficults=None, resize=Resize.WARP_RESIZE,
                        scope='ssd_preprocessing_eval'):
"""Preprocess an image for evaluation.
Args:
image: A `Tensor` representing an image of arbitrary size.
out_shape: Output shape after pre-processing (if resize != None)
resize: Resize strategy.
Returns:
A preprocessed image.
"""
with tf.name_scope(scope):
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
image = tf.cast(image, tf.float32)
image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
# Add image rectangle to bboxes.
bbox_img = tf.constant([[0., 0., 1., 1.]])
if bboxes is None:
bboxes = bbox_img
else:
bboxes = tf.concat([bbox_img, bboxes], axis=0)
if resize == Resize.NONE:
# No resizing...
pass
elif resize == Resize.CENTRAL_CROP:
# Central cropping of the image.
image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
image, bboxes, out_shape[0], out_shape[1])
elif resize == Resize.PAD_AND_RESIZE:
# Resize image first: find the correct factor...
shape = tf.shape(image)
factor = tf.minimum(tf.to_double(1.0),
tf.minimum(tf.to_double(out_shape[0] / shape[0]),
tf.to_double(out_shape[1] / shape[1])))
resize_shape = factor * tf.to_double(shape[0:2])
resize_shape = tf.cast(tf.floor(resize_shape), tf.int32)
image = tf_image.resize_image(image, resize_shape,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
# Pad to expected size.
image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
image, bboxes, out_shape[0], out_shape[1])
elif resize == Resize.WARP_RESIZE:
# Warp resize of the image.
image = tf_image.resize_image(image, out_shape,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
# Split back bounding boxes.
bbox_img = bboxes[0]
bboxes = bboxes[1:]
# Remove difficult boxes.
if difficults is not None:
mask = tf.logical_not(tf.cast(difficults, tf.bool))
labels = tf.boolean_mask(labels, mask)
bboxes = tf.boolean_mask(bboxes, mask)
# Image data format.
if data_format == 'NCHW':
image = tf.transpose(image, perm=(2, 0, 1))
return image, labels, bboxes, bbox_img
def preprocess_image(image,
labels,
bboxes,
out_shape,
data_format,
is_training=False,
**kwargs):
"""Pre-process an given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
resize_side_min: The lower bound for the smallest side of the image for
aspect-preserving resizing. If `is_training` is `False`, then this value
is used for rescaling.
resize_side_max: The upper bound for the smallest side of the image for
aspect-preserving resizing. If `is_training` is `False`, this value is
ignored. Otherwise, the resize side is sampled from
[resize_size_min, resize_size_max].
Returns:
A preprocessed image.
"""
if is_training:
return preprocess_for_train(image, labels, bboxes,
out_shape=out_shape,
data_format=data_format)
else:
return preprocess_for_eval(image, labels, bboxes,
out_shape=out_shape,
data_format=data_format,
**kwargs)
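
A minimal sketch of the eval path on a raw uint8 image tensor, mirroring how the inference code later in this commit drives it (TF 1.x):

img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
image, _, _, bbox_img = preprocess_for_eval(img_input, None, None,
                                            out_shape=EVAL_SIZE)
# image: whitened float32 Tensor of shape (300, 300, 3) in NHWC layout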

@@ -0,0 +1,306 @@
# Copyright 2015 The TensorFlow Authors and Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom image operations.
Most of the following methods extend TensorFlow image library, and part of
the code is shameless copy-paste of the former!
"""
import tensorflow as tf
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_image_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
# =========================================================================== #
# Modification of TensorFlow image routines.
# =========================================================================== #
def _assert(cond, ex_type, msg):
"""A polymorphic assert, works with tensors and boolean expressions.
If `cond` is not a tensor, behave like an ordinary assert statement, except
    that an empty list is returned. If `cond` is a tensor, return a list
containing a single TensorFlow assert op.
Args:
cond: Something evaluates to a boolean value. May be a tensor.
ex_type: The exception class to use.
msg: The error message.
Returns:
A list, containing at most one assert op.
"""
if _is_tensor(cond):
return [control_flow_ops.Assert(cond, [msg])]
else:
if not cond:
raise ex_type(msg)
else:
return []
def _is_tensor(x):
"""Returns `True` if `x` is a symbolic tensor-like object.
Args:
x: A python object to check.
Returns:
`True` if `x` is a `tf.Tensor` or `tf.Variable`, otherwise `False`.
"""
return isinstance(x, (ops.Tensor, variables.Variable))
def _ImageDimensions(image):
"""Returns the dimensions of an image tensor.
Args:
image: A 3-D Tensor of shape `[height, width, channels]`.
Returns:
A list of `[height, width, channels]` corresponding to the dimensions of the
input image. Dimensions that are statically known are python integers,
otherwise they are integer scalar tensors.
"""
if image.get_shape().is_fully_defined():
return image.get_shape().as_list()
else:
static_shape = image.get_shape().with_rank(3).as_list()
dynamic_shape = array_ops.unstack(array_ops.shape(image), 3)
return [s if s is not None else d
for s, d in zip(static_shape, dynamic_shape)]
def _Check3DImage(image, require_static=True):
"""Assert that we are working with properly shaped image.
Args:
image: 3-D Tensor of shape [height, width, channels]
require_static: If `True`, requires that all dimensions of `image` are
known and non-zero.
Raises:
ValueError: if `image.shape` is not a 3-vector.
Returns:
An empty list, if `image` has fully defined dimensions. Otherwise, a list
containing an assert op is returned.
"""
try:
image_shape = image.get_shape().with_rank(3)
except ValueError:
raise ValueError("'image' must be three-dimensional.")
if require_static and not image_shape.is_fully_defined():
raise ValueError("'image' must be fully defined.")
if any(x == 0 for x in image_shape):
raise ValueError("all dims of 'image.shape' must be > 0: %s" %
image_shape)
if not image_shape.is_fully_defined():
return [check_ops.assert_positive(array_ops.shape(image),
["all dims of 'image.shape' "
"must be > 0."])]
else:
return []
def fix_image_flip_shape(image, result):
"""Set the shape to 3 dimensional if we don't know anything else.
Args:
image: original image size
result: flipped or transformed image
Returns:
An image whose shape is at least None,None,None.
"""
image_shape = image.get_shape()
if image_shape == tensor_shape.unknown_shape():
result.set_shape([None, None, None])
else:
result.set_shape(image_shape)
return result
# =========================================================================== #
# Image + BBoxes methods: cropping, resizing, flipping, ...
# =========================================================================== #
def bboxes_crop_or_pad(bboxes,
height, width,
offset_y, offset_x,
target_height, target_width):
"""Adapt bounding boxes to crop or pad operations.
Coordinates are always supposed to be relative to the image.
Arguments:
bboxes: Tensor Nx4 with bboxes coordinates [y_min, x_min, y_max, x_max];
height, width: Original image dimension;
offset_y, offset_x: Offset to apply,
negative if cropping, positive if padding;
target_height, target_width: Target dimension after cropping / padding.
"""
with tf.name_scope('bboxes_crop_or_pad'):
# Rescale bounding boxes in pixels.
scale = tf.cast(tf.stack([height, width, height, width]), bboxes.dtype)
bboxes = bboxes * scale
# Add offset.
offset = tf.cast(tf.stack([offset_y, offset_x, offset_y, offset_x]), bboxes.dtype)
bboxes = bboxes + offset
# Rescale to target dimension.
scale = tf.cast(tf.stack([target_height, target_width,
target_height, target_width]), bboxes.dtype)
bboxes = bboxes / scale
return bboxes
def resize_image_bboxes_with_crop_or_pad(image, bboxes,
target_height, target_width):
"""Crops and/or pads an image to a target width and height.
Resizes an image to a target width and height by either centrally
cropping the image or padding it evenly with zeros.
If `width` or `height` is greater than the specified `target_width` or
`target_height` respectively, this op centrally crops along that dimension.
If `width` or `height` is smaller than the specified `target_width` or
`target_height` respectively, this op centrally pads with 0 along that
dimension.
Args:
image: 3-D tensor of shape `[height, width, channels]`
target_height: Target height.
target_width: Target width.
Raises:
ValueError: if `target_height` or `target_width` are zero or negative.
Returns:
Cropped and/or padded image of shape
`[target_height, target_width, channels]`
"""
with tf.name_scope('resize_with_crop_or_pad'):
image = ops.convert_to_tensor(image, name='image')
assert_ops = []
assert_ops += _Check3DImage(image, require_static=False)
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
image = control_flow_ops.with_dependencies(assert_ops, image)
# `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
# Make sure our checks come first, so that error messages are clearer.
if _is_tensor(target_height):
target_height = control_flow_ops.with_dependencies(
assert_ops, target_height)
if _is_tensor(target_width):
target_width = control_flow_ops.with_dependencies(assert_ops, target_width)
def max_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.maximum(x, y)
else:
return max(x, y)
def min_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.minimum(x, y)
else:
return min(x, y)
def equal_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.equal(x, y)
else:
return x == y
height, width, _ = _ImageDimensions(image)
width_diff = target_width - width
offset_crop_width = max_(-width_diff // 2, 0)
offset_pad_width = max_(width_diff // 2, 0)
height_diff = target_height - height
offset_crop_height = max_(-height_diff // 2, 0)
offset_pad_height = max_(height_diff // 2, 0)
# Maybe crop if needed.
height_crop = min_(target_height, height)
width_crop = min_(target_width, width)
cropped = tf.image.crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
height_crop, width_crop)
bboxes = bboxes_crop_or_pad(bboxes,
height, width,
-offset_crop_height, -offset_crop_width,
height_crop, width_crop)
# Maybe pad if needed.
resized = tf.image.pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
target_height, target_width)
bboxes = bboxes_crop_or_pad(bboxes,
height_crop, width_crop,
offset_pad_height, offset_pad_width,
target_height, target_width)
# In theory all the checks below are redundant.
if resized.get_shape().ndims is None:
raise ValueError('resized contains no shape.')
resized_height, resized_width, _ = _ImageDimensions(resized)
assert_ops = []
assert_ops += _assert(equal_(resized_height, target_height), ValueError,
'resized height is not correct.')
assert_ops += _assert(equal_(resized_width, target_width), ValueError,
'resized width is not correct.')
resized = control_flow_ops.with_dependencies(assert_ops, resized)
return resized, bboxes
def resize_image(image, size,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False):
"""Resize an image and bounding boxes.
"""
# Resize image.
with tf.name_scope('resize_image'):
height, width, channels = _ImageDimensions(image)
image = tf.expand_dims(image, 0)
image = tf.image.resize_images(image, size,
method, align_corners)
image = tf.reshape(image, tf.stack([size[0], size[1], channels]))
return image
def random_flip_left_right(image, bboxes, seed=None):
"""Random flip left-right of an image and its bounding boxes.
"""
def flip_bboxes(bboxes):
"""Flip bounding boxes coordinates.
"""
bboxes = tf.stack([bboxes[:, 0], 1 - bboxes[:, 3],
bboxes[:, 2], 1 - bboxes[:, 1]], axis=-1)
return bboxes
# Random flip. Tensorflow implementation.
with tf.name_scope('random_flip_left_right'):
image = ops.convert_to_tensor(image, name='image')
_Check3DImage(image, require_static=False)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
# Flip image.
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse_v2(image, [1]),
lambda: image)
# Flip bboxes.
bboxes = control_flow_ops.cond(mirror_cond,
lambda: flip_bboxes(bboxes),
lambda: bboxes)
return fix_image_flip_shape(image, result), bboxes
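
A minimal TF 1.x check of the flip helper; the image and box values are illustrative:

with tf.Session() as sess:
    image = tf.zeros([4, 6, 3])
    bboxes = tf.constant([[0.1, 0.2, 0.5, 0.4]])  # [ymin, xmin, ymax, xmax]
    flipped, flipped_boxes = random_flip_left_right(image, bboxes, seed=0)
    # when the flip fires, xmin/xmax become 1 - xmax and 1 - xmin
    print(sess.run(flipped_boxes))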

@@ -0,0 +1,159 @@
import tensorflow as tf
import numpy as np
import os, sys, time
from anchors import generate_anchors
from model import ssd_common, ssd_vgg_300
from datautil.parser import get_parser_func
from datautil.ssd_vgg_preprocessing import preprocess_for_eval, preprocess_for_train
from tfutil import endpoints, tf_utils
import tfextended as tfe
from finetune.train_eval_base import TrainerBase
class EvalVggSsd(TrainerBase):
    '''
    Run evaluation of a fine-tuned model
    over a validation recordset
    '''
def __init__(self, ckpt_dir, validation_recordset_files, steps_to_save = 1000, num_steps = 1000, num_classes = 21, print_steps = 10):
        '''
        ckpt_dir - directory of checkpoint metagraph
        validation_recordset_files - list of files representing the validation recordset
        '''
super().__init__(ckpt_dir, validation_recordset_files, steps_to_save, num_steps, num_classes, print_steps, 1, is_training=False)
self.eval_classes = num_classes
def get_eval_ops(self, b_labels, b_bboxes, predictions, localizations):
'''
Create evaluation operation
'''
b_difficults = tf.zeros(tf.shape(b_labels), dtype=tf.int64)
# Performing post-processing on CPU: loop-intensive, usually more efficient.
with tf.device('/device:CPU:0'):
# Detected objects from SSD output.
detected_localizations = self.ssd_net.bboxes_decode(localizations, self.anchors)
rscores, rbboxes = \
self.ssd_net.detected_bboxes(predictions, detected_localizations,
select_threshold=0.01,
nms_threshold=0.45,
clipping_bbox=None,
top_k=400,
keep_top_k=20)
# Compute TP and FP statistics.
num_gbboxes, tp, fp, rscores = \
tfe.bboxes_matching_batch(rscores.keys(), rscores, rbboxes,
b_labels, b_bboxes, b_difficults,
matching_threshold=0.5)
# =================================================================== #
# Evaluation metrics.
# =================================================================== #
dict_metrics = {}
metrics_scope = 'ssd_metrics_scope'
# First add all losses.
for loss in tf.get_collection(tf.GraphKeys.LOSSES):
dict_metrics[loss.op.name] = tf.metrics.mean(loss, name=metrics_scope)
# Extra losses as well.
for loss in tf.get_collection('EXTRA_LOSSES'):
dict_metrics[loss.op.name] = tf.metrics.mean(loss, name=metrics_scope)
# Add metrics to summaries and Print on screen.
for name, metric in dict_metrics.items():
# summary_name = 'eval/%s' % name
summary_name = name
tf.summary.scalar(summary_name, metric[0])
# FP and TP metrics.
tp_fp_metric = tfe.streaming_tp_fp_arrays(num_gbboxes, tp, fp, rscores, name=metrics_scope)
for c in tp_fp_metric[0].keys():
dict_metrics['tp_fp_%s' % c] = (tp_fp_metric[0][c],
tp_fp_metric[1][c])
# Add to summaries precision/recall values.
aps_voc12 = {}
# TODO: We cut it short by the actual number of classes we have
for c in list(tp_fp_metric[0].keys())[:self.eval_classes - 1]:
# Precison and recall values.
prec, rec = tfe.precision_recall(*tp_fp_metric[0][c])
# Average precision VOC12.
v = tfe.average_precision_voc12(prec, rec)
summary_name = 'AP_VOC12/%s' % c
tf.summary.scalar(summary_name, v)
aps_voc12[c] = v
# Mean average precision VOC12.
summary_name = 'AP_VOC12/mAP'
mAP = tf.add_n(list(aps_voc12.values())) / len(aps_voc12)
tf.summary.scalar(summary_name, mAP)
names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map(dict_metrics)
# Split into values and updates ops.
return (names_to_values, names_to_updates, mAP)
def eval(self):
tf.logging.set_verbosity(tf.logging.INFO)
# shorthand
sess = self.sess
sess.run(self.iterator.initializer)
batch_data = self.iterator.get_next()
# image, classes, scores, ground_truths are neatly packed into a flat list
# this is how we will slice it to extract the data we need:
# we will convert the flat list into a list of lists, where each sub-list
# is as long as each slice dimension
slice_shape = [1] * 3
b_image, b_labels, b_bboxes = tf_utils.reshape_list(batch_data, slice_shape)
# network endpoints
predictions, localizations, _, _ = self.get_output_tensors(b_image)
# branch to create evaluation operation
_, names_to_updates, mAP = \
self.get_eval_ops(b_labels, b_bboxes, predictions, localizations)
eval_update_ops = tf_utils.reshape_list(list(names_to_updates.values()))
# summaries
summary_op = tf.summary.merge_all()
saver = tf.train.Saver()
eval_writer = tf.summary.FileWriter(self.ckpt_dir + '/eval')
# initialize globals
sess.run(tf.global_variables_initializer())
saver.restore(self.sess, self.ckpt_file)
sess.run(tf.local_variables_initializer())
tf.logging.info(f"Starting evaluation for {self.num_steps} steps")
cur_step = self.latest_ckpt_step
for step in range(self.num_steps):
print(f"Evaluation step: {step + 1}", end='\r', flush=True)
_, summary = sess.run([eval_update_ops, summary_op])
            if (step + 1) % self.print_steps == 0 or step == self.num_steps - 1:
eval_writer.add_summary(summary, cur_step + step + 1)
summary_final, mAP_val = sess.run([summary_op, mAP])
print(f"\nmAP: {mAP_val:.4f}")
if (step + 1) % self.print_steps != 0:
eval_writer.add_summary(summary_final, self.num_steps + cur_step)
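
A minimal usage sketch; the checkpoint directory and recordset file are illustrative assumptions:

evaluator = EvalVggSsd('models/ssdvgg', ['voc_test_0000.tfrecord'], num_steps=500)
evaluator.eval()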

@@ -0,0 +1,95 @@
import tensorflow as tf
import os, time
from anchors import generate_anchors
from model import np_methods
from tfutil import endpoints, tf_utils
from datautil.ssd_vgg_preprocessing import preprocess_for_eval
import tfextended as tfe
from azureml.accel.models import SsdVgg
class InferVggSsd:
    '''
    Run inference with a fine-tuned SSD-VGG model
    on single images
    '''
def __init__(self, ckpt_dir, ckpt_file=None, gpu=True):
'''
ckpt_dir - directory of checkpoint metagraph
'''
if gpu:
gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
else:
config = tf.ConfigProto(log_device_placement=False, device_count={'GPU': 0})
self.sess = tf.Session(config=config)
ssd_net_graph = SsdVgg(ckpt_dir)
self.ckpt_dir = ssd_net_graph.model_path
self.img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
# Evaluation pre-processing: resize to SSD net shape.
image_pre, _, _, self.bbox_img = preprocess_for_eval(
self.img_input, None, None, generate_anchors.img_shape, "NHWC")
self.image_4d = tf.expand_dims(image_pre, 0)
# import the graph
ssd_net_graph.import_graph_def(self.image_4d, is_training=False)
graph = tf.get_default_graph()
self.localizations = [graph.get_tensor_by_name(tensor_name) for tensor_name in endpoints.localizations_names]
self.predictions = [graph.get_tensor_by_name(tensor_name) for tensor_name in endpoints.predictions_names]
# Restore SSD model.
self.sess.run(tf.global_variables_initializer())
if ckpt_file is None:
ssd_net_graph.restore_weights(self.sess)
else:
saver = tf.train.Saver()
saver.restore(self.sess, os.path.join(self.ckpt_dir, ckpt_file))
# SSD default anchor boxes.
self.ssd_anchors = generate_anchors.ssd_anchors_all_layers()
def close(self):
self.sess.close()
tf.reset_default_graph()
def process_image(self, img, select_threshold=0.4, nms_threshold=.45, net_shape=(300, 300)):
# Run SSD network.
rpredictions, rlocalisations, rbbox_img = \
self.sess.run([self.predictions, self.localizations, self.bbox_img],
feed_dict={self.img_input: img})
# Get classes and bboxes from the net outputs.
rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
rpredictions, rlocalisations, self.ssd_anchors,
select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)
rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes)
rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)
rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)
return rclasses, rscores, rbboxes
def infer(self, img, visualize):
rclasses, rscores, rbboxes = self.process_image(img)
if visualize:
from tfutil import visualization
visualization.plt_bboxes(img, rclasses, rscores, rbboxes)
return rclasses, rscores, rbboxes
def infer_file(self, im_file, visualize=False):
import cv2
img = cv2.imread(im_file)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return self.infer(img, visualize)
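
A minimal usage sketch; the model directory and image path are illustrative assumptions:

infer = InferVggSsd('models/ssdvgg', gpu=False)
classes, scores, bboxes = infer.infer_file('shelf.jpg', visualize=True)
infer.close()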

@@ -0,0 +1,57 @@
import tensorflow as tf
import os, time
from azureml.accel.models import SsdVgg
import azureml.accel.models.utils as utils
class SaverVggSsd:
    '''
    Export a fine-tuned SSD-VGG model
    as a TensorFlow SavedModel for deployment
    '''
def __init__(self, ckpt_dir):
'''
ckpt_dir - directory of checkpoint metagraph
'''
config = tf.ConfigProto(log_device_placement=False, device_count={'GPU': 0})
self.sess = tf.Session(config=config)
ssd_net_graph = SsdVgg(ckpt_dir, is_frozen=True)
self.ckpt_dir = ssd_net_graph.model_path
self.in_images = tf.placeholder(tf.string)
self.image_tensors = utils.preprocess_array(self.in_images, output_width=300, output_height=300,
preserve_aspect_ratio=False)
self.output_tensors = ssd_net_graph.import_graph_def(self.image_tensors, is_training=False)
self.output_names = ssd_net_graph.output_tensor_list
self.input_name_str = self.in_images.name
# Restore SSD model.
ssd_net_graph.restore_weights(self.sess)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def close(self):
self.sess.close()
tf.reset_default_graph()
def save_for_deployment(self, saved_path):
output_map = {'out_{}'.format(i): output for i, output in enumerate(self.output_tensors)}
tf.saved_model.simple_save(self.sess,
saved_path,
inputs={"images": self.in_images},
outputs=output_map)
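
A minimal usage sketch; the directories are illustrative assumptions:

with SaverVggSsd('models/ssdvgg') as net_saver:
    net_saver.save_for_deployment('models/ssdvgg/saved_model')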

@@ -0,0 +1,144 @@
import tensorflow as tf
import numpy as np
import os, sys, time, re
from anchors import generate_anchors
from model import ssd_common, ssd_vgg_300
from datautil.parser import get_parser_func
from datautil.ssd_vgg_preprocessing import preprocess_for_eval, preprocess_for_train
from tfutil import endpoints, tf_utils
import tfextended as tfe
from finetune.train_eval_base import TrainerBase
class TrainVggSsd(TrainerBase):
    '''
    Run fine-tuning
    over a training recordset
    '''
def __init__(self, ckpt_dir, train_recordset_files,
steps_to_save = 1000, num_steps = 1000, num_classes = 21,
print_steps = 10, batch_size = 2,
learning_rate = 1e-4, learning_rate_decay_steps=None, learning_rate_decay_value = None,
adam_beta1 = 0.9, adam_beta2 = 0.999, adam_epsilon = 1e-8):
        '''
        ckpt_dir - directory of checkpoint metagraph
        train_recordset_files - list of files representing the recordset for training
        '''
super().__init__(ckpt_dir, train_recordset_files, steps_to_save, num_steps, num_classes, print_steps, batch_size)
# optimizer parameters
self.learning_rate = learning_rate
self.learning_rate_decay_steps = learning_rate_decay_steps
self.learning_rate_decay_value = learning_rate_decay_value
if self.learning_rate <= 0 \
or (self.learning_rate_decay_value is not None and self.learning_rate_decay_value <= 0) \
or (self.learning_rate_decay_steps is not None and self.learning_rate_decay_steps <= 0) \
or (self.learning_rate_decay_steps is None and self.learning_rate_decay_value is not None) \
or (self.learning_rate_decay_steps is not None and self.learning_rate_decay_value is None):
raise ValueError("learning rate, learning rate steps, learning rate decay must be positive, \
learning decay steps and value must be both present or both absent")
self.adam_beta1 = adam_beta1
self.adam_beta2 = adam_beta2
self.adam_epsilon = adam_epsilon
def get_optimizer(self, learning_rate):
optimizer = tf.train.AdamOptimizer(
learning_rate,
beta1=self.adam_beta1,
beta2=self.adam_beta2,
epsilon=self.adam_epsilon)
return optimizer
def get_learning_rate(self, global_step):
'''
Configure learning rate based on decay specifications
'''
if self.learning_rate_decay_steps is None:
return tf.constant(self.learning_rate, name = 'fixed_learning_rate')
else:
return tf.train.exponential_decay(self.learning_rate, global_step, \
self.learning_rate_decay_steps, self.learning_rate_decay_value, \
staircase=True, name="exponential_decay_learning_rate")
def train(self):
tf.logging.set_verbosity(tf.logging.INFO)
# shorthand
sess = self.sess
batch_data = self.iterator.get_next()
# image, classes, scores, ground_truths are neatly packed into a flat list
# this is how we will slice it to extract the data we need:
# we will convert the flat list into a list of lists, where each sub-list
# is as long as each slice dimension
slice_shape = [1] + [len(self.anchors)] * 3
b_image, b_classes, b_localizations, b_scores = tf_utils.reshape_list(batch_data, slice_shape)
# network endpoints
_, localizations, logits, bw_saver = self.get_output_tensors(b_image)
variables_to_train = tf.trainable_variables()
sess.run(tf.variables_initializer(variables_to_train))  # tf.initialize_variables is deprecated
# add losses
total_loss = self.ssd_net.losses(logits, localizations, b_classes, b_localizations, b_scores)
tf.summary.scalar("total_loss", total_loss)
global_step = tf.train.get_or_create_global_step()
learning_rate = self.get_learning_rate(global_step)
# configure learning rate now that we have the global step
# add optimizer
optimizer = self.get_optimizer(learning_rate)
tf.summary.scalar("learning_rate", learning_rate)
grads_and_vars = optimizer.compute_gradients(total_loss, var_list=variables_to_train)
grad_updates = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
# initialize all the variables we should initialize
# weights will be restored right after
sess.run(tf.global_variables_initializer())
# after the first restore, we want global step in our checkpoint
saver = tf.train.Saver(variables_to_train + [global_step])
if self.latest_ckpt_step == 0:
bw_saver.restore(sess, self.ckpt_file)
else:
saver.restore(sess, self.ckpt_file)
self.ckpt_file = os.path.join(self.ckpt_dir, self.ckpt_prefix)
# summaries
train_summary_op = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(self.ckpt_dir + '/train', tf.get_default_graph())
tf.logging.info(f"Starting training for {self.num_steps} steps")
sess.run(self.iterator.initializer)
# training loop
start = time.time()
for _ in range(self.num_steps):
loss, _, cur_step, summary = sess.run([total_loss, grad_updates, global_step, train_summary_op])
cur_step += 1
if cur_step % self.print_steps == 0:
print(f"{cur_step}: loss: {loss:.3f}, avg per step: {(time.time() - start) / self.print_steps:.3f} sec", end='\r', flush=True)
train_writer.add_summary(summary, cur_step + 1)
start = time.time()
if cur_step % self.steps_to_save == 0:
saver.save(sess, self.ckpt_file, global_step=global_step)
print("\n")

View File

@@ -0,0 +1,125 @@
import tensorflow as tf
import numpy as np
import os, sys, time, glob, re
from anchors import generate_anchors
from model import ssd_common, ssd_vgg_300
from datautil.parser import get_parser_func
from datautil.ssd_vgg_preprocessing import preprocess_for_eval, preprocess_for_train
from tfutil import endpoints, tf_utils
import tfextended as tfe
from azureml.accel.models import SsdVgg
slim = tf.contrib.slim
class TrainerBase:
'''
Base class for fine-tuning: owns the checkpoint, data pipeline and session.
'''
def __init__(self, ckpt_dir, recordset_files,
steps_to_save = 1000, num_steps = 1000, num_classes = 21, print_steps = 10, batch_size=2, is_training=True):
'''
ckpt_dir - directory of the checkpoint metagraph
recordset_files - list of files representing the recordset for training or evaluation
'''
self.is_training = is_training
# This will pull the model with its weights
# And seed the checkpoint
self.ssd_net_graph = SsdVgg(ckpt_dir)
self.ckpt_dir = self.ssd_net_graph.model_path
self.ckpt_file = tf.train.latest_checkpoint(self.ssd_net_graph.model_path)
try:
self.latest_ckpt_step = int(re.findall("-[0-9]+$", self.ckpt_file)[0][1:])
except (IndexError, TypeError):  # no step suffix, or no checkpoint found
self.latest_ckpt_step = 0
self.recordset = recordset_files
self.ckpt_prefix = os.path.split(self.ssd_net_graph.model_ref + "_bw")[1]
self.pb_graph_path = os.path.join(self.ckpt_dir, self.ckpt_prefix + ".graph.pb")
#if self.is_training:
self.graph_file = os.path.join(self.ckpt_dir, self.ckpt_prefix + ".meta")
#else:
# self.graph_file = self.ckpt_file + ".meta"
# anchors
self.anchors = generate_anchors.ssd_anchors_all_layers()
# shuffle
self.n_shuffle = 1000
self.num_steps = num_steps
# num of classes
# NOTE: the pretrained graph was built for 21 (Pascal VOC) classes,
# so this must stay 21 regardless of the num_classes argument
self.num_classes = 21
# initialize data pipeline
self.batch_size = batch_size
self.iterator = None
self.prep_dataset_and_iterator()
self.steps_to_save = steps_to_save
self.print_steps = print_steps
# for losses etc
self.ssd_net = ssd_vgg_300.SSDNet()
# input placeholder
self.input_tensor_name = self.ssd_net_graph.input_tensor_list[0]
def __enter__(self):
gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
self.sess = tf.Session(config=config)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.sess.close()
tf.reset_default_graph()
def prep_dataset_and_iterator(self):
'''
Create datasets for training or validation
'''
var_scope = "training" if self.is_training else "eval"
parse_func = get_parser_func(self.anchors, self.num_classes, self.is_training, var_scope)
with tf.variable_scope(var_scope):
# data pipeline
dataset = tf.data.TFRecordDataset(self.recordset)
if self.is_training:
dataset = dataset.shuffle(self.n_shuffle)
dataset = dataset.map(parse_func)
dataset = dataset.repeat()
dataset = dataset.batch(self.batch_size)
dataset = dataset.prefetch(1)
self.iterator = dataset.make_initializable_iterator()
def get_output_tensors(self, image):
is_training = tf.constant(self.is_training, dtype=tf.bool, shape=())
input_map = {self.input_tensor_name: image, "is_training": is_training}
saver = tf.train.import_meta_graph(self.graph_file, input_map=input_map)
graph = tf.get_default_graph()
logits = [graph.get_tensor_by_name(tensor_name) for tensor_name in endpoints.logit_names]
localizations = [graph.get_tensor_by_name(tensor_name) for tensor_name in endpoints.localizations_names]
predictions = [graph.get_tensor_by_name(tensor_name) for tensor_name in endpoints.predictions_names]
return predictions, localizations, logits, saver
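# Sketch (assumed semantics of tf_utils.reshape_list, used by the trainers):
# the parser emits one flat list [image, classes..., localizations..., scores...],
# and reshape_list regroups it according to a shape such as [1, N, N, N].
# A pure-Python equivalent of that regrouping would be:
#
#   def regroup(flat, shape):
#       out, i = [], 0
#       for n in shape:
#           out.append(flat[i] if n == 1 else list(flat[i:i + n]))
#           i += n
#       return out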

View File

@@ -0,0 +1,164 @@
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implement some custom layers, not provided by TensorFlow.
Trying to follow as much as possible the style/standards used in
tf.contrib.layers
"""
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import add_arg_scope
from tensorflow.contrib.layers.python.layers import initializers
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.python.ops import nn
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import variable_scope
def abs_smooth(x):
"""Smoothed absolute function. Useful to compute an L1 smooth error.
Define as:
x^2 / 2 if abs(x) < 1
abs(x) - 0.5 if abs(x) > 1
We use here a differentiable definition using min(x) and abs(x). Clearly
not optimal, but good enough for our purpose!
"""
absx = tf.abs(x)
minx = tf.minimum(absx, 1)
r = 0.5 * ((absx - 1) * minx + absx)
return r
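# Quick check of the closed form above (illustration only):
#   for |x| <= 1: minx = |x|, so r = 0.5 * ((|x| - 1) * |x| + |x|) = 0.5 * x^2
#   for |x| >  1: minx = 1,   so r = 0.5 * ((|x| - 1) + |x|)       = |x| - 0.5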
@add_arg_scope
def l2_normalization(
inputs,
scaling=False,
scale_initializer=init_ops.ones_initializer(),
reuse=None,
variables_collections=None,
outputs_collections=None,
data_format='NHWC',
trainable=True,
scope=None):
"""Implement L2 normalization on every feature (i.e. spatial normalization).
Should be extended in some near future to other dimensions, providing a more
flexible normalization framework.
Args:
inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
scaling: whether or not to add a post scaling operation along the dimensions
which have been normalized.
scale_initializer: An initializer for the weights.
reuse: whether or not the layer and its variables should be reused. To be
able to reuse the layer scope must be given.
variables_collections: optional list of collections for all the variables or
a dictionary containing a different list of collection per variable.
outputs_collections: collection to add the outputs.
data_format: NHWC or NCHW data format.
trainable: If `True` also add variables to the graph collection
`GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
scope: Optional scope for `variable_scope`.
Returns:
A `Tensor` representing the output of the operation.
"""
with variable_scope.variable_scope(
scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
inputs_shape = inputs.get_shape()
inputs_rank = inputs_shape.ndims
dtype = inputs.dtype.base_dtype
if data_format == 'NHWC':
# norm_dim = tf.range(1, inputs_rank-1)
norm_dim = tf.range(inputs_rank-1, inputs_rank)
params_shape = inputs_shape[-1:]
elif data_format == 'NCHW':
# norm_dim = tf.range(2, inputs_rank)
norm_dim = tf.range(1, 2)
params_shape = inputs_shape[1:2]
# Normalize along the channel dimension.
outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
# Additional scaling.
if scaling:
scale_collections = utils.get_variable_collections(
variables_collections, 'scale')
scale = variables.model_variable('gamma',
shape=params_shape,
dtype=dtype,
initializer=scale_initializer,
collections=scale_collections,
trainable=trainable)
if data_format == 'NHWC':
outputs = tf.multiply(outputs, scale)
elif data_format == 'NCHW':
scale = tf.expand_dims(scale, axis=-1)
scale = tf.expand_dims(scale, axis=-1)
outputs = tf.multiply(outputs, scale)
# outputs = tf.transpose(outputs, perm=(0, 2, 3, 1))
return utils.collect_named_outputs(outputs_collections,
sc.original_name_scope, outputs)
@add_arg_scope
def pad2d(inputs,
pad=(0, 0),
mode='CONSTANT',
data_format='NHWC',
trainable=True,
scope=None):
"""2D Padding layer, adding a symmetric padding to H and W dimensions.
Aims to mimic padding in Caffe and MXNet, helping the port of models to
TensorFlow. Tries to follow the naming convention of `tf.contrib.layers`.
Args:
inputs: 4D input Tensor;
pad: 2-Tuple with padding values for H and W dimensions;
mode: Padding mode. C.f. `tf.pad`
data_format: NHWC or NCHW data format.
"""
with tf.name_scope(scope, 'pad2d', [inputs]):
# Padding shape.
if data_format == 'NHWC':
paddings = [[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]]
elif data_format == 'NCHW':
paddings = [[0, 0], [0, 0], [pad[0], pad[0]], [pad[1], pad[1]]]
net = tf.pad(inputs, paddings, mode=mode)
return net
@add_arg_scope
def channel_to_last(inputs,
data_format='NHWC',
scope=None):
"""Move the channel axis to the last dimension. Allows to
provide a single output format whatever the input data format.
Args:
inputs: Input Tensor;
data_format: NHWC or NCHW.
Return:
Input in NHWC format.
"""
with tf.name_scope(scope, 'channel_to_last', [inputs]):
if data_format == 'NHWC':
net = inputs
elif data_format == 'NCHW':
net = tf.transpose(inputs, perm=(0, 2, 3, 1))
return net
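# Usage sketch: pad2d followed by a VALID stride-2 convolution reproduces
# Caffe's pad=1, 3x3/stride-2 convolution; this is how blocks 8 and 9 of the
# SSD net (see ssd_vgg_300.ssd_net) use it:
#
#   net = pad2d(net, pad=(1, 1))
#   net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')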

View File

@@ -0,0 +1,252 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Additional Numpy methods. Big mess of many things!
"""
import numpy as np
# =========================================================================== #
# Numpy implementations of SSD boxes functions.
# =========================================================================== #
def ssd_bboxes_decode(feat_localizations,
anchor_bboxes,
prior_scaling=[0.1, 0.1, 0.2, 0.2]):
"""Compute the relative bounding boxes from the layer features and
reference anchor bounding boxes.
Return:
numpy array Nx4: ymin, xmin, ymax, xmax
"""
# Reshape for easier broadcasting.
l_shape = feat_localizations.shape
feat_localizations = np.reshape(feat_localizations,
(-1, l_shape[-2], l_shape[-1]))
yref, xref, href, wref = anchor_bboxes
xref = np.reshape(xref, [-1, 1])
yref = np.reshape(yref, [-1, 1])
# Compute center, height and width
cx = feat_localizations[:, :, 0] * wref * prior_scaling[0] + xref
cy = feat_localizations[:, :, 1] * href * prior_scaling[1] + yref
w = wref * np.exp(feat_localizations[:, :, 2] * prior_scaling[2])
h = href * np.exp(feat_localizations[:, :, 3] * prior_scaling[3])
# bboxes: ymin, xmin, xmax, ymax.
bboxes = np.zeros_like(feat_localizations)
bboxes[:, :, 0] = cy - h / 2.
bboxes[:, :, 1] = cx - w / 2.
bboxes[:, :, 2] = cy + h / 2.
bboxes[:, :, 3] = cx + w / 2.
# Back to original shape.
bboxes = np.reshape(bboxes, l_shape)
return bboxes
def ssd_bboxes_select_layer(predictions_layer,
localizations_layer,
anchors_layer,
select_threshold=0.5,
img_shape=(300, 300),
num_classes=21,
decode=True):
"""Extract classes, scores and bounding boxes from features in one layer.
Return:
classes, scores, bboxes: Numpy arrays...
"""
# First decode localizations features if necessary.
if decode:
localizations_layer = ssd_bboxes_decode(localizations_layer, anchors_layer)
# Reshape features to: Batches x N x N_labels | 4.
p_shape = predictions_layer.shape
batch_size = p_shape[0] if len(p_shape) == 5 else 1
predictions_layer = np.reshape(predictions_layer,
(batch_size, -1, p_shape[-1]))
l_shape = localizations_layer.shape
localizations_layer = np.reshape(localizations_layer,
(batch_size, -1, l_shape[-1]))
# Boxes selection: use threshold or score > no-label criteria.
if select_threshold is None or select_threshold == 0:
# Class prediction and scores: assign 0. to 0-class
classes = np.argmax(predictions_layer, axis=2)
scores = np.amax(predictions_layer, axis=2)
mask = (classes > 0)
classes = classes[mask]
scores = scores[mask]
bboxes = localizations_layer[mask]
else:
sub_predictions = predictions_layer[:, :, 1:]
idxes = np.where(sub_predictions > select_threshold)
classes = idxes[-1]+1
scores = sub_predictions[idxes]
bboxes = localizations_layer[idxes[:-1]]
return classes, scores, bboxes
def ssd_bboxes_select(predictions_net,
localizations_net,
anchors_net,
select_threshold=0.5,
img_shape=(300, 300),
num_classes=21,
decode=True):
"""Extract classes, scores and bounding boxes from network output layers.
Return:
classes, scores, bboxes: Numpy arrays...
"""
l_classes = []
l_scores = []
l_bboxes = []
# l_layers = []
# l_idxes = []
for i in range(len(predictions_net)):
classes, scores, bboxes = ssd_bboxes_select_layer(
predictions_net[i], localizations_net[i], anchors_net[i],
select_threshold, img_shape, num_classes, decode)
l_classes.append(classes)
l_scores.append(scores)
l_bboxes.append(bboxes)
# Debug information.
# l_layers.append(i)
# l_idxes.append((i, idxes))
classes = np.concatenate(l_classes, 0)
scores = np.concatenate(l_scores, 0)
bboxes = np.concatenate(l_bboxes, 0)
return classes, scores, bboxes
# =========================================================================== #
# Common functions for bboxes handling and selection.
# =========================================================================== #
def bboxes_sort(classes, scores, bboxes, top_k=400):
"""Sort bounding boxes by decreasing order and keep only the top_k
"""
# if priority_inside:
# inside = (bboxes[:, 0] > margin) & (bboxes[:, 1] > margin) & \
# (bboxes[:, 2] < 1-margin) & (bboxes[:, 3] < 1-margin)
# idxes = np.argsort(-scores)
# inside = inside[idxes]
# idxes = np.concatenate([idxes[inside], idxes[~inside]])
idxes = np.argsort(-scores)
classes = classes[idxes][:top_k]
scores = scores[idxes][:top_k]
bboxes = bboxes[idxes][:top_k]
return classes, scores, bboxes
def bboxes_clip(bbox_ref, bboxes):
"""Clip bounding boxes with respect to reference bbox.
"""
bboxes = np.copy(bboxes)
bboxes = np.transpose(bboxes)
bbox_ref = np.transpose(bbox_ref)
bboxes[0] = np.maximum(bboxes[0], bbox_ref[0])
bboxes[1] = np.maximum(bboxes[1], bbox_ref[1])
bboxes[2] = np.minimum(bboxes[2], bbox_ref[2])
bboxes[3] = np.minimum(bboxes[3], bbox_ref[3])
bboxes = np.transpose(bboxes)
return bboxes
def bboxes_resize(bbox_ref, bboxes):
"""Resize bounding boxes based on a reference bounding box,
assuming that the latter is [0, 0, 1, 1] after transform.
"""
bboxes = np.copy(bboxes)
# Translate.
bboxes[:, 0] -= bbox_ref[0]
bboxes[:, 1] -= bbox_ref[1]
bboxes[:, 2] -= bbox_ref[0]
bboxes[:, 3] -= bbox_ref[1]
# Resize.
resize = [bbox_ref[2] - bbox_ref[0], bbox_ref[3] - bbox_ref[1]]
bboxes[:, 0] /= resize[0]
bboxes[:, 1] /= resize[1]
bboxes[:, 2] /= resize[0]
bboxes[:, 3] /= resize[1]
return bboxes
def bboxes_jaccard(bboxes1, bboxes2):
"""Computing jaccard index between bboxes1 and bboxes2.
Note: bboxes1 and bboxes2 can be multi-dimensional, but should broacastable.
"""
bboxes1 = np.transpose(bboxes1)
bboxes2 = np.transpose(bboxes2)
# Intersection bbox and volume.
int_ymin = np.maximum(bboxes1[0], bboxes2[0])
int_xmin = np.maximum(bboxes1[1], bboxes2[1])
int_ymax = np.minimum(bboxes1[2], bboxes2[2])
int_xmax = np.minimum(bboxes1[3], bboxes2[3])
int_h = np.maximum(int_ymax - int_ymin, 0.)
int_w = np.maximum(int_xmax - int_xmin, 0.)
int_vol = int_h * int_w
# Union volume.
vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1])
vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1])
jaccard = int_vol / (vol1 + vol2 - int_vol)
return jaccard
def bboxes_intersection(bboxes_ref, bboxes2):
"""Computing jaccard index between bboxes1 and bboxes2.
Note: bboxes1 and bboxes2 can be multi-dimensional, but should broacastable.
"""
bboxes_ref = np.transpose(bboxes_ref)
bboxes2 = np.transpose(bboxes2)
# Intersection bbox and volume.
int_ymin = np.maximum(bboxes_ref[0], bboxes2[0])
int_xmin = np.maximum(bboxes_ref[1], bboxes2[1])
int_ymax = np.minimum(bboxes_ref[2], bboxes2[2])
int_xmax = np.minimum(bboxes_ref[3], bboxes2[3])
int_h = np.maximum(int_ymax - int_ymin, 0.)
int_w = np.maximum(int_xmax - int_xmin, 0.)
int_vol = int_h * int_w
# Union volume.
vol = (bboxes_ref[2] - bboxes_ref[0]) * (bboxes_ref[3] - bboxes_ref[1])
score = int_vol / vol
return score
def bboxes_nms(classes, scores, bboxes, nms_threshold=0.45):
"""Apply non-maximum selection to bounding boxes.
"""
keep_bboxes = np.ones(scores.shape, dtype=np.bool)
for i in range(scores.size-1):
if keep_bboxes[i]:
# Compute overlap with the bboxes that follow.
overlap = bboxes_jaccard(bboxes[i], bboxes[(i+1):])
# Overlap threshold for keeping + checking part of the same class
keep_overlap = np.logical_or(overlap < nms_threshold, classes[(i+1):] != classes[i])
keep_bboxes[(i+1):] = np.logical_and(keep_bboxes[(i+1):], keep_overlap)
idxes = np.where(keep_bboxes)
return classes[idxes], scores[idxes], bboxes[idxes]
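# Toy example (illustration only): two same-class boxes with IoU 0.81.
#
#   classes = np.array([1, 1]); scores = np.array([0.9, 0.8])
#   bboxes = np.array([[0., 0., 1., 1.], [0., 0., .9, .9]])
#   bboxes_nms(classes, scores, bboxes, nms_threshold=0.45)
#   # -> keeps only the first box: overlap 0.81 > 0.45 suppresses the second.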
def bboxes_nms_fast(classes, scores, bboxes, threshold=0.45):
"""Apply non-maximum selection to bounding boxes.
"""
pass

View File

@@ -0,0 +1,408 @@
# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Shared function between different SSD implementations.
"""
import numpy as np
import tensorflow as tf
import tfextended as tfe
# =========================================================================== #
# TensorFlow implementation of boxes SSD encoding / decoding.
# =========================================================================== #
def tf_ssd_bboxes_encode_layer(labels,
bboxes,
anchors_layer,
num_classes,
ignore_threshold=0.5,
prior_scaling=[0.1, 0.1, 0.2, 0.2],
dtype=tf.float32):
"""Encode groundtruth labels and bounding boxes using SSD anchors from
one layer.
Arguments:
labels: 1D Tensor(int64) containing groundtruth labels;
bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
anchors_layer: Numpy array with layer anchors;
ignore_threshold: Threshold for positive match with groundtruth bboxes;
prior_scaling: Scaling of encoded coordinates.
Return:
(target_labels, target_localizations, target_scores): Target Tensors.
"""
# Anchors coordinates and volume.
yref, xref, href, wref = anchors_layer
ymin = yref - href / 2.
xmin = xref - wref / 2.
ymax = yref + href / 2.
xmax = xref + wref / 2.
vol_anchors = (xmax - xmin) * (ymax - ymin)
# Initialize tensors...
shape = (yref.shape[0], yref.shape[1], href.size)
feat_labels = tf.zeros(shape, dtype=tf.int64)
feat_scores = tf.zeros(shape, dtype=dtype)
feat_ymin = tf.zeros(shape, dtype=dtype)
feat_xmin = tf.zeros(shape, dtype=dtype)
feat_ymax = tf.ones(shape, dtype=dtype)
feat_xmax = tf.ones(shape, dtype=dtype)
def jaccard_with_anchors(bbox):
"""Compute jaccard score between a box and the anchors.
"""
int_ymin = tf.maximum(ymin, bbox[0])
int_xmin = tf.maximum(xmin, bbox[1])
int_ymax = tf.minimum(ymax, bbox[2])
int_xmax = tf.minimum(xmax, bbox[3])
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
# Volumes.
inter_vol = h * w
union_vol = vol_anchors - inter_vol \
+ (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
jaccard = tf.divide(inter_vol, union_vol)
return jaccard
def intersection_with_anchors(bbox):
"""Compute intersection between score a box and the anchors.
"""
int_ymin = tf.maximum(ymin, bbox[0])
int_xmin = tf.maximum(xmin, bbox[1])
int_ymax = tf.minimum(ymax, bbox[2])
int_xmax = tf.minimum(xmax, bbox[3])
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
inter_vol = h * w
scores = tf.divide(inter_vol, vol_anchors)
return scores
def condition(i, feat_labels, feat_scores,
feat_ymin, feat_xmin, feat_ymax, feat_xmax):
"""Condition: check label index.
"""
r = tf.less(i, tf.shape(labels))
return r[0]
def body(i, feat_labels, feat_scores,
feat_ymin, feat_xmin, feat_ymax, feat_xmax):
"""Body: update feature labels, scores and bboxes.
Follow the original SSD paper for that purpose:
- assign values when jaccard > 0.5;
- only update if beat the score of other bboxes.
"""
# Jaccard score.
label = labels[i]
bbox = bboxes[i]
jaccard = jaccard_with_anchors(bbox)
# Mask: check threshold + scores + no annotations + num_classes.
mask = tf.greater(jaccard, feat_scores)
# mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
mask = tf.logical_and(mask, feat_scores > -0.5)
mask = tf.logical_and(mask, label < num_classes)
imask = tf.cast(mask, tf.int64)
fmask = tf.cast(mask, dtype)
# Update values using mask.
feat_labels = imask * label + (1 - imask) * feat_labels
feat_scores = tf.where(mask, jaccard, feat_scores)
feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax
# Check no annotation label: ignore these anchors...
# interscts = intersection_with_anchors(bbox)
# mask = tf.logical_and(interscts > ignore_threshold,
# label == no_annotation_label)
# # Replace scores by -1.
# feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)
return [i+1, feat_labels, feat_scores,
feat_ymin, feat_xmin, feat_ymax, feat_xmax]
# Main loop definition.
i = 0
[i, feat_labels, feat_scores,
feat_ymin, feat_xmin,
feat_ymax, feat_xmax] = tf.while_loop(condition, body,
[i, feat_labels, feat_scores,
feat_ymin, feat_xmin,
feat_ymax, feat_xmax])
# Transform to center / size.
feat_cy = (feat_ymax + feat_ymin) / 2.
feat_cx = (feat_xmax + feat_xmin) / 2.
feat_h = feat_ymax - feat_ymin
feat_w = feat_xmax - feat_xmin
# Encode features.
feat_cy = (feat_cy - yref) / href / prior_scaling[0]
feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
feat_h = tf.log(feat_h / href) / prior_scaling[2]
feat_w = tf.log(feat_w / wref) / prior_scaling[3]
# Use SSD ordering: x / y / w / h instead of ours.
feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
return feat_labels, feat_localizations, feat_scores
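# Worked example (one anchor, one ground-truth box; illustration only):
# anchor yref = xref = 0.5, href = wref = 0.2, ground truth [0.4, 0.4, 0.6, 0.6]
#   -> cy = cx = 0.5, h = w = 0.2
#   -> t_cy = (0.5 - 0.5) / 0.2 / 0.1 = 0, t_h = log(0.2 / 0.2) / 0.2 = 0
# i.e. a box that coincides with its matched anchor encodes to all zeros.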
def tf_ssd_bboxes_encode(labels,
bboxes,
anchors,
num_classes,
ignore_threshold=0.5,
prior_scaling=[0.1, 0.1, 0.2, 0.2],
dtype=tf.float32,
scope='ssd_bboxes_encode'):
"""Encode groundtruth labels and bounding boxes using SSD net anchors.
Encoding boxes for all feature layers.
Arguments:
labels: 1D Tensor(int64) containing groundtruth labels;
bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
anchors: List of Numpy array with layer anchors;
ignore_threshold: Threshold for positive match with groundtruth bboxes;
prior_scaling: Scaling of encoded coordinates.
Return:
(target_labels, target_localizations, target_scores):
Each element is a list of target Tensors.
"""
with tf.name_scope(scope):
target_labels = []
target_localizations = []
target_scores = []
for i, anchors_layer in enumerate(anchors):
with tf.name_scope('bboxes_encode_block_%i' % i):
t_labels, t_loc, t_scores = \
tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
num_classes,
ignore_threshold,
prior_scaling, dtype)
target_labels.append(t_labels)
target_localizations.append(t_loc)
target_scores.append(t_scores)
return target_labels, target_localizations, target_scores
def tf_ssd_bboxes_decode_layer(feat_localizations,
anchors_layer,
prior_scaling=[0.1, 0.1, 0.2, 0.2]):
"""Compute the relative bounding boxes from the layer features and
reference anchor bounding boxes.
Arguments:
feat_localizations: Tensor containing localization features.
anchors: List of numpy array containing anchor boxes.
Return:
Tensor Nx4: ymin, xmin, ymax, xmax
"""
yref, xref, href, wref = anchors_layer
# Compute center, height and width
cx = feat_localizations[:, :, :, :, 0] * wref * prior_scaling[0] + xref
cy = feat_localizations[:, :, :, :, 1] * href * prior_scaling[1] + yref
w = wref * tf.exp(feat_localizations[:, :, :, :, 2] * prior_scaling[2])
h = href * tf.exp(feat_localizations[:, :, :, :, 3] * prior_scaling[3])
# Boxes coordinates.
ymin = cy - h / 2.
xmin = cx - w / 2.
ymax = cy + h / 2.
xmax = cx + w / 2.
bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=-1)
return bboxes
def tf_ssd_bboxes_decode(feat_localizations,
anchors,
prior_scaling=[0.1, 0.1, 0.2, 0.2],
scope='ssd_bboxes_decode'):
"""Compute the relative bounding boxes from the SSD net features and
reference anchors bounding boxes.
Arguments:
feat_localizations: List of Tensors containing localization features.
anchors: List of numpy array containing anchor boxes.
Return:
List of Tensors Nx4: ymin, xmin, ymax, xmax
"""
with tf.name_scope(scope):
bboxes = []
for i, anchors_layer in enumerate(anchors):
bboxes.append(
tf_ssd_bboxes_decode_layer(feat_localizations[i],
anchors_layer,
prior_scaling))
return bboxes
# =========================================================================== #
# SSD boxes selection.
# =========================================================================== #
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
select_threshold=None,
num_classes=21,
ignore_class=0,
scope=None):
"""Extract classes, scores and bounding boxes from features in one layer.
Batch-compatible: inputs are supposed to have batch-type shapes.
Args:
predictions_layer: A SSD prediction layer;
localizations_layer: A SSD localization layer;
select_threshold: Classification threshold for selecting a box. All boxes
under the threshold are set to 'zero'. If None, no threshold applied.
Return:
d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
size Batches X N x 1 | 4. Each key corresponding to a class.
"""
select_threshold = 0.0 if select_threshold is None else select_threshold
with tf.name_scope(scope, 'ssd_bboxes_select_layer',
[predictions_layer, localizations_layer]):
# Reshape features: Batches x N x N_labels | 4
p_shape = tfe.get_shape(predictions_layer)
predictions_layer = tf.reshape(predictions_layer,
tf.stack([p_shape[0], -1, p_shape[-1]]))
l_shape = tfe.get_shape(localizations_layer)
localizations_layer = tf.reshape(localizations_layer,
tf.stack([l_shape[0], -1, l_shape[-1]]))
d_scores = {}
d_bboxes = {}
for c in range(0, num_classes):
if c != ignore_class:
# Remove boxes under the threshold.
scores = predictions_layer[:, :, c]
fmask = tf.cast(tf.greater_equal(scores, select_threshold), scores.dtype)
scores = scores * fmask
bboxes = localizations_layer * tf.expand_dims(fmask, axis=-1)
# Append to dictionary.
d_scores[c] = scores
d_bboxes[c] = bboxes
return d_scores, d_bboxes
def tf_ssd_bboxes_select(predictions_net, localizations_net,
select_threshold=None,
num_classes=21,
ignore_class=0,
scope=None):
"""Extract classes, scores and bounding boxes from network output layers.
Batch-compatible: inputs are supposed to have batch-type shapes.
Args:
predictions_net: List of SSD prediction layers;
localizations_net: List of localization layers;
select_threshold: Classification threshold for selecting a box. All boxes
under the threshold are set to 'zero'. If None, no threshold applied.
Return:
d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
size Batches X N x 1 | 4. Each key corresponding to a class.
"""
with tf.name_scope(scope, 'ssd_bboxes_select',
[predictions_net, localizations_net]):
l_scores = []
l_bboxes = []
for i in range(len(predictions_net)):
scores, bboxes = tf_ssd_bboxes_select_layer(predictions_net[i],
localizations_net[i],
select_threshold,
num_classes,
ignore_class)
l_scores.append(scores)
l_bboxes.append(bboxes)
# Concat results.
d_scores = {}
d_bboxes = {}
for c in l_scores[0].keys():
ls = [s[c] for s in l_scores]
lb = [b[c] for b in l_bboxes]
d_scores[c] = tf.concat(ls, axis=1)
d_bboxes[c] = tf.concat(lb, axis=1)
return d_scores, d_bboxes
def tf_ssd_bboxes_select_layer_all_classes(predictions_layer, localizations_layer,
select_threshold=None):
"""Extract classes, scores and bounding boxes from features in one layer.
Batch-compatible: inputs are supposed to have batch-type shapes.
Args:
predictions_layer: A SSD prediction layer;
localizations_layer: A SSD localization layer;
select_threshold: Classification threshold for selecting a box. If None,
select boxes whose classification score is higher than 'no class'.
Return:
classes, scores, bboxes: Input Tensors.
"""
# Reshape features: Batches x N x N_labels | 4
p_shape = tfe.get_shape(predictions_layer)
predictions_layer = tf.reshape(predictions_layer,
tf.stack([p_shape[0], -1, p_shape[-1]]))
l_shape = tfe.get_shape(localizations_layer)
localizations_layer = tf.reshape(localizations_layer,
tf.stack([l_shape[0], -1, l_shape[-1]]))
# Boxes selection: use threshold or score > no-label criteria.
if select_threshold is None or select_threshold == 0:
# Class prediction and scores: assign 0. to 0-class
classes = tf.argmax(predictions_layer, axis=2)
scores = tf.reduce_max(predictions_layer, axis=2)
scores = scores * tf.cast(classes > 0, scores.dtype)
else:
sub_predictions = predictions_layer[:, :, 1:]
classes = tf.argmax(sub_predictions, axis=2) + 1
scores = tf.reduce_max(sub_predictions, axis=2)
# Only keep predictions higher than threshold.
mask = tf.greater(scores, select_threshold)
classes = classes * tf.cast(mask, classes.dtype)
scores = scores * tf.cast(mask, scores.dtype)
# Assume localization layer already decoded.
bboxes = localizations_layer
return classes, scores, bboxes
def tf_ssd_bboxes_select_all_classes(predictions_net, localizations_net,
select_threshold=None,
scope=None):
"""Extract classes, scores and bounding boxes from network output layers.
Batch-compatible: inputs are supposed to have batch-type shapes.
Args:
predictions_net: List of SSD prediction layers;
localizations_net: List of localization layers;
select_threshold: Classification threshold for selecting a box. If None,
select boxes whose classification score is higher than 'no class'.
Return:
classes, scores, bboxes: Tensors.
"""
with tf.name_scope(scope, 'ssd_bboxes_select',
[predictions_net, localizations_net]):
l_classes = []
l_scores = []
l_bboxes = []
for i in range(len(predictions_net)):
classes, scores, bboxes = \
tf_ssd_bboxes_select_layer_all_classes(predictions_net[i],
localizations_net[i],
select_threshold)
l_classes.append(classes)
l_scores.append(scores)
l_bboxes.append(bboxes)
classes = tf.concat(l_classes, axis=1)
scores = tf.concat(l_scores, axis=1)
bboxes = tf.concat(l_bboxes, axis=1)
return classes, scores, bboxes

View File

@@ -0,0 +1,660 @@
# Copyright 2016 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Definition of 300 VGG-based SSD network.
This model was initially introduced in:
SSD: Single Shot MultiBox Detector
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander C. Berg
https://arxiv.org/abs/1512.02325
Two variants of the model are defined: the 300x300 and 512x512 models, the
latter obtaining a slightly better accuracy on Pascal VOC.
Usage:
with slim.arg_scope(ssd_vgg.ssd_vgg()):
outputs, end_points = ssd_vgg.ssd_vgg(inputs)
This network is a port of the original Caffe model. The padding in TF and
Caffe is slightly different, and can lead to a severe accuracy drop if not
handled correctly!
In Caffe, the output size of convolution and pooling layers is computed as
follows: h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1
There is, however, a subtle difference between the two for stride > 1. In
the case of convolution:
top_size = floor((bottom_size + 2*pad - kernel_size) / stride) + 1
whereas for pooling:
top_size = ceil((bottom_size + 2*pad - kernel_size) / stride) + 1
Pooling thus implicitly allows some additional padding even if pad = 0. This
behaviour explains why pooling with stride and kernel of size 2 behaves the
same way in TensorFlow and Caffe.
This is no longer the case for other kernel sizes, which motivates the use of
a special padding layer to control these side effects.
@@ssd_vgg_300
"""
import math
from collections import namedtuple
import numpy as np
import tensorflow as tf
import tfextended as tfe
from model import custom_layers, ssd_common
slim = tf.contrib.slim
# =========================================================================== #
# SSD class definition.
# =========================================================================== #
SSDParams = namedtuple('SSDParameters', ['img_shape',
'num_classes',
'no_annotation_label',
'feat_layers',
'feat_shapes',
'anchor_size_bounds',
'anchor_sizes',
'anchor_ratios',
'anchor_steps',
'anchor_offset',
'normalizations',
'prior_scaling'
])
class SSDNet(object):
"""Implementation of the SSD VGG-based 300 network.
The default features layers with 300x300 image input are:
conv4 ==> 38 x 38
conv7 ==> 19 x 19
conv8 ==> 10 x 10
conv9 ==> 5 x 5
conv10 ==> 3 x 3
conv11 ==> 1 x 1
The default image size used to train this network is 300x300.
"""
default_params = SSDParams(
img_shape=(300, 300),
num_classes=21,
no_annotation_label=21,
feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
feat_shapes=[(37, 37), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
anchor_size_bounds=[0.15, 0.90],
# anchor_size_bounds=[0.20, 0.90],
anchor_sizes=[(21., 45.),
(45., 99.),
(99., 153.),
(153., 207.),
(207., 261.),
(261., 315.)],
# anchor_sizes=[(30., 60.),
# (60., 111.),
# (111., 162.),
# (162., 213.),
# (213., 264.),
# (264., 315.)],
anchor_ratios=[[2, .5],
[2, .5, 3, 1./3],
[2, .5, 3, 1./3],
[2, .5, 3, 1./3],
[2, .5],
[2, .5]],
anchor_steps=[8, 16, 32, 64, 100, 300],
anchor_offset=0.5,
normalizations=[20, -1, -1, -1, -1, -1],
prior_scaling=[0.1, 0.1, 0.2, 0.2]
)
def __init__(self, params=None):
"""Init the SSD net with some parameters. Use the default ones
if none provided.
"""
if isinstance(params, SSDParams):
self.params = params
else:
self.params = SSDNet.default_params
# ======================================================================= #
def net(self, inputs,
is_training=True,
update_feat_shapes=True,
dropout_keep_prob=0.5,
prediction_fn=slim.softmax,
reuse=None,
scope='ssd_300_vgg'):
"""SSD network definition.
"""
r = ssd_net(inputs,
num_classes=self.params.num_classes,
feat_layers=self.params.feat_layers,
anchor_sizes=self.params.anchor_sizes,
anchor_ratios=self.params.anchor_ratios,
normalizations=self.params.normalizations,
is_training=is_training,
dropout_keep_prob=dropout_keep_prob,
prediction_fn=prediction_fn,
reuse=reuse,
scope=scope)
# Update feature shapes (try at least!)
if update_feat_shapes:
shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
self.params = self.params._replace(feat_shapes=shapes)
return r
def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
"""Network arg_scope.
"""
return ssd_arg_scope(weight_decay, data_format=data_format)
def arg_scope_caffe(self, caffe_scope):
"""Caffe arg_scope used for weights importing.
"""
return ssd_arg_scope_caffe(caffe_scope)
# ======================================================================= #
def update_feature_shapes(self, predictions):
"""Update feature shapes from predictions collection (Tensor or Numpy
array).
"""
shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
self.params = self.params._replace(feat_shapes=shapes)
def anchors(self, img_shape, dtype=np.float32):
"""Compute the default anchor boxes, given an image shape.
"""
return ssd_anchors_all_layers(img_shape,
self.params.feat_shapes,
self.params.anchor_sizes,
self.params.anchor_ratios,
self.params.anchor_steps,
self.params.anchor_offset,
dtype)
def bboxes_encode(self, labels, bboxes, anchors,
scope=None):
"""Encode labels and bounding boxes.
"""
return ssd_common.tf_ssd_bboxes_encode(
labels, bboxes, anchors,
self.params.num_classes,
ignore_threshold=0.5,
prior_scaling=self.params.prior_scaling,
scope=scope)
def bboxes_decode(self, feat_localizations, anchors,
scope='ssd_bboxes_decode'):
"""Encode labels and bounding boxes.
"""
return ssd_common.tf_ssd_bboxes_decode(
feat_localizations, anchors,
prior_scaling=self.params.prior_scaling,
scope=scope)
def detected_bboxes(self, predictions, localisations,
select_threshold=None, nms_threshold=0.5,
clipping_bbox=None, top_k=400, keep_top_k=200):
"""Get the detected bounding boxes from the SSD network output.
"""
# Select top_k bboxes from predictions, and clip
rscores, rbboxes = \
ssd_common.tf_ssd_bboxes_select(predictions, localisations,
select_threshold=select_threshold,
num_classes=self.params.num_classes)
rscores, rbboxes = \
tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
# Apply NMS algorithm.
rscores, rbboxes = \
tfe.bboxes_nms_batch(rscores, rbboxes,
nms_threshold=nms_threshold,
keep_top_k=keep_top_k)
if clipping_bbox is not None:
rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
return rscores, rbboxes
def losses(self, logits, localisations,
gclasses, glocalisations, gscores,
match_threshold=0.5,
negative_ratio=3.,
alpha=1.,
label_smoothing=0.,
scope='ssd_losses'):
"""Define the SSD network losses.
"""
return ssd_losses(logits, localisations,
gclasses, glocalisations, gscores,
match_threshold=match_threshold,
negative_ratio=negative_ratio,
alpha=alpha,
label_smoothing=label_smoothing,
scope=scope)
# =========================================================================== #
# SSD tools...
# =========================================================================== #
def ssd_size_bounds_to_values(size_bounds,
n_feat_layers,
img_shape=(300, 300)):
"""Compute the reference sizes of the anchor boxes from relative bounds.
The absolute values are measured in pixels, based on the network
default size (300 pixels).
This function follows the computation performed in the original
implementation of SSD in Caffe.
Return:
list of lists containing the absolute sizes at each scale. For each scale,
the ratios only apply to the first value.
"""
assert img_shape[0] == img_shape[1]
img_size = img_shape[0]
min_ratio = int(size_bounds[0] * 100)
max_ratio = int(size_bounds[1] * 100)
step = int(math.floor((max_ratio - min_ratio) / (n_feat_layers - 2)))
# Start with the following smallest sizes.
sizes = [[img_size * size_bounds[0] / 2, img_size * size_bounds[0]]]
for ratio in range(min_ratio, max_ratio + 1, step):
sizes.append((img_size * ratio / 100.,
img_size * (ratio + step) / 100.))
return sizes
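# Worked example (defaults; illustration only): size_bounds=[0.15, 0.90],
# n_feat_layers=6 and a 300x300 image give:
#   min_ratio = 15, max_ratio = 90, step = floor(75 / 4) = 18
#   -> sizes = [[22.5, 45.], (45., 99.), (99., 153.), (153., 207.),
#               (207., 261.), (261., 315.)]
# which reproduces default_params.anchor_sizes above, except that the
# smallest size is hand-tuned to 21. in the defaults.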
def ssd_feat_shapes_from_net(predictions, default_shapes=None):
"""Try to obtain the feature shapes from the prediction layers. The latter
can be either a Tensor or Numpy ndarray.
Return:
list of feature shapes. Default values if predictions shape not fully
determined.
"""
feat_shapes = []
for l in predictions:
# Get the shape, from either a np array or a tensor.
if isinstance(l, np.ndarray):
shape = l.shape
else:
shape = l.get_shape().as_list()
shape = shape[1:4]
# Problem: undetermined shape...
if None in shape:
return default_shapes
else:
feat_shapes.append(shape)
return feat_shapes
def ssd_anchor_one_layer(img_shape,
feat_shape,
sizes,
ratios,
step,
offset=0.5,
dtype=np.float32):
"""Computer SSD default anchor boxes for one feature layer.
Determine the relative position grid of the centers, and the relative
width and height.
Arguments:
feat_shape: Feature shape, used for computing relative position grids;
sizes: Absolute reference sizes;
ratios: Ratios to use on these features;
img_shape: Image shape, used for computing height, width relatively to the
former;
offset: Grid offset.
Return:
y, x, h, w: Relative x and y grids, and height and width.
"""
# Compute the position grid: simple way.
# y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
# y = (y.astype(dtype) + offset) / feat_shape[0]
# x = (x.astype(dtype) + offset) / feat_shape[1]
# Weird SSD-Caffe computation using steps values...
y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
y = (y.astype(dtype) + offset) * step / img_shape[0]
x = (x.astype(dtype) + offset) * step / img_shape[1]
# Expand dims to support easy broadcasting.
y = np.expand_dims(y, axis=-1)
x = np.expand_dims(x, axis=-1)
# Compute relative height and width.
# Tries to follow the original implementation of SSD for the order.
num_anchors = len(sizes) + len(ratios)
h = np.zeros((num_anchors, ), dtype=dtype)
w = np.zeros((num_anchors, ), dtype=dtype)
# Add first anchor boxes with ratio=1.
h[0] = sizes[0] / img_shape[0]
w[0] = sizes[0] / img_shape[1]
di = 1
if len(sizes) > 1:
h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
di += 1
for i, r in enumerate(ratios):
h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
return y, x, h, w
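# Worked example (block4 defaults; illustration only): feat_shape=(37, 37),
# sizes=(21., 45.), ratios=[2, .5], step=8 give num_anchors = 4 and:
#   h[0] = w[0] = 21/300 = 0.07
#   h[1] = w[1] = sqrt(21 * 45)/300 ~= 0.1025
#   h[2] = 0.07/sqrt(2) ~= 0.0495, w[2] = 0.07*sqrt(2) ~= 0.0990  (r = 2)
#   h[3] ~= 0.0990,                w[3] ~= 0.0495                 (r = 0.5)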
def ssd_anchors_all_layers(img_shape,
layers_shape,
anchor_sizes,
anchor_ratios,
anchor_steps,
offset=0.5,
dtype=np.float32):
"""Compute anchor boxes for all feature layers.
"""
layers_anchors = []
for i, s in enumerate(layers_shape):
anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
anchor_sizes[i],
anchor_ratios[i],
anchor_steps[i],
offset=offset, dtype=dtype)
layers_anchors.append(anchor_bboxes)
return layers_anchors
# =========================================================================== #
# Functional definition of VGG-based SSD 300.
# =========================================================================== #
def tensor_shape(x, rank=3):
"""Returns the dimensions of a tensor.
Args:
x: An N-D Tensor.
Returns:
A list of dimensions. Dimensions that are statically known are python
integers, otherwise they are integer scalar tensors.
"""
if x.get_shape().is_fully_defined():
return x.get_shape().as_list()
else:
static_shape = x.get_shape().with_rank(rank).as_list()
dynamic_shape = tf.unstack(tf.shape(x), rank)
return [s if s is not None else d
for s, d in zip(static_shape, dynamic_shape)]
def ssd_multibox_layer(inputs,
num_classes,
sizes,
ratios=[1],
normalization=-1,
bn_normalization=False):
"""Construct a multibox layer, return a class and localization predictions.
"""
net = inputs
if normalization > 0:
net = custom_layers.l2_normalization(net, scaling=True)
# Number of anchors.
num_anchors = len(sizes) + len(ratios)
# Location.
num_loc_pred = num_anchors * 4
loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
scope='conv_loc')
loc_pred = custom_layers.channel_to_last(loc_pred)
loc_pred = tf.reshape(loc_pred,
tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])
# Class prediction.
num_cls_pred = num_anchors * num_classes
cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
scope='conv_cls')
cls_pred = custom_layers.channel_to_last(cls_pred)
cls_pred = tf.reshape(cls_pred,
tensor_shape(cls_pred, 4)[:-1]+[num_anchors, num_classes])
return cls_pred, loc_pred
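# Shape sketch (block4 defaults; illustration only): with 4 anchors and
# 21 classes, an N x H x W x 512 input yields
#   loc_pred: [N, H, W, 4, 4] and cls_pred: [N, H, W, 4, 21].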
def ssd_net(inputs,
num_classes=SSDNet.default_params.num_classes,
feat_layers=SSDNet.default_params.feat_layers,
anchor_sizes=SSDNet.default_params.anchor_sizes,
anchor_ratios=SSDNet.default_params.anchor_ratios,
normalizations=SSDNet.default_params.normalizations,
is_training=True,
dropout_keep_prob=0.5,
prediction_fn=slim.softmax,
reuse=None,
scope='ssd_300_vgg'):
"""SSD net definition.
"""
# if data_format == 'NCHW':
# inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))
# End_points collect relevant activations for external use.
end_points = {}
with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
# Original VGG-16 blocks.
net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
end_points['block1'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool1')
# Block 2.
net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
end_points['block2'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool2')
# Block 3.
net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
end_points['block3'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool3')
# Block 4.
net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
end_points['block4'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool4')
# Block 5.
net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
end_points['block5'] = net
net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')
# Additional SSD blocks.
# Block 6: 3x3 convolution with a dilation rate of 6.
net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
end_points['block6'] = net
# tf.layers.dropout takes a *drop* rate, so convert from the keep probability.
net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob, training=is_training)
# Block 7: 1x1 convolution.
net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
end_points['block7'] = net
net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob, training=is_training)
# Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
end_point = 'block8'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
net = custom_layers.pad2d(net, pad=(1, 1))
net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
end_points[end_point] = net
end_point = 'block9'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
net = custom_layers.pad2d(net, pad=(1, 1))
net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
end_points[end_point] = net
end_point = 'block10'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
end_points[end_point] = net
end_point = 'block11'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
end_points[end_point] = net
# Prediction and localisations layers.
predictions = []
logits = []
localisations = []
for i, layer in enumerate(feat_layers):
with tf.variable_scope(layer + '_box'):
p, l = ssd_multibox_layer(end_points[layer],
num_classes,
anchor_sizes[i],
anchor_ratios[i],
normalizations[i])
predictions.append(prediction_fn(p))
logits.append(p)
localisations.append(l)
return predictions, localisations, logits, end_points
ssd_net.default_image_size = 300
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
"""Defines the VGG arg scope.
Args:
weight_decay: The l2 regularization coefficient.
Returns:
An arg_scope.
"""
with slim.arg_scope([slim.conv2d, slim.fully_connected],
activation_fn=tf.nn.relu,
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=tf.contrib.layers.xavier_initializer(),
biases_initializer=tf.zeros_initializer()):
with slim.arg_scope([slim.conv2d, slim.max_pool2d],
padding='SAME',
data_format=data_format):
with slim.arg_scope([custom_layers.pad2d,
custom_layers.l2_normalization,
custom_layers.channel_to_last],
data_format=data_format) as sc:
return sc
# =========================================================================== #
# Caffe scope: importing weights at initialization.
# =========================================================================== #
def ssd_arg_scope_caffe(caffe_scope):
"""Caffe scope definition.
Args:
caffe_scope: Caffe scope object with loaded weights.
Returns:
An arg_scope.
"""
# Default network arg scope.
with slim.arg_scope([slim.conv2d],
activation_fn=tf.nn.relu,
weights_initializer=caffe_scope.conv_weights_init(),
biases_initializer=caffe_scope.conv_biases_init()):
with slim.arg_scope([slim.fully_connected],
activation_fn=tf.nn.relu):
with slim.arg_scope([custom_layers.l2_normalization],
scale_initializer=caffe_scope.l2_norm_scale_init()):
with slim.arg_scope([slim.conv2d, slim.max_pool2d],
padding='SAME') as sc:
return sc
# =========================================================================== #
# SSD loss function.
# =========================================================================== #
def ssd_losses(logits, localisations,
gclasses, glocalisations, gscores,
match_threshold=0.5,
negative_ratio=3.,
alpha=1.,
label_smoothing=0.,
device='/cpu:0',
scope=None):
with tf.name_scope(scope, 'ssd_losses'):
lshape = tfe.get_shape(logits[0], 5)
num_classes = lshape[-1]
batch_size = lshape[0]
# Flatten out all vectors!
flogits = []
fgclasses = []
fgscores = []
flocalisations = []
fglocalisations = []
for i in range(len(logits)):
flogits.append(tf.reshape(logits[i], [-1, num_classes]))
fgclasses.append(tf.reshape(gclasses[i], [-1]))
fgscores.append(tf.reshape(gscores[i], [-1]))
flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
# Concatenate everything into single tensors.
logits = tf.concat(flogits, axis=0)
gclasses = tf.concat(fgclasses, axis=0)
gscores = tf.concat(fgscores, axis=0)
localisations = tf.concat(flocalisations, axis=0)
glocalisations = tf.concat(fglocalisations, axis=0)
dtype = logits.dtype
# Compute positive matching mask...
pmask = gscores > match_threshold
fpmask = tf.cast(pmask, dtype)
n_positives = tf.reduce_sum(fpmask)
# Hard negative mining...
no_classes = tf.cast(pmask, tf.int32)
predictions = slim.softmax(logits)
nmask = tf.logical_and(tf.logical_not(pmask),
gscores > -0.5)
fnmask = tf.cast(nmask, dtype)
nvalues = tf.where(nmask,
predictions[:, 0],
1. - fnmask)
nvalues_flat = tf.reshape(nvalues, [-1])
# Number of negative entries to select.
max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
n_neg = tf.minimum(n_neg, max_neg_entries)
val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
max_hard_pred = -val[-1]
# Final negative mask.
nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
fnmask = tf.cast(nmask, dtype)
batch_float = tf.cast(batch_size, tf.float32)
# Add cross-entropy loss.
with tf.name_scope('cross_entropy_pos'):
cross_entropy_pos_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
labels=gclasses)
cross_entropy_pos_loss = tf.divide(tf.reduce_sum(cross_entropy_pos_loss * fpmask), batch_float, name='value')
tf.losses.add_loss(cross_entropy_pos_loss)
with tf.name_scope('cross_entropy_neg'):
cross_entropy_neg_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
labels=no_classes)
cross_entropy_neg_loss = tf.divide(tf.reduce_sum(cross_entropy_neg_loss * fnmask), batch_float, name='value')
tf.losses.add_loss(cross_entropy_neg_loss)
# Add localization loss: smooth L1, L2, ...
with tf.name_scope('localization'):
# Weights Tensor: positive mask + random negative.
weights = tf.expand_dims(alpha * fpmask, axis=-1)
localization_loss = custom_layers.abs_smooth(localisations - glocalisations)
localization_loss = tf.divide(tf.reduce_sum(localization_loss * weights), batch_float, name='value')
tf.losses.add_loss(localization_loss)
regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
all_losses = [cross_entropy_neg_loss, cross_entropy_pos_loss, localization_loss] + (regularization_losses if regularization_losses else [])
return tf.add_n(all_losses)
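# Worked example of the hard-negative budget above (illustration only):
# with batch_size = 2, negative_ratio = 3 and 10 positive anchors,
# n_neg = 3 * 10 + 2 = 32 negatives are kept (capped by the number of
# candidates with score > -0.5), chosen as the hardest ones, i.e. those
# with the lowest background confidence predictions[:, 0].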

View File

@@ -0,0 +1,24 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TF Extended: additional metrics.
"""
# pylint: disable=unused-import,line-too-long,g-importing-member,wildcard-import
from tfextended.metrics import *
from tfextended.tensors import *
from tfextended.bboxes import *
from tfextended.image import *
from tfextended.math import *

View File

@@ -0,0 +1,508 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TF Extended: additional bounding boxes methods.
"""
import numpy as np
import tensorflow as tf
from tfextended import tensors as tfe_tensors
from tfextended import math as tfe_math
# =========================================================================== #
# Standard boxes algorithms.
# =========================================================================== #
def bboxes_sort_all_classes(classes, scores, bboxes, top_k=400, scope=None):
"""Sort bounding boxes by decreasing order and keep only the top_k.
Assume the input Tensors mix up objects with different classes.
Assume a batch-type input.
Args:
classes: Batch x N Tensor containing integer classes.
scores: Batch x N Tensor containing float scores.
bboxes: Batch x N x 4 Tensor containing boxes coordinates.
top_k: Top_k boxes to keep.
Return:
classes, scores, bboxes: Sorted tensors of shape Batch x Top_k.
"""
with tf.name_scope(scope, 'bboxes_sort', [classes, scores, bboxes]):
scores, idxes = tf.nn.top_k(scores, k=top_k, sorted=True)
# Trick to be able to use tf.gather: map for each element in the batch.
def fn_gather(classes, bboxes, idxes):
cl = tf.gather(classes, idxes)
bb = tf.gather(bboxes, idxes)
return [cl, bb]
r = tf.map_fn(lambda x: fn_gather(x[0], x[1], x[2]),
[classes, bboxes, idxes],
dtype=[classes.dtype, bboxes.dtype],
parallel_iterations=10,
back_prop=False,
swap_memory=False,
infer_shape=True)
classes = r[0]
bboxes = r[1]
return classes, scores, bboxes
def bboxes_sort(scores, bboxes, top_k=400, scope=None):
"""Sort bounding boxes by decreasing order and keep only the top_k.
If inputs are dictionaries, assume every key is a different class.
Assume a batch-type input.
Args:
scores: Batch x N Tensor/Dictionary containing float scores.
bboxes: Batch x N x 4 Tensor/Dictionary containing boxes coordinates.
top_k: Top_k boxes to keep.
Return:
scores, bboxes: Sorted Tensors/Dictionaries of shape Batch x Top_k x 1|4.
"""
# Dictionaries as inputs.
if isinstance(scores, dict) or isinstance(bboxes, dict):
with tf.name_scope(scope, 'bboxes_sort_dict'):
d_scores = {}
d_bboxes = {}
for c in scores.keys():
s, b = bboxes_sort(scores[c], bboxes[c], top_k=top_k)
d_scores[c] = s
d_bboxes[c] = b
return d_scores, d_bboxes
# Tensors inputs.
with tf.name_scope(scope, 'bboxes_sort', [scores, bboxes]):
# Sort scores...
scores, idxes = tf.nn.top_k(scores, k=top_k, sorted=True)
# Trick to be able to use tf.gather: map for each element in the first dim.
def fn_gather(bboxes, idxes):
bb = tf.gather(bboxes, idxes)
return [bb]
r = tf.map_fn(lambda x: fn_gather(x[0], x[1]),
[bboxes, idxes],
dtype=[bboxes.dtype],
parallel_iterations=10,
back_prop=False,
swap_memory=False,
infer_shape=True)
bboxes = r[0]
return scores, bboxes
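# Example (illustrative): per-class dictionaries keep the top_k
# highest-scoring boxes for every class independently.
#   d_scores = {1: tf.constant([[0.1, 0.9, 0.5]])}
#   d_bboxes = {1: tf.random_uniform([1, 3, 4])}
#   d_scores, d_bboxes = bboxes_sort(d_scores, d_bboxes, top_k=2)
#   # d_scores[1] -> [[0.9, 0.5]], with boxes gathered accordingly.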
def bboxes_clip(bbox_ref, bboxes, scope=None):
"""Clip bounding boxes to a reference box.
Batch-compatible if the first dimension of `bbox_ref` and `bboxes`
can be broadcasted.
Args:
bbox_ref: Reference bounding box. Nx4 or 4 shaped-Tensor;
bboxes: Bounding boxes to clip. Nx4 or 4 shaped-Tensor or dictionary.
Return:
Clipped bboxes.
"""
# Bboxes is dictionary.
if isinstance(bboxes, dict):
with tf.name_scope(scope, 'bboxes_clip_dict'):
d_bboxes = {}
for c in bboxes.keys():
d_bboxes[c] = bboxes_clip(bbox_ref, bboxes[c])
return d_bboxes
# Tensors inputs.
with tf.name_scope(scope, 'bboxes_clip'):
# Easier with transposed bboxes. Especially for broadcasting.
bbox_ref = tf.transpose(bbox_ref)
bboxes = tf.transpose(bboxes)
# Intersection bboxes and reference bbox.
ymin = tf.maximum(bboxes[0], bbox_ref[0])
xmin = tf.maximum(bboxes[1], bbox_ref[1])
ymax = tf.minimum(bboxes[2], bbox_ref[2])
xmax = tf.minimum(bboxes[3], bbox_ref[3])
        # Ensure well-formed (possibly empty) boxes when there is no intersection.
        ymin = tf.minimum(ymin, ymax)
        xmin = tf.minimum(xmin, xmax)
bboxes = tf.transpose(tf.stack([ymin, xmin, ymax, xmax], axis=0))
return bboxes
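# Example (illustrative): clipping a box to the unit square.
#   ref = tf.constant([0., 0., 1., 1.])
#   boxes = tf.constant([[-0.1, 0.2, 0.5, 1.3]])
#   bboxes_clip(ref, boxes)  # -> [[0., 0.2, 0.5, 1.]]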
def bboxes_resize(bbox_ref, bboxes, name=None):
"""Resize bounding boxes based on a reference bounding box,
assuming that the latter is [0, 0, 1, 1] after transform. Useful for
updating a collection of boxes after cropping an image.
"""
# Bboxes is dictionary.
if isinstance(bboxes, dict):
with tf.name_scope(name, 'bboxes_resize_dict'):
d_bboxes = {}
for c in bboxes.keys():
d_bboxes[c] = bboxes_resize(bbox_ref, bboxes[c])
return d_bboxes
# Tensors inputs.
with tf.name_scope(name, 'bboxes_resize'):
# Translate.
v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]])
bboxes = bboxes - v
# Scale.
s = tf.stack([bbox_ref[2] - bbox_ref[0],
bbox_ref[3] - bbox_ref[1],
bbox_ref[2] - bbox_ref[0],
bbox_ref[3] - bbox_ref[1]])
bboxes = bboxes / s
return bboxes
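# Example (illustrative): after cropping an image to bbox_ref, re-express a
# box in the coordinates of the crop.
#   ref = tf.constant([0.2, 0.2, 0.7, 0.7])
#   bboxes_resize(ref, tf.constant([[0.45, 0.45, 0.7, 0.7]]))
#   # -> [[0.5, 0.5, 1., 1.]]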
def bboxes_nms(scores, bboxes, nms_threshold=0.5, keep_top_k=200, scope=None):
"""Apply non-maximum selection to bounding boxes. In comparison to TF
implementation, use classes information for matching.
Should only be used on single-entries. Use batch version otherwise.
Args:
scores: N Tensor containing float scores.
bboxes: N x 4 Tensor containing boxes coordinates.
nms_threshold: Matching threshold in NMS algorithm;
keep_top_k: Number of total object to keep after NMS.
    Return:
      scores, bboxes: Tensors sorted by score, padded with zeros if necessary.
"""
with tf.name_scope(scope, 'bboxes_nms_single', [scores, bboxes]):
# Apply NMS algorithm.
idxes = tf.image.non_max_suppression(bboxes, scores,
keep_top_k, nms_threshold)
scores = tf.gather(scores, idxes)
bboxes = tf.gather(bboxes, idxes)
# Pad results.
scores = tfe_tensors.pad_axis(scores, 0, keep_top_k, axis=0)
bboxes = tfe_tensors.pad_axis(bboxes, 0, keep_top_k, axis=0)
return scores, bboxes
def bboxes_nms_batch(scores, bboxes, nms_threshold=0.5, keep_top_k=200,
scope=None):
"""Apply non-maximum selection to bounding boxes. In comparison to TF
implementation, use classes information for matching.
Use only on batched-inputs. Use zero-padding in order to batch output
results.
Args:
scores: Batch x N Tensor/Dictionary containing float scores.
bboxes: Batch x N x 4 Tensor/Dictionary containing boxes coordinates.
nms_threshold: Matching threshold in NMS algorithm;
keep_top_k: Number of total object to keep after NMS.
Return:
scores, bboxes Tensors/Dictionaries, sorted by score.
Padded with zero if necessary.
"""
# Dictionaries as inputs.
if isinstance(scores, dict) or isinstance(bboxes, dict):
with tf.name_scope(scope, 'bboxes_nms_batch_dict'):
d_scores = {}
d_bboxes = {}
for c in scores.keys():
s, b = bboxes_nms_batch(scores[c], bboxes[c],
nms_threshold=nms_threshold,
keep_top_k=keep_top_k)
d_scores[c] = s
d_bboxes[c] = b
return d_scores, d_bboxes
# Tensors inputs.
with tf.name_scope(scope, 'bboxes_nms_batch'):
r = tf.map_fn(lambda x: bboxes_nms(x[0], x[1],
nms_threshold, keep_top_k),
(scores, bboxes),
dtype=(scores.dtype, bboxes.dtype),
parallel_iterations=10,
back_prop=False,
swap_memory=False,
infer_shape=True)
scores, bboxes = r
return scores, bboxes
# def bboxes_fast_nms(classes, scores, bboxes,
# nms_threshold=0.5, eta=3., num_classes=21,
# pad_output=True, scope=None):
# with tf.name_scope(scope, 'bboxes_fast_nms',
# [classes, scores, bboxes]):
# nms_classes = tf.zeros((0,), dtype=classes.dtype)
# nms_scores = tf.zeros((0,), dtype=scores.dtype)
# nms_bboxes = tf.zeros((0, 4), dtype=bboxes.dtype)
def bboxes_matching(label, scores, bboxes,
glabels, gbboxes, gdifficults,
matching_threshold=0.5, scope=None):
"""Matching a collection of detected boxes with groundtruth values.
Does not accept batched-inputs.
The algorithm goes as follows: for every detected box, check
if one grountruth box is matching. If none, then considered as False Positive.
If the grountruth box is already matched with another one, it also counts
as a False Positive. We refer the Pascal VOC documentation for the details.
Args:
rclasses, rscores, rbboxes: N(x4) Tensors. Detected objects, sorted by score;
glabels, gbboxes: Groundtruth bounding boxes. May be zero padded, hence
zero-class objects are ignored.
matching_threshold: Threshold for a positive match.
Return: Tuple of:
n_gbboxes: Scalar Tensor with number of groundtruth boxes (may difer from
size because of zero padding).
tp_match: (N,)-shaped boolean Tensor containing with True Positives.
fp_match: (N,)-shaped boolean Tensor containing with False Positives.
"""
with tf.name_scope(scope, 'bboxes_matching_single',
[scores, bboxes, glabels, gbboxes]):
rsize = tf.size(scores)
rshape = tf.shape(scores)
rlabel = tf.cast(label, glabels.dtype)
# Number of groundtruth boxes.
gdifficults = tf.cast(gdifficults, tf.bool)
n_gbboxes = tf.count_nonzero(tf.logical_and(tf.equal(glabels, label),
tf.logical_not(gdifficults)))
        # Groundtruth matching arrays.
gmatch = tf.zeros(tf.shape(glabels), dtype=tf.bool)
grange = tf.range(tf.size(glabels), dtype=tf.int32)
# True/False positive matching TensorArrays.
sdtype = tf.bool
ta_tp_bool = tf.TensorArray(sdtype, size=rsize, dynamic_size=False, infer_shape=True)
ta_fp_bool = tf.TensorArray(sdtype, size=rsize, dynamic_size=False, infer_shape=True)
# Loop over returned objects.
def m_condition(i, ta_tp, ta_fp, gmatch):
r = tf.less(i, rsize)
return r
def m_body(i, ta_tp, ta_fp, gmatch):
# Jaccard score with groundtruth bboxes.
rbbox = bboxes[i]
jaccard = bboxes_jaccard(rbbox, gbboxes)
jaccard = jaccard * tf.cast(tf.equal(glabels, rlabel), dtype=jaccard.dtype)
# Best fit, checking it's above threshold.
idxmax = tf.cast(tf.argmax(jaccard, axis=0), tf.int32)
jcdmax = jaccard[idxmax]
match = jcdmax > matching_threshold
existing_match = gmatch[idxmax]
not_difficult = tf.logical_not(gdifficults[idxmax])
# TP: match & no previous match and FP: previous match | no match.
# If difficult: no record, i.e FP=False and TP=False.
tp = tf.logical_and(not_difficult,
tf.logical_and(match, tf.logical_not(existing_match)))
ta_tp = ta_tp.write(i, tp)
fp = tf.logical_and(not_difficult,
tf.logical_or(existing_match, tf.logical_not(match)))
ta_fp = ta_fp.write(i, fp)
            # Update groundtruth match.
mask = tf.logical_and(tf.equal(grange, idxmax),
tf.logical_and(not_difficult, match))
gmatch = tf.logical_or(gmatch, mask)
return [i+1, ta_tp, ta_fp, gmatch]
# Main loop definition.
i = 0
[i, ta_tp_bool, ta_fp_bool, gmatch] = \
tf.while_loop(m_condition, m_body,
[i, ta_tp_bool, ta_fp_bool, gmatch],
parallel_iterations=1,
back_prop=False)
# TensorArrays to Tensors and reshape.
tp_match = tf.reshape(ta_tp_bool.stack(), rshape)
fp_match = tf.reshape(ta_fp_bool.stack(), rshape)
# Some debugging information...
# tp_match = tf.Print(tp_match,
# [n_gbboxes,
# tf.reduce_sum(tf.cast(tp_match, tf.int64)),
# tf.reduce_sum(tf.cast(fp_match, tf.int64)),
# tf.reduce_sum(tf.cast(gmatch, tf.int64))],
# 'Matching (NG, TP, FP, GM): ')
return n_gbboxes, tp_match, fp_match
def bboxes_matching_batch(labels, scores, bboxes,
glabels, gbboxes, gdifficults,
matching_threshold=0.5, scope=None):
"""Matching a collection of detected boxes with groundtruth values.
Batched-inputs version.
Args:
rclasses, rscores, rbboxes: BxN(x4) Tensors. Detected objects, sorted by score;
glabels, gbboxes: Groundtruth bounding boxes. May be zero padded, hence
zero-class objects are ignored.
matching_threshold: Threshold for a positive match.
Return: Tuple or Dictionaries with:
n_gbboxes: Scalar Tensor with number of groundtruth boxes (may difer from
size because of zero padding).
tp: (B, N)-shaped boolean Tensor containing with True Positives.
fp: (B, N)-shaped boolean Tensor containing with False Positives.
"""
# Dictionaries as inputs.
if isinstance(scores, dict) or isinstance(bboxes, dict):
with tf.name_scope(scope, 'bboxes_matching_batch_dict'):
d_n_gbboxes = {}
d_tp = {}
d_fp = {}
for c in labels:
n, tp, fp, _ = bboxes_matching_batch(c, scores[c], bboxes[c],
glabels, gbboxes, gdifficults,
matching_threshold)
d_n_gbboxes[c] = n
d_tp[c] = tp
d_fp[c] = fp
return d_n_gbboxes, d_tp, d_fp, scores
with tf.name_scope(scope, 'bboxes_matching_batch',
[scores, bboxes, glabels, gbboxes]):
r = tf.map_fn(lambda x: bboxes_matching(labels, x[0], x[1],
x[2], x[3], x[4],
matching_threshold),
(scores, bboxes, glabels, gbboxes, gdifficults),
dtype=(tf.int64, tf.bool, tf.bool),
parallel_iterations=10,
back_prop=False,
swap_memory=True,
infer_shape=True)
return r[0], r[1], r[2], scores
# =========================================================================== #
# Some filtering methods.
# =========================================================================== #
def bboxes_filter_center(labels, bboxes, margins=[0., 0., 0., 0.],
scope=None):
"""Filter out bounding boxes whose center are not in
the rectangle [0, 0, 1, 1] + margins. The margin Tensor
can be used to enforce or loosen this condition.
Return:
labels, bboxes: Filtered elements.
"""
with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
cy = (bboxes[:, 0] + bboxes[:, 2]) / 2.
cx = (bboxes[:, 1] + bboxes[:, 3]) / 2.
mask = tf.greater(cy, margins[0])
mask = tf.logical_and(mask, tf.greater(cx, margins[1]))
        mask = tf.logical_and(mask, tf.less(cy, 1. + margins[2]))
mask = tf.logical_and(mask, tf.less(cx, 1. + margins[3]))
# Boolean masking...
labels = tf.boolean_mask(labels, mask)
bboxes = tf.boolean_mask(bboxes, mask)
return labels, bboxes
def bboxes_filter_overlap(labels, bboxes,
threshold=0.5, assign_negative=False,
scope=None):
"""Filter out bounding boxes based on (relative )overlap with reference
box [0, 0, 1, 1]. Remove completely bounding boxes, or assign negative
labels to the one outside (useful for latter processing...).
Return:
labels, bboxes: Filtered (or newly assigned) elements.
"""
with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype),
bboxes)
mask = scores > threshold
if assign_negative:
labels = tf.where(mask, labels, -labels)
# bboxes = tf.where(mask, bboxes, bboxes)
else:
labels = tf.boolean_mask(labels, mask)
bboxes = tf.boolean_mask(bboxes, mask)
return labels, bboxes
def bboxes_filter_labels(labels, bboxes,
out_labels=[], num_classes=np.inf,
scope=None):
"""Filter out labels from a collection. Typically used to get
of DontCare elements. Also remove elements based on the number of classes.
Return:
labels, bboxes: Filtered elements.
"""
with tf.name_scope(scope, 'bboxes_filter_labels', [labels, bboxes]):
        # Keep labels below num_classes (guard against the np.inf default,
        # which cannot be compared against integer labels), and drop any label
        # listed in out_labels. The original draft looped over the `labels`
        # Tensor and inverted the class test, contradicting the docstring.
        if np.isinf(num_classes):
            mask = tf.ones(tf.shape(labels), dtype=tf.bool)
        else:
            mask = tf.less(labels, num_classes)
        for l in out_labels:
            mask = tf.logical_and(mask, tf.not_equal(labels, l))
labels = tf.boolean_mask(labels, mask)
bboxes = tf.boolean_mask(bboxes, mask)
return labels, bboxes
# =========================================================================== #
# Standard boxes computation.
# =========================================================================== #
def bboxes_jaccard(bbox_ref, bboxes, name=None):
"""Compute jaccard score between a reference box and a collection
of bounding boxes.
Args:
bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
bboxes: (N, 4) Tensor, collection of bounding boxes.
Return:
(N,) Tensor with Jaccard scores.
"""
with tf.name_scope(name, 'bboxes_jaccard'):
# Should be more efficient to first transpose.
bboxes = tf.transpose(bboxes)
bbox_ref = tf.transpose(bbox_ref)
# Intersection bbox and volume.
int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
# Volumes.
inter_vol = h * w
union_vol = -inter_vol \
+ (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1]) \
+ (bbox_ref[2] - bbox_ref[0]) * (bbox_ref[3] - bbox_ref[1])
jaccard = tfe_math.safe_divide(inter_vol, union_vol, 'jaccard')
return jaccard
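# Worked example (illustrative): a box covering the top-left quarter of the
# unit reference box has intersection 0.25 and union 1.0, so its Jaccard
# score is 0.25.
#   bboxes_jaccard(tf.constant([0., 0., 1., 1.]),
#                  tf.constant([[0., 0., 0.5, 0.5]]))  # -> [0.25]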
def bboxes_intersection(bbox_ref, bboxes, name=None):
"""Compute relative intersection between a reference box and a
collection of bounding boxes. Namely, compute the quotient between
intersection area and box area.
Args:
bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
bboxes: (N, 4) Tensor, collection of bounding boxes.
Return:
(N,) Tensor with relative intersection.
"""
with tf.name_scope(name, 'bboxes_intersection'):
# Should be more efficient to first transpose.
bboxes = tf.transpose(bboxes)
bbox_ref = tf.transpose(bbox_ref)
# Intersection bbox and volume.
int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
# Volumes.
inter_vol = h * w
bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])
scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
return scores
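# ----------------------------------------------------------------------------
# Illustrative post-processing sketch (shapes and thresholds below are
# assumptions, not values mandated by this module): clip boxes to the image,
# sort them by score, then apply batched NMS.
# ----------------------------------------------------------------------------
#   rbbox_img = tf.constant([0., 0., 1., 1.])
#   bboxes = bboxes_clip(rbbox_img, bboxes)                 # Batch x N x 4
#   scores, bboxes = bboxes_sort(scores, bboxes, top_k=400)
#   scores, bboxes = bboxes_nms_batch(scores, bboxes,
#                                     nms_threshold=0.45, keep_top_k=200)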

View File

@@ -0,0 +1,63 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TF Extended: additional math functions.
"""
import tensorflow as tf
from tensorflow.python.framework import ops
def safe_divide(numerator, denominator, name):
"""Divides two values, returning 0 if the denominator is <= 0.
Args:
numerator: A real `Tensor`.
denominator: A real `Tensor`, with dtype matching `numerator`.
name: Name for the returned op.
Returns:
0 if `denominator` <= 0, else `numerator` / `denominator`
"""
return tf.where(
tf.greater(denominator, 0),
tf.divide(numerator, denominator),
tf.zeros_like(numerator),
name=name)
def cummax(x, reverse=False, name=None):
"""Compute the cumulative maximum of the tensor `x` along `axis`. This
operation is similar to the more classic `cumsum`. Only support 1D Tensor
for now.
Args:
x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
`complex128`, `qint8`, `quint8`, `qint32`, `half`.
axis: A `Tensor` of type `int32` (default: 0).
reverse: A `bool` (default: False).
name: A name for the operation (optional).
Returns:
A `Tensor`. Has the same type as `x`.
"""
with ops.name_scope(name, "Cummax", [x]) as name:
x = ops.convert_to_tensor(x, name="x")
# Not very optimal: should directly integrate reverse into tf.scan.
if reverse:
x = tf.reverse(x, axis=[0])
        # 'Accumulating' maximum: ensure it is always increasing.
cmax = tf.scan(tf.maximum, x,
initializer=None, parallel_iterations=1,
back_prop=False, swap_memory=False)
if reverse:
cmax = tf.reverse(cmax, axis=[0])
return cmax
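# A quick, illustrative sanity check of the cummax semantics (only runs when
# the module is executed directly, never on import):
if __name__ == '__main__':
    with tf.Session() as sess:
        _x = tf.constant([1., 3., 2., 5., 4.])
        print(sess.run(cummax(_x)))                # [1. 3. 3. 5. 5.]
        print(sess.run(cummax(_x, reverse=True)))  # [5. 5. 5. 5. 4.]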

View File

@@ -0,0 +1,397 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TF Extended: additional metrics.
"""
import tensorflow as tf
import numpy as np
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tfextended import math as tfe_math
# =========================================================================== #
# TensorFlow utils
# =========================================================================== #
def _create_local(name, shape, collections=None, validate_shape=False,
dtype=dtypes.float32):
"""Creates a new local variable.
Args:
name: The name of the new or existing variable.
shape: Shape of the new or existing variable.
collections: A list of collection names to which the Variable will be added.
validate_shape: Whether to validate the shape of the variable.
dtype: Data type of the variables.
Returns:
The created variable.
"""
# Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES
collections = list(collections or [])
collections += [ops.GraphKeys.LOCAL_VARIABLES]
return tf.Variable(
initial_value=array_ops.zeros(shape, dtype=dtype),
name=name,
trainable=False,
collections=collections,
validate_shape=validate_shape)
def _safe_div(numerator, denominator, name):
"""Divides two values, returning 0 if the denominator is <= 0.
Args:
numerator: A real `Tensor`.
denominator: A real `Tensor`, with dtype matching `numerator`.
name: Name for the returned op.
Returns:
0 if `denominator` <= 0, else `numerator` / `denominator`
"""
return tf.where(
tf.math.greater(denominator, 0),
tf.math.divide(numerator, denominator),
tf.zeros_like(numerator),
name=name)
def _broadcast_weights(weights, values):
"""Broadcast `weights` to the same shape as `values`.
This returns a version of `weights` following the same broadcast rules as
`mul(weights, values)`. When computing a weighted average, use this function
to broadcast `weights` before summing them; e.g.,
`reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`.
Args:
weights: `Tensor` whose shape is broadcastable to `values`.
values: `Tensor` of any shape.
Returns:
`weights` broadcast to `values` shape.
"""
weights_shape = weights.get_shape()
values_shape = values.get_shape()
if(weights_shape.is_fully_defined() and
values_shape.is_fully_defined() and
weights_shape.is_compatible_with(values_shape)):
return weights
return tf.math.multiply(
weights, array_ops.ones_like(values), name='broadcast_weights')
# =========================================================================== #
# TF Extended metrics: TP and FP arrays.
# =========================================================================== #
def precision_recall(num_gbboxes, num_detections, tp, fp, scores,
dtype=tf.float64, scope=None):
"""Compute precision and recall from scores, true positives and false
positives booleans arrays
"""
# Input dictionaries: dict outputs as streaming metrics.
if isinstance(scores, dict):
d_precision = {}
d_recall = {}
for c in num_gbboxes.keys():
scope = 'precision_recall_%s' % c
p, r = precision_recall(num_gbboxes[c], num_detections[c],
tp[c], fp[c], scores[c],
dtype, scope)
d_precision[c] = p
d_recall[c] = r
return d_precision, d_recall
# Sort by score.
with tf.name_scope(scope, 'precision_recall',
[num_gbboxes, num_detections, tp, fp, scores]):
# Sort detections by score.
scores, idxes = tf.nn.top_k(scores, k=num_detections, sorted=True)
tp = tf.gather(tp, idxes)
fp = tf.gather(fp, idxes)
        # Compute recall and precision.
tp = tf.cumsum(tf.cast(tp, dtype), axis=0)
fp = tf.cumsum(tf.cast(fp, dtype), axis=0)
recall = _safe_div(tp, tf.cast(num_gbboxes, dtype), 'recall')
precision = _safe_div(tp, tp + fp, 'precision')
return tf.tuple([precision, recall])
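# Worked example (illustrative): with num_gbboxes=2, tp=[1, 0, 1] and
# fp=[0, 1, 0] (already sorted by score), the cumulative sums are
# tp=[1, 1, 2] and fp=[0, 1, 1], giving precision=[1., 0.5, 2/3] and
# recall=[0.5, 0.5, 1.].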
def streaming_tp_fp_arrays(num_gbboxes, tp, fp, scores,
remove_zero_scores=True,
metrics_collections=None,
updates_collections=None,
name=None):
"""Streaming computation of True and False Positive arrays. This metrics
also keeps track of scores and number of grountruth objects.
"""
# Input dictionaries: dict outputs as streaming metrics.
if isinstance(scores, dict) or isinstance(fp, dict):
d_values = {}
d_update_ops = {}
for c in num_gbboxes.keys():
scope = 'streaming_tp_fp_%s' % c
v, up = streaming_tp_fp_arrays(num_gbboxes[c], tp[c], fp[c], scores[c],
remove_zero_scores,
metrics_collections,
updates_collections,
name=scope)
d_values[c] = v
d_update_ops[c] = up
return d_values, d_update_ops
# Input Tensors...
with variable_scope.variable_scope(name, 'streaming_tp_fp',
[num_gbboxes, tp, fp, scores]):
num_gbboxes = tf.cast(num_gbboxes, dtype=tf.int64)
scores = tf.cast(scores, dtype=tf.float32)
stype = tf.bool
tp = tf.cast(tp, stype)
fp = tf.cast(fp, stype)
# Reshape TP and FP tensors and clean away 0 class values.
scores = tf.reshape(scores, [-1])
tp = tf.reshape(tp, [-1])
fp = tf.reshape(fp, [-1])
# Remove TP and FP both false.
mask = tf.logical_or(tp, fp)
if remove_zero_scores:
rm_threshold = 1e-4
mask = tf.logical_and(mask, tf.greater(scores, rm_threshold))
scores = tf.boolean_mask(scores, mask)
tp = tf.boolean_mask(tp, mask)
fp = tf.boolean_mask(fp, mask)
        # Local variables accumulating information over batches.
v_nobjects = _create_local('v_num_gbboxes', shape=[], dtype=tf.int64)
v_ndetections = _create_local('v_num_detections', shape=[], dtype=tf.int32)
v_scores = _create_local('v_scores', shape=[0, ])
v_tp = _create_local('v_tp', shape=[0, ], dtype=stype)
v_fp = _create_local('v_fp', shape=[0, ], dtype=stype)
# Update operations.
nobjects_op = state_ops.assign_add(v_nobjects,
tf.reduce_sum(num_gbboxes))
ndetections_op = state_ops.assign_add(v_ndetections,
tf.size(scores, out_type=tf.int32))
scores_op = state_ops.assign(v_scores, tf.concat([v_scores, scores], axis=0),
validate_shape=False)
tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp], axis=0),
validate_shape=False)
fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp], axis=0),
validate_shape=False)
# Value and update ops.
val = (v_nobjects, v_ndetections, v_tp, v_fp, v_scores)
with ops.control_dependencies([nobjects_op, ndetections_op,
scores_op, tp_op, fp_op]):
update_op = (nobjects_op, ndetections_op, tp_op, fp_op, scores_op)
if metrics_collections:
ops.add_to_collections(metrics_collections, val)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
return val, update_op
# =========================================================================== #
# Average precision computations.
# =========================================================================== #
def average_precision_voc12(precision, recall, name=None):
"""Compute (interpolated) average precision from precision and recall Tensors.
The implementation follows Pascal 2012 and ILSVRC guidelines.
See also: https://sanchom.wordpress.com/tag/average-precision/
"""
with tf.name_scope(name, 'average_precision_voc12', [precision, recall]):
# Convert to float64 to decrease error on Riemann sums.
precision = tf.cast(precision, dtype=tf.float64)
recall = tf.cast(recall, dtype=tf.float64)
# Add bounds values to precision and recall.
precision = tf.concat([[0.], precision, [0.]], axis=0)
recall = tf.concat([[0.], recall, [1.]], axis=0)
# Ensures precision is increasing in reverse order.
precision = tfe_math.cummax(precision, reverse=True)
# Riemann sums for estimating the integral.
# mean_pre = (precision[1:] + precision[:-1]) / 2.
mean_pre = precision[1:]
diff_rec = recall[1:] - recall[:-1]
ap = tf.reduce_sum(mean_pre * diff_rec)
return ap
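# Worked example (illustrative): for precision=[1., 0.5] and recall=[0.5, 1.],
# padding gives precision=[0., 1., 0.5, 0.] and recall=[0., 0.5, 1., 1.];
# the reverse cummax turns precision into [1., 1., 0.5, 0.] and the Riemann
# sum yields AP = 1.*0.5 + 0.5*0.5 + 0.*0. = 0.75.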
def average_precision_voc07(precision, recall, name=None):
"""Compute (interpolated) average precision from precision and recall Tensors.
The implementation follows Pascal 2007 guidelines.
See also: https://sanchom.wordpress.com/tag/average-precision/
"""
with tf.name_scope(name, 'average_precision_voc07', [precision, recall]):
# Convert to float64 to decrease error on cumulated sums.
precision = tf.cast(precision, dtype=tf.float64)
recall = tf.cast(recall, dtype=tf.float64)
# Add zero-limit value to avoid any boundary problem...
precision = tf.concat([precision, [0.]], axis=0)
recall = tf.concat([recall, [np.inf]], axis=0)
        # Split the integral into 11 recall bins (VOC07 11-point interpolation).
l_aps = []
for t in np.arange(0., 1.1, 0.1):
mask = tf.greater_equal(recall, t)
v = tf.reduce_max(tf.boolean_mask(precision, mask))
l_aps.append(v / 11.)
ap = tf.add_n(l_aps)
return ap
def precision_recall_values(xvals, precision, recall, name=None):
"""Compute values on the precision/recall curve.
    Args:
      xvals: Python list of floats (recall values at which to sample);
      precision: 1D Tensor, decreasing;
      recall: 1D Tensor, increasing.
Return:
list of precision values.
"""
with ops.name_scope(name, "precision_recall_values",
[precision, recall]) as name:
# Add bounds values to precision and recall.
precision = tf.concat([[0.], precision, [0.]], axis=0)
recall = tf.concat([[0.], recall, [1.]], axis=0)
precision = tfe_math.cummax(precision, reverse=True)
prec_values = []
for x in xvals:
mask = tf.less_equal(recall, x)
val = tf.reduce_min(tf.boolean_mask(precision, mask))
prec_values.append(val)
return tf.tuple(prec_values)
# =========================================================================== #
# TF Extended metrics: old stuff!
# =========================================================================== #
def _precision_recall(n_gbboxes, n_detections, scores, tp, fp, scope=None):
"""Compute precision and recall from scores, true positives and false
positives booleans arrays
"""
# Sort by score.
with tf.name_scope(scope, 'prec_rec', [n_gbboxes, scores, tp, fp]):
# Sort detections by score.
scores, idxes = tf.nn.top_k(scores, k=n_detections, sorted=True)
tp = tf.gather(tp, idxes)
fp = tf.gather(fp, idxes)
        # Compute recall and precision.
dtype = tf.float64
tp = tf.cumsum(tf.cast(tp, dtype), axis=0)
fp = tf.cumsum(tf.cast(fp, dtype), axis=0)
recall = _safe_div(tp, tf.cast(n_gbboxes, dtype), 'recall')
precision = _safe_div(tp, tp + fp, 'precision')
return tf.tuple([precision, recall])
def streaming_precision_recall_arrays(n_gbboxes, rclasses, rscores,
tp_tensor, fp_tensor,
remove_zero_labels=True,
metrics_collections=None,
updates_collections=None,
name=None):
"""Streaming computation of precision / recall arrays. This metrics
keeps tracks of boolean True positives and False positives arrays.
"""
with variable_scope.variable_scope(name, 'stream_precision_recall',
[n_gbboxes, rclasses, tp_tensor, fp_tensor]):
n_gbboxes = tf.cast(n_gbboxes, tf.int64)
rclasses = tf.cast(rclasses, tf.int64)
        rscores = tf.cast(rscores, tf.float32)
stype = tf.int32
tp_tensor = tf.cast(tp_tensor, stype)
fp_tensor = tf.cast(fp_tensor, stype)
# Reshape TP and FP tensors and clean away 0 class values.
rclasses = tf.reshape(rclasses, [-1])
rscores = tf.reshape(rscores, [-1])
tp_tensor = tf.reshape(tp_tensor, [-1])
fp_tensor = tf.reshape(fp_tensor, [-1])
if remove_zero_labels:
mask = tf.greater(rclasses, 0)
rclasses = tf.boolean_mask(rclasses, mask)
rscores = tf.boolean_mask(rscores, mask)
tp_tensor = tf.boolean_mask(tp_tensor, mask)
fp_tensor = tf.boolean_mask(fp_tensor, mask)
        # Local variables accumulating information over batches.
v_nobjects = _create_local('v_nobjects', shape=[], dtype=tf.int64)
v_ndetections = _create_local('v_ndetections', shape=[], dtype=tf.int32)
v_scores = _create_local('v_scores', shape=[0, ])
v_tp = _create_local('v_tp', shape=[0, ], dtype=stype)
v_fp = _create_local('v_fp', shape=[0, ], dtype=stype)
# Update operations.
nobjects_op = state_ops.assign_add(v_nobjects,
tf.reduce_sum(n_gbboxes))
ndetections_op = state_ops.assign_add(v_ndetections,
tf.size(rscores, out_type=tf.int32))
scores_op = state_ops.assign(v_scores, tf.concat([v_scores, rscores], axis=0),
validate_shape=False)
tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp_tensor], axis=0),
validate_shape=False)
fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp_tensor], axis=0),
validate_shape=False)
# Precision and recall computations.
# r = _precision_recall(nobjects_op, scores_op, tp_op, fp_op, 'value')
r = _precision_recall(v_nobjects, v_ndetections, v_scores,
v_tp, v_fp, 'value')
with ops.control_dependencies([nobjects_op, ndetections_op,
scores_op, tp_op, fp_op]):
update_op = _precision_recall(nobjects_op, ndetections_op,
scores_op, tp_op, fp_op, 'update_op')
# update_op = tf.Print(update_op,
# [tf.reduce_sum(tf.cast(mask, tf.int64)),
# tf.reduce_sum(tf.cast(mask2, tf.int64)),
# tf.reduce_min(rscores),
# tf.reduce_sum(n_gbboxes)],
# 'Metric: ')
# Some debugging stuff!
# update_op = tf.Print(update_op,
# [tf.shape(tp_op),
# tf.reduce_sum(tf.cast(tp_op, tf.int64), axis=0)],
# 'TP and FP shape: ')
# update_op[0] = tf.Print(update_op,
# [nobjects_op],
# '# Groundtruth bboxes: ')
# update_op = tf.Print(update_op,
# [update_op[0][0],
# update_op[0][-1],
# tf.reduce_min(update_op[0]),
# tf.reduce_max(update_op[0]),
# tf.reduce_min(update_op[1]),
# tf.reduce_max(update_op[1])],
# 'Precision and recall :')
if metrics_collections:
ops.add_to_collections(metrics_collections, r)
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
return r, update_op
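# ----------------------------------------------------------------------------
# Driving a streaming metric (illustrative sketch; the input tensors are
# assumed to come from the detection pipeline):
# ----------------------------------------------------------------------------
#   value, update = streaming_tp_fp_arrays(n_gbboxes, tp, fp, scores)
#   with tf.Session() as sess:
#       sess.run(tf.local_variables_initializer())
#       for _ in range(num_batches):
#           sess.run(update)
#       n_obj, n_det, v_tp, v_fp, v_scores = sess.run(value)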

View File

@@ -0,0 +1,95 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TF Extended: additional tensors operations.
"""
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.contrib.metrics.python.ops import set_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
def get_shape(x, rank=None):
"""Returns the dimensions of a Tensor as list of integers or scale tensors.
Args:
x: N-d Tensor;
rank: Rank of the Tensor. If None, will try to guess it.
Returns:
A list of `[d1, d2, ..., dN]` corresponding to the dimensions of the
input tensor. Dimensions that are statically known are python integers,
otherwise they are integer scalar tensors.
"""
if x.get_shape().is_fully_defined():
return x.get_shape().as_list()
else:
static_shape = x.get_shape()
if rank is None:
static_shape = static_shape.as_list()
rank = len(static_shape)
else:
static_shape = x.get_shape().with_rank(rank).as_list()
dynamic_shape = tf.unstack(tf.shape(x), rank)
return [s if s is not None else d
for s, d in zip(static_shape, dynamic_shape)]
def pad_axis(x, offset, size, axis=0, name=None):
"""Pad a tensor on an axis, with a given offset and output size.
    The tensor is padded with zeros (i.e. CONSTANT mode). Note that if `size`
    is smaller than the existing size plus `offset`, no truncation is
    performed: the output dimension on `axis` stays at `offset` plus the
    existing size.
Args:
x: Tensor to pad;
offset: Offset to add on the dimension chosen;
size: Final size of the dimension.
Return:
Padded tensor whose dimension on `axis` is `size`, or greater if
the input vector was larger.
"""
with tf.name_scope(name, 'pad_axis'):
shape = get_shape(x)
rank = len(shape)
# Padding description.
new_size = tf.maximum(size-offset-shape[axis], 0)
pad1 = tf.stack([0]*axis + [offset] + [0]*(rank-axis-1))
pad2 = tf.stack([0]*axis + [new_size] + [0]*(rank-axis-1))
paddings = tf.stack([pad1, pad2], axis=1)
x = tf.pad(x, paddings, mode='CONSTANT')
# Reshape, to get fully defined shape if possible.
# TODO: fix with tf.slice
shape[axis] = size
x = tf.reshape(x, tf.stack(shape))
return x
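# Example (illustrative): pad a length-2 vector to length 4 with offset 1.
#   pad_axis(tf.ones([2]), offset=1, size=4)  # -> [0., 1., 1., 0.]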
# def select_at_index(idx, val, t):
# """Return a tensor.
# """
# idx = tf.expand_dims(tf.expand_dims(idx, 0), 0)
# val = tf.expand_dims(val, 0)
# t = t + tf.scatter_nd(idx, val, tf.shape(t))
# return t

View File

@@ -0,0 +1,20 @@
'''
Endpoint names to look for in the graph
'''
from anchors import generate_anchors
feat_layers = generate_anchors.feat_layers
sub_feats = ['']
localizations_names = [f'ssd_300_vgg/{feature}_box/Reshape:0' for feature in feat_layers]
predictions_names = ['ssd_300_vgg/softmax/Reshape_1:0'] \
+ [f'ssd_300_vgg/softmax_{n}/Reshape_1:0' for n in range(1, len(feat_layers))]
logit_names = [f'ssd_300_vgg/{feature}_box/Reshape_1:0' for feature in feat_layers]
endpoint_names = ['ssd_300_vgg/conv1/conv1_2/Relu:0'] \
+ [f'ssd_300_vgg/conv{n}/conv{n}_3/Relu:0' for n in range(4, 6)] \
+ [f'ssd_300_vgg/conv{n}/conv{n}_{n}/Relu:0' for n in range(2, 4)] \
+ [f'ssd_300_vgg/conv{n}/Relu:0' for n in range(6, 8)] \
+ [f'ssd_300_vgg/{feature}/conv3x3/Relu:0' for feature in feat_layers if feature != 'block4' and feature != 'block7']
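# Example (illustrative, assuming feat_layers contains 'block4'): the names
# generated above look like 'ssd_300_vgg/block4_box/Reshape:0' for the
# localizations and 'ssd_300_vgg/block4_box/Reshape_1:0' for the logits.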

View File

@@ -0,0 +1,158 @@
# Copyright 2016 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Diverse TensorFlow utils, for training, evaluation and so on!
"""
import os
import tensorflow as tf
# =========================================================================== #
# General tools.
# =========================================================================== #
def reshape_list(l, shape=None):
"""Reshape list of (list): 1D to 2D or the other way around.
Args:
l: List or List of list.
shape: 1D or 2D shape.
Return
Reshaped list.
"""
r = []
if shape is None:
# Flatten everything.
for a in l:
if isinstance(a, (list, tuple)):
r = r + list(a)
else:
r.append(a)
else:
# Reshape to list of list.
i = 0
for s in shape:
if s == 1:
r.append(l[i])
else:
r.append(l[i:i+s])
i += s
return r
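# Example (illustrative): flattening and un-flattening nested lists.
#   reshape_list([a, (b, c), d])           # -> [a, b, c, d]
#   reshape_list([a, b, c, d], [1, 2, 1])  # -> [a, [b, c], d]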
def configure_learning_rate(flags, num_samples_per_epoch, global_step):
"""Configures the learning rate.
Args:
num_samples_per_epoch: The number of samples in each epoch of training.
global_step: The global_step tensor.
Returns:
A `Tensor` representing the learning rate.
"""
decay_steps = int(num_samples_per_epoch / flags.batch_size *
flags.num_epochs_per_decay)
if flags.learning_rate_decay_type == 'exponential':
return tf.train.exponential_decay(flags.learning_rate,
global_step,
decay_steps,
flags.learning_rate_decay_factor,
staircase=True,
name='exponential_decay_learning_rate')
elif flags.learning_rate_decay_type == 'fixed':
return tf.constant(flags.learning_rate, name='fixed_learning_rate')
elif flags.learning_rate_decay_type == 'polynomial':
return tf.train.polynomial_decay(flags.learning_rate,
global_step,
decay_steps,
flags.end_learning_rate,
power=1.0,
cycle=False,
name='polynomial_decay_learning_rate')
else:
        raise ValueError('learning_rate_decay_type [%s] was not recognized'
                         % flags.learning_rate_decay_type)
def configure_optimizer(flags, learning_rate):
"""Configures the optimizer used for training.
Args:
learning_rate: A scalar or `Tensor` learning rate.
Returns:
An instance of an optimizer.
"""
if flags.optimizer == 'adadelta':
optimizer = tf.train.AdadeltaOptimizer(
learning_rate,
rho=flags.adadelta_rho,
epsilon=flags.opt_epsilon)
elif flags.optimizer == 'adagrad':
optimizer = tf.train.AdagradOptimizer(
learning_rate,
initial_accumulator_value=flags.adagrad_initial_accumulator_value)
elif flags.optimizer == 'adam':
optimizer = tf.train.AdamOptimizer(
learning_rate,
beta1=flags.adam_beta1,
beta2=flags.adam_beta2,
epsilon=flags.opt_epsilon)
elif flags.optimizer == 'ftrl':
optimizer = tf.train.FtrlOptimizer(
learning_rate,
learning_rate_power=flags.ftrl_learning_rate_power,
initial_accumulator_value=flags.ftrl_initial_accumulator_value,
l1_regularization_strength=flags.ftrl_l1,
l2_regularization_strength=flags.ftrl_l2)
elif flags.optimizer == 'momentum':
optimizer = tf.train.MomentumOptimizer(
learning_rate,
momentum=flags.momentum,
name='Momentum')
elif flags.optimizer == 'rmsprop':
optimizer = tf.train.RMSPropOptimizer(
learning_rate,
decay=flags.rmsprop_decay,
momentum=flags.rmsprop_momentum,
epsilon=flags.opt_epsilon)
elif flags.optimizer == 'sgd':
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
else:
        raise ValueError('Optimizer [%s] was not recognized' % flags.optimizer)
return optimizer
def update_model_scope(var, ckpt_scope, new_scope):
    # Map a variable name from the training scope back to the checkpoint's
    # original 'vgg_16' scope (note: ckpt_scope is currently unused).
    return var.op.name.replace(new_scope, 'vgg_16')
def get_variables_to_train(flags):
"""Returns a list of variables to train.
Returns:
A list of variables to train by the optimizer.
"""
if flags.trainable_scopes is None:
return tf.trainable_variables()
else:
scopes = [scope.strip() for scope in flags.trainable_scopes.split(',')]
variables_to_train = []
for scope in scopes:
variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
variables_to_train.extend(variables)
return variables_to_train
# =========================================================================== #
# Evaluation utils.
# =========================================================================== #

View File

@@ -0,0 +1,114 @@
# Copyright 2017 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import cv2
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.cm as mpcm
# =========================================================================== #
# Some colormaps.
# =========================================================================== #
def colors_subselect(colors, num_classes=21):
dt = len(colors) // num_classes
sub_colors = []
for i in range(num_classes):
color = colors[i*dt]
if isinstance(color[0], float):
sub_colors.append([int(c * 255) for c in color])
else:
sub_colors.append([c for c in color])
return sub_colors
colors_plasma = colors_subselect(mpcm.plasma.colors, num_classes=21)
colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# =========================================================================== #
# OpenCV drawing.
# =========================================================================== #
def draw_lines(img, lines, color=[255, 0, 0], thickness=2):
"""Draw a collection of lines on an image.
"""
for line in lines:
for x1, y1, x2, y2 in line:
cv2.line(img, (x1, y1), (x2, y2), color, thickness)
def draw_rectangle(img, p1, p2, color=[255, 0, 0], thickness=2):
cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)
def draw_bbox(img, bbox, shape, label, color=[255, 0, 0], thickness=2):
p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))
p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))
cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)
p1 = (p1[0]+15, p1[1])
cv2.putText(img, str(label), p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.5, color, 1)
def bboxes_draw_on_img(img, classes, scores, bboxes, colors, thickness=2):
shape = img.shape
for i in range(bboxes.shape[0]):
bbox = bboxes[i]
color = colors[classes[i]]
# Draw bounding box...
p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))
p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))
cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)
# Draw text...
s = '%s/%.3f' % (classes[i], scores[i])
p1 = (p1[0]-5, p1[1])
cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1)
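# Example usage (illustrative sketch; `img` is an HxWx3 uint8 array, the
# other arrays come from the detection pipeline):
#   bboxes_draw_on_img(img, classes, scores, bboxes, colors_tableau)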
# =========================================================================== #
# Matplotlib show...
# =========================================================================== #
def plt_bboxes(img, classes, scores, bboxes, figsize=(10,10), linewidth=1.5):
"""Visualize bounding boxes. Largely inspired by SSD-MXNET!
"""
fig = plt.figure(figsize=figsize)
plt.imshow(img)
height = img.shape[0]
width = img.shape[1]
colors = dict()
for i in range(classes.shape[0]):
cls_id = int(classes[i])
if cls_id >= 0:
score = scores[i]
if cls_id not in colors:
colors[cls_id] = (random.random(), random.random(), random.random())
ymin = int(bboxes[i, 0] * height)
xmin = int(bboxes[i, 1] * width)
ymax = int(bboxes[i, 2] * height)
xmax = int(bboxes[i, 3] * width)
rect = plt.Rectangle((xmin, ymin), xmax - xmin,
ymax - ymin, fill=False,
edgecolor=colors[cls_id],
linewidth=linewidth)
plt.gca().add_patch(rect)
class_name = str(cls_id)
plt.gca().text(xmin, ymin - 2,
'{:s} | {:.3f}'.format(class_name, score),
bbox=dict(facecolor=colors[cls_id], alpha=0.5),
fontsize=12, color='white')
plt.show()