Blog of random interesting stuff

object detection

February 02, 2019

%matplotlib inline
%reload_ext autoreload
%autoreload 2
from fastai.conv_learner import *
from fastai.dataset import *
from pathlib import Path
import json
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects

I will use the Pascal VOC dataset from the 2007 competition.

PATH = Path("data/VOC2007")
list(PATH.iterdir())
[WindowsPath('data/VOC2007/Annotations'),
 WindowsPath('data/VOC2007/ImageSets'),
 WindowsPath('data/VOC2007/JPEGImages'),
 WindowsPath('data/VOC2007/pascal_test2007.json'),
 WindowsPath('data/VOC2007/pascal_train2007.json'),
 WindowsPath('data/VOC2007/pascal_train2012.json'),
 WindowsPath('data/VOC2007/pascal_val2007.json'),
 WindowsPath('data/VOC2007/pascal_val2012.json'),
 WindowsPath('data/VOC2007/PASCAL_VOC'),
 WindowsPath('data/VOC2007/SegmentationClass'),
 WindowsPath('data/VOC2007/SegmentationObject'),
 WindowsPath('data/VOC2007/tmp')]

The labels are stored as JSON.

train_bbox = json.load((PATH/"pascal_train2007.json").open())
train_bbox.keys()
dict_keys(['images', 'type', 'annotations', 'categories'])
IMAGES, ANNOTATIONS, CATEGORIES = ["images", "annotations", "categories"]
train_bbox[IMAGES][0:10]
[{'file_name': '000012.jpg', 'height': 333, 'width': 500, 'id': 12},
 {'file_name': '000017.jpg', 'height': 364, 'width': 480, 'id': 17},
 {'file_name': '000023.jpg', 'height': 500, 'width': 334, 'id': 23},
 {'file_name': '000026.jpg', 'height': 333, 'width': 500, 'id': 26},
 {'file_name': '000032.jpg', 'height': 281, 'width': 500, 'id': 32},
 {'file_name': '000033.jpg', 'height': 366, 'width': 500, 'id': 33},
 {'file_name': '000034.jpg', 'height': 500, 'width': 360, 'id': 34},
 {'file_name': '000035.jpg', 'height': 375, 'width': 500, 'id': 35},
 {'file_name': '000036.jpg', 'height': 500, 'width': 332, 'id': 36},
 {'file_name': '000042.jpg', 'height': 335, 'width': 500, 'id': 42}]
train_bbox[ANNOTATIONS][:1]
[{'segmentation': [[155, 96, 155, 270, 351, 270, 351, 96]],
  'area': 34104,
  'iscrowd': 0,
  'image_id': 12,
  'bbox': [155, 96, 196, 174],
  'category_id': 7,
  'id': 1,
  'ignore': 0}]
train_bbox[CATEGORIES][:5]
[{'supercategory': 'none', 'id': 1, 'name': 'aeroplane'},
 {'supercategory': 'none', 'id': 2, 'name': 'bicycle'},
 {'supercategory': 'none', 'id': 3, 'name': 'bird'},
 {'supercategory': 'none', 'id': 4, 'name': 'boat'},
 {'supercategory': 'none', 'id': 5, 'name': 'bottle'}]
FILE_NAME, ID, IMG_ID, CAT_ID, BBOX = 'file_name','id','image_id','category_id','bbox'

Create lookup tables indexed by id

Each dictionary is keyed by an id so that you can establish the relationships: category id → category name, and image id → file name. The image ids themselves are also collected in a set.

# Lookup tables built from the JSON: category id -> category name,
# image id -> image file name, plus the set of all training image ids.
categories = {cat[ID]: cat["name"] for cat in train_bbox[CATEGORIES]}
train_images = {img[ID]: img[FILE_NAME] for img in train_bbox[IMAGES]}
train_ids = {img[ID] for img in train_bbox[IMAGES]}

Find the path to images

JPEG_FILES = PATH/"JPEGImages"

list(JPEG_FILES.iterdir())[:5]
[WindowsPath('data/VOC2007/JPEGImages/000005.jpg'),
 WindowsPath('data/VOC2007/JPEGImages/000007.jpg'),
 WindowsPath('data/VOC2007/JPEGImages/000009.jpg'),
 WindowsPath('data/VOC2007/JPEGImages/000012.jpg'),
 WindowsPath('data/VOC2007/JPEGImages/000016.jpg')]
annotations = train_bbox[ANNOTATIONS]
annotations[:1]
[{'segmentation': [[155, 96, 155, 270, 351, 270, 351, 96]],
  'area': 34104,
  'iscrowd': 0,
  'image_id': 12,
  'bbox': [155, 96, 196, 174],
  'category_id': 7,
  'id': 1,
  'ignore': 0}]
def hw_bb(bb):
    """
    Convert an annotation box to inclusive corner coordinates.

    The input uses the layout of the JSON ``bbox`` field,
    ``[x, y, width, height]`` (left edge, top edge, then size). The result is
    a NumPy array ``[top, left, bottom, right]`` -- row (y) first, then
    column (x) -- where the bottom-right corner is inclusive, hence the ``-1``.
    """
    left, top, width, height = bb[0], bb[1], bb[2], bb[3]
    bottom = height + top - 1
    right = width + left - 1
    return np.array([top, left, bottom, right])
def bb_hw(hw):
    """
    Convert inclusive corner coordinates back to a position-and-size box.

    The input is ``[top, left, bottom, right]`` (row first, both corners
    inclusive). The result is a NumPy array ``[x, y, width, height]`` with
    ``x, y`` the top-left corner; the ``+1`` accounts for the inclusive
    bottom-right corner.
    """
    top, left, bottom, right = hw[0], hw[1], hw[2], hw[3]
    width = right - left + 1
    height = bottom - top + 1
    return np.array([left, top, width, height])
# Group the annotations per image: image id -> list of (corner box, category id).
# ``defaultdict(list)`` is the idiomatic spelling of ``defaultdict(lambda: [])``
# and, unlike a lambda factory, keeps the mapping picklable.
train_annotations = collections.defaultdict(list)

for a in annotations:
    # Skip annotations whose "ignore" flag is set.
    if not a["ignore"]:
        bb = hw_bb(a["bbox"])
        train_annotations[a[IMG_ID]].append((bb, a[CAT_ID]))
len(train_annotations)
2501

See the data

Let's draw an image with its annotations and names.

image = train_bbox[IMAGES][0]
image
{'file_name': '000012.jpg', 'height': 333, 'width': 500, 'id': 12}
img_annotations = train_annotations[image[ID]]
# NOTE: this lookup raises the IndexError shown below, i.e. the list holds
# fewer than two annotations for this image; the drawing code further down
# uses index 0 instead.
img_annotations[1]
---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-21-99f1a25a5578> in <module>
      1 img_annotations = train_annotations[image[ID]]
----> 2 img_annotations[1]


IndexError: list index out of range
im = open_image(JPEG_FILES/image["file_name"])
def show_img(im, figsize=None, ax=None):
    """
    Display ``im`` on a matplotlib axes with both axis rulers hidden.

    Parameters:
        im: image data accepted by ``ax.imshow``.
        figsize: figure size passed to ``plt.subplots``; only used when a new
            axes has to be created.
        ax: axes to draw on. A new figure and axes are created when ``None``.

    Returns:
        The axes the image was drawn on.
    """
    # Compare against None explicitly: relying on the truthiness of the axes
    # object would discard a caller-supplied ``ax`` that evaluates as false.
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)
    ax.imshow(im)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    return ax

def draw_rect(ax, b):
    """
    Add the box ``b`` to ``ax`` as an unfilled white rectangle with an outline.

    The first two entries of ``b`` are passed as the rectangle's anchor point
    and the last two are unpacked as its remaining positional arguments.
    """
    anchor = b[:2]
    extent = b[-2:]
    rectangle = patches.Rectangle(anchor, *extent, fill=False, edgecolor='white', lw=2)
    draw_outline(ax.add_patch(rectangle), 4)
    
def draw_outline(o, lw):
    """
    Set the path effects of ``o`` to a black stroke of line width ``lw``
    followed by the normal rendering effect.
    """
    stroke = patheffects.Stroke(linewidth=lw, foreground='black')
    o.set_path_effects([stroke, patheffects.Normal()])
    
def draw_text(ax, xy, txt, sz=14):
    """
    Write ``txt`` on ``ax`` at position ``xy`` in bold white text of font
    size ``sz``, anchored at its top edge, then outline it.
    """
    text_style = dict(
        verticalalignment='top',
        color='white',
        fontsize=sz,
        weight='bold',
    )
    label = ax.text(*xy, txt, **text_style)
    draw_outline(label, 1)
ax = show_img(im)
b = bb_hw(img_annotations[0][0])
draw_rect(ax, b)
draw_text(ax, b[:2], categories[img_annotations[0][1]])

png

def draw_im(im, ann):
    """
    Show ``im`` and overlay every annotation in ``ann``.

    ``ann`` is an iterable of ``(corner box, category id)`` pairs; each box is
    converted with ``bb_hw`` before drawing and labelled with the category
    name looked up in ``categories``.
    """
    axes = show_img(im, figsize=(16, 8))
    for corners, cat_id in ann:
        box = bb_hw(corners)
        draw_rect(axes, box)
        draw_text(axes, box[:2], categories[cat_id], sz=16)
        
def draw_idx(i):
    """
    Draw the training image with id ``i`` together with its annotations,
    printing the image shape first.
    """
    boxes = train_annotations[i]
    picture = open_image(JPEG_FILES/train_images[i])
    print(picture.shape)
    draw_im(picture, boxes)
draw_idx(554)
(375, 500, 3)

png

Largest item classifier

Let's build a classifier that identifies the biggest item in an image.

def get_lrg(b):
    """
    Return the annotation with the largest bounding box.

    Parameters:
        b: non-empty sequence of ``(corner box, category id)`` pairs, where the
            corner box is a NumPy array ``[top, left, bottom, right]``.

    Returns:
        The pair whose box has the largest ``(bottom - top) * (right - left)``;
        on ties the earliest entry wins.

    Raises:
        ValueError: if ``b`` is empty.
    """
    if not b:
        raise ValueError(f"no annotations to choose from: {b!r}")

    def box_size(annotation):
        corners = annotation[0]
        # NOTE(review): with inclusive corners the true side lengths are one
        # larger than these differences; kept as-is so the selection matches
        # the original ranking.
        return np.prod(corners[-2:] - corners[:2])

    # ``np.prod`` replaces ``np.product``, which was removed in NumPy 2.0;
    # ``max`` returns the first maximal entry, matching a stable descending sort.
    return max(b, key=box_size)
# Largest annotation per image, keyed by image id.
trn_largest_anno = {
    img_id: get_lrg(boxes) for img_id, boxes in train_annotations.items()
}
(PATH/"tmp").mkdir(exist_ok=True)
CSV  = PATH/"tmp/largest_item.csv"
# One row per training image: its file name and the category name of its
# largest annotation.
dataFrame = pd.DataFrame(
    data={
        "fn": [train_images[img_id] for img_id in train_ids],
        "cat": [categories[trn_largest_anno[img_id][1]] for img_id in train_ids],
    },
    columns=["fn", "cat"],
)
dataFrame.to_csv(CSV, index=False)

Classification

steps

  1. define architecture
  2. define batch size
  3. create transformations
  4. create an image classifier
  5. find learning rate with ConvLearner
  6. fit model
  7. unfreeze layers and fit
arch = resnet34
sz = 224
bs = 64
tfms = tfms_from_model(f_model = arch, sz=sz, aug_tfms=transforms_side_on, crop_type=CropType.NO)
md = ImageClassifierData.from_csv("./", JPEG_FILES, CSV, tfms=tfms, bs = bs)
x,y=next(iter(md.val_dl))
show_img(md.val_ds.denorm(to_np(x))[0]);

png

learn = ConvLearner.pretrained(arch, md, metrics=[accuracy])
learn.opt_fn = optim.Adam
lrf = learn.lr_find(1e-5, 100)
HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…



 78%|███████████████████████████████████████████████████████▍               | 25/32 [00:11<00:03,  1.83it/s, loss=5.76]
learn.sched.plot(n_skip=5, n_skip_end=1)

png

lr = 2e-2
learn.fit(lr, 1, cycle_len=1)
HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…


epoch      trn_loss   val_loss   accuracy                                                                              
    0      1.223465   0.708285   0.794     





[array([0.70828]), 0.794]
lrs = np.array([lr/1000,lr/100,lr])
learn.freeze_to(-2)
lrf=learn.lr_find(lrs/1000)
learn.sched.plot(1)
HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…



 84%|███████████████████████████████████████████████████████████▉           | 27/32 [00:14<00:02,  2.08it/s, loss=2.36]

png

learn.fit(lrs/5, 1, cycle_len=2)
HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…


epoch      trn_loss   val_loss   accuracy                                                                              
    0      0.7591     0.620626   0.78      
    1      0.540552   0.608486   0.78                                                                                  





[array([0.60849]), 0.7800000019073486]
learn.save('clas_one')
learn.load('clas_one')
# Predict on one validation batch and write the predicted class on each image.
x, y = next(iter(md.val_dl))
probs = F.softmax(predict_batch(learn.model, x), -1)
x, preds = to_np(x), to_np(probs)
preds = np.argmax(preds, -1)
# Denormalise the whole batch once instead of on every pass of the loop.
images = md.val_ds.denorm(x)
fig, axes = plt.subplots(3, 4, figsize=(12, 8))
for i, ax in enumerate(axes.flat):
    ima = images[i]
    b = md.classes[preds[i]]
    ax = show_img(ima, ax=ax)
    draw_text(ax, (0, 0), b)
plt.tight_layout()
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

png

Bounding box

We’ll try to find the bounding box around an object.

To find the bounding box we use multiple regression: we will train the model to predict 4 values, one for each coordinate.

This means each output needs to be continuous, taking any value between 0 and 224 (the image size).

So, following a differentiable programming approach, we need to:

  • Predict 4 continuous values

So the neural network will end with 4 neurons, each without an activation, so that each produces a continuous value.

To train the network we need to use a function that is lower when the four numbers have less error

  • We can use the mean squared error
  • Or the L1 loss function
BB_CSV = PATH/"tmp/bb.csv"

We will use the largest annotation per image, and discard the category information for now, since right now we are only building a bounding box predictor

bb = np.array([trn_largest_anno[i][0] for i in train_ids])

For fastai reasons we need to put the four values in the same column separated by spaces

# Serialise each box as its values joined by single spaces, one string per image.
bbs = [" ".join(map(str, box)) for box in bb]
data = {
    "fn": [train_images[img_id] for img_id in train_ids],
    "bbox": bbs,
}
df = pd.DataFrame(data, columns=["fn", "bbox"])
df.to_csv(BB_CSV, index=False)

For the pretrained architecture we will use resnet34 again, with the same image size and batch size.

f_model=resnet34
sz=224
bs=64

Because this is a regression problem (meaning the output will be 4 numbers instead of a class) we need to do some customization:

  1. Set continuous=True so that fastai won't one-hot encode the labels and will use mean squared error as the default loss function

  2. Tell the transforms that our labels are coordinates so that they are transformed together with the image (this is why we have that weird way of defining bounding boxes: "top-left corner, bottom-right corner")

  3. Create our own set of augmentations, to avoid rotations so large that the bounding box becomes unrealistic

  4. Set CropType.NO so that the images are squished to 224 rather than cropped

# First attempt: augmentations declared without ``tfm_y``. The box printed
# below is the same on every pass, i.e. the labels do not follow the image.
augmentations = [ RandomFlip(),
                  RandomRotate(30),
                  RandomLighting(0.1,0.1)]
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, aug_tfms=augmentations)
md = ImageClassifierData.from_csv("./", JPEG_FILES, BB_CSV, tfms=tfms, continuous=True)
# Index of the batch item that is displayed in every subplot.
idx=3
fig,axes = plt.subplots(3,3, figsize=(9,9))
for i,ax in enumerate(axes.flat):
    # A new iterator is created on every pass and its first batch is taken.
    x,y=next(iter(md.aug_dl))
    ima=md.val_ds.denorm(to_np(x))[idx]
    # Convert the corner-format label back to [x, y, width, height] for drawing.
    b = bb_hw(to_np(y[idx]))
    print(b)
    show_img(ima, ax=ax)
    draw_rect(ax, b)
[  1.  89. 499. 192.]
[  1.  89. 499. 192.]
[  1.  89. 499. 192.]
[  1.  89. 499. 192.]
[  1.  89. 499. 192.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  1.  89. 499. 192.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  1.  89. 499. 192.]
[  1.  89. 499. 192.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  1.  89. 499. 192.]

png

# Second attempt: ``tfm_y=TfmType.COORD`` is passed to every augmentation and
# to ``tfms_from_model``. The boxes printed below now differ between passes.
augs = [RandomFlip(tfm_y=TfmType.COORD),
        RandomRotate(30, tfm_y=TfmType.COORD),
        RandomLighting(0.1,0.1, tfm_y=TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=TfmType.COORD, aug_tfms=augs)
md = ImageClassifierData.from_csv("./", JPEG_FILES, BB_CSV, tfms=tfms, continuous=True, bs=4)
# Index of the batch item that is displayed in every subplot.
idx=3
fig,axes = plt.subplots(3,3, figsize=(9,9))
for i,ax in enumerate(axes.flat):
    # A new iterator is created on every pass and its first batch is taken.
    x,y=next(iter(md.aug_dl))
    ima=md.val_ds.denorm(to_np(x))[idx]
    # Convert the corner-format label back to [x, y, width, height] for drawing.
    b = bb_hw(to_np(y[idx]))
    print(b)
    show_img(ima, ax=ax)
    draw_rect(ax, b)
[  0.   3. 224. 220.]
[  1.  60. 221. 125.]
[  0.  56. 223. 133.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  0.  25. 224. 194.]
[  0.   5. 224. 218.]
[  1.  60. 221. 125.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  0.   0. 224. 223.]
[  0.  44. 224. 157.]
[  1.  60. 221. 125.]

png

# Final augmentation set: still coordinate-aware, but the rotation argument is
# reduced from 30 to 3 and applied with p=0.5, and the lighting arguments are
# halved to 0.05.
tfm_y = TfmType.COORD
augs = [RandomFlip(tfm_y=tfm_y),
        RandomRotate(3, p=0.5, tfm_y=tfm_y),
        RandomLighting(0.05,0.05, tfm_y=tfm_y)]

tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=tfm_y, aug_tfms=augs)
md = ImageClassifierData.from_csv("./", JPEG_FILES, BB_CSV, tfms=tfms, bs=bs, continuous=True)
# Index of the batch item that is displayed in every subplot.
idx=3
fig,axes = plt.subplots(3,3, figsize=(9,9))
for i,ax in enumerate(axes.flat):
    # A new iterator is created on every pass and its first batch is taken.
    x,y=next(iter(md.aug_dl))
    ima=md.val_ds.denorm(to_np(x))[idx]
    # Convert the corner-format label back to [x, y, width, height] for drawing.
    b = bb_hw(to_np(y[idx]))
    print(b)
    show_img(ima, ax=ax)
    draw_rect(ax, b)
[  0.  55. 224. 135.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  0.  55. 223. 135.]
[  0.  55. 224. 135.]
[  1.  60. 221. 125.]
[  0.  52. 224. 141.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  0.  57. 223. 131.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  1.  60. 221. 125.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  1.  60. 221. 125.]


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


[  1.  60. 221. 125.]

png


Miguel Espinoza

Written by Miguel Espinoza a Telematics engineer...