Training and Deploying an Image Captioning System
Demo
Upload your own photo to be captioned:
For the rest of this post I show an end-to-end training of the captioning system in a reproducible Jupyter-notebook style. The notebook was run on Google Colab on a high-RAM, GPU-accelerated runtime. All code for training and deployment is also available here.
Download the 2014 train/val images and caption annotations from the COCO site.
!curl -o "annotations_trainval2014.zip" http://images.cocodataset.org/annotations/annotations_trainval2014.zip
!unzip "annotations_trainval2014.zip"
!curl -o "train2014.zip" http://images.cocodataset.org/zips/train2014.zip
!curl -o "val2014.zip" http://images.cocodataset.org/zips/val2014.zip
Import the libraries needed for the initial data processing.
import matplotlib.pyplot as plt
from io import BytesIO
import cv2
import json
import numpy as np
import zipfile
import torchtext
import string
import torch
Now I'm going to iterate through all the pictures in the archives and resize them to 224x224, preserving the aspect ratio by padding with black pixels. I'll save each resized picture into a NumPy array to use later when building the model.
base_path = "."
ds_to_fn = {'train':'train2014.zip','val':'val2014.zip'}
size = 224
def pad_image(img, height, width):
    # Pad the image with black pixels on the bottom and right to reach (height, width).
    h, w = img.shape[:2]
    t = 0
    b = height - h
    l = 0
    r = width - w
    return cv2.copyMakeBorder(img, t, b, l, r,
                              cv2.BORDER_CONSTANT, value=0)
def resize_and_pad(img, height, width, resample=cv2.INTER_AREA):
    # Convert grayscale images to 3 channels.
    if len(img.shape) == 2:
        img = np.stack([img, img, img], axis=2)
    target_aspect_ratio = height/width
    im_h, im_w, _ = img.shape
    im_aspect_ratio = im_h/im_w
    # Scale the limiting dimension to the target size, keeping the aspect ratio.
    if im_aspect_ratio > target_aspect_ratio:
        target_height = height
        target_width = int(im_w * target_height/im_h)
    else:
        target_width = width
        target_height = int(im_h * target_width/im_w)
    resized = cv2.resize(img, (target_width, target_height),
                         interpolation=resample)
    return pad_image(resized, height, width)
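To sanity-check the resizing helper, a small synthetic image can be run through it; for instance, a wide white image should come back as 224x224 with black padding rows at the bottom (a quick check, not part of the pipeline):
dummy = np.full((300, 600, 3), 255, dtype=np.uint8)  # synthetic wide white image
out = resize_and_pad(dummy, size, size)
print(out.shape)                            # (224, 224, 3)
print(out[:100].mean(), out[150:].mean())   # ~255 in the resized top rows, 0 in the padded bottom rows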
pics, im_fn_to_index = {}, {}
for ds in ['train', 'val']:
    fn = ds_to_fn[ds]
    archive = zipfile.ZipFile(f"{base_path}/{fn}")
    file_list = archive.filelist
    # One entry in the archive is the directory itself, hence the -1.
    pics[ds] = np.zeros((len(file_list)-1, size, size, 3), dtype=np.uint8)
    im_fn_to_index[ds] = {}
    for file_obj in file_list:
        im_fn = file_obj.filename
        if not im_fn.endswith('.jpg'):
            continue
        with archive.open(file_obj) as open_file:
            res = BytesIO(open_file.read())
            pic = plt.imread(res, 'jpg')
            ind = len(im_fn_to_index[ds])
            pics[ds][ind] = resize_and_pad(pic, size, size)
            im_fn_to_index[ds][im_fn] = ind
    archive.close()
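A quick shape check (the exact counts depend on the downloaded archives) confirms that every image in each split was resized and indexed:
for ds in ['train', 'val']:
    print(ds, pics[ds].shape, len(im_fn_to_index[ds]))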
For word embeddings I use the pretrained GloVe vectors that are downloadable through torchtext. I take only the 100k most common words, since rarer words would add dimensions to the output space without much benefit: most words in the captions are simple and therefore common. I also exclude any word that contains a digit, punctuation other than a dash or apostrophe, or an uppercase letter, since I will only use lowercase text.
vocab = torchtext.vocab.GloVe(name='840B', dim=300, max_vectors=100000)
punctuation = set(c for c in string.punctuation if c not in "-'")
digits = set(str(i) for i in range(10))
inds_to_use = []
words = set()
for i, word in enumerate(vocab.itos):
    # Keep only lowercase words with no digits and no punctuation besides - and '.
    if not any(c in punctuation or c in digits for c in word):
        if not all(c in "-'" for c in word) and word.islower():
            inds_to_use.append(i)
            words.add(word)
vocab.itos = np.array(vocab.itos)[inds_to_use]
vocab.stoi = {s:i for i,s in enumerate(vocab.itos)}
vocab.vectors = vocab.vectors[inds_to_use]
## size of remaining vocab
len(vocab.stoi),len(vocab.itos),vocab.vectors.size()
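To spot-check the filtering, a common lowercase word should still be in the vocabulary while capitalized or punctuated tokens should not (the specific words here are just illustrative):
print('dog' in vocab.stoi)     # True: common lowercase word kept
print('Dog' in vocab.stoi)     # False: uppercase forms were dropped
print('u.s.' in vocab.stoi)    # False: punctuation other than - and ' was dropped
print(vocab.vectors[vocab.stoi['dog']].shape)   # torch.Size([300])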
Install a couple of libraries that I use for pretrained building blocks (EfficientNet and Hugging Face Transformers).
!pip install efficientnet_pytorch
!pip install transformers
Import everything needed to define and train the neural network.
import torch.nn as nn
import torchvision
import efficientnet_pytorch
import transformers
import scipy.stats
base_path = "."
Load all the data and labels, and define all the hash maps (lookup dictionaries) that I will use later during training.
with open(f'{base_path}/annotations/captions_train2014.json','r') as f:
    annot_train = json.load(f)
with open(f'{base_path}/annotations/captions_val2014.json','r') as f:
    annot_val = json.load(f)
LONGEST_CAPTION = max(len(d['caption'].split())
                      for d in annot_train['annotations'] +
                      annot_val['annotations'])
LONGEST_CAPTION
train_pics = pics['train']
val_pics = pics['val']
train_immap = im_fn_to_index['train']
val_immap = im_fn_to_index['val']
train_fn_to_index = {key.split('/')[1]: val
                     for key, val in train_immap.items()}
train_index_to_fn = {val: key for key, val in train_fn_to_index.items()}
val_fn_to_index = {key.split('/')[1]: val for key, val in val_immap.items()}
val_index_to_fn = {val: key for key, val in val_fn_to_index.items()}
train_imfn_to_imid = {d['file_name']: d['id']
                      for d in annot_train['images']}
# Each image has several reference captions; this keeps the last one seen per image id.
train_imid_to_caption = {d['image_id']: d['caption']
                         for d in annot_train['annotations']}
train_imfn_to_caption = {fn: train_imid_to_caption[id_]
                         for fn, id_ in train_imfn_to_imid.items()}
val_imfn_to_imid = {d['file_name']: d['id'] for d in annot_val['images']}
val_imid_to_caption = {d['image_id']: d['caption']
                       for d in annot_val['annotations']}
val_imfn_to_caption = {fn: val_imid_to_caption[id_]
                       for fn, id_ in val_imfn_to_imid.items()}
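As a quick consistency check (the exact filename and caption will differ), a training image can be followed through the lookup tables:
fn = train_index_to_fn[0]               # first image stored in the training array
print(fn, '->', train_fn_to_index[fn])  # round-trips back to index 0
print(train_imfn_to_caption[fn])        # one of its reference captions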
Define a function to show training images with their provided captions from the training annotations, then view 10 random images and their captions.
def show_im_and_cap_train(indexes):
    # Display each requested training image with its reference caption as the title.
    for index in indexes:
        imfn = train_index_to_fn[index]
        caption = train_imfn_to_caption[imfn]
        plt.figure(figsize=(7,7))
        plt.imshow(train_pics[index])
        plt.title(caption)
        plt.show()
show_im_and_cap_train(np.random.randint(0,len(train_pics),10))