diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..19a22c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__ +*.pyc +vqa diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..faff2c9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "resnet"] + path = resnet + url = https://github.com/Cyanogenoid/pytorch-resnet diff --git a/README.md b/README.md new file mode 100644 index 0000000..e2575f4 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Strong baseline for visual question answering + +This is a re-implementation of Vahid Kazemi and Ali Elqursh's paper [Show, Ask, Attend, and Answer: A Strong Baseline For Visual Question Answering][0] in [PyTorch][1]. + +The paper shows that with a relatively simple model, using only common building blocks in Deep Learning, you can get better accuracies than the majority of previously published work on the popular [VQA v1][2] dataset. + +This repository is intended to provide a straightforward implementation of the paper for other researchers to build on. +The results closely match the reported results, as the majority of details should be exactly the same as the paper. (Thanks to the authors for answering my questions about some details!) +This implementation seems to consistently converge to about 0.1% better results, but I am not aware of what implementation difference is causing this. + +A fully trained model (convergence shown below) is [available for download][5]. + +![Graph of convergence of implementation versus paper results](http://i.imgur.com/moWYEm8.png) + + +## Running the model + +- Clone this repository with: +``` +git clone https://github.com/Cyanogenoid/pytorch-vqa --recursive +``` +- Set the paths to your downloaded [questions, answers, and MS COCO images][4] in `config.py`. + - `qa_path` should contain the files `OpenEnded_mscoco_train2014_questions.json`, `OpenEnded_mscoco_val2014_questions.json`, `mscoco_train2014_annotations.json`, `mscoco_val2014_annotations.json`. + - `train_path`, `val_path`, `test_path` should contain the train, validation, and test `.jpg` images respectively. +- Pre-process images (93 GiB of free disk space required for f16 accuracy) with [ResNet152 weights ported from Caffe][3] and vocabularies for questions and answers with: +``` +python preprocess-images.py +python preprocess-vocab.py +``` +- Train the model in `model.py` with: +``` +python train.py +``` +This will alternate between one epoch of training on the train split and one epoch of validation on the validation split while printing the current training progress to stdout and saving logs in the `logs` directory. +The logs contain the name of the model, training statistics, contents of `config.py`, model weights, evaluation information (per-question answer and accuracy), and question and answer vocabularies. 
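For reference, the saved log is an ordinary `torch.save` file, so it can also be inspected directly. A minimal sketch (the file name below is only an example; use whatever path `train.py` printed when it started):

```python
import torch

# load a training log produced by train.py (the path here is hypothetical)
results = torch.load('logs/my-model.pth')
print(results.keys())        # name, tracker, config, weights, eval, vocab
print(results['config'])     # snapshot of config.py at training time
val_acc = results['tracker']['val_acc']  # one list of batch accuracies per validation epoch
```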
+- During training (which takes a while), plot the training progress with: +``` +python view-log.py +``` + + +## Python 3 dependencies (tested on Python 3.6.2) + +- torch +- torchvision +- h5py +- tqdm + + + +[0]: https://arxiv.org/abs/1704.03162 +[1]: https://github.com/pytorch/pytorch +[2]: http://visualqa.org/ +[3]: https://github.com/ruotianluo/pytorch-resnet +[4]: http://visualqa.org/vqa_v1_download.html +[5]: https://github.com/Cyanogenoid/pytorch-vqa/releases diff --git a/config.py b/config.py new file mode 100644 index 0000000..24dc7b8 --- /dev/null +++ b/config.py @@ -0,0 +1,25 @@ +# paths +qa_path = 'vqa' # directory containing the question and annotation jsons +train_path = 'mscoco/train2014' # directory of training images +val_path = 'mscoco/val2014' # directory of validation images +test_path = 'mscoco/test2015' # directory of test images +preprocessed_path = '/ssd/resnet-14x14.h5' # path where preprocessed features are saved to and loaded from +vocabulary_path = 'vocab.json' # path where the used vocabularies for question and answers are saved to + +task = 'OpenEnded' +dataset = 'mscoco' + +# preprocess config +preprocess_batch_size = 64 +image_size = 448 # scale shorter end of image to this size and centre crop +output_size = image_size // 32 # size of the feature maps after processing through a network +output_features = 2048 # number of feature maps thereof +central_fraction = 0.875 # only take this much of the centre when scaling and centre cropping + +# training config +epochs = 50 +batch_size = 128 +initial_lr = 1e-3 # default Adam lr +lr_halflife = 50000 # in iterations +data_workers = 8 +max_answers = 3000 diff --git a/data.py b/data.py new file mode 100644 index 0000000..9e90e34 --- /dev/null +++ b/data.py @@ -0,0 +1,256 @@ +import json +import os +import os.path +import re + +from PIL import Image +import h5py +import torch +import torch.utils.data as data +import torchvision.transforms as transforms + +import config +import utils + + +def get_loader(train=False, val=False, test=False): + """ Returns a data loader for the desired split """ + assert train + val + test == 1, 'need to set exactly one of {train, val, test} to True' + split = VQA( + utils.path_for(train=train, val=val, test=test, question=True), + utils.path_for(train=train, val=val, test=test, answer=True), + config.preprocessed_path, + answerable_only=train, + ) + loader = torch.utils.data.DataLoader( + split, + batch_size=config.batch_size, + shuffle=train, # only shuffle the data in training + pin_memory=True, + num_workers=config.data_workers, + collate_fn=collate_fn, + ) + return loader + + +def collate_fn(batch): + # put question lengths in descending order so that we can use packed sequences later + batch.sort(key=lambda x: x[-1], reverse=True) + return data.dataloader.default_collate(batch) + + +class VQA(data.Dataset): + """ VQA dataset, open-ended """ + def __init__(self, questions_path, answers_path, image_features_path, answerable_only=False): + super(VQA, self).__init__() + with open(questions_path, 'r') as fd: + questions_json = json.load(fd) + with open(answers_path, 'r') as fd: + answers_json = json.load(fd) + with open(config.vocabulary_path, 'r') as fd: + vocab_json = json.load(fd) + self._check_integrity(questions_json, answers_json) + + # vocab + self.vocab = vocab_json + self.token_to_index = self.vocab['question'] + self.answer_to_index = self.vocab['answer'] + + # q and a + self.questions = list(prepare_questions(questions_json)) + self.answers = 
list(prepare_answers(answers_json)) + self.questions = [self._encode_question(q) for q in self.questions] + self.answers = [self._encode_answers(a) for a in self.answers] + + # v + self.image_features_path = image_features_path + self.coco_id_to_index = self._create_coco_id_to_index() + self.coco_ids = [q['image_id'] for q in questions_json['questions']] + + # only use questions that have at least one answer? + self.answerable_only = answerable_only + if self.answerable_only: + self.answerable = self._find_answerable() + + @property + def max_question_length(self): + if not hasattr(self, '_max_length'): + self._max_length = max(map(len, self.questions)) + return self._max_length + + @property + def num_tokens(self): + return len(self.token_to_index) + 1 # add 1 for token at index 0 + + def _create_coco_id_to_index(self): + """ Create a mapping from a COCO image id into the corresponding index into the h5 file """ + with h5py.File(self.image_features_path, 'r') as features_file: + coco_ids = features_file['ids'][()] + coco_id_to_index = {id: i for i, id in enumerate(coco_ids)} + return coco_id_to_index + + def _check_integrity(self, questions, answers): + """ Verify that we are using the correct data """ + qa_pairs = list(zip(questions['questions'], answers['annotations'])) + assert all(q['question_id'] == a['question_id'] for q, a in qa_pairs), 'Questions not aligned with answers' + assert all(q['image_id'] == a['image_id'] for q, a in qa_pairs), 'Image id of question and answer don\'t match' + assert questions['data_type'] == answers['data_type'], 'Mismatched data types' + assert questions['data_subtype'] == answers['data_subtype'], 'Mismatched data subtypes' + + def _find_answerable(self): + """ Create a list of indices into questions that will have at least one answer that is in the vocab """ + answerable = [] + for i, answers in enumerate(self.answers): + answer_has_index = len(answers.nonzero()) > 0 + # store the indices of anything that is answerable + if answer_has_index: + answerable.append(i) + return answerable + + def _encode_question(self, question): + """ Turn a question into a vector of indices and a question length """ + vec = torch.zeros(self.max_question_length).long() + for i, token in enumerate(question): + index = self.token_to_index.get(token, 0) + vec[i] = index + return vec, len(question) + + def _encode_answers(self, answers): + """ Turn an answer into a vector """ + # answer vec will be a vector of answer counts to determine which answers will contribute to the loss. + # this should be multiplied with 0.1 * negative log-likelihoods that a model produces and then summed up + # to get the loss that is weighted by how many humans gave that answer + answer_vec = torch.zeros(len(self.answer_to_index)) + for answer in answers: + index = self.answer_to_index.get(answer) + if index is not None: + answer_vec[index] += 1 + return answer_vec + + def _load_image(self, image_id): + """ Load an image """ + if not hasattr(self, 'features_file'): + # Loading the h5 file has to be done here and not in __init__ because when the DataLoader + # forks for multiple works, every child would use the same file object and fail + # Having multiple readers using different file objects is fine though, so we just init in here. 
+ self.features_file = h5py.File(self.image_features_path, 'r') + index = self.coco_id_to_index[image_id] + dataset = self.features_file['features'] + img = dataset[index].astype('float32') + return torch.from_numpy(img) + + def __getitem__(self, item): + if self.answerable_only: + # change of indices to only address answerable questions + item = self.answerable[item] + + q, q_length = self.questions[item] + a = self.answers[item] + image_id = self.coco_ids[item] + v = self._load_image(image_id) + # since batches are re-ordered for PackedSequence's, the original question order is lost + # we return `item` so that the order of (v, q, a) triples can be restored if desired + # without shuffling in the dataloader, these will be in the order that they appear in the q and a json's. + return v, q, a, item, q_length + + def __len__(self): + if self.answerable_only: + return len(self.answerable) + else: + return len(self.questions) + + +# this is used for normalizing questions +_special_chars = re.compile('[^a-z0-9 ]*') + +# these try to emulate the original normalisation scheme for answers +_period_strip = re.compile(r'(?!<=\d)(\.)(?!\d)') +_comma_strip = re.compile(r'(\d)(,)(\d)') +_punctuation_chars = re.escape(r';/[]"{}()=+\_-><@`,?!') +_punctuation = re.compile(r'([{}])'.format(re.escape(_punctuation_chars))) +_punctuation_with_a_space = re.compile(r'(?<= )([{0}])|([{0}])(?= )'.format(_punctuation_chars)) + + +def prepare_questions(questions_json): + """ Tokenize and normalize questions from a given question json in the usual VQA format. """ + questions = [q['question'] for q in questions_json['questions']] + for question in questions: + question = question.lower()[:-1] + yield question.split(' ') + + +def prepare_answers(answers_json): + """ Normalize answers from a given answer json in the usual VQA format. """ + answers = [[a['answer'] for a in ans_dict['answers']] for ans_dict in answers_json['annotations']] + # The only normalisation that is applied to both machine generated answers as well as + # ground truth answers is replacing most punctuation with space (see [0] and [1]). + # Since potential machine generated answers are just taken from most common answers, applying the other + # normalisations is not needed, assuming that the human answers are already normalized. 
+ # [0]: http://visualqa.org/evaluation.html + # [1]: https://github.com/VT-vision-lab/VQA/blob/3849b1eae04a0ffd83f56ad6f70ebd0767e09e0f/PythonEvaluationTools/vqaEvaluation/vqaEval.py#L96 + + def process_punctuation(s): + # the original is somewhat broken, so things that look odd here might just be to mimic that behaviour + # this version should be faster since we use re instead of repeated operations on str's + if _punctuation.search(s) is None: + return s + s = _punctuation_with_a_space.sub('', s) + if re.search(_comma_strip, s) is not None: + s = s.replace(',', '') + s = _punctuation.sub(' ', s) + s = _period_strip.sub('', s) + return s.strip() + + for answer_list in answers: + yield list(map(process_punctuation, answer_list)) + + +class CocoImages(data.Dataset): + """ Dataset for MSCOCO images located in a folder on the filesystem """ + def __init__(self, path, transform=None): + super(CocoImages, self).__init__() + self.path = path + self.id_to_filename = self._find_images() + self.sorted_ids = sorted(self.id_to_filename.keys()) # used for deterministic iteration order + print('found {} images in {}'.format(len(self), self.path)) + self.transform = transform + + def _find_images(self): + id_to_filename = {} + for filename in os.listdir(self.path): + if not filename.endswith('.jpg'): + continue + id_and_extension = filename.split('_')[-1] + id = int(id_and_extension.split('.')[0]) + id_to_filename[id] = filename + return id_to_filename + + def __getitem__(self, item): + id = self.sorted_ids[item] + path = os.path.join(self.path, self.id_to_filename[id]) + img = Image.open(path).convert('RGB') + + if self.transform is not None: + img = self.transform(img) + return id, img + + def __len__(self): + return len(self.sorted_ids) + + +class Composite(data.Dataset): + """ Dataset that is a composite of several Dataset objects. Useful for combining splits of a dataset. 
""" + def __init__(self, *datasets): + self.datasets = datasets + + def __getitem__(self, item): + current = self.datasets[0] + for d in self.datasets: + if item < len(d): + return d[item] + item -= len(d) + else: + raise IndexError('Index too large for composite dataset') + + def __len__(self): + return sum(map(len, self.datasets)) diff --git a/logs/.dummy b/logs/.dummy new file mode 100644 index 0000000..e69de29 diff --git a/model.py b/model.py new file mode 100644 index 0000000..63bac84 --- /dev/null +++ b/model.py @@ -0,0 +1,155 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from torch.nn.utils.rnn import pack_padded_sequence + +import config + + +class Net(nn.Module): + """ Re-implementation of ``Show, Ask, Attend, and Answer: A Strong Baseline For Visual Question Answering'' [0] + + [0]: https://arxiv.org/abs/1704.03162 + """ + + def __init__(self, embedding_tokens): + super(Net, self).__init__() + question_features = 1024 + vision_features = config.output_features + glimpses = 2 + + self.text = TextProcessor( + embedding_tokens=embedding_tokens, + embedding_features=300, + lstm_features=question_features, + drop=0.5, + ) + self.attention = Attention( + v_features=vision_features, + q_features=question_features, + mid_features=512, + glimpses=2, + drop=0.5, + ) + self.classifier = Classifier( + in_features=glimpses * vision_features + question_features, + mid_features=1024, + out_features=config.max_answers, + drop=0.5, + ) + + for m in self.modules(): + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + init.xavier_uniform(m.weight) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, v, q, q_len): + q = self.text(q, list(q_len.data)) + + v = v / (v.norm(p=2, dim=1).expand_as(v) + 1e-8) + a = self.attention(v, q) + v = apply_attention(v, a) + + combined = torch.cat([v, q], dim=1) + answer = self.classifier(combined) + return answer + + +class Classifier(nn.Sequential): + def __init__(self, in_features, mid_features, out_features, drop=0.0): + super(Classifier, self).__init__() + self.add_module('drop1', nn.Dropout(drop)) + self.add_module('lin1', nn.Linear(in_features, mid_features)) + self.add_module('relu', nn.ReLU()) + self.add_module('drop2', nn.Dropout(drop)) + self.add_module('lin2', nn.Linear(mid_features, out_features)) + + +class TextProcessor(nn.Module): + def __init__(self, embedding_tokens, embedding_features, lstm_features, drop=0.0): + super(TextProcessor, self).__init__() + self.embedding = nn.Embedding(embedding_tokens, embedding_features, padding_idx=0) + self.drop = nn.Dropout(drop) + self.tanh = nn.Tanh() + self.lstm = nn.LSTM(input_size=embedding_features, + hidden_size=lstm_features, + num_layers=1) + self.features = lstm_features + + self._init_lstm(self.lstm.weight_ih_l0) + self._init_lstm(self.lstm.weight_hh_l0) + self.lstm.bias_ih_l0.data.zero_() + self.lstm.bias_hh_l0.data.zero_() + + init.xavier_uniform(self.embedding.weight) + + def _init_lstm(self, weight): + for w in weight.chunk(4, 0): + init.xavier_uniform(w) + + def forward(self, q, q_len): + embedded = self.embedding(q) + tanhed = self.tanh(self.drop(embedded)) + packed = pack_padded_sequence(tanhed, q_len, batch_first=True) + _, (_, c) = self.lstm(packed) + return c.squeeze(0) + + +class Attention(nn.Module): + def __init__(self, v_features, q_features, mid_features, glimpses, drop=0.0): + super(Attention, self).__init__() + self.v_conv = nn.Conv2d(v_features, mid_features, 1, bias=False) # let self.lin take care of 
bias + self.q_lin = nn.Linear(q_features, mid_features) + self.x_conv = nn.Conv2d(mid_features, glimpses, 1) + + self.drop = nn.Dropout(drop) + self.relu = nn.ReLU(inplace=True) + + def forward(self, v, q): + v = self.v_conv(self.drop(v)) + q = self.q_lin(self.drop(q)) + q = tile_2d_over_nd(q, v) + x = self.relu(v + q) + x = self.x_conv(self.drop(x)) + return x + + +def apply_attention(input, attention): + """ Apply any number of attention maps over the input. + The attention map has to have the same size in all dimensions except dim=1. + """ + n, c = input.size()[:2] + glimpses = attention.size(1) + + # flatten the spatial dims into the third dim, since we don't need to care about how they are arranged + input = input.view(n, c, -1) + attention = attention.view(n, glimpses, -1) + s = input.size(2) + + # apply a softmax to each attention map separately + # since softmax only takes 2d inputs, we have to collapse the first two dimensions together + # so that each glimpse is normalised separately + attention = attention.view(n * glimpses, -1) + attention = F.softmax(attention) + + # apply the weighting by creating a new dim to tile both tensors over + target_size = [n, glimpses, c, s] + input = input.view(n, 1, c, s).expand(*target_size) + attention = attention.view(n, glimpses, 1, s).expand(*target_size) + weighted = input * attention + # sum over only the spatial dimension + weighted_mean = weighted.sum(dim=3) + # the shape at this point is (n, glimpses, c, 1) + return weighted_mean.view(n, -1) + + +def tile_2d_over_nd(feature_vector, feature_map): + """ Repeat the same feature vector over all spatial positions of a given feature map. + The feature vector should have the same batch size and number of features as the feature map. + """ + n, c = feature_vector.size() + spatial_size = feature_map.dim() - 2 + tiled = feature_vector.view(n, c, *([1] * spatial_size)).expand_as(feature_map) + return tiled diff --git a/preprocess-images.py b/preprocess-images.py new file mode 100644 index 0000000..c1ad976 --- /dev/null +++ b/preprocess-images.py @@ -0,0 +1,73 @@ +import h5py +from torch.autograd import Variable +import torch.nn as nn +import torch.backends.cudnn as cudnn +import torch.utils.data +import torchvision.models as models +from tqdm import tqdm + +import config +import data +import utils +from resnet import resnet as caffe_resnet + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.model = caffe_resnet.resnet152(pretrained=True) + + def save_output(module, input, output): + self.buffer = output + self.model.layer4.register_forward_hook(save_output) + + def forward(self, x): + self.model(x) + return self.buffer + + +def create_coco_loader(*paths): + transform = utils.get_transform(config.image_size, config.central_fraction) + datasets = [data.CocoImages(path, transform=transform) for path in paths] + dataset = data.Composite(*datasets) + data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=config.preprocess_batch_size, + num_workers=config.data_workers, + shuffle=False, + pin_memory=True, + ) + return data_loader + + +def main(): + cudnn.benchmark = True + + net = Net().cuda() + net.eval() + + loader = create_coco_loader(config.train_path, config.val_path) + features_shape = ( + len(loader.dataset), + config.output_features, + config.output_size, + config.output_size + ) + + with h5py.File(config.preprocessed_path, libver='latest') as fd: + features = fd.create_dataset('features', shape=features_shape, dtype='float16') + coco_ids = 
fd.create_dataset('ids', shape=(len(loader.dataset),), dtype='int32') + + i = j = 0 + for ids, imgs in tqdm(loader): + imgs = Variable(imgs.cuda(async=True), volatile=True) + out = net(imgs) + + j = i + imgs.size(0) + features[i:j, :, :] = out.data.cpu().numpy().astype('float16') + coco_ids[i:j] = ids.numpy().astype('int32') + i = j + + +if __name__ == '__main__': + main() diff --git a/preprocess-vocab.py b/preprocess-vocab.py new file mode 100644 index 0000000..930b0e3 --- /dev/null +++ b/preprocess-vocab.py @@ -0,0 +1,51 @@ +import json +from collections import Counter +import itertools + +import config +import data +import utils + + +def extract_vocab(iterable, top_k=None, start=0): + """ Turns an iterable of list of tokens into a vocabulary. + These tokens could be single answers or word tokens in questions. + """ + all_tokens = itertools.chain.from_iterable(iterable) + counter = Counter(all_tokens) + if top_k: + most_common = counter.most_common(top_k) + most_common = (t for t, c in most_common) + else: + most_common = counter.keys() + # descending in count, then lexicographical order + tokens = sorted(most_common, key=lambda x: (counter[x], x), reverse=True) + vocab = {t: i for i, t in enumerate(tokens, start=start)} + return vocab + + +def main(): + questions = utils.path_for(train=True, question=True) + answers = utils.path_for(train=True, answer=True) + + with open(questions, 'r') as fd: + questions = json.load(fd) + with open(answers, 'r') as fd: + answers = json.load(fd) + + questions = data.prepare_questions(questions) + answers = data.prepare_answers(answers) + + question_vocab = extract_vocab(questions, start=1) + answer_vocab = extract_vocab(answers, top_k=config.max_answers) + + vocabs = { + 'question': question_vocab, + 'answer': answer_vocab, + } + with open(config.vocabulary_path, 'w') as fd: + json.dump(vocabs, fd) + + +if __name__ == '__main__': + main() diff --git a/resnet b/resnet new file mode 160000 index 0000000..9332392 --- /dev/null +++ b/resnet @@ -0,0 +1 @@ +Subproject commit 9332392b01317d57e92f81e00933c48f423ff503 diff --git a/train.py b/train.py new file mode 100644 index 0000000..357bb6d --- /dev/null +++ b/train.py @@ -0,0 +1,128 @@ +import sys +import os.path +import math +import json + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.autograd import Variable +import torch.backends.cudnn as cudnn +from tqdm import tqdm + +import config +import data +import model +import utils + + +def update_learning_rate(optimizer, iteration): + lr = config.initial_lr * 0.5**(float(iteration) / config.lr_halflife) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +total_iterations = 0 + + +def run(net, loader, optimizer, tracker, train=False, prefix='', epoch=0): + """ Run an epoch over the given loader """ + if train: + net.train() + tracker_class, tracker_params = tracker.MovingMeanMonitor, {'momentum': 0.99} + else: + net.eval() + tracker_class, tracker_params = tracker.MeanMonitor, {} + answ = [] + idxs = [] + accs = [] + + tq = tqdm(loader, desc='{} E{:03d}'.format(prefix, epoch), ncols=0) + loss_tracker = tracker.track('{}_loss'.format(prefix), tracker_class(**tracker_params)) + acc_tracker = tracker.track('{}_acc'.format(prefix), tracker_class(**tracker_params)) + + log_softmax = nn.LogSoftmax().cuda() + for v, q, a, idx, q_len in tq: + var_params = { + 'volatile': not train, + 'requires_grad': False, + } + v = Variable(v.cuda(async=True), **var_params) + q = Variable(q.cuda(async=True), **var_params) + a = 
Variable(a.cuda(async=True), **var_params) + q_len = Variable(q_len.cuda(async=True), **var_params) + + out = net(v, q, q_len) + nll = -log_softmax(out) + loss = (nll * a / 10).sum(dim=1).mean() + acc = utils.batch_accuracy(out.data, a.data).cpu() + + if train: + global total_iterations + update_learning_rate(optimizer, total_iterations) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + total_iterations += 1 + else: + # store information about evaluation of this minibatch + _, answer = out.data.cpu().max(dim=1) + answ.append(answer.view(-1)) + accs.append(acc.view(-1)) + idxs.append(idx.view(-1).clone()) + + loss_tracker.append(loss.data[0]) + acc_tracker.append(acc.mean()) + fmt = '{:.4f}'.format + tq.set_postfix(loss=fmt(loss_tracker.mean.value), acc=fmt(acc_tracker.mean.value)) + + if not train: + answ = list(torch.cat(answ, dim=0)) + accs = list(torch.cat(accs, dim=0)) + idxs = list(torch.cat(idxs, dim=0)) + return answ, accs, idxs + + +def main(): + if len(sys.argv) > 1: + name = ' '.join(sys.argv[1:]) + else: + from datetime import datetime + name = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + target_name = os.path.join('logs', '{}.pth'.format(name)) + print('will save to {}'.format(target_name)) + + cudnn.benchmark = True + + train_loader = data.get_loader(train=True) + val_loader = data.get_loader(val=True) + + net = nn.DataParallel(model.Net(train_loader.dataset.num_tokens)).cuda() + optimizer = optim.Adam([p for p in net.parameters() if p.requires_grad]) + + tracker = utils.Tracker() + config_as_dict = {k: v for k, v in vars(config).items() if not k.startswith('__')} + + for i in range(config.epochs): + _ = run(net, train_loader, optimizer, tracker, train=True, prefix='train', epoch=i) + r = run(net, val_loader, optimizer, tracker, train=False, prefix='val', epoch=i) + + results = { + 'name': name, + 'tracker': tracker.to_dict(), + 'config': config_as_dict, + 'weights': net.state_dict(), + 'eval': { + 'answers': r[0], + 'accuracies': r[1], + 'idx': r[2], + }, + 'vocab': train_loader.dataset.vocab, + } + torch.save(results, target_name) + + +if __name__ == '__main__': + main() diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..06319a4 --- /dev/null +++ b/utils.py @@ -0,0 +1,136 @@ +import os +import json + +import torch +import torch.nn as nn +import torchvision.transforms as transforms + +import config + + +def batch_accuracy(predicted, true): + """ Compute the accuracies for a batch of predictions and answers """ + _, predicted_index = predicted.max(dim=1) + agreeing = true.gather(dim=1, index=predicted_index) + ''' + Acc needs to be averaged over all 10 choose 9 subsets of human answers. + While we could just use a loop, surely this can be done more efficiently (and indeed, it can). + There are two cases for the 1 chosen answer to be discarded: + (1) the discarded answer is not the predicted answer => acc stays the same + (2) the discarded answer is the predicted answer => we have to subtract 1 from the number of agreeing answers + + There are (10 - num_agreeing_answers) of case 1 and num_agreeing_answers of case 2, thus + acc = ((10 - agreeing) * min( agreeing / 3, 1) + + agreeing * min((agreeing - 1) / 3, 1)) / 10 + + Let's do some more simplification: + if num_agreeing_answers == 0: + acc = 0 since the case 1 min term becomes 0 and case 2 weighting term is 0 + if num_agreeing_answers >= 4: + acc = 1 since the min term in both cases is always 1 + The only cases left are for 1, 2, and 3 agreeing answers. 
+ In all of those cases, (agreeing - 1) / 3 < agreeing / 3 <= 1, so we can get rid of all the mins. + By moving num_agreeing_answers from both cases outside the sum we get: + acc = agreeing * ((10 - agreeing) + (agreeing - 1)) / 3 / 10 + which we can simplify to: + acc = agreeing * 0.3 + Finally, we can combine all cases together with: + min(agreeing * 0.3, 1) + ''' + return (agreeing * 0.3).clamp(max=1) + + +def path_for(train=False, val=False, test=False, question=False, answer=False): + assert train + val + test == 1 + assert question + answer == 1 + assert not (test and answer), 'loading answers from test split not supported' # if you want to eval on test, you need to implement loading of a VQA Dataset without given answers yourself + if train: + split = 'train2014' + elif val: + split = 'val2014' + else: + split = 'test2015' + if question: + fmt = '{0}_{1}_{2}_questions.json' + else: + fmt = '{1}_{2}_annotations.json' + s = fmt.format(config.task, config.dataset, split) + return os.path.join(config.qa_path, s) + + +class Tracker: + """ Keep track of results over time, while having access to monitors to display information about them. """ + def __init__(self): + self.data = {} + + def track(self, name, *monitors): + """ Track a set of results with given monitors under some name (e.g. 'val_acc'). + When appending to the returned list storage, use the monitors to retrieve useful information. + """ + l = Tracker.ListStorage(monitors) + self.data.setdefault(name, []).append(l) + return l + + def to_dict(self): + # turn list storages into regular lists + return {k: list(map(list, v)) for k, v in self.data.items()} + + + class ListStorage: + """ Storage of data points that updates the given monitors """ + def __init__(self, monitors=[]): + self.data = [] + self.monitors = monitors + for monitor in self.monitors: + setattr(self, monitor.name, monitor) + + def append(self, item): + for monitor in self.monitors: + monitor.update(item) + self.data.append(item) + + def __iter__(self): + return iter(self.data) + + class MeanMonitor: + """ Take the mean over the given values """ + name = 'mean' + + def __init__(self): + self.n = 0 + self.total = 0 + + def update(self, value): + self.total += value + self.n += 1 + + @property + def value(self): + return self.total / self.n + + class MovingMeanMonitor: + """ Take an exponentially moving mean over the given values """ + name = 'mean' + + def __init__(self, momentum=0.9): + self.momentum = momentum + self.first = True + self.value = None + + def update(self, value): + if self.first: + self.value = value + self.first = False + else: + m = self.momentum + self.value = m * self.value + (1 - m) * value + + +def get_transform(target_size, central_fraction=1.0): + return transforms.Compose([ + transforms.Scale(int(target_size / central_fraction)), + transforms.CenterCrop(target_size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]) diff --git a/view-log.py b/view-log.py new file mode 100644 index 0000000..add6acf --- /dev/null +++ b/view-log.py @@ -0,0 +1,20 @@ +import sys +import torch +import matplotlib; matplotlib.use('agg') +import matplotlib.pyplot as plt + + +def main(): + path = sys.argv[1] + results = torch.load(path) + + val_acc = torch.FloatTensor(results['tracker']['val_acc']) + val_acc = val_acc.mean(dim=1).numpy() + + plt.figure() + plt.plot(val_acc) + plt.savefig('val_acc.png') + + +if __name__ == '__main__': + main()
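The loss in `train.py`, `(nll * a / 10).sum(dim=1).mean()`, is a soft-target cross entropy: each candidate answer's negative log-likelihood is weighted by the fraction of the ten annotators who gave that answer (see `VQA._encode_answers`). A minimal standalone sketch of the same computation with toy numbers, using the functional API rather than the `nn.LogSoftmax` module used above:

```python
import torch
import torch.nn.functional as F

out = torch.randn(2, 5)                    # logits for 2 questions over 5 candidate answers
a = torch.Tensor([[0., 7., 3., 0., 0.],    # 7 of 10 annotators gave answer 1, 3 gave answer 2
                  [10., 0., 0., 0., 0.]])  # unanimous answer 0
nll = -F.log_softmax(out, dim=1)
loss = (nll * a / 10).sum(dim=1).mean()    # same expression as in train.py
print(loss)
```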
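The closed form `min(0.3 * agreeing, 1)` derived in `utils.batch_accuracy` can be checked by brute force against the definition of VQA accuracy, which averages over the ten leave-one-out subsets of the ten human answers. A small standalone sketch:

```python
def leave_one_out_accuracy(num_agreeing):
    """ VQA accuracy by definition: average min(#agreeing kept answers / 3, 1)
    over the 10 ways of dropping one of the 10 human answers. """
    total = 0.0
    for dropped in range(10):
        # the first num_agreeing drops remove an agreeing answer, the rest do not
        kept = num_agreeing - 1 if dropped < num_agreeing else num_agreeing
        total += min(kept / 3, 1)
    return total / 10

for agreeing in range(11):
    assert abs(leave_one_out_accuracy(agreeing) - min(0.3 * agreeing, 1)) < 1e-9
print('closed form matches the 10-choose-9 average')
```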