from copy import deepcopy
from functools import partial

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.utils import save_image

from model import *
from utils import *


class AdaINLoss(nn.Module):
    def __init__(self, lambda_):
        super().__init__()
        # lambda_ is a hyperparameter that dictates the relative importance of
        # content vs. style: the greater lambda_, the harder the model tries to
        # match style; the smaller, the harder it tries to preserve content.
        # In symbols (Eq. 11 of the paper): L = L_c + lambda_ * L_s.
        self.lambda_ = lambda_

    def contentLoss(self, content_emb, output_emb):
        """Takes two embedding tensors generated by VGG and returns the L2 norm
        (i.e. the Euclidean distance) between them. [See Eq. 12 of the paper.]"""
        return torch.norm(content_emb - output_emb)
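
    # Eq. 12 in symbols, where f is the VGG encoder, g the decoder and t the
    # AdaIN target (the training loop below passes model.t as content_emb):
    #   L_c = || f(g(t)) - t ||_2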

    def styleLoss(self, style_activations, output_activations):
        """Takes two lists of activation tensors hooked from VGG layers during
        forward passes using our style image and our output image as inputs.
        Computes the L2 norm between each pair's means and standard deviations
        and returns the sum. [See Eq. 13 of the paper.]"""
        mu_sum = 0
        sigma_sum = 0
        for style_act, output_act in zip(style_activations, output_activations):
            # Sum the statistics losses across all hooked layers
            mu_sum += torch.norm(mu(style_act) - mu(output_act))
            sigma_sum += torch.norm(sigma(style_act) - sigma(output_act))
        return mu_sum + sigma_sum
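
    # Eq. 13 in symbols, summing over the hooked VGG layers phi_i, with s the
    # style image and g(t) the decoded output:
    #   L_s = sum_i ||mu(phi_i(s)) - mu(phi_i(g(t)))||_2
    #       + sum_i ||sigma(phi_i(s)) - sigma(phi_i(g(t)))||_2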

    def totalLoss(self, content_emb, output_emb, style_activations, output_activations):
        """Calculates the overall loss. [See Eq. 11 of the paper.]"""
        content_loss = self.contentLoss(content_emb, output_emb)
        style_loss = self.styleLoss(style_activations, output_activations)
        return content_loss + self.lambda_ * style_loss

    def forward(self, content_emb, output_emb, style_activations, output_activations):
        """Returns the total loss averaged over the batch; to calculate a
        single-image loss, pass arguments with a batch size of 1."""
        total = self.totalLoss(content_emb, output_emb, style_activations, output_activations)
        return total / content_emb.shape[0]
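

# A minimal smoke test for AdaINLoss (a sketch: shapes are illustrative, and
# `mu`/`sigma` are assumed to come from utils and return per-channel
# statistics). Uncomment to sanity-check the loss on random tensors:
#
#   _criterion = AdaINLoss(lambda_=7.5)
#   _emb = torch.randn(2, 512, 32, 32)            # stand-in for a VGG embedding
#   _acts = [torch.randn(2, 64, 128, 128)] * 4    # stand-ins for hooked activations
#   _loss = _criterion(_emb, _emb * 0.9, _acts, [a * 0.9 for a in _acts])
#   print(_loss.item())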


if __name__ == "__main__":
    # Set hyperparameters
    bs = 1
    epochs = 5
    lr = 6e-4
    wd = 0.001
    lambda_ = 7.5
    alpha = 0.1                            # smoothing factor for the running-loss average
    style_layers = ['1', '6', '10', '20']  # vgg layer indices hooked for the style loss
    debug_layers = [0, 3, 5, 7]            # decoder layer indices hooked for nan debugging
    # Load content and style datasets
    content_images = DataLoader(ImageDataset('./train/content'), batch_size=bs, shuffle=True, num_workers=0)
    style_images = DataLoader(ImageDataset('./train/style'), batch_size=bs, shuffle=True, num_workers=0)
    # Set cost function
    criterion = AdaINLoss(lambda_)
    # Load the model and freeze the VGG encoder; only the decoder is trained
    model = AdaINStyle()
    for p in model.vgg.parameters():
        p.requires_grad = False
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
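    # The frozen VGG weights have requires_grad=False and so never receive
    # gradients; Adam simply skips them. An equivalent, slightly leaner
    # alternative (a sketch) is to hand the optimizer only trainable parameters:
    #
    #   optimizer = torch.optim.Adam(
    #       (p for p in model.parameters() if p.requires_grad),
    #       lr=lr, weight_decay=wd)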
    # Declare storage for hooked activations and gradients
    activations = [None] * 4
    debug_activations = [None] * 4
    debug_grads = [None] * 4
    debug_grads_old = [None] * 4  # initialised so the nan report works on batch 1

    # Declare hook functions; the leading index pins each hook to its list slot
    def styleHook(i, module, input, output):
        activations[i] = output

    def debugHook(i, module, input, output):
        debug_activations[i] = output

    # Establish hooks in the vgg encoder and the decoder
    for i, layer in enumerate(style_layers):
        model.vgg._modules[layer].register_forward_hook(partial(styleHook, i))
    for i, layer in enumerate(debug_layers):
        model.dec._modules[str(layer)].register_forward_hook(partial(debugHook, i))
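    # partial(styleHook, i) freezes the slot index at registration time, so each
    # hook writes its layer's output into its own slot on every forward pass.
    # An equivalent closure-based sketch:
    #
    #   def make_hook(store, i):
    #       def hook(module, input, output):
    #           store[i] = output
    #       return hook
    #   model.vgg._modules[layer].register_forward_hook(make_hook(activations, i))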
    for epoch in range(epochs):
        i = 0
        running_loss = 0
        for content_batch, style_batch in zip(content_images, style_images):
            i += 1
            optimizer.zero_grad()
            output = model(content_batch, style_batch)
            # model.t is the AdaIN target used as the content target (Eq. 12);
            # copy the hooked style activations before the next vgg pass
            # overwrites them with the output image's activations
            content_emb = model.t
            style_activations = deepcopy(activations)
            output_emb = model.vgg(output)
            output_activations = activations
            loss = criterion(content_emb, output_emb, style_activations, output_activations)
            if torch.isnan(loss).item():
                print("Got nan, here's the activations:")
                print(debug_activations)
                print(debug_grads_old)
                quit()
            loss.backward()
            debug_grads_old = deepcopy(debug_grads)
            debug_grads = [model.dec[layer].weight.grad for layer in debug_layers]
            optimizer.step()
            # Exponentially weighted moving average of the batch loss
            running_loss = alpha * loss.item() + (1 - alpha) * running_loss
            print(running_loss)
            save_image(output, f"./tmp/{epoch}_{i}_o.png", nrow=2)
            save_image(torch.sigmoid(content_batch), f"./tmp/{epoch}_{i}_c.png", nrow=2)
            save_image(torch.sigmoid(style_batch), f"./tmp/{epoch}_{i}_s.png", nrow=2)
        print(f'Epoch: [{epoch}/{epochs}], Loss: {loss.item()}')
    torch.save(model, 'adain_model')
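    # torch.save(model, ...) pickles the whole module by reference to its class;
    # the more portable convention (a sketch) is to save just the weights:
    #
    #   torch.save(model.state_dict(), 'adain_model.pth')
    #   # later: model = AdaINStyle(); model.load_state_dict(torch.load('adain_model.pth'))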