src/generativezoo/models/Flow/VanillaFlow.py

#########################################################################################################################################
### Code based on: https://github.com/phlippe/uvadlc_notebooks/blob/master/docs/tutorial_notebooks/tutorial11/NF_image_modeling.ipynb ###
#########################################################################################################################################

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm, trange
from matplotlib import pyplot as plt
import wandb
from torchvision.utils import make_grid
import os
from config import models_dir
from sklearn.metrics import roc_auc_score, roc_curve

# Convert images from 0-1 to 0-255 (integers)
def discretize(sample):
    """Scale a tensor by 255 and cast it to int32 (the cast truncates toward zero)."""
    scaled = sample * 255
    return scaled.to(torch.int32)

class Dequantization(nn.Module):
    """Uniform dequantization followed by a logit transform, usable as a flow layer.

    All methods update ``ldj`` in place and also return it.
    """

    def __init__(self, alpha=1e-5, quants=256):
        """
        Inputs:
            alpha - small constant that is used to scale the original input.
                    Prevents dealing with values very close to 0 and 1 when inverting the sigmoid
            quants - Number of possible discrete values (usually 256 for 8-bit image)
        """
        super().__init__()
        self.alpha = alpha
        self.quants = quants

    def forward(self, z, ldj, reverse=False):
        """
        Inputs:
            z - Discrete values (reverse=False) or latent values (reverse=True)
            ldj - Running log-determinant of the Jacobian, one entry per batch element
            reverse - If True, map latents back to discrete int32 values in [0, quants-1]
        """
        if reverse:
            # Latent -> (0, 1) -> [0, quants) -> integer bins
            z, ldj = self.sigmoid(z, ldj, reverse=False)
            z = z * self.quants
            ldj += np.log(self.quants) * np.prod(z.shape[1:])
            z = torch.floor(z).clamp(min=0, max=self.quants-1).to(torch.int32)
            return z, ldj

        # Discrete -> continuous in [0, 1) -> logit space
        z, ldj = self.dequant(z, ldj)
        return self.sigmoid(z, ldj, reverse=True)

    def sigmoid(self, z, ldj, reverse=False):
        """Invertible sigmoid; reverse=True applies the (alpha-stabilised) logit instead."""
        dims_per_sample = np.prod(z.shape[1:])
        log_rescale = np.log(1 - self.alpha)

        if reverse:
            # Shrink towards 0.5 so that the logs below never see exactly 0 or 1
            z = z * (1 - self.alpha) + 0.5 * self.alpha
            ldj += log_rescale * dims_per_sample
            log_z = torch.log(z)
            log_one_minus_z = torch.log(1-z)
            ldj += (-log_z - log_one_minus_z).sum(dim=[1,2,3])
            return log_z - log_one_minus_z, ldj

        ldj += (-z-2*F.softplus(-z)).sum(dim=[1,2,3])
        z = torch.sigmoid(z)
        # Undo the alpha rescaling applied in the reverse=True branch
        ldj -= log_rescale * dims_per_sample
        z = (z - 0.5 * self.alpha) / (1 - self.alpha)
        return z, ldj

    def dequant(self, z, ldj):
        """Add uniform noise in [0, 1) to each value and rescale by 1/quants."""
        z = z.to(torch.float32)
        noise = torch.rand_like(z).detach()
        z = (z + noise) / self.quants
        ldj -= np.log(self.quants) * np.prod(z.shape[1:])
        return z, ldj
    
class VariationalDequantization(Dequantization):
    """Dequantization whose noise is produced by conditional flows instead of being uniform."""

    def __init__(self, var_flows, alpha=1e-5, quants=256):
        """
        Inputs:
            var_flows - A list of flow transformations to use for modeling q(u|x)
            alpha - Small constant, see Dequantization for details
            quants - Number of possible discrete values (256 for 8-bit images)
        """
        super().__init__(alpha=alpha, quants=quants)
        self.flows = nn.ModuleList(var_flows)

    def dequant(self, z, ldj):
        """
        Inputs:
            z - Discrete input values
            ldj - Running log-determinant of the Jacobian; updated in place and returned
        """
        z = z.to(torch.float32)
        # We condition the flows on x, i.e. the original image, rescaled to [-1, 1]
        # (the largest discrete value is quants - 1)
        img = (z / (self.quants - 1)) * 2 - 1

        # Prior of u is a uniform distribution as before
        # As most flow transformations are defined on [-infinity,+infinity], we apply an inverse sigmoid first.
        deq_noise = torch.rand_like(z).detach()
        deq_noise, ldj = self.sigmoid(deq_noise, ldj, reverse=True)
        for flow in self.flows:
            deq_noise, ldj = flow(deq_noise, ldj, reverse=False, orig_img=img)
        deq_noise, ldj = self.sigmoid(deq_noise, ldj, reverse=False)

        # After the flows, apply u as in standard dequantization
        z = (z + deq_noise) / self.quants
        ldj -= np.log(self.quants) * np.prod(z.shape[1:])
        return z, ldj

class CouplingLayer(nn.Module):
    """Affine coupling layer: the masked part of the input parameterises a scale/shift of the rest."""

    def __init__(self, network, mask, c_in):
        """
        Inputs:
            network - A PyTorch nn.Module constituting the deep neural network for mu and sigma.
                      Output shape should be twice the channel size as the input.
            mask - Binary mask (0 or 1) where 0 denotes that the element should be transformed,
                   while 1 means the latent will be used as input to the NN.
            c_in - Number of input channels
        """
        super().__init__()
        self.network = network
        self.scaling_factor = nn.Parameter(torch.zeros(c_in))
        # The mask is not trainable but belongs to the module state, hence a buffer.
        self.register_buffer('mask', mask)

    def forward(self, z, ldj, reverse=False, orig_img=None):
        """
        Inputs:
            z - Latent input to the flow
            ldj - The current ldj of the previous flows.
                  The ldj of this layer is added to this tensor in place.
            reverse - If True, we apply the inverse of the layer.
            orig_img (optional) - Only needed in VarDeq. Allows external
                                  input to condition the flow on (e.g. original image)
        """
        keep = self.mask
        transform = 1 - keep

        # The network only ever sees the entries that stay untouched
        net_in = z * keep
        if orig_img is not None:
            net_in = torch.cat([net_in, orig_img], dim=1)
        s, t = self.network(net_in).chunk(2, dim=1)

        # Bound the scale with a learnable, per-channel tanh range
        s_fac = self.scaling_factor.exp().view(1, -1, 1, 1)
        s = torch.tanh(s / s_fac) * s_fac

        # Zero out scale/shift on the entries that must pass through unchanged
        s = s * transform
        t = t * transform
        log_det = s.sum(dim=[1,2,3])

        if reverse:
            z = (z * torch.exp(-s)) - t
            ldj -= log_det
        else:
            # Shift first, then scale; the inverse above undoes it in the opposite order
            z = (z + t) * torch.exp(s)
            ldj += log_det

        return z, ldj

def create_checkerboard_mask(h, w, invert=False):
    """Return a float32 (1, 1, h, w) mask equal to (row + col) mod 2, optionally flipped."""
    rows = torch.arange(h, dtype=torch.int32).view(h, 1)
    cols = torch.arange(w, dtype=torch.int32).view(1, w)
    # Broadcasting the column/row index vectors yields the full (h, w) index-sum grid
    parity = torch.fmod(rows + cols, 2)
    mask = parity.to(torch.float32).view(1, 1, h, w)
    return 1 - mask if invert else mask

def create_channel_mask(c_in, invert=False):
    """Return a float32 (1, c_in, 1, 1) mask: ones on the first c_in//2 channels, zeros after."""
    mask = torch.zeros(c_in, dtype=torch.float32)
    mask[:c_in//2] = 1.0
    mask = mask.view(1, c_in, 1, 1)
    return 1 - mask if invert else mask

class ConcatELU(nn.Module):
    """
    Activation that concatenates ELU(x) and ELU(-x) along the channel dimension (dim=1),
    so the output has twice as many channels as the input.
    """

    def forward(self, x):
        positive = F.elu(x)
        negative = F.elu(-x)
        return torch.cat((positive, negative), dim=1)


class LayerNormChannels(nn.Module):
    """Layer normalisation over the channel dimension (dim=1) with a learnable scale and shift."""

    def __init__(self, c_in, eps=1e-5):
        """
        Inputs:
            c_in - Number of channels of the input
            eps - Small constant to stabilize std
        """
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(1, c_in, 1, 1))
        self.beta = nn.Parameter(torch.zeros(1, c_in, 1, 1))
        self.eps = eps

    def forward(self, x):
        # Statistics are taken across channels, separately for every position
        centered = x - x.mean(dim=1, keepdim=True)
        std = torch.sqrt(x.var(dim=1, unbiased=False, keepdim=True) + self.eps)
        normalized = centered / std
        return normalized * self.gamma + self.beta


class GatedConv(nn.Module):
    """Two-layer convolutional residual block whose update is modulated by a sigmoid gate."""

    def __init__(self, c_in, c_hidden):
        """
        Inputs:
            c_in - Number of channels of the input
            c_hidden - Number of hidden dimensions we want to model (usually similar to c_in)
        """
        super().__init__()
        # ConcatELU doubles the channel count, hence the 2x factors on the conv inputs.
        # The last conv emits 2*c_in channels: one half is the value, the other the gate.
        conv_in = nn.Conv2d(2*c_in, c_hidden, kernel_size=3, padding=1)
        conv_out = nn.Conv2d(2*c_hidden, 2*c_in, kernel_size=1)
        self.net = nn.Sequential(ConcatELU(), conv_in, ConcatELU(), conv_out)

    def forward(self, x):
        val, gate = self.net(x).chunk(2, dim=1)
        gated_update = val * torch.sigmoid(gate)
        return x + gated_update


class GatedConvNet(nn.Module):
    """Convolutional network built from gated residual blocks with channel-wise layer norm."""

    def __init__(self, c_in, c_hidden=32, c_out=-1, num_layers=3):
        """
        Inputs:
            c_in - Number of input channels
            c_hidden - Number of hidden dimensions to use within the network
            c_out - Number of output channels. If not positive, 2 times the input channels are used (affine coupling)
            num_layers - Number of gated ResNet blocks to apply
        """
        super().__init__()
        if c_out <= 0:
            c_out = 2 * c_in

        layers = [nn.Conv2d(c_in, c_hidden, kernel_size=3, padding=1)]
        for _ in range(num_layers):
            layers.append(GatedConv(c_hidden, c_hidden))
            layers.append(LayerNormChannels(c_hidden))
        layers.append(ConcatELU())
        output_conv = nn.Conv2d(2*c_hidden, c_out, kernel_size=3, padding=1)
        layers.append(output_conv)
        self.nn = nn.Sequential(*layers)

        # Zero-initialise the output convolution so the network initially outputs zeros
        output_conv.weight.data.zero_()
        output_conv.bias.data.zero_()

    def forward(self, x):
        return self.nn(x)
    
class SqueezeFlow(nn.Module):
    """Trades spatial resolution for channels: every 2x2 patch becomes 4 channels (and back)."""

    def forward(self, z, ldj, reverse=False):
        """
        Inputs:
            z - Tensor of shape (B, C, H, W)
            ldj - Running log-determinant; returned unchanged since this is a pure permutation
            reverse - If True, undo the squeeze: (B, C, H, W) -> (B, C/4, 2H, 2W)
        """
        batch, channels, height, width = z.shape

        if reverse:
            # H/2 x W/2 x 4C => H x W x C
            out_channels = channels // 4
            z = z.reshape(batch, out_channels, 2, 2, height, width)
            z = z.permute(0, 1, 4, 2, 5, 3)
            return z.reshape(batch, out_channels, height*2, width*2), ldj

        # H x W x C => H/2 x W/2 x 4C
        half_h, half_w = height // 2, width // 2
        z = z.reshape(batch, channels, half_h, 2, half_w, 2)
        z = z.permute(0, 1, 3, 5, 2, 4)
        return z.reshape(batch, 4*channels, half_h, half_w), ldj
    
class SplitFlow(nn.Module):
    """Multi-scale split: half of the channels are evaluated on a standard normal prior and dropped."""

    def __init__(self):
        super().__init__()
        self.prior = torch.distributions.normal.Normal(loc=0.0, scale=1.0)
        # Kept as an attribute for backward compatibility; forward() follows the input's device instead.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, z, ldj, reverse=False):
        """
        Inputs:
            z - Tensor of shape (B, C, H, W)
            ldj - Running log-determinant, one entry per batch element; updated in place
            reverse - If False, keep the first half of the channels and add the prior log-probability
                      of the second half to ldj. If True, append freshly sampled channels (doubling C)
                      and subtract their prior log-probability from ldj.
        """
        if not reverse:
            z, z_split = z.chunk(2, dim=1)
            ldj += self.prior.log_prob(z_split).sum(dim=[1,2,3])
        else:
            # Place the sample on the same device as z: torch.cat below requires matching devices,
            # and a device guessed at construction time may differ from where the input lives.
            z_split = self.prior.sample(sample_shape=z.shape).to(z.device)
            z = torch.cat([z, z_split], dim=1)
            ldj -= self.prior.log_prob(z_split).sum(dim=[1,2,3])
        return z, ldj
    
def create_checkpoint_dir():
    """Ensure that ``<models_dir>/VanillaFlow`` exists, creating missing parent directories too."""
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(os.path.join(models_dir, 'VanillaFlow'), exist_ok=True)

class VanillaFlow(nn.Module):
    """Normalizing flow for images with helpers for training, sampling and outlier scoring."""

    def __init__(self, img_size, channels, args):
        """
        Inputs:
            img_size - Height and width of the (square) input images; used to build the
                       checkerboard masks and the latent sample shape
            channels - Number of image channels
            args - Configuration object. Attributes read here: multi_scale, vardeq (only when
                   multi_scale is falsy), c_hidden, n_layers and no_wandb
        """
        super().__init__()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        flow_layers = []
        if args.multi_scale:
            # Multi-scale architecture: variational dequantization, two full-resolution coupling
            # layers, then squeeze / split / squeeze with channel-masked coupling layers in between.
            vardeq_layers = [CouplingLayer(network=GatedConvNet(c_in=2*channels, c_out=2*channels, c_hidden=args.c_hidden),
                                   mask=create_checkerboard_mask(h=img_size, w=img_size, invert=(i%2==1)),
                                   c_in=channels) for i in range(4)]
            flow_layers += [VariationalDequantization(vardeq_layers)]

            flow_layers += [CouplingLayer(network=GatedConvNet(c_in=channels, c_hidden=args.c_hidden*2),
                                        mask=create_checkerboard_mask(h=img_size, w=img_size, invert=(i%2==1)),
                                        c_in=channels) for i in range(2)]
            flow_layers += [SqueezeFlow()]
            for i in range(args.n_layers//4):
                flow_layers += [CouplingLayer(network=GatedConvNet(c_in=4*channels, c_hidden=args.c_hidden*3),
                                            mask=create_channel_mask(c_in=4*channels, invert=(i%2==1)),
                                            c_in=4*channels)]
            flow_layers += [SplitFlow(),
                            SqueezeFlow()]
            for i in range(args.n_layers//2):
                flow_layers += [CouplingLayer(network=GatedConvNet(c_in=8*channels, c_hidden=args.c_hidden*4),
                                            mask=create_channel_mask(c_in=8*channels, invert=(i%2==1)),
                                            c_in=8*channels)]
            # Latent shape after squeeze (x4 channels), split (/2) and squeeze (x4): 8*channels at 1/4 resolution
            self.sample_shape = (16, channels*8, img_size//4, img_size//4)
        else:
            # Single-scale architecture: (variational) dequantization followed by n_layers coupling layers
            if args.vardeq:
                vardeq_layers = [CouplingLayer(network=GatedConvNet(c_in=2*channels, c_out=2*channels, c_hidden=args.c_hidden),
                                            mask=create_checkerboard_mask(h=img_size, w=img_size, invert=(i%2==1)),
                                            c_in=channels) for i in range(4)]
                flow_layers += [VariationalDequantization(var_flows=vardeq_layers)]
            else:
                flow_layers += [Dequantization()]

            for i in range(args.n_layers):
                flow_layers += [CouplingLayer(network=GatedConvNet(c_in=channels, c_hidden=args.c_hidden*2),
                                            mask=create_checkerboard_mask(h=img_size, w=img_size, invert=(i%2==1)),
                                            c_in=channels)]
            self.sample_shape = (16,channels, img_size, img_size)
        
        self.flows = nn.ModuleList(flow_layers).to(self.device)
        # Create prior distribution for final latent space
        self.prior = torch.distributions.normal.Normal(loc=0.0, scale=1.0)
        # When True, nothing is logged to Weights & Biases
        self.no_wandb = args.no_wandb

    def forward(self, imgs):
        # The forward function is only used for visualizing the graph
        return self._get_likelihood(imgs)

    def encode(self, imgs):
        """Run a batch of images through all flows; return the latent z and the accumulated ldj."""
        z, ldj = imgs, torch.zeros(imgs.shape[0], device=self.device)
        for flow in self.flows:
            z, ldj = flow(z, ldj, reverse=False)
        return z, ldj

    def _get_likelihood(self, imgs, return_ll=False):
        """
        Given a batch of images, return the likelihood of those.
        If return_ll is True, this function returns the log likelihood of each input.
        Otherwise, the output metric is bits per dimension (scaled negative log likelihood),
        averaged over the batch.
        """
        z, ldj = self.encode(imgs)
        log_pz = self.prior.log_prob(z).sum(dim=[1,2,3])
        log_px = ldj + log_pz
        nll = -log_px
        # Calculating bits per dimension
        bpd = nll * np.log2(np.exp(1)) / np.prod(imgs.shape[1:])
        return bpd.mean() if not return_ll else log_px

    @torch.no_grad()
    def sample(self, z_init=None, train=True):
        """
        Sample a batch of images from the flow and render them as a grid.
        Inputs:
            z_init - Optional latent tensor to decode; if None, latents of shape
                     self.sample_shape are drawn from the prior
            train - If True, the figure is logged to wandb (unless no_wandb is set);
                    otherwise it is shown with plt.show()
        """
        # Sample latent representation from prior
        if z_init is None:
            z = self.prior.sample(sample_shape=self.sample_shape).to(self.device)
        else:
            z = z_init.to(self.device)

        # Transform z to x by inverting the flows.
        # The ldj must match the batch size of z, which may differ from sample_shape when z_init is given.
        ldj = torch.zeros(z.shape[0], device=self.device)
        for flow in reversed(self.flows):
            z, ldj = flow(z, ldj, reverse=True)
        
        # NOTE(review): the first flow is always a (variational) dequantization layer, whose reverse
        # pass returns int32 values in [0, quants-1]. Clamping those to [0, 1] and then scaling by
        # 255 in discretize() yields a binarised 0/255 preview - confirm this is intended.
        z = torch.clamp(z, 0, 1)
        figure = plt.figure(figsize=(10, 10))
        grid = make_grid(discretize(z.cpu().detach()), nrow=int(z.shape[0]**0.5), normalize=False)
        plt.imshow(grid.permute(1, 2, 0))
        plt.axis('off')
        if train:
            if not self.no_wandb:
                wandb.log({"Samples": figure})
        else:
            plt.show()
        plt.close(figure)

    def configure_optimizers(self, args):
        """Return an Adam optimizer (lr=args.lr) and a StepLR scheduler decaying the lr by 0.99 per step."""
        optimizer = optim.Adam(self.parameters(), lr=args.lr)
        # A scheduler is optional, but can help in flows to get the last bpd improvement
        scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99)
        return optimizer, scheduler

    def training_step(self, batch):
        # Normalizing flows are trained by maximum likelihood => return bpd
        loss = self._get_likelihood(batch)
        return loss

    def train_model(self, train_loader, args, verbose=True):
        """
        Train the flow and checkpoint the best epoch.
        Inputs:
            train_loader - Iterable yielding (batch, label) pairs; must expose a `.dataset` with a length
            args - Configuration object. Attributes read: lr, n_epochs, sample_and_save_freq, dataset
            verbose - If True, a per-batch progress bar is shown in addition to the epoch bar
        """

        create_checkpoint_dir()

        optimizer, scheduler = self.configure_optimizers(args)

        epoch_bar = trange(args.n_epochs, desc='Epochs')
        best_loss = np.inf
        # Training loop
        for epoch in epoch_bar:
            self.train()
            epoch_loss = 0.0

            # tqdm hides a bar through `disable`; the standard tqdm class rejects unknown
            # keywords (such as `display`) with a TqdmKeyError.
            for batch, _ in tqdm(train_loader, desc='Batches', leave=False, disable=not verbose):
                batch = batch.to(self.device)
                optimizer.zero_grad()
                loss = self.training_step(batch)
                # Weight by batch size so the epoch loss is a per-sample average
                epoch_loss += loss.item()*batch.shape[0]
                loss.backward()
                optimizer.step()

            scheduler.step()
            epoch_loss /= len(train_loader.dataset)
            epoch_bar.set_postfix(loss=epoch_loss)
            if not self.no_wandb:
                wandb.log({"Loss": epoch_loss})

            if (epoch+1) % args.sample_and_save_freq == 0 or epoch == 0:
                self.sample()
            
            # Only the flow weights of the best epoch (lowest training loss) are kept on disk
            if epoch_loss < best_loss:
                best_loss = epoch_loss
                torch.save(self.flows.state_dict(), os.path.join(models_dir, 'VanillaFlow', f"VanFlow_{args.dataset}.pt"))

    @torch.no_grad()    
    def outlier_detection(self, in_loader, out_loader):
        """
        Score two datasets by negative log-likelihood and report how well the score separates them.
        Prints ROC AUC and the false-positive rate at 95% true-positive rate, then shows a histogram
        of the scores.
        Inputs:
            in_loader - Iterable of (batch, label) pairs treated as inliers (label 0)
            out_loader - Iterable of (batch, label) pairs treated as outliers (label 1)
        """
        
        in_scores = []
        out_scores = []

        self.eval()

        for batch, _ in tqdm(in_loader, desc='Inlier Batches', leave=False):
            batch = batch.to(self.device)
            in_scores.append(-self._get_likelihood(batch, return_ll=True).cpu().detach().numpy())
        
        for batch, _ in tqdm(out_loader, desc='Outlier Batches', leave=False):
            batch = batch.to(self.device)
            out_scores.append(-self._get_likelihood(batch, return_ll=True).cpu().detach().numpy())

        in_scores = np.concatenate(in_scores)
        out_scores = np.concatenate(out_scores)

        # Calculate ROC AUC
        scores = np.concatenate([in_scores, out_scores])
        labels = np.concatenate([np.zeros_like(in_scores), np.ones_like(out_scores)])
        auc = roc_auc_score(labels, scores)
        fpr, tpr, _ = roc_curve(labels, scores)
        # argmax on a boolean array returns the first index where TPR reaches 95%
        fpr95 = fpr[np.argmax(tpr >= 0.95)]

        print(f"ROC AUC: {auc:.4f}, FPR at 95% TPR: {fpr95:.4f}")

        plt.figure(figsize=(10, 5))
        plt.hist(in_scores, bins=50, alpha=0.5, label='Inliers', color='blue')
        plt.hist(out_scores, bins=50, alpha=0.5, label='Outliers', color='red')
        plt.legend()
        plt.xlabel('Negative Log Likelihood')
        plt.ylabel('Number of samples')
        plt.title('Outlier Detection')
        plt.show()