# VAE_pems.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from linear_attention_transformer import LinearAttentionTransformer
import numpy as np

def get_torch_trans(heads=8, layers=1, channels=64):
    """Build a stack of standard PyTorch Transformer encoder layers.

    Args:
        heads: Number of attention heads in each layer.
        layers: Number of stacked encoder layers.
        channels: Model width (``d_model``) of each layer.

    Returns:
        An ``nn.TransformerEncoder`` whose layers use GELU activation and a
        feed-forward width fixed at 64.
    """
    layer_kwargs = dict(
        d_model=channels,
        nhead=heads,
        dim_feedforward=64,
        activation="gelu",
    )
    prototype = nn.TransformerEncoderLayer(**layer_kwargs)
    return nn.TransformerEncoder(prototype, num_layers=layers)

def get_linear_trans(heads=8, layers=1, channels=64, localheads=0, localwindow=0):
    """Build a ``LinearAttentionTransformer`` with a fixed maximum length of 256.

    Args:
        heads: Number of attention heads.
        layers: Transformer depth.
        channels: Model width (``dim``).
        localheads: Number of heads that use local attention. Previously this
            argument was accepted but ignored; it is now forwarded.
        localwindow: Window size for the local-attention heads. Previously
            ignored as well; now forwarded.

    Returns:
        The configured ``LinearAttentionTransformer`` instance.
    """
    # With the defaults (0, 0) this is identical to the former hard-coded call.
    return LinearAttentionTransformer(
        dim=channels,
        depth=layers,
        heads=heads,
        max_seq_len=256,
        n_local_attn_heads=localheads,
        local_attn_window_size=localwindow,
    )

def Conv1d_with_init(in_channels, out_channels, kernel_size):
    """Create an ``nn.Conv1d`` whose weight is Kaiming-normal initialised.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.

    Returns:
        The new ``nn.Conv1d``; its bias keeps PyTorch's default initialisation.
    """
    conv = nn.Conv1d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
    )
    # Only the weight is re-initialised, in place.
    nn.init.kaiming_normal_(conv.weight)
    return conv


class VAE(nn.Module):
    """Denoising network with a graph-Laplacian, VAE-style latent front end.

    The input is mixed along the node axis with a graph Laplacian, projected
    to ``channels`` feature maps, compressed into a 30-dimensional Gaussian
    latent (mean ``u`` and scale ``sigma``), sampled with the
    reparameterisation trick, expanded back to ``K * L`` positions and then
    refined by a stack of ``ResidualBlock`` layers.

    Args:
        config: Mapping providing ``"channels"``, ``"side_dim"``,
            ``"diffusion_embedding_dim"``, ``"nheads"``, ``"is_linear"`` and
            ``"layers"``.
        inputdim: Number of channels of the input tensor.
    """

    # Adjacency archive (key ``'adj'``), resolved relative to the working
    # directory and read once, on the first forward pass.
    adj_path = 'pems_bay_adj.npz'

    def __init__(self, config, inputdim=1):
        super().__init__()
        self.channels = config["channels"]

        self.input_projection = Conv1d_with_init(inputdim, self.channels, 1)

        # The latent heads treat the flattened K*L axis as Conv1d channels, so
        # the model only accepts inputs with K * L == 24 * 325.
        self.input_projection_u = Conv1d_with_init(24 * 325, 30, 1)
        self.input_projection_sigma = Conv1d_with_init(24 * 325, 30, 1)
        self.trans = nn.Linear(30, 24 * 325)

        self.output_projection1 = Conv1d_with_init(self.channels, self.channels, 1)
        self.output_projection2 = Conv1d_with_init(self.channels, 1, 1)
        nn.init.zeros_(self.output_projection2.weight)

        self.residual_layers = nn.ModuleList(
            [
                ResidualBlock(
                    side_dim=config["side_dim"],
                    channels=self.channels,
                    diffusion_embedding_dim=config["diffusion_embedding_dim"],
                    nheads=config["nheads"],
                    is_linear=config["is_linear"],
                )
                for _ in range(config["layers"])
            ]
        )

        # Plain attribute (not a buffer) so existing checkpoints still load.
        self._laplacian = None

    def _graph_laplacian(self, device):
        """Return the float32 graph Laplacian ``D - A`` on ``device``, cached."""
        cached = getattr(self, "_laplacian", None)
        if cached is None:
            # The context manager closes the archive once the array is read.
            with np.load(self.adj_path) as archive:
                adj = archive['adj']
            degree_matrix = np.diag(np.sum(adj, axis=1))
            cached = torch.tensor(degree_matrix - adj, dtype=torch.float32)
        if cached.device != device:
            cached = cached.to(device)
        self._laplacian = cached
        return cached

    def forward(self, x, cond_info):
        """Run the network and compute the latent KL regulariser.

        Args:
            x: Tensor of shape ``(B, inputdim, K, L)``. ``K * L`` must equal
                ``24 * 325`` and the adjacency matrix must be ``K x K``.
            cond_info: Side information passed unchanged to every residual
                block.

        Returns:
            Tuple ``(prediction, kl)`` where ``prediction`` has shape
            ``(B, K, L)`` and ``kl`` is the scalar mean of the KL term.
        """
        B, inputdim, K, L = x.shape

        # Follow the input's device instead of assuming CUDA device 0.
        laplacian = self._graph_laplacian(x.device)

        # Flattened copy of the raw input, added back after the latent decoder.
        xx = x.reshape(B, inputdim, K * L)

        # Graph mixing over the node axis, before and after the channel lift.
        x = torch.matmul(laplacian, x)
        x = F.relu(x)
        x = x.reshape(B, inputdim, K * L)
        x = self.input_projection(x)

        x = x.reshape(B, -1, K, L)
        x = torch.matmul(laplacian, x)
        x = x.reshape(B, -1, K * L)

        # Encode to latent parameters, each of shape (B, channels, 30).
        z = x.permute(0, 2, 1)
        u = torch.sigmoid(self.input_projection_u(z)).permute(0, 2, 1)
        sigma = torch.sigmoid(self.input_projection_sigma(z)).permute(0, 2, 1)

        # Reparameterisation trick; the noise is drawn directly on u's device.
        guassian_noise = u + sigma * torch.randn_like(u)

        x = self.trans(guassian_noise)

        # Relies on broadcasting between (B, channels, K*L) and (B, inputdim, K*L).
        x = x + xx
        x = x.reshape(B, self.channels, K, L)

        skip = []
        for layer in self.residual_layers:
            x, skip_connection = layer(x, cond_info)
            skip.append(skip_connection)

        x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers))
        x = x.reshape(B, self.channels, K * L)
        x = self.output_projection1(x)  # (B,channel,K*L)
        x = F.relu(x)
        x = self.output_projection2(x)  # (B,1,K*L)
        x = x.reshape(B, K, L)

        # KL(N(u, diag(sigma)) || N(0, I)) = 0.5 * (tr + |u|^2 - k - log det).
        # NOTE(review): sigma scales the noise above as a standard deviation
        # but enters this formula as a variance; confirm which is intended.
        eps = 1e-4
        k = u.size(-1)  # Dimensionality of the Gaussian distribution

        # Keep sigma strictly positive so the logarithm stays finite.
        sigma = torch.clamp(sigma, min=eps)

        trace_term = torch.sum(sigma, dim=-1)
        mu_term = torch.sum(u ** 2, dim=-1)
        log_det_sigma = torch.sum(torch.log(sigma), dim=-1)

        kl_divergence = 0.5 * (trace_term + mu_term - k - log_det_sigma)

        # Average over the batch and channel axes.
        kl_divergence_mean = kl_divergence.mean()
        return x, kl_divergence_mean


class ResidualBlock(nn.Module):
    """Gated residual block attending over the time axis, then the feature axis.

    Args:
        side_dim: Channel count of the conditioning tensor.
        channels: Channel count of the block's input and outputs.
        diffusion_embedding_dim: Input width of ``diffusion_projection``. That
            layer is created here but not used by any method of this class.
        nheads: Number of attention heads in both transformer layers.
        is_linear: Use linear-attention transformers when true, standard
            PyTorch transformer encoders otherwise.
    """

    def __init__(self, side_dim, channels, diffusion_embedding_dim, nheads, is_linear=False):
        super().__init__()
        self.diffusion_projection = nn.Linear(diffusion_embedding_dim, channels)
        self.cond_projection = Conv1d_with_init(side_dim, 2 * channels, 1)
        self.mid_projection = Conv1d_with_init(channels, 2 * channels, 1)
        self.output_projection = Conv1d_with_init(channels, 2 * channels, 1)

        self.is_linear = is_linear
        make_attention = get_linear_trans if is_linear else get_torch_trans
        self.time_layer = make_attention(heads=nheads, layers=1, channels=channels)
        self.feature_layer = make_attention(heads=nheads, layers=1, channels=channels)

    def _attend(self, layer, seq):
        """Apply ``layer`` to ``seq`` of shape (N, channel, T), keeping that shape."""
        if self.is_linear:
            # Linear attention is fed batch-first input: (N, T, channel).
            return layer(seq.permute(0, 2, 1)).permute(0, 2, 1)
        # The torch encoder is fed sequence-first input: (T, N, channel).
        return layer(seq.permute(2, 0, 1)).permute(1, 2, 0)

    def forward_time(self, y, base_shape):
        """Attend along the L axis independently for each of the K series."""
        B, channel, K, L = base_shape
        if L == 1:
            return y
        per_series = y.reshape(B, channel, K, L).permute(0, 2, 1, 3)
        per_series = per_series.reshape(B * K, channel, L)
        attended = self._attend(self.time_layer, per_series)
        restored = attended.reshape(B, K, channel, L).permute(0, 2, 1, 3)
        return restored.reshape(B, channel, K * L)

    def forward_feature(self, y, base_shape):
        """Attend along the K axis independently for each of the L steps."""
        B, channel, K, L = base_shape
        if K == 1:
            return y
        per_step = y.reshape(B, channel, K, L).permute(0, 3, 1, 2)
        per_step = per_step.reshape(B * L, channel, K)
        attended = self._attend(self.feature_layer, per_step)
        restored = attended.reshape(B, L, channel, K).permute(0, 2, 3, 1)
        return restored.reshape(B, channel, K * L)

    def forward(self, x, cond_info):
        """Return ``(residual_output, skip)``, both shaped like ``x``.

        Args:
            x: Tensor of shape ``(B, channel, K, L)``.
            cond_info: Four-dimensional tensor that is reshaped to
                ``(B, cond_dim, K * L)``.
        """
        base_shape = x.shape
        B, channel, K, L = base_shape
        hidden = x.reshape(B, channel, K * L)

        hidden = self.forward_time(hidden, base_shape)
        hidden = self.forward_feature(hidden, base_shape)  # (B,channel,K*L)
        hidden = self.mid_projection(hidden)  # (B,2*channel,K*L)

        _, cond_dim, _, _ = cond_info.shape
        cond = self.cond_projection(cond_info.reshape(B, cond_dim, K * L))  # (B,2*channel,K*L)
        hidden = hidden + cond

        # Gated activation: the first half gates, the second half is the signal.
        gate, signal = hidden.chunk(2, dim=1)
        hidden = self.output_projection(torch.sigmoid(gate) * torch.tanh(signal))

        residual, skip = hidden.chunk(2, dim=1)
        residual = residual.reshape(base_shape)
        skip = skip.reshape(base_shape)
        return (x.reshape(base_shape) + residual) / math.sqrt(2.0), skip