class_DeepIMV_AISTATS.py

import tensorflow as tf
import numpy as np

from tensorflow.contrib.layers import fully_connected as FC_Net


_EPSILON = 1e-8

def div(x_, y_):
    return tf.div(x_, y_ + _EPSILON)

def log(x_):
    return tf.log(x_ + _EPSILON)

def xavier_initialization(size):
    dim_ = size[0]
    xavier_stddev = 1. / tf.sqrt(dim_ / 2.)
    return tf.random_normal(shape=size, stddev=xavier_stddev)


### DEFINE PREDICTOR
def predictor(x_, o_dim_, o_type_, num_layers_=1, h_dim_=100, activation_fn=tf.nn.relu, keep_prob_=1.0, w_reg_=None):
    '''
        INPUT
            x_            : (2D-tensor) input
            o_dim_        : (int) output dimension
            o_type_       : (string) output type one of {'continuous', 'categorical', 'binary'}
            num_layers_   : (int) # of hidden layers
            activation_fn_: tf activation functions
        
        OUTPUT
            o_type_ tensor
    '''
    if o_type_ == 'continuous':
        out_fn = None
    elif o_type_ == 'categorical':
        out_fn = tf.nn.softmax #for classification task
    elif o_type_ == 'binary':
        out_fn = tf.nn.sigmoid
    else:
        raise ValueError('Wrong output type. The value {}!!'.format(o_type_))

    if num_layers_ == 1:
        out =  FC_Net(inputs=x_, num_outputs=o_dim_, activation_fn=out_fn, weights_regularizer=w_reg_, scope='out')
    else: #num_layers > 1
        for tmp_layer in range(num_layers_-1):
            if tmp_layer == 0:
                net = x_
            net = FC_Net(inputs=net, num_outputs=h_dim_, activation_fn=activation_fn, weights_regularizer=w_reg_, scope='layer_'+str(tmp_layer))
            net = tf.nn.dropout(net, keep_prob=keep_prob_)
        out =  FC_Net(inputs=net, num_outputs=o_dim_, activation_fn=out_fn, weights_regularizer=w_reg_, scope='out')  
    return out


### DEFINE STOCHASTIC ENCODER
def stochastic_encoder(x_, o_dim_, num_layers_=1, h_dim_=100, activation_fn=tf.nn.relu, keep_prob_=1.0, w_reg_=None):
    '''
        INPUT
            x_            : (2D-tensor) input
            o_dim_        : (int) output dimension
            num_layers_   : (int) # of hidden layers
            activation_fn_: tf activation functions
        
        OUTPUT
            [mu,sigma] tensor
    '''
    if num_layers_ == 1:
        out =  FC_Net(inputs=x_, num_outputs=o_dim_, activation_fn=None, weights_regularizer=w_reg_, scope='out')
    else: #num_layers > 1
        for tmp_layer in range(num_layers_-1):
            if tmp_layer == 0:
                net = x_
            net = FC_Net(inputs=net, num_outputs=h_dim_, activation_fn=activation_fn, weights_regularizer=w_reg_, scope='layer_'+str(tmp_layer))
            net = tf.nn.dropout(net, keep_prob=keep_prob_)
        out =  FC_Net(inputs=net, num_outputs=o_dim_, activation_fn=None, weights_regularizer=w_reg_, scope='out')  
    return out


### DEFINE SUPERVISED LOSS FUNCTION
def loss_y(y_true_, y_pred_, y_type_):                
    if y_type_ == 'continuous':
        tmp_loss = tf.reduce_sum((y_true_ - y_pred_)**2, axis=-1)
    elif y_type_ == 'categorical':
        tmp_loss = - tf.reduce_sum(y_true_ * log(y_pred_), axis=-1)
    elif y_type_ == 'binary':
        tmp_loss = - tf.reduce_sum(y_true_ * log(y_pred_) + (1.-y_true_) * log(1.-y_pred_), axis=-1)
    else:
        raise ValueError('Wrong output type. The value {}!!'.format(y_type_))                    
    return tmp_loss


### DEFINE NETWORK-RELATED FUNCTIONS
def product_of_experts(mask_, mu_set_, logvar_set_):
    tmp = 1.
    for m in range(len(mu_set_)):
        tmp += tf.reshape(mask_[:, m], [-1,1])*div(1., tf.exp(logvar_set_[m]))
    poe_var = div(1., tmp)
    poe_logvar = log(poe_var)
    
    tmp = 0.
    for m in range(len(mu_set_)):
        tmp += tf.reshape(mask_[:, m], [-1,1])*div(1., tf.exp(logvar_set_[m]))*mu_set_[m]
    poe_mu = poe_var * tmp
    
    return poe_mu, poe_logvar

    
###########################################################################
#### DEFINE PROPOSED-NETWORK
class DeepIMV_AISTATS:
    '''
        - Add mixture mode
        - Remove common/shared parts -- go back to the previous version
        - Leave the consistency loss; but make sure to set gamma = 0
    '''
    
    def __init__(self, sess, name, input_dims, network_settings):
        self.sess             = sess
        self.name             = name
       
        # INPUT/OUTPUT DIMENSIONS
        self.M                = len(input_dims['x_dim_set'])
        
        self.x_dim_set = {}
        for m in range(self.M):
            self.x_dim_set[m] = input_dims['x_dim_set'][m]
            
        self.y_dim            = input_dims['y_dim']
        self.y_type           = input_dims['y_type']
       
        self.z_dim            = input_dims['z_dim']  # z_dim is equivalent to W and Z             
        self.steps_per_batch  = input_dims['steps_per_batch']
        
        # PREDICTOR INFO (VIEW-SPECIFC)
        self.h_dim_p1         = network_settings['h_dim_p1']      #predictor hidden nodes
        self.num_layers_p1    = network_settings['num_layers_p1'] #predictor layers
        
        # PREDICTOR INFO (MULTI_VIEW)
        self.h_dim_p2         = network_settings['h_dim_p2']      #predictor hidden nodes
        self.num_layers_p2    = network_settings['num_layers_p2'] #predictor layers
        
        # ENCODER INFO
        self.h_dim_e          = network_settings['h_dim_e']      #encoder hidden nodes
        self.num_layers_e     = network_settings['num_layers_e'] #encoder layers
       
        self.fc_activate_fn   = network_settings['fc_activate_fn'] 
        self.reg_scale        = network_settings['reg_scale']   #regularization
                
        self._build_net()
        
       
    def _build_net(self):
        ds     = tf.contrib.distributions
        
#         with tf.name_scope(self.name):
        with tf.variable_scope(self.name):
            self.mb_size        = tf.placeholder(tf.int32, [], name='batch_size')
            self.lr_rate        = tf.placeholder(tf.float32, name='learning_rate')           
            self.k_prob         = tf.placeholder(tf.float32, name='keep_probability')
                       
            ### INPUT/OUTPUT                   
            self.x_set          = {}
            for m in range(self.M):
                self.x_set[m]   = tf.placeholder(tf.float32, [None, self.x_dim_set[m]], 'input_{}'.format(m))
            
            self.mask           = tf.placeholder(tf.float32, [None, self.M], name='mask')            
            self.y              = tf.placeholder(tf.float32, [None, self.y_dim],  name='output')
                                   
            ### BALANCING COEFFICIENTS
            self.alpha          = tf.placeholder(tf.float32, name='coef_alpha') #Consitency Loss
            self.beta           = tf.placeholder(tf.float32, name='coef_beta')  #Information Bottleneck
            
            if self.reg_scale == 0:
                w_reg           = None
            else:
                w_reg           = tf.contrib.layers.l1_regularizer(scale=self.reg_scale)
                        
            ### PRIOR
            prior_z  = ds.Normal(0.0, 1.0) #PoE Prior - q(z)
            prior_z_set = {}
            for m in range(self.M):
                prior_z_set[m] = ds.Normal(0.0, 1.0) #View-Specific Prior - q(z_{m})
                        
            ### STOCHASTIC ENCODER
            self.h_set      = {}
            
            self.mu_z_set     = {}
            self.logvar_z_set = {}
            
            for m in range(self.M):
                with tf.variable_scope('encoder{}'.format(m+1)):
                    self.h_set[m]      = stochastic_encoder(
                        x_=self.x_set[m], o_dim_=2*self.z_dim, 
                        num_layers_=self.num_layers_e, h_dim_=self.h_dim_e, 
                        activation_fn=self.fc_activate_fn, keep_prob_=self.k_prob, w_reg_=w_reg
                    )
                    self.mu_z_set[m]     = self.h_set[m][:, :self.z_dim] 
                    self.logvar_z_set[m] = self.h_set[m][:, self.z_dim:]
            
            self.mu_z, self.logvar_z = product_of_experts(self.mask, self.mu_z_set, self.logvar_z_set)                
            
            qz         = ds.Normal(self.mu_z, tf.sqrt(tf.exp(self.logvar_z)))
            self.z     = qz.sample()
            self.zs    = qz.sample(10)

            qz_set     = {}
            self.z_set = {}
            for m in range(self.M):
                qz_set[m]      = ds.Normal(self.mu_z_set[m], tf.sqrt(tf.exp(self.logvar_z_set[m])))
                self.z_set[m]  = qz_set[m].sample()

    
            ### PREDICTOR (JOINT)
            with tf.variable_scope('predictor'):
                self.y_hat = predictor(
                    x_=self.z, o_dim_=self.y_dim, o_type_=self.y_type, 
                    num_layers_=self.num_layers_p2, h_dim_=self.h_dim_p2, 
                    activation_fn=self.fc_activate_fn, keep_prob_=self.k_prob, w_reg_=w_reg
                )

            # this will generate multiple samples of y (based on multiple samples drawn from the variational encoder.
            with tf.variable_scope('predictor', reuse=True):               
                self.y_hats = predictor(
                    x_=self.zs, o_dim_=self.y_dim, o_type_=self.y_type, 
                    num_layers_=self.num_layers_p2, h_dim_=self.h_dim_p2, 
                    activation_fn=self.fc_activate_fn, keep_prob_=self.k_prob, w_reg_=w_reg
                )
                                    
            ### PREDICTOR 
            self.y_hat_set = {}
            for m in range(self.M):
                with tf.variable_scope('predictor_set{}'.format(m)):
                    self.y_hat_set[m] = predictor(
                        x_=self.z_set[m], o_dim_=self.y_dim, o_type_=self.y_type, 
                        num_layers_=self.num_layers_p1, h_dim_=self.h_dim_p1, 
                        activation_fn=self.fc_activate_fn, keep_prob_=self.k_prob, w_reg_=w_reg
                    )

            
            ### OPTIMIZER
            global_vars      = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            enc_vars         = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name + '/encoder')
            pred_vars        = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name + '/predictor')
            
        
            ### CONSITENCY LOSS
            self.LOSS_CONSISTENCY = 0.
            for m in range(self.M):
                self.LOSS_CONSISTENCY += 1./self.M * div(
                    tf.reduce_sum(self.mask[:, m] * tf.reduce_sum(ds.kl_divergence(qz, qz_set[m]), axis=-1)),
                    tf.reduce_sum(self.mask[:, m])
                )
                                                                        

            self.LOSS_KL       = tf.reduce_mean(
                tf.reduce_sum(ds.kl_divergence(qz, prior_z), axis=-1)
            )            
            self.LOSS_P        = tf.reduce_mean(loss_y(self.y, self.y_hat, self.y_type))
            
            self.LOSS_IB_JOINT = self.LOSS_P + self.beta*self.LOSS_KL
                                    
            self.LOSS_Ps_all  = []
            self.LOSS_KLs_all = []
            for m in range(self.M):
                tmp_p              = loss_y(self.y, self.y_hat_set[m], self.y_type)
                tmp_kl             = tf.reduce_sum(ds.kl_divergence(qz_set[m], prior_z_set[m]), axis=-1)
        
                self.LOSS_Ps_all  += [div(tf.reduce_sum(self.mask[:,m]*tmp_p), tf.reduce_sum(self.mask[:,m]))]        
                self.LOSS_KLs_all += [div(tf.reduce_sum(self.mask[:,m]*tmp_kl), tf.reduce_sum(self.mask[:,m]))]
        
                
            self.LOSS_Ps_all  = tf.stack(self.LOSS_Ps_all, axis=0)
            self.LOSS_KLs_all = tf.stack(self.LOSS_KLs_all, axis=0)
            
            self.LOSS_Ps      = tf.reduce_sum(self.LOSS_Ps_all)
            self.LOSS_KLs     = tf.reduce_sum(self.LOSS_KLs_all)
                        
            self.LOSS_IB_MARGINAL = self.LOSS_Ps + self.beta*self.LOSS_KLs         
            

            self.LOSS_TOTAL       = self.LOSS_IB_JOINT\
                                    + self.alpha*(self.LOSS_IB_MARGINAL)\
                                    + tf.losses.get_regularization_loss()
    
            
            self.global_step      = tf.contrib.framework.get_or_create_global_step()
            self.lr_rate_decayed  = tf.train.exponential_decay(self.lr_rate, self.global_step,
                                                       decay_steps=2*self.steps_per_batch,
                                                       decay_rate=0.97, staircase=True)

            opt                = tf.train.AdamOptimizer(self.lr_rate_decayed, 0.5)
                
                        
            ma = tf.train.ExponentialMovingAverage(0.999, zero_debias=True)
            ma_update = ma.apply(tf.model_variables())
            

            self.solver = tf.contrib.training.create_train_op(self.LOSS_TOTAL, opt,
                                                               self.global_step,
                                                               update_ops=[ma_update])
                

    def train(self, x_set_, y_, m_, alpha_, beta_, lr_train, k_prob=1.0):
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.y: y_, self.mask: m_, 
                           self.alpha: alpha_, self.beta: beta_, 
                           self.mb_size: np.shape(x_set_[0])[0],
                           self.lr_rate: lr_train, self.k_prob: k_prob})        
        return self.sess.run([self.solver, self.LOSS_TOTAL, self.LOSS_P, self.LOSS_KL, self.LOSS_Ps, 
                              self.LOSS_KLs, self.LOSS_CONSISTENCY],
                             feed_dict=feed_dict_)
   
    def get_loss(self, x_set_, y_, m_, alpha_, beta_):
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.y: y_, self.mask: m_, 
                           self.alpha: alpha_, self.beta: beta_,
                           self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})        
        return self.sess.run([self.LOSS_TOTAL, self.LOSS_P, self.LOSS_KL, self.LOSS_Ps, 
                              self.LOSS_KLs, self.LOSS_CONSISTENCY, self.LOSS_Ps_all, self.LOSS_KLs_all],
                             feed_dict=feed_dict_)
        

    def predict_y(self, x_set_, m_):
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.mask: m_, self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})
        return self.sess.run(self.y_hat, feed_dict=feed_dict_)
    
    def predict_ys(self, x_set_, m_):
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.mask: m_, self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})
        return self.sess.run([self.y_hat, self.y_hats], feed_dict=feed_dict_)

    def predict_yhat_set(self, x_set_, m_):
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.mask: m_, self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})
        return self.sess.run(self.y_hat_set, feed_dict=feed_dict_)
    
    def predict_mu_z_and_mu_z_set(self, x_set_, m_): #this outputs mu and mu_set
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.mask: m_, self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})
        return self.sess.run([self.mu_z, self.mu_z_set], feed_dict=feed_dict_)
    
    def predict_logvar_z_and_logvar_z_set(self, x_set_, m_): #this outputs sigma and sigma_set
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.mask: m_, self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})
        return self.sess.run([self.logvar_z, self.logvar_z_set], feed_dict=feed_dict_)

    def predict_z_n_z_set(self, x_set_, m_):  #this outputs z and z_set
        feed_dict_ = self.make_feed_dict(x_set_)
        feed_dict_.update({self.mask: m_, self.mb_size: np.shape(x_set_[0])[0], self.k_prob: 1.0})
        return self.sess.run([self.z, self.z_set], feed_dict=feed_dict_)
  
    def make_feed_dict(self, x_set_):
        feed_dict_ = {}
        for m in range(len(self.x_set)):
            feed_dict_[self.x_set[m]] = x_set_[m]           
        return feed_dict_