Merge pull request #182 from monarch-initiative/develop

Develop
monarch-initiative · May 26, 2020 · 93a5b0a · 93a5b0a
2 parents c504567 + 3e0ea17
commit 93a5b0a
Show file tree

Hide file tree

Showing 11 changed files with 258 additions and 140 deletions.
diff --git a/embiggen/glove.py b/embiggen/glove.py
@@ -2,7 +2,7 @@
 import tensorflow as tf  # type: ignore
 from random import shuffle
 
-assert tf.__version__ >= "2.0"
+assert tf.__version__ >= "2.0"   # pylint: disable=no-member
 
 
 class NotTrainedError(Exception):

diff --git a/embiggen/link_prediction.py b/embiggen/link_prediction.py
@@ -9,6 +9,7 @@
 from embiggen.utils import load_embeddings
 import numpy as np  # type: ignore
 from .neural_networks import MLP, FFNN, MultiModalFFNN
+import csv
 
 import logging
 #import os
@@ -339,50 +340,43 @@ def output_classifier_results(self):
                 2.0 * train_conf_matrix[1, 1] + train_conf_matrix[0, 1] + train_conf_matrix[1, 0])
         # f1-score =2 * TP / (2 * TP + FP + FN)
 
-        with open(self.output, 'w') as f:
-            f.write("confusion matrix.training: {}\n".format(str(train_conf_matrix)))
-            f.write('Accuracy.training: {}\n'.format(train_accuracy))
-            f.write('Specificity.training: {}\n'.format(train_specificity))
-            f.write('Sensitivity.training: {}\n'.format(train_sensitivity))
-            f.write('F1-score.training: {}\n'.format(train_f1_score))
-            f.write("ROC score.training: {}\n ".format(str(self.train_roc)))
-            f.write("AP score.training: {}\n".format(str(self.train_average_precision)))
+        logging.info("confusion matrix.training: {}\n".format(str(train_conf_matrix)))
 
-            if not self.skip_validation:
-                valid_conf_matrix = self.validation_confusion_matrix
-                total = sum(sum(valid_conf_matrix))
-                valid_accuracy = (valid_conf_matrix[0, 0] + valid_conf_matrix[1, 1]) / total
-                valid_specificity = valid_conf_matrix[0, 0] / (valid_conf_matrix[0, 0] + valid_conf_matrix[0, 1])
-                valid_sensitivity = valid_conf_matrix[1, 1] / (valid_conf_matrix[1, 0] + valid_conf_matrix[1, 1])
-                valid_f1_score = (2.0 * valid_conf_matrix[1, 1]) / (
+        if not self.skip_validation:
+            valid_conf_matrix = self.validation_confusion_matrix
+            total = sum(sum(valid_conf_matrix))
+            valid_accuracy = (valid_conf_matrix[0, 0] + valid_conf_matrix[1, 1]) / total
+            valid_specificity = valid_conf_matrix[0, 0] / (valid_conf_matrix[0, 0] + valid_conf_matrix[0, 1])
+            valid_sensitivity = valid_conf_matrix[1, 1] / (valid_conf_matrix[1, 0] + valid_conf_matrix[1, 1])
+            valid_f1_score = (2.0 * valid_conf_matrix[1, 1]) / (
                             2.0 * valid_conf_matrix[1, 1] + valid_conf_matrix[0, 1] + valid_conf_matrix[1, 0])
-                # f1-score =2 * TP / (2 * TP + FP + FN)
-
-                f.write("confusion matrix.validation: {}\n".format(str(valid_conf_matrix)))
-                f.write('Accuracy.validation : {}\n'.format(valid_accuracy))
-                f.write('Specificity.validation: {}\n'.format(valid_specificity))
-                f.write('Sensitivity.validation: {}\n'.format(valid_sensitivity))
-                f.write('F1-score.validation: {}\n'.format(valid_f1_score))
-                f.write("ROC score.validation: {}\n ".format(str(self.valid_roc)))
-                f.write("AP score.validation: {}\n".format(str(self.valid_average_precision)))
-
-            test_confusion_matrix = self.test_confusion_matrix
-            total = sum(sum(test_confusion_matrix))
-            test_accuracy = (test_confusion_matrix[0, 0] + test_confusion_matrix[1, 1]) * 1.0 / total
-            test_specificity = test_confusion_matrix[0, 0] * 1.0 / (test_confusion_matrix[0, 0] + test_confusion_matrix[0, 1]) * 1.0
-            test_sensitivity = test_confusion_matrix[1, 1] * 1.0 / (test_confusion_matrix[1, 0] + test_confusion_matrix[1, 1]) * 1.0
-            test_f1_score = (2.0 * test_confusion_matrix[1, 1]) / (
-                        2.0 * test_confusion_matrix[1, 1] + test_confusion_matrix[0, 1] + test_confusion_matrix[1, 0])
             # f1-score =2 * TP / (2 * TP + FP + FN)
 
-            f.write("confusion matrix.test: {}\n".format(str(test_confusion_matrix)))
-            f.write('Accuracy.test: {}\n'.format(test_accuracy))
-            f.write('Specificity.test: {}\n'.format(test_specificity))
-            f.write('Sensitivity.test: {}\n'.format(test_sensitivity))
-            f.write("F1-score.test: {}\n".format(test_f1_score))
-            f.write("ROC score.test: {}\n ".format(str(self.test_roc)))
-            f.write("AP score.test: {} \n".format(str(self.test_average_precision)))
-        f.close()
+        test_confusion_matrix = self.test_confusion_matrix
+        total = sum(sum(test_confusion_matrix))
+        test_accuracy = (test_confusion_matrix[0, 0] + test_confusion_matrix[1, 1]) * 1.0 / total
+        test_specificity = test_confusion_matrix[0, 0] * 1.0 / (test_confusion_matrix[0, 0] + test_confusion_matrix[0, 1]) * 1.0
+        test_sensitivity = test_confusion_matrix[1, 1] * 1.0 / (test_confusion_matrix[1, 0] + test_confusion_matrix[1, 1]) * 1.0
+        test_f1_score = (2.0 * test_confusion_matrix[1, 1]) / (
+                        2.0 * test_confusion_matrix[1, 1] + test_confusion_matrix[0, 1] + test_confusion_matrix[1, 0])
+        # f1-score =2 * TP / (2 * TP + FP + FN)
+
+        logging.info("confusion matrix.training: {}\n".format(str(train_conf_matrix)))
+        logging.info("confusion matrix.test: {}\n".format(str(test_confusion_matrix)))
+        if not self.skip_validation:
+            logging.info("confusion matrix.validation: {}\n".format(str(valid_conf_matrix)))
+
+        with open(self.output, mode='w') as csv_file:
+            fieldnames = ['set', 'AUC-score', 'Sensitivity', 'Specificity', 'Accuracy', 'F1-score', 'Average-precision']
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=' ')
+
+            writer.writeheader()
+            writer.writerow({'set': "training",'AUC-score': self.train_roc, 'Sensitivity':train_sensitivity, 'Specificity': train_specificity, 'Accuracy': train_accuracy, "F1-score": train_f1_score, "Average-precision": self.train_average_precision})
+
+            writer.writerow({'set': "test", 'AUC-score': self.test_roc, 'Sensitivity': test_sensitivity,
+                             'Specificity': test_specificity, 'Accuracy': test_accuracy, "F1-score": test_f1_score,"Average-precision": self.test_average_precision})
+            if not self.skip_validation:
+                writer.writerow({'set': "validation",'AUC-score': self.valid_roc, 'Sensitivity':valid_sensitivity,'Specificity': valid_specificity, 'Accuracy': valid_accuracy, "F1-score": valid_f1_score, "Average-precision": self.valid_average_precision})
 
     def create_edge_embeddings(self, edge_list, node2vector_map) -> \
             Tuple[np.ndarray, np.ndarray, np.ndarray]:
@@ -406,23 +400,19 @@ def create_edge_embeddings(self, edge_list, node2vector_map) -> \
             emb2 = node2vector_map[node2]
             if edge_embedding_method == "hadamard":
                 # Perform a Hadamard transform on the node embeddings.
-                # This is a dot product of the node embedding for the two nodes that
-                # belong to each edge
+                # This is the Hadamard (element-wise) product of the node embeddings for the two nodes
                 edge_emb = np.multiply(emb1, emb2)
             elif edge_embedding_method == "average":
                 # Perform an Average transform on the node embeddings.
-                # This is a elementwise average of the node embedding for the two nodes that
-                # belong to each edge
+                # This is an elementwise average of the node embeddings for the two nodes
                 edge_emb = np.add(emb1, emb2) / 2
             elif edge_embedding_method == "weightedL1":
                 # Perform weightedL1 transform on the node embeddings.
-                # WeightedL1 calculates the absolute value of difference of each element of the two nodes that
-                # belong to each edge
+                # WeightedL1 calculates the absolute value of difference of each element of the two nodes
                 edge_emb = abs(emb1 - emb2)
             elif edge_embedding_method == "weightedL2":
                 # Perform weightedL2 transform on the node embeddings.
-                # WeightedL2 calculates the square of difference of each element of the two nodes that
-                # belong to each edge
+                # WeightedL2 calculates the square of difference of each element of the two nodes
                 edge_emb = np.power((emb1 - emb2), 2)
             else:
                 logging.error("Enter hadamard, average, weightedL1, weightedL2")

diff --git a/embiggen/neural_networks.py b/embiggen/neural_networks.py
@@ -1,7 +1,7 @@
-from tensorflow.keras.models import Sequential, Model  # type: ignore
-from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, Concatenate, Layer  # type: ignore
-from tensorflow.keras.callbacks import EarlyStopping  # type: ignore
-from tensorflow.keras.metrics import AUC  # type: ignore
+from tensorflow.keras.models import Sequential, Model  # type: ignore  # pylint: disable=import-error
+from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, Concatenate, Layer  # type: ignore  # pylint: disable=import-error
+from tensorflow.keras.callbacks import EarlyStopping  # type: ignore  # pylint: disable=import-error
+from tensorflow.keras.metrics import AUC  # type: ignore  # pylint: disable=import-error
 import pandas as pd  # type: ignore
 import numpy as np  # type: ignore
 from typing import Tuple, Dict, List, Union, Optional

diff --git a/embiggen/text_encoder.py b/embiggen/text_encoder.py
@@ -1,12 +1,13 @@
-import nltk  # type: ignore
+import nltk  # type: ignore  # pylint: disable=import-error
 import os
-import tensorflow as tf  # type: ignore
+import pandas as pd  # type: ignore
 import re
+import tensorflow as tf  # type: ignore
 
 from collections import Counter
 from more_itertools import unique_everseen  # type: ignore
 from pandas.core.common import flatten  # type: ignore
-from tensorflow.keras.preprocessing.text import Tokenizer  # type: ignore
+from tensorflow.keras.preprocessing.text import Tokenizer  # type: ignore  # pylint: disable=import-error
 from typing import Dict, List, Optional, Tuple, Union
 
 
@@ -21,34 +22,60 @@ class TextEncoder:
 
     Attributes:
         filename: A filepath and name to text which needs encoding.
+        payload_index: An integer that, if specified, is used to process a specific column from an input csv file.
+        header: An integer that, if specified, is the row index of the input data containing the file header info.
+        delimiter: A string containing the file delimiter type.
         data_type: A string which is used to indicate whether or not the data should be read in as a single text
             object or as a list of sentences. Passed values can be "words" or "sentences" (default="sentences").
         stopwords: A set of stopwords. If nothing is passed by user a default list of stopwords is utilized.
 
     Raises:
-        ValueError: If the filename is None.
-        TypeError: If the filename attribute is not a string.
+        ValueError: If filename is None.
+        TypeError: If filename and delimiter (when specified) are not strings.
+        TypeError: If payload_index and header (when specified) are not integers.
         IOError: If the file referenced by filename could not be found.
+        TypeError: If payload_index is not an integer.
+        ValueError: If data_type is not "words" or "sentences".
     """
 
-    def __init__(self, filename: str, data_type: Optional[str] = None, stopwords: set = None):
+    def __init__(self, filename: str, payload_index: Optional[int] = None, header: Optional[int] = None,
+                 delimiter: Optional[str] = None, data_type: Optional[str] = None, stopwords: set = None) -> None:
 
-        if filename is None:
+        # verify filename structure
+        if not filename:
             raise ValueError('filename cannot be None')
         elif not isinstance(filename, str):
             raise TypeError('filename must be a string')
         elif not os.path.exists(filename):
-            raise IOError('Could not find file referenced by filename: {}'.format(filename))
+            raise IOError('could not find file referenced by filename: {}'.format(filename))
         else:
             self.filename = filename
 
-        self.data_type = data_type if data_type else 'sentences'
+        if payload_index and not isinstance(payload_index, int):
+            raise TypeError('payload_index must be an integer')
+        else:
+            self.payload_index = payload_index if payload_index else None
+
+            if header and not isinstance(header, int):
+                raise TypeError('header must be an integer')
+            else:
+                self.header = header if header else None
+
+            if delimiter and not isinstance(delimiter, str):
+                raise TypeError('delimiter must be a string')
+            else:
+                self.delimiter = delimiter if delimiter else '\t'
+
+        if data_type and data_type.lower() not in ['sentences', 'words']:
+            raise ValueError('data_type must be "words" or "sentences"')
+        else:
+            self.data_type = data_type.lower() if data_type else 'sentences'
 
         try:
-            self.stopwords = nltk.corpus.stopwords.words('english') if stopwords is None else stopwords
+            self.stopwords = nltk.corpus.stopwords.words('english') if not stopwords else stopwords
         except LookupError:
             nltk.download('stopwords')
-            self.stopwords = nltk.corpus.stopwords.words('english') if stopwords is None else stopwords
+            self.stopwords = nltk.corpus.stopwords.words('english') if not stopwords else stopwords
 
     def clean_text(self, text: str) -> str:
         """Takes a text string and performs several tasks that are intended to clean the text including making the
@@ -101,13 +128,21 @@ def process_input_text(self) -> List[str]:
             text: A string or list of strings of text from the read-in file.
         """
 
-        print('Reading data from {filename} and processing it as {data_type}'.format(filename=self.filename,
-                                                                                     data_type=self.data_type))
+        print('Reading {file} and processing it as {data_type}'.format(file=self.filename, data_type=self.data_type))
+
         if self.data_type == 'words':
-            word_data = open(self.filename).read()
+            with open(self.filename, 'r') as input_file:
+                word_data = input_file.read()
+            input_file.close()
             return self.clean_text(word_data).split()
         else:
-            sentence_data = open(self.filename).readlines()
+            if self.payload_index:
+                data = pd.read_csv(self.filename, sep=self.delimiter, header=self.header)
+                sentence_data = list(data[list(data).index(self.payload_index)])
+            else:
+                with open(self.filename, 'r') as input_file:
+                    sentence_data = input_file.readlines()
+                input_file.close()
             return [self.clean_text(sent) for sent in sentence_data]
 
     def build_dataset(self, max_vocab=50000) -> Tuple[Union[tf.Tensor, tf.RaggedTensor], List, Dict, Dict]:
@@ -160,24 +195,21 @@ def build_dataset(self, max_vocab=50000) -> Tuple[Union[tf.Tensor, tf.RaggedTens
                     filtered_count['UNK'] = 0
                     dictionary['UNK'] = 1
                 else:
-                    filtered_count[k] = count[v-1]
+                    filtered_count[k] = count[v - 1]
                     dictionary[k] = v
             else:
                 filtered_count['UNK'] += 1
 
-        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # for downstream compatibility
+        # update for downstream compatibility
+        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
         count_list = [list(x) for x in list(zip(list(filtered_count.keys()), list(filtered_count.values())))]
 
         if max_vocab != len(count_list):
             raise ValueError('The length of count_as_tuples does not match max_vocab_size.')
         else:
-            #try:
-            #    tensor_data = tf.data.Dataset.from_tensor_slices(sequences)
-            #except ValueError:
-            #    tensor_data = tf.ragged.constant(sequences)  # for nested lists of differing lengths
             if isinstance(sequences, list):
                 tensor_data = tf.ragged.constant(sequences)
             else:
-                tensor_data = tf.convert_to_tensor(sequences) # should now be a 1D tensor
+                tensor_data = tf.convert_to_tensor(sequences)  # should now be a 1D tensor
 
         return tensor_data, count_list, dictionary, reverse_dictionary