Skip to content

Commit

Permalink
Merge pull request #182 from monarch-initiative/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
callahantiff authored May 26, 2020
2 parents c504567 + 3e0ea17 commit 93a5b0a
Show file tree
Hide file tree
Showing 11 changed files with 258 additions and 140 deletions.
2 changes: 1 addition & 1 deletion embiggen/glove.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import tensorflow as tf # type: ignore
from random import shuffle

assert tf.__version__ >= "2.0"
assert tf.__version__ >= "2.0" # pylint: disable=no-member


class NotTrainedError(Exception):
Expand Down
86 changes: 38 additions & 48 deletions embiggen/link_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from embiggen.utils import load_embeddings
import numpy as np # type: ignore
from .neural_networks import MLP, FFNN, MultiModalFFNN
import csv

import logging
#import os
Expand Down Expand Up @@ -339,50 +340,43 @@ def output_classifier_results(self):
2.0 * train_conf_matrix[1, 1] + train_conf_matrix[0, 1] + train_conf_matrix[1, 0])
# f1-score =2 * TP / (2 * TP + FP + FN)

with open(self.output, 'w') as f:
f.write("confusion matrix.training: {}\n".format(str(train_conf_matrix)))
f.write('Accuracy.training: {}\n'.format(train_accuracy))
f.write('Specificity.training: {}\n'.format(train_specificity))
f.write('Sensitivity.training: {}\n'.format(train_sensitivity))
f.write('F1-score.training: {}\n'.format(train_f1_score))
f.write("ROC score.training: {}\n ".format(str(self.train_roc)))
f.write("AP score.training: {}\n".format(str(self.train_average_precision)))
logging.info("confusion matrix.training: {}\n".format(str(train_conf_matrix)))

if not self.skip_validation:
valid_conf_matrix = self.validation_confusion_matrix
total = sum(sum(valid_conf_matrix))
valid_accuracy = (valid_conf_matrix[0, 0] + valid_conf_matrix[1, 1]) / total
valid_specificity = valid_conf_matrix[0, 0] / (valid_conf_matrix[0, 0] + valid_conf_matrix[0, 1])
valid_sensitivity = valid_conf_matrix[1, 1] / (valid_conf_matrix[1, 0] + valid_conf_matrix[1, 1])
valid_f1_score = (2.0 * valid_conf_matrix[1, 1]) / (
if not self.skip_validation:
valid_conf_matrix = self.validation_confusion_matrix
total = sum(sum(valid_conf_matrix))
valid_accuracy = (valid_conf_matrix[0, 0] + valid_conf_matrix[1, 1]) / total
valid_specificity = valid_conf_matrix[0, 0] / (valid_conf_matrix[0, 0] + valid_conf_matrix[0, 1])
valid_sensitivity = valid_conf_matrix[1, 1] / (valid_conf_matrix[1, 0] + valid_conf_matrix[1, 1])
valid_f1_score = (2.0 * valid_conf_matrix[1, 1]) / (
2.0 * valid_conf_matrix[1, 1] + valid_conf_matrix[0, 1] + valid_conf_matrix[1, 0])
# f1-score =2 * TP / (2 * TP + FP + FN)

f.write("confusion matrix.validation: {}\n".format(str(valid_conf_matrix)))
f.write('Accuracy.validation : {}\n'.format(valid_accuracy))
f.write('Specificity.validation: {}\n'.format(valid_specificity))
f.write('Sensitivity.validation: {}\n'.format(valid_sensitivity))
f.write('F1-score.validation: {}\n'.format(valid_f1_score))
f.write("ROC score.validation: {}\n ".format(str(self.valid_roc)))
f.write("AP score.validation: {}\n".format(str(self.valid_average_precision)))

test_confusion_matrix = self.test_confusion_matrix
total = sum(sum(test_confusion_matrix))
test_accuracy = (test_confusion_matrix[0, 0] + test_confusion_matrix[1, 1]) * 1.0 / total
test_specificity = test_confusion_matrix[0, 0] * 1.0 / (test_confusion_matrix[0, 0] + test_confusion_matrix[0, 1]) * 1.0
test_sensitivity = test_confusion_matrix[1, 1] * 1.0 / (test_confusion_matrix[1, 0] + test_confusion_matrix[1, 1]) * 1.0
test_f1_score = (2.0 * test_confusion_matrix[1, 1]) / (
2.0 * test_confusion_matrix[1, 1] + test_confusion_matrix[0, 1] + test_confusion_matrix[1, 0])
# f1-score =2 * TP / (2 * TP + FP + FN)

f.write("confusion matrix.test: {}\n".format(str(test_confusion_matrix)))
f.write('Accuracy.test: {}\n'.format(test_accuracy))
f.write('Specificity.test: {}\n'.format(test_specificity))
f.write('Sensitivity.test: {}\n'.format(test_sensitivity))
f.write("F1-score.test: {}\n".format(test_f1_score))
f.write("ROC score.test: {}\n ".format(str(self.test_roc)))
f.write("AP score.test: {} \n".format(str(self.test_average_precision)))
f.close()
test_confusion_matrix = self.test_confusion_matrix
total = sum(sum(test_confusion_matrix))
test_accuracy = (test_confusion_matrix[0, 0] + test_confusion_matrix[1, 1]) * 1.0 / total
test_specificity = test_confusion_matrix[0, 0] * 1.0 / (test_confusion_matrix[0, 0] + test_confusion_matrix[0, 1]) * 1.0
test_sensitivity = test_confusion_matrix[1, 1] * 1.0 / (test_confusion_matrix[1, 0] + test_confusion_matrix[1, 1]) * 1.0
test_f1_score = (2.0 * test_confusion_matrix[1, 1]) / (
2.0 * test_confusion_matrix[1, 1] + test_confusion_matrix[0, 1] + test_confusion_matrix[1, 0])
# f1-score =2 * TP / (2 * TP + FP + FN)

logging.info("confusion matrix.training: {}\n".format(str(train_conf_matrix)))
logging.info("confusion matrix.test: {}\n".format(str(test_confusion_matrix)))
if not self.skip_validation:
logging.info("confusion matrix.validation: {}\n".format(str(valid_conf_matrix)))

with open(self.output, mode='w') as csv_file:
fieldnames = ['set', 'AUC-score', 'Sensitivity', 'Specificity', 'Accuracy', 'F1-score', 'Average-precision']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=' ')

writer.writeheader()
writer.writerow({'set': "training",'AUC-score': self.train_roc, 'Sensitivity':train_sensitivity, 'Specificity': train_specificity, 'Accuracy': train_accuracy, "F1-score": train_f1_score, "Average-precision": self.train_average_precision})

writer.writerow({'set': "test", 'AUC-score': self.test_roc, 'Sensitivity': test_sensitivity,
'Specificity': test_specificity, 'Accuracy': test_accuracy, "F1-score": test_f1_score,"Average-precision": self.test_average_precision})
if not self.skip_validation:
writer.writerow({'set': "validation",'AUC-score': self.valid_roc, 'Sensitivity':valid_sensitivity,'Specificity': valid_specificity, 'Accuracy': valid_accuracy, "F1-score": valid_f1_score, "Average-precision": self.valid_average_precision})

def create_edge_embeddings(self, edge_list, node2vector_map) -> \
Tuple[np.ndarray, np.ndarray, np.ndarray]:
Expand All @@ -406,23 +400,19 @@ def create_edge_embeddings(self, edge_list, node2vector_map) -> \
emb2 = node2vector_map[node2]
if edge_embedding_method == "hadamard":
# Perform a Hadamard transform on the node embeddings.
# This is a dot product of the node embedding for the two nodes that
# belong to each edge
# This is a hadamard product of the node embedding for the two nodes
edge_emb = np.multiply(emb1, emb2)
elif edge_embedding_method == "average":
# Perform an Average transform on the node embeddings.
# This is a elementwise average of the node embedding for the two nodes that
# belong to each edge
# This is an elementwise average of the node embedding for the two nodes
edge_emb = np.add(emb1, emb2) / 2
elif edge_embedding_method == "weightedL1":
# Perform weightedL1 transform on the node embeddings.
# WeightedL1 calculates the absolute value of difference of each element of the two nodes that
# belong to each edge
# WeightedL1 calculates the absolute value of difference of each element of the two nodes
edge_emb = abs(emb1 - emb2)
elif edge_embedding_method == "weightedL2":
# Perform weightedL2 transform on the node embeddings.
# WeightedL2 calculates the square of difference of each element of the two nodes that
# belong to each edge
# WeightedL2 calculates the square of difference of each element of the two nodes
edge_emb = np.power((emb1 - emb2), 2)
else:
logging.error("Enter hadamard, average, weightedL1, weightedL2")
Expand Down
8 changes: 4 additions & 4 deletions embiggen/neural_networks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from tensorflow.keras.models import Sequential, Model # type: ignore
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, Concatenate, Layer # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
from tensorflow.keras.metrics import AUC # type: ignore
from tensorflow.keras.models import Sequential, Model # type: ignore # pylint: disable=import-error
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, Concatenate, Layer # type: ignore # pylint: disable=import-error
from tensorflow.keras.callbacks import EarlyStopping # type: ignore # pylint: disable=import-error
from tensorflow.keras.metrics import AUC # type: ignore # pylint: disable=import-error
import pandas as pd # type: ignore
import numpy as np # type: ignore
from typing import Tuple, Dict, List, Union, Optional
Expand Down
76 changes: 54 additions & 22 deletions embiggen/text_encoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import nltk # type: ignore
import nltk # type: ignore # pylint: disable=import-error
import os
import tensorflow as tf # type: ignore
import pandas as pd # type: ignore
import re
import tensorflow as tf # type: ignore

from collections import Counter
from more_itertools import unique_everseen # type: ignore
from pandas.core.common import flatten # type: ignore
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore # pylint: disable=import-error
from typing import Dict, List, Optional, Tuple, Union


Expand All @@ -21,34 +22,60 @@ class TextEncoder:
Attributes:
filename: A filepath and name to text which needs encoding.
payload_index: An integer that if specified is used to process a specific column from an input csv file.
header: An integer, that if specified contains the row index of the input data containing file header info.
delimiter: A string containing the file delimiter type.
data_type: A string which is used to indicate whether or not the data should be read in as a single text
object or as a list of sentences. Passed values can be "words" or "sentences" (default="sentences").
stopwords: A set of stopwords. If nothing is passed by user a default list of stopwords is utilized.
Raises:
ValueError: If the filename is None.
TypeError: If the filename attribute is not a string.
ValueError: If filename is None.
TypeError: If filename and delimiter (when specified) are not strings.
TypeError: If payload_index and header (when specified) are not integers.
IOError: If the file referenced by filename could not be found.
TypeError: If payload_index is not an integer.
ValueError: If data_type is not "words" or "sentences".
"""

def __init__(self, filename: str, data_type: Optional[str] = None, stopwords: set = None):
def __init__(self, filename: str, payload_index: Optional[int] = None, header: Optional[int] = None,
delimiter: Optional[str] = None, data_type: Optional[str] = None, stopwords: set = None) -> None:

if filename is None:
# verify filename structure
if not filename:
raise ValueError('filename cannot be None')
elif not isinstance(filename, str):
raise TypeError('filename must be a string')
elif not os.path.exists(filename):
raise IOError('Could not find file referenced by filename: {}'.format(filename))
raise IOError('could not find file referenced by filename: {}'.format(filename))
else:
self.filename = filename

self.data_type = data_type if data_type else 'sentences'
if payload_index and not isinstance(payload_index, int):
raise TypeError('payload_index must be an integer')
else:
self.payload_index = payload_index if payload_index else None

if header and not isinstance(header, int):
raise TypeError('header must be an integer')
else:
self.header = header if header else None

if delimiter and not isinstance(delimiter, str):
raise TypeError('delimiter must be a string')
else:
self.delimiter = delimiter if delimiter else '\t'

if data_type and data_type.lower() not in ['sentences', 'words']:
raise ValueError('data_type must be "words" or "sentences"')
else:
self.data_type = data_type.lower() if data_type else 'sentences'

try:
self.stopwords = nltk.corpus.stopwords.words('english') if stopwords is None else stopwords
self.stopwords = nltk.corpus.stopwords.words('english') if not stopwords else stopwords
except LookupError:
nltk.download('stopwords')
self.stopwords = nltk.corpus.stopwords.words('english') if stopwords is None else stopwords
self.stopwords = nltk.corpus.stopwords.words('english') if not stopwords else stopwords

def clean_text(self, text: str) -> str:
"""Takes a text string and performs several tasks that are intended to clean the text including making the
Expand Down Expand Up @@ -101,13 +128,21 @@ def process_input_text(self) -> List[str]:
text: A string or list of strings of text from the read in file.
"""

print('Reading data from {filename} and processing it as {data_type}'.format(filename=self.filename,
data_type=self.data_type))
print('Reading {file} and processing it as {data_type}'.format(file=self.filename, data_type=self.data_type))

if self.data_type == 'words':
word_data = open(self.filename).read()
with open(self.filename, 'r') as input_file:
word_data = input_file.read()
input_file.close()
return self.clean_text(word_data).split()
else:
sentence_data = open(self.filename).readlines()
if self.payload_index:
data = pd.read_csv(self.filename, sep=self.delimiter, header=self.header)
sentence_data = list(data[list(data).index(self.payload_index)])
else:
with open(self.filename, 'r') as input_file:
sentence_data = input_file.readlines()
input_file.close()
return [self.clean_text(sent) for sent in sentence_data]

def build_dataset(self, max_vocab=50000) -> Tuple[Union[tf.Tensor, tf.RaggedTensor], List, Dict, Dict]:
Expand Down Expand Up @@ -160,24 +195,21 @@ def build_dataset(self, max_vocab=50000) -> Tuple[Union[tf.Tensor, tf.RaggedTens
filtered_count['UNK'] = 0
dictionary['UNK'] = 1
else:
filtered_count[k] = count[v-1]
filtered_count[k] = count[v - 1]
dictionary[k] = v
else:
filtered_count['UNK'] += 1

reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) # for downstream compatibility
# update for downstream compatibility
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
count_list = [list(x) for x in list(zip(list(filtered_count.keys()), list(filtered_count.values())))]

if max_vocab != len(count_list):
raise ValueError('The length of count_as_tuples does not match max_vocab_size.')
else:
#try:
# tensor_data = tf.data.Dataset.from_tensor_slices(sequences)
#except ValueError:
# tensor_data = tf.ragged.constant(sequences) # for nested lists of differing lengths
if isinstance(sequences, list):
tensor_data = tf.ragged.constant(sequences)
else:
tensor_data = tf.convert_to_tensor(sequences) # should now be a 1D tensor
tensor_data = tf.convert_to_tensor(sequences) # should now be a 1D tensor

return tensor_data, count_list, dictionary, reverse_dictionary
Loading

0 comments on commit 93a5b0a

Please sign in to comment.