Word embeddings loaded directly (#6)
* Model Evaluation (regression on v1.0.2)
* Improved peak memory usage
kylase authored Aug 20, 2018
1 parent d77b92f commit f4ad0ba
Showing 9 changed files with 161 additions and 83 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -5,4 +5,4 @@ python:
install:
- pip install -r requirements/test.txt
script:
- pytest
- pytest -rs
2 changes: 2 additions & 0 deletions Dockerfile
@@ -1,5 +1,7 @@
FROM python:2

ENV ENVIRONMENT prod

WORKDIR /usr/src

RUN apt-get update \
18 changes: 12 additions & 6 deletions README.md
@@ -17,17 +17,23 @@ source .venv/bin/activate
pip install -r requirements.txt
```

### Word Embeddings

The word embeddings does not come with this repository. You can obtain the [word embeddings](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) from WING website.

You will need to extract the content of the word embedding archive (`vectors.tar.gz`) to the root directory for this repository by running `tar xfz vectors.tar.gz`.

### Using Docker

1. Build the image: `docker build -t theano-gensim - < Dockerfile`
1. Run the repo mounted to the container: `docker run -it -v /path/to/Neural-ParsCit:/usr/src --name np theano-gensim:latest /bin/bash`

## Word Embeddings

The word embeddings do not come with this repository. You can obtain the [word embeddings without `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) (not recommended for v1.0.3) or the [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz), as well as the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) file (deprecated in v1.0.3, since the entire set of word vectors can now be loaded with less memory), from the WING website. Please read the next section on the availability of `<UNK>` in the word embeddings.

You will need to extract the contents of the word embedding archive (`vectors_with_unk.tar.gz`) into the root directory of this repository by running `tar xfz vectors_with_unk.tar.gz`.
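
For reference, a minimal sketch (not part of this commit) of loading the extracted embeddings with gensim, assuming the archive unpacks to `vectors_with_unk.kv` plus its `.npy` companion, the name used by the test added below:

```python
# Minimal sketch (assumed file name): memory-mapped load of the extracted embeddings.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load('vectors_with_unk.kv', mmap='r')  # lazy, read-only mapping
print(vectors.vectors.shape)   # (vocabulary size, embedding dimension)
print(u'<UNK>' in vectors)     # True for the "with <UNK>" archive
```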

### Embeddings Without `<UNK>`

If the word embeddings provided do not have `<UNK>`, your instance will not benefit from lazy loading of the word vectors and hence from the reduced memory requirements.

Without `<UNK>`, at most 7.5 GB of memory is required, because the entire set of word vectors must be instantiated in memory to create the new matrix. With `<UNK>`, the requirement is much lower: at most 4.5 GB.
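
The memory gap comes from how a missing `<UNK>` is handled (see the `EmbeddingLayer` change in `nn.py` below): the token has to be appended with a zero vector, which materialises the whole matrix in RAM. A rough sketch, with the file name assumed:

```python
# Sketch only: appending <UNK> to embeddings that lack it forces a full in-memory copy.
import numpy as np
from gensim.models import KeyedVectors

vectors = KeyedVectors.load('vectors.kv', mmap='r')  # hypothetical path to the no-<UNK> vectors
if u'<UNK>' not in vectors:
    # add() concatenates into a new in-memory array, defeating the mmap savings
    vectors.add([u'<UNK>'], [np.zeros(vectors.vectors.shape[1], dtype=np.float32)])
```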

## Parse citation strings

The fastest way to use the parser is to run state-of-the-art pre-trained model as follows:
25 changes: 14 additions & 11 deletions model.py
@@ -13,7 +13,8 @@
from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward
from optimization import Optimization

logging.basicConfig(format="%(asctime)-15s %(message)s", level=logging.INFO)
logging.basicConfig(format="[%(levelname)s] %(asctime)-15s: %(message)s",
level=logging.INFO)
logger = logging.getLogger

class Model(object):
@@ -169,18 +170,14 @@ def build(self,
# Word inputs
if word_dim:
input_dim += word_dim
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer', train=training)
word_input = word_layer.link(word_ids)
inputs.append(word_input)
# Initialize with pretrained embeddings
if pre_emb and training:
pretrained = self.load_word_embeddings(pre_emb)
if training:
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
new_weights = word_layer.embeddings.get_value()
logging.info("Loading pretrained embeddings from %s...", pre_emb)
emb_invalid = 0

#use gensim models as pretrained embeddings
pretrained = KeyedVectors.load(pre_emb, mmap='r')

# for i, line in enumerate(codecs.open(pre_emb, 'r', 'cp850')):
# line = line.rstrip().split()
# if len(line) == word_dim + 1:
@@ -216,8 +213,13 @@ def build(self,
n_words, 100. * (c_found + c_lower + c_zeros) / n_words)
logging.info('%i found directly, %i after lowercasing, '
'%i after lowercasing + zero.', c_found, c_lower, c_zeros)
else:
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer',
pretrained=pretrained)
self.id_to_word.update({i: w for i, w in enumerate(pretrained.index2entity)})

#
word_input = word_layer.link(word_ids)
inputs.append(word_input)
# Chars inputs
#
if char_dim:
@@ -414,7 +416,8 @@ def load_word_embeddings(embeddings, mode='r'):
if isinstance(embeddings, KeyedVectors):
return embeddings
else:
if os.path.isfile(embeddings) and os.path.isfile(embeddings + 'vectors.npy'):
return KeyedVectors.load(embeddings, mmap=mode)
if os.path.isfile(embeddings) and os.path.isfile(embeddings + '.vectors.npy'):
v = KeyedVectors.load(embeddings, mmap=mode)
return v
else:
raise IOError("{embeddings} cannot be found.".format(embeddings=embeddings))
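
For illustration only (not part of this commit), a usage sketch of the helper above, assuming it is exposed as a `@staticmethod` on `Model` and that `vectors_with_unk.kv` sits next to its `.vectors.npy` file:

```python
# Usage sketch (assumed paths); the helper accepts a path or an existing KeyedVectors.
from gensim.models import KeyedVectors
from model import Model

kv = Model.load_word_embeddings('vectors_with_unk.kv')  # loads with mmap='r' by default
same = Model.load_word_embeddings(kv)                    # a KeyedVectors instance is returned as-is
assert isinstance(kv, KeyedVectors) and same is kv

try:
    Model.load_word_embeddings('no_such_file.kv')
except IOError as err:
    print(err)  # "no_such_file.kv cannot be found."
```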
28 changes: 21 additions & 7 deletions nn.py
@@ -1,4 +1,6 @@
import logging
import theano
import numpy as np
import theano.tensor as T
from utils import shared

@@ -59,20 +61,32 @@ class EmbeddingLayer(object):
Output: tensor of dimension (dim*, output_dim)
"""

def __init__(self, input_dim, output_dim, name='embedding_layer', train=True):
def __init__(self, input_dim, output_dim, name='embedding_layer', pretrained=None):
"""
Typically, input_dim is the vocabulary size,
and output_dim the embedding dimension.
"""
self.input_dim = input_dim
self.output_dim = output_dim
self.name = name
self.train = train

# Randomly generate weights
self.embeddings = shared((input_dim, output_dim),
self.name + '__embeddings',
train=self.train)
if pretrained:
if u'<UNK>' not in pretrained:
logging.warn('<UNK> is not found in the pretrained and will be added.'
'This will consume more memory than usual.')
pretrained.add([u'<UNK>'],
[np.zeros((pretrained.vectors.shape[1], ),
dtype=theano.config.floatX)])

if pretrained.vectors.dtype == theano.config.floatX:
self.embeddings = theano.shared(value=pretrained.vectors,
name=self.name + '__embeddings')
else:
self.embeddings = theano.shared(value=pretrained.vectors.astype(theano.config.floatX),
name=self.name + '__embeddings')
else:
# Randomly generate weights
self.embeddings = shared((input_dim, output_dim),
self.name + '__embeddings')

# Define parameters
self.params = [self.embeddings]
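
For illustration only (not part of this commit), the two construction paths of the revised layer, with the embedding file name assumed:

```python
# Sketch of EmbeddingLayer after this change: random init vs. pretrained KeyedVectors.
from gensim.models import KeyedVectors
from nn import EmbeddingLayer

# Random initialisation, driven purely by the requested shape.
random_layer = EmbeddingLayer(input_dim=10000, output_dim=500, name='word_layer')

# Weights taken directly from a (possibly memory-mapped) KeyedVectors.
kv = KeyedVectors.load('vectors_with_unk.kv', mmap='r')  # assumed file name
pretrained_layer = EmbeddingLayer(kv.vectors.shape[0], kv.vectors.shape[1],
                                  name='word_layer', pretrained=kv)
print(pretrained_layer.embeddings.get_value(borrow=True).shape)
```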
6 changes: 4 additions & 2 deletions requirements/dev.txt
@@ -1,4 +1,6 @@
-r prod.txt
pylint==1.9.2
pytest==3.5.1
-r test.txt
ipython==5.7.0
git+https://github.com/pytorch/text.git@master
torch==0.4.1
sklearn==0.19.2
65 changes: 18 additions & 47 deletions run.py
@@ -5,7 +5,7 @@
import json
import numpy as np
import theano
from gensim.models import KeyedVectors
from contextlib import closing
from utils import evaluate, create_input
from model import Model
from loader import augment_with_pretrained, load_sentences, prepare_dataset
@@ -35,42 +35,10 @@
opts = optparser.parse_args()[0]

model = Model(model_path=opts.model_path)
model.parameters['pre_emb'] = os.path.join(os.getcwd(), opts.pre_emb)
f = model.build(training=False, **model.parameters)
model.reload()

model.parameters['pre_emb'] = opts.pre_emb
pretrained = KeyedVectors.load(model.parameters['pre_emb'], mmap='r')
n_words = len(model.id_to_word)

#only include pretrained embeddings for 640780 most frequent words
words = [item[0] for item in json.load(open('freq', 'r'))]

#Create new mapping because model.id_to_word only is an Ordered dict of only training and testing data
model.id_to_word = {}

discarded = 640780
new_weights = np.empty((n_words - n_words/2 + 1, 500), dtype=theano.config.floatX)
for i in range((n_words/2), n_words):
word = words[i]
lower = word.lower()
digits = re.sub(r'\d', '0', lower)
idx = i - discarded
if word in pretrained:
model.id_to_word[idx] = word
new_weights[idx] = pretrained[word]
elif lower in pretrained:
model.id_to_word[idx] = lower
new_weights[idx] = pretrained[lower]
elif digits in pretrained:
model.id_to_word[idx] = digits
new_weights[idx] = pretrained[digits]

model.id_to_word[0] = '<UNK>'
#Reset the values of word layer
model.components['word_layer'].embeddings.set_value(new_weights)
#release memory occupied by word embeddings
del pretrained
del new_weights
model.reload()

lower = model.parameters['lower']
zeros = model.parameters['zeros']
@@ -82,36 +50,39 @@
if opts.run == 'file':
assert opts.input_file
assert opts.output_file
input_file = opts.input_file

output_file = opts.output_file
data = open(input_file, 'r').read()

with closing(open(opts.input_file, 'r')) as fh:
data = fh.read()
strings = data.split('\n')
else:
string = raw_input("Enter the citation string: ")
strings = [string]

test_file = "test_file"
if os.path.exists(test_file):
os.remove(test_file)
file = open(test_file, 'a')
for string in strings:
file.write('\n'.join(string.split())+'\n')
file.write('\n'.join(string.split()) + '\n')
file.close()
test_sentences = load_sentences(test_file, lower, zeros)
data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower, True)

for citation in data:
inputs = create_input(citation, model.parameters, False)
y_pred = np.array(f[1](*inputs))[1:-1]
tags = []
for i in range(len(y_pred)):
tags.append(model.id_to_tag[y_pred[i]])
output = []
for num, word in enumerate(citation['str_words']):
output.append(word+'\t'+tags[num])

tags = [model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]

output = [w + '\t' + tags[i] for i, w in enumerate(citation['str_words'])]

if opts.run == 'file':
file = open(output_file, 'w')
file.write('\n'.join(output))
file.close()
with closing(open(output_file, 'w')) as fh:
fh.write('\n'.join(output))
else:
print('\n'.join(output))

if opts.run == 'file':
break
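
Condensed, the inference path of `run.py` after this commit amounts to the following sketch (model and embedding paths assumed; the citation string is arbitrary):

```python
# Sketch of the simplified flow: embeddings are now loaded inside Model.build().
import os
import numpy as np
from model import Model
from loader import load_sentences, prepare_dataset
from utils import create_input

model = Model(model_path='models/neuralParsCit')  # assumed pre-trained model path
model.parameters['pre_emb'] = os.path.join(os.getcwd(), 'vectors_with_unk.kv')
f = model.build(training=False, **model.parameters)
model.reload()

word_to_id = {w: i for i, w in model.id_to_word.items()}
char_to_id = {c: i for i, c in model.id_to_char.items()}

# One token per line, exactly as run.py writes its temporary test_file.
with open('test_file', 'w') as fh:
    fh.write('\n'.join("A. Author. A Title. A Journal, 2018.".split()) + '\n')

sentences = load_sentences('test_file', model.parameters['lower'], model.parameters['zeros'])
for citation in prepare_dataset(sentences, word_to_id, char_to_id, model.parameters['lower'], True):
    inputs = create_input(citation, model.parameters, False)
    y_pred = np.array(f[1](*inputs))[1:-1]
    print([(w, model.id_to_tag[y_pred[i]]) for i, w in enumerate(citation['str_words'])])
```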
83 changes: 83 additions & 0 deletions tests/models/test_inference.py
@@ -0,0 +1,83 @@
import os
import tempfile
import pytest
import requests
import numpy as np

from model import Model
from loader import load_sentences, prepare_dataset
from utils import create_input

CORA_URL = "https://raw.githubusercontent.com/knmnyn/ParsCit/master/crfpp/traindata/cora.train"

# Skip this test when running in CI as the amount of memory is not sufficient
# to build the model
@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
def test_inference_performance():
from sklearn.metrics import f1_score
from torchtext.datasets import SequenceTaggingDataset
from torchtext.data import Field, NestedField

WORD = Field(init_token='<bos>', eos_token='<eos>')
CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
ENTITY = Field(init_token='<bos>', eos_token='<eos>')

data_file = tempfile.NamedTemporaryFile(delete=True)

# TODO Need to be decoded in Python 3
data_file.write(requests.get(CORA_URL).content)

fields = [(('text', 'char'), (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]

dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

model = Model(model_path='models/neuralParsCit')
model.parameters['pre_emb'] = os.path.join(os.getcwd(), 'vectors_with_unk.kv')
f = model.build(training=False, **model.parameters)

model.reload()

word_to_id = {v:i for i, v in model.id_to_word.items()}
char_to_id = {v:i for i, v in model.id_to_char.items()}
tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

tf = tempfile.NamedTemporaryFile(delete=False)
tf.write("\n\n".join(["\n".join(example.text) for example in dataset.examples]))
tf.close()

train_sentences = load_sentences(tf.name,
model.parameters['lower'],
model.parameters['zeros'])

train_inputs = prepare_dataset(train_sentences,
word_to_id,
char_to_id,
model.parameters['lower'], True)

preds = []

for citation in train_inputs:
inputs = create_input(citation, model.parameters, False)
y_pred = np.array(f[1](*inputs))[1:-1]

preds.append([(w, y_pred[i]) for i, w in enumerate(citation['str_words'])])

assert len(preds) == len(dataset.examples)

results = []

for P, T in zip(preds, dataset.examples):
for p, t in zip(P, zip(T.text, T.entity)):
results.append((p[1], tag_to_id[t[1]]))

pred, true = zip(*results)

eval_metrics = {
'micro_f1': f1_score(true, pred, average='micro'),
'macro_f1': f1_score(true, pred, average='macro')
}

data_file.close()

assert eval_metrics == pytest.approx({'macro_f1': 0.98, 'micro_f1': 0.99}, abs=0.01)
15 changes: 6 additions & 9 deletions utils.py
@@ -41,20 +41,17 @@ def set_values(name, param, pretrained):
).astype(np.float32))


def shared(shape, name, train=True):
def shared(shape, name):
"""
Create a shared object of a numpy array.
"""
if train:
if len(shape) == 1:
value = np.zeros(shape) # bias are initialized with zeros
else:
drange = np.sqrt(6. / (np.sum(shape)))
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
return theano.shared(value=value.astype(theano.config.floatX), name=name)
if len(shape) == 1:
value = np.zeros(shape) # bias are initialized with zeros
else:
return theano.shared(value=np.zeros(shape, dtype=theano.config.floatX), name=name)
drange = np.sqrt(6. / (np.sum(shape)))
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)

return theano.shared(value=value.astype(theano.config.floatX), name=name)

def create_dico(item_list):
"""
