Training Fix (#11)
* #9 Deprecate load_word2vec_format (see the conversion sketch below)

* Fixed Travis testing issue

* Python 2/3 compatibility; weak support for Python 3

* Use scikit-learn for model evaluation

* Restore the code for validation and test sets
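
The first bullet refers to gensim's move away from `load_word2vec_format`: loader.py (below) now calls `KeyedVectors.load(path, mmap='r')`, which expects gensim's native format rather than a word2vec binary. A minimal conversion sketch, assuming gensim 3.x; both file names are hypothetical placeholders:

```python
from gensim.models import KeyedVectors

# One-time conversion: read a word2vec-format binary, save in gensim's
# native format so later loads can memory-map it read-only.
kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
kv.save('vectors.kv')

# This mirrors what loader.py now does at load time:
pretrained = KeyedVectors.load('vectors.kv', mmap='r')
```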
kylase authored Dec 20, 2018
1 parent f2de7dd commit 188721d
Showing 9 changed files with 110 additions and 125 deletions.
.travis.yml: 4 additions, 0 deletions

@@ -2,6 +2,10 @@ language: python
 cache: pip
 python:
 - "2.7"
+# workaround to make boto work on travis
+# from https://github.com/travis-ci/travis-ci/issues/7940
+before_install:
+- sudo rm -f /etc/boto.cfg
 install:
 - pip install -r requirements/test.txt
 script:

README.md: 6 additions, 4 deletions

@@ -9,7 +9,7 @@ Neural ParsCit is a citation string parser which parses reference strings into i...

 ## Initial setup

-To use the tagger, you need Python 2.7, with Numpy, Theano and Gensim installed.
+To use the tagger, you need Python 2.7 (works in Python 3 but not fully supported), with Numpy, Theano and Gensim installed. scikit-learn is needed for model evaluation if you are training a new model.

 You can use environmental variables to set the following:
 - `MODEL_PATH`: Path to the model's parameters
@@ -61,6 +61,8 @@ The state-of-the-art trained model is provided in the models folder and is named...

 ### Using a Web Server

+Note: This service is not Python 3 compatible due to `unicode`.
+
 The web server (a Flask app) provides REST API.

 Running the web server,
@@ -94,10 +96,10 @@ Details about the training data, experiments can be found in the following article:
   title={Neural ParsCit: A Deep Learning Based Reference String Parser},
   author={Prasad, Animesh and Kaur, Manpreet and Kan, Min-Yen},
   journal={International Journal on Digital Libraries},
-  volume={},
-  pages={},
+  volume={19},
+  pages={323-337},
   year={2018},
   publisher={Springer},
-  url={}
+  url={https://link.springer.com/article/10.1007/s00799-018-0242-1}
 }
 ```

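The Python 3 note above stems from the `unicode` builtin, which Python 3 removed. A common compatibility shim, shown only as a sketch (this commit does not patch the web app):

```python
import sys

# Python 3 has no `unicode` builtin; alias it to `str` so Python 2 era
# code that calls unicode(...) on text keeps working.
if sys.version_info[0] >= 3:
    unicode = str

print(unicode('citation string'))
```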
loader.py: 14 additions, 13 deletions

@@ -1,9 +1,10 @@
+from __future__ import print_function
 import os
 import re
 import codecs
 from utils import create_dico, create_mapping, zero_digits
 from utils import iob2, iob_iobes
-import gensim, re
+import gensim

 def load_sentences(path, lower, zeros):
     """
@@ -61,9 +62,9 @@ def word_mapping(sentences, lower):
     dico = create_dico(words)
     dico['<UNK>'] = 10000000
     word_to_id, id_to_word = create_mapping(dico)
-    print "Found %i unique words (%i in total)" % (
+    print("Found %i unique words (%i in total)" % (
         len(dico), sum(len(x) for x in words)
-    )
+    ))
     return dico, word_to_id, id_to_word


@@ -74,7 +75,7 @@ def char_mapping(sentences):
     chars = ["".join([w[0] for w in s]) for s in sentences]
     dico = create_dico(chars)
     char_to_id, id_to_char = create_mapping(dico)
-    print "Found %i unique characters" % len(dico)
+    print("Found %i unique characters" % len(dico))
     return dico, char_to_id, id_to_char


@@ -85,7 +86,7 @@ def tag_mapping(sentences):
     tags = [[word[-1] for word in s] for s in sentences]
     dico = create_dico(tags)
     tag_to_id, id_to_tag = create_mapping(dico)
-    print "Found %i unique named entity tags" % len(dico)
+    print("Found %i unique named entity tags" % len(dico))
     return dico, tag_to_id, id_to_tag


@@ -124,7 +125,7 @@ def f(x): return x.lower() if lower else x
         'caps': caps
     }

-def prepare_dataset(sentences, word_to_id, char_to_id, lower=False, zeros=False):
+def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False, zeros=False):
     """
     Prepare the dataset. Return a list of lists of dictionaries containing:
         - word indexes
@@ -147,11 +148,13 @@ def f(x):
         chars = [[char_to_id[c] for c in w if c in char_to_id]
                  for w in str_words]
         caps = [cap_feature(w) for w in str_words]
+        tags = [tag_to_id[w[-1]] for w in s]
         data.append({
             'str_words': str_words,
             'words': words,
             'chars': chars,
             'caps': caps,
+            'tags': tags,
         })
     return data

@@ -162,17 +165,19 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
     to the dictionary, otherwise, we only add the words that are given by
     `words` (typically the words in the development and test sets.)
     """
-    print 'Loading pretrained embeddings from %s...' % ext_emb_path
+    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

+    is_digit = re.compile(r'\d')
+
     # Load pretrained embeddings from file
     #pretrained = set([
     #    line.rstrip().split()[0].strip()
     #    for line in codecs.open(ext_emb_path, 'r', 'cp850')
     #    if len(ext_emb_path) > 0
     #])

-    pretrained = gensim.models.KeyedVectors.load_word2vec_format(ext_emb_path, binary=True)
+    pretrained = gensim.models.KeyedVectors.load(ext_emb_path, mmap='r')

     # We either add every word in the pretrained file,
     # or only words given in the `words` list to which
@@ -183,11 +188,7 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
             dictionary[word] = 0
     else:
         for word in words:
-            if any(x in pretrained for x in [
-                word,
-                word.lower(),
-                re.sub('\d', '0', word.lower())
-            ]) and word not in dictionary:
+            if any(x in pretrained for x in [word, word.lower(), is_digit.sub('0', word.lower())]) and word not in dictionary:
                 dictionary[word] = 0

     word_to_id, id_to_word = create_mapping(dictionary)

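With the change above, `prepare_dataset` takes the tag mapping so every sentence also carries gold tag indexes. A usage sketch built from the functions in this file; the data path is a hypothetical placeholder:

```python
import loader
from loader import word_mapping, char_mapping, tag_mapping, prepare_dataset

# Load and map the training data; 'data/train.txt' is a placeholder path.
train_sentences = loader.load_sentences('data/train.txt', lower=True, zeros=False)
dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower=True)
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# tag_to_id is now a required positional argument.
train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower=True)
```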
model.py: 9 additions, 7 deletions

@@ -1,6 +1,9 @@
+from __future__ import print_function
 import logging
-import cPickle
+try:
+    import cPickle
+except ImportError:
+    import pickle as cPickle
 import os
 import re
 import numpy as np
@@ -176,7 +179,6 @@ def build(self,
             word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
             new_weights = word_layer.embeddings.get_value()
             logging.info("Loading pretrained embeddings from %s...", pre_emb)
-            emb_invalid = 0

             # for i, line in enumerate(codecs.open(pre_emb, 'r', 'cp850')):
             #     line = line.rstrip().split()
@@ -192,19 +194,19 @@
             c_found = 0
             c_lower = 0
             c_zeros = 0
+
+            is_digit = re.compile(r'\d')
             # Lookup table initialization
-            for i in range(n_words):
+            for i in xrange(n_words):
                 word = self.id_to_word[i]
                 if word in pretrained:
                     new_weights[i] = pretrained[word]
                     c_found += 1
                 elif word.lower() in pretrained:
                     new_weights[i] = pretrained[word.lower()]
                     c_lower += 1
-                elif re.sub(r'\d', '0', word.lower()) in pretrained:
-                    new_weights[i] = pretrained[
-                        re.sub(r'\d', '0', word.lower())
-                    ]
+                elif is_digit.sub('0', word.lower()) in pretrained:
+                    new_weights[i] = pretrained[is_digit.sub('0', word.lower())]
                     c_zeros += 1
             word_layer.embeddings.set_value(new_weights)
             # print 'Loaded %i pretrained embeddings.' % len(pretrained)

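The refactored lookup compiles the digit pattern once and tries three keys per word: exact, lowercased, then lowercase with digits mapped to '0'. A standalone sketch of that fallback order:

```python
import re

is_digit = re.compile(r'\d')

def lookup_embedding(word, pretrained):
    # Try exact, lowercase, then digit-normalized lowercase, as in model.py.
    for key in (word, word.lower(), is_digit.sub('0', word.lower())):
        if key in pretrained:
            return pretrained[key]
    return None  # caller keeps the randomly initialized row

# 'Vol.42' misses exactly and lowercased, but matches once digits become '0'.
print(lookup_embedding('Vol.42', {'vol.00': [0.1, 0.2]}))
```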
requirements/dev.txt: 1 addition, 1 deletion

@@ -3,4 +3,4 @@
 ipython==5.7.0
 git+https://github.com/pytorch/text.git@master
 torch==0.4.1
-scikit-learn==0.19.2
+scikit-learn==0.20.1

requirements/prod.txt: 2 additions, 2 deletions

@@ -1,6 +1,6 @@
 gensim==3.5.0
-theano==1.0.2
-numpy==1.14.5
+theano==1.0.3
+numpy==1.15.4
 Flask==1.0.2
 flask_restful==0.3.6
 flask-restful-swagger-2==0.35

requirements/test.txt: 1 addition, 0 deletions

@@ -2,3 +2,4 @@
 pylint==1.9.2
 pytest==3.5.1
 pytest-flask==0.10.0
+scikit-learn==0.20.1
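
scikit-learn joins the test requirements because, per the commit message, model evaluation now uses it. The repository's `evaluate()` is not shown in this diff, so the following is only a sketch of the kind of tag-level scoring scikit-learn provides, with illustrative labels:

```python
from sklearn.metrics import classification_report, f1_score

# Illustrative gold and predicted tags for a flattened token sequence.
y_true = ['author', 'author', 'title', 'title', 'date']
y_pred = ['author', 'title', 'title', 'title', 'date']

print(f1_score(y_true, y_pred, average='weighted'))
print(classification_report(y_true, y_pred))
```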
train.py: 46 additions, 40 deletions

@@ -1,9 +1,10 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import os
 import numpy as np
 import optparse
-import itertools
+import logging
 from collections import OrderedDict
 from utils import create_input
 import loader
@@ -114,8 +115,8 @@

 # Check parameters validity
 assert os.path.isfile(opts.train)
-#assert os.path.isfile(opts.dev)
-#assert os.path.isfile(opts.test)
+assert os.path.isfile(opts.dev)
+assert os.path.isfile(opts.test)
 assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
 assert 0. <= parameters['dropout'] < 1.0
 #assert parameters['tag_scheme'] in ['iob', 'iobes']
@@ -131,9 +132,13 @@
 if not os.path.exists(models_path):
     os.makedirs(models_path)

+logging.basicConfig(format="[%(levelname)s] %(asctime)-15s: %(message)s",
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 # Initialize model
 model = Model(parameters=parameters, models_path=models_path)
-print "Model location: %s" % model.model_path
+logging.info("Model location: %s" % model.model_path)

 # Data parameters
 lower = parameters['lower']
@@ -142,8 +147,8 @@

 # Load sentences
 train_sentences = loader.load_sentences(opts.train, lower, zeros)
-#dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
-#test_sentences = loader.load_sentences(opts.test, lower, zeros)
+dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
+test_sentences = loader.load_sentences(opts.test, lower, zeros)

 # Use selected tagging scheme (IOB / IOBES)
 ##update_tag_scheme(train_sentences, tag_scheme)
@@ -171,64 +176,65 @@
 train_data = prepare_dataset(
     train_sentences, word_to_id, char_to_id, tag_to_id, lower
 )
-#dev_data = prepare_dataset(
-#    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
-#)
-#test_data = prepare_dataset(
-#    test_sentences, word_to_id, char_to_id, tag_to_id, lower
-#)
+dev_data = prepare_dataset(
+    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
+)
+
+test_data = prepare_dataset(
+    test_sentences, word_to_id, char_to_id, tag_to_id, lower
+)

-#print "%i / %i / %i sentences in train / dev / test." % (
-#    len(train_data), len(dev_data), len(test_data))
+logging.info("%i / %i / %i sentences in train / dev / test.",
+             len(train_data),
+             len(dev_data),
+             len(test_data))

 # Save the mappings to disk
-print 'Saving the mappings to disk...'
+logging.info('Saving the mappings to disk...')
 model.save_mappings(id_to_word, id_to_char, id_to_tag)

 # Build the model
 f_train, f_eval = model.build(**parameters)

 # Reload previous model values
 if opts.reload:
-    print 'Reloading previous model...'
+    print('Reloading previous model...')
     model.reload()

 #
 # Train network
 #
-singletons = set([word_to_id[k] for k, v
-                  in dico_words_train.items() if v == 1])
+singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1])
 n_epochs = 10 # number of epochs over the training set
 freq_eval = 1000 # evaluate on dev every freq_eval steps
 best_dev = -np.inf
 best_test = -np.inf
 count = 0
 for epoch in xrange(n_epochs):
     epoch_costs = []
-    print "Starting epoch %i..." % epoch
+    logging.info("Starting epoch %i..." % epoch)
     for i, index in enumerate(np.random.permutation(len(train_data))):
         count += 1
-        input = create_input(train_data[index], parameters, True, singletons)
-        new_cost = f_train(*input)
+        inputs = create_input(train_data[index], parameters, True, singletons)
+        new_cost = f_train(*inputs)
         epoch_costs.append(new_cost)
         if i % 50 == 0 and i > 0:
-            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
-#        if count % freq_eval == 0:
-#            dev_score = evaluate(parameters, f_eval, dev_sentences,
-#                                 dev_data, id_to_tag, dico_tags)
-#            test_score = evaluate(parameters, f_eval, test_sentences,
-#                                  test_data, id_to_tag, dico_tags)
-#            print "Score on dev: %.5f" % dev_score
-#            print "Score on test: %.5f" % test_score
-#            if dev_score > best_dev:
-#                best_dev = dev_score
-#                print "New best score on dev."
-#                print "Saving model to disk..."
-#                model.save()
-#            if test_score > best_test:
-#                best_test = test_score
-#                print "New best score on test."
-    print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
+            logging.info("%i, cost average: %f", i, np.mean(epoch_costs[-50:]))
+        if count % freq_eval == 0:
+            dev_score = evaluate(parameters, f_eval, dev_sentences,
+                                 dev_data, id_to_tag, dico_tags)
+            test_score = evaluate(parameters, f_eval, test_sentences,
+                                  test_data, id_to_tag, dico_tags)
+            logging.info("Score on dev: %.5f", dev_score)
+            logging.info("Score on test: %.5f", test_score)
+            if dev_score > best_dev:
+                best_dev = dev_score
+                logging.info("New best score on dev.")
+                logging.info("Saving model to disk...")
+                model.save()
+            if test_score > best_test:
+                best_test = test_score
+                logging.info("New best score on test.")
+    logging.info("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))

-#test_score = evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag, dico_tags)
+model.save()
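
`create_input(train_data[index], parameters, True, singletons)` consumes the singleton set built above: words seen exactly once in training may be swapped for `<UNK>` so the model learns a usable unknown-word embedding. A sketch of that replacement step; the helper name and the `<UNK>` id are illustrative, not the repository's exact implementation:

```python
import numpy as np

def replace_singletons(word_ids, singletons, unk_id=0, p=0.5):
    # With probability p, map ids of words seen only once to the <UNK> id.
    return [unk_id if w in singletons and np.random.random() < p else w
            for w in word_ids]

print(replace_singletons([3, 7, 9], singletons={7}))
```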