Training Fix (#11)
* #9 Deprecate load_word2vec_format (see the conversion sketch below)

* Fixed Travis testing issue

* Python 2/3 compatibility; weak support for Python 3

* Use scikit-learn for model evaluation

* Restore the code for validation and test sets
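
The first bullet refers to gensim's move away from `load_word2vec_format`: loader.py (below) now calls `KeyedVectors.load(path, mmap='r')`, which expects gensim's native format rather than a word2vec binary. A minimal conversion sketch, assuming gensim 3.x; both file names are hypothetical placeholders:

```python
from gensim.models import KeyedVectors

# One-time conversion: read a word2vec-format binary, save in gensim's
# native format so later loads can memory-map it read-only.
kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
kv.save('vectors.kv')

# This mirrors what loader.py now does at load time:
pretrained = KeyedVectors.load('vectors.kv', mmap='r')
```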
kylase authored Dec 20, 2018
1 parent f2de7dd commit 188721d
Showing 9 changed files with 110 additions and 125 deletions.
.travis.yml: 4 additions, 0 deletions

@@ -2,6 +2,10 @@ language: python
 cache: pip
 python:
 - "2.7"
+# workaround to make boto work on travis
+# from https://github.com/travis-ci/travis-ci/issues/7940
+before_install:
+- sudo rm -f /etc/boto.cfg
 install:
 - pip install -r requirements/test.txt
 script:

README.md: 6 additions, 4 deletions

@@ -9,7 +9,7 @@ Neural ParsCit is a citation string parser which parses reference strings into i...

 ## Initial setup

-To use the tagger, you need Python 2.7, with Numpy, Theano and Gensim installed.
+To use the tagger, you need Python 2.7 (works in Python 3 but not fully supported), with Numpy, Theano and Gensim installed. scikit-learn is needed for model evaluation if you are training a new model.

 You can use environmental variables to set the following:
 - `MODEL_PATH`: Path to the model's parameters
@@ -61,6 +61,8 @@ The state-of-the-art trained model is provided in the models folder and is named...

 ### Using a Web Server

+Note: This service is not Python 3 compatible due to `unicode`.
+
 The web server (a Flask app) provides REST API.

 Running the web server,
@@ -94,10 +96,10 @@ Details about the training data, experiments can be found in the following article:
   title={Neural ParsCit: A Deep Learning Based Reference String Parser},
   author={Prasad, Animesh and Kaur, Manpreet and Kan, Min-Yen},
   journal={International Journal on Digital Libraries},
-  volume={},
-  pages={},
+  volume={19},
+  pages={323-337},
   year={2018},
   publisher={Springer},
-  url={}
+  url={https://link.springer.com/article/10.1007/s00799-018-0242-1}
 }
 ```

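The Python 3 note above stems from the `unicode` builtin, which Python 3 removed. A common compatibility shim, shown only as a sketch (this commit does not patch the web app):

```python
import sys

# Python 3 has no `unicode` builtin; alias it to `str` so Python 2 era
# code that calls unicode(...) on text keeps working.
if sys.version_info[0] >= 3:
    unicode = str

print(unicode('citation string'))
```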
loader.py: 14 additions, 13 deletions

@@ -1,9 +1,10 @@
+from __future__ import print_function
 import os
 import re
 import codecs
 from utils import create_dico, create_mapping, zero_digits
 from utils import iob2, iob_iobes
-import gensim, re
+import gensim

 def load_sentences(path, lower, zeros):
     """
@@ -61,9 +62,9 @@ def word_mapping(sentences, lower):
     dico = create_dico(words)
     dico['<UNK>'] = 10000000
     word_to_id, id_to_word = create_mapping(dico)
-    print "Found %i unique words (%i in total)" % (
+    print("Found %i unique words (%i in total)" % (
         len(dico), sum(len(x) for x in words)
-    )
+    ))
     return dico, word_to_id, id_to_word


@@ -74,7 +75,7 @@ def char_mapping(sentences):
     chars = ["".join([w[0] for w in s]) for s in sentences]
     dico = create_dico(chars)
     char_to_id, id_to_char = create_mapping(dico)
-    print "Found %i unique characters" % len(dico)
+    print("Found %i unique characters" % len(dico))
     return dico, char_to_id, id_to_char


@@ -85,7 +86,7 @@ def tag_mapping(sentences):
     tags = [[word[-1] for word in s] for s in sentences]
     dico = create_dico(tags)
     tag_to_id, id_to_tag = create_mapping(dico)
-    print "Found %i unique named entity tags" % len(dico)
+    print("Found %i unique named entity tags" % len(dico))
     return dico, tag_to_id, id_to_tag


@@ -124,7 +125,7 @@ def f(x): return x.lower() if lower else x
         'caps': caps
     }

-def prepare_dataset(sentences, word_to_id, char_to_id, lower=False, zeros=False):
+def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False, zeros=False):
     """
     Prepare the dataset. Return a list of lists of dictionaries containing:
         - word indexes
@@ -147,11 +148,13 @@ def f(x):
         chars = [[char_to_id[c] for c in w if c in char_to_id]
                  for w in str_words]
         caps = [cap_feature(w) for w in str_words]
+        tags = [tag_to_id[w[-1]] for w in s]
         data.append({
             'str_words': str_words,
             'words': words,
             'chars': chars,
             'caps': caps,
+            'tags': tags,
         })
     return data

@@ -162,17 +165,19 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
     to the dictionary, otherwise, we only add the words that are given by
     `words` (typically the words in the development and test sets.)
     """
-    print 'Loading pretrained embeddings from %s...' % ext_emb_path
+    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

+    is_digit = re.compile(r'\d')
+
     # Load pretrained embeddings from file
     #pretrained = set([
     #    line.rstrip().split()[0].strip()
     #    for line in codecs.open(ext_emb_path, 'r', 'cp850')
     #    if len(ext_emb_path) > 0
     #])

-    pretrained = gensim.models.KeyedVectors.load_word2vec_format(ext_emb_path, binary=True)
+    pretrained = gensim.models.KeyedVectors.load(ext_emb_path, mmap='r')

     # We either add every word in the pretrained file,
     # or only words given in the `words` list to which
@@ -183,11 +188,7 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
             dictionary[word] = 0
     else:
         for word in words:
-            if any(x in pretrained for x in [
-                word,
-                word.lower(),
-                re.sub('\d', '0', word.lower())
-            ]) and word not in dictionary:
+            if any(x in pretrained for x in [word, word.lower(), is_digit.sub('0', word.lower())]) and word not in dictionary:
                 dictionary[word] = 0

     word_to_id, id_to_word = create_mapping(dictionary)

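With the change above, `prepare_dataset` takes the tag mapping so every sentence also carries gold tag indexes. A usage sketch built from the functions in this file; the data path is a hypothetical placeholder:

```python
import loader
from loader import word_mapping, char_mapping, tag_mapping, prepare_dataset

# Load and map the training data; 'data/train.txt' is a placeholder path.
train_sentences = loader.load_sentences('data/train.txt', lower=True, zeros=False)
dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower=True)
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# tag_to_id is now a required positional argument.
train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower=True)
```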
model.py: 9 additions, 7 deletions

@@ -1,6 +1,9 @@
+from __future__ import print_function
 import logging
-import cPickle
+try:
+    import cPickle
+except ImportError:
+    import pickle as cPickle
 import os
 import re
 import numpy as np
@@ -176,7 +179,6 @@ def build(self,
             word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
             new_weights = word_layer.embeddings.get_value()
             logging.info("Loading pretrained embeddings from %s...", pre_emb)
-            emb_invalid = 0

             # for i, line in enumerate(codecs.open(pre_emb, 'r', 'cp850')):
             #     line = line.rstrip().split()
@@ -192,19 +194,19 @@
             c_found = 0
             c_lower = 0
             c_zeros = 0
+
+            is_digit = re.compile(r'\d')
             # Lookup table initialization
-            for i in range(n_words):
+            for i in xrange(n_words):
                 word = self.id_to_word[i]
                 if word in pretrained:
                     new_weights[i] = pretrained[word]
                     c_found += 1
                 elif word.lower() in pretrained:
                     new_weights[i] = pretrained[word.lower()]
                     c_lower += 1
-                elif re.sub(r'\d', '0', word.lower()) in pretrained:
-                    new_weights[i] = pretrained[
-                        re.sub(r'\d', '0', word.lower())
-                    ]
+                elif is_digit.sub('0', word.lower()) in pretrained:
+                    new_weights[i] = pretrained[is_digit.sub('0', word.lower())]
                     c_zeros += 1
             word_layer.embeddings.set_value(new_weights)
             # print 'Loaded %i pretrained embeddings.' % len(pretrained)

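The refactored lookup compiles the digit pattern once and tries three keys per word: exact, lowercased, then lowercase with digits mapped to '0'. A standalone sketch of that fallback order:

```python
import re

is_digit = re.compile(r'\d')

def lookup_embedding(word, pretrained):
    # Try exact, lowercase, then digit-normalized lowercase, as in model.py.
    for key in (word, word.lower(), is_digit.sub('0', word.lower())):
        if key in pretrained:
            return pretrained[key]
    return None  # caller keeps the randomly initialized row

# 'Vol.42' misses exactly and lowercased, but matches once digits become '0'.
print(lookup_embedding('Vol.42', {'vol.00': [0.1, 0.2]}))
```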
requirements/dev.txt: 1 addition, 1 deletion

@@ -3,4 +3,4 @@
 ipython==5.7.0
 git+https://github.com/pytorch/text.git@master
 torch==0.4.1
-scikit-learn==0.19.2
+scikit-learn==0.20.1

requirements/prod.txt: 2 additions, 2 deletions

@@ -1,6 +1,6 @@
 gensim==3.5.0
-theano==1.0.2
-numpy==1.14.5
+theano==1.0.3
+numpy==1.15.4
 Flask==1.0.2
 flask_restful==0.3.6
 flask-restful-swagger-2==0.35

requirements/test.txt: 1 addition, 0 deletions

@@ -2,3 +2,4 @@
 pylint==1.9.2
 pytest==3.5.1
 pytest-flask==0.10.0
+scikit-learn==0.20.1
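
scikit-learn joins the test requirements because, per the commit message, model evaluation now uses it. The repository's `evaluate()` is not shown in this diff, so the following is only a sketch of the kind of tag-level scoring scikit-learn provides, with illustrative labels:

```python
from sklearn.metrics import classification_report, f1_score

# Illustrative gold and predicted tags for a flattened token sequence.
y_true = ['author', 'author', 'title', 'title', 'date']
y_pred = ['author', 'title', 'title', 'title', 'date']

print(f1_score(y_true, y_pred, average='weighted'))
print(classification_report(y_true, y_pred))
```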
train.py: 46 additions, 40 deletions

@@ -1,9 +1,10 @@
 #!/usr/bin/env python

+from __future__ import print_function
 import os
 import numpy as np
 import optparse
-import itertools
+import logging
 from collections import OrderedDict
 from utils import create_input
 import loader
@@ -114,8 +115,8 @@

 # Check parameters validity
 assert os.path.isfile(opts.train)
-#assert os.path.isfile(opts.dev)
-#assert os.path.isfile(opts.test)
+assert os.path.isfile(opts.dev)
+assert os.path.isfile(opts.test)
 assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
 assert 0. <= parameters['dropout'] < 1.0
 #assert parameters['tag_scheme'] in ['iob', 'iobes']
@@ -131,9 +132,13 @@
 if not os.path.exists(models_path):
     os.makedirs(models_path)

+logging.basicConfig(format="[%(levelname)s] %(asctime)-15s: %(message)s",
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 # Initialize model
 model = Model(parameters=parameters, models_path=models_path)
-print "Model location: %s" % model.model_path
+logging.info("Model location: %s" % model.model_path)

 # Data parameters
 lower = parameters['lower']
@@ -142,8 +147,8 @@

 # Load sentences
 train_sentences = loader.load_sentences(opts.train, lower, zeros)
-#dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
-#test_sentences = loader.load_sentences(opts.test, lower, zeros)
+dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
+test_sentences = loader.load_sentences(opts.test, lower, zeros)

 # Use selected tagging scheme (IOB / IOBES)
 ##update_tag_scheme(train_sentences, tag_scheme)
@@ -171,64 +176,65 @@
 train_data = prepare_dataset(
     train_sentences, word_to_id, char_to_id, tag_to_id, lower
 )
-#dev_data = prepare_dataset(
-#    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
-#)
-#test_data = prepare_dataset(
-#    test_sentences, word_to_id, char_to_id, tag_to_id, lower
-#)
+dev_data = prepare_dataset(
+    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
+)
+
+test_data = prepare_dataset(
+    test_sentences, word_to_id, char_to_id, tag_to_id, lower
+)

-#print "%i / %i / %i sentences in train / dev / test." % (
-#    len(train_data), len(dev_data), len(test_data))
+logging.info("%i / %i / %i sentences in train / dev / test.",
+             len(train_data),
+             len(dev_data),
+             len(test_data))

 # Save the mappings to disk
-print 'Saving the mappings to disk...'
+logging.info('Saving the mappings to disk...')
 model.save_mappings(id_to_word, id_to_char, id_to_tag)

 # Build the model
 f_train, f_eval = model.build(**parameters)

 # Reload previous model values
 if opts.reload:
-    print 'Reloading previous model...'
+    print('Reloading previous model...')
     model.reload()

 #
 # Train network
 #
-singletons = set([word_to_id[k] for k, v
-                  in dico_words_train.items() if v == 1])
+singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1])
 n_epochs = 10 # number of epochs over the training set
 freq_eval = 1000 # evaluate on dev every freq_eval steps
 best_dev = -np.inf
 best_test = -np.inf
 count = 0
 for epoch in xrange(n_epochs):
     epoch_costs = []
-    print "Starting epoch %i..." % epoch
+    logging.info("Starting epoch %i..." % epoch)
     for i, index in enumerate(np.random.permutation(len(train_data))):
         count += 1
-        input = create_input(train_data[index], parameters, True, singletons)
-        new_cost = f_train(*input)
+        inputs = create_input(train_data[index], parameters, True, singletons)
+        new_cost = f_train(*inputs)
         epoch_costs.append(new_cost)
         if i % 50 == 0 and i > 0:
-            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
-#        if count % freq_eval == 0:
-#            dev_score = evaluate(parameters, f_eval, dev_sentences,
-#                                 dev_data, id_to_tag, dico_tags)
-#            test_score = evaluate(parameters, f_eval, test_sentences,
-#                                  test_data, id_to_tag, dico_tags)
-#            print "Score on dev: %.5f" % dev_score
-#            print "Score on test: %.5f" % test_score
-#            if dev_score > best_dev:
-#                best_dev = dev_score
-#                print "New best score on dev."
-#                print "Saving model to disk..."
-#                model.save()
-#            if test_score > best_test:
-#                best_test = test_score
-#                print "New best score on test."
-    print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
+            logging.info("%i, cost average: %f", i, np.mean(epoch_costs[-50:]))
+        if count % freq_eval == 0:
+            dev_score = evaluate(parameters, f_eval, dev_sentences,
+                                 dev_data, id_to_tag, dico_tags)
+            test_score = evaluate(parameters, f_eval, test_sentences,
+                                  test_data, id_to_tag, dico_tags)
+            logging.info("Score on dev: %.5f", dev_score)
+            logging.info("Score on test: %.5f", test_score)
+            if dev_score > best_dev:
+                best_dev = dev_score
+                logging.info("New best score on dev.")
+                logging.info("Saving model to disk...")
+                model.save()
+            if test_score > best_test:
+                best_test = test_score
+                logging.info("New best score on test.")
+    logging.info("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))

-#test_score = evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag, dico_tags)
+model.save()
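
`create_input(train_data[index], parameters, True, singletons)` consumes the singleton set built above: words seen exactly once in training may be swapped for `<UNK>` so the model learns a usable unknown-word embedding. A sketch of that replacement step; the helper name and the `<UNK>` id are illustrative, not the repository's exact implementation:

```python
import numpy as np

def replace_singletons(word_ids, singletons, unk_id=0, p=0.5):
    # With probability p, map ids of words seen only once to the <UNK> id.
    return [unk_id if w in singletons and np.random.random() < p else w
            for w in word_ids]

print(replace_singletons([3, 7, 9], singletons={7}))
```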