diff --git a/cli/bert/parse.py b/cli/bert/parse.py
index fafbc5e..a43c56f 100644
--- a/cli/bert/parse.py
+++ b/cli/bert/parse.py
@@ -2,12 +2,11 @@
 
 import click
 from tqdm import tqdm
-from transformers import AutoTokenizer, TFAutoModel
 
-from discopy.components.nn.bert import get_sentence_embeddings
 from discopy.parsers.pipeline import ParserPipeline
 from discopy.utils import init_logger
 from discopy_data.data.doc import Document
+from discopy_data.nn.bert import get_sentence_embedder
 
 
 @click.command()
@@ -19,10 +18,9 @@ def main(bert_model, model_path, src, tgt, limit):
 
     logger = init_logger()
     logger.info('Init Parser...')
+    get_sentence_embeddings = get_sentence_embedder(bert_model)
     parser = ParserPipeline.from_config(model_path)
     parser.load(model_path)
-    tokenizer = AutoTokenizer.from_pretrained(bert_model)
-    model = TFAutoModel.from_pretrained(bert_model)
     logger.info('Load pre-trained Parser...')
     for line_i, line in tqdm(enumerate(src)):
         if limit and line_i >= limit:
@@ -32,7 +30,7 @@ def main(bert_model, model_path, src, tgt, limit):
             continue
         for sent_i, sent in enumerate(doc.sentences):
             sent_words = sent.tokens
-            embeddings = get_sentence_embeddings(sent_words, tokenizer, model)
+            embeddings = get_sentence_embeddings(sent_words)
             doc.sentences[sent_i].embeddings = embeddings
         doc = parser(doc)
         tgt.write(json.dumps(doc.to_json()) + '\n')
diff --git a/discopy/components/nn/bert.py b/discopy/components/nn/bert.py
deleted file mode 100644
index ae07dd3..0000000
--- a/discopy/components/nn/bert.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from typing import List
-
-import numpy as np
-
-from discopy_data.data.token import Token
-
-simple_map = {
-    "''": '"',
-    "``": '"',
-    "-LRB-": "(",
-    "-RRB-": ")",
-    "-LCB-": "{",
-    "-RCB-": "}",
-    "n't": "not"
-}
-
-
-def get_sentence_embeddings(tokens: List[Token], tokenizer, model):
-    subtokens = [tokenizer.tokenize(simple_map.get(t.surface, t.surface)) for t in tokens]
-    lengths = [len(s) for s in subtokens]
-    tokens_ids = tokenizer.convert_tokens_to_ids([ts for t in subtokens for ts in t])
-    tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
-    outputs = model(np.array([tokens_ids]), output_hidden_states=True)
-    hidden_state = outputs.hidden_states[-2][0].numpy()
-    embeddings = np.zeros((len(lengths), hidden_state.shape[-1]), np.float32)
-    len_left = 1
-    for i, length in enumerate(lengths):
-        embeddings[i] = hidden_state[len_left]
-        len_left += length
-    return embeddings
-
-
-def get_sentence_vector_embeddings(tokens: List[Token], embedding_index, mean, std):
-    embedding_dim = len(next(iter(embedding_index.values())))
-    embeddings = np.random.normal(mean, std, (len(tokens), embedding_dim))
-    for i, tok in enumerate(tokens):
-        tok = simple_map.get(tok.surface, tok.surface)
-        if tok in embedding_index:
-            embeddings[i] = embedding_index[tok]
-    return embeddings
-
-# def get_doc_sentence_embeddings(sent_tokens: List[List[Token]], tokenizer, model):
-#     lengths = []
-#     inputs = []
-#     for tokens in sent_tokens:
-#         subtokens = [tokenizer.tokenize(simple_map.get(t.surface, t.surface)) for t in tokens]
-#         lengths.append([len(s) for s in subtokens])
-#         tokens_ids = tokenizer.convert_tokens_to_ids([ts for t in subtokens for ts in t])
-#         inputs.append(tokenizer.build_inputs_with_special_tokens(tokens_ids))
-#     outputs = model(np.array(inputs))
-#     last_hidden_states = outputs.last_hidden_state.numpy()
-#     embeddings = np.zeros((len(lengths), last_hidden_states[0].shape[-1]), np.float32)
-#     e_i = 0
-#     for o_i, last_hidden_state in enumerate(last_hidden_states):
-#         len_left = 1
-#         for i, length in enumerate(lengths[o_i]):
-#             embeddings[i] = np.concatenate([last_hidden_state[:1],
-#                                             last_hidden_state[len_left:len_left + length],
-#                                             last_hidden_state[-1:]]).mean(axis=0)
-#             if len_left + length >= len(last_hidden_state):
-#                 print("ALERT", last_hidden_state.shape, len_left, lengths)
-#             len_left += length
-#     return embeddings
diff --git a/requirements.txt b/requirements.txt
index b59fd3a..798a5b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,5 @@ sklearn
 sklearn-crfsuite
 tensorflow>=2.1.0
 transformers==4.2.1
-fastapi==0.61.2
-uvicorn==0.11.3
+fastapi==0.67.0
 git+git://github.com:rknaebel/discopy-data
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 949f521..e91f148 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 long_description = fh.read()
 
 setup(name='discopy-rknaebel',
-      version='1.0.0',
+      version='1.0.1',
       description='Shallow Discourse Parser',
       long_description=long_description,
       long_description_content_type="text/markdown",
@@ -21,8 +21,7 @@
           'sklearn-crfsuite',
           'tensorflow>=2.1.0',
           'transformers>=3.5.0',
-          'fastapi==0.61.2',
-          'uvicorn==0.11.3',
+          'fastapi==0.67.0',
           'discopy-data-rknaebel',
       ],
       zip_safe=False,
@@ -31,6 +30,9 @@
               'discopy-train=cli.train:main',
               'discopy-eval=cli.eval:main',
               'discopy-parse=cli.parse:main',
+              'discopy-nn-train=cli.bert.train:main',
+              'discopy-nn-parse=cli.bert.parse:main',
+              'discopy-nn-predict=cli.bert.predict:main',
           ],
      },
      python_requires='>=3.7',
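
The refactor above replaces the per-call tokenizer/model pair with a single embedder factory from discopy-data. A minimal sketch of the new call pattern, assuming get_sentence_embedder(bert_model) returns a closure with the same contract as the removed get_sentence_embeddings (a list of tokens in, one embedding row per token out); the model name 'bert-base-cased' and the doc variable are illustrative only:

    from discopy_data.nn.bert import get_sentence_embedder

    # Build the embedder once; tokenizer and TF model are captured inside the closure,
    # so callers no longer pass them on every call.
    get_sentence_embeddings = get_sentence_embedder('bert-base-cased')

    # Attach one embedding matrix per sentence, as cli/bert/parse.py now does.
    for sent_i, sent in enumerate(doc.sentences):
        doc.sentences[sent_i].embeddings = get_sentence_embeddings(sent.tokens)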