ngram_models.py
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains the logic for the form-only part of BERTRAM."""
import itertools
import random
from typing import List
from collections import Counter
import torch
import numpy as np
from torch import nn
import log
from utils import length_to_mask
logger = log.get_logger("root")
START_SYMBOL = '<S>'
END_SYMBOL = '</S>'
UNK_TOKEN = 'UNK'
PAD_TOKEN = 'PAD'
UNK_ID = 0
PAD_ID = 1

class NGramFeatures:
    def __init__(self, ngrams: List[str], ngram_ids: List[int]):
        self.ngrams = ngrams
        self.ngram_ids = ngram_ids
        self.ngram_lengths = len(ngram_ids)

    def __repr__(self):
        return '{}, {}, {}'.format(self.ngrams, self.ngram_ids, self.ngram_lengths)

class BatchedNGramFeatures:
    """N-gram features for a batch of words: ngram_ids has shape [batch_size x max_num_ngrams],
    ngram_lengths has shape [batch_size]."""

    def __init__(self, ngram_ids: torch.Tensor, ngram_lengths: torch.Tensor):
        self.ngram_ids = ngram_ids
        self.ngram_lengths = ngram_lengths

    def __repr__(self):
        return '{}, {}'.format(self.ngram_ids, self.ngram_lengths)

class NGramBuilder:
    """Builds a character n-gram vocabulary from a word vocabulary file and maps words to n-gram features."""

    def __init__(self, vocab_file: str, ngram_threshold: int = 4, nmin: int = 3, nmax: int = 5, seed: int = None):
        self.nmin = nmin
        self.nmax = nmax
        self.ngram2id = {UNK_TOKEN: UNK_ID, PAD_TOKEN: PAD_ID}
        self.id2ngram = [UNK_TOKEN, PAD_TOKEN]
        ngram_counts = Counter()

        if seed is not None:
            random.seed(seed)

        # The first whitespace-separated token of each line in the vocabulary file is treated as a word.
        with open(vocab_file, 'r', encoding='utf8') as file:
            for line in file:
                word = line.split()[0]
                ngram_counts.update(self.to_n_gram(word, self.nmin, self.nmax))

        most_common = ngram_counts.most_common()
        if self.nmin == self.nmax == 1:
            # For single-character n-grams, sort by descending count and then alphabetically
            # so that the id assignment is deterministic.
            most_common = list(most_common)
            most_common.sort(key=lambda x: (-x[1], x[0]))

        # Assign ids only to n-grams that occur at least ngram_threshold times.
        for (ngram, count) in most_common:
            if count >= ngram_threshold:
                if ngram in self.ngram2id:
                    continue
                id_ = len(self.id2ngram)
                self.ngram2id[ngram] = id_
                self.id2ngram.append(ngram)

        logger.info('Found {} ngrams with min count {} and (nmin,nmax)=({},{}), first 10: {}, last 10: {}'.format(
            len(self.id2ngram), ngram_threshold, nmin, nmax, self.id2ngram[:10], self.id2ngram[-10:]
        ))

    def get_ngram_features(self, word: str, dropout_probability: float = 0) -> NGramFeatures:
        """Map a word to its character n-grams and their vocabulary ids; n-grams not in the vocabulary map to UNK_ID."""
        ngrams = self.to_n_gram(word, self.nmin, self.nmax, dropout_probability)
        ngram_ids = [self.ngram2id[ngram] if ngram in self.ngram2id else UNK_ID for ngram in ngrams]
        return NGramFeatures(ngrams, ngram_ids)
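
    # Note: with dropout_probability=0.2, roughly 20% of a word's n-grams are randomly
    # discarded before the id lookup; if every n-gram is dropped, the word falls back
    # to the single UNK_TOKEN n-gram (see to_n_gram below).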

    @staticmethod
    def batchify(features: List[NGramFeatures]) -> BatchedNGramFeatures:
        ngram_ids = torch.tensor(np.array(
            list(itertools.zip_longest(*[x.ngram_ids for x in features], fillvalue=PAD_ID)),
            dtype=np.int32).T, dtype=torch.long)
        ngram_lengths = torch.tensor(np.array([x.ngram_lengths for x in features], dtype=np.int32), dtype=torch.long)
        return BatchedNGramFeatures(ngram_ids, ngram_lengths)
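
    # Example: batching two words with 6 and 4 n-grams respectively yields ngram_ids of
    # shape [2 x 6] (the shorter row right-padded with PAD_ID) and ngram_lengths = tensor([6, 4]).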

    @staticmethod
    def to_n_gram(word: str, nmin: int, nmax: int, dropout_probability: float = 0) -> List[str]:
        """
        Turns a word into a list of n-grams.
        :param word: the word
        :param nmin: the minimum number of characters per n-gram
        :param nmax: the maximum number of characters per n-gram
        :param dropout_probability: the probability of randomly removing an n-gram
        :return: the list of n-grams
        """
        ngrams = []
        if nmin == nmax:
            letters = [START_SYMBOL] + list(word) + [END_SYMBOL] + ([PAD_TOKEN] * max(10, (50 - len(list(word)))))
        else:
            letters = [START_SYMBOL] + list(word) + [START_SYMBOL]
        for i in range(len(letters)):
            for j in range(i + nmin, min(len(letters) + 1, i + nmax + 1)):
                ngram = ''.join(letters[i:j])
                ngrams.append(ngram)
        if dropout_probability > 0:
            ngrams = [ngram for ngram in ngrams if random.random() < (1 - dropout_probability)]
        if not ngrams:
            ngrams = [UNK_TOKEN]
        return ngrams
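
    # Example: to_n_gram('cat', nmin=3, nmax=5) builds the letter list ['<S>', 'c', 'a', 't', '<S>']
    # and returns ['<S>ca', '<S>cat', '<S>cat<S>', 'cat', 'cat<S>', 'at<S>'], i.e. every
    # n-gram spanning between nmin and nmax entries of that list.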

    def get_number_of_ngrams(self) -> int:
        return len(self.id2ngram)

class BagOfNgrams(nn.Module):
    """Embeds a word as the average of its character n-gram embeddings."""

    def __init__(self, vocab_size, embedding_size):
        super(BagOfNgrams, self).__init__()
        self.ngram_embeddings = nn.Embedding(vocab_size, embedding_size)

    def forward(self, ngram_ids, ngram_lengths):
        """
        :param ngram_ids: shape is [batch_size x max_seq_length]
        :param ngram_lengths: shape is [batch_size]
        """
        # shape is [batch_size x max_seq_length x embedding_size]
        ngrams_embedded = self.ngram_embeddings(ngram_ids)
        # shape is [batch_size x max_seq_length]
        mask = length_to_mask(ngram_lengths, max_len=ngram_ids.shape[1], dtype=torch.float)
        # zero out the embeddings of padding positions before averaging
        ngrams_embedded = ngrams_embedded * mask.unsqueeze(-1)
        bag_of_ngrams = torch.sum(ngrams_embedded, dim=1) / ngram_lengths.float().unsqueeze(-1)
        return bag_of_ngrams
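

if __name__ == '__main__':
    # Minimal usage sketch of the classes above. 'vocab.txt' is a hypothetical vocabulary
    # file (one word as the first whitespace-separated token per line); the example words
    # and the embedding size are likewise illustrative.
    builder = NGramBuilder('vocab.txt', ngram_threshold=4, nmin=3, nmax=5, seed=42)
    features = [builder.get_ngram_features(word) for word in ['hello', 'world']]
    batch = NGramBuilder.batchify(features)

    model = BagOfNgrams(builder.get_number_of_ngrams(), embedding_size=100)
    # One averaged n-gram embedding per word, i.e. a tensor of shape [2 x 100].
    word_embeddings = model(batch.ngram_ids, batch.ngram_lengths)
    logger.info('Computed form-based embeddings of shape {}'.format(word_embeddings.shape))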