Merge pull request #13 from amir-zeldes/dev
V2.2.0
amir-zeldes authored Feb 1, 2024
2 parents 8f02cb0 + d0d7a52 commit d17d79f
Showing 8 changed files with 135 additions and 36 deletions.
15 changes: 12 additions & 3 deletions README.md
@@ -53,7 +53,16 @@ Arabic data is derived from the Prague Arabic Dependency Treebank (UD_Arabic-PAD

## Performance

Current scores on the SPMRL Hebrew dataset (UD_Hebrew, V1 splits), using BERT-based predictions and lexicon data as features:
Realistic scores on the SPMRL Hebrew dataset (UD_Hebrew, V1 splits), using BERT-based predictions and lexicon data as features, trained jointly on SPMRL and other UD Hebrew IAHLT data:

```
Perfect word forms: 0.9933281004709577
Precision: 0.9923298178331735
Recall: 0.9871244635193133
F-Score: 0.9897202964379631
```
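
As a quick sanity check on the metrics in this section (illustrative, not part of the commit): the reported F-Score is the harmonic mean of the Precision and Recall above.

```python
# Illustrative check only: F-Score = harmonic mean of Precision and Recall.
# The two values below are copied from the score block above.
p = 0.9923298178331735
r = 0.9871244635193133
f = 2 * p * r / (p + r)
print(f)  # ~0.98972..., matching the reported F-Score
```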

Clean experimental scores on the SPMRL Hebrew dataset (UD_Hebrew, V1 splits), using BERT-based predictions and lexicon data as features and training only on SPMRL:

```
Perfect word forms: 0.9918367346938776
@@ -62,7 +71,7 @@ Recall: 0.9864091559370529
F-Score: 0.9874686716791979
```

Or without BERT:
Or the latter without BERT:

```
Perfect word forms: 0.9821036106750393
@@ -71,7 +80,7 @@ Recall: 0.967103694874851
F-Score: 0.9716201652496708
```

Scores on Hebrew Wiki5K (out-of-domain, with BERT):
Scores on Hebrew Wiki5K (out-of-domain, with BERT, train on SPMRL):

```
Perfect word forms: 0.9907224634820371
10 changes: 5 additions & 5 deletions requirements.txt
@@ -1,7 +1,7 @@
scikit-learn
joblib
scikit-learn==1.3.2
joblib==1.3.2
numpy
pandas
xgboost==0.81
pandas==2.1.2
xgboost==2.0.3
hyperopt
flair==0.6.1
flair==0.13.0
65 changes: 65 additions & 0 deletions rftokenizer/conllu2segs.py
@@ -0,0 +1,65 @@
"""
Simple utility to convert conllu files to tab-separated files with segments
"""

import io, os, sys, re
from glob import glob

def get_segs(conllu):
    super_length = 0
    limit = 10  # Maximum bound group length in units, discard sentences with longer groups
    sents = []
    words = []
    labels = []
    word = []
    max_len = 0
    lines = conllu.split("\n")
    for l, line in enumerate(lines):
        if "\t" in line:
            fields = line.split("\t")
            if "-" in fields[0]:
                start, end = fields[0].split("-")
                super_length = int(end) - int(start) + 1
            else:
                if super_length > 0:
                    word.append(fields[1])
                    super_length -= 1
                    if super_length == 0:
                        words.append("".join(word))
                        labels.append("|".join(word))
                        if len(word) > max_len:
                            max_len = len(word)
                        word = []
                else:
                    if "SpaceAfter=No" in line and ("ADP\t" in line or "DET\t" in line):
                        done = False
                        word.append(fields[1])
                        counter = 1
                        while not done:
                            if "SpaceAfter" in lines[l+counter] and not ("\t,\t" in lines[l+counter+1] or "\t.\t" in lines[l+counter+1]):
                                super_length += 1
                                counter += 1
                            else:
                                super_length += 1
                                done = True
                        if super_length > 10:
                            print(l)
                            quit()
                    else:
                        words.append(fields[1])
                        labels.append(fields[1])
        elif len(line) == 0 and len(words) > 0:
            if max_len > limit or " " in "".join(words):  # Reject sentence
                max_len = 0
            else:
                sents.append("\n".join([w + "\t" + l for w, l in zip(words, labels)]))
            words = []
            labels = []
    return "\n".join(sents)

files = glob("*.conllu")

for file_ in files:
    seg_data = get_segs(io.open(file_).read())
    with io.open(os.path.basename(file_) + ".tab",'w',encoding="utf8",newline="\n") as f:
        f.write(seg_data)
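
A brief usage note (not part of the diff, inferred from the script above): running `python conllu2segs.py` in a directory containing `.conllu` files writes a `<name>.conllu.tab` file for each input, with one whitespace token per line paired with its pipe-separated segmentation. The Hebrew rows below are illustrative:

```
לבית	ל|בית
הספר	ה|ספר
```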
10 changes: 7 additions & 3 deletions rftokenizer/data/heb.conf
@@ -24,7 +24,11 @@ allowed=
# Tokenization patterns based on regular expressions (machine learning tokenizer is not consulted for these)
regex_tok=
^([0-9\.,A-Za-z]+)$ \1
^(ב|ו|ל|מ|כ|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3
^(ו)(ב|ל|מ|כ|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ש|ב|ו|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3
^(ש|ב|ו|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)([$€₪])$ \1|\2|\3|\4
^(ו)(ש|ב|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ו)(ש|ב|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)([$€₪])$ \1|\2|\3|\4|\5
^(ש|ב|ל|מ)(כ)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ש|ב|ל|מ)(כ)([-־])([0-9\./,A-Za-z']+)([$€₪])$ \1|\2|\3|\4|\5
^(ל|מ)(כ|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ב|ל|מ|כ|ה)([0-9\./,A-Za-z']+)$ \1|\2
^(ב|ל|מ|כש?|ה|ש)([0-9\./,A-Za-z']+)$ \1|\2
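
For readers unfamiliar with the `regex_tok` format: each line pairs a pattern with a replacement in which `|` marks segment boundaries, and tokenize_rf.py applies matching rules with a plain regex substitution (`f.sub(r, word)` for each compiled pair). The sketch below is illustrative only; the rule string is copied from the config above and `apply_rule` is a hypothetical helper, not part of the repo:

```python
import re

# Illustrative sketch of how a regex_tok rule is applied.
pattern, replacement = r"^(ש|ב|ו|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)$", r"\1|\2|\3"
rule = (re.compile(pattern), replacement)

def apply_rule(word, rule):
    f, r = rule
    # Only rewrite words the pattern matches; "|" in the replacement marks segment boundaries
    return f.sub(r, word) if f.match(word) is not None else word

print(apply_rule("ב-2024", rule))  # -> ב|-|2024 (prefix, hyphen and number as separate segments)
```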
41 changes: 30 additions & 11 deletions rftokenizer/flair_pos_tagger.py
@@ -6,6 +6,7 @@


from argparse import ArgumentParser
import flair
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import OneHotEmbeddings, TransformerWordEmbeddings, StackedEmbeddings
@@ -22,9 +23,12 @@

seed(42)

flair_version = int(flair.__version__.split(".")[1])

script_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep
model_dir = script_dir + ".." + os.sep + "models" + os.sep
model_dir = script_dir + ".." + os.sep + "models_311" + os.sep
CONLLU_ROOT = "conllu" + os.sep # Path to UD .conllu corpus repo directory
CONLLU_ROOT = "C:\\Uni\\Corpora\\Hebrew\\UD_Hebrew-joint" + os.sep # Path to UD .conllu corpus repo directory
TARGET_FEATS = {} # If using this tagger for specific features, specify them here
lang_prefix = "heb" # Prefix for the language name in the model, e.g. heb for Hebrew

@@ -39,7 +43,7 @@ def __init__(self, train=False, morph=False, seg=False):
        if not os.path.exists(model_dir + lang_prefix + ".seg"):
            sys.stderr.write("! Model file " + model_dir + lang_prefix + ".seg not found\n")
            sys.stderr.write("! Attempting to download it... (this could take a while)\n")
            url = "https://corpling.uis.georgetown.edu/amir/download/heb_models_v2/" + lang_prefix + ".seg"
            url = "https://gucorpling.org/amir/download/heb_models_v4/" + lang_prefix + ".seg"
            urlretrieve(url, model_dir + lang_prefix + ".seg")
            sys.stderr.write("! Done!\n")
        self.model = SequenceTagger.load(model_dir + lang_prefix + ".seg")
@@ -94,7 +98,7 @@ def segs2tag(segs):
        tag = "WBY"
    elif "XS" in tag:
        tag = "X"
    return tag
    return tag.replace("SS","S")

def conllu2segs(conllu, target="affixes"):
    super_length = 0
@@ -264,7 +268,10 @@ def train(self, cuda_safe=True, positional=True, tags=False, seg=False):
        tag_type = "seg"

        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        if flair_version > 8:
            tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
        else:
            tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # 4. initialize embeddings
@@ -291,8 +298,8 @@ def train(self, cuda_safe=True, positional=True, tags=False, seg=False):
            embeddings=stacked,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
            use_rnn=True)
            use_crf=False,
            use_rnn=False)

        # 6. initialize trainer
        from flair.trainers import ModelTrainer
@@ -302,7 +309,7 @@ def train(self, cuda_safe=True, positional=True, tags=False, seg=False):
        # 7. start training
        trainer.train(script_dir + "pos-dependencies" + os.sep + 'flair_tagger',
                      learning_rate=0.1,
                      mini_batch_size=15,
                      mini_batch_size=24,
                      max_epochs=150)

    def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=False, tags=False, seg=False):
@@ -325,7 +332,11 @@ def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=
        for line in data.split("\n"):
            if len(line.strip())==0:
                if len(words) > 0:
                    sents.append(Sentence(" ".join(words),use_tokenizer=lambda x:x.split(" ")))
                    if flair_version > 8:
                        tokenizer = False
                    else:
                        tokenizer = lambda x:x.split(" ")
                    sents.append(Sentence(" ".join(words),use_tokenizer=tokenizer))
                    for i, word in enumerate(sents[-1]):
                        if not seg:
                            word.add_label("super",positions[i])
@@ -371,7 +382,10 @@ def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=
                    true_pos.append(line.split("\t")[4])

        # predict tags and print
        model.predict(sents)#, all_tag_prob=True)
        if flair_version > 8:
            model.predict(sents, force_token_predictions=True, return_probabilities_for_all_classes=True)
        else:
            model.predict(sents)  # , all_tag_prob=True)

preds = []
scores = []
@@ -382,8 +396,13 @@ def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=
                pred = tok.labels[2].value
                score = str(tok.labels[2].score)
            elif seg:
                pred = tok.labels[0].value
                score = str(tok.labels[0].score)
                if flair_version > 8:
                    pred = tok.labels[0].value if len(tok.labels)>0 else "O"
                    score = tok.labels[0].score if len(tok.labels) > 0 else "1.0"
                else:
                    label = tok.labels[0]
                    pred = label.value
                    score = str(label.score)
            else:
                pred = tok.labels[1].value
                score = str(tok.labels[1].score)
Binary file modified rftokenizer/models/heb.sm3
24 changes: 13 additions & 11 deletions rftokenizer/tokenize_rf.py
@@ -7,9 +7,9 @@
for Morphologically Rich Languages (MRLs)
"""

__version__ = "2.0.1"
__version__ = "2.2.0"
__author__ = "Amir Zeldes"
__copyright__ = "Copyright 2018-2019, Amir Zeldes"
__copyright__ = "Copyright 2018-2024, Amir Zeldes"
__license__ = "Apache 2.0"


@@ -762,8 +762,7 @@ def train(self, train_file, lexicon_file=None, freq_file=None, test_prop=0.1, ou
                if found:
                    sys.stderr.write("\t"+feat+"\n")
                else:
                    sys.stderr.write("\tERR: can't find ablation feature " + feat + "\n")
                    sys.exit()
                    sys.stderr.write("\tWARN: can't find ablation feature " + feat + "\n")

        sys.stderr.write("o Creating dataframe\n")
        data_x = pd.DataFrame(all_encoded_groups, columns=headers)
@@ -965,15 +964,17 @@ def rf_tokenize(self, data, sep="|", indices=None, proba=False):
            prev_group = data[i-1] if i > 0 else "_"
            next_group = data[i+1] if i < len(data)-1 else "_"

            # Protect again zero length input
            # Protect against zero length input
            if len(prev_group) == 0:
                prev_group = "_"
            if len(next_group) == 0:
                next_group = "_"
            if len(word) == 0:
                word = "_"

            if self.regex_tok is not None:
            if len(word) == 1:
                do_not_tok_indices.add(j)
            elif self.regex_tok is not None:
                for f, r in self.regex_tok:
                    if f.match(word) is not None:
                        do_not_tok_indices.add(j)
@@ -1014,15 +1015,16 @@ def rf_tokenize(self, data, sep="|", indices=None, proba=False):

        for word_idx, segmentation in enumerate(p_words):
            tokenized = ""
            if word_idx == 90:
                a=5
            if data[word_idx] == "":
                tokenized = ""
            else:
                if word_idx in do_not_tok_indices:
                    word = data[word_idx]
                    for f, r in self.regex_tok:
                        word = f.sub(r, word)
                    if len(word) == 1:
                        pass
                    else:
                        for f, r in self.regex_tok:
                            word = f.sub(r, word)
                    tokenized += word
                    if proba:
                        out_proba = 1.0
@@ -1105,7 +1107,7 @@ def rf_tokenize(self, data, sep="|", indices=None, proba=False):
    if options.retrain_all:
        print("\no Retraining on complete data set (no test partition)...")
        rf_tok.train(train_file=options.file, lexicon_file=options.lexicon, dump_model=True, output_importances=False,
                     freq_file=options.freqs, test_prop=0.0, ablations=options.ablations, conf=options.conf)
                     freq_file=options.freqs, test_prop=0.0, ablations=options.ablations, conf=options.conf, bert=options.bert)
        sys.exit()
    elif options.bert:
        sys.stderr.write("WARN: option --bert was used in predict mode; this has no effect, since saved models determine whether --bert is used\n")
6 changes: 3 additions & 3 deletions setup.py
@@ -3,15 +3,15 @@
setup(
    name = 'rftokenizer',
    packages = find_packages(),
    version = '2.0.1',
    version = '2.2.0',
    description = 'A character-wise tokenizer for morphologically rich languages',
    author = 'Amir Zeldes',
    author_email = '[email protected]',
    package_data = {'':['README.md','LICENSE.md','requirements.txt'],'rftokenizer':['data/*','pred/*','models/*']},
    url = 'https://github.com/amir-zeldes/RFTokenizer',
    install_requires=["scikit-learn","numpy","pandas","xgboost==0.81","hyperopt","joblib"],
    install_requires=["scikit-learn","numpy","pandas","xgboost==2.0.3","hyperopt","joblib"],
    license='Apache License, Version 2.0',
    download_url = 'https://github.com/amir-zeldes/RFTokenizer/releases/tag/v2.0.1',
    download_url = 'https://github.com/amir-zeldes/RFTokenizer/releases/tag/v2.2.0',
    keywords = ['NLP', 'tokenization', 'segmentation', 'morphology', 'morphological', 'Hebrew', 'Arabic', 'Coptic', 'word', 'splitting'],
    classifiers = ['Programming Language :: Python',
                   'Programming Language :: Python :: 2',
