Merge pull request #13 from amir-zeldes/dev
V2.2.0
amir-zeldes authored Feb 1, 2024
2 parents 8f02cb0 + d0d7a52 commit d17d79f
Showing 8 changed files with 135 additions and 36 deletions.
15 changes: 12 additions & 3 deletions README.md
@@ -53,7 +53,16 @@ Arabic data is derived from the Prague Arabic Dependency Treebank (UD_Arabic-PAD

## Performance

Current scores on the SPMRL Hebrew dataset (UD_Hebrew, V1 splits), using BERT-based predictions and lexicon data as features:
Realistic scores on the SPMRL Hebrew dataset (UD_Hebrew, V1 splits), using BERT-based predictions and lexicon data as features, trained jointly on SPMRL and other UD Hebrew IAHLT data:

```
Perfect word forms: 0.9933281004709577
Precision: 0.9923298178331735
Recall: 0.9871244635193133
F-Score: 0.9897202964379631
```
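
As a quick sanity check on the metrics in this section (illustrative, not part of the commit): the reported F-Score is the harmonic mean of the Precision and Recall above.

```python
# Illustrative check only: F-Score = harmonic mean of Precision and Recall.
# The two values below are copied from the score block above.
p = 0.9923298178331735
r = 0.9871244635193133
f = 2 * p * r / (p + r)
print(f)  # ~0.98972..., matching the reported F-Score
```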

Clean experimental scores on the SPMRL Hebrew dataset (UD_Hebrew, V1 splits), using BERT-based predictions and lexicon data as features and training only on SPMRL:

```
Perfect word forms: 0.9918367346938776
@@ -62,7 +71,7 @@ Recall: 0.9864091559370529
F-Score: 0.9874686716791979
```

Or without BERT:
Or the latter without BERT:

```
Perfect word forms: 0.9821036106750393
@@ -71,7 +80,7 @@ Recall: 0.967103694874851
F-Score: 0.9716201652496708
```

Scores on Hebrew Wiki5K (out-of-domain, with BERT):
Scores on Hebrew Wiki5K (out-of-domain, with BERT, train on SPMRL):

```
Perfect word forms: 0.9907224634820371
10 changes: 5 additions & 5 deletions requirements.txt
@@ -1,7 +1,7 @@
scikit-learn
joblib
scikit-learn==1.3.2
joblib==1.3.2
numpy
pandas
xgboost==0.81
pandas==2.1.2
xgboost==2.0.3
hyperopt
flair==0.6.1
flair==0.13.0
65 changes: 65 additions & 0 deletions rftokenizer/conllu2segs.py
@@ -0,0 +1,65 @@
"""
Simple utility to convert conllu files to tab-separated files with segments
"""

import io, os, sys, re
from glob import glob

def get_segs(conllu):
    super_length = 0
    limit = 10  # Maximum bound group length in units, discard sentences with longer groups
    sents = []
    words = []
    labels = []
    word = []
    max_len = 0
    lines = conllu.split("\n")
    for l, line in enumerate(lines):
        if "\t" in line:
            fields = line.split("\t")
            if "-" in fields[0]:
                start, end = fields[0].split("-")
                super_length = int(end) - int(start) + 1
            else:
                if super_length > 0:
                    word.append(fields[1])
                    super_length -= 1
                    if super_length == 0:
                        words.append("".join(word))
                        labels.append("|".join(word))
                        if len(word) > max_len:
                            max_len = len(word)
                        word = []
                else:
                    if "SpaceAfter=No" in line and ("ADP\t" in line or "DET\t" in line):
                        done = False
                        word.append(fields[1])
                        counter = 1
                        while not done:
                            if "SpaceAfter" in lines[l+counter] and not ("\t,\t" in lines[l+counter+1] or "\t.\t" in lines[l+counter+1]):
                                super_length += 1
                                counter += 1
                            else:
                                super_length += 1
                                done = True
                        if super_length > 10:
                            print(l)
                            quit()
                    else:
                        words.append(fields[1])
                        labels.append(fields[1])
        elif len(line) == 0 and len(words) > 0:
            if max_len > limit or " " in "".join(words):  # Reject sentence
                max_len = 0
            else:
                sents.append("\n".join([w + "\t" + l for w, l in zip(words, labels)]))
            words = []
            labels = []
    return "\n".join(sents)

files = glob("*.conllu")

for file_ in files:
    seg_data = get_segs(io.open(file_).read())
    with io.open(os.path.basename(file_) + ".tab",'w',encoding="utf8",newline="\n") as f:
        f.write(seg_data)
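
A brief usage note (not part of the diff, inferred from the script above): running `python conllu2segs.py` in a directory containing `.conllu` files writes a `<name>.conllu.tab` file for each input, with one whitespace token per line paired with its pipe-separated segmentation. The Hebrew rows below are illustrative:

```
לבית	ל|בית
הספר	ה|ספר
```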
10 changes: 7 additions & 3 deletions rftokenizer/data/heb.conf
@@ -24,7 +24,11 @@ allowed=
# Tokenization patterns based on regular expressions (machine learning tokenizer is not consulted for these)
regex_tok=
^([0-9\.,A-Za-z]+)$ \1
^(ב|ו|ל|מ|כ|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3
^(ו)(ב|ל|מ|כ|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ש|ב|ו|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3
^(ש|ב|ו|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)([$€₪])$ \1|\2|\3|\4
^(ו)(ש|ב|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ו)(ש|ב|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)([$€₪])$ \1|\2|\3|\4|\5
^(ש|ב|ל|מ)(כ)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ש|ב|ל|מ)(כ)([-־])([0-9\./,A-Za-z']+)([$€₪])$ \1|\2|\3|\4|\5
^(ל|מ)(כ|ה)([-־])([0-9\./,A-Za-z']+)$ \1|\2|\3|\4
^(ב|ל|מ|כ|ה)([0-9\./,A-Za-z']+)$ \1|\2
^(ב|ל|מ|כש?|ה|ש)([0-9\./,A-Za-z']+)$ \1|\2
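
For readers unfamiliar with the `regex_tok` format: each line pairs a pattern with a replacement in which `|` marks segment boundaries, and tokenize_rf.py applies matching rules with a plain regex substitution (`f.sub(r, word)` for each compiled pair). The sketch below is illustrative only; the rule string is copied from the config above and `apply_rule` is a hypothetical helper, not part of the repo:

```python
import re

# Illustrative sketch of how a regex_tok rule is applied.
pattern, replacement = r"^(ש|ב|ו|ל|מ|כש?|ה)([-־])([0-9\./,A-Za-z']+)$", r"\1|\2|\3"
rule = (re.compile(pattern), replacement)

def apply_rule(word, rule):
    f, r = rule
    # Only rewrite words the pattern matches; "|" in the replacement marks segment boundaries
    return f.sub(r, word) if f.match(word) is not None else word

print(apply_rule("ב-2024", rule))  # -> ב|-|2024 (prefix, hyphen and number as separate segments)
```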
41 changes: 30 additions & 11 deletions rftokenizer/flair_pos_tagger.py
@@ -6,6 +6,7 @@


from argparse import ArgumentParser
import flair
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import OneHotEmbeddings, TransformerWordEmbeddings, StackedEmbeddings
@@ -22,9 +23,12 @@

seed(42)

flair_version = int(flair.__version__.split(".")[1])

script_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep
model_dir = script_dir + ".." + os.sep + "models" + os.sep
model_dir = script_dir + ".." + os.sep + "models_311" + os.sep
CONLLU_ROOT = "conllu" + os.sep # Path to UD .conllu corpus repo directory
CONLLU_ROOT = "C:\\Uni\\Corpora\\Hebrew\\UD_Hebrew-joint" + os.sep # Path to UD .conllu corpus repo directory
TARGET_FEATS = {} # If using this tagger for specific features, specify them here
lang_prefix = "heb" # Prefix for the language name in the model, e.g. heb for Hebrew

@@ -39,7 +43,7 @@ def __init__(self, train=False, morph=False, seg=False):
        if not os.path.exists(model_dir + lang_prefix + ".seg"):
            sys.stderr.write("! Model file " + model_dir + lang_prefix + ".seg not found\n")
            sys.stderr.write("! Attempting to download it... (this could take a while)\n")
            url = "https://corpling.uis.georgetown.edu/amir/download/heb_models_v2/" + lang_prefix + ".seg"
            url = "https://gucorpling.org/amir/download/heb_models_v4/" + lang_prefix + ".seg"
            urlretrieve(url, model_dir + lang_prefix + ".seg")
            sys.stderr.write("! Done!\n")
        self.model = SequenceTagger.load(model_dir + lang_prefix + ".seg")
@@ -94,7 +98,7 @@ def segs2tag(segs):
        tag = "WBY"
    elif "XS" in tag:
        tag = "X"
    return tag
    return tag.replace("SS","S")

def conllu2segs(conllu, target="affixes"):
    super_length = 0
@@ -264,7 +268,10 @@ def train(self, cuda_safe=True, positional=True, tags=False, seg=False):
        tag_type = "seg"

        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        if flair_version > 8:
            tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
        else:
            tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # 4. initialize embeddings
@@ -291,8 +298,8 @@ def train(self, cuda_safe=True, positional=True, tags=False, seg=False):
            embeddings=stacked,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
            use_rnn=True)
            use_crf=False,
            use_rnn=False)

        # 6. initialize trainer
        from flair.trainers import ModelTrainer
@@ -302,7 +309,7 @@ def train(self, cuda_safe=True, positional=True, tags=False, seg=False):
        # 7. start training
        trainer.train(script_dir + "pos-dependencies" + os.sep + 'flair_tagger',
                      learning_rate=0.1,
                      mini_batch_size=15,
                      mini_batch_size=24,
                      max_epochs=150)

    def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=False, tags=False, seg=False):
@@ -325,7 +332,11 @@ def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=
        for line in data.split("\n"):
            if len(line.strip())==0:
                if len(words) > 0:
                    sents.append(Sentence(" ".join(words),use_tokenizer=lambda x:x.split(" ")))
                    if flair_version > 8:
                        tokenizer = False
                    else:
                        tokenizer = lambda x:x.split(" ")
                    sents.append(Sentence(" ".join(words),use_tokenizer=tokenizer))
                    for i, word in enumerate(sents[-1]):
                        if not seg:
                            word.add_label("super",positions[i])
@@ -371,7 +382,10 @@ def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=
                    true_pos.append(line.split("\t")[4])

        # predict tags and print
        model.predict(sents)#, all_tag_prob=True)
        if flair_version > 8:
            model.predict(sents, force_token_predictions=True, return_probabilities_for_all_classes=True)
        else:
            model.predict(sents)  # , all_tag_prob=True)

preds = []
scores = []
@@ -382,8 +396,13 @@ def predict(self, in_path=None, in_format="flair", out_format="conllu", as_text=
                pred = tok.labels[2].value
                score = str(tok.labels[2].score)
            elif seg:
                pred = tok.labels[0].value
                score = str(tok.labels[0].score)
                if flair_version > 8:
                    pred = tok.labels[0].value if len(tok.labels)>0 else "O"
                    score = tok.labels[0].score if len(tok.labels) > 0 else "1.0"
                else:
                    label = tok.labels[0]
                    pred = label.value
                    score = str(label.score)
            else:
                pred = tok.labels[1].value
                score = str(tok.labels[1].score)
Binary file modified rftokenizer/models/heb.sm3
24 changes: 13 additions & 11 deletions rftokenizer/tokenize_rf.py
@@ -7,9 +7,9 @@
for Morphologically Rich Languages (MRLs)
"""

__version__ = "2.0.1"
__version__ = "2.2.0"
__author__ = "Amir Zeldes"
__copyright__ = "Copyright 2018-2019, Amir Zeldes"
__copyright__ = "Copyright 2018-2024, Amir Zeldes"
__license__ = "Apache 2.0"


@@ -762,8 +762,7 @@ def train(self, train_file, lexicon_file=None, freq_file=None, test_prop=0.1, ou
                if found:
                    sys.stderr.write("\t"+feat+"\n")
                else:
                    sys.stderr.write("\tERR: can't find ablation feature " + feat + "\n")
                    sys.exit()
                    sys.stderr.write("\tWARN: can't find ablation feature " + feat + "\n")

        sys.stderr.write("o Creating dataframe\n")
        data_x = pd.DataFrame(all_encoded_groups, columns=headers)
@@ -965,15 +964,17 @@ def rf_tokenize(self, data, sep="|", indices=None, proba=False):
            prev_group = data[i-1] if i > 0 else "_"
            next_group = data[i+1] if i < len(data)-1 else "_"

            # Protect again zero length input
            # Protect against zero length input
            if len(prev_group) == 0:
                prev_group = "_"
            if len(next_group) == 0:
                next_group = "_"
            if len(word) == 0:
                word = "_"

            if self.regex_tok is not None:
            if len(word) == 1:
                do_not_tok_indices.add(j)
            elif self.regex_tok is not None:
                for f, r in self.regex_tok:
                    if f.match(word) is not None:
                        do_not_tok_indices.add(j)
@@ -1014,15 +1015,16 @@ def rf_tokenize(self, data, sep="|", indices=None, proba=False):

        for word_idx, segmentation in enumerate(p_words):
            tokenized = ""
            if word_idx == 90:
                a=5
            if data[word_idx] == "":
                tokenized = ""
            else:
                if word_idx in do_not_tok_indices:
                    word = data[word_idx]
                    for f, r in self.regex_tok:
                        word = f.sub(r, word)
                    if len(word) == 1:
                        pass
                    else:
                        for f, r in self.regex_tok:
                            word = f.sub(r, word)
                    tokenized += word
                    if proba:
                        out_proba = 1.0
@@ -1105,7 +1107,7 @@ def rf_tokenize(self, data, sep="|", indices=None, proba=False):
    if options.retrain_all:
        print("\no Retraining on complete data set (no test partition)...")
        rf_tok.train(train_file=options.file, lexicon_file=options.lexicon, dump_model=True, output_importances=False,
                     freq_file=options.freqs, test_prop=0.0, ablations=options.ablations, conf=options.conf)
                     freq_file=options.freqs, test_prop=0.0, ablations=options.ablations, conf=options.conf, bert=options.bert)
        sys.exit()
    elif options.bert:
        sys.stderr.write("WARN: option --bert was used in predict mode; this has no effect, since saved models determine whether --bert is used\n")
6 changes: 3 additions & 3 deletions setup.py
@@ -3,15 +3,15 @@
setup(
    name = 'rftokenizer',
    packages = find_packages(),
    version = '2.0.1',
    version = '2.2.0',
    description = 'A character-wise tokenizer for morphologically rich languages',
    author = 'Amir Zeldes',
    author_email = '[email protected]',
    package_data = {'':['README.md','LICENSE.md','requirements.txt'],'rftokenizer':['data/*','pred/*','models/*']},
    url = 'https://github.com/amir-zeldes/RFTokenizer',
    install_requires=["scikit-learn","numpy","pandas","xgboost==0.81","hyperopt","joblib"],
    install_requires=["scikit-learn","numpy","pandas","xgboost==2.0.3","hyperopt","joblib"],
    license='Apache License, Version 2.0',
    download_url = 'https://github.com/amir-zeldes/RFTokenizer/releases/tag/v2.0.1',
    download_url = 'https://github.com/amir-zeldes/RFTokenizer/releases/tag/v2.2.0',
    keywords = ['NLP', 'tokenization', 'segmentation', 'morphology', 'morphological', 'Hebrew', 'Arabic', 'Coptic', 'word', 'splitting'],
    classifiers = ['Programming Language :: Python',
                   'Programming Language :: Python :: 2',
