Skip to content

Commit

Permalink
Fix missing expected links by token during inference process (#120)
Browse files Browse the repository at this point in the history
  • Loading branch information
QubitPi authored Jan 3, 2025
1 parent f8eed40 commit 0accedb
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 58 deletions.
2 changes: 1 addition & 1 deletion german.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6219,7 +6219,7 @@ vocabulary:
- at all
- any
- term: jemand
definition: someone
definition: (pron.) someone
- term: Ergebnis
definition: result
- term: führt zu keinem Ergebnis
Expand Down
26 changes: 24 additions & 2 deletions huggingface/vocabulary_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,21 @@ def get_inflection_tokens(


def get_tokens_of(word: dict, inflection_supplier: Callable[[object], dict[str, str]] = lambda word: {}) -> set[str]:
    """
    Collects every token of a vocabulary entry that link inference relies on.

    Tokens are gathered from three sources and merged into one set:

    1. the "term" attribute
    2. the "definition" attribute
    3. the inflection table (conjugation & declension) produced by the supplied function

    :param word: A list entry of wilhelm-vocabulary repo YAML file deserialized
    :param inflection_supplier: A functional object that, given a YAML dictionary, returns the inflection table of that
           word. The key of the table can be arbitrary but the value must be a sole inflected word

    :return: a set of tokens
    """
    term_tokens = get_term_tokens(word)
    definition_tokens = get_definition_tokens(word)
    inflection_tokens = get_inflection_tokens(word, inflection_supplier)
    return inflection_tokens | term_tokens | definition_tokens


Expand Down Expand Up @@ -275,11 +290,15 @@ def get_inferred_tokenization_links(
:param vocabulary: A wilhelm-vocabulary repo YAML file deserialized
:param label_key: The name of the node attribute that will be used as the label in displaying the node
:param inflection_supplier: A functional object that, given a YAML dictionary, returns the inflection table of that
word. The key of the table can be arbitrary but the value must be a sole inflected word
:return: a list of link object, each of which has a "source_label", a "target_label", and an "attributes" key
"""
all_vocabulary_tokenizations_by_term = dict(
[word["term"], get_tokens_of(word, inflection_supplier)] for word in vocabulary)

existing_pairs: set[set] = set()
inferred_links = []
for this_word in vocabulary:
this_term = this_word["term"]
Expand All @@ -290,14 +309,17 @@ def get_inferred_tokenization_links(
if this_term == that_term:
continue

for this_token in get_term_tokens(this_word):
for this_token in all_vocabulary_tokenizations_by_term[this_term]:
for that_token in that_term_tokens:
if this_token.lower().strip() == that_token:
if this_token.lower().strip() == that_token and ({this_term, that_term} not in existing_pairs):
existing_pairs.add(frozenset({this_term, that_term}))

inferred_links.append({
"source_label": this_term,
"target_label": that_term,
"attributes": {label_key: "term related"},
})

jump_to_next_term = True
break

Expand Down
120 changes: 65 additions & 55 deletions tests/test_vocabulary_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import yaml
Expand All @@ -34,8 +35,8 @@
definition: the grilled tomato
declension: Unknown
"""

LABEL_KEY = "label"
DIR_PATH = os.path.dirname(os.path.realpath(__file__))


class TestVocabularyParser(unittest.TestCase):
Expand Down Expand Up @@ -163,28 +164,39 @@ def test_get_inferred_links_on_unrelated_terms_with_same_definite_article(self):

def test_get_definition_tokens(self):
vocabulary = yaml.safe_load("""
vocabulary:
- term: morgens
definition:
- (adv.) in the morning
- (adv.) a.m.
""")["vocabulary"]
vocabulary:
- term: morgens
definition:
- (adv.) in the morning
- (adv.) a.m.
""")["vocabulary"]
self.assertEqual(
{"morning", "a.m."},
get_definition_tokens(vocabulary[0])
)

vocabulary = yaml.safe_load("""
vocabulary:
- term: exekutieren
definition: to execute (kill)
audio: https://upload.wikimedia.org/wikipedia/commons/f/f1/De-exekutieren.ogg
""")["vocabulary"]
vocabulary:
- term: exekutieren
definition: to execute (kill)
audio: https://upload.wikimedia.org/wikipedia/commons/f/f1/De-exekutieren.ogg
""")["vocabulary"]
self.assertEqual(
{"execute", "kill"},
get_definition_tokens(vocabulary[0])
)

vocabulary = yaml.safe_load("""
vocabulary:
- term: töten
definition: to kill
audio: https://upload.wikimedia.org/wikipedia/commons/b/b0/De-t%C3%B6ten.ogg
""")["vocabulary"]
self.assertEqual(
{"kill"},
get_definition_tokens(vocabulary[0])
)

def test_get_term_tokens(self):
vocabulary = yaml.safe_load("""
vocabulary:
Expand Down Expand Up @@ -257,49 +269,47 @@ def test_two_words_sharing_some_same_declension_table_entries(self):
)

def test_get_inferred_tokenization_links(self):
vocabulary = yaml.safe_load("""
vocabulary:
- term: das Jahr
definition: the year
declension:
- ["", singular, plural ]
- [nominative, Jahr, "Jahre, Jahr" ]
- [genitive, "Jahres, Jahrs", "Jahre, Jahr" ]
- [dative, Jahr, "Jahren, Jahr"]
- [accusative, Jahr, "Jahre, Jahr" ]
- term: seit zwei Jahren
definition: for two years
- term: letzte
definition: (adj.) last
- term: in den letzten Jahren
definition: in recent years
""")["vocabulary"]

self.assertEqual(
[
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'seit zwei Jahren',
'target_label': 'das Jahr'
},
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'seit zwei Jahren',
'target_label': 'in den letzten Jahren'
},
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'in den letzten Jahren',
'target_label': 'das Jahr'
},
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'in den letzten Jahren',
'target_label': 'seit zwei Jahren'
}
],
get_inferred_tokenization_links(vocabulary, LABEL_KEY, get_declension_attributes)
)
test_cases = [
{
"words": ["das Jahr", "seit zwei Jahren", "letzte", "in den letzten Jahren"],
"expected": [
{
'attributes': {'label': 'term related'},
'source_label': 'das Jahr',
'target_label': 'seit zwei Jahren'
},
{
'attributes': {'label': 'term related'},
'source_label': 'das Jahr',
'target_label': 'in den letzten Jahren'
},
{
'attributes': {'label': 'term related'},
'source_label': 'seit zwei Jahren',
'target_label': 'in den letzten Jahren'
}
]
},
{
"words": ["exekutieren", "töten"],
"expected": [
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'exekutieren',
'target_label': 'töten'
},
]
}
]

for test_case in test_cases:
with open("{path}/../german.yaml".format(path=DIR_PATH), "r") as f:
vocabulary = [word for word in yaml.safe_load(f)["vocabulary"] if word["term"] in test_case["words"]]

self.assertEqual(
test_case["expected"],
get_inferred_tokenization_links(vocabulary, LABEL_KEY, get_declension_attributes)
)

def test_get_structurally_similar_links(self):
vocabulary = yaml.safe_load("""
Expand Down

0 comments on commit 0accedb

Please sign in to comment.