Skip to content

Commit

Permalink
Fix missing expected links by token during inference process (#120)
Browse files Browse the repository at this point in the history
  • Loading branch information
QubitPi authored Jan 3, 2025
1 parent f8eed40 commit 0accedb
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 58 deletions.
2 changes: 1 addition & 1 deletion german.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6219,7 +6219,7 @@ vocabulary:
- at all
- any
- term: jemand
definition: someone
definition: (pron.) someone
- term: Ergebnis
definition: result
- term: führt zu keinem Ergebnis
Expand Down
26 changes: 24 additions & 2 deletions huggingface/vocabulary_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,21 @@ def get_inflection_tokens(


def get_tokens_of(word: dict, inflection_supplier: Callable[[object], dict[str, str]] = lambda word: {}) -> set[str]:
    """
    Collects every token of a vocabulary entry that link inference relies on.

    Tokens are gathered from three sources and merged into one set:

    1. the "term" attribute
    2. the "definition" attribute
    3. the inflection table (conjugation & declension) produced by the supplied function

    :param word: A list entry of wilhelm-vocabulary repo YAML file deserialized
    :param inflection_supplier: A functional object that, given a YAML dictionary, returns the inflection table of that
           word. The key of the table can be arbitrary but the value must be a sole inflected word

    :return: a set of tokens
    """
    term_tokens = get_term_tokens(word)
    definition_tokens = get_definition_tokens(word)
    inflection_tokens = get_inflection_tokens(word, inflection_supplier)
    return inflection_tokens | term_tokens | definition_tokens


Expand Down Expand Up @@ -275,11 +290,15 @@ def get_inferred_tokenization_links(
:param vocabulary: A wilhelm-vocabulary repo YAML file deserialized
:param label_key: The name of the node attribute that will be used as the label in displaying the node
:param inflection_supplier: A functional object that, given a YAML dictionary, returns the inflection table of that
word. The key of the table can be arbitrary but the value must be a sole inflected word
:return: a list of link object, each of which has a "source_label", a "target_label", and an "attributes" key
"""
all_vocabulary_tokenizations_by_term = dict(
[word["term"], get_tokens_of(word, inflection_supplier)] for word in vocabulary)

existing_pairs: set[set] = set()
inferred_links = []
for this_word in vocabulary:
this_term = this_word["term"]
Expand All @@ -290,14 +309,17 @@ def get_inferred_tokenization_links(
if this_term == that_term:
continue

for this_token in get_term_tokens(this_word):
for this_token in all_vocabulary_tokenizations_by_term[this_term]:
for that_token in that_term_tokens:
if this_token.lower().strip() == that_token:
if this_token.lower().strip() == that_token and ({this_term, that_term} not in existing_pairs):
existing_pairs.add(frozenset({this_term, that_term}))

inferred_links.append({
"source_label": this_term,
"target_label": that_term,
"attributes": {label_key: "term related"},
})

jump_to_next_term = True
break

Expand Down
120 changes: 65 additions & 55 deletions tests/test_vocabulary_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import yaml
Expand All @@ -34,8 +35,8 @@
definition: the grilled tomato
declension: Unknown
"""

LABEL_KEY = "label"
DIR_PATH = os.path.dirname(os.path.realpath(__file__))


class TestVocabularyParser(unittest.TestCase):
Expand Down Expand Up @@ -163,28 +164,39 @@ def test_get_inferred_links_on_unrelated_terms_with_same_definite_article(self):

def test_get_definition_tokens(self):
vocabulary = yaml.safe_load("""
vocabulary:
- term: morgens
definition:
- (adv.) in the morning
- (adv.) a.m.
""")["vocabulary"]
vocabulary:
- term: morgens
definition:
- (adv.) in the morning
- (adv.) a.m.
""")["vocabulary"]
self.assertEqual(
{"morning", "a.m."},
get_definition_tokens(vocabulary[0])
)

vocabulary = yaml.safe_load("""
vocabulary:
- term: exekutieren
definition: to execute (kill)
audio: https://upload.wikimedia.org/wikipedia/commons/f/f1/De-exekutieren.ogg
""")["vocabulary"]
vocabulary:
- term: exekutieren
definition: to execute (kill)
audio: https://upload.wikimedia.org/wikipedia/commons/f/f1/De-exekutieren.ogg
""")["vocabulary"]
self.assertEqual(
{"execute", "kill"},
get_definition_tokens(vocabulary[0])
)

vocabulary = yaml.safe_load("""
vocabulary:
- term: töten
definition: to kill
audio: https://upload.wikimedia.org/wikipedia/commons/b/b0/De-t%C3%B6ten.ogg
""")["vocabulary"]
self.assertEqual(
{"kill"},
get_definition_tokens(vocabulary[0])
)

def test_get_term_tokens(self):
vocabulary = yaml.safe_load("""
vocabulary:
Expand Down Expand Up @@ -257,49 +269,47 @@ def test_two_words_sharing_some_same_declension_table_entries(self):
)

def test_get_inferred_tokenization_links(self):
vocabulary = yaml.safe_load("""
vocabulary:
- term: das Jahr
definition: the year
declension:
- ["", singular, plural ]
- [nominative, Jahr, "Jahre, Jahr" ]
- [genitive, "Jahres, Jahrs", "Jahre, Jahr" ]
- [dative, Jahr, "Jahren, Jahr"]
- [accusative, Jahr, "Jahre, Jahr" ]
- term: seit zwei Jahren
definition: for two years
- term: letzte
definition: (adj.) last
- term: in den letzten Jahren
definition: in recent years
""")["vocabulary"]

self.assertEqual(
[
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'seit zwei Jahren',
'target_label': 'das Jahr'
},
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'seit zwei Jahren',
'target_label': 'in den letzten Jahren'
},
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'in den letzten Jahren',
'target_label': 'das Jahr'
},
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'in den letzten Jahren',
'target_label': 'seit zwei Jahren'
}
],
get_inferred_tokenization_links(vocabulary, LABEL_KEY, get_declension_attributes)
)
test_cases = [
{
"words": ["das Jahr", "seit zwei Jahren", "letzte", "in den letzten Jahren"],
"expected": [
{
'attributes': {'label': 'term related'},
'source_label': 'das Jahr',
'target_label': 'seit zwei Jahren'
},
{
'attributes': {'label': 'term related'},
'source_label': 'das Jahr',
'target_label': 'in den letzten Jahren'
},
{
'attributes': {'label': 'term related'},
'source_label': 'seit zwei Jahren',
'target_label': 'in den letzten Jahren'
}
]
},
{
"words": ["exekutieren", "töten"],
"expected": [
{
'attributes': {LABEL_KEY: 'term related'},
'source_label': 'exekutieren',
'target_label': 'töten'
},
]
}
]

for test_case in test_cases:
with open("{path}/../german.yaml".format(path=DIR_PATH), "r") as f:
vocabulary = [word for word in yaml.safe_load(f)["vocabulary"] if word["term"] in test_case["words"]]

self.assertEqual(
test_case["expected"],
get_inferred_tokenization_links(vocabulary, LABEL_KEY, get_declension_attributes)
)

def test_get_structurally_similar_links(self):
vocabulary = yaml.safe_load("""
Expand Down

0 comments on commit 0accedb

Please sign in to comment.