Skip to content

Commit

Permalink
Keep line numbers as part of the doc format. Useful for printing out where errors happen when validating
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Feb 13, 2025
1 parent 916328d commit e0c149b
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 9 deletions.
18 changes: 17 additions & 1 deletion stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class MWTProcessingType(Enum):
SENTIMENT = 'sentiment'
CONSTITUENCY = 'constituency'
COREF_CHAINS = 'coref_chains'
LINE_NUMBER = 'line_number'

# field indices when converting the document to conll
FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}
Expand Down Expand Up @@ -965,7 +966,7 @@ def init_from_misc(unit):
# some key_value can not be split
key, value = key_value
# start & end char and the line number are kept as ints
if key in (START_CHAR, END_CHAR):
if key in (START_CHAR, END_CHAR, LINE_NUMBER):
value = int(value)
# set attribute
attr = f'_{key}'
Expand Down Expand Up @@ -1014,6 +1015,9 @@ def dict_to_conll_text(token_dict, id_connector="-"):
token_conll[FIELD_TO_IDX[key]] = id_connector.join([str(x) for x in token_dict[key]]) if isinstance(token_dict[key], tuple) else str(token_dict[key])
elif key in FIELD_TO_IDX:
token_conll[FIELD_TO_IDX[key]] = str(token_dict[key])
elif key == LINE_NUMBER:
# skip this when converting back for now
pass
if misc:
token_conll[FIELD_TO_IDX[MISC]] = "|".join(misc)
else:
Expand Down Expand Up @@ -1051,6 +1055,7 @@ def __init__(self, sentence, token_entry, words=None):
self._mexp = token_entry.get(MEXP, None)
self._spaces_before = ""
self._spaces_after = " "
self._line_number = None

if self._misc is not None:
init_from_misc(self)
Expand Down Expand Up @@ -1178,6 +1183,11 @@ def words(self, value):
for w in self._words:
w.parent = self

@property
def line_number(self):
""" Access the line number from the original document, if set """
return self._line_number

@property
def start_char(self):
""" Access the start character index for this token in the raw text. """
Expand Down Expand Up @@ -1323,6 +1333,7 @@ def __init__(self, sentence, word_entry):
self._sent = sentence
self._mexp = word_entry.get(MEXP, None)
self._coref_chains = None
self._line_number = None

if self._misc is not None:
init_from_misc(self)
Expand Down Expand Up @@ -1485,6 +1496,11 @@ def misc(self, value):
""" Set the word's miscellaneousness value. """
self._misc = value if self._is_null(value) == False else None

@property
def line_number(self):
""" Access the line number from the original document, if set """
return self._line_number

@property
def start_char(self):
""" Access the start character index for this token in the raw text. """
Expand Down
20 changes: 19 additions & 1 deletion stanza/tests/common/test_data_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def test_read_multiple_doc_ids():
1 This this PRON DT Number=Sing|PronType=Dem 4 nsubj _ start_char=0|end_char=4
2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 cop _ start_char=5|end_char=7
3 a a DET DT Definite=Ind|PronType=Art 4 det _ start_char=8|end_char=9
4 test test NOUN NN Number=Sing 0 root _ start_char=10|end_char=14|SpaceAfter=No
4 test test NOUN NN Number=Sing 0 root _ SpaceAfter=No|start_char=10|end_char=14
""".lstrip()

def test_convert_dict():
Expand All @@ -518,3 +518,21 @@ def test_convert_dict():
['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]

assert converted == expected

def test_line_numbers():
doc = CoNLL.conll2doc(input_str=ENGLISH_TEST_SENTENCE, keep_line_numbers=True)
# currently the line numbers are not output in the conllu format
doc_conllu = "{:C}\n".format(doc)
assert doc_conllu == ENGLISH_TEST_SENTENCE

# currently the line numbers are not output in the dict format
converted = CoNLL.convert_dict(doc.to_dict())
expected = [[['1', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '4', 'nsubj', '_', 'start_char=0|end_char=4'],
['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', 'start_char=5|end_char=7'],
['3', 'a', 'a', 'DET', 'DT', 'Definite=Ind|PronType=Art', '4', 'det', '_', 'start_char=8|end_char=9'],
['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]
assert converted == expected

for word_idx, word in enumerate(doc.sentences[0].words):
# the test sentence has two comment lines before its first word, so the word at index 0 is expected to carry line number 2
assert word.line_number == word_idx + 2
20 changes: 13 additions & 7 deletions stanza/utils/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
from stanza.models.common.doc import Document
from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, NER, START_CHAR, END_CHAR
from stanza.models.common.doc import FIELD_TO_IDX, FIELD_NUM
from stanza.models.common.doc import LINE_NUMBER

class CoNLLError(ValueError):
pass

class CoNLL:

@staticmethod
def load_conll(f, ignore_gapping=True):
def load_conll(f, ignore_gapping=True, keep_line_numbers=False):
""" Load the file or string into the CoNLL-U format data.
Input: file or string reader, where the data is in CoNLL-U format.
Output: a tuple whose first element is a list of list of list for each token in each sentence in the data,
Expand Down Expand Up @@ -43,6 +44,11 @@ def load_conll(f, ignore_gapping=True):
continue
if len(array) != FIELD_NUM:
raise CoNLLError(f"Cannot parse CoNLL line {line_idx+1}: expecting {FIELD_NUM} fields, {len(array)} found at line {line_idx}\n {array}")
if keep_line_numbers:
if array[-1] == "_" or array[-1] is None:
array[-1] = "%s=%d" % (LINE_NUMBER, line_idx)
else:
array[-1] = "%s|%s=%d" % (array[-1], LINE_NUMBER, line_idx)
sent += [array]
if len(sent) > 0:
doc.append(sent)
Expand Down Expand Up @@ -113,29 +119,29 @@ def convert_conll_token(token_conll):
return token_dict

@staticmethod
def conll2dict(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
def conll2dict(input_file=None, input_str=None, ignore_gapping=True, zip_file=None, keep_line_numbers=False):
""" Load the CoNLL-U format data from file or string into lists of dictionaries.
"""
assert any([input_file, input_str]) and not all([input_file, input_str]), 'either use input file or input string'
if zip_file: assert input_file, 'must provide input_file if zip_file is set'

if input_str:
infile = io.StringIO(input_str)
doc_conll, doc_comments = CoNLL.load_conll(infile, ignore_gapping)
doc_conll, doc_comments = CoNLL.load_conll(infile, ignore_gapping, keep_line_numbers)
elif zip_file:
with ZipFile(zip_file) as zin:
with zin.open(input_file) as fin:
doc_conll, doc_comments = CoNLL.load_conll(io.TextIOWrapper(fin, encoding="utf-8"), ignore_gapping)
doc_conll, doc_comments = CoNLL.load_conll(io.TextIOWrapper(fin, encoding="utf-8"), ignore_gapping, keep_line_numbers)
else:
with open(input_file, encoding='utf-8') as fin:
doc_conll, doc_comments = CoNLL.load_conll(fin, ignore_gapping)
doc_conll, doc_comments = CoNLL.load_conll(fin, ignore_gapping, keep_line_numbers)

doc_dict, doc_empty = CoNLL.convert_conll(doc_conll)
return doc_dict, doc_comments, doc_empty

@staticmethod
def conll2doc(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
doc_dict, doc_comments, doc_empty = CoNLL.conll2dict(input_file, input_str, ignore_gapping, zip_file=zip_file)
def conll2doc(input_file=None, input_str=None, ignore_gapping=True, zip_file=None, keep_line_numbers=False):
doc_dict, doc_comments, doc_empty = CoNLL.conll2dict(input_file, input_str, ignore_gapping, zip_file=zip_file, keep_line_numbers=keep_line_numbers)
return Document(doc_dict, text=None, comments=doc_comments, empty_sentences=doc_empty)

@staticmethod
Expand Down

0 comments on commit e0c149b

Please sign in to comment.