Keep line numbers as part of the doc format. Useful for printing out where errors happen when validating.
stanfordnlp · Feb 13, 2025 · e0c149b · e0c149b
1 parent 916328d
commit e0c149b
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 9 deletions.
diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py
@@ -46,6 +46,7 @@ class MWTProcessingType(Enum):
 SENTIMENT = 'sentiment'
 CONSTITUENCY = 'constituency'
 COREF_CHAINS = 'coref_chains'
+LINE_NUMBER = 'line_number'
 
 # field indices when converting the document to conll
 FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}
@@ -965,7 +966,7 @@ def init_from_misc(unit):
             # some key_value can not be split
             key, value = key_value
             # start & end char are kept as ints
-            if key in (START_CHAR, END_CHAR):
+            if key in (START_CHAR, END_CHAR, LINE_NUMBER):
                 value = int(value)
             # set attribute
             attr = f'_{key}'
@@ -1014,6 +1015,9 @@ def dict_to_conll_text(token_dict, id_connector="-"):
             token_conll[FIELD_TO_IDX[key]] = id_connector.join([str(x) for x in token_dict[key]]) if isinstance(token_dict[key], tuple) else str(token_dict[key])
         elif key in FIELD_TO_IDX:
             token_conll[FIELD_TO_IDX[key]] = str(token_dict[key])
+        elif key == LINE_NUMBER:
+            # skip this when converting back for now
+            pass
     if misc:
         token_conll[FIELD_TO_IDX[MISC]] = "|".join(misc)
     else:
@@ -1051,6 +1055,7 @@ def __init__(self, sentence, token_entry, words=None):
         self._mexp = token_entry.get(MEXP, None)
         self._spaces_before = ""
         self._spaces_after = " "
+        self._line_number = None
 
         if self._misc is not None:
             init_from_misc(self)
@@ -1178,6 +1183,11 @@ def words(self, value):
         for w in self._words:
             w.parent = self
 
+    @property
+    def line_number(self):
+        """ Access the line number from the original document, if set """
+        return self._line_number
+
     @property
     def start_char(self):
         """ Access the start character index for this token in the raw text. """
@@ -1323,6 +1333,7 @@ def __init__(self, sentence, word_entry):
         self._sent = sentence
         self._mexp = word_entry.get(MEXP, None)
         self._coref_chains = None
+        self._line_number = None
 
         if self._misc is not None:
             init_from_misc(self)
@@ -1485,6 +1496,11 @@ def misc(self, value):
         """ Set the word's miscellaneousness value. """
         self._misc = value if self._is_null(value) == False else None
 
+    @property
+    def line_number(self):
+        """ Access the line number from the original document, if set """
+        return self._line_number
+
     @property
     def start_char(self):
         """ Access the start character index for this token in the raw text. """

diff --git a/stanza/tests/common/test_data_conversion.py b/stanza/tests/common/test_data_conversion.py
@@ -505,7 +505,7 @@ def test_read_multiple_doc_ids():
 1	This	this	PRON	DT	Number=Sing|PronType=Dem	4	nsubj	_	start_char=0|end_char=4
 2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	4	cop	_	start_char=5|end_char=7
 3	a	a	DET	DT	Definite=Ind|PronType=Art	4	det	_	start_char=8|end_char=9
-4	test	test	NOUN	NN	Number=Sing	0	root	_	start_char=10|end_char=14|SpaceAfter=No
+4	test	test	NOUN	NN	Number=Sing	0	root	_	SpaceAfter=No|start_char=10|end_char=14
 """.lstrip()
 
 def test_convert_dict():
@@ -518,3 +518,21 @@ def test_convert_dict():
                  ['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]
 
     assert converted == expected
+
+def test_line_numbers():
+    doc = CoNLL.conll2doc(input_str=ENGLISH_TEST_SENTENCE, keep_line_numbers=True)
+    # currently the line numbers are not output in the conllu format
+    doc_conllu = "{:C}\n".format(doc)
+    assert doc_conllu == ENGLISH_TEST_SENTENCE
+
+    # currently the line numbers are not output in the dict format
+    converted = CoNLL.convert_dict(doc.to_dict())
+    expected = [[['1', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '4', 'nsubj', '_', 'start_char=0|end_char=4'],
+                 ['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', 'start_char=5|end_char=7'],
+                 ['3', 'a', 'a', 'DET', 'DT', 'Definite=Ind|PronType=Art', '4', 'det', '_', 'start_char=8|end_char=9'],
+                 ['4', 'test', 'test', 'NOUN', 'NN', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No|start_char=10|end_char=14']]]
+    assert converted == expected
+
+    for word_idx, word in enumerate(doc.sentences[0].words):
+        # the test sentence has two comments in it
+        assert word.line_number == word_idx + 2
diff --git a/stanza/utils/conll.py b/stanza/utils/conll.py
@@ -8,14 +8,15 @@
 from stanza.models.common.doc import Document
 from stanza.models.common.doc import ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, NER, START_CHAR, END_CHAR
 from stanza.models.common.doc import FIELD_TO_IDX, FIELD_NUM
+from stanza.models.common.doc import LINE_NUMBER
 
 class CoNLLError(ValueError):
     pass
 
 class CoNLL:
 
     @staticmethod
-    def load_conll(f, ignore_gapping=True):
+    def load_conll(f, ignore_gapping=True, keep_line_numbers=False):
         """ Load the file or string into the CoNLL-U format data.
         Input: file or string reader, where the data is in CoNLL-U format.
         Output: a tuple whose first element is a list of list of list for each token in each sentence in the data,
@@ -43,6 +44,11 @@ def load_conll(f, ignore_gapping=True):
                     continue
                 if len(array) != FIELD_NUM:
                     raise CoNLLError(f"Cannot parse CoNLL line {line_idx+1}: expecting {FIELD_NUM} fields, {len(array)} found at line {line_idx}\n  {array}")
+                if keep_line_numbers:
+                    if array[-1] == "_" or array[-1] is None:
+                        array[-1] = "%s=%d" % (LINE_NUMBER, line_idx)
+                    else:
+                        array[-1] = "%s|%s=%d" % (array[-1], LINE_NUMBER, line_idx)
                 sent += [array]
         if len(sent) > 0:
             doc.append(sent)
@@ -113,29 +119,29 @@ def convert_conll_token(token_conll):
         return token_dict
 
     @staticmethod
-    def conll2dict(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
+    def conll2dict(input_file=None, input_str=None, ignore_gapping=True, zip_file=None, keep_line_numbers=False):
         """ Load the CoNLL-U format data from file or string into lists of dictionaries.
         """
         assert any([input_file, input_str]) and not all([input_file, input_str]), 'either use input file or input string'
         if zip_file: assert input_file, 'must provide input_file if zip_file is set'
 
         if input_str:
             infile = io.StringIO(input_str)
-            doc_conll, doc_comments = CoNLL.load_conll(infile, ignore_gapping)
+            doc_conll, doc_comments = CoNLL.load_conll(infile, ignore_gapping, keep_line_numbers)
         elif zip_file:
             with ZipFile(zip_file) as zin:
                 with zin.open(input_file) as fin:
-                    doc_conll, doc_comments = CoNLL.load_conll(io.TextIOWrapper(fin, encoding="utf-8"), ignore_gapping)
+                    doc_conll, doc_comments = CoNLL.load_conll(io.TextIOWrapper(fin, encoding="utf-8"), ignore_gapping, keep_line_numbers)
         else:
             with open(input_file, encoding='utf-8') as fin:
-                doc_conll, doc_comments = CoNLL.load_conll(fin, ignore_gapping)
+                doc_conll, doc_comments = CoNLL.load_conll(fin, ignore_gapping, keep_line_numbers)
 
         doc_dict, doc_empty = CoNLL.convert_conll(doc_conll)
         return doc_dict, doc_comments, doc_empty
 
     @staticmethod
-    def conll2doc(input_file=None, input_str=None, ignore_gapping=True, zip_file=None):
-        doc_dict, doc_comments, doc_empty = CoNLL.conll2dict(input_file, input_str, ignore_gapping, zip_file=zip_file)
+    def conll2doc(input_file=None, input_str=None, ignore_gapping=True, zip_file=None, keep_line_numbers=False):
+        doc_dict, doc_comments, doc_empty = CoNLL.conll2dict(input_file, input_str, ignore_gapping, zip_file=zip_file, keep_line_numbers=keep_line_numbers)
         return Document(doc_dict, text=None, comments=doc_comments, empty_sentences=doc_empty)
 
     @staticmethod