
Commit

Version 3.3.0; fixed bug in opening quotes that follow paragraph markers
vthorsteinsson committed Sep 8, 2021
1 parent c8d6e2e commit 7535222
Showing 3 changed files with 21 additions and 25 deletions.
6 changes: 4 additions & 2 deletions README.rst
@@ -809,11 +809,13 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
Changelog
---------

+* Version 3.3.0: Fixed bug where opening quotes at the start of paragraphs
+  were sometimes incorrectly recognized and normalized.
 * Version 3.2.0: Numbers and amounts that consist of word tokens only ('sex hundruð')
   are now returned as the original ``TOK.WORD``s ('sex' and 'hundruð'), not as single
   coalesced ``TOK.NUMBER``/``TOK.AMOUNT``/etc. tokens.
-* Version 3.1.2: Changed paragraph markers to ``[[`` and ``]]`` (removing spaces)
-* Version 3.1.1: Minor fixes; added Tok.from_token()
+* Version 3.1.2: Changed paragraph markers to ``[[`` and ``]]`` (removing spaces).
+* Version 3.1.1: Minor fixes; added Tok.from_token().
 * Version 3.1.0: Added ``-o`` switch to ``tokenize`` command to return original
   token text, enabling the tokenizer to run as a sentence splitter only.
 * Version 3.0.0: Added tracking of character offsets for tokens within the
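As an illustration of the 3.3.0 fix, here is a minimal sketch using the package's top-level ``tokenize()`` generator. The ``[[``/``]]`` paragraph markers and the ``TOK.P_BEGIN``/``TOK.P_END`` kinds are part of the package; the exact normalized form of the quote depends on the tokenizer options, so the sketch only shows where the opening quote sits relative to the marker.

# Sketch of the case fixed in 3.3.0: a straight quote immediately after
# a [[ paragraph marker should be treated as an *opening* quote.
# Exact quote normalization depends on tokenizer options.
from tokenizer import TOK, tokenize

text = '[[ "Góðan dag," sagði hún. ]]'

for t in tokenize(text):
    if t.kind == TOK.P_BEGIN:
        print("P_BEGIN")
    elif t.kind == TOK.P_END:
        print("P_END")
    else:
        print(t.kind, repr(t.txt))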
38 changes: 16 additions & 22 deletions src/tokenizer/tokenizer.py
@@ -846,12 +846,11 @@ def Split_Sentence(t: Optional[Tok] = None) -> Tok:


 class TokenStream:
-    """
-    Wrapper for token iterator allowing lookahead.
-    """
+    """ Wrapper for token iterator allowing lookahead. """
 
     def __init__(self, token_it: Iterator[Tok], *, lookahead_size: int = 2):
-        """Initialize from token iterator."""
+        """ Initialize from token iterator. """
         self.__it: Iterator[Tok] = token_it
         if lookahead_size <= 0:
             lookahead_size = 1
@@ -880,59 +879,57 @@ def __getitem__(self, i: int) -> Optional[Tok]:
         return None
 
     def txt(self, i: int = 0) -> Optional[str]:
-        """Return token.txt for token at index i."""
+        """ Return token.txt for token at index i. """
         t = self[i]
         return t.txt if t else None
 
     def kind(self, i: int = 0) -> Optional[int]:
-        """Return token.kind for token at index i."""
+        """ Return token.kind for token at index i. """
         t = self[i]
         return t.kind if t else None
 
     def punctuation(self, i: int = 0) -> Optional[str]:
-        """Return token.punctuation for token at index i."""
+        """ Return token.punctuation for token at index i. """
         t = self[i]
         return t.punctuation if t else None
 
     def number(self, i: int = 0) -> Optional[float]:
-        """Return token.number for token at index i."""
+        """ Return token.number for token at index i. """
         t = self[i]
         return t.number if t else None
 
     def integer(self, i: int = 0) -> Optional[int]:
-        """Return token.integer for token at index i."""
+        """ Return token.integer for token at index i. """
         t = self[i]
         return t.integer if t else None
 
     def ordinal(self, i: int = 0) -> Optional[int]:
-        """Return token.ordinal for token at index i."""
+        """ Return token.ordinal for token at index i. """
         t = self[i]
         return t.ordinal if t else None
 
     def has_meanings(self, i: int = 0) -> Optional[bool]:
-        """Return token.has_meanings for token at index i."""
+        """ Return token.has_meanings for token at index i. """
         t = self[i]
         return t.has_meanings if t else None
 
     def meanings(self, i: int = 0) -> Optional[BIN_TupleList]:
-        """Return token.meanings for token at index i."""
+        """ Return token.meanings for token at index i. """
         t = self[i]
         return t.meanings if t else None
 
     def person_names(self, i: int = 0) -> Optional[PersonNameList]:
-        """Return token.person_names for token at index i."""
+        """ Return token.person_names for token at index i. """
         t = self[i]
         return t.person_names if t else None
 
     def as_tuple(self, i: int = 0) -> Optional[Tuple[Any, ...]]:
-        """Return token.as_tuple for token at index i."""
+        """ Return token.as_tuple for token at index i. """
         t = self[i]
         return t.as_tuple if t else None
 
     def could_be_end_of_sentence(self, i: int = 0, *args: Any) -> bool:
-        """
-        Wrapper to safely check if token at index i could be end of sentence.
-        """
+        """ Wrapper to safely check if token at index i could be end of sentence. """
         t = self[i]
         return could_be_end_of_sentence(t, *args) if t else False
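All of these accessors follow the same pattern: index into the lookahead buffer and return None (or False) when no token is available, so callers can probe ahead without guarding every lookup. A small usage sketch follows; TokenStream and parse_tokens are internals of src/tokenizer/tokenizer.py rather than documented public API, so the import path is an assumption.

# Usage sketch for TokenStream's lookahead accessors (internal API;
# the import path below is assumed from this file's location)
from tokenizer.tokenizer import TokenStream, parse_tokens

ts = TokenStream(parse_tokens("Hún kom kl. 14 í dag."), lookahead_size=2)

print(ts.txt(0))    # text of the current token, without consuming it
print(ts.kind(1))   # kind of the next token
print(ts.txt(99))   # None: out-of-range lookahead is safe
print(ts.could_be_end_of_sentence(99))  # False rather than an exception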

@@ -1500,7 +1497,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]:
     # 7) The process is repeated from step 4) until the current raw token is
     # exhausted. At that point, we obtain the next token and start from 2).
 
-    rt: Tok
+    rtxt: str = ""
+    inside_paragraph_marker: bool = False
 
     for rt in generate_rough_tokens(
@@ -1519,7 +1516,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]:
         # !!! the .txt attribute. Once Pylance/Pyright has been fixed, it is
         # !!! probably a good idea to go back to using rt.txt since it
         # !!! makes the code more resilient (if a bit slower).
-        rtxt: str = rt.txt
+        rtxt = rt.txt
 
         if rtxt.isalpha() or rtxt in SI_UNITS:
             # Shortcut for most common case: pure word
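The two hunks above set up state for parse_tokens' main loop. The new inside_paragraph_marker flag carries information across raw tokens, so that a quote directly following a [[ marker can be classified as an opening quote even though no regular text precedes it. Below is a hypothetical, much-simplified sketch of that idea; it is not the actual parse_tokens logic.

# Hypothetical simplification of the inside_paragraph_marker idea;
# not the actual parse_tokens implementation.
def classify_quotes(tokens):
    inside_paragraph_marker = False
    for tok in tokens:
        if tok == "[[":
            inside_paragraph_marker = True
            yield ("P_BEGIN", tok)
        elif tok == '"':
            # Directly after a paragraph marker, a straight quote must
            # open; otherwise the preceding token would decide.
            yield ("OPENING_QUOTE" if inside_paragraph_marker else "QUOTE", tok)
            inside_paragraph_marker = False
        else:
            inside_paragraph_marker = False
            yield ("OTHER", tok)

print(list(classify_quotes(["[[", '"', "Halló", '"', "]]"])))
# [('P_BEGIN', '[['), ('OPENING_QUOTE', '"'), ('OTHER', 'Halló'),
#  ('QUOTE', '"'), ('OTHER', ']]')]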
@@ -1840,10 +1837,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok]:
             ate = True
 
         # Alphabetic characters
-        # (or a hyphen immediately followed by alphabetic characters,
-        # such as in 'þingkonur og -menn')
         if rt.txt and rt.txt[0].isalpha():
-            # XXX: This does not seem to fit the comment above ('-'.isalpha()==False)
             ate = True
             lw = len(rt.txt)
             i = 1
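The last hunk also drops a stale comment: the guard tests the token's first character with str.isalpha(), and a hyphen is not alphabetic, so a form like '-menn' never took this branch. That is exactly what the removed XXX note pointed out, and a quick check confirms it:

# A hyphen is not alphabetic, so '-menn' fails the rt.txt[0].isalpha() guard
print("-".isalpha())          # False
print("-menn"[0].isalpha())   # False
print("menn"[0].isalpha())    # True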
2 changes: 1 addition & 1 deletion src/tokenizer/version.py
@@ -1 +1 @@
__version__ = "3.2.0"
__version__ = "3.3.0"
