Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
fix(preprocessor): add min_len to split preprocessor
Browse files Browse the repository at this point in the history
  • Loading branch information
hanhxiao committed Aug 29, 2019
1 parent c83448b commit 31bffeb
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions gnes/preprocessor/text/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@


class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self, max_sent_len: int = 256,
def __init__(self,
min_sent_len: int = 8,
max_sent_len: int = 256,
deliminator: str = '.!?。!?',
is_json: bool = False,
*args, **kwargs):
super().__init__(*args, **kwargs)
self.min_sent_len = min_sent_len
self.max_sent_len = max_sent_len
self.deliminator = deliminator
self.is_json = is_json
Expand All @@ -46,7 +49,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
for ci, (r, s, e) in enumerate(ret):
f = ''.join(filter(lambda x: x in string.printable, r))
f = re.sub('\n+', ' ', f).strip()
if f:
if len(f) > self.min_sent_len:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.text = f[:self.max_sent_len]
Expand Down

0 comments on commit 31bffeb

Please sign in to comment.