Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
fix(preprocessor): fix SentSplitPreprocessor
Browse files Browse the repository at this point in the history
  • Loading branch information
hanhxiao committed Aug 29, 2019
1 parent 522c5a4 commit 5828d20
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
21 changes: 16 additions & 5 deletions gnes/preprocessor/text/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,34 @@

import json
import re
import string

from ..base import BaseTextPreprocessor
from ...proto import gnes_pb2


class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self, max_sent_len: int = 256, *args, **kwargs):
def __init__(self, max_sent_len: int = 256,
             deliminator: str = '.!?。!?',
             is_json: bool = False,
             *args, **kwargs):
    """Configure how documents are decoded and split into sentences.

    :param max_sent_len: stored as ``self.max_sent_len``; its use is not
        visible in this hunk (presumably an upper bound on sentence
        length -- confirm against the rest of ``apply``).
    :param deliminator: the set of characters that terminate a sentence.
        ``apply`` inserts this string verbatim into a regex character
        class (``[^...]+[...]``), so characters that are special inside
        a class (``]``, ``^``, ``\\``, ``-``) are NOT escaped and must be
        supplied with care.
    :param is_json: when true, ``apply`` parses ``doc.raw_bytes`` as JSON,
        takes the ``'Content'`` entry as the text and stores the remaining
        entries in ``doc.meta_info``; when false, the decoded bytes are
        used as the text directly.
    :raises ValueError: if ``deliminator`` is empty.
    """
    # An empty delimiter set would turn the pattern built in ``apply``
    # into ``[^]+[]``, which does not describe sentence boundaries at all;
    # fail here, at configuration time, instead of splitting nonsensically.
    if not deliminator:
        raise ValueError('deliminator must contain at least one character')
    super().__init__(*args, **kwargs)
    self.max_sent_len = max_sent_len
    self.deliminator = deliminator
    self.is_json = is_json

def apply(self, doc: 'gnes_pb2.Document') -> None:
super().apply(doc)
d = json.loads(doc.raw_bytes.decode())
doc.raw_text = d.pop('Content')
doc.meta_info = json.dumps(d).encode()
d = doc.raw_bytes.decode()
if self.is_json:
d = json.loads(d)
doc.raw_text = d.pop('Content')
doc.meta_info = json.dumps(d).encode()
else:
doc.raw_text = d

ret = [(m.group(0), m.start(), m.end()) for m in re.finditer(r'[^.!?]+[.!?]', doc.raw_text)]
ret = [(m.group(0), m.start(), m.end()) for m in
re.finditer(r'[^{0}]+[{0}]'.format(self.deliminator), doc.raw_text)]
for ci, (r, s, e) in enumerate(ret):
f = ''.join(filter(lambda x: x in string.printable, r))
f = re.sub('\n+', ' ', f).strip()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_preprocessor_service_realdata(self):
for v in fp:
if v.strip():
d = msg.request.train.docs.add()
d.raw_text = v
d.raw_bytes = v.encode()
all_text += v
with PreprocessorService(args), ZmqClient(c_args) as client:
client.send_message(msg)
Expand Down

0 comments on commit 5828d20

Please sign in to comment.