This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

refactor(router): separate router and scoring logics
hanhxiao committed Aug 28, 2019
1 parent 9a791c2 commit 1ac8e6f
Showing 48 changed files with 354 additions and 380 deletions.
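
At a glance, the diffs below rename the indexer base classes (BaseVectorIndexer → BaseChunkIndexer, BaseTextIndexer → BaseDocIndexer), move the indexer modules (gnes.indexer.vector.* → gnes.indexer.chunk.*, gnes.indexer.fulltext.* → gnes.indexer.doc.*), replace PunctSplitPreprocessor with SentSplitPreprocessor, and switch most preprocessors from chunk.offset_1d to chunk.offset. A minimal sketch of how downstream code might adapt, assuming only the renames visible in these diffs (MyChunkIndexer is a hypothetical subclass, not part of the commit):

    # Old: from gnes.indexer.base import BaseVectorIndexer, BaseTextIndexer
    from gnes.indexer.base import BaseChunkIndexer, BaseDocIndexer

    class MyChunkIndexer(BaseChunkIndexer):
        """Hypothetical chunk-level (vector) indexer built on the renamed base class."""

        def add(self, keys, vectors, weights, *args, **kwargs):
            ...  # keys are (doc_id, offset) pairs, vectors is an np.ndarray

        def query(self, keys, top_k, *args, **kwargs):
            ...  # return the top_k matches per query vector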
4 changes: 2 additions & 2 deletions gnes/component.py
@@ -14,9 +14,9 @@
PipelineEncoder = encoder_base.PipelineEncoder

# Indexer
BaseVectorIndexer = indexer_base.BaseVectorIndexer
BaseChunkIndexer = indexer_base.BaseChunkIndexer
BaseIndexer = indexer_base.BaseIndexer
BaseTextIndexer = indexer_base.BaseTextIndexer
BaseDocIndexer = indexer_base.BaseDocIndexer
BaseKeyIndexer = indexer_base.BaseKeyIndexer
JointIndexer = indexer_base.JointIndexer

20 changes: 10 additions & 10 deletions gnes/indexer/__init__.py
@@ -18,18 +18,18 @@
from ..base import register_all_class

_cls2file_map = {
'FaissIndexer': 'vector.faiss',
'LVDBIndexer': 'fulltext.leveldb',
'AsyncLVDBIndexer': 'fulltext.leveldb',
'NumpyIndexer': 'vector.numpy',
'BIndexer': 'vector.bindexer',
'HBIndexer': 'vector.hbindexer',
'FaissIndexer': 'chunk.faiss',
'LVDBIndexer': 'doc.leveldb',
'AsyncLVDBIndexer': 'doc.leveldb',
'NumpyIndexer': 'chunk.numpy',
'BIndexer': 'chunk.bindexer',
'HBIndexer': 'chunk.hbindexer',
'JointIndexer': 'base',
'BaseIndexer': 'base',
'BaseTextIndexer': 'base',
'AnnoyIndexer': 'vector.annoy',
'DirectoryIndexer': 'fulltext.filesys',
'DictIndexer': 'fulltext.dict'
'BaseDocIndexer': 'base',
'AnnoyIndexer': 'chunk.annoy',
'DirectoryIndexer': 'doc.filesys',
'DictIndexer': 'doc.dict'
}

register_all_class(_cls2file_map, 'indexer')
8 changes: 4 additions & 4 deletions gnes/indexer/base.py
@@ -33,7 +33,7 @@ def normalize_score(self, *args, **kwargs):
pass


class BaseVectorIndexer(BaseIndexer):
class BaseChunkIndexer(BaseIndexer):

def add(self, keys: List[Tuple[int, int]], vectors: np.ndarray, weights: List[float], *args, **kwargs):
pass
@@ -42,7 +42,7 @@ def query(self, keys: np.ndarray, top_k: int, *args, **kwargs) -> List[List[Tupl
pass


class BaseTextIndexer(BaseIndexer):
class BaseDocIndexer(BaseIndexer):

def add(self, keys: List[int], docs: Any, weights: List[float], *args, **kwargs):
pass
@@ -80,9 +80,9 @@ def components(self, comps: Callable[[], Union[list, dict]]):
self._binary_indexer = None
self._doc_indexer = None
for c in self.components:
if isinstance(c, BaseVectorIndexer):
if isinstance(c, BaseChunkIndexer):
self._binary_indexer = c
elif isinstance(c, BaseTextIndexer):
elif isinstance(c, BaseDocIndexer):
self._doc_indexer = c
if not self._binary_indexer or not self._doc_indexer:
raise ValueError('"JointIndexer" requires a valid pair of "BaseBinaryIndexer" and "BaseTextIndexer"')
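
The components setter above requires the JointIndexer to be composed of both a chunk-level and a doc-level indexer. A standalone sketch of that pairing rule (a hypothetical helper mirroring the diff above, not part of the commit):

    from gnes.indexer.base import BaseChunkIndexer, BaseDocIndexer

    def split_components(comps):
        """Return (chunk_indexer, doc_indexer), mirroring JointIndexer.components."""
        binary_indexer, doc_indexer = None, None
        for c in comps:
            if isinstance(c, BaseChunkIndexer):
                binary_indexer = c
            elif isinstance(c, BaseDocIndexer):
                doc_indexer = c
        if not binary_indexer or not doc_indexer:
            raise ValueError('a valid pair of chunk and doc indexers is required')
        return binary_indexer, doc_indexer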
File renamed without changes.
4 changes: 2 additions & 2 deletions gnes/indexer/vector/annoy.py → gnes/indexer/chunk/annoy.py
@@ -18,11 +18,11 @@

import numpy as np

from ..base import BaseVectorIndexer
from ..base import BaseChunkIndexer
from ..key_only import ListKeyIndexer


class AnnoyIndexer(BaseVectorIndexer):
class AnnoyIndexer(BaseChunkIndexer):

def __init__(self, num_dim: int, data_path: str, metric: str = 'angular', n_trees=10, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -20,10 +20,10 @@
import numpy as np

from .cython import IndexCore
from ...base import BaseVectorIndexer
from ...base import BaseChunkIndexer


class BIndexer(BaseVectorIndexer):
class BIndexer(BaseChunkIndexer):

def __init__(self,
num_bytes: int = None,
@@ -19,10 +19,18 @@

# pylint: disable=low-comment-ratio

from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
from cpython cimport array
from libc.stdlib cimport qsort
from libc.stdio cimport fopen, fclose, FILE, fwrite, fread
from cpython cimport array
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
from libc.stdio cimport fopen, fclose, FILE, fwrite, fread
from libc.stdlib cimport qsort

cdef extern from "limits.h":
cdef int USHRT_MAX
4 changes: 2 additions & 2 deletions gnes/indexer/vector/faiss.py → gnes/indexer/chunk/faiss.py
@@ -19,11 +19,11 @@

import numpy as np

from ..base import BaseVectorIndexer
from ..base import BaseChunkIndexer
from ..key_only import ListKeyIndexer


class FaissIndexer(BaseVectorIndexer):
class FaissIndexer(BaseChunkIndexer):

def __init__(self, num_dim: int, index_key: str, data_path: str, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -20,10 +20,10 @@
import numpy as np

from .cython import IndexCore
from ...base import BaseVectorIndexer
from ...base import BaseChunkIndexer


class HBIndexer(BaseVectorIndexer):
class HBIndexer(BaseChunkIndexer):

def __init__(self,
num_clusters: int = 100,
@@ -20,10 +20,12 @@
# pylint: disable=low-comment-ratio


from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
from cpython cimport array
from libc.stdlib cimport qsort
from libc.stdio cimport fopen, fclose, FILE, fwrite, fread
from cpython cimport array
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free


cdef extern from "limits.h":
4 changes: 2 additions & 2 deletions gnes/indexer/vector/numpy.py → gnes/indexer/chunk/numpy.py
@@ -18,11 +18,11 @@

import numpy as np

from ..base import BaseVectorIndexer
from ..base import BaseChunkIndexer
from ..key_only import ListKeyIndexer


class NumpyIndexer(BaseVectorIndexer):
class NumpyIndexer(BaseChunkIndexer):

def __init__(self, num_bytes: int = None, *args, **kwargs):
super().__init__()
File renamed without changes.
4 changes: 2 additions & 2 deletions gnes/indexer/fulltext/dict.py → gnes/indexer/doc/dict.py
@@ -2,11 +2,11 @@

from google.protobuf.json_format import MessageToJson, Parse

from ..base import BaseTextIndexer
from ..base import BaseDocIndexer
from ...proto import gnes_pb2


class DictIndexer(BaseTextIndexer):
class DictIndexer(BaseDocIndexer):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -19,11 +19,11 @@
import os
from typing import List

from ..base import BaseTextIndexer
from ..base import BaseDocIndexer
from ...proto import gnes_pb2


class DirectoryIndexer(BaseTextIndexer):
class DirectoryIndexer(BaseDocIndexer):

def __init__(self, data_path: str,
keep_na_doc: bool = True,
@@ -18,11 +18,11 @@
from threading import Thread, Event
from typing import List, Any

from ..base import BaseTextIndexer
from ..base import BaseDocIndexer
from ...proto import gnes_pb2


class LVDBIndexer(BaseTextIndexer):
class LVDBIndexer(BaseDocIndexer):

def __init__(self, data_path: str,
keep_na_doc: bool = True,
2 changes: 1 addition & 1 deletion gnes/preprocessor/__init__.py
@@ -20,7 +20,7 @@
_cls2file_map = {
'BasePreprocessor': 'base',
'PipelinePreprocessor': 'base',
'PunctSplitPreprocessor': 'text.split_punct',
'SentSplitPreprocessor': 'text.split',
'BaseImagePreprocessor': 'base',
'BaseTextPreprocessor': 'base',
'VanillaSlidingPreprocessor': 'image.sliding_window',
2 changes: 1 addition & 1 deletion gnes/preprocessor/audio/audio_vanilla.py
@@ -39,7 +39,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.blob.CopyFrom(array2blob(np.array(chunks, dtype=np.float32)))
c.offset_1d = ci
c.offset = ci
c.weight = 1 / len(audio)
else:
self.logger.info('bad document: no audio extracted')
2 changes: 1 addition & 1 deletion gnes/preprocessor/base.py
@@ -81,7 +81,7 @@ def apply(self, doc: 'gnes_pb2.Document'):
super().apply(doc)
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.offset_1d = 0
c.offset = 0
c.weight = 1.
if doc.raw_bytes:
self.raw_to_chunk(c, doc.raw_bytes)
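
The same chunk bookkeeping recurs across the audio, image, and video preprocessors in this commit: each chunk records its parent doc_id, its position, and a weight. A minimal sketch, assuming the gnes_pb2 fields used in these diffs (the values are illustrative):

    from gnes.proto import gnes_pb2

    doc = gnes_pb2.Document()
    doc.doc_id = 42     # illustrative id
    c = doc.chunks.add()
    c.doc_id = doc.doc_id
    c.offset = 0        # position of the chunk within the document (was offset_1d)
    c.weight = 1.0      # relative importance of the chunk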
4 changes: 2 additions & 2 deletions gnes/preprocessor/image/segmentation.py
@@ -66,14 +66,14 @@ def apply(self, doc: 'gnes_pb2.Document'):
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.blob.CopyFrom(array2blob(self._crop(original_image, ele[0])))
c.offset_1d = ci
c.offset = ci
c.offset_nd.extend(self._get_seg_offset_nd(all_subareas, index, ele[0]))
c.weight = self._cal_area(ele[0]) / (original_image.size[0] * original_image.size[1])

c = doc.chunks.add()
c.doc_id = doc.doc_id
c.blob.CopyFrom(array2blob(np.array(original_image)))
c.offset_1d = len(chunks)
c.offset = len(chunks)
c.offset_nd.extend([100, 100])
c.weight = 1.
else:
2 changes: 1 addition & 1 deletion gnes/preprocessor/image/sliding_window.py
@@ -49,7 +49,7 @@ def apply(self, doc: 'gnes_pb2.Document'):
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.blob.CopyFrom(array2blob(ele[0]))
c.offset_1d = ci
c.offset = ci
c.offset_nd.extend(self._get_slid_offset_nd(all_subareas, index, center_point_list[ci]))
c.weight = ele[1]
else:
@@ -13,24 +13,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import re

from ..base import BaseTextPreprocessor
from ...proto import gnes_pb2


class PunctSplitPreprocessor(BaseTextPreprocessor):
def __init__(self, deliminator: str = r'[.。!?!?]+', *args, **kwargs):
class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self, max_sent_len: int = 256, *args, **kwargs):
super().__init__(*args, **kwargs)
self.deliminator = deliminator
self.max_sent_len = max_sent_len

def apply(self, doc: 'gnes_pb2.Document') -> None:
super().apply(doc)
doc.raw_text = doc.raw_bytes.decode().strip()
for ci, s in enumerate(re.split(self.deliminator, doc.raw_text)):
if s.strip():
d = json.loads(doc.raw_bytes.decode())
doc.raw_text = d.pop('Content')
doc.meta_info = json.dumps(d).encode()

ret = [(m.group(0), m.start(), m.end()) for m in re.finditer(r'[^.!?]+[.!?]', doc.raw_text)]
for ci, (r, s, e) in enumerate(ret):
f = ''.join(filter(lambda x: x in string.printable, r))
f = re.sub('\n+', ' ', f).strip()
if f:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.text = s.strip()
c.text = f[:self.max_sent_len]
c.offset_1d = ci
c.weight = len(c.text) / len(doc.raw_text)
c.offset_nd.extend([s, e])
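
To make the new splitting behaviour concrete, below is a standalone sketch of the regex-and-offset logic that SentSplitPreprocessor applies above (the sample text is made up; note that the logic relies on the string module being imported):

    import re
    import string

    raw_text = 'GNES is generic. It is also extensible! Is it fast?'
    # Find sentence-like spans together with their character offsets.
    ret = [(m.group(0), m.start(), m.end()) for m in re.finditer(r'[^.!?]+[.!?]', raw_text)]
    for ci, (r, s, e) in enumerate(ret):
        f = ''.join(filter(lambda x: x in string.printable, r))
        f = re.sub('\n+', ' ', f).strip()
        print(ci, (s, e), f)
    # 0 (0, 16) GNES is generic.
    # 1 (16, 39) It is also extensible!
    # 2 (39, 51) Is it fast?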
6 changes: 3 additions & 3 deletions gnes/preprocessor/video/ffmpeg.py
@@ -20,8 +20,8 @@

from ..base import BaseVideoPreprocessor, RawChunkPreprocessor
from ..helper import split_video_frames, phash_descriptor, get_gif
from ...proto import gnes_pb2, array2blob, blob2array
from ..io_utils import video as video_util
from ...proto import gnes_pb2, array2blob, blob2array


class FFmpegPreprocessor(BaseVideoPreprocessor):
@@ -62,7 +62,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.blob.CopyFrom(array2blob(chunk))
c.offset_1d = ci
c.offset = ci
c.weight = weight[ci]

else:
@@ -175,7 +175,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.blob.CopyFrom(array2blob(np.array(chunk, dtype=np.uint8)))
c.offset_1d = ci
c.offset = ci
c.weight = 1 / len(sub_videos)

else:
9 changes: 5 additions & 4 deletions gnes/preprocessor/video/shotdetect.py
@@ -13,13 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from typing import List

import numpy as np

from ..base import BaseVideoPreprocessor
from ...proto import gnes_pb2, array2blob
from ..io_utils import video as video_util
from ..helper import compute_descriptor, compare_descriptor, detect_peak_boundary, compare_ecr
from ..io_utils import video as video_util
from ...proto import gnes_pb2, array2blob


class ShotDetectPreprocessor(BaseVideoPreprocessor):
@@ -83,7 +84,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
# chunk_data = np.concatenate(frames, axis=0)
chunk_data = np.array(frames)
c.blob.CopyFrom(array2blob(chunk_data))
c.offset_1d = ci
c.offset = ci
c.weight = len(frames) / num_frames
else:
self.logger.error('bad document: "raw_bytes" is empty!')