refactor(server): remove redundant logics of rank (#715)
hanxiao authored May 11, 2022
1 parent edf0d86 commit 59c0698
Showing 5 changed files with 97 additions and 249 deletions.
server/clip_server/executors/clip_onnx.py (84 changes: 8 additions & 76 deletions)
@@ -3,19 +3,18 @@
 from functools import partial
 from multiprocessing.pool import ThreadPool
 from typing import Optional, Dict
-import numpy as np
-import onnxruntime as ort
 
+import onnxruntime as ort
 from jina import Executor, requests, DocumentArray
 
-from clip_server.model import clip
-from clip_server.model.clip_onnx import CLIPOnnxModel
 from clip_server.executors.helper import (
     split_img_txt_da,
     preproc_image,
     preproc_text,
-    numpy_softmax,
+    set_rank,
 )
+from clip_server.model import clip
+from clip_server.model.clip_onnx import CLIPOnnxModel
 
 
 class CLIPEncoder(Executor):
@@ -35,8 +34,6 @@ def __init__(
         self._minibatch_size = minibatch_size
 
         self._model = CLIPOnnxModel(name)
-        # Note: hard coded here since all the pretrained clip model use the same logit_scale parameter
-        self._logit_scale = np.exp(4.60517)
 
         import torch
 
@@ -84,77 +81,12 @@ def __init__(
 
     @requests(on='/rank')
     async def rank(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
-        _source = parameters.get('source', 'matches')
+        _source = parameters.get('source', '@m')
 
-        for d in docs:
-            _img_da = DocumentArray()
-            _txt_da = DocumentArray()
-            split_img_txt_da(d, _img_da, _txt_da)
-
-            candidates = getattr(d, _source)
-
-            for c in candidates:
-                split_img_txt_da(c, _img_da, _txt_da)
+        await self.encode(docs)
+        await self.encode(docs[_source])
 
-            if len(_img_da) != 1 and len(_txt_da) != 1:
-                raise ValueError(
-                    f'`d.{_source}` must be all in same modality, either all images or all text'
-                )
-            elif not _img_da or not _txt_da:
-                raise ValueError(
-                    f'`d` and `d.{_source}` must be in different modality, one is image one is text'
-                )
-            elif len(candidates) <= 1:
-                raise ValueError(
-                    f'`d.{_source}` must have more than one Documents to do ranking'
-                )
-            else:
-                _img_da = await self.encode(_img_da)
-                _txt_da = await self.encode(_txt_da)
-
-                # normalized features
-                image_features = _img_da.embeddings / np.linalg.norm(
-                    _img_da.embeddings, axis=1, keepdims=True
-                )
-                text_features = _txt_da.embeddings / np.linalg.norm(
-                    _txt_da.embeddings, axis=1, keepdims=True
-                )
-
-                # paired cosine similarity
-                scores_per_text = np.matmul(image_features, text_features.T)
-                scores_per_image = scores_per_text.T
-
-                if len(_img_da) == 1:
-                    cosine_scores = scores_per_text
-                elif len(_txt_da) == 1:
-                    cosine_scores = scores_per_image
-
-                softmax_scores = numpy_softmax(self._logit_scale * cosine_scores)
-
-                # squeeze scores
-                cosine_scores = cosine_scores[0]
-                softmax_scores = softmax_scores[0]
-
-                # drop embeddings
-                _img_da.embeddings = None
-                _txt_da.embeddings = None
-
-                for c, p, o in zip(candidates, softmax_scores, cosine_scores):
-                    c.scores['clip_score'].value = p
-                    c.scores['clip_score'].op_name = 'softmax'
-
-                    c.scores['clip_score_cosine'].value = o
-                    c.scores['clip_score_cosine'].op_name = 'cosine'
-
-                setattr(
-                    d,
-                    _source,
-                    sorted(
-                        candidates,
-                        key=lambda _m: _m.scores['clip_score'].value,
-                        reverse=True,
-                    ),
-                )
+        set_rank(docs, _source)
 
     @requests
     async def encode(self, docs: 'DocumentArray', **kwargs):
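All three executors now delegate scoring to a shared set_rank helper imported from clip_server/executors/helper.py (one of the five changed files, not expanded in this view). Its actual implementation is not shown here; the following is only a minimal sketch of what such a helper could look like, reconstructed from the per-executor blocks deleted above and assuming docarray-style Documents whose embeddings were already filled in by encode:

# Hypothetical sketch only -- the real set_rank lives in
# server/clip_server/executors/helper.py and may differ in detail.
import numpy as np


def numpy_softmax(x, axis=-1):
    # numerically stable softmax
    e_x = np.exp(x - x.max(axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)


def set_rank(docs, _source='@m'):
    # all pretrained CLIP checkpoints share the same logit scale, exp(4.60517) ~= 100
    logit_scale = np.exp(4.60517)

    for d in docs:
        # this sketch only resolves the default '@m' (matches) selector
        candidates = d.matches

        # cosine similarity between the query and every candidate
        q = d.embedding / np.linalg.norm(d.embedding)
        c = candidates.embeddings / np.linalg.norm(
            candidates.embeddings, axis=1, keepdims=True
        )
        cosine_scores = c @ q

        softmax_scores = numpy_softmax(logit_scale * cosine_scores)

        for m, p, o in zip(candidates, softmax_scores, cosine_scores):
            m.scores['clip_score'].value = p
            m.scores['clip_score'].op_name = 'softmax'
            m.scores['clip_score_cosine'].value = o
            m.scores['clip_score_cosine'].op_name = 'cosine'

        # best candidate first, as the deleted code did with sorted(..., reverse=True)
        d.matches = sorted(
            candidates, key=lambda _m: _m.scores['clip_score'].value, reverse=True
        )

Folding the scale into the helper is also why the hard-coded self._logit_scale = np.exp(4.60517) (and its torch counterpart self._model.logit_scale.exp()) disappears from each executor's __init__ in this commit.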
server/clip_server/executors/clip_tensorrt.py (87 changes: 10 additions & 77 deletions)
@@ -1,17 +1,18 @@
-from typing import Dict
-from multiprocessing.pool import ThreadPool
 from functools import partial
+from multiprocessing.pool import ThreadPool
+from typing import Dict
+
 import numpy as np
 from jina import Executor, requests, DocumentArray
 
-from clip_server.model import clip
-from clip_server.model.clip_trt import CLIPTensorRTModel
 from clip_server.executors.helper import (
     split_img_txt_da,
     preproc_image,
     preproc_text,
-    numpy_softmax,
+    set_rank,
 )
+from clip_server.model import clip
+from clip_server.model.clip_trt import CLIPTensorRTModel
 
 
 class CLIPEncoder(Executor):
@@ -46,82 +47,14 @@ def __init__(
 
         self._model.start_engines()
 
-        # Note: hard coded here since all the pretrained clip model use the same logit_scale parameter
-        self._logit_scale = np.exp(4.60517)
-
     @requests(on='/rank')
     async def rank(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
-        _source = parameters.get('source', 'matches')
+        _source = parameters.get('source', '@m')
 
-        for d in docs:
-            _img_da = DocumentArray()
-            _txt_da = DocumentArray()
-            split_img_txt_da(d, _img_da, _txt_da)
+        await self.encode(docs)
+        await self.encode(docs[_source])
 
-            candidates = getattr(d, _source)
-
-            for c in candidates:
-                split_img_txt_da(c, _img_da, _txt_da)
-
-            if len(_img_da) != 1 and len(_txt_da) != 1:
-                raise ValueError(
-                    f'`d.{_source}` must be all in same modality, either all images or all text'
-                )
-            elif not _img_da or not _txt_da:
-                raise ValueError(
-                    f'`d` and `d.{_source}` must be in different modality, one is image one is text'
-                )
-            elif len(candidates) <= 1:
-                raise ValueError(
-                    f'`d.{_source}` must have more than one Documents to do ranking'
-                )
-            else:
-                _img_da = await self.encode(_img_da)
-                _txt_da = await self.encode(_txt_da)
-
-                # normalized features
-                image_features = _img_da.embeddings / np.linalg.norm(
-                    _img_da.embeddings, axis=1, keepdims=True
-                )
-                text_features = _txt_da.embeddings / np.linalg.norm(
-                    _txt_da.embeddings, axis=1, keepdims=True
-                )
-
-                # cosine similarity as rank score
-                scores_per_text = np.matmul(image_features, text_features.T)
-                scores_per_image = scores_per_text.T
-
-                if len(_img_da) == 1:
-                    cosine_scores = scores_per_text
-                elif len(_txt_da) == 1:
-                    cosine_scores = scores_per_image
-
-                softmax_scores = numpy_softmax(self._logit_scale * cosine_scores)
-
-                # squeeze scores
-                softmax_scores = softmax_scores[0]
-                cosine_scores = cosine_scores[0]
-
-                # drop embeddings
-                _img_da.embeddings = None
-                _txt_da.embeddings = None
-
-                for c, p, o in zip(candidates, softmax_scores, cosine_scores):
-                    c.scores['clip_score'].value = p
-                    c.scores['clip_score'].op_name = 'softmax'
-
-                    c.scores['clip_score_cosine'].value = o
-                    c.scores['clip_score_cosine'].op_name = 'cosine'
-
-                setattr(
-                    d,
-                    _source,
-                    sorted(
-                        candidates,
-                        key=lambda _m: _m.scores['clip_score'].value,
-                        reverse=True,
-                    ),
-                )
+        set_rank(docs, _source)
 
     @requests
     async def encode(self, docs: 'DocumentArray', **kwargs):
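Note that the default source changes from the attribute name 'matches' to the DocumentArray path selector '@m'. With the selector, docs[_source] yields a flat view of every match under every query document, so the two encode calls above embed all queries and all candidates in batched passes instead of looping document by document. A small illustration of that selector, with made-up texts and URIs (assuming docarray's '@'-path syntax as exposed by jina's DocumentArray):

from jina import Document, DocumentArray

d1 = Document(text='a photo of a dog')
d1.matches = [Document(uri='dog.jpg'), Document(uri='cat.jpg')]

d2 = Document(text='a photo of a cat')
d2.matches = [Document(uri='cat.jpg'), Document(uri='car.jpg')]

docs = DocumentArray([d1, d2])

print(len(docs))        # 2 query documents
print(len(docs['@m']))  # 4 -> every match of every query, flattened into one view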
server/clip_server/executors/clip_torch.py (93 changes: 12 additions & 81 deletions)
@@ -1,17 +1,21 @@
 import os
 import warnings
 from functools import partial
 
 from multiprocessing.pool import ThreadPool
 from typing import Optional, Dict
 
 import numpy as np
 import torch
-from clip_server.model import clip
-from clip_server.executors.helper import split_img_txt_da, preproc_image, preproc_text
-
 from jina import Executor, requests, DocumentArray
-
+from clip_server.executors.helper import (
+    split_img_txt_da,
+    preproc_image,
+    preproc_text,
+    set_rank,
+)
+from clip_server.model import clip
+
 
 class CLIPEncoder(Executor):
     def __init__(
@@ -53,91 +57,18 @@ def __init__(
         self._model, self._preprocess_tensor = clip.load(
             name, device=self._device, jit=jit
         )
-        self._logit_scale = self._model.logit_scale.exp()
 
         self._pool = ThreadPool(processes=num_worker_preprocess)
 
     @requests(on='/rank')
     async def rank(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
-        import torch
-
-        _source = parameters.get('source', 'matches')
-
-        for d in docs:
-            _img_da = DocumentArray()
-            _txt_da = DocumentArray()
-            split_img_txt_da(d, _img_da, _txt_da)
-
-            candidates = getattr(d, _source)
+        _source = parameters.get('source', '@m')
 
-            for c in candidates:
-                split_img_txt_da(c, _img_da, _txt_da)
+        await self.encode(docs)
+        await self.encode(docs[_source])
 
-            if len(_img_da) != 1 and len(_txt_da) != 1:
-                raise ValueError(
-                    f'`d.{_source}` must be all in same modality, either all images or all text'
-                )
-            elif not _img_da or not _txt_da:
-                raise ValueError(
-                    f'`d` and `d.{_source}` must be in different modality, one is image one is text'
-                )
-            elif len(candidates) <= 1:
-                raise ValueError(
-                    f'`d.{_source}` must have more than one Documents to do ranking'
-                )
-            else:
-                _img_da = await self.encode(_img_da)
-                _txt_da = await self.encode(_txt_da)
-                _img_da.embeddings = torch.from_numpy(_img_da.embeddings).to(
-                    self._device, non_blocking=True
-                )
-                _txt_da.embeddings = torch.from_numpy(_txt_da.embeddings).to(
-                    self._device, non_blocking=True
-                )
-
-                # normalized features
-                image_features = _img_da.embeddings / _img_da.embeddings.norm(
-                    dim=-1, keepdim=True
-                )
-                text_features = _txt_da.embeddings / _txt_da.embeddings.norm(
-                    dim=-1, keepdim=True
-                )
-
-                # paired cosine between image and text
-                scores_per_text = image_features @ text_features.t()
-                scores_per_image = scores_per_text.t()
-
-                if len(_img_da) == 1:
-                    cosine_scores = scores_per_text
-                elif len(_txt_da) == 1:
-                    cosine_scores = scores_per_image
-
-                softmax_scores = self._logit_scale * cosine_scores
-                softmax_scores = softmax_scores.softmax(dim=-1)
-
-                # squeeze scores
-                cosine_scores = cosine_scores.cpu().detach().numpy().squeeze()
-                softmax_scores = softmax_scores.cpu().detach().numpy().squeeze()
-
-                _img_da.embeddings = None
-                _txt_da.embeddings = None
-
-                for c, p, o in zip(candidates, softmax_scores, cosine_scores):
-                    c.scores['clip_score'].value = p
-                    c.scores['clip_score'].op_name = 'softmax'
-
-                    c.scores['clip_score_cosine'].value = o
-                    c.scores['clip_score_cosine'].op_name = 'cosine'
-
-                setattr(
-                    d,
-                    _source,
-                    sorted(
-                        candidates,
-                        key=lambda _m: _m.scores['clip_score'].value,
-                        reverse=True,
-                    ),
-                )
+        set_rank(docs, _source)
 
     @requests
     async def encode(self, docs: 'DocumentArray', **kwargs):
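From the client side the endpoint is exercised the same way as before: each query Document comes back with its candidates scored under clip_score / clip_score_cosine and, following the logic shown above, re-sorted best-first. A rough usage sketch against a running server with the jina Client, where the port, texts and URI are placeholders:

from jina import Client, Document

# one image query with three text candidates attached as matches
doc = Document(uri='photo-of-my-dog.jpg')
doc.matches = [Document(text='a dog'), Document(text='a cat'), Document(text='a car')]

c = Client(port=51000)  # placeholder port of a running clip_server flow
result = c.post(on='/rank', inputs=[doc], parameters={'source': '@m'})

for m in result[0].matches:
    print(m.text, m.scores['clip_score'].value)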
