feat: add fp16 inference support (torch/onnx) #871

Merged 40 commits from add-fp16-inference-support into main on Dec 8, 2022
Changes from 36 commits
Commits (40), all by OrangeSodahub:

326e265  feat: add fp16 inference in clip_torch (Dec 4, 2022)
b0bca12  Revert "feat: add fp16 inference in clip_torch" (Dec 4, 2022)
2e09165  feat: add fp16 inference in clip_torch (Dec 4, 2022)
6191813  fix: device (Dec 4, 2022)
a15bd61  fix: str to torch.dtype (Dec 4, 2022)
71e170a  fix: layernorm (Dec 4, 2022)
28f6622  feat: add fp16 inference in clip_trt (Dec 4, 2022)
3477fae  feat: add fp16 inference in clip_onnx (Dec 4, 2022)
a5fff1d  fix: housekeeping (Dec 5, 2022)
4f6025c  fix: ci (Dec 5, 2022)
43bf259  fix: ci (Dec 6, 2022)
12ae920  fix: ci (Dec 6, 2022)
b559109  fix: ci and get test path (Dec 6, 2022)
8cd0bb0  fix: dtype amp and gpu test dependency (Dec 6, 2022)
4d891f0  fix: layernorm (Dec 6, 2022)
a193381  fix: cast dtype in visiontransformer (Dec 6, 2022)
c502679  fix: clip_onnx (Dec 6, 2022)
3c14f5a  fix: clip_onnx (Dec 6, 2022)
6b56623  fix: convert onnx to fp16 (Dec 6, 2022)
9984b4b  fix: dtype in preproc images (Dec 6, 2022)
2f8ed28  fix: dtype in preproc images (Dec 6, 2022)
d25a396  fix: typo (Dec 6, 2022)
f0f8b43  fix: dtype in clip_torch and fp16 in trt (Dec 6, 2022)
d47d2c9  fix: remove plain text in trt_test (Dec 6, 2022)
d3b2ff6  fix: test (Dec 6, 2022)
e597565  fix: typo (Dec 6, 2022)
f72fd99  fix: stash (Dec 6, 2022)
cb271fb  Revert "fix: stash" (Dec 6, 2022)
d199efc  fix: for test (Dec 7, 2022)
e96c31e  fix: onnx (Dec 7, 2022)
9df79a9  fix: for test (Dec 7, 2022)
9b8f60c  fix: for test (Dec 7, 2022)
4779780  fix: trt (Dec 7, 2022)
35defc9  fix: convert onnx to fp16 before convert trt (Dec 7, 2022)
20bbaa1  Merge branch 'main' into add-fp16-inference-support (Dec 7, 2022)
6d221cd  fix: discard changes in trt (Dec 8, 2022)
fc36e9b  fix: optimize fp16 test (Dec 8, 2022)
edf4629  fix: move __cast_dtype__ (Dec 8, 2022)
aa574d0  Revert "fix: move __cast_dtype__" (Dec 8, 2022)
bd1fe7c  fix: ci (Dec 8, 2022)
.github/workflows/ci.yml (4 changes: 3 additions & 1 deletion)

@@ -113,7 +113,6 @@ jobs:
pip install --no-cache-dir "server/[onnx]"
pip install --no-cache-dir "server/[transformers]"
pip install --no-cache-dir "server/[search]"
pip install --no-cache-dir "server/[transformers]"
- name: Test
id: test
run: |
@@ -158,6 +157,7 @@ jobs:
python -m pip install wheel pytest pytest-cov nvidia-pyindex
pip install -e "client/[test]"
pip install -e "server/[tensorrt]"
pip install -e "server/[onnx]"
{
pip install -e "server/[flash-attn]"
} || {
@@ -168,6 +168,8 @@
run: |
pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \
-v -s -m "gpu" ./tests/test_tensorrt.py
pytest --suppress-no-test-exit-code --cov=clip_client --cov=clip_server --cov-report=xml \
-v -s -m "gpu" ./tests/test_fp16.py
echo "::set-output name=codecov_flag::cas"
timeout-minutes: 30
env:
scripts/get-all-test-paths.sh (2 changes: 1 addition & 1 deletion)

@@ -6,7 +6,7 @@ BATCH_SIZE=3
#declare -a array1=( "tests/unit/test_*.py" )
#declare -a array2=( $(ls -d tests/unit/*/ | grep -v '__pycache__' | grep -v 'array') )
#declare -a array3=( "tests/unit/array/*.py" )
declare -a mixins=( $(find tests -name "test_*.py" | grep -v 'test_tensorrt.py') )
declare -a mixins=( $(find tests -name "test_*.py" | grep -v 'test_tensorrt.py' | grep -v 'test_fp16.py') )
declare -a array4=( "$(echo "${mixins[@]}" | xargs -n$BATCH_SIZE)" )
# array5 is currently empty because in the array/ directory, mixins is the only directory
# but add the following in case new directories are created in array/
server/clip_server/executors/clip_onnx.py (20 changes: 12 additions & 8 deletions)

@@ -27,6 +27,7 @@ def __init__(
minibatch_size: int = 32,
access_paths: str = '@r',
model_path: Optional[str] = None,
dtype: Optional[str] = None,
**kwargs,
):
"""
@@ -41,8 +42,17 @@
:param model_path: The path to the model to be used. If not specified, the model will be downloaded or loaded
from the local cache. Visit https://clip-as-service.jina.ai/user-guides/server/#use-custom-model-for-onnx
to learn how to finetune custom models.
:param dtype: inference data type, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
"""
super().__init__(**kwargs)
import torch

if not device:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self._device = device
if not dtype:
dtype = 'fp32' if self._device in ('cpu', torch.device('cpu')) else 'fp16'
self._dtype = dtype

self._minibatch_size = minibatch_size
self._access_paths = access_paths
@@ -55,18 +65,11 @@ def __init__(
self._num_worker_preprocess = num_worker_preprocess
self._pool = ThreadPool(processes=num_worker_preprocess)

self._model = CLIPOnnxModel(name, model_path)
self._model = CLIPOnnxModel(name, model_path, dtype)
self._tokenizer = Tokenizer(name)

self._image_transform = clip._transform_ndarray(self._model.image_size)

import torch

if not device:
self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
self._device = device

# define the priority order for the execution providers
providers = ['CPUExecutionProvider']

@@ -116,6 +119,7 @@ def _preproc_images(self, docs: 'DocumentArray', drop_image_content: bool):
preprocess_fn=self._image_transform,
return_np=True,
drop_image_content=drop_image_content,
dtype=self._dtype,
)

def _preproc_texts(self, docs: 'DocumentArray'):
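For context, a minimal sketch of how the new dtype option could be exercised against the ONNX executor from a Flow. The Flow layout and port are illustrative and not part of this diff; it assumes the executor class exported by clip_server.executors.clip_onnx is CLIPEncoder and that the server runs on a CUDA device.

```python
# Hypothetical usage sketch, not part of this PR: serve the ONNX backend in fp16.
from jina import Flow
from clip_server.executors.clip_onnx import CLIPEncoder

f = Flow(port=51000).add(
    name='clip_onnx',
    uses=CLIPEncoder,
    uses_with={'dtype': 'fp16'},  # with the change above, leaving dtype unset also picks fp16 on GPU
)

with f:
    f.block()
```

Leaving dtype unset keeps the previous fp32 behaviour on CPU, per the default added in __init__ above.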
server/clip_server/executors/clip_torch.py (24 changes: 19 additions & 5 deletions)

@@ -2,7 +2,7 @@
import warnings
from functools import partial
from multiprocessing.pool import ThreadPool
from typing import Dict, Optional
from typing import Dict, Union, Optional

import numpy as np
import torch
@@ -12,6 +12,7 @@
set_rank,
split_img_txt_da,
)
from clip_server.helper import __cast_dtype__
from clip_server.model import clip
from clip_server.model.clip_model import CLIPModel
from clip_server.model.tokenization import Tokenizer
@@ -28,6 +29,7 @@ def __init__(
num_worker_preprocess: int = 4,
minibatch_size: int = 32,
access_paths: str = '@r',
dtype: Optional[Union[str, torch.dtype]] = None,
**kwargs,
):
"""
@@ -40,6 +42,7 @@
number if you encounter OOM errors.
:param access_paths: The access paths to traverse on the input documents to get the images and texts to be
processed. Visit https://docarray.jina.ai/fundamentals/documentarray/access-elements for more details.
:param dtype: inference data type, if None defaults to torch.float32 if device == 'cpu' else torch.float16.
"""
super().__init__(**kwargs)

@@ -52,9 +55,17 @@
self._access_paths = kwargs['traversal_paths']

if not device:
self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
self._device = device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self._device = device
if isinstance(dtype, str):
dtype = __cast_dtype__.get(dtype)
elif not dtype:
dtype = (
torch.float32
if self._device in ('cpu', torch.device('cpu'))
else torch.float16
)
self._dtype = dtype

if not self._device.startswith('cuda') and (
'OMP_NUM_THREADS' not in os.environ
@@ -77,7 +88,9 @@ def __init__(
self._num_worker_preprocess = num_worker_preprocess
self._pool = ThreadPool(processes=num_worker_preprocess)

self._model = CLIPModel(name, device=self._device, jit=jit, **kwargs)
self._model = CLIPModel(
name, device=self._device, jit=jit, dtype=dtype, **kwargs
)
self._tokenizer = Tokenizer(name)
self._image_transform = clip._transform_ndarray(self._model.image_size)

@@ -96,6 +109,7 @@ def _preproc_images(self, docs: 'DocumentArray', drop_image_content: bool):
device=self._device,
return_np=False,
drop_image_content=drop_image_content,
dtype=self._dtype,
)

def _preproc_texts(self, docs: 'DocumentArray'):
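A standalone sketch of the dtype-resolution rule the torch executor gains above. The mapping is copied from clip_server.helper in this PR; the function name resolve_dtype is illustrative and only restates the branching shown in the diff.

```python
# Illustrative re-statement of the dtype resolution added to the torch executor.
import torch

__cast_dtype__ = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}


def resolve_dtype(device, dtype=None):
    # explicit string names win; otherwise the device decides the default
    if isinstance(dtype, str):
        return __cast_dtype__.get(dtype)
    if dtype is None:
        return (
            torch.float32
            if device in ('cpu', torch.device('cpu'))
            else torch.float16
        )
    return dtype


assert resolve_dtype('cpu') is torch.float32
assert resolve_dtype('cuda') is torch.float16
assert resolve_dtype('cuda', dtype='fp32') is torch.float32
```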
server/clip_server/executors/helper.py (9 changes: 7 additions & 2 deletions)

@@ -1,8 +1,9 @@
from typing import Tuple, List, Callable, Any, Dict
from typing import Tuple, List, Callable, Any, Dict, Union
import torch
import numpy as np
from docarray import Document, DocumentArray
from docarray.math.distance.numpy import cosine
from clip_server.helper import __cast_dtype__


from clip_server.model.tokenization import Tokenizer
@@ -22,8 +23,12 @@ def preproc_image(
device: str = 'cpu',
return_np: bool = False,
drop_image_content: bool = False,
dtype: Union[str, torch.dtype] = torch.float32,
) -> Tuple['DocumentArray', Dict]:

if isinstance(dtype, str):
dtype = __cast_dtype__.get(dtype)

tensors_batch = []

for d in da:
@@ -42,7 +47,7 @@
if drop_image_content:
d.pop('blob', 'tensor')

tensors_batch = torch.stack(tensors_batch).type(torch.float32)
tensors_batch = torch.stack(tensors_batch).type(dtype)

if return_np:
tensors_batch = tensors_batch.cpu().numpy()
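The practical effect of the preproc_image change is that the stacked image batch now follows the requested dtype instead of always being float32. A small sketch of that behaviour; the batch size and image shape are illustrative assumptions.

```python
# Illustrative effect of the cast in preproc_image: the batch dtype follows the executor.
import torch

dtype = torch.float16                               # e.g. resolved from the 'fp16' string
tensors_batch = [torch.rand(3, 224, 224) for _ in range(4)]

batch = torch.stack(tensors_batch).type(dtype)      # previously hard-coded to torch.float32
assert batch.dtype is torch.float16

# the ONNX executor additionally asks for a numpy array, which keeps the same dtype
onnx_input = batch.cpu().numpy()
assert onnx_input.dtype.name == 'float16'
```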
server/clip_server/helper.py (4 changes: 4 additions & 0 deletions)

@@ -2,6 +2,7 @@
import os
import sys
import threading
import torch
from packaging.version import Version
from urllib.request import Request, urlopen

@@ -19,6 +20,9 @@
)


__cast_dtype__ = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}


def _version_check(package: str = None, github_repo: str = None):
try:

server/clip_server/model/clip_onnx.py (23 changes: 21 additions & 2 deletions)

@@ -1,5 +1,5 @@
import os
from typing import Dict
from typing import Dict, Optional

from clip_server.model.pretrained_models import (
download_model,
@@ -201,8 +201,11 @@


class CLIPOnnxModel(BaseCLIPModel):
def __init__(self, name: str, model_path: str = None):
def __init__(
self, name: str, model_path: str = None, dtype: Optional[str] = 'fp32'
):
super().__init__(name)
self._dtype = dtype
if name in _MODELS:
if not model_path:
cache_dir = os.path.expanduser(
@@ -237,6 +240,22 @@ def __init__(self, name: str, model_path: str = None):
f'The given model path {model_path} should be a folder containing both '
f'`textual.onnx` and `visual.onnx`.'
)
if dtype == 'fp16':
import onnx
from onnxmltools.utils import float16_converter

_textual_model_fp16 = (
float16_converter.convert_float_to_float16_model_path(
self._textual_path
)
)
_visual_model_fp16 = (
float16_converter.convert_float_to_float16_model_path(
self._visual_path
)
)
onnx.save_model(_textual_model_fp16, self._textual_path)
onnx.save_model(_visual_model_fp16, self._visual_path)
else:
raise RuntimeError(
'CLIP model {} not found or not supports ONNX backend; below is a list of all available models:\n{}'.format(
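The fp16 branch above relies on onnxmltools, which this PR adds to the onnx extra in setup.py. As a standalone sketch, the same conversion looks roughly like this; the file names are illustrative and assume the fp32 graphs already exist on disk.

```python
# Standalone sketch of the in-place fp32 -> fp16 conversion done when dtype == 'fp16'.
import onnx
from onnxmltools.utils import float16_converter

for path in ('textual.onnx', 'visual.onnx'):         # illustrative paths
    model_fp16 = float16_converter.convert_float_to_float16_model_path(path)
    onnx.save_model(model_fp16, path)                 # overwrite the fp32 graph, as the executor does
```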
server/clip_server/model/model.py (14 changes: 12 additions & 2 deletions)

@@ -15,6 +15,7 @@
from dataclasses import dataclass
from typing import Tuple, Union, Optional
from copy import deepcopy
from clip_server.helper import __cast_dtype__
from open_clip.transformer import QuickGELU, LayerNorm, LayerNormFp32, Attention
from open_clip.timm_model import TimmModel
from open_clip.factory import _MODEL_CONFIGS
@@ -81,6 +82,11 @@ def __init__(
super().__init__(image_size, patch_size, output_dim=output_dim, **kwargs)
self.transformer = Transformer(dtype=dtype, **kwargs)

def forward(self, x: torch.Tensor):
dtype = self.transformer.get_cast_dtype()
x = x.to(dtype)
return super().forward(x)


class TextTransformer(_TextTransformer):
def __init__(
@@ -435,7 +441,9 @@ def load_openai_model(
preprocess : Callable[[PIL.Image], torch.Tensor]
A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
"""
if dtype is None:
if isinstance(dtype, str):
dtype = __cast_dtype__.get(dtype, 'amp')
elif dtype is None:
dtype = (
torch.float32 if device in ('cpu', torch.device('cpu')) else torch.float16
)
@@ -550,7 +558,9 @@ def load_openclip_model(
pretrained_image: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
):
if dtype is None:
if isinstance(dtype, str):
dtype = __cast_dtype__.get(dtype)
elif dtype is None:
dtype = (
torch.float32 if device in ('cpu', torch.device('cpu')) else torch.float16
)
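The new VisionTransformer.forward override exists because preprocessing can hand over float32 pixels while the transformer weights are float16. A toy illustration of that cast; the modules below are stand-ins, not the real CLIP vision tower.

```python
# Toy illustration of the input cast added to VisionTransformer.forward.
import torch
import torch.nn as nn


class ToyTransformer(nn.Module):
    def __init__(self, dtype=torch.float16):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(8, dtype=dtype))

    def get_cast_dtype(self) -> torch.dtype:
        return self.scale.dtype

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * self.scale                      # elementwise, so fp16 works on CPU too


class ToyVisionTower(nn.Module):
    def __init__(self):
        super().__init__()
        self.transformer = ToyTransformer()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        dtype = self.transformer.get_cast_dtype()
        x = x.to(dtype)                            # mirrors the cast added in this PR
        return self.transformer(x)


out = ToyVisionTower()(torch.rand(2, 8))           # input arrives as float32
assert out.dtype is torch.float16
```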
server/setup.py (1 change: 1 addition & 0 deletions)

@@ -53,6 +53,7 @@
'onnx': [
'onnxruntime',
'onnx',
'onnxmltools',
]
+ (['onnxruntime-gpu>=1.8.0'] if sys.platform != 'darwin' else []),
'tensorrt': ['nvidia-tensorrt'],
tests/test_fp16.py (63 changes: 63 additions & 0 deletions)

@@ -0,0 +1,63 @@
import os

import pytest
from docarray import Document, DocumentArray
from jina import Flow

from clip_client.client import Client


@pytest.mark.gpu
@pytest.mark.parametrize(
'inputs',
[
['hello, world', 'goodbye, world'],
('hello, world', 'goodbye, world'),
lambda: ('hello, world' for _ in range(10)),
[
'https://docarray.jina.ai/_static/favicon.png',
f'{os.path.dirname(os.path.abspath(__file__))}/img/00000.jpg',
'hello, world',
],
],
)
def test_plain_inputs(make_flow, inputs):
c = Client(server=f'grpc://0.0.0.0:{make_flow.port}')
r = c.encode(inputs if not callable(inputs) else inputs())
assert (
r.shape[0] == len(list(inputs)) if not callable(inputs) else len(list(inputs()))
)


@pytest.mark.gpu
@pytest.mark.parametrize(
'inputs',
[
[Document(text='hello, world'), Document(text='goodbye, world')],
DocumentArray([Document(text='hello, world'), Document(text='goodbye, world')]),
lambda: (Document(text='hello, world') for _ in range(10)),
DocumentArray(
[
Document(uri='https://docarray.jina.ai/_static/favicon.png'),
Document(
uri=f'{os.path.dirname(os.path.abspath(__file__))}/img/00000.jpg'
),
Document(text='hello, world'),
Document(
uri=f'{os.path.dirname(os.path.abspath(__file__))}/img/00000.jpg'
).load_uri_to_image_tensor(),
]
),
DocumentArray.from_files(
f'{os.path.dirname(os.path.abspath(__file__))}/**/*.jpg'
),
],
)
def test_docarray_inputs(make_flow, inputs):
c = Client(server=f'grpc://0.0.0.0:{make_flow.port}')
r = c.encode(inputs if not callable(inputs) else inputs())
assert isinstance(r, DocumentArray)
assert r.embeddings.shape
assert not r[0].tensor
if hasattr(inputs, '__len__'):
assert inputs[0] is r[0]
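The tests above depend on a make_flow fixture that is not part of this diff. A plausible conftest-style sketch is shown below; the executor choice, dtype value, port, and fixture scope are assumptions about what the repository's tests/conftest.py provides, not facts from this PR.

```python
# Hypothetical sketch of the make_flow fixture used by tests/test_fp16.py.
import pytest
from jina import Flow

from clip_server.executors.clip_torch import CLIPEncoder


@pytest.fixture(scope='session')
def make_flow():
    with Flow(port=51009).add(uses=CLIPEncoder, uses_with={'dtype': 'fp16'}) as f:
        yield f
```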