feat: support large ONNX model files #828

Merged
14 commits merged on Sep 26, 2022
12 changes: 6 additions & 6 deletions docs/user-guides/server.md
@@ -90,12 +90,12 @@ Please also note that **different models give different sizes of output dimensio
| ViT-L-14::laion400m_e32 | ✅ | ✅ | ❌ | 768 | 1631 | 3.42 | 2.03 |
| ViT-L-14::laion2B-s32B-b82K | ✅ | ✅ | ❌ | 768 | 1631 | 3.43 | 2.03 |
| ViT-L-14-336::openai | ✅ | ✅ | ❌ | 768 | 891 | 3.74 | 2.23 |
| ViT-H-14::laion2B-s32B-b79K | ✅ | 🚧 | ❌ | 1024 | 3762 | 4.45 | 3.26 |
| ViT-g-14::laion2B-s12B-b42K | ✅ | 🚧 | ❌ | 1024 | 5214 | 5.16 | 4.00 |
| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | 🚧 | 🚧 | 512 | 4284 | 5.37 | 1.68 |
| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 4293 | 4.30 | 4.97 |
| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | 🚧 | 🚧 | 640 | 4293 | 4.30 | 4.13 |
| M-CLIP/LABSE-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 3609 | 4.30 | 4.70 |
| ViT-H-14::laion2B-s32B-b79K | ✅ | ✅ | ❌ | 1024 | 3762 | 4.45 | 3.26 |
| ViT-g-14::laion2B-s12B-b42K | ✅ | ✅ | ❌ | 1024 | 5214 | 5.16 | 4.00 |
| M-CLIP/LABSE-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 3609 | 4.30 | 4.70 |
| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | ✅ | 🚧 | 512 | 4284 | 5.37 | 1.68 |
| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | ✅ | 🚧 | 640 | 4293 | 4.30 | 4.13 |
| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 4293 | 4.30 | 4.97 |

✅ = Supported — 🚧 = Work in progress — ❌ = Not supported
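
With this change the ONNX column flips from 🚧 to ✅ for the large and multilingual models above. A minimal sketch of loading one of them through the ONNX backend (assuming `start_sessions` forwards its keyword arguments to `onnxruntime.InferenceSession`, as the clip_onnx.py diff below shows):

```python
# Minimal sketch: load one of the newly supported large models via the ONNX backend.
# Assumes the CPU provider; `start_sessions` forwards its keyword arguments
# to `onnxruntime.InferenceSession` (see the clip_onnx.py diff below).
from clip_server.model.clip_onnx import CLIPOnnxModel

model = CLIPOnnxModel('ViT-H-14::laion2B-s32B-b79K')  # fetches textual.onnx and visual.zip
model.start_sessions(providers=['CPUExecutionProvider'])
```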

69 changes: 63 additions & 6 deletions server/clip_server/model/clip_onnx.py
@@ -117,6 +117,14 @@
('ViT-L-14@336px/textual.onnx', '78fab479f136403eed0db46f3e9e7ed2'),
('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
),
'ViT-H-14::laion2B-s32B-b79K': (
('ViT-H-14-laion2B-s32B-b79K/textual.onnx', '41e73c0c871d0e8e5d5e236f917f1ec3'),
('ViT-H-14-laion2B-s32B-b79K/visual.zip', '38151ea5985d73de94520efef38db4e7'),
),
'ViT-g-14::laion2B-s12B-b42K': (
('ViT-g-14-laion2B-s12B-b42K/textual.onnx', 'e597b7ab4414ecd92f715d47e79a033f'),
('ViT-g-14-laion2B-s12B-b42K/visual.zip', '6d0ac4329de9b02474f4752a5d16ba82'),
),
# older version name format
'RN50': (
('RN50/textual.onnx', '722418bfe47a1f5c79d1f44884bb3103'),
@@ -155,10 +163,40 @@
('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
),
# MultilingualCLIP models
# 'M-CLIP/LABSE-Vit-L-14': (
# ('M-CLIP-LABSE-Vit-L-14/textual.onnx', 'b5b649f9e064457c764874e982bca296'),
# ('M-CLIP-LABSE-Vit-L-14/visual.onnx', '471951562303c9afbb804b865eedf149'),
# ),
'M-CLIP/LABSE-Vit-L-14': (
('M-CLIP-LABSE-Vit-L-14/textual.onnx', '03727820116e63c7d19c72bb5d839488'),
('M-CLIP-LABSE-Vit-L-14/visual.onnx', 'a78028eab30084c3913edfb0c8411f15'),
),
'M-CLIP/XLM-Roberta-Large-Vit-B-32': (
(
'M-CLIP-XLM-Roberta-Large-Vit-B-32/textual.zip',
'41f51ec9af4754d11c7b7929e2caf5b9',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-B-32/visual.onnx',
'5f18f68ac94e294863bfd1f695c8c5ca',
),
),
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': (
(
'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus/textual.zip',
'6c3e55f7d2d6c12f2c1f1dd36fdec607',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus/visual.onnx',
'467a3ef3e5f50abcf850c3db9e705f8e',
),
),
'M-CLIP/XLM-Roberta-Large-Vit-L-14': (
(
'M-CLIP-XLM-Roberta-Large-Vit-L-14/textual.zip',
'3dff00335dc3093acb726dab975ae57d',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-L-14/visual.onnx',
'a78028eab30084c3913edfb0c8411f15',
),
),
}
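
Each new entry pairs a file path on the model hub with its MD5 checksum, so a downloaded archive can be validated before use. A minimal, illustrative checksum helper (the function name and its use here are assumptions, not the server's actual download path):

```python
# Illustrative only: compute an MD5 digest to compare against the checksums above.
import hashlib

def md5sum(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# e.g. md5sum('ViT-H-14-laion2B-s32B-b79K/visual.zip') should equal
# '38151ea5985d73de94520efef38db4e7' per the mapping above.
```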


@@ -226,10 +264,29 @@ def start_sessions(
):
import onnxruntime as ort

self._visual_session = ort.InferenceSession(self._visual_path, **kwargs)
def _load_session_from_zip(model_path: str, model_type: str):
"""Load a model from a zip file."""
import zipfile
import tempfile

with zipfile.ZipFile(
model_path, 'r'
) as zip_ref, tempfile.TemporaryDirectory() as tmp_dir:
zip_ref.extractall(tmp_dir)
return ort.InferenceSession(tmp_dir + f'/{model_type}.onnx', **kwargs)

if self._visual_path.endswith('.zip'):
self._visual_session = _load_session_from_zip(self._visual_path, 'visual')
else:
self._visual_session = ort.InferenceSession(self._visual_path, **kwargs)
self._visual_session.disable_fallback()

self._textual_session = ort.InferenceSession(self._textual_path, **kwargs)
if self._textual_path.endswith('.zip'):
self._textual_session = _load_session_from_zip(
self._textual_path, 'textual'
)
else:
self._textual_session = ort.InferenceSession(self._textual_path, **kwargs)
self._textual_session.disable_fallback()

def encode_image(self, image_input: Dict):
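
The `visual.zip` archives presumably exist because a single protobuf-serialized `.onnx` file is capped at 2 GB, so larger towers ship their weights as external data alongside the graph; `_load_session_from_zip` unpacks everything into a temporary directory and then creates the session. A hypothetical sketch of packaging such an archive (file names are assumptions for illustration):

```python
# Hypothetical packaging sketch, not part of this PR: zip an ONNX graph together with
# its external weight file so it can be shipped as a single blob. The loader above
# expects the files at the archive root, e.g. {tmp_dir}/visual.onnx after extraction.
import zipfile

files = ['visual.onnx', 'visual.onnx.data']  # as produced by an export with external data

with zipfile.ZipFile('visual.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    for name in files:
        zf.write(name, arcname=name)  # no directory prefix inside the archive
```
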
25 changes: 9 additions & 16 deletions server/clip_server/model/mclip_model.py
@@ -2,15 +2,15 @@

import transformers
import torch
import open_clip

from clip_server.model.clip_model import CLIPModel
from clip_server.model.openclip_model import OpenCLIPModel

_CLIP_MODEL_MAPS = {
'M-CLIP/XLM-Roberta-Large-Vit-B-32': ('ViT-B-32', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-L-14': ('ViT-L-14', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': ('ViT-B-16-plus-240', 'laion400m_e31'),
'M-CLIP/LABSE-Vit-L-14': ('ViT-L-14', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-B-32': 'ViT-B-32::openai',
'M-CLIP/XLM-Roberta-Large-Vit-L-14': 'ViT-L-14::openai',
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': 'ViT-B-16-plus-240::laion400m_e31',
'M-CLIP/LABSE-Vit-L-14': 'ViT-L-14::openai',
}


@@ -56,18 +56,11 @@ def __init__(self, name: str, device: str = 'cpu', jit: bool = False, **kwargs):
self._mclip_model = MultilingualCLIP.from_pretrained(name)
self._mclip_model.to(device=device)
self._mclip_model.eval()
self._model = OpenCLIPModel(_CLIP_MODEL_MAPS[name], device=device, jit=jit)

clip_name, clip_pretrained = _CLIP_MODEL_MAPS[name]
self._model = open_clip.create_model(
clip_name, pretrained=clip_pretrained, device=device, jit=jit
)
self._model.eval()

self._clip_name = clip_name

@property
def model_name(self):
return self._clip_name
@staticmethod
def get_model_name(name: str):
return _CLIP_MODEL_MAPS[name].split('::')[0]

def encode_text(
self, input_ids: 'torch.Tensor', attention_mask: 'torch.Tensor', **kwargs
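
The mapping now uses the same `name::pretrained` format as the rest of the server, the vision tower is built through `OpenCLIPModel` instead of calling `open_clip` directly, and `get_model_name` simply splits off the part before `::`. A rough usage sketch of the multilingual text side (the tokenizer choice and the output shape are assumptions):

```python
# Rough sketch: encode multilingual text with one of the M-CLIP models above.
# Assumes the Hugging Face repo ships a compatible tokenizer and that encode_text
# returns the projected text features.
import torch
import transformers
from clip_server.model.mclip_model import MultilingualCLIPModel

model = MultilingualCLIPModel('M-CLIP/LABSE-Vit-L-14', device='cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained('M-CLIP/LABSE-Vit-L-14')

batch = tokenizer(['ein kleines rotes Auto'], padding=True, return_tensors='pt')
with torch.no_grad():
    features = model.encode_text(
        input_ids=batch['input_ids'], attention_mask=batch['attention_mask']
    )
print(features.shape)  # expected: (1, 768), matching the table in server.md
```
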
19 changes: 16 additions & 3 deletions tests/test_model.py
@@ -1,5 +1,6 @@
import pytest
from clip_server.model.clip_model import CLIPModel
from clip_server.model.clip_onnx import CLIPOnnxModel
from clip_server.model.openclip_model import OpenCLIPModel
from clip_server.model.mclip_model import MultilingualCLIPModel

@@ -8,10 +9,22 @@
'name, model_cls',
[
('ViT-L/14@336px', OpenCLIPModel),
('RN101::openai', OpenCLIPModel),
('M-CLIP/XLM-Roberta-Large-Vit-B-32', MultilingualCLIPModel),
('RN50::openai', OpenCLIPModel),
('M-CLIP/LABSE-Vit-L-14', MultilingualCLIPModel),
],
)
def test_model_name(name, model_cls):
def test_torch_model(name, model_cls):
model = CLIPModel(name)
assert model.__class__ == model_cls


@pytest.mark.parametrize(
'name',
[
'RN50::openai',
'ViT-H-14::laion2B-s32B-b79K',
'M-CLIP/LABSE-Vit-L-14',
],
)
def test_onnx_model(name):
CLIPOnnxModel(name)