feat: support large ONNX model files #828

Merged
14 commits merged on Sep 26, 2022
12 changes: 6 additions & 6 deletions docs/user-guides/server.md
@@ -90,12 +90,12 @@ Please also note that **different models give different sizes of output dimensio
| ViT-L-14::laion400m_e32 | ✅ | ✅ | ❌ | 768 | 1631 | 3.42 | 2.03 |
| ViT-L-14::laion2B-s32B-b82K | ✅ | ✅ | ❌ | 768 | 1631 | 3.43 | 2.03 |
| ViT-L-14-336::openai | ✅ | ✅ | ❌ | 768 | 891 | 3.74 | 2.23 |
| ViT-H-14::laion2B-s32B-b79K | ✅ | 🚧 | ❌ | 1024 | 3762 | 4.45 | 3.26 |
| ViT-g-14::laion2B-s12B-b42K | ✅ | 🚧 | ❌ | 1024 | 5214 | 5.16 | 4.00 |
| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | 🚧 | 🚧 | 512 | 4284 | 5.37 | 1.68 |
| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 4293 | 4.30 | 4.97 |
| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | 🚧 | 🚧 | 640 | 4293 | 4.30 | 4.13 |
| M-CLIP/LABSE-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 3609 | 4.30 | 4.70 |
| ViT-H-14::laion2B-s32B-b79K | ✅ | ✅ | ❌ | 1024 | 3762 | 4.45 | 3.26 |
| ViT-g-14::laion2B-s12B-b42K | ✅ | ✅ | ❌ | 1024 | 5214 | 5.16 | 4.00 |
| M-CLIP/LABSE-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 3609 | 4.30 | 4.70 |
| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | ✅ | 🚧 | 512 | 4284 | 5.37 | 1.68 |
| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | ✅ | 🚧 | 640 | 4293 | 4.30 | 4.13 |
| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 4293 | 4.30 | 4.97 |

✅ = Supported — 🚧 = Work in progress — ❌ = Not supported
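
With this change the ONNX column flips from 🚧 to ✅ for the large and multilingual models above. A minimal sketch of loading one of them through the ONNX backend (assuming `start_sessions` forwards its keyword arguments to `onnxruntime.InferenceSession`, as the clip_onnx.py diff below shows):

```python
# Minimal sketch: load one of the newly supported large models via the ONNX backend.
# Assumes the CPU provider; `start_sessions` forwards its keyword arguments
# to `onnxruntime.InferenceSession` (see the clip_onnx.py diff below).
from clip_server.model.clip_onnx import CLIPOnnxModel

model = CLIPOnnxModel('ViT-H-14::laion2B-s32B-b79K')  # fetches textual.onnx and visual.zip
model.start_sessions(providers=['CPUExecutionProvider'])
```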

69 changes: 63 additions & 6 deletions server/clip_server/model/clip_onnx.py
@@ -117,6 +117,14 @@
('ViT-L-14@336px/textual.onnx', '78fab479f136403eed0db46f3e9e7ed2'),
('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
),
'ViT-H-14::laion2B-s32B-b79K': (
('ViT-H-14-laion2B-s32B-b79K/textual.onnx', '41e73c0c871d0e8e5d5e236f917f1ec3'),
('ViT-H-14-laion2B-s32B-b79K/visual.zip', '38151ea5985d73de94520efef38db4e7'),
),
'ViT-g-14::laion2B-s12B-b42K': (
('ViT-g-14-laion2B-s12B-b42K/textual.onnx', 'e597b7ab4414ecd92f715d47e79a033f'),
('ViT-g-14-laion2B-s12B-b42K/visual.zip', '6d0ac4329de9b02474f4752a5d16ba82'),
),
# older version name format
'RN50': (
('RN50/textual.onnx', '722418bfe47a1f5c79d1f44884bb3103'),
@@ -155,10 +163,40 @@
('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
),
# MultilingualCLIP models
# 'M-CLIP/LABSE-Vit-L-14': (
# ('M-CLIP-LABSE-Vit-L-14/textual.onnx', 'b5b649f9e064457c764874e982bca296'),
# ('M-CLIP-LABSE-Vit-L-14/visual.onnx', '471951562303c9afbb804b865eedf149'),
# ),
'M-CLIP/LABSE-Vit-L-14': (
('M-CLIP-LABSE-Vit-L-14/textual.onnx', '03727820116e63c7d19c72bb5d839488'),
('M-CLIP-LABSE-Vit-L-14/visual.onnx', 'a78028eab30084c3913edfb0c8411f15'),
),
'M-CLIP/XLM-Roberta-Large-Vit-B-32': (
(
'M-CLIP-XLM-Roberta-Large-Vit-B-32/textual.zip',
'41f51ec9af4754d11c7b7929e2caf5b9',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-B-32/visual.onnx',
'5f18f68ac94e294863bfd1f695c8c5ca',
),
),
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': (
(
'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus/textual.zip',
'6c3e55f7d2d6c12f2c1f1dd36fdec607',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus/visual.onnx',
'467a3ef3e5f50abcf850c3db9e705f8e',
),
),
'M-CLIP/XLM-Roberta-Large-Vit-L-14': (
(
'M-CLIP-XLM-Roberta-Large-Vit-L-14/textual.zip',
'3dff00335dc3093acb726dab975ae57d',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-L-14/visual.onnx',
'a78028eab30084c3913edfb0c8411f15',
),
),
}
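
Each new entry pairs a file path on the model hub with its MD5 checksum, so a downloaded archive can be validated before use. A minimal, illustrative checksum helper (the function name and its use here are assumptions, not the server's actual download path):

```python
# Illustrative only: compute an MD5 digest to compare against the checksums above.
import hashlib

def md5sum(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# e.g. md5sum('ViT-H-14-laion2B-s32B-b79K/visual.zip') should equal
# '38151ea5985d73de94520efef38db4e7' per the mapping above.
```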


@@ -226,10 +264,29 @@ def start_sessions(
):
import onnxruntime as ort

self._visual_session = ort.InferenceSession(self._visual_path, **kwargs)
def _load_session_from_zip(model_path: str, model_type: str):
"""Load a model from a zip file."""
import zipfile
import tempfile

with zipfile.ZipFile(
model_path, 'r'
) as zip_ref, tempfile.TemporaryDirectory() as tmp_dir:
zip_ref.extractall(tmp_dir)
return ort.InferenceSession(tmp_dir + f'/{model_type}.onnx', **kwargs)

if self._visual_path.endswith('.zip'):
self._visual_session = _load_session_from_zip(self._visual_path, 'visual')
else:
self._visual_session = ort.InferenceSession(self._visual_path, **kwargs)
self._visual_session.disable_fallback()

self._textual_session = ort.InferenceSession(self._textual_path, **kwargs)
if self._textual_path.endswith('.zip'):
self._textual_session = _load_session_from_zip(
self._textual_path, 'textual'
)
else:
self._textual_session = ort.InferenceSession(self._textual_path, **kwargs)
self._textual_session.disable_fallback()

def encode_image(self, image_input: Dict):
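
The `visual.zip` archives presumably exist because a single protobuf-serialized `.onnx` file is capped at 2 GB, so larger towers ship their weights as external data alongside the graph; `_load_session_from_zip` unpacks everything into a temporary directory and then creates the session. A hypothetical sketch of packaging such an archive (file names are assumptions for illustration):

```python
# Hypothetical packaging sketch, not part of this PR: zip an ONNX graph together with
# its external weight file so it can be shipped as a single blob. The loader above
# expects the files at the archive root, e.g. {tmp_dir}/visual.onnx after extraction.
import zipfile

files = ['visual.onnx', 'visual.onnx.data']  # as produced by an export with external data

with zipfile.ZipFile('visual.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    for name in files:
        zf.write(name, arcname=name)  # no directory prefix inside the archive
```
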
25 changes: 9 additions & 16 deletions server/clip_server/model/mclip_model.py
@@ -2,15 +2,15 @@

import transformers
import torch
import open_clip

from clip_server.model.clip_model import CLIPModel
from clip_server.model.openclip_model import OpenCLIPModel

_CLIP_MODEL_MAPS = {
'M-CLIP/XLM-Roberta-Large-Vit-B-32': ('ViT-B-32', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-L-14': ('ViT-L-14', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': ('ViT-B-16-plus-240', 'laion400m_e31'),
'M-CLIP/LABSE-Vit-L-14': ('ViT-L-14', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-B-32': 'ViT-B-32::openai',
'M-CLIP/XLM-Roberta-Large-Vit-L-14': 'ViT-L-14::openai',
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': 'ViT-B-16-plus-240::laion400m_e31',
'M-CLIP/LABSE-Vit-L-14': 'ViT-L-14::openai',
}


@@ -56,18 +56,11 @@ def __init__(self, name: str, device: str = 'cpu', jit: bool = False, **kwargs):
self._mclip_model = MultilingualCLIP.from_pretrained(name)
self._mclip_model.to(device=device)
self._mclip_model.eval()
self._model = OpenCLIPModel(_CLIP_MODEL_MAPS[name], device=device, jit=jit)

clip_name, clip_pretrained = _CLIP_MODEL_MAPS[name]
self._model = open_clip.create_model(
clip_name, pretrained=clip_pretrained, device=device, jit=jit
)
self._model.eval()

self._clip_name = clip_name

@property
def model_name(self):
return self._clip_name
@staticmethod
def get_model_name(name: str):
return _CLIP_MODEL_MAPS[name].split('::')[0]

def encode_text(
self, input_ids: 'torch.Tensor', attention_mask: 'torch.Tensor', **kwargs
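
The mapping now uses the same `name::pretrained` format as the rest of the server, the vision tower is built through `OpenCLIPModel` instead of calling `open_clip` directly, and `get_model_name` simply splits off the part before `::`. A rough usage sketch of the multilingual text side (the tokenizer choice and the output shape are assumptions):

```python
# Rough sketch: encode multilingual text with one of the M-CLIP models above.
# Assumes the Hugging Face repo ships a compatible tokenizer and that encode_text
# returns the projected text features.
import torch
import transformers
from clip_server.model.mclip_model import MultilingualCLIPModel

model = MultilingualCLIPModel('M-CLIP/LABSE-Vit-L-14', device='cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained('M-CLIP/LABSE-Vit-L-14')

batch = tokenizer(['ein kleines rotes Auto'], padding=True, return_tensors='pt')
with torch.no_grad():
    features = model.encode_text(
        input_ids=batch['input_ids'], attention_mask=batch['attention_mask']
    )
print(features.shape)  # expected: (1, 768), matching the table in server.md
```
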
19 changes: 16 additions & 3 deletions tests/test_model.py
@@ -1,5 +1,6 @@
import pytest
from clip_server.model.clip_model import CLIPModel
from clip_server.model.clip_onnx import CLIPOnnxModel
from clip_server.model.openclip_model import OpenCLIPModel
from clip_server.model.mclip_model import MultilingualCLIPModel

@@ -8,10 +9,22 @@
'name, model_cls',
[
('ViT-L/14@336px', OpenCLIPModel),
('RN101::openai', OpenCLIPModel),
('M-CLIP/XLM-Roberta-Large-Vit-B-32', MultilingualCLIPModel),
('RN50::openai', OpenCLIPModel),
('M-CLIP/LABSE-Vit-L-14', MultilingualCLIPModel),
],
)
def test_model_name(name, model_cls):
def test_torch_model(name, model_cls):
model = CLIPModel(name)
assert model.__class__ == model_cls


@pytest.mark.parametrize(
'name',
[
'RN50::openai',
'ViT-H-14::laion2B-s32B-b79K',
'M-CLIP/LABSE-Vit-L-14',
],
)
def test_onnx_model(name):
CLIPOnnxModel(name)