feat: support large ONNX model files (#828)
* fix: add 4 onnx models and md5

* fix: tmp remove H and L onnx model

* feat: support not yet supported onnx models

* docs: update onnx model

* fix: add md5

* test: load onnx model

* fix: md5

* fix: use custom openclip visual model load in mclip

* fix: load zip model

* fix: onnx local model test

* fix: onnx custom model test

* fix: typo

* fix: typo

* fix: apply comment
ZiniuYu authored Sep 26, 2022
1 parent 09d1548 commit 2ba8a4f
Showing 4 changed files with 94 additions and 31 deletions.
12 changes: 6 additions & 6 deletions docs/user-guides/server.md
@@ -90,12 +90,12 @@ Please also note that **different models give different sizes of output dimensio
| ViT-L-14::laion400m_e32 | ✅ | ✅ | ✅ | 768 | 1631 | 3.42 | 2.03 |
| ViT-L-14::laion2B-s32B-b82K | ✅ | ✅ | ✅ | 768 | 1631 | 3.43 | 2.03 |
| ViT-L-14-336::openai | ✅ | ✅ | ✅ | 768 | 891 | 3.74 | 2.23 |
| ViT-H-14::laion2B-s32B-b79K | ✅ | 🚧 | ❌ | 1024 | 3762 | 4.45 | 3.26 |
| ViT-g-14::laion2B-s12B-b42K | ✅ | 🚧 | ❌ | 1024 | 5214 | 5.16 | 4.00 |
| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | 🚧 | 🚧 | 512 | 4284 | 5.37 | 1.68 |
| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 4293 | 4.30 | 4.97 |
| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | 🚧 | 🚧 | 640 | 4293 | 4.30 | 4.13 |
| M-CLIP/LABSE-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 3609 | 4.30 | 4.70 |
| ViT-H-14::laion2B-s32B-b79K | ✅ | ✅ | ❌ | 1024 | 3762 | 4.45 | 3.26 |
| ViT-g-14::laion2B-s12B-b42K | ✅ | ✅ | ❌ | 1024 | 5214 | 5.16 | 4.00 |
| M-CLIP/LABSE-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 3609 | 4.30 | 4.70 |
| M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | ✅ | 🚧 | 512 | 4284 | 5.37 | 1.68 |
| M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | ✅ | 🚧 | 640 | 4293 | 4.30 | 4.13 |
| M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | ✅ | ❌ | 768 | 4293 | 4.30 | 4.97 |

✅ = Supported — 🚧 = Work in progress — ❌ = Not supported

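For context: a single .onnx file cannot exceed protobuf's 2 GB serialization limit, which is why the visual towers of ViT-H-14 and ViT-g-14 and the textual towers of the XLM-Roberta M-CLIP models are shipped as .zip archives bundling the graph with external weight data. Below is a minimal sketch of packaging such a model, assuming an already-exported export/visual.onnx with external data; the paths are hypothetical and this is not necessarily how the released bundles were produced.

import zipfile

import onnx

model = onnx.load('export/visual.onnx')  # external tensors are pulled into memory as well
onnx.save_model(
    model,
    'visual.onnx',
    save_as_external_data=True,    # keep the weights outside the protobuf
    all_tensors_to_one_file=True,  # one side file next to the graph
    location='visual.onnx.data',
)
with zipfile.ZipFile('visual.zip', 'w') as zf:
    zf.write('visual.onnx')
    zf.write('visual.onnx.data')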
69 changes: 63 additions & 6 deletions server/clip_server/model/clip_onnx.py
@@ -117,6 +117,14 @@
('ViT-L-14@336px/textual.onnx', '78fab479f136403eed0db46f3e9e7ed2'),
('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
),
'ViT-H-14::laion2B-s32B-b79K': (
('ViT-H-14-laion2B-s32B-b79K/textual.onnx', '41e73c0c871d0e8e5d5e236f917f1ec3'),
('ViT-H-14-laion2B-s32B-b79K/visual.zip', '38151ea5985d73de94520efef38db4e7'),
),
'ViT-g-14::laion2B-s12B-b42K': (
('ViT-g-14-laion2B-s12B-b42K/textual.onnx', 'e597b7ab4414ecd92f715d47e79a033f'),
('ViT-g-14-laion2B-s12B-b42K/visual.zip', '6d0ac4329de9b02474f4752a5d16ba82'),
),
# older version name format
'RN50': (
('RN50/textual.onnx', '722418bfe47a1f5c79d1f44884bb3103'),
@@ -155,10 +163,40 @@
('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
),
# MultilingualCLIP models
# 'M-CLIP/LABSE-Vit-L-14': (
# ('M-CLIP-LABSE-Vit-L-14/textual.onnx', 'b5b649f9e064457c764874e982bca296'),
# ('M-CLIP-LABSE-Vit-L-14/visual.onnx', '471951562303c9afbb804b865eedf149'),
# ),
'M-CLIP/LABSE-Vit-L-14': (
('M-CLIP-LABSE-Vit-L-14/textual.onnx', '03727820116e63c7d19c72bb5d839488'),
('M-CLIP-LABSE-Vit-L-14/visual.onnx', 'a78028eab30084c3913edfb0c8411f15'),
),
'M-CLIP/XLM-Roberta-Large-Vit-B-32': (
(
'M-CLIP-XLM-Roberta-Large-Vit-B-32/textual.zip',
'41f51ec9af4754d11c7b7929e2caf5b9',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-B-32/visual.onnx',
'5f18f68ac94e294863bfd1f695c8c5ca',
),
),
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': (
(
'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus/textual.zip',
'6c3e55f7d2d6c12f2c1f1dd36fdec607',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus/visual.onnx',
'467a3ef3e5f50abcf850c3db9e705f8e',
),
),
'M-CLIP/XLM-Roberta-Large-Vit-L-14': (
(
'M-CLIP-XLM-Roberta-Large-Vit-L-14/textual.zip',
'3dff00335dc3093acb726dab975ae57d',
),
(
'M-CLIP-XLM-Roberta-Large-Vit-L-14/visual.onnx',
'a78028eab30084c3913edfb0c8411f15',
),
),
}
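The digests paired with each file above are MD5 checksums (see the commit messages). A quick sketch of verifying a downloaded bundle by hand; the local path is hypothetical:

import hashlib

def md5sum(path: str) -> str:
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):  # hash in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

assert md5sum('ViT-H-14-laion2B-s32B-b79K/visual.zip') == '38151ea5985d73de94520efef38db4e7'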


@@ -226,10 +264,29 @@ def start_sessions(
):
import onnxruntime as ort

self._visual_session = ort.InferenceSession(self._visual_path, **kwargs)
def _load_session_from_zip(model_path: str, model_type: str):
"""Load a model from a zip file."""
import zipfile
import tempfile

with zipfile.ZipFile(
model_path, 'r'
) as zip_ref, tempfile.TemporaryDirectory() as tmp_dir:
zip_ref.extractall(tmp_dir)
return ort.InferenceSession(tmp_dir + f'/{model_type}.onnx', **kwargs)

if self._visual_path.endswith('.zip'):
self._visual_session = _load_session_from_zip(self._visual_path, 'visual')
else:
self._visual_session = ort.InferenceSession(self._visual_path, **kwargs)
self._visual_session.disable_fallback()

self._textual_session = ort.InferenceSession(self._textual_path, **kwargs)
if self._textual_path.endswith('.zip'):
self._textual_session = _load_session_from_zip(
self._textual_path, 'textual'
)
else:
self._textual_session = ort.InferenceSession(self._textual_path, **kwargs)
self._textual_session.disable_fallback()

def encode_image(self, image_input: Dict):
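A standalone sketch of the zip-loading path added above, handy for inspecting one of the large bundles outside the server; the archive path is hypothetical and must already exist locally. The archive is extracted into a temporary directory that is cleaned up as soon as the InferenceSession has been constructed.

import tempfile
import zipfile

import onnxruntime as ort

def load_session_from_zip(archive: str, model_type: str) -> ort.InferenceSession:
    # Mirrors the helper above: extract the graph and its external weight
    # files, then build a session while they are still on disk.
    with zipfile.ZipFile(archive, 'r') as zip_ref, tempfile.TemporaryDirectory() as tmp_dir:
        zip_ref.extractall(tmp_dir)
        return ort.InferenceSession(f'{tmp_dir}/{model_type}.onnx')

session = load_session_from_zip('ViT-H-14-laion2B-s32B-b79K/visual.zip', 'visual')
print([i.name for i in session.get_inputs()])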
25 changes: 9 additions & 16 deletions server/clip_server/model/mclip_model.py
@@ -2,15 +2,15 @@

import transformers
import torch
import open_clip

from clip_server.model.clip_model import CLIPModel
from clip_server.model.openclip_model import OpenCLIPModel

_CLIP_MODEL_MAPS = {
'M-CLIP/XLM-Roberta-Large-Vit-B-32': ('ViT-B-32', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-L-14': ('ViT-L-14', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': ('ViT-B-16-plus-240', 'laion400m_e31'),
'M-CLIP/LABSE-Vit-L-14': ('ViT-L-14', 'openai'),
'M-CLIP/XLM-Roberta-Large-Vit-B-32': 'ViT-B-32::openai',
'M-CLIP/XLM-Roberta-Large-Vit-L-14': 'ViT-L-14::openai',
'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus': 'ViT-B-16-plus-240::laion400m_e31',
'M-CLIP/LABSE-Vit-L-14': 'ViT-L-14::openai',
}


@@ -56,18 +56,11 @@ def __init__(self, name: str, device: str = 'cpu', jit: bool = False, **kwargs):
self._mclip_model = MultilingualCLIP.from_pretrained(name)
self._mclip_model.to(device=device)
self._mclip_model.eval()
self._model = OpenCLIPModel(_CLIP_MODEL_MAPS[name], device=device, jit=jit)

clip_name, clip_pretrained = _CLIP_MODEL_MAPS[name]
self._model = open_clip.create_model(
clip_name, pretrained=clip_pretrained, device=device, jit=jit
)
self._model.eval()

self._clip_name = clip_name

@property
def model_name(self):
return self._clip_name
@staticmethod
def get_model_name(name: str):
return _CLIP_MODEL_MAPS[name].split('::')[0]

def encode_text(
self, input_ids: 'torch.Tensor', attention_mask: 'torch.Tensor', **kwargs
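The refactor above replaces the (architecture, pretrained) tuples with the name::pretrained strings that OpenCLIPModel already understands, so the visual tower of an M-CLIP checkpoint now goes through the same loading path as every other OpenCLIP model. A small usage sketch of the new static helper:

from clip_server.model.mclip_model import MultilingualCLIPModel

# Resolve the OpenCLIP visual architecture behind an M-CLIP checkpoint.
print(MultilingualCLIPModel.get_model_name('M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'))
# expected output: ViT-B-16-plus-240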
19 changes: 16 additions & 3 deletions tests/test_model.py
@@ -1,5 +1,6 @@
import pytest
from clip_server.model.clip_model import CLIPModel
from clip_server.model.clip_onnx import CLIPOnnxModel
from clip_server.model.openclip_model import OpenCLIPModel
from clip_server.model.mclip_model import MultilingualCLIPModel

@@ -8,10 +9,22 @@
'name, model_cls',
[
('ViT-L/14@336px', OpenCLIPModel),
('RN101::openai', OpenCLIPModel),
('M-CLIP/XLM-Roberta-Large-Vit-B-32', MultilingualCLIPModel),
('RN50::openai', OpenCLIPModel),
('M-CLIP/LABSE-Vit-L-14', MultilingualCLIPModel),
],
)
def test_model_name(name, model_cls):
def test_torch_model(name, model_cls):
model = CLIPModel(name)
assert model.__class__ == model_cls


@pytest.mark.parametrize(
'name',
[
'RN50::openai',
'ViT-H-14::laion2B-s32B-b79K',
'M-CLIP/LABSE-Vit-L-14',
],
)
def test_onnx_model(name):
CLIPOnnxModel(name)
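The new test_onnx_model cases only construct the model, which is expected to download and checksum-verify the listed files; no inference session is started. A hedged sketch of going one step further locally, assuming the default download cache works and the machine has enough disk space and RAM for the ViT-H-14 bundle:

from clip_server.model.clip_onnx import CLIPOnnxModel

model = CLIPOnnxModel('ViT-H-14::laion2B-s32B-b79K')  # fetches textual.onnx and visual.zip
model.start_sessions(providers=['CPUExecutionProvider'])  # kwargs are forwarded to ort.InferenceSession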
