feat: support B/32, L/14, H/14, and g/14 trained on LAION-2B (#825)
* feat: support 4 new clip models pretrained on laion2b

* fix: add md5 for 4 new torch model

* fix: add 4 onnx models and md5

* fix: tmp remove H and L onnx model

* docs: 4 new large models

* docs: 4 new large models
ZiniuYu authored Sep 23, 2022
1 parent c690c24 commit 09d1548
Showing 6 changed files with 55 additions and 17 deletions.
docs/user-guides/server.md: 4 additions & 1 deletion
@@ -79,6 +79,7 @@ Please also note that **different models give different sizes of output dimensions**
 | ViT-B-32::laion2b_e16 | ✅ | ✅ | ✅ | 512 | 577 | 2.93 | 1.40 |
 | ViT-B-32::laion400m_e31 | ✅ | ✅ | ✅ | 512 | 577 | 2.93 | 1.40 |
 | ViT-B-32::laion400m_e32 | ✅ | ✅ | ✅ | 512 | 577 | 2.94 | 1.40 |
+| ViT-B-32::laion2B-s34B-b79K | ✅ | ✅ | ✅ | 512 | 577 | 2.94 | 1.40 |
 | ViT-B-16::openai | ✅ | ✅ | ✅ | 512 | 335 | 3.20 | 1.44 |
 | ViT-B-16::laion400m_e31 | ✅ | ✅ | ✅ | 512 | 571 | 2.93 | 1.44 |
 | ViT-B-16::laion400m_e32 | ✅ | ✅ | ✅ | 512 | 571 | 2.94 | 1.44 |
@@ -87,7 +88,10 @@ Please also note that **different models give different sizes of output dimensions**
 | ViT-L-14::openai | ✅ | ✅ | ✅ | 768 | 890 | 3.66 | 2.04 |
 | ViT-L-14::laion400m_e31 | ✅ | ✅ | ✅ | 768 | 1631 | 3.43 | 2.03 |
 | ViT-L-14::laion400m_e32 | ✅ | ✅ | ✅ | 768 | 1631 | 3.42 | 2.03 |
+| ViT-L-14::laion2B-s32B-b82K | ✅ | ✅ | ✅ | 768 | 1631 | 3.43 | 2.03 |
 | ViT-L-14-336::openai | ✅ | ✅ | ✅ | 768 | 891 | 3.74 | 2.23 |
+| ViT-H-14::laion2B-s32B-b79K | ✅ | 🚧 | ✅ | 1024 | 3762 | 4.45 | 3.26 |
+| ViT-g-14::laion2B-s12B-b42K | ✅ | 🚧 | ✅ | 1024 | 5214 | 5.16 | 4.00 |
 | M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | 🚧 | 🚧 | 512 | 4284 | 5.37 | 1.68 |
 | M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | 🚧 | ✅ | 768 | 4293 | 4.30 | 4.97 |
 | M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | 🚧 | 🚧 | 640 | 4293 | 4.30 | 4.13 |
@@ -192,7 +196,6 @@ Basically, each YAML file defines a [Jina Flow](https://docs.jina.ai/fundamental
 Looking at the YAML file again, we can put it into three subsections as below:
 
-
 
 ````{tab} CLIP model config
 ```{code-block} yaml
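The new checkpoints plug into the same config mechanism; only the model `name` changes. A minimal sketch of a Flow YAML selecting one of the LAION-2B checkpoints (the port and executor name here are illustrative, following the structure of the bundled torch-flow.yml):

```yaml
jtype: Flow
version: '1'
with:
  port: 51000
executors:
  - name: clip_t
    uses:
      jtype: CLIPEncoder
      with:
        name: ViT-L-14::laion2B-s32B-b82K  # any model tag from the table above
      metas:
        py_modules:
          - clip_server.executors.clip_torch
    timeout_ready: 3000000  # large checkpoints need generous startup time
    replicas: 1
```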
server/clip_server/model/clip_onnx.py: 8 additions & 0 deletions
@@ -61,6 +61,10 @@
         ('ViT-B-32-laion400m_e32/textual.onnx', '93284915937ba42a2b52ae8d3e5283a0'),
         ('ViT-B-32-laion400m_e32/visual.onnx', 'db220821a31fe9795fd8c2ba419078c5'),
     ),
+    'ViT-B-32::laion2B-s34B-b79K': (
+        ('ViT-B-32-laion2B-s34B-b79K/textual.onnx', '84af5ae53da56464c76e67fe50fddbe9'),
+        ('ViT-B-32-laion2B-s34B-b79K/visual.onnx', 'a2d4cbd1cf2632cd09ffce9b40bfd8bd'),
+    ),
     'ViT-B-16::openai': (
         ('ViT-B-16/textual.onnx', '6f0976629a446f95c0c8767658f12ebe'),
         ('ViT-B-16/visual.onnx', 'd5c03bfeef1abbd9bede54a8f6e1eaad'),
@@ -105,6 +109,10 @@
         ('ViT-L-14-laion400m_e32/textual.onnx', '8ba5b76ba71992923470c0261b10a67c'),
         ('ViT-L-14-laion400m_e32/visual.onnx', '49db3ba92bd816001e932530ad92d76c'),
     ),
+    'ViT-L-14::laion2B-s32B-b82K': (
+        ('ViT-L-14-laion2B-s32B-b82K/textual.onnx', 'da36a6cbed4f56abf576fdea8b6fe2ee'),
+        ('ViT-L-14-laion2B-s32B-b82K/visual.onnx', '1e337a190abba6a8650237dfae4740b7'),
+    ),
     'ViT-L-14-336::openai': (
         ('ViT-L-14@336px/textual.onnx', '78fab479f136403eed0db46f3e9e7ed2'),
         ('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
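Each entry pairs an ONNX file path with its MD5 digest, which lets the server validate a download before loading it. A minimal sketch of such a check (the helper name is hypothetical, not the actual clip_server API):

```python
import hashlib

def md5_matches(path: str, expected: str, chunk_size: int = 1 << 20) -> bool:
    """Hypothetical helper: hash the file in chunks and compare digests."""
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected

# Usage against one of the new entries:
# md5_matches('ViT-B-32-laion2B-s34B-b79K/textual.onnx',
#             '84af5ae53da56464c76e67fe50fddbe9')
```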
server/clip_server/model/model.py: 22 additions & 15 deletions
@@ -49,17 +49,17 @@ def __init__(self, inplanes, planes, stride=1):
         # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
         self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
         self.bn1 = nn.BatchNorm2d(planes)
-        self.relu1 = nn.ReLU(inplace=True)
+        self.act1 = nn.ReLU(inplace=True)
 
         self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
         self.bn2 = nn.BatchNorm2d(planes)
-        self.relu2 = nn.ReLU(inplace=True)
+        self.act2 = nn.ReLU(inplace=True)
 
         self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
 
         self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
-        self.relu3 = nn.ReLU(inplace=True)
+        self.act3 = nn.ReLU(inplace=True)
 
         self.downsample = None
         self.stride = stride
@@ -88,16 +88,16 @@ def __init__(self, inplanes, planes, stride=1):
     def forward(self, x: torch.Tensor):
         identity = x
 
-        out = self.relu1(self.bn1(self.conv1(x)))
-        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.act1(self.bn1(self.conv1(x)))
+        out = self.act2(self.bn2(self.conv2(out)))
         out = self.avgpool(out)
         out = self.bn3(self.conv3(out))
 
         if self.downsample is not None:
             identity = self.downsample(x)
 
         out += identity
-        out = self.relu3(out)
+        out = self.act3(out)
         return out


@@ -166,15 +166,15 @@ def __init__(self, layers, output_dim, heads, image_size=224, width=64):
             3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
         )
         self.bn1 = nn.BatchNorm2d(width // 2)
-        self.relu1 = nn.ReLU(inplace=True)
+        self.act1 = nn.ReLU(inplace=True)
         self.conv2 = nn.Conv2d(
             width // 2, width // 2, kernel_size=3, padding=1, bias=False
         )
         self.bn2 = nn.BatchNorm2d(width // 2)
-        self.relu2 = nn.ReLU(inplace=True)
+        self.act2 = nn.ReLU(inplace=True)
         self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
         self.bn3 = nn.BatchNorm2d(width)
-        self.relu3 = nn.ReLU(inplace=True)
+        self.act3 = nn.ReLU(inplace=True)
         self.avgpool = nn.AvgPool2d(2)
 
         # residual layers
@@ -226,9 +226,9 @@ def set_grad_checkpointing(self, enable=True):
         pass
 
     def stem(self, x):
-        x = self.relu1(self.bn1(self.conv1(x)))
-        x = self.relu2(self.bn2(self.conv2(x)))
-        x = self.relu3(self.bn3(self.conv3(x)))
+        x = self.act1(self.bn1(self.conv1(x)))
+        x = self.act2(self.bn2(self.conv2(x)))
+        x = self.act3(self.bn3(self.conv3(x)))
         x = self.avgpool(x)
         return x

@@ -273,30 +273,37 @@ def __init__(
         n_head: int,
         mlp_ratio: float = 4.0,
         act_layer: Callable = nn.GELU,
+        scale_cosine_attn: bool = False,
+        scale_heads: bool = False,
+        scale_attn: bool = False,
+        scale_fc: bool = False,
     ):
         super().__init__()
 
-        self.attn = nn.MultiheadAttention(d_model, n_head)
         self.ln_1 = LayerNorm(d_model)
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_attn = LayerNorm(d_model) if scale_attn else nn.Identity()
+
-        self.ln_2 = LayerNorm(d_model)
         mlp_width = int(d_model * mlp_ratio)
         self.mlp = nn.Sequential(
             OrderedDict(
                 [
                     ("c_fc", nn.Linear(d_model, mlp_width)),
+                    ('ln', LayerNorm(mlp_width) if scale_fc else nn.Identity()),
                     ("gelu", act_layer()),
                     ("c_proj", nn.Linear(mlp_width, d_model)),
                 ]
             )
         )
+        self.ln_2 = LayerNorm(d_model)
 
     def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
 
     def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         if attn_mask is not None:
             attn_mask = attn_mask.to(dtype=x.dtype, device=x.device)
-        x = x + self.attention(self.ln_1(x), attn_mask=attn_mask)
+        x = x + self.ln_attn(self.attention(self.ln_1(x), attn_mask=attn_mask))
         x = x + self.mlp(self.ln_2(x))
         return x
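The reordered `__init__` keeps the pre-LN residual layout, and the new flags default to off, so existing checkpoints load unchanged; with `scale_attn=True` an extra LayerNorm wraps the attention branch, and `scale_fc=True` inserts one inside the MLP. A quick shape check, assuming the class is importable from `clip_server.model.model`:

```python
import torch
from clip_server.model.model import ResidualAttentionBlock  # import path assumed

block = ResidualAttentionBlock(d_model=512, n_head=8, scale_attn=True, scale_fc=True)
x = torch.randn(77, 2, 512)  # (seq_len, batch, d_model); nn.MultiheadAttention defaults to seq-first
out = block(x)
assert out.shape == x.shape  # residual branches preserve the input shape
```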
server/clip_server/model/pretrained_models.py: 19 additions & 1 deletion
@@ -27,6 +27,10 @@
         'ViT-B-32-laion400m_e32.pt',
         '359e0dba4a419f175599ee0c63a110d8',
     ),
+    'ViT-B-32::laion2B-s34B-b79K': (
+        'ViT-B-32-laion2B-s34B-b79K.bin',
+        '2fc036aea9cd7306f5ce7ce6abb8d0bf',
+    ),
     'ViT-B-16::openai': ('ViT-B-16.pt', '44c3d804ecac03d9545ac1a3adbca3a6'),
     'ViT-B-16::laion400m_e31': (
         'ViT-B-16-laion400m_e31.pt',
@@ -53,7 +57,19 @@
         'ViT-L-14-laion400m_e32.pt',
         'a76cde1bc744ca38c6036b920c847a89',
     ),
+    'ViT-L-14::laion2B-s32B-b82K': (
+        'ViT-L-14-laion2B-s32B-b82K.bin',
+        '4d2275fc7b2d7ee9db174f9b57ddecbd',
+    ),
     'ViT-L-14-336::openai': ('ViT-L-14-336px.pt', 'b311058cae50cb10fbfa2a44231c9473'),
+    'ViT-H-14::laion2B-s32B-b79K': (
+        'ViT-H-14-laion2B-s32B-b79K.bin',
+        '2aa6c46521b165a0daeb8cdc6668c7d3',
+    ),
+    'ViT-g-14::laion2B-s12B-b42K': (
+        'ViT-g-14-laion2B-s12B-b42K.bin',
+        '3bf99353f6f1829faac0bb155be4382a',
+    ),
     # older version name format
     'RN50': ('RN50.pt', '9140964eaaf9f68c95aa8df6ca13777c'),
     'RN101': ('RN101.pt', 'fa9d5f64ebf152bc56a18db245071014'),
@@ -81,10 +97,12 @@
     'RN50x64': 448,
     'ViT-B-32': 224,
     'ViT-B-16': 224,
+    'Vit-B-16Plus': 240,
     'ViT-B-16-plus-240': 240,
     'ViT-L-14': 224,
     'ViT-L-14-336': 336,
-    'Vit-B-16Plus': 240,
+    'ViT-H-14': 224,
+    'ViT-g-14': 224,
 }


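The `architecture::pretrain-tag` naming convention means the input resolution depends only on the part before `::`, which is what the image-size mapping above keys on. A small sketch of the lookup (the dict's variable name is not shown in this hunk; `_VISUAL_MODEL_IMAGE_SIZE` below is an assumption):

```python
# Assumed dict name; contents taken from the hunk above.
_VISUAL_MODEL_IMAGE_SIZE = {
    'ViT-L-14': 224,
    'ViT-L-14-336': 336,
    'ViT-H-14': 224,
    'ViT-g-14': 224,
}

def input_resolution(model_name: str) -> int:
    """Strip the pretrain tag and look up the visual input size."""
    arch = model_name.split('::')[0]
    return _VISUAL_MODEL_IMAGE_SIZE[arch]

print(input_resolution('ViT-g-14::laion2B-s12B-b42K'))  # 224
```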
server/clip_server/onnx-flow.yml: 1 addition & 0 deletions
@@ -9,4 +9,5 @@ executors:
       metas:
         py_modules:
           - clip_server.executors.clip_onnx
+    timeout_ready: 3000000
     replicas: 1
server/clip_server/torch-flow.yml: 1 addition & 0 deletions
@@ -9,4 +9,5 @@ executors:
       metas:
         py_modules:
           - clip_server.executors.clip_torch
+    timeout_ready: 3000000
     replicas: 1
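`timeout_ready: 3000000` is in milliseconds, so the executor now gets up to 50 minutes to download and load a checkpoint (ViT-g-14 alone is ~5.2 GB per the table above) before the Flow declares it failed. Once the server is up, querying is unchanged; a minimal client sketch, assuming a Flow listening on the gRPC port used in the config example earlier:

```python
from clip_client import Client

c = Client('grpc://0.0.0.0:51000')  # endpoint assumed; match your Flow's port
embeddings = c.encode(['a photo of a cat', 'an apple on the table'])
print(embeddings.shape)  # (2, 768) when serving a ViT-L/14 checkpoint
```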
