diff --git a/docs/user-guides/server.md b/docs/user-guides/server.md
index c8609c6cf..659716b07 100644
--- a/docs/user-guides/server.md
+++ b/docs/user-guides/server.md
@@ -79,6 +79,7 @@ Please also note that **different models give different sizes of output dimensio
 | ViT-B-32::laion2b_e16 | ✅ | ✅ | ✅ | 512 | 577 | 2.93 | 1.40 |
 | ViT-B-32::laion400m_e31 | ✅ | ✅ | ✅ | 512 | 577 | 2.93 | 1.40 |
 | ViT-B-32::laion400m_e32 | ✅ | ✅ | ✅ | 512 | 577 | 2.94 | 1.40 |
+| ViT-B-32::laion2B-s34B-b79K | ✅ | ✅ | ❌ | 512 | 577 | 2.94 | 1.40 |
 | ViT-B-16::openai | ✅ | ✅ | ✅ | 512 | 335 | 3.20 | 1.44 |
 | ViT-B-16::laion400m_e31 | ✅ | ✅ | ✅ | 512 | 571 | 2.93 | 1.44 |
 | ViT-B-16::laion400m_e32 | ✅ | ✅ | ✅ | 512 | 571 | 2.94 | 1.44 |
@@ -87,7 +88,10 @@ Please also note that **different models give different sizes of output dimensio
 | ViT-L-14::openai | ✅ | ✅ | ❌ | 768 | 890 | 3.66 | 2.04 |
 | ViT-L-14::laion400m_e31 | ✅ | ✅ | ❌ | 768 | 1631 | 3.43 | 2.03 |
 | ViT-L-14::laion400m_e32 | ✅ | ✅ | ❌ | 768 | 1631 | 3.42 | 2.03 |
+| ViT-L-14::laion2B-s32B-b82K | ✅ | ✅ | ❌ | 768 | 1631 | 3.43 | 2.03 |
 | ViT-L-14-336::openai | ✅ | ✅ | ❌ | 768 | 891 | 3.74 | 2.23 |
+| ViT-H-14::laion2B-s32B-b79K | ✅ | 🚧 | ❌ | 1024 | 3762 | 4.45 | 3.26 |
+| ViT-g-14::laion2B-s12B-b42K | ✅ | 🚧 | ❌ | 1024 | 5214 | 5.16 | 4.00 |
 | M-CLIP/XLM-Roberta-Large-Vit-B-32 | ✅ | 🚧 | 🚧 | 512 | 4284 | 5.37 | 1.68 |
 | M-CLIP/XLM-Roberta-Large-Vit-L-14 | ✅ | 🚧 | ❌ | 768 | 4293 | 4.30 | 4.97 |
 | M-CLIP/XLM-Roberta-Large-Vit-B-16Plus | ✅ | 🚧 | 🚧 | 640 | 4293 | 4.30 | 4.13 |
@@ -192,7 +196,6 @@ Basically, each YAML file defines a [Jina Flow](https://docs.jina.ai/fundamental
 
 Looking at the YAML file again, we can put it into three subsections as below:
 
-
 ````{tab} CLIP model config
 
 ```{code-block} yaml
diff --git a/server/clip_server/model/clip_onnx.py b/server/clip_server/model/clip_onnx.py
index cf82f9100..023cd81dc 100644
--- a/server/clip_server/model/clip_onnx.py
+++ b/server/clip_server/model/clip_onnx.py
@@ -61,6 +61,10 @@
         ('ViT-B-32-laion400m_e32/textual.onnx', '93284915937ba42a2b52ae8d3e5283a0'),
         ('ViT-B-32-laion400m_e32/visual.onnx', 'db220821a31fe9795fd8c2ba419078c5'),
     ),
+    'ViT-B-32::laion2B-s34B-b79K': (
+        ('ViT-B-32-laion2B-s34B-b79K/textual.onnx', '84af5ae53da56464c76e67fe50fddbe9'),
+        ('ViT-B-32-laion2B-s34B-b79K/visual.onnx', 'a2d4cbd1cf2632cd09ffce9b40bfd8bd'),
+    ),
     'ViT-B-16::openai': (
         ('ViT-B-16/textual.onnx', '6f0976629a446f95c0c8767658f12ebe'),
         ('ViT-B-16/visual.onnx', 'd5c03bfeef1abbd9bede54a8f6e1eaad'),
@@ -105,6 +109,10 @@
         ('ViT-L-14-laion400m_e32/textual.onnx', '8ba5b76ba71992923470c0261b10a67c'),
         ('ViT-L-14-laion400m_e32/visual.onnx', '49db3ba92bd816001e932530ad92d76c'),
     ),
+    'ViT-L-14::laion2B-s32B-b82K': (
+        ('ViT-L-14-laion2B-s32B-b82K/textual.onnx', 'da36a6cbed4f56abf576fdea8b6fe2ee'),
+        ('ViT-L-14-laion2B-s32B-b82K/visual.onnx', '1e337a190abba6a8650237dfae4740b7'),
+    ),
     'ViT-L-14-336::openai': (
         ('ViT-L-14@336px/textual.onnx', '78fab479f136403eed0db46f3e9e7ed2'),
         ('ViT-L-14@336px/visual.onnx', 'f3b1f5d55ca08d43d749e11f7e4ba27e'),
diff --git a/server/clip_server/model/model.py b/server/clip_server/model/model.py
index e1c5e5164..7c4d6633e 100644
--- a/server/clip_server/model/model.py
+++ b/server/clip_server/model/model.py
@@ -49,17 +49,17 @@ def __init__(self, inplanes, planes, stride=1):
         # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
         self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
         self.bn1 = nn.BatchNorm2d(planes)
-        self.relu1 = nn.ReLU(inplace=True)
+        self.act1 = nn.ReLU(inplace=True)
 
         self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
         self.bn2 = nn.BatchNorm2d(planes)
-        self.relu2 = nn.ReLU(inplace=True)
+        self.act2 = nn.ReLU(inplace=True)
 
         self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
 
         self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
-        self.relu3 = nn.ReLU(inplace=True)
+        self.act3 = nn.ReLU(inplace=True)
 
         self.downsample = None
         self.stride = stride
@@ -88,8 +88,8 @@ def __init__(self, inplanes, planes, stride=1):
     def forward(self, x: torch.Tensor):
         identity = x
 
-        out = self.relu1(self.bn1(self.conv1(x)))
-        out = self.relu2(self.bn2(self.conv2(out)))
+        out = self.act1(self.bn1(self.conv1(x)))
+        out = self.act2(self.bn2(self.conv2(out)))
         out = self.avgpool(out)
         out = self.bn3(self.conv3(out))
 
@@ -97,7 +97,7 @@ def forward(self, x: torch.Tensor):
             identity = self.downsample(x)
 
         out += identity
-        out = self.relu3(out)
+        out = self.act3(out)
         return out
 
 
@@ -166,15 +166,15 @@ def __init__(self, layers, output_dim, heads, image_size=224, width=64):
             3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
         )
         self.bn1 = nn.BatchNorm2d(width // 2)
-        self.relu1 = nn.ReLU(inplace=True)
+        self.act1 = nn.ReLU(inplace=True)
         self.conv2 = nn.Conv2d(
             width // 2, width // 2, kernel_size=3, padding=1, bias=False
         )
         self.bn2 = nn.BatchNorm2d(width // 2)
-        self.relu2 = nn.ReLU(inplace=True)
+        self.act2 = nn.ReLU(inplace=True)
         self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
         self.bn3 = nn.BatchNorm2d(width)
-        self.relu3 = nn.ReLU(inplace=True)
+        self.act3 = nn.ReLU(inplace=True)
         self.avgpool = nn.AvgPool2d(2)
 
         # residual layers
@@ -226,9 +226,9 @@ def set_grad_checkpointing(self, enable=True):
         pass
 
     def stem(self, x):
-        x = self.relu1(self.bn1(self.conv1(x)))
-        x = self.relu2(self.bn2(self.conv2(x)))
-        x = self.relu3(self.bn3(self.conv3(x)))
+        x = self.act1(self.bn1(self.conv1(x)))
+        x = self.act2(self.bn2(self.conv2(x)))
+        x = self.act3(self.bn3(self.conv3(x)))
         x = self.avgpool(x)
         return x
 
@@ -273,22 +273,29 @@ def __init__(
         n_head: int,
         mlp_ratio: float = 4.0,
         act_layer: Callable = nn.GELU,
+        scale_cosine_attn: bool = False,
+        scale_heads: bool = False,
+        scale_attn: bool = False,
+        scale_fc: bool = False,
     ):
         super().__init__()
 
-        self.attn = nn.MultiheadAttention(d_model, n_head)
         self.ln_1 = LayerNorm(d_model)
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_attn = LayerNorm(d_model) if scale_attn else nn.Identity()
+
+        self.ln_2 = LayerNorm(d_model)
         mlp_width = int(d_model * mlp_ratio)
         self.mlp = nn.Sequential(
             OrderedDict(
                 [
                     ("c_fc", nn.Linear(d_model, mlp_width)),
+                    ('ln', LayerNorm(mlp_width) if scale_fc else nn.Identity()),
                     ("gelu", act_layer()),
                     ("c_proj", nn.Linear(mlp_width, d_model)),
                 ]
             )
         )
-        self.ln_2 = LayerNorm(d_model)
 
     def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
@@ -296,7 +303,7 @@ def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
     def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         if attn_mask is not None:
             attn_mask = attn_mask.to(dtype=x.dtype, device=x.device)
-        x = x + self.attention(self.ln_1(x), attn_mask=attn_mask)
+        x = x + self.ln_attn(self.attention(self.ln_1(x), attn_mask=attn_mask))
         x = x + self.mlp(self.ln_2(x))
         return x
 
diff --git a/server/clip_server/model/pretrained_models.py b/server/clip_server/model/pretrained_models.py
index 949bfec81..e90dbf2de 100644
--- a/server/clip_server/model/pretrained_models.py
+++ b/server/clip_server/model/pretrained_models.py
@@ -27,6 +27,10 @@
         'ViT-B-32-laion400m_e32.pt',
         '359e0dba4a419f175599ee0c63a110d8',
     ),
+    'ViT-B-32::laion2B-s34B-b79K': (
+        'ViT-B-32-laion2B-s34B-b79K.bin',
+        '2fc036aea9cd7306f5ce7ce6abb8d0bf',
+    ),
     'ViT-B-16::openai': ('ViT-B-16.pt', '44c3d804ecac03d9545ac1a3adbca3a6'),
     'ViT-B-16::laion400m_e31': (
         'ViT-B-16-laion400m_e31.pt',
@@ -53,7 +57,19 @@
         'ViT-L-14-laion400m_e32.pt',
         'a76cde1bc744ca38c6036b920c847a89',
     ),
+    'ViT-L-14::laion2B-s32B-b82K': (
+        'ViT-L-14-laion2B-s32B-b82K.bin',
+        '4d2275fc7b2d7ee9db174f9b57ddecbd',
+    ),
     'ViT-L-14-336::openai': ('ViT-L-14-336px.pt', 'b311058cae50cb10fbfa2a44231c9473'),
+    'ViT-H-14::laion2B-s32B-b79K': (
+        'ViT-H-14-laion2B-s32B-b79K.bin',
+        '2aa6c46521b165a0daeb8cdc6668c7d3',
+    ),
+    'ViT-g-14::laion2B-s12B-b42K': (
+        'ViT-g-14-laion2B-s12B-b42K.bin',
+        '3bf99353f6f1829faac0bb155be4382a',
+    ),
     # older version name format
     'RN50': ('RN50.pt', '9140964eaaf9f68c95aa8df6ca13777c'),
     'RN101': ('RN101.pt', 'fa9d5f64ebf152bc56a18db245071014'),
@@ -81,10 +97,12 @@
     'RN50x64': 448,
     'ViT-B-32': 224,
     'ViT-B-16': 224,
+    'Vit-B-16Plus': 240,
     'ViT-B-16-plus-240': 240,
     'ViT-L-14': 224,
     'ViT-L-14-336': 336,
-    'Vit-B-16Plus': 240,
+    'ViT-H-14': 224,
+    'ViT-g-14': 224,
 }
 
 
diff --git a/server/clip_server/onnx-flow.yml b/server/clip_server/onnx-flow.yml
index 31365da00..635017e27 100644
--- a/server/clip_server/onnx-flow.yml
+++ b/server/clip_server/onnx-flow.yml
@@ -9,4 +9,5 @@ executors:
       metas:
         py_modules:
           - clip_server.executors.clip_onnx
+    timeout_ready: 3000000
     replicas: 1
\ No newline at end of file
diff --git a/server/clip_server/torch-flow.yml b/server/clip_server/torch-flow.yml
index 925620247..05a6c73c6 100644
--- a/server/clip_server/torch-flow.yml
+++ b/server/clip_server/torch-flow.yml
@@ -9,4 +9,5 @@ executors:
       metas:
         py_modules:
          - clip_server.executors.clip_torch
+    timeout_ready: 3000000
     replicas: 1
\ No newline at end of file
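
The `timeout_ready: 3000000` added to both flow YAMLs raises the executor readiness timeout (Jina expresses this in milliseconds), presumably so the multi-gigabyte ViT-H-14 and ViT-g-14 checkpoints can finish downloading and loading before the Flow gives up on the executor. Once a server is up with one of the new models configured, a quick sanity check from the client side might look like the following sketch; the model choice, host, and port are assumptions for illustration, not values taken from this diff:

```python
# Minimal smoke test against a running clip_server instance; assumes the
# server was started with one of the newly added checkpoints (for example
# ViT-L-14::laion2B-s32B-b82K) and is listening on the address below.
from clip_client import Client

client = Client('grpc://0.0.0.0:51000')  # adjust to your deployment

embeddings = client.encode(['a photo of an apple', 'a photo of a banana'])

# Per the table in docs/user-guides/server.md, ViT-L-14 variants emit
# 768-dim vectors, while ViT-H-14/ViT-g-14 emit 1024-dim vectors.
print(embeddings.shape)  # expected here: (2, 768)
```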
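The new `pretrained_models.py` entries register each laion2B checkpoint as a `(filename, md5)` pair. A minimal sketch of how such a pair could be verified after download, assuming a local cache path (the directory below is a hypothetical example, not a path defined in this diff):

```python
# Hypothetical post-download integrity check for a (filename, md5) pair
# registered in pretrained_models.py.
import hashlib
from pathlib import Path


def md5_of(path: Path, chunk_size: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks so multi-GB checkpoints
    # (e.g. ViT-g-14) never need to fit in memory at once.
    digest = hashlib.md5()
    with path.open('rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Example with the ViT-H-14 entry added above; cache dir is an assumption.
weights = Path.home() / '.cache' / 'clip' / 'ViT-H-14-laion2B-s32B-b79K.bin'
if weights.exists():
    assert md5_of(weights) == '2aa6c46521b165a0daeb8cdc6668c7d3'
```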