chore: bump open-clip-torch to v2.8.0 #883

Merged · 4 commits · Dec 26, 2022
26 changes: 23 additions & 3 deletions server/clip_server/model/model.py
@@ -19,7 +19,7 @@
from open_clip.transformer import QuickGELU, LayerNorm, LayerNormFp32, Attention
from open_clip.timm_model import TimmModel
from open_clip.factory import _MODEL_CONFIGS
from open_clip.hf_model import PreTrainedTextEncoder
from open_clip.hf_model import HFTextEncoder
from open_clip.transformer import ResidualAttentionBlock as _ResidualAttentionBlock
from open_clip.transformer import Transformer as _Transformer
from open_clip.transformer import VisionTransformer as _VisionTransformer
@@ -75,11 +75,20 @@ def __init__(
self,
image_size: int,
patch_size: int,
global_average_pool: bool,
output_dim: int,
patch_dropout: float,
dtype: torch.dtype = torch.float32,
**kwargs,
):
super().__init__(image_size, patch_size, output_dim=output_dim, **kwargs)
super().__init__(
image_size,
patch_size,
global_average_pool=global_average_pool,
output_dim=output_dim,
patch_dropout=patch_dropout,
**kwargs,
)
self.transformer = Transformer(dtype=dtype, **kwargs)

def forward(self, x: torch.Tensor):
@@ -111,6 +120,8 @@ class CLIPVisionCfg:
patch_size: int = 16
image_size: Union[Tuple[int, int], int] = 224
ls_init_value: Optional[float] = None # layer scale initial value
patch_dropout: float = 0.0 # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
Contributor: Why do we need dropout during inference? It's not part of the training process.

Member (Author): It's disabled by default.

Contributor: Then we can remove this arg if we don't need it.

global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
timm_model_name: str = (
None # a valid model name overrides layers, width, patch_size
)
@@ -136,6 +147,7 @@ class CLIPTextCfg:
ls_init_value: Optional[float] = None # layer scale initial value
hf_model_name: str = None
hf_tokenizer_name: str = None
hf_model_pretrained: bool = True
proj: str = 'mlp'
pooler_type: str = 'mean_pooler'

@@ -189,6 +201,8 @@ def _build_vision_tower(
heads=vision_heads,
mlp_ratio=vision_cfg.mlp_ratio,
ls_init_value=vision_cfg.ls_init_value,
patch_dropout=vision_cfg.patch_dropout,
global_average_pool=vision_cfg.global_average_pool,
output_dim=embed_dim,
act_layer=act_layer,
norm_layer=norm_layer,
@@ -208,11 +222,12 @@ def _build_text_tower(
text_cfg = CLIPTextCfg(**text_cfg)

if text_cfg.hf_model_name:
text = PreTrainedTextEncoder(
text = HFTextEncoder(
text_cfg.hf_model_name,
output_dim=embed_dim,
proj=text_cfg.proj,
pooler_type=text_cfg.pooler_type,
pretrained=text_cfg.hf_model_pretrained,
)
else:
act_layer = QuickGELU if quick_gelu else nn.GELU
@@ -555,6 +570,7 @@ def load_openclip_model(
jit: bool = False,
force_quick_gelu: bool = False,
force_custom_text: bool = False,
force_patch_dropout: Optional[float] = None,
pretrained_image: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
):
@@ -578,6 +594,10 @@
# override for use of QuickGELU on non-OpenAI transformer models
model_cfg["quick_gelu"] = True

if force_patch_dropout is not None:
# override the default patch dropout value
model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout

if pretrained_image:
if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
# pretrained weight loading for timm models set via vision_cfg
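For context on the patch_dropout thread above: patch dropout randomly discards a fraction of the vision patch tokens during training, and the shipped default of 0.0 means nothing is dropped, so the new argument is a no-op at inference unless `force_patch_dropout` overrides it. A minimal illustrative sketch of the idea, not the open_clip implementation (the function name and shapes here are made up for illustration):

```python
import torch


def drop_patches(tokens: torch.Tensor, prob: float) -> torch.Tensor:
    """Illustrative patch dropout: keep the CLS token and a random
    (1 - prob) fraction of the patch tokens.

    tokens: (batch, 1 + num_patches, dim). With prob=0.0 the input is
    returned unchanged, which is why the default is harmless at inference.
    """
    if prob <= 0.0:
        return tokens  # disabled by default, as noted in the review thread
    cls_tok, patches = tokens[:, :1], tokens[:, 1:]
    num_keep = max(1, int(patches.shape[1] * (1.0 - prob)))
    # pick a random subset of patch indices per batch element
    scores = torch.rand(patches.shape[0], patches.shape[1], device=tokens.device)
    keep = scores.argsort(dim=1)[:, :num_keep]
    patches = patches.gather(1, keep.unsqueeze(-1).expand(-1, -1, patches.shape[-1]))
    return torch.cat([cls_tok, patches], dim=1)
```

With the default `patch_dropout: float = 0.0` in `CLIPVisionCfg`, the dropping branch above is never taken, which matches the "disabled by default" reply; `force_patch_dropout` in `load_openclip_model` only changes behavior if a caller sets it explicitly.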
2 changes: 1 addition & 1 deletion server/setup.py
@@ -47,7 +47,7 @@
'torchvision<=0.13.0' if sys.version_info <= (3, 7, 2) else 'torchvision',
'jina>=3.12.0',
'prometheus-client',
'open_clip_torch>=2.7.0',
'open_clip_torch>=2.8.0',
],
extras_require={
'onnx': [
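Since the model.py change imports `HFTextEncoder` instead of `PreTrainedTextEncoder`, the raised `open_clip_torch>=2.8.0` floor is what keeps the two files consistent. A quick sanity check one could run in the target environment, assuming `open_clip` exposes `__version__` and that `packaging` is installed:

```python
import open_clip
from packaging.version import Version

# The HFTextEncoder import in model.py only resolves on releases that include
# the rename this PR adapts to, hence the 'open_clip_torch>=2.8.0' pin in setup.py.
assert Version(open_clip.__version__) >= Version("2.8.0")

from open_clip.hf_model import HFTextEncoder  # noqa: F401
```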