diff --git a/server/clip_server/model/model.py b/server/clip_server/model/model.py
index f324a0199..c0b1a2296 100644
--- a/server/clip_server/model/model.py
+++ b/server/clip_server/model/model.py
@@ -9,8 +9,6 @@
 from torch import nn
 
 
-
-
 class Bottleneck(nn.Module):
     expansion = 4
 
@@ -76,7 +74,7 @@ def __init__(
     ):
         super().__init__()
         self.positional_embedding = nn.Parameter(
-            torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5
         )
         self.k_proj = nn.Linear(embed_dim, embed_dim)
         self.q_proj = nn.Linear(embed_dim, embed_dim)
@@ -269,7 +267,7 @@ def __init__(
             bias=False,
         )
-        scale = width ** -0.5
+        scale = width**-0.5
         self.class_embedding = nn.Parameter(scale * torch.randn(width))
         self.positional_embedding = nn.Parameter(
             scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)
         )
@@ -375,7 +373,7 @@ def initialize_parameters(self):
 
         if isinstance(self.visual, ModifiedResNet):
             if self.visual.attnpool is not None:
-                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                std = self.visual.attnpool.c_proj.in_features**-0.5
                 nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
                 nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
                 nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
@@ -391,10 +389,10 @@ def initialize_parameters(self):
                     if name.endswith('bn3.weight'):
                         nn.init.zeros_(param)
 
-        proj_std = (self.transformer.width ** -0.5) * (
+        proj_std = (self.transformer.width**-0.5) * (
             (2 * self.transformer.layers) ** -0.5
         )
-        attn_std = self.transformer.width ** -0.5
+        attn_std = self.transformer.width**-0.5
         fc_std = (2 * self.transformer.width) ** -0.5
         for block in self.transformer.resblocks:
             nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
@@ -403,7 +401,7 @@ def initialize_parameters(self):
             nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
 
         if self.text_projection is not None:
-            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+            nn.init.normal_(self.text_projection, std=self.transformer.width**-0.5)
 
     def build_attention_mask(self):
         # lazily create causal attention mask, with full attention between the vision tokens
@@ -516,7 +514,7 @@ def build_model(state_dict: dict):
         )
         vision_patch_size = None
         assert (
-            output_width ** 2 + 1
+            output_width**2 + 1
             == state_dict['visual.attnpool.positional_embedding'].shape[0]
         )
         image_resolution = output_width * 32