feat: add three new open clip roberta base models #860
Changes from 13 commits
I would suggest removing these asserts. They make the code safer, but they degrade performance a bit.
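For context on the trade-off, a minimal sketch: Python `assert` statements run a check on every call, but the interpreter strips them entirely under `python -O`, so an alternative to deleting them is to keep them and rely on optimized runs skipping them. The function below is hypothetical, not the actual code under review.

```python
def blocks_for_seq(seq_len: int, block_size: int) -> int:
    # Hypothetical shape guards of the kind discussed above: they catch
    # invalid inputs early, but cost a comparison on every call.
    assert seq_len > 0, "seq_len must be positive"
    assert seq_len % block_size == 0, "seq_len must be divisible by block_size"
    return seq_len // block_size

print(blocks_for_seq(128, 32))  # 4
```

Running this with `python -O` removes both asserts, recovering the per-call cost at the price of the safety check.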
Also, judging from the function's `seq_len` parameter, it seems the flash-attention implementation can only be used for the text encoder. Can it be applied to a vision transformer as well?
Yes, it can. Every image tensor is first converted into a sentence-like sequence tensor before being fed into the model.
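A minimal sketch of that patchification step in NumPy — the function name, shapes, and patch size are illustrative assumptions, not the actual open_clip implementation:

```python
import numpy as np

def image_to_sequence(image: np.ndarray, patch: int) -> np.ndarray:
    """Convert an (H, W, C) image into a (seq_len, patch*patch*C) sequence,
    the same token layout an attention kernel sees for text."""
    h, w, c = image.shape
    gh, gw = h // patch, w // patch
    x = image[:gh * patch, :gw * patch]            # drop any ragged edge
    x = x.reshape(gh, patch, gw, patch, c)         # cut into a patch grid
    x = x.transpose(0, 2, 1, 3, 4)                 # (gh, gw, patch, patch, c)
    return x.reshape(gh * gw, patch * patch * c)   # flatten patches to tokens

seq = image_to_sequence(np.zeros((224, 224, 3)), patch=16)
print(seq.shape)  # (196, 768): a 14x14 grid of patches, 16*16*3 values each
```

After this reshaping, `seq_len` is just the number of patches (196 here), so the same sequence-based flash-attention path can serve the vision transformer.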