jina-ai · numb3r3 · Aug 15, 2022 · Aug 10, 2022 · Aug 11, 2022 · Aug 11, 2022
diff --git a/client/clip_client/client.py b/client/clip_client/client.py
@@ -104,6 +104,8 @@ def encode(
         ...
 
     def encode(self, content, **kwargs):
+        from docarray import Document
+
         if isinstance(content, str):
             raise TypeError(
                 f'content must be an Iterable of [str, Document], try `.encode(["{content}"])` instead'
@@ -119,6 +121,11 @@ def encode(self, content, **kwargs):
                 **self._get_post_payload(content, kwargs),
                 on_done=partial(self._gather_result, results=results),
             )
+
+        for c in content:
+            if isinstance(c, Document) and c.tags.pop('__loaded_by_CAS__', False):
+                c.pop('blob')
+
         return self._unboxed_result(results)
 
     def _gather_result(self, response, results: 'DocumentArray'):
@@ -160,7 +167,8 @@ def _iter_doc(self, content) -> Generator['Document', None, None]:
                 _mime = mimetypes.guess_type(c)[0]
                 if _mime and _mime.startswith('image'):
                     yield Document(
-                        tags={'__created_by_CAS__': True}, uri=c
+                        tags={'__created_by_CAS__': True, '__loaded_by_CAS__': True},
+                        uri=c,
                     ).load_uri_to_blob()
                 else:
                     yield Document(tags={'__created_by_CAS__': True}, text=c)
@@ -169,6 +177,7 @@ def _iter_doc(self, content) -> Generator['Document', None, None]:
                     yield c
                 elif not c.blob and c.uri:
                     c.load_uri_to_blob()
+                    c.tags['__loaded_by_CAS__'] = True
                     yield c
                 elif c.tensor is not None:
                     yield c
@@ -331,6 +340,7 @@ def _prepare_single_doc(d: 'Document'):
             return d
         elif not d.blob and d.uri:
             d.load_uri_to_blob()
+            d.tags['__loaded_by_CAS__'] = True
             return d
         elif d.tensor is not None:
             return d
@@ -346,6 +356,18 @@ def _prepare_rank_doc(d: 'Document', _source: str = 'matches'):
         setattr(d, _source, [Client._prepare_single_doc(c) for c in _get(d)])
         return d
 
+    @staticmethod
+    def _reset_rank_doc(d: 'Document', _source: str = 'matches'):
+        _get = lambda d: getattr(d, _source)  # nested docs stored under `_source`
+        # `__loaded_by_CAS__` is set right after the client calls `load_uri_to_blob()`.
+        if d.tags.pop('__loaded_by_CAS__', False):  # pop also removes the marker tag
+            d.pop('blob')
+        # Apply the same reset to every nested doc, then hand `d` back to the caller.
+        for c in _get(d):
+            if c.tags.pop('__loaded_by_CAS__', False):
+                c.pop('blob')
+        return d
+
     def _iter_rank_docs(
         self, content, _source='matches'
     ) -> Generator['Document', None, None]:
@@ -408,6 +430,9 @@ def rank(self, docs: Iterable['Document'], **kwargs) -> 'DocumentArray':
                 **self._get_rank_payload(docs, kwargs),
                 on_done=partial(self._gather_result, results=results),
             )
+        for d in docs:
+            self._reset_rank_doc(d, _source=kwargs.get('source', 'matches'))
+
         return results
 
     async def arank(self, docs: Iterable['Document'], **kwargs) -> 'DocumentArray':

diff --git a/server/clip_server/executors/helper.py b/server/clip_server/executors/helper.py
@@ -37,7 +37,10 @@ def preproc_image(
         tensors_batch.append(preprocess_fn(d.tensor).detach())
 
         # recover doc content
-        d.content = content
+        if d.tags.pop('__loaded_by_CAS__', False):
+            d.pop('tensor')
+        else:
+            d.content = content
 
     tensors_batch = torch.stack(tensors_batch).type(torch.float32)
 

diff --git a/tests/test_ranker.py b/tests/test_ranker.py
@@ -30,8 +30,14 @@ async def test_torch_executor_rank_img2texts(encoder_class):
     for d in da:
         for c in d.matches:
             assert c.scores['clip_score'].value is not None
+            assert '__loaded_by_CAS__' not in c.tags
+            assert not c.tensor
+            assert not c.blob
         org_score = d.matches[:, 'scores__clip_score__value']
         assert org_score == list(sorted(org_score, reverse=True))
+        assert '__loaded_by_CAS__' not in d.tags
+        assert not d.tensor
+        assert not d.blob
 
 
 @pytest.mark.asyncio
@@ -53,9 +59,15 @@ async def test_torch_executor_rank_text2imgs(encoder_class):
         for c in d.matches:
             assert c.scores['clip_score'].value is not None
             assert c.scores['clip_score_cosine'].value is not None
+            assert '__loaded_by_CAS__' not in c.tags
+            assert not c.tensor
+            assert not c.blob
         np.testing.assert_almost_equal(
             sum(c.scores['clip_score'].value for c in d.matches), 1
         )
+        assert '__loaded_by_CAS__' not in d.tags
+        assert not d.tensor
+        assert not d.blob
 
 
 @pytest.mark.parametrize(
@@ -79,6 +91,12 @@ async def test_torch_executor_rank_text2imgs(encoder_class):
 def test_docarray_inputs(make_flow, d):
     c = Client(server=f'grpc://0.0.0.0:{make_flow.port}')
     r = c.rank([d])
+    assert '__loaded_by_CAS__' not in d.tags
+    assert not d.blob
+    assert not d.tensor
+    assert '__loaded_by_CAS__' not in d.matches[0].tags
+    assert not d.matches[0].blob
+    assert not d.matches[0].tensor
     assert isinstance(r, DocumentArray)
     rv1 = r['@m', 'scores__clip_score__value']
     rv2 = r['@m', 'scores__clip_score_cosine__value']

diff --git a/tests/test_simple.py b/tests/test_simple.py
@@ -78,6 +78,9 @@ def test_docarray_inputs(make_flow, inputs, port_generator):
     assert isinstance(r, DocumentArray)
     assert r.embeddings.shape
     assert '__created_by_CAS__' not in r[0].tags
+    assert '__loaded_by_CAS__' not in r[0].tags
+    assert not r[0].tensor
+    assert not r[0].blob
 
 
 @pytest.mark.parametrize(
@@ -104,6 +107,9 @@ def test_docarray_preserve_original_inputs(make_flow, inputs, port_generator):
     assert r.embeddings.shape
     assert r.contents == inputs.contents
     assert '__created_by_CAS__' not in r[0].tags
+    assert '__loaded_by_CAS__' not in r[0].tags
+    assert not r[0].tensor
+    assert not r[0].blob
 
 
 @pytest.mark.parametrize(
@@ -134,5 +140,15 @@ def test_docarray_traversal(make_flow, inputs, port_generator):
     r2 = c.post(on='/', inputs=da, parameters={'access_paths': '@c'})
     assert r1[0].chunks.embeddings.shape[0] == len(inputs)
     assert '__created_by_CAS__' not in r1[0].tags
+    assert '__loaded_by_CAS__' not in r1[0].tags
+    assert not r1[0].tensor
+    assert not r1[0].blob
+    assert not r1[0].chunks[0].tensor
+    assert not r1[0].chunks[0].blob
     assert r2[0].chunks.embeddings.shape[0] == len(inputs)
     assert '__created_by_CAS__' not in r2[0].tags
+    assert '__loaded_by_CAS__' not in r2[0].tags
+    assert not r2[0].tensor
+    assert not r2[0].blob
+    assert not r2[0].chunks[0].tensor
+    assert not r2[0].chunks[0].blob