docs: update clip search (#820)

* docs: update subtitle * docs: minor updates * chore: update playgroud * docs: search titler * docs: playground * docs: playground * docs: update client api docs * docs: update client api docs * docs: update client api docs * fix: search limit param * docs: examples * docs: examples * docs: client api * docs: client api * docs: playground * docs: playground
jina-ai · Sep 13, 2022 · 8d9725f · 8d9725f
1 parent 213ecc2
commit 8d9725f
Show file tree

Hide file tree

Showing 6 changed files with 130 additions and 25 deletions.
diff --git a/client/clip_client/client.py b/client/clip_client/client.py
@@ -616,7 +616,7 @@ def search(
         self,
         content: Iterable[str],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[Dict] = None,
@@ -639,7 +639,7 @@ def search(
         self,
         content: Union['DocumentArray', Iterable['Document']],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[dict] = None,
@@ -657,7 +657,7 @@ def search(
         """
         ...
 
-    def search(self, content, **kwargs) -> 'DocumentArray':
+    def search(self, content, limit: int = 10, **kwargs) -> 'DocumentArray':
         if isinstance(content, str):
             raise TypeError(
                 f'content must be an Iterable of [str, Document], try `.search(["{content}"])` instead'
@@ -670,7 +670,7 @@ def search(self, content, **kwargs) -> 'DocumentArray':
         results = DocumentArray()
         with self._pbar:
             parameters = kwargs.pop('parameters', {})
-            parameters['limit'] = kwargs.get('limit')
+            parameters['limit'] = limit
 
             self._client.post(
                 on='/search',
@@ -690,7 +690,7 @@ async def asearch(
         self,
         content: Iterator[str],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[Dict] = None,
@@ -702,14 +702,14 @@ async def asearch(
         self,
         content: Union['DocumentArray', Iterable['Document']],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[dict] = None,
     ):
         ...
 
-    async def asearch(self, content, **kwargs):
+    async def asearch(self, content, limit: int = 10, **kwargs):
         from rich import filesize
 
         self._prepare_streaming(
@@ -720,7 +720,7 @@ async def asearch(self, content, **kwargs):
 
         with self._pbar:
             parameters = kwargs.pop('parameters', {})
-            parameters['limit'] = kwargs.get('limit')
+            parameters['limit'] = limit
 
             async for da in self._async_client.post(
                 on='/search',

diff --git a/docs/index.md b/docs/index.md
@@ -195,6 +195,7 @@ hosting/on-jcloud
 
 playground/embedding
 playground/reasoning
+playground/searching
 ```
 
 

diff --git a/docs/playground/embedding.md b/docs/playground/embedding.md
@@ -8,7 +8,7 @@ The model is `ViT-L/14-336px` on one GPU.
 
 <iframe frameborder="0" allowtransparency="true" scrolling="no" src="../../_static/demo-embed.html" style="overflow:hidden;overflow-x:hidden;overflow-y:hidden;height:100vh;width:100%"></iframe>
 
-```{button-link} ../../_static/demo-text-rank.html
+```{button-link} ../../_static/demo-embed.html
 :color: primary
 :align: center
 

diff --git a/docs/playground/searching.md b/docs/playground/searching.md
@@ -0,0 +1,10 @@
+# Text & Image Searching
+
+<iframe frameborder="0" allowtransparency="true" scrolling="no" src="https://jemmyshin-laion5b-streamlit-streamlit-demo-rddbqz.streamlitapp.com?embedded=true" style="overflow:hidden;overflow-x:hidden;overflow-y:hidden;height:100vh;width:100%"></iframe>
+
+```{button-link} https://jemmyshin-laion5b-streamlit-streamlit-demo-rddbqz.streamlitapp.com/
+:color: primary
+:align: center
+
+{octicon}`link-external` Open this playground in a new window
+```
diff --git a/docs/user-guides/client.md b/docs/user-guides/client.md
@@ -334,6 +334,88 @@ Finally, in the return you can observe the matches are re-ranked according to `.
 [0.9920725226402283, 0.006038925610482693, 0.0009973491542041302, 0.00078492151806131, 0.00010626466246321797]]
 ```
 
+(indexing)=
+## Indexing
+
+```{tip}
+This feature is only available with `clip_client>=0.7.0`, and it requires the server to be running
+a Flow that consists of an encoder and an indexer.
+``` 
+
+You can index Documents via {func}`~clip_client.client.Client.index` or {func}`~clip_client.client.Client.aindex`. 
+
+```python
+from clip_client import Client
+from docarray import Document
+
+c = Client('grpc://0.0.0.0:23456')
+
+da = [
+    Document(text='she smiled, with pain'),
+    Document(uri='apple.png'),
+    Document(uri='apple.png').load_uri_to_image_tensor(),
+    Document(blob=open('apple.png', 'rb').read()),
+    Document(uri='https://clip-as-service.jina.ai/_static/favicon.png'),
+    Document(
+        uri='data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7'
+    ),
+]
+
+r = c.index(da)
+```
+Since the returned result is a DocumentArray, we can get a summary of it with `r.summary()`.
+
+```text
+╭──────────────────────────── Documents Summary ─────────────────────────────╮
+│                                                                            │
+│   Length                        6                                          │
+│   Homogenous Documents          False                                      │
+│   4 Documents have attributes   ('id', 'mime_type', 'uri', 'embedding')    │
+│   1 Document has attributes     ('id', 'mime_type', 'text', 'embedding')   │
+│   1 Document has attributes     ('id', 'embedding')                        │
+│                                                                            │
+╰────────────────────────────────────────────────────────────────────────────╯
+╭────────────────────── Attributes Summary ───────────────────────╮
+│                                                                 │
+│   Attribute   Data type      #Unique values   Has empty value   │
+│  ─────────────────────────────────────────────────────────────  │
+│   embedding   ('ndarray',)   6                False             │
+│   id          ('str',)       6                False             │
+│   mime_type   ('str',)       5                False             │
+│   text        ('str',)       2                False             │
+│   uri         ('str',)       4                False             │
+│                                                                 │
+╰─────────────────────────────────────────────────────────────────╯
+```
+
+The `embedding` is the output of the encoder, which is a 512-dim vector. 
+Now we can use the indexer to search for the indexed Documents.
+
+
+(searching)=
+## Searching
+
+You can use {func}`~clip_client.client.Client.search` or {func}`~clip_client.client.Client.asearch`
+to search for relevant Documents in the index for a given query.
+
+```python
+from clip_client import Client
+
+c = Client('grpc://0.0.0.0:23456')
+
+result = c.search(['smile'], limit=2)
+
+
+print(result['@m', ['text', 'scores__cosine']])
+```
+
+The results will look like this: the most relevant doc is "she smiled, with pain", with a cosine distance of 0.096, while the apple image has a cosine distance of 0.799.
+```text
+[['she smiled, with pain', ''], [{'value': 0.09604918956756592}, {'value': 0.7994111776351929}]]
+```
+You can set the `limit` parameter (default is `10`) to control how many of the most similar documents are retrieved.
+
+
 
 (profiling)=
 ## Profiling

diff --git a/docs/user-guides/retriever.md b/docs/user-guides/retriever.md
@@ -1,4 +1,4 @@
-# Search API
+# CLIP Search
 
 
 CLIP Search is a search paradigm that uses the CLIP model to encode the text and image documents into a common vector space. 
@@ -51,16 +51,16 @@ executors:
       metas:
         py_modules:
           - clip_server.executors.clip_torch
-          
+    
   - name: indexer
     uses:
       jtype: AnnLiteIndexer
       with:
         n_dim: 512
-      workspace: './workspace'
       metas:
         py_modules:
           - annlite.executor
+    workspace: './workspace'
 ```
 
 ````
@@ -90,57 +90,69 @@ executors:
       jtype: AnnLiteIndexer
       with:
         n_dim: 512
-      workspace: './workspace'
+        limit: 10
       metas:
         py_modules:
           - annlite.executor
+    workspace: './workspace'
 ```
 
 ````
 
 The first part defines the CLIP model config, which is explained [here](https://clip-as-service.jina.ai/user-guides/server/#clip-model-config).
 The second part defines the AnnLite indexer config, where you can set the following parameters:
 
-| Parameter | Description                                                                                   |
-|-----------|-----------------------------------------------------------------------------------------------|
-| `n_dim`   | The dimension of the vector space. It should be the same as the dimension of the CLIP model.  |
+| Parameter | Description                                                                                  |
+|-----------|----------------------------------------------------------------------------------------------|
+| `n_dim`   | The dimension of the vector space. It should be the same as the dimension of the CLIP model. |
+| `limit`   | The number of the most relevant documents to be retrieved. The default value is 10.          |
 
 And the `workspace` parameter is the path to the workspace directory, which is used to store the index files.
 
-## Connect from client
+## Index and search documents
 
 ```{tip}
 You will need to install the client first (Python 3.7+ is required): `pip install "clip-client>=0.7.0"`.
 ```
 
-To connect to the server, you can use the following code:
+### Index Documents
+
+To index image or text documents in the CLIP search server, you can use the client function {func}`~clip_client.Client.index`:
 
 ```python
 from clip_client import Client
 from docarray import Document
 
 client = Client('grpc://0.0.0.0:61000')
 
-# index
 client.index(
     [
         Document(text='she smiled, with pain'),
         Document(uri='apple.png'),
         Document(uri='https://clip-as-service.jina.ai/_static/favicon.png'),
     ]
 )
+```
+
+You don't need to call `client.encode()` explicitly since `client.index()` will handle this for you.
+
 
-# search
-client.search(['smile'])
+### Search Documents
+
+Then, you can use the client function {func}`~clip_client.Client.search` to search for similar documents:
+
+```python
+result = client.search(['smile'], limit=2)
+
+print(result['@m', ['text', 'scores__cosine']])
 ```
 
 The results will look like this: the most relevant doc is "she smiled, with pain", with a cosine distance of 0.096, while the apple image has a cosine distance of 0.799.
 ```text
-she smiled, with pain defaultdict(<class 'docarray.score.NamedScore'>, {'cosine': {'value': 0.09604912996292114}})
-defaultdict(<class 'docarray.score.NamedScore'>, {'cosine': {'value': 0.7994112372398376}})
+[['she smiled, with pain', ''], [{'value': 0.09604918956756592}, {'value': 0.7994111776351929}]]
 ```
+You can set the `limit` parameter (default is `10`) to control how many of the most similar documents are retrieved.
 
-You don't need to call `client.encode()` explicitly since `client.index()` will handle this for you.
 
 ## Support large-scale dataset
 
@@ -168,10 +180,10 @@ executors:
       jtype: AnnLiteIndexer
       with:
         n_dim: 512
-      workspace: './workspace'
       metas:
         py_modules:
           - annlite.executor
+    workspace: './workspace'
     shards: 5
     polling: {'/index': 'ANY', '/search': 'ALL', '/update': 'ALL',
               '/delete': 'ALL', '/status': 'ALL'}
-Original file line number
+Diff line change
@@ Expand Up / @@ -195,6 +195,7 @@ hosting/on-jcloud @@
     playground/embedding
     playground/reasoning
+    playground/searching
     ```
@@ Expand Down @@