From 8d9725fb874d94944cb1129ca2ccc8293c52dc90 Mon Sep 17 00:00:00 2001
From: felix-wang <35718120+numb3r3@users.noreply.github.com>
Date: Tue, 13 Sep 2022 21:34:36 +0800
Subject: [PATCH] docs: update clip search (#820)

* docs: update subtitle

* docs: minor updates

* chore: update playgroud

* docs: search titler

* docs: playground

* docs: playground

* docs: update client api docs

* docs: update client api docs

* docs: update client api docs

* fix: search limit param

* docs: examples

* docs: examples

* docs: client api

* docs: client api

* docs: playground

* docs: playground
---
 client/clip_client/client.py  | 16 +++----
 docs/index.md                 |  1 +
 docs/playground/embedding.md  |  2 +-
 docs/playground/searching.md  | 10 +++++
 docs/user-guides/client.md    | 82 +++++++++++++++++++++++++++++++++++
 docs/user-guides/retriever.md | 44 ++++++++++++-------
 6 files changed, 130 insertions(+), 25 deletions(-)
 create mode 100644 docs/playground/searching.md

diff --git a/client/clip_client/client.py b/client/clip_client/client.py
index fc851a6de..77c1a258a 100644
--- a/client/clip_client/client.py
+++ b/client/clip_client/client.py
@@ -616,7 +616,7 @@ def search(
         self,
         content: Iterable[str],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[Dict] = None,
@@ -639,7 +639,7 @@ def search(
         self,
         content: Union['DocumentArray', Iterable['Document']],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[dict] = None,
@@ -657,7 +657,7 @@ def search(
         """
         ...
 
-    def search(self, content, **kwargs) -> 'DocumentArray':
+    def search(self, content, limit: int = 10, **kwargs) -> 'DocumentArray':
         if isinstance(content, str):
             raise TypeError(
                 f'content must be an Iterable of [str, Document], try `.search(["{content}"])` instead'
@@ -670,7 +670,7 @@ def search(self, content, **kwargs) -> 'DocumentArray':
         results = DocumentArray()
         with self._pbar:
             parameters = kwargs.pop('parameters', {})
-            parameters['limit'] = kwargs.get('limit')
+            parameters['limit'] = limit
 
             self._client.post(
                 on='/search',
@@ -690,7 +690,7 @@ async def asearch(
         self,
         content: Iterator[str],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[Dict] = None,
@@ -702,14 +702,14 @@ async def asearch(
         self,
         content: Union['DocumentArray', Iterable['Document']],
         *,
-        limit: int = 20,
+        limit: int = 10,
         batch_size: Optional[int] = None,
         show_progress: bool = False,
         parameters: Optional[dict] = None,
     ):
         ...
 
-    async def asearch(self, content, **kwargs):
+    async def asearch(self, content, limit: int = 10, **kwargs):
         from rich import filesize
 
         self._prepare_streaming(
@@ -720,7 +720,7 @@ async def asearch(self, content, **kwargs):
 
         with self._pbar:
             parameters = kwargs.pop('parameters', {})
-            parameters['limit'] = kwargs.get('limit')
+            parameters['limit'] = limit
 
             async for da in self._async_client.post(
                 on='/search',
diff --git a/docs/index.md b/docs/index.md
index 4cec612d6..cf6babb65 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -195,6 +195,7 @@ hosting/on-jcloud
 
 playground/embedding
 playground/reasoning
+playground/searching
 ```
 
diff --git a/docs/playground/embedding.md b/docs/playground/embedding.md
index 3c9eccb73..f7b73a29a 100644
--- a/docs/playground/embedding.md
+++ b/docs/playground/embedding.md
@@ -8,7 +8,7 @@ The model is `ViT-L/14-336px` on one GPU.
 
-```{button-link} ../../_static/demo-text-rank.html
+```{button-link} ../../_static/demo-embed.html
 :color: primary
 :align: center
 
diff --git a/docs/playground/searching.md b/docs/playground/searching.md
new file mode 100644
index 000000000..8790c4600
--- /dev/null
+++ b/docs/playground/searching.md
@@ -0,0 +1,10 @@
+# Text & Image Searching
+
+
+```{button-link} https://jemmyshin-laion5b-streamlit-streamlit-demo-rddbqz.streamlitapp.com/
+:color: primary
+:align: center
+
+{octicon}`link-external` Open this playground in a new window
+```
\ No newline at end of file
diff --git a/docs/user-guides/client.md b/docs/user-guides/client.md
index 9f136ec3a..f0eedf404 100644
--- a/docs/user-guides/client.md
+++ b/docs/user-guides/client.md
@@ -334,6 +334,88 @@ Finally, in the return you can observe the matches are re-ranked according to `.
 [0.9920725226402283, 0.006038925610482693, 0.0009973491542041302, 0.00078492151806131, 0.00010626466246321797]]
 ```
 
+(indexing)=
+## Indexing
+
+```{tip}
+This feature is only available with clip-client>=0.7.0, and requires the server to run a
+Flow consisting of an encoder and an indexer.
+```
+
+You can index Documents via {func}`~clip_client.client.Client.index` or {func}`~clip_client.client.Client.aindex`.
+
+```python
+from clip_client import Client
+from docarray import Document
+
+c = Client('grpc://0.0.0.0:23456')
+
+da = [
+    Document(text='she smiled, with pain'),
+    Document(uri='apple.png'),
+    Document(uri='apple.png').load_uri_to_image_tensor(),
+    Document(blob=open('apple.png', 'rb').read()),
+    Document(uri='https://clip-as-service.jina.ai/_static/favicon.png'),
+    Document(
+        uri='data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7'
+    ),
+]
+
+r = c.index(da)
+```
+The returned result is a DocumentArray, so we can get a summary of it.
+
+```text
+╭──────────────────────────── Documents Summary ─────────────────────────────╮
+│                                                                            │
+│   Length                 6                                                 │
+│   Homogenous Documents   False                                             │
+│   4 Documents have attributes ('id', 'mime_type', 'uri', 'embedding')      │
+│   1 Document has attributes ('id', 'mime_type', 'text', 'embedding')       │
+│   1 Document has attributes ('id', 'embedding')                            │
+│                                                                            │
+╰────────────────────────────────────────────────────────────────────────────╯
+╭────────────────────── Attributes Summary ───────────────────────╮
+│                                                                 │
+│   Attribute   Data type      #Unique values   Has empty value   │
+│  ─────────────────────────────────────────────────────────────  │
+│   embedding   ('ndarray',)   6                False             │
+│   id          ('str',)       6                False             │
+│   mime_type   ('str',)       5                False             │
+│   text        ('str',)       2                False             │
+│   uri         ('str',)       4                False             │
+│                                                                 │
+╰─────────────────────────────────────────────────────────────────╯
+```
+
+The `embedding` of each Document is the output of the encoder, a 512-dimensional vector.
+Now we can use the indexer to search over the indexed Documents.
+
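+For asynchronous applications, {func}`~clip_client.client.Client.aindex` is the asynchronous counterpart of {func}`~clip_client.client.Client.index`. The snippet below is a minimal sketch, assuming the same server address as above:
+
+```python
+import asyncio
+
+from clip_client import Client
+from docarray import Document
+
+
+async def main():
+    # Minimal sketch: assumes a CLIP server with an indexer is listening here.
+    c = Client('grpc://0.0.0.0:23456')
+    await c.aindex(
+        [
+            Document(text='she smiled, with pain'),
+            Document(uri='apple.png'),
+        ]
+    )
+
+
+asyncio.run(main())
+```
+
+{func}`~clip_client.client.Client.asearch`, shown in the next section, can be awaited in the same way.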
+
+(searching)=
+## Searching
+
+You can use {func}`~clip_client.client.Client.search` or {func}`~clip_client.client.Client.asearch`
+to search the index for Documents relevant to a given query.
+
+```python
+from clip_client import Client
+
+c = Client('grpc://0.0.0.0:23456')
+
+result = c.search(['smile'], limit=2)
+
+print(result['@m', ['text', 'scores__cosine']])
+```
+
+The results will look like the following: the most relevant Document is "she smiled, with pain",
+with a cosine distance of 0.096, while the apple image has a cosine distance of 0.799.
+```text
+[['she smiled, with pain', ''], [{'value': 0.09604918956756592}, {'value': 0.7994111776351929}]]
+```
+You can set the `limit` parameter (default is `10`) to control the number of most similar documents retrieved.
+
+
 (profiling)=
 ## Profiling
 
diff --git a/docs/user-guides/retriever.md b/docs/user-guides/retriever.md
index e9d7ced14..6e5597ab9 100644
--- a/docs/user-guides/retriever.md
+++ b/docs/user-guides/retriever.md
@@ -1,4 +1,4 @@
-# Search API
+# CLIP Search
 
 CLIP Search is a search paradigm that uses the CLIP model to encode text and image documents into a common vector space.
 
@@ -51,16 +51,16 @@ executors:
       metas:
         py_modules:
           - clip_server.executors.clip_torch
-      
+
   - name: indexer
     uses:
       jtype: AnnLiteIndexer
       with:
         n_dim: 512
-        workspace: './workspace'
      metas:
         py_modules:
           - annlite.executor
+    workspace: './workspace'
 ```
 ````
 
@@ -90,10 +90,11 @@ executors:
       jtype: AnnLiteIndexer
       with:
         n_dim: 512
-        workspace: './workspace'
+        limit: 10
       metas:
         py_modules:
           - annlite.executor
+    workspace: './workspace'
 ```
 ````
 
 The first part defines the CLIP model config, which is explained [here](https://clip-as-service.jina.ai/user-guides/server/#clip-model-config).
 The second part defines the AnnLite indexer config; you can set the following parameters:
 
-| Parameter | Description                                                                                   |
-|-----------|-----------------------------------------------------------------------------------------------|
-| `n_dim`   | The dimension of the vector space. It should be the same as the dimension of the CLIP model.   |
+| Parameter | Description                                                                                   |
+|-----------|-----------------------------------------------------------------------------------------------|
+| `n_dim`   | The dimension of the vector space. It should be the same as the dimension of the CLIP model. |
+| `limit`   | The number of the most relevant documents to be retrieved. The default value is 10.          |
 
 The `workspace` parameter is the path to the workspace directory, which is used to store the index files.
 
-## Connect from client
+## Index and Search Documents
 
 ```{tip}
 You will need to install the client first (Python 3.7+): `pip install "clip-client>=0.7.0"`.
 ```
 
-To connect to the server, you can use the following code:
+### Index Documents
+
+To index text or image documents on the CLIP search server, you can use the client function {func}`~clip_client.Client.index`:
 
 ```python
 from clip_client import Client
 from docarray import Document
 
 client = Client('grpc://0.0.0.0:61000')
 
-# index
 client.index(
     [
         Document(text='she smiled, with pain'),
         Document(uri='apple.png'),
         Document(uri='https://clip-as-service.jina.ai/_static/favicon.png'),
     ]
 )
+```
+
+You don't need to call `client.encode()` explicitly since `client.index()` will handle this for you.
+
 
-# search
-client.search(['smile'])
+### Search Documents
+
+Then, you can use the client function {func}`~clip_client.Client.search` to search for similar documents:
+
+```python
+result = client.search(['smile'], limit=2)
+
+print(result['@m', ['text', 'scores__cosine']])
 ```
 
 The results will look like the following: the most relevant document is "she smiled, with pain",
 with a cosine distance of 0.096, while the apple image has a cosine distance of 0.799.
 ```text
-she smiled, with pain defaultdict(<class 'docarray.score.NamedScore'>, {'cosine': {'value': 0.09604912996292114}})
-defaultdict(<class 'docarray.score.NamedScore'>, {'cosine': {'value': 0.7994112372398376}})
+[['she smiled, with pain', ''], [{'value': 0.09604918956756592}, {'value': 0.7994111776351929}]]
 ```
+You can set the `limit` parameter (default is `10`) to control the number of most similar documents retrieved.
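+
+Note that the `cosine` values above are distances, not similarities: lower means a closer match. If you want similarity scores instead, you can derive them from the matches yourself. Below is a minimal sketch, assuming the `result` DocumentArray returned by `client.search(...)` above:
+
+```python
+for match in result[0].matches:
+    # The indexer reports a cosine *distance*; 1 - distance gives a simple
+    # similarity score (higher means more similar).
+    similarity = 1 - match.scores['cosine'].value
+    print(f'{match.text or match.uri}: {similarity:.3f}')
+```
+
+Here the text match comes out on top, since its distance (0.096) is the smallest.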
 
-You don't need to call `client.encode()` explicitly since `client.index()` will handle this for you.
 
 ## Support large-scale dataset
 
@@ -168,10 +180,10 @@ executors:
       jtype: AnnLiteIndexer
       with:
         n_dim: 512
-        workspace: './workspace'
       metas:
         py_modules:
           - annlite.executor
+    workspace: './workspace'
     shards: 5
     polling: {'/index': 'ANY', '/search': 'ALL', '/update': 'ALL', '/delete': 'ALL', '/status': 'ALL'}