From 8d9725fb874d94944cb1129ca2ccc8293c52dc90 Mon Sep 17 00:00:00 2001 From: felix-wang <35718120+numb3r3@users.noreply.github.com> Date: Tue, 13 Sep 2022 21:34:36 +0800 Subject: [PATCH] docs: update clip search (#820) * docs: update subtitle * docs: minor updates * chore: update playgroud * docs: search titler * docs: playground * docs: playground * docs: update client api docs * docs: update client api docs * docs: update client api docs * fix: search limit param * docs: examples * docs: examples * docs: client api * docs: client api * docs: playground * docs: playground --- client/clip_client/client.py | 16 +++---- docs/index.md | 1 + docs/playground/embedding.md | 2 +- docs/playground/searching.md | 10 +++++ docs/user-guides/client.md | 82 +++++++++++++++++++++++++++++++++++ docs/user-guides/retriever.md | 44 ++++++++++++------- 6 files changed, 130 insertions(+), 25 deletions(-) create mode 100644 docs/playground/searching.md diff --git a/client/clip_client/client.py b/client/clip_client/client.py index fc851a6de..77c1a258a 100644 --- a/client/clip_client/client.py +++ b/client/clip_client/client.py @@ -616,7 +616,7 @@ def search( self, content: Iterable[str], *, - limit: int = 20, + limit: int = 10, batch_size: Optional[int] = None, show_progress: bool = False, parameters: Optional[Dict] = None, @@ -639,7 +639,7 @@ def search( self, content: Union['DocumentArray', Iterable['Document']], *, - limit: int = 20, + limit: int = 10, batch_size: Optional[int] = None, show_progress: bool = False, parameters: Optional[dict] = None, @@ -657,7 +657,7 @@ def search( """ ... 
- def search(self, content, **kwargs) -> 'DocumentArray': + def search(self, content, limit: int = 10, **kwargs) -> 'DocumentArray': if isinstance(content, str): raise TypeError( f'content must be an Iterable of [str, Document], try `.search(["{content}"])` instead' @@ -670,7 +670,7 @@ def search(self, content, **kwargs) -> 'DocumentArray': results = DocumentArray() with self._pbar: parameters = kwargs.pop('parameters', {}) - parameters['limit'] = kwargs.get('limit') + parameters['limit'] = limit self._client.post( on='/search', @@ -690,7 +690,7 @@ async def asearch( self, content: Iterator[str], *, - limit: int = 20, + limit: int = 10, batch_size: Optional[int] = None, show_progress: bool = False, parameters: Optional[Dict] = None, @@ -702,14 +702,14 @@ async def asearch( self, content: Union['DocumentArray', Iterable['Document']], *, - limit: int = 20, + limit: int = 10, batch_size: Optional[int] = None, show_progress: bool = False, parameters: Optional[dict] = None, ): ... - async def asearch(self, content, **kwargs): + async def asearch(self, content, limit: int = 10, **kwargs): from rich import filesize self._prepare_streaming( @@ -720,7 +720,7 @@ async def asearch(self, content, **kwargs): with self._pbar: parameters = kwargs.pop('parameters', {}) - parameters['limit'] = kwargs.get('limit') + parameters['limit'] = limit async for da in self._async_client.post( on='/search', diff --git a/docs/index.md b/docs/index.md index 4cec612d6..cf6babb65 100644 --- a/docs/index.md +++ b/docs/index.md @@ -195,6 +195,7 @@ hosting/on-jcloud playground/embedding playground/reasoning +playground/searching ``` diff --git a/docs/playground/embedding.md b/docs/playground/embedding.md index 3c9eccb73..f7b73a29a 100644 --- a/docs/playground/embedding.md +++ b/docs/playground/embedding.md @@ -8,7 +8,7 @@ The model is `ViT-L/14-336px` on one GPU. 
-```{button-link} ../../_static/demo-text-rank.html +```{button-link} ../../_static/demo-embed.html :color: primary :align: center diff --git a/docs/playground/searching.md b/docs/playground/searching.md new file mode 100644 index 000000000..8790c4600 --- /dev/null +++ b/docs/playground/searching.md @@ -0,0 +1,10 @@ +# Text & Image Searching + + + +```{button-link} https://jemmyshin-laion5b-streamlit-streamlit-demo-rddbqz.streamlitapp.com/ +:color: primary +:align: center + +{octicon}`link-external` Open this playground in a new window +``` \ No newline at end of file diff --git a/docs/user-guides/client.md b/docs/user-guides/client.md index 9f136ec3a..f0eedf404 100644 --- a/docs/user-guides/client.md +++ b/docs/user-guides/client.md @@ -334,6 +334,88 @@ Finally, in the return you can observe the matches are re-ranked according to `. [0.9920725226402283, 0.006038925610482693, 0.0009973491542041302, 0.00078492151806131, 0.00010626466246321797]] ``` +(indexing)= +## Indexing + +```{tip} +This feature requires clip_client>=0.7.0 and a server running +a Flow that consists of an encoder and an indexer. +``` + +You can index Documents via {func}`~clip_client.client.Client.index` or {func}`~clip_client.client.Client.aindex`. + +```python +from clip_client import Client +from docarray import Document + +c = Client('grpc://0.0.0.0:23456') + +da = [ + Document(text='she smiled, with pain'), + Document(uri='apple.png'), + Document(uri='apple.png').load_uri_to_image_tensor(), + Document(blob=open('apple.png', 'rb').read()), + Document(uri='https://clip-as-service.jina.ai/_static/favicon.png'), + Document( + uri='' + ), +] + +r = c.index(da) +``` +The returned result is a DocumentArray, so we can get a summary of it. 
+ +```text +╭──────────────────────────── Documents Summary ─────────────────────────────╮ +│ │ +│ Length 6 │ +│ Homogenous Documents False │ +│ 4 Documents have attributes ('id', 'mime_type', 'uri', 'embedding') │ +│ 1 Document has attributes ('id', 'mime_type', 'text', 'embedding') │ +│ 1 Document has attributes ('id', 'embedding') │ +│ │ +╰────────────────────────────────────────────────────────────────────────────╯ +╭────────────────────── Attributes Summary ───────────────────────╮ +│ │ +│ Attribute Data type #Unique values Has empty value │ +│ ───────────────────────────────────────────────────────────── │ +│ embedding ('ndarray',) 6 False │ +│ id ('str',) 6 False │ +│ mime_type ('str',) 5 False │ +│ text ('str',) 2 False │ +│ uri ('str',) 4 False │ +│ │ +╰─────────────────────────────────────────────────────────────────╯ +``` + +The `embedding` is the output of the encoder, which is a 512-dim vector. +Now we can use the indexer to search for the indexed Documents. + + +(searching)= +## Searching + +You can use {func}`~clip_client.client.Client.search` or {func}`~clip_client.client.Client.asearch` +to search for relevant Documents in the index for a given query. + +```python +from clip_client import Client + +c = Client('grpc://0.0.0.0:23456') + +result = c.search(['smile'], limit=2) + + +print(result['@m', ['text', 'scores__cosine']]) +``` + +The results will look like this, the most relevant doc is "she smiled, with pain" with the cosine distance of 0.096. And the apple image has the cosine distance of 0.799. +```text +[['she smiled, with pain', ''], [{'value': 0.09604918956756592}, {'value': 0.7994111776351929}]] +``` +You can set the `limit` parameter (default is `10`) to control the number of the most similar documents to be retrieved. 
+ + (profiling)= ## Profiling diff --git a/docs/user-guides/retriever.md b/docs/user-guides/retriever.md index e9d7ced14..6e5597ab9 100644 --- a/docs/user-guides/retriever.md +++ b/docs/user-guides/retriever.md @@ -1,4 +1,4 @@ -# Search API +# CLIP Search CLIP Search is a search paradigm that uses the CLIP model to encode the text and image documents into a common vector space. @@ -51,16 +51,16 @@ executors: metas: py_modules: - clip_server.executors.clip_torch - + - name: indexer uses: jtype: AnnLiteIndexer with: n_dim: 512 - workspace: './workspace' metas: py_modules: - annlite.executor + workspace: './workspace' ``` ```` @@ -90,10 +90,11 @@ executors: jtype: AnnLiteIndexer with: n_dim: 512 - workspace: './workspace' + limit: 10 metas: py_modules: - annlite.executor + workspace: './workspace' ``` ```` @@ -101,19 +102,22 @@ executors: The first part defines the CLIP model config, which is explained [here](https://clip-as-service.jina.ai/user-guides/server/#clip-model-config). And the second part defines the Annlite indexer config, you can set the following parameters: -| Parameter | Description | -|-----------|-----------------------------------------------------------------------------------------------| -| `n_dim` | The dimension of the vector space. It should be the same as the dimension of the CLIP model. | +| Parameter | Description | +|-----------|----------------------------------------------------------------------------------------------| +| `n_dim` | The dimension of the vector space. It should be the same as the dimension of the CLIP model. | +| `limit` | The number of the most relevant documents to be retrieved. The default value is 10. | And the `workspace` parameter is the path to the workspace directory, which is used to store the index files. -## Connect from client +## Index and search documents ```{tip} You will need to install client first in Python 3.7+: `pip install clip-client>=0.7.0`. 
``` -To connect to the server, you can use the following code: +### Index Documents + +To index image or text documents in the CLIP search server, you can use the client function {func}`~clip_client.client.Client.index`: ```python from clip_client import Client @@ -121,7 +125,6 @@ from docarray import Document client = Client('grpc://0.0.0.0:61000') -# index client.index( [ Document(text='she smiled, with pain'), @@ -129,18 +132,27 @@ client.index( Document(uri='https://clip-as-service.jina.ai/_static/favicon.png'), ] ) +``` + +You don't need to call `client.encode()` explicitly since `client.index()` will handle this for you. + -# search -client.search(['smile']) +### Search Documents + +Then, you can use the client function {func}`~clip_client.client.Client.search` to search for similar documents: + +```python +result = client.search(['smile'], limit=2) + +print(result['@m', ['text', 'scores__cosine']]) ``` The results will look like this, the most relevant doc is "she smiled, with pain" with the cosine distance of 0.096. And the apple image has the cosine distance of 0.799. ```text -she smiled, with pain defaultdict(, {'cosine': {'value': 0.09604912996292114}}) -defaultdict(, {'cosine': {'value': 0.7994112372398376}}) +[['she smiled, with pain', ''], [{'value': 0.09604918956756592}, {'value': 0.7994111776351929}]] ``` +You can set the `limit` parameter (default is `10`) to control the number of the most similar documents to be retrieved. -You don't need to call `client.encode()` explicitly since `client.index()` will handle this for you. ## Support large-scale dataset @@ -168,10 +180,10 @@ executors: jtype: AnnLiteIndexer with: n_dim: 512 - workspace: './workspace' metas: py_modules: - annlite.executor + workspace: './workspace' shards: 5 polling: {'/index': 'ANY', '/search': 'ALL', '/update': 'ALL', '/delete': 'ALL', '/status': 'ALL'}