Skip to content

Commit

Permalink
feat: support bm25 milvus function (#33)
Browse files Browse the repository at this point in the history
This PR introduced some major refactors:
- Introduce the abstract class `BaseMilvusBuiltInFunction`, which is a
light wrapper of [Milvus
Function](https://milvus.io/docs/manage-collections.md#Function).
- Introduce `BM25BuiltInFunction`, extended from
`BaseMilvusBuiltInFunction`, which includes the Milvus
`FunctionType.BM25` settings and the configs of the Milvus analyzer. We can
use this `BM25BuiltInFunction` to implement [Full text
search](https://milvus.io/docs/full-text-search.md) in Milvus.
- In the future, Milvus will support more built-in Functions that
accept text as input (instead of vectors), so users no longer need to
convert text to embeddings on their end because the server does this
automatically (here is a `FunctionType.TEXTEMBEDDING`
[example](https://github.com/milvus-io/pymilvus/blob/master/examples/text_embedding.py)).
So in the future we can implement more subclasses of
`BaseMilvusBuiltInFunction` to support the text-in functions in Milvus.
- A usage guide is on the way, and there are some use case
examples in the unit test `test_builtin_bm25_function()`. Simply
put, we can pass any customized LangChain embedding functions or
Milvus built-in functions to the `Milvus` class initialization function to
build multiple index fields in Milvus.
Some example use cases look like this:
```python
from langchain_milvus import Milvus, BM25BuiltInFunction
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    builtin_function=BM25BuiltInFunction(
        output_field_names="sparse"
    ),
    #"dense" field is used for similarity search for OpenAI dense embedding, "sparse" field is used for BM25 full-text search
    vector_field=["dense", "sparse"],
    connection_args={
        "uri": URI,
    },
    drop_old=True,
)
```
Or with multiple embedding fields and the BM25 function:
```python
from langchain_voyageai import VoyageAIEmbeddings

embedding = OpenAIEmbeddings()
embedding2 = VoyageAIEmbeddings(model="voyage-3")

vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=[embedding, embedding2],
    builtin_function=BM25BuiltInFunction(
        input_field_names="text",
        output_field_names="sparse"
    ),
    text_field="text",
    vector_field=["dense", "dense2", "sparse"],
    connection_args={
        "uri": URI,
    },
    drop_old=True,
)
```

---------

Signed-off-by: ChengZi <[email protected]>
  • Loading branch information
zc277584121 authored Jan 10, 2025
1 parent b925dac commit 1c13e43
Show file tree
Hide file tree
Showing 9 changed files with 883 additions and 384 deletions.
6 changes: 6 additions & 0 deletions libs/milvus/langchain_milvus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from langchain_milvus.function import (
BaseMilvusBuiltInFunction,
BM25BuiltInFunction,
)
from langchain_milvus.retrievers import (
MilvusCollectionHybridSearchRetriever,
ZillizCloudPipelineRetriever,
Expand All @@ -9,4 +13,6 @@
"Zilliz",
"ZillizCloudPipelineRetriever",
"MilvusCollectionHybridSearchRetriever",
"BaseMilvusBuiltInFunction",
"BM25BuiltInFunction",
]
74 changes: 74 additions & 0 deletions libs/milvus/langchain_milvus/function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import uuid
from abc import ABC
from typing import Any, Dict, List, Optional, Union

from pymilvus import Function, FunctionType

from langchain_milvus.utils.constant import SPARSE_VECTOR_FIELD, TEXT_FIELD


class BaseMilvusBuiltInFunction(ABC):
    """
    Common base for wrappers around a Milvus built-in ``Function``.

    The wrapped object lives in ``self._function``. It starts out as
    ``None`` and is expected to be assigned by a subclass constructor; the
    read-only properties below simply forward to it.

    See:
    https://milvus.io/docs/manage-collections.md#Function
    """

    def __init__(self) -> None:
        # Placeholder until a subclass attaches the concrete Function.
        self._function: Optional[Function] = None

    @property
    def function(self) -> Function:
        """The wrapped ``Function`` (``None`` if none has been assigned yet)."""
        wrapped = self._function
        return wrapped

    @property
    def input_field_names(self) -> Union[str, List[str]]:
        """The ``input_field_names`` attribute of the wrapped function."""
        wrapped = self.function
        return wrapped.input_field_names

    @property
    def output_field_names(self) -> Union[str, List[str]]:
        """The ``output_field_names`` attribute of the wrapped function."""
        wrapped = self.function
        return wrapped.output_field_names

    @property
    def type(self) -> FunctionType:
        """The ``type`` attribute of the wrapped function."""
        wrapped = self.function
        return wrapped.type


class BM25BuiltInFunction(BaseMilvusBuiltInFunction):
    """
    Wrapper that builds a Milvus ``Function`` of type ``FunctionType.BM25``.

    Besides the function itself, the instance keeps the analyzer settings
    that are later turned into schema keyword arguments for the input field.

    See:
    https://milvus.io/docs/full-text-search.md
    """

    def __init__(
        self,
        *,
        input_field_names: str = TEXT_FIELD,
        output_field_names: str = SPARSE_VECTOR_FIELD,
        analyzer_params: Optional[Dict[Any, Any]] = None,
        enable_match: bool = False,
        function_name: Optional[str] = None,
    ):
        """
        Create the BM25 function wrapper. All arguments are keyword-only.

        Args:
            input_field_names: Passed to ``Function`` as ``input_field_names``.
            output_field_names: Passed to ``Function`` as
                ``output_field_names``.
            analyzer_params: Optional analyzer configuration; stored as-is and
                emitted by ``get_input_field_schema_kwargs`` when not ``None``.
            enable_match: Stored as-is and emitted as the ``enable_match``
                schema keyword argument.
            function_name: Name for the Milvus function. When falsy, a name of
                the form ``bm25_function_<8 chars of a uuid4>`` is generated.
        """
        super().__init__()
        # Fall back to a generated name when the caller gave none (or an
        # empty one).
        suffix = str(uuid.uuid4())[:8]
        resolved_name = function_name or f"bm25_function_{suffix}"
        self._function = Function(
            name=resolved_name,
            input_field_names=input_field_names,
            output_field_names=output_field_names,
            function_type=FunctionType.BM25,
        )
        self.analyzer_params: Optional[Dict[Any, Any]] = analyzer_params
        self.enable_match = enable_match

    def get_input_field_schema_kwargs(self) -> dict:
        """
        Build the extra keyword arguments for the input field's schema.

        Returns:
            A dict with ``enable_analyzer`` set to ``True`` and
            ``enable_match`` set to the stored flag; ``analyzer_params`` is
            added only when it is not ``None``.
        """
        schema_kwargs: Dict[Any, Any] = {
            "enable_analyzer": True,
            "enable_match": self.enable_match,
        }
        params = self.analyzer_params
        if params is None:
            return schema_kwargs
        schema_kwargs["analyzer_params"] = params
        return schema_kwargs
4 changes: 4 additions & 0 deletions libs/milvus/langchain_milvus/utils/constant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Shared field-name string constants.
# NOTE(review): presumably the default vector field name — confirm against callers.
VECTOR_FIELD = "vector"
# Default ``output_field_names`` of ``BM25BuiltInFunction``.
SPARSE_VECTOR_FIELD = "sparse"
# Default ``input_field_names`` of ``BM25BuiltInFunction``.
TEXT_FIELD = "text"
# NOTE(review): presumably the default primary-key field name — confirm against callers.
PRIMARY_FIELD = "pk"
Loading

0 comments on commit 1c13e43

Please sign in to comment.