From 781d47c63ffa10f16315d29597b60310c6b4bcca Mon Sep 17 00:00:00 2001 From: Cheney Zhang Date: Thu, 16 Jan 2025 11:34:16 +0800 Subject: [PATCH] fix _sparse_to_dict() misalign bug (#39) Signed-off-by: ChengZi --- libs/milvus/langchain_milvus/function.py | 11 +++++++++++ libs/milvus/langchain_milvus/utils/sparse.py | 18 ++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libs/milvus/langchain_milvus/function.py b/libs/milvus/langchain_milvus/function.py index 4b47682..3469fd0 100644 --- a/libs/milvus/langchain_milvus/function.py +++ b/libs/milvus/langchain_milvus/function.py @@ -52,6 +52,17 @@ def __init__( enable_match: bool = False, function_name: Optional[str] = None, ): + """ + Args: + input_field_names (str): The name of the input field, default is 'text'. + output_field_names (str): The name of the output field, default is 'sparse'. + analyzer_params (Optional[Dict[Any, Any]]): The parameters for the analyzer. + Default is None. See: + https://milvus.io/docs/analyzer-overview.md#Analyzer-Overview + enable_match (bool): Whether to enable match. + function_name (Optional[str]): The name of the function. Default is None, + which means a random name will be generated. + """ super().__init__() if not function_name: function_name = f"bm25_function_{str(uuid.uuid4())[:8]}" diff --git a/libs/milvus/langchain_milvus/utils/sparse.py b/libs/milvus/langchain_milvus/utils/sparse.py index 46d8c38..a8e8e6c 100644 --- a/libs/milvus/langchain_milvus/utils/sparse.py +++ b/libs/milvus/langchain_milvus/utils/sparse.py @@ -20,6 +20,17 @@ def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]: class BM25SparseEmbedding(BaseSparseEmbedding): """Sparse embedding model based on BM25. + **Note: We recommend using the Milvus built-in BM25 function to implement sparse + embedding in your application. + This class is more of a reference because it requires the user to manage the corpus, + which is not practical. The Milvus built-in function solves this problem and makes + the BM25 sparse process easier and less frustrating for users. + For more information, please refer to: + https://milvus.io/docs/full-text-search.md#Full-Text-Search + and + https://github.com/milvus-io/bootcamp/blob/master/bootcamp/tutorials/integration/langchain/full_text_search_with_langchain.ipynb + ** + This class uses the BM25 model in Milvus model to implement sparse vector embedding. This model requires pymilvus[model] to be installed. `pip install pymilvus[model]` @@ -45,9 +56,4 @@ def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]: return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays] def _sparse_to_dict(self, sparse_array: Any) -> Dict[int, float]: - row_indices, col_indices = sparse_array.nonzero() - non_zero_values = sparse_array.data - result_dict = {} - for col_index, value in zip(col_indices, non_zero_values): - result_dict[col_index] = value - return result_dict + return {j: sparse_array[i, j] for i, j in zip(*sparse_array.nonzero())}