Fix _sparse_to_dict() misalignment bug (#39)

Signed-off-by: ChengZi <[email protected]>
langchain-ai · Jan 16, 2025 · 781d47c · 781d47c
1 parent 91c27ad
commit 781d47c
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 6 deletions.
diff --git a/libs/milvus/langchain_milvus/function.py b/libs/milvus/langchain_milvus/function.py
@@ -52,6 +52,17 @@ def __init__(
         enable_match: bool = False,
         function_name: Optional[str] = None,
     ):
+        """
+        Args:
+            input_field_names (str): The name of the input field, default is 'text'.
+            output_field_names (str): The name of the output field, default is 'sparse'.
+            analyzer_params (Optional[Dict[Any, Any]]): The parameters for the analyzer.
+                Default is None. See:
+                https://milvus.io/docs/analyzer-overview.md#Analyzer-Overview
+            enable_match (bool): Whether to enable Milvus text match. Default is False.
+            function_name (Optional[str]): The name of the function. Default is None,
+                which means a random name will be generated.
+        """
         super().__init__()
         if not function_name:
             function_name = f"bm25_function_{str(uuid.uuid4())[:8]}"

diff --git a/libs/milvus/langchain_milvus/utils/sparse.py b/libs/milvus/langchain_milvus/utils/sparse.py
@@ -20,6 +20,17 @@ def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
 class BM25SparseEmbedding(BaseSparseEmbedding):
     """Sparse embedding model based on BM25.
 
+    **Note: We recommend using the Milvus built-in BM25 function to implement sparse
+    embedding in your application.
+    This class serves mainly as a reference: it requires the user to manage the
+    corpus, which is impractical. The Milvus built-in function removes that burden
+    and makes BM25 sparse embedding simpler to use.
+    For more information, please refer to:
+    https://milvus.io/docs/full-text-search.md#Full-Text-Search
+    and
+    https://github.com/milvus-io/bootcamp/blob/master/bootcamp/tutorials/integration/langchain/full_text_search_with_langchain.ipynb
+    **
+
     This class uses the BM25 model in Milvus model to implement sparse vector embedding.
     This model requires pymilvus[model] to be installed.
     `pip install pymilvus[model]`
@@ -45,9 +56,4 @@ def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
         return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays]
 
     def _sparse_to_dict(self, sparse_array: Any) -> Dict[int, float]:
-        row_indices, col_indices = sparse_array.nonzero()
-        non_zero_values = sparse_array.data
-        result_dict = {}
-        for col_index, value in zip(col_indices, non_zero_values):
-            result_dict[col_index] = value
-        return result_dict
+        return {j: sparse_array[i, j] for i, j in zip(*sparse_array.nonzero())}