Skip to content

Commit

Permalink
fix _sparse_to_dict() misalign bug (#39)
Browse files Browse the repository at this point in the history
Signed-off-by: ChengZi <[email protected]>
  • Loading branch information
zc277584121 authored Jan 16, 2025
1 parent 91c27ad commit 781d47c
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 6 deletions.
11 changes: 11 additions & 0 deletions libs/milvus/langchain_milvus/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,17 @@ def __init__(
enable_match: bool = False,
function_name: Optional[str] = None,
):
"""
Args:
input_field_names (str): The name of the input field, default is 'text'.
output_field_names (str): The name of the output field, default is 'sparse'.
analyzer_params (Optional[Dict[Any, Any]]): The parameters for the analyzer.
Default is None. See:
https://milvus.io/docs/analyzer-overview.md#Analyzer-Overview
enable_match (bool): Whether to enable match.
function_name (Optional[str]): The name of the function. Default is None,
which means a random name will be generated.
"""
super().__init__()
if not function_name:
function_name = f"bm25_function_{str(uuid.uuid4())[:8]}"
Expand Down
18 changes: 12 additions & 6 deletions libs/milvus/langchain_milvus/utils/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@ def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
class BM25SparseEmbedding(BaseSparseEmbedding):
"""Sparse embedding model based on BM25.
**Note: We recommend using the Milvus built-in BM25 function to implement sparse
embedding in your application.
This class is more of a reference because it requires the user to manage the corpus,
which is not practical. The Milvus built-in function solves this problem and makes
the BM25 sparse process easier and less frustrating for users.
For more information, please refer to:
https://milvus.io/docs/full-text-search.md#Full-Text-Search
and
https://github.com/milvus-io/bootcamp/blob/master/bootcamp/tutorials/integration/langchain/full_text_search_with_langchain.ipynb
**
This class uses the BM25 model from the Milvus model package to implement sparse vector embedding.
This model requires pymilvus[model] to be installed.
`pip install pymilvus[model]`
Expand All @@ -45,9 +56,4 @@ def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays]

def _sparse_to_dict(self, sparse_array: Any) -> Dict[int, float]:
row_indices, col_indices = sparse_array.nonzero()
non_zero_values = sparse_array.data
result_dict = {}
for col_index, value in zip(col_indices, non_zero_values):
result_dict[col_index] = value
return result_dict
return {j: sparse_array[i, j] for i, j in zip(*sparse_array.nonzero())}

0 comments on commit 781d47c

Please sign in to comment.