From 39f469d4be5332a9116441f50924c3143d50b42d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?=
Date: Mon, 14 Oct 2024 08:37:58 +0200
Subject: [PATCH] feat: add Meilisearch-compatible search engine

The goal of this change is to introduce a search engine that is compatible
with the edx-search API but that uses Meilisearch instead of Elasticsearch.
That way, we can replace one by the other across edx-platform by simply
changing a single SEARCH_ENGINE django setting.

There are a few differences between Meilisearch and Elasticsearch:

1. Filterable attributes must be defined explicitly.
2. No support for datetime objects, which must be converted to timestamps
   (with an extra field to store the timezone).
3. No special characters allowed in the primary key values, so we must hash
   course IDs before we can use them as primary key values.

Note that this PR does not introduce any breaking change. This is an opt-in
engine that anyone is free to use. There is some setup work for every search
feature: see the engine module documentation for more information.

See the corresponding conversation here:
https://github.com/openedx/frontend-app-authoring/issues/1334#issuecomment-2401805382
---
 .../core/djangoapps/content/search/engine.py  | 484 ++++++++++++++++++
 .../content/search/tests/test_engine.py       | 250 +++++++++
 2 files changed, 734 insertions(+)
 create mode 100644 openedx/core/djangoapps/content/search/engine.py
 create mode 100644 openedx/core/djangoapps/content/search/tests/test_engine.py

diff --git a/openedx/core/djangoapps/content/search/engine.py b/openedx/core/djangoapps/content/search/engine.py
new file mode 100644
index 000000000000..4a281f3c50b1
--- /dev/null
+++ b/openedx/core/djangoapps/content/search/engine.py
@@ -0,0 +1,484 @@
+"""
+This is a search engine for Meilisearch. It implements edx-search's SearchEngine
+API, such that it can be set up as a drop-in replacement for the ElasticSearchEngine.
+To switch to this engine, you should run a Meilisearch instance and define the
+following setting:
+
+    SEARCH_ENGINE = "openedx.core.djangoapps.content.search.engine.MeilisearchEngine"
+
+You will then need to create the new indexes by running:
+
+    ./manage.py lms shell -c "from openedx.core.djangoapps.content.search import engine; engine.create_indexes()"
+
+For more information about the Meilisearch API in Python, check
+https://github.com/meilisearch/meilisearch-python
+
+When implementing a new index, you might discover that you need to list explicit
+filterable fields. Typically, you try to index new documents, and Meilisearch fails
+with the following response:
+
+    meilisearch.errors.MeilisearchApiError: MeilisearchApiError. Error code: invalid_search_filter.
+    Error message: Attribute `field3` is not filterable. Available filterable attributes are:
+    `field1 field2 _pk`.
+
+In such cases, the filterable field should be added to INDEX_FILTERABLES below. You
+should then run the `create_indexes()` function again, as indicated above.
+
+This search engine was tested for the following indexes:
+
+1. course_info ("course discovery"):
+   - Enable the course discovery feature: FEATURES["ENABLE_COURSE_DISCOVERY"] = True
+   - A search bar appears on the LMS landing page.
+   - Content is automatically indexed every time a course is edited in the Studio.
+2. courseware_content ("courseware search"):
+   - Enable the courseware search waffle flag:
+
+     ./manage.py lms waffle_flag --create --everyone courseware.mfe_courseware_search
+
+   - Enable the following feature flags:
+
+     FEATURES["ENABLE_COURSEWARE_INDEX"] = True
+     FEATURES["ENABLE_COURSEWARE_SEARCH"] = True
+
+   - Reindex courseware content by running: ./manage.py cms reindex_course --active
+   - Alternatively, reindex course content by clicking the "Reindex" button in the Studio.
+   - In the learning MFE, a course search bar appears when opening a course.
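+
+Connection settings are read from Django settings (see `get_meilisearch_client` and
+`get_meilisearch_index_name` below). A minimal sketch of a development
+configuration; the values here are illustrative, not defaults:
+
+    MEILISEARCH_URL = "http://localhost:7700"
+    MEILISEARCH_API_KEY = "my-api-key"
+    MEILISEARCH_INDEX_PREFIX = "dev_"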
+"""
+
+from copy import deepcopy
+from datetime import datetime
+import hashlib
+import json
+import logging
+import typing as t
+
+import meilisearch
+
+from django.conf import settings
+from django.utils import timezone
+
+from search.search_engine_base import SearchEngine
+from search.utils import ValueRange
+
+
+logger = logging.getLogger(__name__)
+
+PRIMARY_KEY_FIELD_NAME = "_pk"
+UTC_OFFSET_SUFFIX = "__utcoffset"
+
+
+# In Meilisearch, we need to explicitly list fields for which we expect to define
+# filters and aggregation functions.
+# This is different from Elasticsearch, where we can aggregate results over any field.
+# Here, we list facet fields per index.
+# Reference: https://www.meilisearch.com/docs/learn/filtering_and_sorting/search_with_facet_filters
+# Note that index names are hard-coded here, because they are hard-coded anyway
+# across all of edx-search.
+INDEX_FILTERABLES = {
+    "course_info": [
+        "language",  # aggregate by language, mode, org
+        "modes",
+        "org",
+        "catalog_visibility",  # exclude visibility="none"
+        "enrollment_end",  # include only enrollable courses
+    ],
+    "courseware_content": [
+        PRIMARY_KEY_FIELD_NAME,  # exclude some specific documents based on ID
+        "course",  # search courseware content by course
+        "org",  # used during indexing
+        "start_date",  # limit search to started courses
+    ],
+}
+
+
+class MeilisearchEngine(SearchEngine):
+    """
+    Meilisearch-compatible search engine. We work very hard to produce an output that is
+    compliant with edx-search's ElasticSearchEngine.
+    """
+
+    def __init__(self, index=None):
+        super().__init__(index=index)
+        self.meilisearch_index = get_meilisearch_index(self.index_name)
+
+    @property
+    def meilisearch_index_name(self):
+        """
+        The index UID is its name.
+        """
+        return self.meilisearch_index.uid
+
+    def index(self, sources: list[dict[str, t.Any]], **kwargs):
+        """
+        Index a number of documents, which can be of any type.
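+
+        For instance, a minimal call might look like this (the document fields
+        below are illustrative; only "id" is required by `process_document`):
+
+            search_engine = MeilisearchEngine(index="courseware_content")
+            search_engine.index([{"id": "some-doc-id", "content": {"display_name": "..."}}])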
+        """
+        logger.debug(
+            "Index request: index=%s sources=%s kwargs=%s",
+            self.meilisearch_index_name,
+            sources,
+            kwargs,
+        )
+        self.meilisearch_index.add_documents(
+            [process_document(source) for source in sources],
+            serializer=DocumentEncoder,
+        )
+
+    def search(
+        self,
+        query_string=None,
+        field_dictionary=None,
+        filter_dictionary=None,
+        exclude_dictionary=None,
+        aggregation_terms=None,
+        # exclude_ids=None,  # deprecated
+        # use_field_match=False,  # deprecated
+        log_search_params=False,
+        **kwargs,
+    ):
+        """
+        See meilisearch docs: https://www.meilisearch.com/docs/reference/api/search
+        """
+        opt_params = get_search_params(
+            field_dictionary=field_dictionary,
+            filter_dictionary=filter_dictionary,
+            exclude_dictionary=exclude_dictionary,
+            aggregation_terms=aggregation_terms,
+            **kwargs,
+        )
+        if log_search_params:
+            logger.info(opt_params)
+
+        meilisearch_results = self.meilisearch_index.search(query_string, opt_params)
+        return process_results(meilisearch_results, self.index_name)
+
+    def remove(self, doc_ids, **kwargs):
+        logger.debug(
+            "Remove request: index=%s doc_ids=%s kwargs=%s",
+            self.meilisearch_index_name,
+            doc_ids,
+            kwargs,
+        )
+        if doc_ids:
+            self.meilisearch_index.delete_documents(
+                [id2pk(doc_id) for doc_id in doc_ids]
+            )
+
+
+class DocumentEncoder(json.JSONEncoder):
+    """
+    Custom encoder, useful in particular to encode datetime fields.
+    Ref: https://github.com/meilisearch/meilisearch-python?tab=readme-ov-file#custom-serializer-for-documents-
+    """
+
+    def default(self, o):
+        if isinstance(o, datetime):
+            return str(o)
+        return super().default(o)
+
+
+def create_indexes():
+    """
+    This is an initialization function that creates indexes and makes sure that they
+    support the right faceting.
+    """
+    client = get_meilisearch_client()
+    for index_name, filterables in INDEX_FILTERABLES.items():
+        meilisearch_index_name = get_meilisearch_index_name(index_name)
+        try:
+            index = client.get_index(meilisearch_index_name)
+        except meilisearch.errors.MeilisearchApiError as e:
+            if e.code != "index_not_found":
+                raise
+            client.create_index(
+                meilisearch_index_name, {"primaryKey": PRIMARY_KEY_FIELD_NAME}
+            )
+            # Get the index again
+            index = client.get_index(meilisearch_index_name)
+
+        # Update filterables
+        if filterables and index.get_filterable_attributes() != filterables:
+            index.update_filterable_attributes(filterables)
+
+
+def get_meilisearch_index(index_name: str):
+    """
+    Return a Meilisearch index.
+
+    Note that the index may not exist, in which case it will be created on first
+    insertion. Ideally, the initialisation function `create_indexes` should be run
+    first.
+    """
+    meilisearch_client = get_meilisearch_client()
+    meilisearch_index_name = get_meilisearch_index_name(index_name)
+    return meilisearch_client.index(meilisearch_index_name)
+
+
+def get_meilisearch_client():
+    return meilisearch.Client(
+        settings.MEILISEARCH_URL, api_key=settings.MEILISEARCH_API_KEY
+    )
+
+
+def get_meilisearch_index_name(index_name: str) -> str:
+    """
+    Return the Meilisearch index name associated with a hard-coded index name.
+
+    This is useful for multi-tenant Meilisearch: just define a different prefix for
+    every tenant.
+
+    Usually, Meilisearch API keys are allowed to access only certain index prefixes.
+    Make sure that your API key matches the prefix.
+    """
+    return settings.MEILISEARCH_INDEX_PREFIX + index_name
+
+
+def process_document(doc: dict[str, t.Any]) -> dict[str, t.Any]:
+    """
+    Process a document before indexing.
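+
+    A sketch of the transformation, for a document with a UTC datetime field (the
+    "_pk" value is the SHA-1 hex digest of the "id" field, abbreviated here):
+
+        {"id": "course-v1:A+B+C", "start": datetime(2024, 1, 1, tzinfo=utc)}
+
+    becomes:
+
+        {"_pk": "<sha1>", "id": "course-v1:A+B+C",
+         "start": 1704067200.0, "start__utcoffset": 0}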
+
+    We make a copy to avoid modifying the source document.
+    """
+    processed = {PRIMARY_KEY_FIELD_NAME: id2pk(doc["id"])}
+    for key, value in doc.items():
+        if isinstance(value, timezone.datetime):
+            # Convert datetime objects to timestamps, and store the timezone in a
+            # separate field with a suffix given by UTC_OFFSET_SUFFIX.
+            utcoffset = None
+            if value.tzinfo:
+                utcoffset = value.utcoffset().seconds
+            processed[key] = value.timestamp()
+            processed[f"{key}{UTC_OFFSET_SUFFIX}"] = utcoffset
+        elif isinstance(value, dict):
+            processed[key] = process_document(value)
+        else:
+            # Note that datetime objects nested inside lists are not converted.
+            processed[key] = value
+    return processed
+
+
+def id2pk(value: str) -> str:
+    """
+    Convert a document "id" field into a primary key that is compatible with Meilisearch.
+
+    This step is necessary because the "id" is typically a course id, which includes
+    colon ":" characters, which are not supported by Meilisearch. Source:
+    https://www.meilisearch.com/docs/learn/getting_started/primary_key#formatting-the-document-id
+    """
+    return hashlib.sha1(value.encode()).hexdigest()
+
+
+def get_search_params(
+    field_dictionary=None,
+    filter_dictionary=None,
+    exclude_dictionary=None,
+    aggregation_terms=None,
+    **kwargs,
+) -> dict[str, t.Any]:
+    """
+    Return a dictionary of parameters that should be passed to the Meilisearch client
+    `.search()` method.
+    """
+    params = {"showRankingScore": True}
+
+    # Aggregation
+    if aggregation_terms:
+        params["facets"] = list(aggregation_terms.keys())
+
+    # Exclusion and inclusion filters
+    filters = []
+    if field_dictionary:
+        filters += get_filter_rules(field_dictionary)
+    if filter_dictionary:
+        filters += get_filter_rules(filter_dictionary, optional=True)
+    if exclude_dictionary:
+        filters += get_filter_rules(exclude_dictionary, exclude=True)
+    if filters:
+        params["filter"] = filters
+
+    # Offset/Size
+    if "from_" in kwargs:
+        params["offset"] = kwargs["from_"]
+    if "size" in kwargs:
+        params["limit"] = kwargs["size"]
+
+    return params
+
+
+def get_filter_rules(
+    rule_dict: dict[str, t.Any], exclude: bool = False, optional: bool = False
+) -> list[str]:
+    """
+    Convert an inclusion/exclusion rule dictionary to a list of filter rules.
+    """
+    rules = []
+    for key, value in rule_dict.items():
+        if isinstance(value, list):
+            for v in value:
+                rules.append(
+                    get_filter_rule(key, v, exclude=exclude, optional=optional)
+                )
+        else:
+            rules.append(
+                get_filter_rule(key, value, exclude=exclude, optional=optional)
+            )
+    return rules
+
+
+def get_filter_rule(
+    key: str, value: t.Any, exclude: bool = False, optional: bool = False
+) -> str:
+    """
+    Build a single Meilisearch filter rule.
+
+    See: https://www.meilisearch.com/docs/learn/filtering_and_sorting/filter_expression_reference
+    """
+    prefix = "NOT " if exclude else ""
+    if key == "id":
+        key = PRIMARY_KEY_FIELD_NAME
+        value = id2pk(value)
+    if isinstance(value, str):
+        rule = f'{prefix}{key} = "{value}"'
+    elif isinstance(value, ValueRange):
+        constraints = []
+        lower = value.lower
+        if isinstance(lower, timezone.datetime):
+            lower = lower.timestamp()
+        upper = value.upper
+        if isinstance(upper, timezone.datetime):
+            upper = upper.timestamp()
+        # Note that the following truthiness checks fail when a bound equals 0, but
+        # we are being consistent with the behaviour of the Elasticsearch engine.
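+        # For instance (illustrative), ValueRange(lower=1, upper=2) on the key
+        # "value" yields the rule '(value >= 1 AND value <= 2)'; an optional rule
+        # additionally appends ' OR value NOT EXISTS'.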
+        if lower:
+            constraints.append(f"{key} >= {lower}")
+        if upper:
+            constraints.append(f"{key} <= {upper}")
+        rule = " AND ".join(constraints)
+        if len(constraints) > 1:
+            rule = f"({rule})"
+    else:
+        raise ValueError(f"Unknown value type: {value.__class__}")
+    if optional:
+        rule += f" OR {key} NOT EXISTS"
+    return rule
+
+
+def process_results(results: dict[str, t.Any], index_name: str) -> dict[str, t.Any]:
+    """
+    Convert results produced by Meilisearch into results that are compatible with the
+    edx-search engine API.
+
+    Example input:
+
+        {
+            'hits': [
+                {
+                    '_pk': 'f381d4f1914235c9532576c0861d09b484ade634',
+                    'id': 'course-v1:OpenedX+DemoX+DemoCourse',
+                    ...
+                    "_rankingScore": 0.865,
+                },
+                ...
+            ],
+            'query': 'demo',
+            'processingTimeMs': 0,
+            'limit': 20,
+            'offset': 0,
+            'estimatedTotalHits': 1
+        }
+
+    Example output:
+
+        {
+            'took': 13,
+            'total': 1,
+            'max_score': 0.4001565,
+            'results': [
+                {
+                    '_index': 'course_info',
+                    '_type': '_doc',
+                    '_id': 'course-v1:OpenedX+DemoX+DemoCourse',
+                    '_ignored': ['content.overview.keyword'],  # removed
+                    'data': {
+                        'id': 'course-v1:OpenedX+DemoX+DemoCourse',
+                        'course': 'course-v1:OpenedX+DemoX+DemoCourse',
+                        'content': {
+                            'display_name': 'Open edX Demo Course',
+                            ...
+                        },
+                        'image_url': '/asset-v1:OpenedX+DemoX+DemoCourse+type@asset+block@thumbnail_demox.jpeg',
+                        'start': '2020-01-01T00:00:00+00:00',
+                        ...
+                    },
+                    'score': 0.4001565
+                }
+            ],
+            'aggs': {
+                'modes': {
+                    'terms': {'audit': 1},
+                    'total': 1.0,
+                    'other': 0
+                },
+                'org': {
+                    'terms': {'OpenedX': 1}, 'total': 1.0, 'other': 0
+                },
+                'language': {'terms': {'en': 1}, 'total': 1.0, 'other': 0}
+            }
+        }
+    """
+    # Base
+    processed = {
+        "took": results["processingTimeMs"],
+        "total": results["estimatedTotalHits"],
+        "results": [],
+        "aggs": {},
+    }
+
+    # Hits
+    max_score = 0
+    for result in results["hits"]:
+        result = process_hit(result)
+        score = result.pop("_rankingScore")
+        max_score = max(max_score, score)
+        processed_result = {
+            "_id": result["id"],
+            "_index": index_name,
+            "_type": "_doc",
+            "data": result,
+        }
+        processed["results"].append(processed_result)
+    processed["max_score"] = max_score
+
+    # Aggregates/Facets
+    for facet_name, facet_distribution in results.get("facetDistribution", {}).items():
+        total = sum(facet_distribution.values())
+        processed["aggs"][facet_name] = {
+            "terms": facet_distribution,
+            "total": total,
+            "other": 0,
+        }
+    return processed
+
+
+def process_hit(hit: dict[str, t.Any]) -> dict[str, t.Any]:
+    """
+    Convert a search result back to the Elasticsearch format.
+    """
+    processed = deepcopy(hit)
+
+    # Remove the primary key field
+    try:
+        processed.pop(PRIMARY_KEY_FIELD_NAME)
+    except KeyError:
+        pass
+
+    # Convert datetime fields back to datetime
+    for key, value in hit.items():
+        if key.endswith(UTC_OFFSET_SUFFIX):
+            utcoffset = processed.pop(key)
+            key = key[: -len(UTC_OFFSET_SUFFIX)]
+            timestamp = hit[key]
+            # Note: we check `is not None` such that a UTC offset of 0 is preserved
+            # as an aware datetime, while naive datetimes stay naive.
+            tz = (
+                timezone.get_fixed_timezone(timezone.timedelta(seconds=utcoffset))
+                if utcoffset is not None
+                else None
+            )
+            processed[key] = timezone.datetime.fromtimestamp(timestamp, tz=tz)
+    return processed
diff --git a/openedx/core/djangoapps/content/search/tests/test_engine.py b/openedx/core/djangoapps/content/search/tests/test_engine.py
new file mode 100644
index 000000000000..9a2c652635b5
--- /dev/null
+++ b/openedx/core/djangoapps/content/search/tests/test_engine.py
@@ -0,0 +1,250 @@
+"""
+Tests for the Meilisearch search engine.
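+
+These tests are self-contained: they only exercise the document/result processing
+helpers and do not require a running Meilisearch instance. They can be run with,
+for instance:
+
+    pytest openedx/core/djangoapps/content/search/tests/test_engine.py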
+"""
+
+from datetime import datetime
+
+import django.test
+from django.utils import timezone
+
+from search.utils import DateRange, ValueRange
+from openedx.core.djangoapps.content.search import engine
+
+
+class DocumentEncoderTests(django.test.TestCase):
+    """
+    JSON encoder unit tests.
+    """
+
+    def test_document_encode_without_timezone(self):
+        document = {
+            "date": timezone.datetime(2024, 12, 31, 5, 0, 0),
+        }
+        encoder = engine.DocumentEncoder()
+        encoded = encoder.encode(document)
+        self.assertEqual('{"date": "2024-12-31 05:00:00"}', encoded)
+
+    def test_document_encode_with_timezone(self):
+        document = {
+            "date": timezone.datetime(
+                2024, 12, 31, 5, 0, 0, tzinfo=timezone.get_fixed_timezone(0)
+            ),
+        }
+        encoder = engine.DocumentEncoder()
+        encoded = encoder.encode(document)
+        self.assertEqual('{"date": "2024-12-31 05:00:00+00:00"}', encoded)
+
+
+class EngineTests(django.test.TestCase):
+    """
+    MeilisearchEngine tests.
+    """
+
+    def test_index(self):
+        document = {
+            "id": "abcd",
+            "name": "My name",
+            "title": "My title",
+        }
+        processed = engine.process_document(document)
+
+        # Check that the source document was not modified
+        self.assertNotIn(engine.PRIMARY_KEY_FIELD_NAME, document)
+
+        # Primary key field
+        # can be verified with: echo -n "abcd" | sha1sum
+        self.assertEqual(
+            "81fe8bfe87576c3ecb22426f8e57847382917acf",
+            processed[engine.PRIMARY_KEY_FIELD_NAME],
+        )
+        # Additional fields
+        self.assertEqual("My name", processed["name"])
+        self.assertEqual("My title", processed["title"])
+
+    def test_index_datetime_no_tz(self):
+        # No timezone
+        document = {"id": "1", "dt": timezone.datetime(2024, 1, 1)}
+        processed = engine.process_document(document)
+        self.assertEqual(1704067200.0, processed["dt"])
+        self.assertEqual(None, processed["dt__utcoffset"])
+        # Reverse serialisation
+        reverse = engine.process_hit(processed)
+        self.assertEqual(document, reverse)
+
+    def test_index_datetime_with_tz(self):
+        # With timezone
+        document = {
+            "id": "1",
+            "dt": timezone.datetime(
+                2024,
+                1,
+                1,
+                tzinfo=timezone.get_fixed_timezone(timezone.timedelta(seconds=3600)),
+            ),
+        }
+        processed = engine.process_document(document)
+        self.assertEqual(1704063600.0, processed["dt"])
+        self.assertEqual(3600, processed["dt__utcoffset"])
+        # Reverse serialisation
+        reverse = engine.process_hit(processed)
+        self.assertEqual(document, reverse)
+
+    def test_search(self):
+        meilisearch_results = {
+            "hits": [
+                {
+                    "id": "id1",
+                    engine.PRIMARY_KEY_FIELD_NAME: engine.id2pk("id1"),
+                    "title": "title 1",
+                    "_rankingScore": 0.8,
+                },
+                {
+                    "id": "id2",
+                    engine.PRIMARY_KEY_FIELD_NAME: engine.id2pk("id2"),
+                    "title": "title 2",
+                    "_rankingScore": 0.2,
+                },
+            ],
+            "query": "demo",
+            "processingTimeMs": 14,
+            "limit": 20,
+            "offset": 0,
+            "estimatedTotalHits": 2,
+        }
+        processed_results = engine.process_results(meilisearch_results, "index_name")
+        self.assertEqual(14, processed_results["took"])
+        self.assertEqual(2, processed_results["total"])
+        self.assertEqual(0.8, processed_results["max_score"])
+
+        self.assertEqual(2, len(processed_results["results"]))
+        self.assertEqual(
+            {
+                "_id": "id1",
+                "_index": "index_name",
+                "_type": "_doc",
+                "data": {
+                    "id": "id1",
+                    "title": "title 1",
+                },
+            },
+            processed_results["results"][0],
+        )
+        self.assertEqual(
+            {
+                "_id": "id2",
+                "_index": "index_name",
+                "_type": "_doc",
+                "data": {
+                    "id": "id2",
+                    "title": "title 2",
+                },
+            },
+            processed_results["results"][1],
+        )
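+
+    def test_id2pk_output_is_meilisearch_safe(self):
+        # Illustrative sanity check: primary keys are 40-character hexadecimal
+        # SHA-1 digests, which only contain characters that Meilisearch accepts
+        # in primary key values.
+        pk = engine.id2pk("course-v1:OpenedX+DemoX+DemoCourse")
+        self.assertEqual(40, len(pk))
+        int(pk, 16)  # parses as hexadecimal, i.e. no special characters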
"", + "processingTimeMs": 1, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0, + "facetDistribution": { + "modes": {"audit": 1, "honor": 3}, + "facet2": {"val1": 1, "val2": 2, "val3": 3}, + }, + } + processed_results = engine.process_results(meilisearch_results, "index_name") + aggs = processed_results["aggs"] + self.assertEqual( + { + "terms": {"audit": 1, "honor": 3}, + "total": 4.0, + "other": 0, + }, + aggs["modes"], + ) + + def test_search_params(self): + params = engine.get_search_params() + self.assertTrue(params["showRankingScore"]) + + params = engine.get_search_params(from_=0) + self.assertEqual(0, params["offset"]) + + def test_search_params_exclude_dictionary(self): + # Simple value + params = engine.get_search_params( + exclude_dictionary={"course_visibility": "none"} + ) + self.assertEqual(['NOT course_visibility = "none"'], params["filter"]) + + # Multiple IDs + params = engine.get_search_params(exclude_dictionary={"id": ["1", "2"]}) + self.assertEqual( + [ + f'NOT {engine.PRIMARY_KEY_FIELD_NAME} = "{engine.id2pk("1")}"', + f'NOT {engine.PRIMARY_KEY_FIELD_NAME} = "{engine.id2pk("2")}"', + ], + params["filter"], + ) + + def test_search_params_field_dictionary(self): + params = engine.get_search_params( + field_dictionary={ + "course": "course-v1:testorg+test1+alpha", + "org": "testorg", + } + ) + self.assertEqual( + ['course = "course-v1:testorg+test1+alpha"', 'org = "testorg"'], + params["filter"], + ) + + def test_search_params_filter_dictionary(self): + params = engine.get_search_params(filter_dictionary={"key": "value"}) + self.assertEqual( + ['key = "value" OR key NOT EXISTS'], + params["filter"], + ) + + def test_search_params_value_range(self): + params = engine.get_search_params( + filter_dictionary={"value": ValueRange(lower=1, upper=2)} + ) + self.assertEqual( + ["(value >= 1 AND value <= 2) OR value NOT EXISTS"], + params["filter"], + ) + + params = engine.get_search_params( + filter_dictionary={"value": ValueRange(lower=1)} + ) + self.assertEqual( + ["value >= 1 OR value NOT EXISTS"], + params["filter"], + ) + + def test_search_params_date_range(self): + params = engine.get_search_params( + filter_dictionary={ + "enrollment_end": DateRange( + lower=datetime(2024, 1, 1), upper=datetime(2024, 1, 2) + ) + } + ) + self.assertEqual( + [ + "(enrollment_end >= 1704067200.0 AND enrollment_end <= 1704153600.0) OR enrollment_end NOT EXISTS" + ], + params["filter"], + ) + + params = engine.get_search_params( + filter_dictionary={"enrollment_end": DateRange(lower=datetime(2024, 1, 1))} + ) + self.assertEqual( + ["enrollment_end >= 1704067200.0 OR enrollment_end NOT EXISTS"], + params["filter"], + )