From 00a3f150a737173385e928be09cad3ab6174f34d Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Wed, 6 Jul 2022 11:50:19 -0400 Subject: [PATCH] OpenSearch and GraphQL for multiple source filters Why are these changes being introduced: * Being able to search multiple sources at once is desired Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/RDI-170 How does this address that need: * Creates a method to add "OR" filtering in OpenSearch * Updates GraphQL to expect source as an array of keywords instead of a single string Document any side effects to this change: * GraphQL previously expected Source as a single string and now expects an array of strings. As V2 is not yet launched, this isn't a concern for V2, but may require changes for V1 consumers including Bento which filters to just ArchivesSpace. --- app/graphql/types/query_type.rb | 2 +- app/models/opensearch.rb | 25 +++- .../controllers/graphql_controller_v2_test.rb | 98 ++++++++++++++++ test/models/opensearch_test.rb | 21 +++- .../graphql_v2_filter_multiple_sources.yml | 108 ++++++++++++++++++ .../graphql_v2_filter_single_source.yml | 108 ++++++++++++++++++ 6 files changed, 356 insertions(+), 6 deletions(-) create mode 100644 test/vcr_cassettes/graphql_v2_filter_multiple_sources.yml create mode 100644 test/vcr_cassettes/graphql_v2_filter_single_source.yml diff --git a/app/graphql/types/query_type.rb b/app/graphql/types/query_type.rb index 33ad9b7..c0576f2 100644 --- a/app/graphql/types/query_type.rb +++ b/app/graphql/types/query_type.rb @@ -52,7 +52,7 @@ def record_id(id:) argument :format_facet, [String], required: false, default_value: nil argument :languages_facet, [String], required: false, default_value: nil argument :literary_form_facet, String, required: false, default_value: nil - argument :source_facet, String, required: false, default_value: 'All' + argument :source_facet, [String], required: false, default_value: nil argument :subjects_facet, [String], required: false, default_value: 
nil end else diff --git a/app/models/opensearch.rb b/app/models/opensearch.rb index 39ac149..e9c74c3 100644 --- a/app/models/opensearch.rb +++ b/app/models/opensearch.rb @@ -4,7 +4,7 @@ class Opensearch def search(from, params, client) @params = params - client.search(index: ENV['ELASTICSEARCH_INDEX'], + client.search(index: ENV.fetch('ELASTICSEARCH_INDEX', nil), body: build_query(from)) end @@ -114,7 +114,8 @@ def filters f.push filter_single(@params[:literary_form_facet], 'literary_form') if @params[:literary_form_facet] - f.push filter_single(@params[:source_facet], 'source') if @params[:source_facet] + f.push filter_sources(@params[:source_facet]) if @params[:source_facet] + f.push filter(@params[:subjects_facet], 'subjects') if @params[:subjects_facet] f end @@ -155,6 +156,26 @@ def filter_single(param, field) } end + def filter_sources(param) + { + bool: { + should: source_array(param) + } + } + end + + def source_array(param) + sources = [] + param.each do |source| + sources << { + term: { + source: source + } + } + end + sources + end + # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html def aggregations { diff --git a/test/controllers/graphql_controller_v2_test.rb b/test/controllers/graphql_controller_v2_test.rb index 254cc90..d47fecb 100644 --- a/test/controllers/graphql_controller_v2_test.rb +++ b/test/controllers/graphql_controller_v2_test.rb @@ -314,4 +314,102 @@ def setup json = JSON.parse(response.body) assert(json['errors'].first['message'].present?) end + + test 'graphqlv2 filter multiple sources' do + VCR.use_cassette('graphql v2 filter multiple sources') do + + # no filters to return all sources. used later to test filters return less than the total. 
+ post '/graphql', params: { query: + '{ + search(searchterm: "data") { + hits + aggregations { + source { + key + docCount + } + } + } + }' + } + + json = JSON.parse(response.body) + initial_source_array = json['data']['search']['aggregations']['source'] + + # filtering to 2 sources returns 2 sources + post '/graphql', params: { query: + '{ + search(searchterm: "data", sourceFacet: ["Zenodo", "DSpace@MIT"]) { + hits + aggregations { + source { + key + docCount + } + } + } + }' + } + assert_equal(200, response.status) + + json = JSON.parse(response.body) + filtered_source_array = json['data']['search']['aggregations']['source'] + + assert(initial_source_array.count > filtered_source_array.count) + assert_equal(2, filtered_source_array.count) + + expected_sources = ['zenodo', 'dspace@mit'] + actual_sources = filtered_source_array.map{|source| source["key"]} + assert_equal(expected_sources, actual_sources) + end + end + + test 'graphqlv2 filter single source' do + VCR.use_cassette('graphql v2 filter single source') do + + # no filters to return all sources. used later to test filters return less than the total. 
+ post '/graphql', params: { query: + '{ + search(searchterm: "data") { + hits + aggregations { + source { + key + docCount + } + } + } + }' + } + + json = JSON.parse(response.body) + initial_source_array = json['data']['search']['aggregations']['source'] + + # filtering to 1 sources returns 1 source + post '/graphql', params: { query: + '{ + search(searchterm: "data", sourceFacet: ["DSpace@MIT"]) { + hits + aggregations { + source { + key + docCount + } + } + } + }' + } + assert_equal(200, response.status) + + json = JSON.parse(response.body) + filtered_source_array = json['data']['search']['aggregations']['source'] + + assert(initial_source_array.count > filtered_source_array.count) + assert_equal(1, filtered_source_array.count) + + expected_sources = ['dspace@mit'] + actual_sources = filtered_source_array.map{|source| source["key"]} + assert_equal(expected_sources, actual_sources) + end + end end diff --git a/test/models/opensearch_test.rb b/test/models/opensearch_test.rb index 32d3cb7..55f5311 100644 --- a/test/models/opensearch_test.rb +++ b/test/models/opensearch_test.rb @@ -72,17 +72,32 @@ class OpensearchTest < ActiveSupport::TestCase VCR.use_cassette('opensearch single field nested') do params = { contributors: 'mcternan' } results = Opensearch.new.search(0, params, Timdex::OSClient) - assert_equal "A common table : 80 recipes and stories from my shared cultures /", + assert_equal 'A common table : 80 recipes and stories from my shared cultures /', results['hits']['hits'].first['_source']['title'] end end test 'searches multiple fields' do VCR.use_cassette('opensearch multiple fields') do - params = { q: 'chinese', title: 'common', contributors: 'mcternan'} + params = { q: 'chinese', title: 'common', contributors: 'mcternan' } results = Opensearch.new.search(0, params, Timdex::OSClient) - assert_equal "A common table : 80 recipes and stories from my shared cultures /", + assert_equal 'A common table : 80 recipes and stories from my shared cultures /', 
results['hits']['hits'].first['_source']['title'] end end + + test 'source_array creates correct query structure' do + sources = ['Zenodo', 'DSpace@MIT'] + expected = [{ term: { source: 'Zenodo' } }, { term: { source: 'DSpace@MIT' } }] + + assert_equal(expected, Opensearch.new.source_array(sources)) + end + + test 'filter_sources creates correct query structure' do + sources = ['Zenodo', 'DSpace@MIT'] + expected = { bool: { should: [{ term: { source: 'Zenodo' } }, + { term: { source: 'DSpace@MIT' } }] } } + + assert_equal(expected, Opensearch.new.filter_sources(sources)) + end end diff --git a/test/vcr_cassettes/graphql_v2_filter_multiple_sources.yml b/test/vcr_cassettes/graphql_v2_filter_multiple_sources.yml new file mode 100644 index 0000000..9a11daf --- /dev/null +++ b/test/vcr_cassettes/graphql_v2_filter_multiple_sources.yml @@ -0,0 +1,108 @@ +--- +http_interactions: +- request: + method: get + uri: http://localhost:9200/ + body: + encoding: US-ASCII + string: '' + headers: + User-Agent: + - 'opensearch-ruby/2.0.0 (RUBY_VERSION: 2.7.6; darwin x86_64; Faraday v1.10.0)' + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Content-Length: + - '350' + body: + encoding: ASCII-8BIT + string: | + { + "name" : "aa481ae9e939", + "cluster_name" : "docker-cluster", + "cluster_uuid" : "ktiriADZSI2Vohg8EZdVfA", + "version" : { + "distribution" : "opensearch", + "number" : "1.3.0", + "build_type" : "tar", + "build_hash" : "e45991597c86ba1bbcc36ee1dfdc165197a913af", + "build_date" : "2022-03-15T19:07:30.455415Z", + "build_snapshot" : false, + "lucene_version" : "8.10.1", + "minimum_wire_compatibility_version" : "6.8.0", + "minimum_index_compatibility_version" : "6.0.0-beta1" + }, + "tagline" : "The OpenSearch Project: https://opensearch.org/" + } + recorded_at: Wed, 06 Jul 2022 15:35:56 GMT +- 
request: + method: post + uri: http://localhost:9200/timdex-prod/_search + body: + encoding: UTF-8 + string: '{"from":"0","size":20,"query":{"bool":{"should":[{"prefix":{"title.exact_value":{"value":"data","boost":15.0}}},{"term":{"title":{"value":"data","boost":1.0}}},{"nested":{"path":"contributors","query":{"term":{"contributors.value":{"value":"data","boost":0.1}}}}}],"must":[{"multi_match":{"query":"data"}}],"filter":[]}},"highlight":{"pre_tags":["\u003cspan + class=\"highlight\"\u003e"],"post_tags":["\u003c/span\u003e"],"fields":{"*":{}}},"aggregations":{"collections":{"terms":{"field":"collections.keyword"}},"contributors":{"nested":{"path":"contributors"},"aggs":{"contributor_names":{"terms":{"field":"contributors.value.keyword"}}}},"content_type":{"terms":{"field":"content_type"}},"content_format":{"terms":{"field":"format"}},"languages":{"terms":{"field":"languages.keyword"}},"literary_form":{"terms":{"field":"literary_form"}},"source":{"terms":{"field":"source"}},"subjects":{"nested":{"path":"subjects"},"aggs":{"subject_names":{"terms":{"field":"subjects.value.keyword"}}}}}}' + headers: + User-Agent: + - 'opensearch-ruby/2.0.0 (RUBY_VERSION: 2.7.6; darwin x86_64; Faraday v1.10.0)' + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Content-Length: + - '17634' + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 06 Jul 2022 15:35:56 GMT +- request: + method: post + uri: http://localhost:9200/timdex-prod/_search + body: + encoding: UTF-8 + string: 
'{"from":"0","size":20,"query":{"bool":{"should":[{"prefix":{"title.exact_value":{"value":"data","boost":15.0}}},{"term":{"title":{"value":"data","boost":1.0}}},{"nested":{"path":"contributors","query":{"term":{"contributors.value":{"value":"data","boost":0.1}}}}}],"must":[{"multi_match":{"query":"data"}}],"filter":[{"bool":{"should":[{"term":{"source":"Zenodo"}},{"term":{"source":"DSpace@MIT"}}]}}]}},"highlight":{"pre_tags":["\u003cspan + class=\"highlight\"\u003e"],"post_tags":["\u003c/span\u003e"],"fields":{"*":{}}},"aggregations":{"collections":{"terms":{"field":"collections.keyword"}},"contributors":{"nested":{"path":"contributors"},"aggs":{"contributor_names":{"terms":{"field":"contributors.value.keyword"}}}},"content_type":{"terms":{"field":"content_type"}},"content_format":{"terms":{"field":"format"}},"languages":{"terms":{"field":"languages.keyword"}},"literary_form":{"terms":{"field":"literary_form"}},"source":{"terms":{"field":"source"}},"subjects":{"nested":{"path":"subjects"},"aggs":{"subject_names":{"terms":{"field":"subjects.value.keyword"}}}}}}' + headers: + User-Agent: + - 'opensearch-ruby/2.0.0 (RUBY_VERSION: 2.7.6; darwin x86_64; Faraday v1.10.0)' + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Content-Length: + - '16146' + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 06 Jul 2022 15:35:56 GMT +recorded_with: VCR 6.1.0 diff --git a/test/vcr_cassettes/graphql_v2_filter_single_source.yml b/test/vcr_cassettes/graphql_v2_filter_single_source.yml new file mode 100644 index 0000000..62579fb --- /dev/null +++ b/test/vcr_cassettes/graphql_v2_filter_single_source.yml @@ -0,0 +1,108 @@ +--- +http_interactions: +- request: + method: get + uri: http://localhost:9200/ + body: + encoding: US-ASCII + string: '' + headers: + User-Agent: + - 
'opensearch-ruby/2.0.0 (RUBY_VERSION: 2.7.6; darwin x86_64; Faraday v1.10.0)' + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Content-Length: + - '350' + body: + encoding: ASCII-8BIT + string: | + { + "name" : "aa481ae9e939", + "cluster_name" : "docker-cluster", + "cluster_uuid" : "ktiriADZSI2Vohg8EZdVfA", + "version" : { + "distribution" : "opensearch", + "number" : "1.3.0", + "build_type" : "tar", + "build_hash" : "e45991597c86ba1bbcc36ee1dfdc165197a913af", + "build_date" : "2022-03-15T19:07:30.455415Z", + "build_snapshot" : false, + "lucene_version" : "8.10.1", + "minimum_wire_compatibility_version" : "6.8.0", + "minimum_index_compatibility_version" : "6.0.0-beta1" + }, + "tagline" : "The OpenSearch Project: https://opensearch.org/" + } + recorded_at: Wed, 06 Jul 2022 15:49:13 GMT +- request: + method: post + uri: http://localhost:9200/timdex-prod/_search + body: + encoding: UTF-8 + string: '{"from":"0","size":20,"query":{"bool":{"should":[{"prefix":{"title.exact_value":{"value":"data","boost":15.0}}},{"term":{"title":{"value":"data","boost":1.0}}},{"nested":{"path":"contributors","query":{"term":{"contributors.value":{"value":"data","boost":0.1}}}}}],"must":[{"multi_match":{"query":"data"}}],"filter":[]}},"highlight":{"pre_tags":["\u003cspan + 
class=\"highlight\"\u003e"],"post_tags":["\u003c/span\u003e"],"fields":{"*":{}}},"aggregations":{"collections":{"terms":{"field":"collections.keyword"}},"contributors":{"nested":{"path":"contributors"},"aggs":{"contributor_names":{"terms":{"field":"contributors.value.keyword"}}}},"content_type":{"terms":{"field":"content_type"}},"content_format":{"terms":{"field":"format"}},"languages":{"terms":{"field":"languages.keyword"}},"literary_form":{"terms":{"field":"literary_form"}},"source":{"terms":{"field":"source"}},"subjects":{"nested":{"path":"subjects"},"aggs":{"subject_names":{"terms":{"field":"subjects.value.keyword"}}}}}}' + headers: + User-Agent: + - 'opensearch-ruby/2.0.0 (RUBY_VERSION: 2.7.6; darwin x86_64; Faraday v1.10.0)' + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Content-Length: + - '17634' + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 06 Jul 2022 15:49:14 GMT +- request: + method: post + uri: http://localhost:9200/timdex-prod/_search + body: + encoding: UTF-8 + string: '{"from":"0","size":20,"query":{"bool":{"should":[{"prefix":{"title.exact_value":{"value":"data","boost":15.0}}},{"term":{"title":{"value":"data","boost":1.0}}},{"nested":{"path":"contributors","query":{"term":{"contributors.value":{"value":"data","boost":0.1}}}}}],"must":[{"multi_match":{"query":"data"}}],"filter":[{"bool":{"should":[{"term":{"source":"DSpace@MIT"}}]}}]}},"highlight":{"pre_tags":["\u003cspan + 
class=\"highlight\"\u003e"],"post_tags":["\u003c/span\u003e"],"fields":{"*":{}}},"aggregations":{"collections":{"terms":{"field":"collections.keyword"}},"contributors":{"nested":{"path":"contributors"},"aggs":{"contributor_names":{"terms":{"field":"contributors.value.keyword"}}}},"content_type":{"terms":{"field":"content_type"}},"content_format":{"terms":{"field":"format"}},"languages":{"terms":{"field":"languages.keyword"}},"literary_form":{"terms":{"field":"literary_form"}},"source":{"terms":{"field":"source"}},"subjects":{"nested":{"path":"subjects"},"aggs":{"subject_names":{"terms":{"field":"subjects.value.keyword"}}}}}}' + headers: + User-Agent: + - 'opensearch-ruby/2.0.0 (RUBY_VERSION: 2.7.6; darwin x86_64; Faraday v1.10.0)' + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Content-Length: + - '11087' + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Wed, 06 Jul 2022 15:49:14 GMT +recorded_with: VCR 6.1.0