From 609a63f6d3ddbfe32e1e6e3d6d663048a3f95242 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Thu, 17 Jun 2021 13:18:56 -0700 Subject: [PATCH 1/7] LUCENE-10008: Respect ignoreCase flag in CommonGramsFilterFactory CommonGramsFilterFactory should respect the ignoreCase flag passed in args even when the default stop word set is used. --- .../commongrams/CommonGramsFilterFactory.java | 2 +- .../TestCommonGramsFilterFactory.java | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java index ccf26bde1ab6..c2f292d027a4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java @@ -76,7 +76,7 @@ public void inform(ResourceLoader loader) throws IOException { commonWords = getWordSet(loader, commonWordFiles, ignoreCase); } } else { - commonWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET; + commonWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java index ea274a5f8448..3fffe56d3c1f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java @@ -98,6 +98,22 @@ public void testDefaults() throws Exception { stream, new String[] {"testing", "testing_the", "the", "the_factory", "factory"}); } + /** Test that ignoreCase flag is honored when no words are provided and default 
stopwords are used. */ + public void testIgnoreCase() throws Exception { + ResourceLoader loader = new ClasspathResourceLoader(getClass()); + CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams", + Version.LATEST, loader, "ignoreCase", "true"); + CharArraySet words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue(words.contains("the")); + assertTrue(words.contains("The")); + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(new StringReader("testing The factory")); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents( + stream, new String[] {"testing", "testing_The", "The", "The_factory", "factory"}); + } + /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { IllegalArgumentException expected = From 6131ed12137acb234ac0f01d2acddd78b282dc26 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Thu, 17 Jun 2021 14:32:55 -0700 Subject: [PATCH 2/7] LUCENE-10008: Styling fixes from precommit check --- .../commongrams/TestCommonGramsFilterFactory.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java index 3fffe56d3c1f..1c76568b798c 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java @@ -98,11 +98,14 @@ public void testDefaults() throws Exception { stream, new String[] {"testing", "testing_the", "the", "the_factory", "factory"}); } - /** Test that ignoreCase flag is honored when no words are provided and default stopwords are used. 
*/ + /** + * Test that ignoreCase flag is honored when no words are provided and default stopwords are used. + * */ public void testIgnoreCase() throws Exception { ResourceLoader loader = new ClasspathResourceLoader(getClass()); - CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams", - Version.LATEST, loader, "ignoreCase", "true"); + CommonGramsFilterFactory factory = + (CommonGramsFilterFactory) + tokenFilterFactory("CommonGrams", Version.LATEST, loader, "ignoreCase", "true"); CharArraySet words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue(words.contains("the")); From eeb1fe34327222c89cca0f1c3a2d7dff996ff044 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Thu, 17 Jun 2021 14:35:43 -0700 Subject: [PATCH 3/7] Spotless violations fix --- .../analysis/commongrams/TestCommonGramsFilterFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java index 1c76568b798c..57fbd3bb931a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java @@ -100,7 +100,7 @@ public void testDefaults() throws Exception { /** * Test that ignoreCase flag is honored when no words are provided and default stopwords are used. 
- * */ + */ public void testIgnoreCase() throws Exception { ResourceLoader loader = new ClasspathResourceLoader(getClass()); CommonGramsFilterFactory factory = From 17e93a6a93d9d601b989a381a08be2077087395c Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 18 Jun 2021 19:58:39 -0700 Subject: [PATCH 4/7] Add common base class for Common/Stop/KeepWords filter factories --- .../commongrams/CommonGramsFilterFactory.java | 40 +----- .../analysis/core/StopFilterFactory.java | 48 +------- .../en/AbstractWordsFileFilterFactory.java | 115 ++++++++++++++++++ .../miscellaneous/KeepWordFilterFactory.java | 33 +---- .../TestCommonGramsFilterFactory.java | 14 +-- .../lucene/analysis/commongrams/common-1.txt | 17 +++ .../lucene/analysis/commongrams/common-2.txt | 17 +++ .../analysis/commongrams/common-snowball.txt | 10 ++ .../miscellaneous/TestKeepFilterFactory.java | 20 +++ .../analysis/miscellaneous/keep-snowball.txt | 10 ++ 10 files changed, 207 insertions(+), 117 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java index c2f292d027a4..b8d623e08650 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java 
@@ -16,14 +16,11 @@ */ package org.apache.lucene.analysis.commongrams; -import java.io.IOException; import java.util.Map; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.util.ResourceLoader; +import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; import org.apache.lucene.util.ResourceLoaderAware; /** @@ -40,26 +37,14 @@ * @since 3.1 * @lucene.spi {@value #NAME} */ -public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { +public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware { /** SPI name */ public static final String NAME = "commonGrams"; - // TODO: shared base class for Stop/Keep/CommonGrams? - private CharArraySet commonWords; - private final String commonWordFiles; - private final String format; - private final boolean ignoreCase; - /** Creates a new CommonGramsFilterFactory */ public CommonGramsFilterFactory(Map args) { super(args); - commonWordFiles = get(args, "words"); - format = get(args, "format"); - ignoreCase = getBoolean(args, "ignoreCase", false); - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } } /** Default ctor for compatibility with SPI */ @@ -67,30 +52,13 @@ public CommonGramsFilterFactory() { throw defaultCtorException(); } - @Override - public void inform(ResourceLoader loader) throws IOException { - if (commonWordFiles != null) { - if ("snowball".equalsIgnoreCase(format)) { - commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase); - } else { - commonWords = getWordSet(loader, commonWordFiles, ignoreCase); - } - } else { - commonWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); - } - } - - public boolean isIgnoreCase() { 
- return ignoreCase; - } - public CharArraySet getCommonWords() { - return commonWords; + return getWords(); } @Override public TokenFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords); + CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords()); return commonGrams; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java index c2b46b2a1cc4..1bbc45fd120c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java @@ -16,14 +16,12 @@ */ package org.apache.lucene.analysis.core; -import java.io.IOException; import java.util.Map; import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.util.ResourceLoader; import org.apache.lucene.util.ResourceLoaderAware; /** @@ -65,28 +63,14 @@ * @since 3.1 * @lucene.spi {@value #NAME} */ -public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { +public class StopFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware { /** SPI name */ public static final String NAME = "stop"; - public static final String FORMAT_WORDSET = "wordset"; - public static final String FORMAT_SNOWBALL = "snowball"; - - private CharArraySet stopWords; - private final String stopWordFiles; - private final String format; - private final boolean ignoreCase; - /** Creates a new StopFilterFactory */ public StopFilterFactory(Map args) { super(args); - stopWordFiles = get(args, 
"words"); - format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET)); - ignoreCase = getBoolean(args, "ignoreCase", false); - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } } /** Default ctor for compatibility with SPI */ @@ -94,37 +78,13 @@ public StopFilterFactory() { throw defaultCtorException(); } - @Override - public void inform(ResourceLoader loader) throws IOException { - if (stopWordFiles != null) { - if (FORMAT_WORDSET.equalsIgnoreCase(format)) { - stopWords = getWordSet(loader, stopWordFiles, ignoreCase); - } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) { - stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase); - } else { - throw new IllegalArgumentException( - "Unknown 'format' specified for 'words' file: " + format); - } - } else { - if (null != format) { - throw new IllegalArgumentException( - "'format' can not be specified w/o an explicit 'words' file: " + format); - } - stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); - } - } - - public boolean isIgnoreCase() { - return ignoreCase; - } - public CharArraySet getStopWords() { - return stopWords; + return getWords(); } @Override public TokenStream create(TokenStream input) { - StopFilter stopFilter = new StopFilter(input, stopWords); + StopFilter stopFilter = new StopFilter(input, getWords()); return stopFilter; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java new file mode 100644 index 000000000000..08c0857c6967 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java @@ -0,0 +1,115 @@ +package org.apache.lucene.analysis.en; + +import java.io.IOException; +import java.util.Map; +import org.apache.lucene.analysis.CharArraySet; +import 
org.apache.lucene.analysis.TokenFilterFactory; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.util.ResourceLoader; +import org.apache.lucene.util.ResourceLoaderAware; + +/** + * Abstract parent class for analysis factories that load their word set from a words file. + * + *

Concrete implementations can leverage the following input attributes. + * All attributes are optional: + * + *

    + *
  • ignoreCase defaults to false + *
  • words should be the name of a word-list file to parse; if not specified, the + * factory uses the set returned by {@link #createDefaultWords()}, which subclasses may override. + * The base implementation returns a copy of {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET} + *
  • format defines how the words file will be parsed, and defaults to + * wordset. If words is not specified, then format must + * not be specified. + *
+ * + *

The valid values for the format option are: + * + *

    + *
  • wordset - This is the default format, which supports one word per line + * (including any intra-word whitespace) and allows whole line comments beginning with the "#" + * character. Blank lines are ignored. See {@link WordlistLoader#getLines + * WordlistLoader.getLines} for details. + *
  • snowball - This format allows for multiple words specified on each line, and + * trailing comments may be specified using the vertical line ("|"). Blank lines are + * ignored. See {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet} + * for details. + *
+ */ +public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + + public static final String FORMAT_WORDSET = "wordset"; + public static final String FORMAT_SNOWBALL = "snowball"; + + private CharArraySet words; + private final String wordFiles; + private final String format; + private final boolean ignoreCase; + + /** + * Default ctor for compatibility with SPI + */ + protected AbstractWordsFileFilterFactory() { + throw defaultCtorException(); + } + + /** + * Initialize this factory via a set of key-value pairs. + */ + public AbstractWordsFileFilterFactory(Map args) { + super(args); + wordFiles = get(args, "words"); + format = get(args, "format", (null == wordFiles ? null : FORMAT_WORDSET)); + ignoreCase = getBoolean(args, "ignoreCase", false); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + /** + * Initialize the set of stopwords provided via ResourceLoader, or using defaults. + */ + @Override + public void inform(ResourceLoader loader) throws IOException { + if (wordFiles != null) { + if (FORMAT_WORDSET.equalsIgnoreCase(format)) { + words = getWordSet(loader, wordFiles, ignoreCase); + } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) { + words = getSnowballWordSet(loader, wordFiles, ignoreCase); + } else { + throw new IllegalArgumentException( + "Unknown 'format' specified for 'words' file: " + format); + } + } else { + if (null != format) { + throw new IllegalArgumentException( + "'format' can not be specified w/o an explicit 'words' file: " + format); + } + words = createDefaultWords(); + } + } + + /** + * Default word set implementation. 
+ */ + protected CharArraySet createDefaultWords() { + return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); + } + + public CharArraySet getWords() { + return words; + } + + public String getWordFiles() { + return wordFiles; + } + + public String getFormat() { + return format; + } + + public boolean isIgnoreCase() { + return ignoreCase; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java index 9286a6a4d8b3..8ee91767d070 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java @@ -16,12 +16,10 @@ */ package org.apache.lucene.analysis.miscellaneous; -import java.io.IOException; import java.util.Map; import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.util.ResourceLoader; +import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; import org.apache.lucene.util.ResourceLoaderAware; /** @@ -38,23 +36,14 @@ * @since 3.1 * @lucene.spi {@value #NAME} */ -public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { +public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware { /** SPI name */ public static final String NAME = "keepWord"; - private final boolean ignoreCase; - private final String wordFiles; - private CharArraySet words; - /** Creates a new KeepWordFilterFactory */ public KeepWordFilterFactory(Map args) { super(args); - wordFiles = get(args, "words"); - ignoreCase = getBoolean(args, "ignoreCase", false); - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } } 
/** Default ctor for compatibility with SPI */ @@ -63,27 +52,17 @@ public KeepWordFilterFactory() { } @Override - public void inform(ResourceLoader loader) throws IOException { - if (wordFiles != null) { - words = getWordSet(loader, wordFiles, ignoreCase); - } - } - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public CharArraySet getWords() { - return words; + protected CharArraySet createDefaultWords() { + return null; } @Override public TokenStream create(TokenStream input) { // if the set is null, it means it was empty - if (words == null) { + if (getWords() == null) { return input; } else { - final TokenStream filter = new KeepWordFilter(input, words); + final TokenStream filter = new KeepWordFilter(input, getWords()); return filter; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java index 57fbd3bb931a..f48dd98046b2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java @@ -22,25 +22,19 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.TestStopFilterFactory; import org.apache.lucene.util.ClasspathResourceLoader; import org.apache.lucene.util.ResourceLoader; import org.apache.lucene.util.Version; -/** - * Tests pretty much copied from StopFilterFactoryTest We use the test files used by the - * StopFilterFactoryTest TODO: consider creating separate test files so this won't break if stop - * filter test files change - */ public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase { public void testInform() throws Exception { - ResourceLoader loader 
= new ClasspathResourceLoader(TestStopFilterFactory.class); + ResourceLoader loader = new ClasspathResourceLoader(getClass()); assertTrue("loader is null and it shouldn't be", loader != null); CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory( - "CommonGrams", Version.LATEST, loader, "words", "stop-1.txt", "ignoreCase", "true"); + "CommonGrams", Version.LATEST, loader, "words", "common-1.txt", "ignoreCase", "true"); CharArraySet words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); @@ -53,7 +47,7 @@ public void testInform() throws Exception { Version.LATEST, loader, "words", - "stop-1.txt, stop-2.txt", + "common-1.txt, common-2.txt", "ignoreCase", "true"); words = factory.getCommonWords(); @@ -68,7 +62,7 @@ public void testInform() throws Exception { Version.LATEST, loader, "words", - "stop-snowball.txt", + "common-snowball.txt", "format", "snowball", "ignoreCase", diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt new file mode 100644 index 000000000000..8dfe80902d26 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +foo +bar \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt new file mode 100644 index 000000000000..646b7ff4ddba --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +junk +more \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt new file mode 100644 index 000000000000..1c0c6f51142a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt @@ -0,0 +1,10 @@ + | This is a file in snowball format, empty lines are ignored, '|' is a comment + | Additionally, multiple words can be on the same line, allowing stopwords to be + | arranged in tables (useful in some languages where they might inflect) + + | fictitious table below + +|third person singular +|Subject Object Possessive Reflexive +he him his himself| masculine +she her hers herself| feminine diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java index 524d25b504cc..b95c1c3a4977 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java @@ -39,6 +39,26 @@ public void testInform() throws Exception { words = factory.getWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); + + factory = + (KeepWordFilterFactory) + tokenFilterFactory( + "KeepWord", "words", "keep-snowball.txt", "format", "snowball", "ignoreCase", "true"); + words = factory.getWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); + assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + 
assertTrue(words.contains("herself")); + + // defaults + factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord"); + assertTrue(factory.getWords() == null); + assertEquals(false, factory.isIgnoreCase()); } /** Test that bogus arguments result in exception */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt new file mode 100644 index 000000000000..1c0c6f51142a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt @@ -0,0 +1,10 @@ + | This is a file in snowball format, empty lines are ignored, '|' is a comment + | Additionally, multiple words can be on the same line, allowing stopwords to be + | arranged in tables (useful in some languages where they might inflect) + + | fictitious table below + +|third person singular +|Subject Object Possessive Reflexive +he him his himself| masculine +she her hers herself| feminine From 31fff407ea7e7555b70bbdf8422b2f1c502317ee Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 18 Jun 2021 20:02:37 -0700 Subject: [PATCH 5/7] Linting errors --- .../lucene/analysis/commongrams/CommonGramsFilterFactory.java | 3 +-- .../org/apache/lucene/analysis/core/StopFilterFactory.java | 3 +-- .../lucene/analysis/miscellaneous/KeepWordFilterFactory.java | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java index b8d623e08650..09f1d293bf78 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java @@ -21,7 +21,6 @@ import org.apache.lucene.analysis.TokenFilter; 
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; -import org.apache.lucene.util.ResourceLoaderAware; /** * Constructs a {@link CommonGramsFilter}. @@ -37,7 +36,7 @@ * @since 3.1 * @lucene.spi {@value #NAME} */ -public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware { +public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory { /** SPI name */ public static final String NAME = "commonGrams"; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java index 1bbc45fd120c..3018e87a9bc1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java @@ -22,7 +22,6 @@ import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.util.ResourceLoaderAware; /** * Factory for {@link StopFilter}. 
@@ -63,7 +62,7 @@ * @since 3.1 * @lucene.spi {@value #NAME} */ -public class StopFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware { +public class StopFilterFactory extends AbstractWordsFileFilterFactory { /** SPI name */ public static final String NAME = "stop"; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java index 8ee91767d070..324c4ca9a43c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java @@ -20,7 +20,6 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; -import org.apache.lucene.util.ResourceLoaderAware; /** * Factory for {@link KeepWordFilter}. 
@@ -36,7 +35,7 @@ * @since 3.1 * @lucene.spi {@value #NAME} */ -public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware { +public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory { /** SPI name */ public static final String NAME = "keepWord"; From b5d06e3c6facf9f62fef50fb0067db4b222c3a21 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 18 Jun 2021 20:21:27 -0700 Subject: [PATCH 6/7] Add license header --- .../en/AbstractWordsFileFilterFactory.java | 43 +++++++++++-------- .../TestCommonGramsFilterFactory.java | 8 +++- .../miscellaneous/TestKeepFilterFactory.java | 8 +++- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java index 08c0857c6967..8a3278ef9e3b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.analysis.en; import java.io.IOException; @@ -11,14 +27,14 @@ /** * Abstract parent class for analysis factories that accept a stopwords file as input. * - *

Concrete implementations can leverage the following input attributes. - * All attributes are optional: + *

Concrete implementations can leverage the following input attributes. All attributes are + * optional: * *

    *
  • ignoreCase defaults to false *
  • words should be the name of a stopwords file to parse, if not specified the - * factory will use the value provided by {@link #createDefaultWords()} implementation in concrete - * subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET} + * factory will use the value provided by {@link #createDefaultWords()} implementation in + * concrete subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET} *
  • format defines how the words file will be parsed, and defaults to * wordset. If words is not specified, then format must * not be specified. @@ -37,7 +53,8 @@ * for details. *
*/ -public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { +public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory + implements ResourceLoaderAware { public static final String FORMAT_WORDSET = "wordset"; public static final String FORMAT_SNOWBALL = "snowball"; @@ -47,16 +64,12 @@ public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory private final String format; private final boolean ignoreCase; - /** - * Default ctor for compatibility with SPI - */ + /** Default ctor for compatibility with SPI */ protected AbstractWordsFileFilterFactory() { throw defaultCtorException(); } - /** - * Initialize this factory via a set of key-value pairs. - */ + /** Initialize this factory via a set of key-value pairs. */ public AbstractWordsFileFilterFactory(Map args) { super(args); wordFiles = get(args, "words"); @@ -67,9 +80,7 @@ public AbstractWordsFileFilterFactory(Map args) { } } - /** - * Initialize the set of stopwords provided via ResourceLoader, or using defaults. - */ + /** Initialize the set of stopwords provided via ResourceLoader, or using defaults. */ @Override public void inform(ResourceLoader loader) throws IOException { if (wordFiles != null) { @@ -90,9 +101,7 @@ public void inform(ResourceLoader loader) throws IOException { } } - /** - * Default word set implementation. - */ + /** Default word set implementation. 
*/ protected CharArraySet createDefaultWords() { return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java index f48dd98046b2..d93cd6078eb3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java @@ -34,7 +34,13 @@ public void testInform() throws Exception { CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory( - "CommonGrams", Version.LATEST, loader, "words", "common-1.txt", "ignoreCase", "true"); + "CommonGrams", + Version.LATEST, + loader, + "words", + "common-1.txt", + "ignoreCase", + "true"); CharArraySet words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java index b95c1c3a4977..baf6a7117406 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java @@ -43,7 +43,13 @@ public void testInform() throws Exception { factory = (KeepWordFilterFactory) tokenFilterFactory( - "KeepWord", "words", "keep-snowball.txt", "format", "snowball", "ignoreCase", "true"); + "KeepWord", + "words", + "keep-snowball.txt", + "format", + "snowball", + "ignoreCase", + "true"); words = factory.getWords(); assertEquals(8, words.size()); 
assertTrue(words.contains("he")); From f4b055845970b4ebaece0e5cb178aab1662587d9 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Thu, 12 Aug 2021 13:27:56 -0700 Subject: [PATCH 7/7] Move default stop word implementation to concrete subclasses --- .../analysis/commongrams/CommonGramsFilterFactory.java | 6 ++++++ .../org/apache/lucene/analysis/core/StopFilterFactory.java | 5 +++++ .../lucene/analysis/en/AbstractWordsFileFilterFactory.java | 6 ++---- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java index 09f1d293bf78..939b712a75fc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory; +import org.apache.lucene.analysis.en.EnglishAnalyzer; /** * Constructs a {@link CommonGramsFilter}. 
@@ -55,6 +56,11 @@ public CharArraySet getCommonWords() { return getWords(); } + @Override + protected CharArraySet createDefaultWords() { + return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase()); + } + @Override public TokenFilter create(TokenStream input) { CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords()); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java index 3018e87a9bc1..b7ea4615f1df 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java @@ -81,6 +81,11 @@ public CharArraySet getStopWords() { return getWords(); } + @Override + protected CharArraySet createDefaultWords() { + return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase()); + } + @Override public TokenStream create(TokenStream input) { StopFilter stopFilter = new StopFilter(input, getWords()); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java index 8a3278ef9e3b..ec97093dde54 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java @@ -34,7 +34,7 @@ *
  • ignoreCase defaults to false *
  • words should be the name of a stopwords file to parse, if not specified the * factory will use the value provided by {@link #createDefaultWords()} implementation in - * concrete subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET} + * concrete subclass. *
  • format defines how the words file will be parsed, and defaults to * wordset. If words is not specified, then format must * not be specified. @@ -102,9 +102,7 @@ public void inform(ResourceLoader loader) throws IOException { } /** Default word set implementation. */ - protected CharArraySet createDefaultWords() { - return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); - } + protected abstract CharArraySet createDefaultWords(); public CharArraySet getWords() { return words;