From 609a63f6d3ddbfe32e1e6e3d6d663048a3f95242 Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigya.work@gmail.com>
Date: Thu, 17 Jun 2021 13:18:56 -0700
Subject: [PATCH 1/7] LUCENE-10008: Respect ignoreCase flag in
 CommonGramsFilterFactory

CommonGramsFilterFactory should respect the ignoreCase flag passed in args
even when the default stop word set is used.
---
 .../commongrams/CommonGramsFilterFactory.java    |  2 +-
 .../TestCommonGramsFilterFactory.java            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index ccf26bde1ab6..c2f292d027a4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -76,7 +76,7 @@ public void inform(ResourceLoader loader) throws IOException {
         commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
       }
     } else {
-      commonWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
+      commonWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
     }
   }
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index ea274a5f8448..3fffe56d3c1f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -98,6 +98,22 @@ public void testDefaults() throws Exception {
         stream, new String[] {"testing", "testing_the", "the", "the_factory", "factory"});
   }
 
+  /** Test that ignoreCase flag is honored when no words are provided and default stopwords are used. */
+  public void testIgnoreCase() throws Exception {
+    ResourceLoader loader = new ClasspathResourceLoader(getClass());
+    CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams",
+        Version.LATEST, loader, "ignoreCase", "true");
+    CharArraySet words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue(words.contains("the"));
+    assertTrue(words.contains("The"));
+    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    tokenizer.setReader(new StringReader("testing The factory"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(
+        stream, new String[] {"testing", "testing_The", "The", "The_factory", "factory"});
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected =

From 6131ed12137acb234ac0f01d2acddd78b282dc26 Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigya.work@gmail.com>
Date: Thu, 17 Jun 2021 14:32:55 -0700
Subject: [PATCH 2/7] LUCENE-10008: Styling fixes from precommit check

---
 .../commongrams/TestCommonGramsFilterFactory.java        | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index 3fffe56d3c1f..1c76568b798c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -98,11 +98,14 @@ public void testDefaults() throws Exception {
         stream, new String[] {"testing", "testing_the", "the", "the_factory", "factory"});
   }
 
-  /** Test that ignoreCase flag is honored when no words are provided and default stopwords are used. */
+  /**
+   * Test that ignoreCase flag is honored when no words are provided and default stopwords are used.
+   * */
   public void testIgnoreCase() throws Exception {
     ResourceLoader loader = new ClasspathResourceLoader(getClass());
-    CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams",
-        Version.LATEST, loader, "ignoreCase", "true");
+    CommonGramsFilterFactory factory =
+        (CommonGramsFilterFactory)
+            tokenFilterFactory("CommonGrams", Version.LATEST, loader, "ignoreCase", "true");
     CharArraySet words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue(words.contains("the"));

From eeb1fe34327222c89cca0f1c3a2d7dff996ff044 Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigya.work@gmail.com>
Date: Thu, 17 Jun 2021 14:35:43 -0700
Subject: [PATCH 3/7] Spotless violations fix

---
 .../analysis/commongrams/TestCommonGramsFilterFactory.java      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index 1c76568b798c..57fbd3bb931a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -100,7 +100,7 @@ public void testDefaults() throws Exception {
 
   /**
    * Test that ignoreCase flag is honored when no words are provided and default stopwords are used.
-   * */
+   */
   public void testIgnoreCase() throws Exception {
     ResourceLoader loader = new ClasspathResourceLoader(getClass());
     CommonGramsFilterFactory factory =

From 17e93a6a93d9d601b989a381a08be2077087395c Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigya.work@gmail.com>
Date: Fri, 18 Jun 2021 19:58:39 -0700
Subject: [PATCH 4/7] Add common base class for CommonGrams/Stop/KeepWord
 filter factories

---
 .../commongrams/CommonGramsFilterFactory.java |  40 +-----
 .../analysis/core/StopFilterFactory.java      |  48 +-------
 .../en/AbstractWordsFileFilterFactory.java    | 115 ++++++++++++++++++
 .../miscellaneous/KeepWordFilterFactory.java  |  33 +----
 .../TestCommonGramsFilterFactory.java         |  14 +--
 .../lucene/analysis/commongrams/common-1.txt  |  17 +++
 .../lucene/analysis/commongrams/common-2.txt  |  17 +++
 .../analysis/commongrams/common-snowball.txt  |  10 ++
 .../miscellaneous/TestKeepFilterFactory.java  |  20 +++
 .../analysis/miscellaneous/keep-snowball.txt  |  10 ++
 10 files changed, 207 insertions(+), 117 deletions(-)
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index c2f292d027a4..b8d623e08650 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -16,14 +16,11 @@
  */
 package org.apache.lucene.analysis.commongrams;
 
-import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.util.ResourceLoader;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
@@ -40,26 +37,14 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware {
 
   /** SPI name */
   public static final String NAME = "commonGrams";
 
-  // TODO: shared base class for Stop/Keep/CommonGrams?
-  private CharArraySet commonWords;
-  private final String commonWordFiles;
-  private final String format;
-  private final boolean ignoreCase;
-
   /** Creates a new CommonGramsFilterFactory */
   public CommonGramsFilterFactory(Map<String, String> args) {
     super(args);
-    commonWordFiles = get(args, "words");
-    format = get(args, "format");
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
   }
 
   /** Default ctor for compatibility with SPI */
@@ -67,30 +52,13 @@ public CommonGramsFilterFactory() {
     throw defaultCtorException();
   }
 
-  @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    if (commonWordFiles != null) {
-      if ("snowball".equalsIgnoreCase(format)) {
-        commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
-      } else {
-        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
-      }
-    } else {
-      commonWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
-  }
-
   public CharArraySet getCommonWords() {
-    return commonWords;
+    return getWords();
   }
 
   @Override
   public TokenFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords());
     return commonGrams;
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
index c2b46b2a1cc4..1bbc45fd120c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
@@ -16,14 +16,12 @@
  */
 package org.apache.lucene.analysis.core;
 
-import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.util.ResourceLoader;
 import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
@@ -65,28 +63,14 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class StopFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware {
 
   /** SPI name */
   public static final String NAME = "stop";
 
-  public static final String FORMAT_WORDSET = "wordset";
-  public static final String FORMAT_SNOWBALL = "snowball";
-
-  private CharArraySet stopWords;
-  private final String stopWordFiles;
-  private final String format;
-  private final boolean ignoreCase;
-
   /** Creates a new StopFilterFactory */
   public StopFilterFactory(Map<String, String> args) {
     super(args);
-    stopWordFiles = get(args, "words");
-    format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
   }
 
   /** Default ctor for compatibility with SPI */
@@ -94,37 +78,13 @@ public StopFilterFactory() {
     throw defaultCtorException();
   }
 
-  @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    if (stopWordFiles != null) {
-      if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
-        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
-      } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
-        stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
-      } else {
-        throw new IllegalArgumentException(
-            "Unknown 'format' specified for 'words' file: " + format);
-      }
-    } else {
-      if (null != format) {
-        throw new IllegalArgumentException(
-            "'format' can not be specified w/o an explicit 'words' file: " + format);
-      }
-      stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
-  }
-
   public CharArraySet getStopWords() {
-    return stopWords;
+    return getWords();
   }
 
   @Override
   public TokenStream create(TokenStream input) {
-    StopFilter stopFilter = new StopFilter(input, stopWords);
+    StopFilter stopFilter = new StopFilter(input, getWords());
     return stopFilter;
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
new file mode 100644
index 000000000000..08c0857c6967
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
@@ -0,0 +1,115 @@
+package org.apache.lucene.analysis.en;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.util.ResourceLoader;
+import org.apache.lucene.util.ResourceLoaderAware;
+
+/**
+ * Abstract parent class for analysis factories that accept a stopwords file as input.
+ *
+ * <p>Concrete implementations can leverage the following input attributes.
+ * All attributes are optional:
+ *
+ * <ul>
+ *   <li><code>ignoreCase</code> defaults to <code>false</code>
+ *   <li><code>words</code> should be the name of a stopwords file to parse, if not specified the
+ *       factory will use the value provided by {@link #createDefaultWords()} implementation in concrete
+ *       subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}
+ *   <li><code>format</code> defines how the <code>words</code> file will be parsed, and defaults to
+ *       <code>wordset</code>. If <code>words</code> is not specified, then <code>format</code> must
+ *       not be specified.
+ * </ul>
+ *
+ * <p>The valid values for the <code>format</code> option are:
+ *
+ * <ul>
+ *   <li><code>wordset</code> - This is the default format, which supports one word per line
+ *       (including any intra-word whitespace) and allows whole line comments beginning with the "#"
+ *       character. Blank lines are ignored. See {@link WordlistLoader#getLines
+ *       WordlistLoader.getLines} for details.
+ *   <li><code>snowball</code> - This format allows for multiple words specified on each line, and
+ *       trailing comments may be specified using the vertical line ("&#124;"). Blank lines are
+ *       ignored. See {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
+ *       for details.
+ * </ul>
+ */
+public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+
+  public static final String FORMAT_WORDSET = "wordset";
+  public static final String FORMAT_SNOWBALL = "snowball";
+
+  private CharArraySet words;
+  private final String wordFiles;
+  private final String format;
+  private final boolean ignoreCase;
+
+  /**
+   * Default ctor for compatibility with SPI
+   */
+  protected AbstractWordsFileFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /**
+   * Initialize this factory via a set of key-value pairs.
+   */
+  public AbstractWordsFileFilterFactory(Map<String, String> args) {
+    super(args);
+    wordFiles = get(args, "words");
+    format = get(args, "format", (null == wordFiles ? null : FORMAT_WORDSET));
+    ignoreCase = getBoolean(args, "ignoreCase", false);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Initialize the set of stopwords provided via ResourceLoader, or using defaults.
+   */
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if (wordFiles != null) {
+      if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
+        words = getWordSet(loader, wordFiles, ignoreCase);
+      } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
+        words = getSnowballWordSet(loader, wordFiles, ignoreCase);
+      } else {
+        throw new IllegalArgumentException(
+            "Unknown 'format' specified for 'words' file: " + format);
+      }
+    } else {
+      if (null != format) {
+        throw new IllegalArgumentException(
+            "'format' can not be specified w/o an explicit 'words' file: " + format);
+      }
+      words = createDefaultWords();
+    }
+  }
+
+  /**
+   * Default word set implementation.
+   */
+  protected CharArraySet createDefaultWords() {
+    return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
+  }
+
+  public CharArraySet getWords() {
+    return words;
+  }
+
+  public String getWordFiles() {
+    return wordFiles;
+  }
+
+  public String getFormat() {
+    return format;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
index 9286a6a4d8b3..8ee91767d070 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
@@ -16,12 +16,10 @@
  */
 package org.apache.lucene.analysis.miscellaneous;
 
-import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.util.ResourceLoader;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
@@ -38,23 +36,14 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware {
 
   /** SPI name */
   public static final String NAME = "keepWord";
 
-  private final boolean ignoreCase;
-  private final String wordFiles;
-  private CharArraySet words;
-
   /** Creates a new KeepWordFilterFactory */
   public KeepWordFilterFactory(Map<String, String> args) {
     super(args);
-    wordFiles = get(args, "words");
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
   }
 
   /** Default ctor for compatibility with SPI */
@@ -63,27 +52,17 @@ public KeepWordFilterFactory() {
   }
 
   @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    if (wordFiles != null) {
-      words = getWordSet(loader, wordFiles, ignoreCase);
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
-  }
-
-  public CharArraySet getWords() {
-    return words;
+  protected CharArraySet createDefaultWords() {
+    return null;
   }
 
   @Override
   public TokenStream create(TokenStream input) {
     // if the set is null, it means it was empty
-    if (words == null) {
+    if (getWords() == null) {
       return input;
     } else {
-      final TokenStream filter = new KeepWordFilter(input, words);
+      final TokenStream filter = new KeepWordFilter(input, getWords());
       return filter;
     }
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index 57fbd3bb931a..f48dd98046b2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -22,25 +22,19 @@
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.TestStopFilterFactory;
 import org.apache.lucene.util.ClasspathResourceLoader;
 import org.apache.lucene.util.ResourceLoader;
 import org.apache.lucene.util.Version;
 
-/**
- * Tests pretty much copied from StopFilterFactoryTest We use the test files used by the
- * StopFilterFactoryTest TODO: consider creating separate test files so this won't break if stop
- * filter test files change
- */
 public class TestCommonGramsFilterFactory extends BaseTokenStreamFactoryTestCase {
 
   public void testInform() throws Exception {
-    ResourceLoader loader = new ClasspathResourceLoader(TestStopFilterFactory.class);
+    ResourceLoader loader = new ClasspathResourceLoader(getClass());
     assertTrue("loader is null and it shouldn't be", loader != null);
     CommonGramsFilterFactory factory =
         (CommonGramsFilterFactory)
             tokenFilterFactory(
-                "CommonGrams", Version.LATEST, loader, "words", "stop-1.txt", "ignoreCase", "true");
+                "CommonGrams", Version.LATEST, loader, "words", "common-1.txt", "ignoreCase", "true");
     CharArraySet words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
@@ -53,7 +47,7 @@ public void testInform() throws Exception {
                 Version.LATEST,
                 loader,
                 "words",
-                "stop-1.txt, stop-2.txt",
+                "common-1.txt, common-2.txt",
                 "ignoreCase",
                 "true");
     words = factory.getCommonWords();
@@ -68,7 +62,7 @@ public void testInform() throws Exception {
                 Version.LATEST,
                 loader,
                 "words",
-                "stop-snowball.txt",
+                "common-snowball.txt",
                 "format",
                 "snowball",
                 "ignoreCase",
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt
new file mode 100644
index 000000000000..8dfe80902d26
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-1.txt
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+foo
+bar
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt
new file mode 100644
index 000000000000..646b7ff4ddba
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-2.txt
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+junk
+more
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt
new file mode 100644
index 000000000000..1c0c6f51142a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/common-snowball.txt
@@ -0,0 +1,10 @@
+ | This is a file in snowball format, empty lines are ignored, '|' is a comment
+ | Additionally, multiple words can be on the same line, allowing stopwords to be
+ | arranged in tables (useful in some languages where they might inflect)
+
+ | fictitious table below
+
+|third person singular
+|Subject Object Possessive Reflexive
+he       him    his        himself| masculine
+she      her    hers       herself| feminine
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
index 524d25b504cc..b95c1c3a4977 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
@@ -39,6 +39,26 @@ public void testInform() throws Exception {
     words = factory.getWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
+
+    factory =
+        (KeepWordFilterFactory)
+            tokenFilterFactory(
+                "KeepWord", "words", "keep-snowball.txt", "format", "snowball", "ignoreCase", "true");
+    words = factory.getWords();
+    assertEquals(8, words.size());
+    assertTrue(words.contains("he"));
+    assertTrue(words.contains("him"));
+    assertTrue(words.contains("his"));
+    assertTrue(words.contains("himself"));
+    assertTrue(words.contains("she"));
+    assertTrue(words.contains("her"));
+    assertTrue(words.contains("hers"));
+    assertTrue(words.contains("herself"));
+
+    // defaults
+    factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord");
+    assertTrue(factory.getWords() == null);
+    assertEquals(false, factory.isIgnoreCase());
   }
 
   /** Test that bogus arguments result in exception */
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt
new file mode 100644
index 000000000000..1c0c6f51142a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-snowball.txt
@@ -0,0 +1,10 @@
+ | This is a file in snowball format, empty lines are ignored, '|' is a comment
+ | Additionally, multiple words can be on the same line, allowing stopwords to be
+ | arranged in tables (useful in some languages where they might inflect)
+
+ | fictitious table below
+
+|third person singular
+|Subject Object Possessive Reflexive
+he       him    his        himself| masculine
+she      her    hers       herself| feminine

From 31fff407ea7e7555b70bbdf8422b2f1c502317ee Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigya.work@gmail.com>
Date: Fri, 18 Jun 2021 20:02:37 -0700
Subject: [PATCH 5/7] Fix lint errors: remove redundant ResourceLoaderAware

---
 .../lucene/analysis/commongrams/CommonGramsFilterFactory.java  | 3 +--
 .../org/apache/lucene/analysis/core/StopFilterFactory.java     | 3 +--
 .../lucene/analysis/miscellaneous/KeepWordFilterFactory.java   | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index b8d623e08650..09f1d293bf78 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -21,7 +21,6 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
-import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
  * Constructs a {@link CommonGramsFilter}.
@@ -37,7 +36,7 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware {
+public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory {
 
   /** SPI name */
   public static final String NAME = "commonGrams";
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
index 1bbc45fd120c..3018e87a9bc1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
@@ -22,7 +22,6 @@
 import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
  * Factory for {@link StopFilter}.
@@ -63,7 +62,7 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class StopFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware {
+public class StopFilterFactory extends AbstractWordsFileFilterFactory {
 
   /** SPI name */
   public static final String NAME = "stop";
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
index 8ee91767d070..324c4ca9a43c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
@@ -20,7 +20,6 @@
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
-import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
  * Factory for {@link KeepWordFilter}.
@@ -36,7 +35,7 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory implements ResourceLoaderAware {
+public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory {
 
   /** SPI name */
   public static final String NAME = "keepWord";

From b5d06e3c6facf9f62fef50fb0067db4b222c3a21 Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigya.work@gmail.com>
Date: Fri, 18 Jun 2021 20:21:27 -0700
Subject: [PATCH 6/7] Add license header and apply code formatting

---
 .../en/AbstractWordsFileFilterFactory.java    | 43 +++++++++++--------
 .../TestCommonGramsFilterFactory.java         |  8 +++-
 .../miscellaneous/TestKeepFilterFactory.java  |  8 +++-
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
index 08c0857c6967..8a3278ef9e3b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.lucene.analysis.en;
 
 import java.io.IOException;
@@ -11,14 +27,14 @@
 /**
  * Abstract parent class for analysis factories that accept a stopwords file as input.
  *
- * <p>Concrete implementations can leverage the following input attributes.
- * All attributes are optional:
+ * <p>Concrete implementations can leverage the following input attributes. All attributes are
+ * optional:
  *
  * <ul>
  *   <li><code>ignoreCase</code> defaults to <code>false</code>
  *   <li><code>words</code> should be the name of a stopwords file to parse, if not specified the
- *       factory will use the value provided by {@link #createDefaultWords()} implementation in concrete
- *       subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}
+ *       factory will use the value provided by {@link #createDefaultWords()} implementation in
+ *       concrete subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}
  *   <li><code>format</code> defines how the <code>words</code> file will be parsed, and defaults to
  *       <code>wordset</code>. If <code>words</code> is not specified, then <code>format</code> must
  *       not be specified.
@@ -37,7 +53,8 @@
  *       for details.
  * </ul>
  */
-public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory
+    implements ResourceLoaderAware {
 
   public static final String FORMAT_WORDSET = "wordset";
   public static final String FORMAT_SNOWBALL = "snowball";
@@ -47,16 +64,12 @@ public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory
   private final String format;
   private final boolean ignoreCase;
 
-  /**
-   * Default ctor for compatibility with SPI
-   */
+  /** Default constructor for compatibility with SPI; always throws. */
   protected AbstractWordsFileFilterFactory() {
     throw defaultCtorException();
   }
 
-  /**
-   * Initialize this factory via a set of key-value pairs.
-   */
+  /** Initialize this factory via a set of key-value pairs. */
   public AbstractWordsFileFilterFactory(Map<String, String> args) {
     super(args);
     wordFiles = get(args, "words");
@@ -67,9 +80,7 @@ public AbstractWordsFileFilterFactory(Map<String, String> args) {
     }
   }
 
-  /**
-   * Initialize the set of stopwords provided via ResourceLoader, or using defaults.
-   */
+  /** Initialize the set of stopwords provided via ResourceLoader, or using defaults. */
   @Override
   public void inform(ResourceLoader loader) throws IOException {
     if (wordFiles != null) {
@@ -90,9 +101,7 @@ public void inform(ResourceLoader loader) throws IOException {
     }
   }
 
-  /**
-   * Default word set implementation.
-   */
+  /** Default word set implementation. */
   protected CharArraySet createDefaultWords() {
     return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
index f48dd98046b2..d93cd6078eb3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java
@@ -34,7 +34,13 @@ public void testInform() throws Exception {
     CommonGramsFilterFactory factory =
         (CommonGramsFilterFactory)
             tokenFilterFactory(
-                "CommonGrams", Version.LATEST, loader, "words", "common-1.txt", "ignoreCase", "true");
+                "CommonGrams",
+                Version.LATEST,
+                loader,
+                "words",
+                "common-1.txt",
+                "ignoreCase",
+                "true");
     CharArraySet words = factory.getCommonWords();
     assertTrue("words is null and it shouldn't be", words != null);
     assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
index b95c1c3a4977..baf6a7117406 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java
@@ -43,7 +43,13 @@ public void testInform() throws Exception {
     factory =
         (KeepWordFilterFactory)
             tokenFilterFactory(
-                "KeepWord", "words", "keep-snowball.txt", "format", "snowball", "ignoreCase", "true");
+                "KeepWord",
+                "words",
+                "keep-snowball.txt",
+                "format",
+                "snowball",
+                "ignoreCase",
+                "true");
     words = factory.getWords();
     assertEquals(8, words.size());
     assertTrue(words.contains("he"));

From f4b055845970b4ebaece0e5cb178aab1662587d9 Mon Sep 17 00:00:00 2001
From: Vigya Sharma <vigyaspeaks@gmail.com>
Date: Thu, 12 Aug 2021 13:27:56 -0700
Subject: [PATCH 7/7] Move default stop word implementation to concrete
 subclasses

---
 .../analysis/commongrams/CommonGramsFilterFactory.java      | 6 ++++++
 .../org/apache/lucene/analysis/core/StopFilterFactory.java  | 5 +++++
 .../lucene/analysis/en/AbstractWordsFileFilterFactory.java  | 6 ++----
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
index 09f1d293bf78..939b712a75fc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
 
 /**
  * Constructs a {@link CommonGramsFilter}.
@@ -55,6 +56,11 @@ public CharArraySet getCommonWords() {
     return getWords();
   }
 
+  @Override
+  protected CharArraySet createDefaultWords() {
+    return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
+  }
+
   @Override
   public TokenFilter create(TokenStream input) {
     CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords());
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
index 3018e87a9bc1..b7ea4615f1df 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
@@ -81,6 +81,11 @@ public CharArraySet getStopWords() {
     return getWords();
   }
 
+  @Override
+  protected CharArraySet createDefaultWords() {
+    return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
+  }
+
   @Override
   public TokenStream create(TokenStream input) {
     StopFilter stopFilter = new StopFilter(input, getWords());
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
index 8a3278ef9e3b..ec97093dde54 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
@@ -34,7 +34,7 @@
  *   <li><code>ignoreCase</code> defaults to <code>false</code>
  *   <li><code>words</code> should be the name of a stopwords file to parse, if not specified the
  *       factory will use the value provided by {@link #createDefaultWords()} implementation in
- *       concrete subclass. Defaults to {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}
+ *       concrete subclass.
  *   <li><code>format</code> defines how the <code>words</code> file will be parsed, and defaults to
  *       <code>wordset</code>. If <code>words</code> is not specified, then <code>format</code> must
  *       not be specified.
@@ -102,9 +102,7 @@ public void inform(ResourceLoader loader) throws IOException {
   }
 
   /** Default word set implementation. */
-  protected CharArraySet createDefaultWords() {
-    return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
-  }
+  protected abstract CharArraySet createDefaultWords();
 
   public CharArraySet getWords() {
     return words;