Lucene-10008: Respect ignoreCase flag in CommonGramsFilterFactory and factor out a common abstract base class AbstractWordsFileFilterFactory.java (#188)
apache · Aug 13, 2021 · cb4c8ae · cb4c8ae
1 parent 624560a
commit cb4c8ae
Show file tree

Hide file tree

Showing 10 changed files with 253 additions and 117 deletions.
diff --git a/...ysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java b/...ysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
@@ -16,15 +16,12 @@
  */
 package org.apache.lucene.analysis.commongrams;
 
-import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.util.ResourceLoader;
-import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
  * Constructs a {@link CommonGramsFilter}.
@@ -40,57 +37,33 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class CommonGramsFilterFactory extends AbstractWordsFileFilterFactory {
 
   /** SPI name */
   public static final String NAME = "commonGrams";
 
-  // TODO: shared base class for Stop/Keep/CommonGrams?
-  private CharArraySet commonWords;
-  private final String commonWordFiles;
-  private final String format;
-  private final boolean ignoreCase;
-
   /** Creates a new CommonGramsFilterFactory */
   public CommonGramsFilterFactory(Map<String, String> args) {
     super(args);
-    commonWordFiles = get(args, "words");
-    format = get(args, "format");
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
   }
 
   /** Default ctor for compatibility with SPI */
   public CommonGramsFilterFactory() {
     throw defaultCtorException();
   }
 
-  @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    if (commonWordFiles != null) {
-      if ("snowball".equalsIgnoreCase(format)) {
-        commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
-      } else {
-        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
-      }
-    } else {
-      commonWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
+  public CharArraySet getCommonWords() {
+    return getWords();
   }
 
-  public CharArraySet getCommonWords() {
-    return commonWords;
+  @Override
+  protected CharArraySet createDefaultWords() {
+    return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
   }
 
   @Override
   public TokenFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(input, getWords());
     return commonGrams;
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
@@ -16,15 +16,12 @@
  */
 package org.apache.lucene.analysis.core;
 
-import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
-import org.apache.lucene.util.ResourceLoader;
-import org.apache.lucene.util.ResourceLoaderAware;
 
 /**
  * Factory for {@link StopFilter}.
@@ -65,66 +62,33 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class StopFilterFactory extends AbstractWordsFileFilterFactory {
 
   /** SPI name */
   public static final String NAME = "stop";
 
-  public static final String FORMAT_WORDSET = "wordset";
-  public static final String FORMAT_SNOWBALL = "snowball";
-
-  private CharArraySet stopWords;
-  private final String stopWordFiles;
-  private final String format;
-  private final boolean ignoreCase;
-
   /** Creates a new StopFilterFactory */
   public StopFilterFactory(Map<String, String> args) {
     super(args);
-    stopWordFiles = get(args, "words");
-    format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
   }
 
   /** Default ctor for compatibility with SPI */
   public StopFilterFactory() {
     throw defaultCtorException();
   }
 
-  @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    if (stopWordFiles != null) {
-      if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
-        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
-      } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
-        stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
-      } else {
-        throw new IllegalArgumentException(
-            "Unknown 'format' specified for 'words' file: " + format);
-      }
-    } else {
-      if (null != format) {
-        throw new IllegalArgumentException(
-            "'format' can not be specified w/o an explicit 'words' file: " + format);
-      }
-      stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
+  public CharArraySet getStopWords() {
+    return getWords();
   }
 
-  public CharArraySet getStopWords() {
-    return stopWords;
+  @Override
+  protected CharArraySet createDefaultWords() {
+    return new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, isIgnoreCase());
   }
 
   @Override
   public TokenStream create(TokenStream input) {
-    StopFilter stopFilter = new StopFilter(input, stopWords);
+    StopFilter stopFilter = new StopFilter(input, getWords());
     return stopFilter;
   }
 }
diff --git a/...nalysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java b/...nalysis/common/src/java/org/apache/lucene/analysis/en/AbstractWordsFileFilterFactory.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.en;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.util.ResourceLoader;
+import org.apache.lucene.util.ResourceLoaderAware;
+
+/**
+ * Abstract parent class for analysis factories that accept a words file (e.g. stopwords) as input.
+ *
+ * <p>Concrete implementations can leverage the following input attributes. All attributes are
+ * optional:
+ *
+ * <ul>
+ *   <li><code>ignoreCase</code> defaults to <code>false</code>
+ *   <li><code>words</code> should be the name of a words file to parse; if not specified, the
+ *       factory will use the value returned by the {@link #createDefaultWords()} implementation
+ *       in the concrete subclass.
+ *   <li><code>format</code> defines how the <code>words</code> file will be parsed, and defaults to
+ *       <code>wordset</code>. If <code>words</code> is not specified, then <code>format</code> must
+ *       not be specified.
+ * </ul>
+ *
+ * <p>The valid values for the <code>format</code> option are:
+ *
+ * <ul>
+ *   <li><code>wordset</code> - This is the default format, which supports one word per line
+ *       (including any intra-word whitespace) and allows whole line comments beginning with the "#"
+ *       character. Blank lines are ignored. See {@link WordlistLoader#getLines
+ *       WordlistLoader.getLines} for details.
+ *   <li><code>snowball</code> - This format allows for multiple words specified on each line, and
+ *       trailing comments may be specified using the vertical line ("&#124;"). Blank lines are
+ *       ignored. See {@link WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet}
+ *       for details.
+ * </ul>
+ */
+public abstract class AbstractWordsFileFilterFactory extends TokenFilterFactory
+    implements ResourceLoaderAware {
+
+  public static final String FORMAT_WORDSET = "wordset"; // loaded with getWordSet(...)
+  public static final String FORMAT_SNOWBALL = "snowball"; // loaded with getSnowballWordSet(...)
+
+  private CharArraySet words; // assigned only by inform()
+  private final String wordFiles; // words attribute; null-checked in inform()
+  private final String format; // format attribute; defaults to FORMAT_WORDSET if words is set
+  private final boolean ignoreCase; // ignoreCase attribute, false unless specified
+
+  /** Default ctor for compatibility with SPI; unconditionally throws defaultCtorException(). */
+  protected AbstractWordsFileFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /** Reads words, format and ignoreCase from args; throws if args is not empty afterwards. */
+  public AbstractWordsFileFilterFactory(Map<String, String> args) {
+    super(args);
+    wordFiles = get(args, "words");
+    format = get(args, "format", (null == wordFiles ? null : FORMAT_WORDSET));
+    ignoreCase = getBoolean(args, "ignoreCase", false);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Loads the word set from the words file(s), or calls createDefaultWords() if none is set. */
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if (wordFiles != null) {
+      if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
+        words = getWordSet(loader, wordFiles, ignoreCase);
+      } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
+        words = getSnowballWordSet(loader, wordFiles, ignoreCase);
+      } else {
+        throw new IllegalArgumentException(
+            "Unknown 'format' specified for 'words' file: " + format);
+      }
+    } else {
+      if (null != format) { // format is only meaningful together with a words file
+        throw new IllegalArgumentException(
+            "'format' can not be specified w/o an explicit 'words' file: " + format);
+      }
+      words = createDefaultWords();
+    }
+  }
+
+  /** Supplies the word set that inform() uses when no words file was configured. */
+  protected abstract CharArraySet createDefaultWords();
+
+  public CharArraySet getWords() { // whatever inform() assigned; not set before that
+    return words;
+  }
+
+  public String getWordFiles() {
+    return wordFiles;
+  }
+
+  public String getFormat() {
+    return format;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+}
diff --git a/...lysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java b/...lysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
@@ -16,13 +16,10 @@
  */
 package org.apache.lucene.analysis.miscellaneous;
 
-import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.util.ResourceLoader;
-import org.apache.lucene.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 
 /**
  * Factory for {@link KeepWordFilter}.
@@ -38,23 +35,14 @@
  * @since 3.1
  * @lucene.spi {@value #NAME}
  */
-public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+public class KeepWordFilterFactory extends AbstractWordsFileFilterFactory {
 
   /** SPI name */
   public static final String NAME = "keepWord";
 
-  private final boolean ignoreCase;
-  private final String wordFiles;
-  private CharArraySet words;
-
   /** Creates a new KeepWordFilterFactory */
   public KeepWordFilterFactory(Map<String, String> args) {
     super(args);
-    wordFiles = get(args, "words");
-    ignoreCase = getBoolean(args, "ignoreCase", false);
-    if (!args.isEmpty()) {
-      throw new IllegalArgumentException("Unknown parameters: " + args);
-    }
   }
 
   /** Default ctor for compatibility with SPI */
@@ -63,27 +51,17 @@ public KeepWordFilterFactory() {
   }
 
   @Override
-  public void inform(ResourceLoader loader) throws IOException {
-    if (wordFiles != null) {
-      words = getWordSet(loader, wordFiles, ignoreCase);
-    }
-  }
-
-  public boolean isIgnoreCase() {
-    return ignoreCase;
-  }
-
-  public CharArraySet getWords() {
-    return words;
+  protected CharArraySet createDefaultWords() {
+    return null;
   }
 
   @Override
   public TokenStream create(TokenStream input) {
     // if the set is null, it means it was empty
-    if (words == null) {
+    if (getWords() == null) {
       return input;
     } else {
-      final TokenStream filter = new KeepWordFilter(input, words);
+      final TokenStream filter = new KeepWordFilter(input, getWords());
       return filter;
     }
   }