Added support for the 'CountVectorizer.token_pattern' attribute. Fixes #95, fixes jpmml/sklearn2pmml#74
jpmml · Jan 12, 2021 · 00ea7b3 · 00ea7b3
1 parent 9bd46fb
commit 00ea7b3
Show file tree

Hide file tree

Showing 12 changed files with 3,068 additions and 2,050 deletions.
diff --git a/src/main/java/sklearn/feature_extraction/text/CountVectorizer.java b/src/main/java/sklearn/feature_extraction/text/CountVectorizer.java
@@ -57,6 +57,7 @@
 import org.jpmml.python.ClassDictUtil;
 import org.jpmml.sklearn.SkLearnEncoder;
 import sklearn.Transformer;
+import sklearn2pmml.feature_extraction.text.Matcher;
 import sklearn2pmml.feature_extraction.text.Tokenizer;
 
 public class CountVectorizer extends Transformer {
@@ -157,6 +158,13 @@ public DefineFunction encodeDefineFunction(Feature feature, SkLearnEncoder encod
 
 		if(stripAccents != null){
 			throw new IllegalArgumentException(stripAccents);
+		} // End if
+
+		if(tokenizer == null){
+			String tokenPattern = getTokenPattern();
+
+			tokenizer = new Matcher()
+				.setWordRE(tokenPattern);
 		}
 
 		ParameterField documentField = new ParameterField(FieldName.create("document"));
@@ -237,7 +245,11 @@ public String getStripAccents(){
 	}
 
 	public Tokenizer getTokenizer(){
-		return get("tokenizer", Tokenizer.class);
+		return getOptional("tokenizer", Tokenizer.class);
+	}
+
+	public String getTokenPattern(){
+		return getString("token_pattern");
 	}
 
 	public Map<String, ?> getVocabulary(){

diff --git a/src/main/java/sklearn2pmml/feature_extraction/text/Matcher.java b/src/main/java/sklearn2pmml/feature_extraction/text/Matcher.java
@@ -46,13 +46,11 @@ public TextIndex configure(TextIndex textIndex){
 	public String formatStopWordsRE(List<String> stopWords){
 		String wordRE = getWordRE();
 
-		if(!("\\w+").equals(wordRE)){
-			throw new IllegalArgumentException(wordRE);
-		}
+		boolean unicode = wordRE.startsWith("(?u)");
 
 		Joiner joiner = Joiner.on("|");
 
-		return "\\b(" + joiner.join(stopWords) + ")\\b";
+		return (unicode ? "(?u)" : "") + "\\b(" + joiner.join(stopWords) + ")\\b";
 	}
 
 	public void __setstate__(String wordRE){

diff --git a/src/test/java/org/jpmml/sklearn/TokenizerTest.java b/src/test/java/org/jpmml/sklearn/TokenizerTest.java
@@ -54,6 +54,11 @@ public void split() throws Exception {
 	@Test
 	public void match() throws Exception {
 		Matcher matcher = new Matcher()
+			.setWordRE("(?u)\\b\\w\\w+\\b");
+
+		evaluate("CountVectorizer", "Sentiment", matcher);
+
+		matcher = new Matcher()
 			.setWordRE("\\w+");
 
 		evaluate("Matcher", "Sentiment", matcher);