-
Notifications
You must be signed in to change notification settings - Fork 641
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FEATURE: Lucene.Net.Analysis.Miscellaneous: Added TypeAsSynonymFilter…
… from Lucene 8.2.0 because it is called out in the docs as part of the process of configuring Lucene.Net.Analysis.OpenNLP. Changed CannedTokenStream to set ITypeAttribute.Type because it is required by the tests for TypeAsSynonymFilter.
- Loading branch information
1 parent
e742bbb
commit 79d4610
Showing
4 changed files
with
222 additions
and
1 deletion.
There are no files selected for viewing
97 changes: 97 additions & 0 deletions
97
src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
// Lucene version compatibility level 8.2.0 | ||
// LUCENENET NOTE: Ported because Lucene.Net.Analysis.OpenNLP requires this to be useful. | ||
using Lucene.Net.Analysis.TokenAttributes; | ||
using Lucene.Net.Util; | ||
#nullable enable | ||
|
||
namespace Lucene.Net.Analysis.Miscellaneous | ||
{ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/// <summary>
/// Token filter that, after each token from the wrapped stream, emits one extra token
/// whose text is that token's <see cref="ITypeAttribute.Type"/> (optionally preceded by
/// a configured prefix) with a position increment of zero, i.e. as a synonym stacked
/// on the same position.
/// </summary>
public sealed class TypeAsSynonymFilter : TokenFilter
{
    private readonly ICharTermAttribute termAtt;
    private readonly ITypeAttribute typeAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private readonly string? prefix;

    // Captured state of the most recently returned input token; non-null while its
    // type synonym is still waiting to be emitted.
    private State? savedToken;

    /// <summary>
    /// Creates a <see cref="TypeAsSynonymFilter"/> over the given token stream
    /// without any prefix on the emitted type tokens.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    public TypeAsSynonymFilter(TokenStream input)
        : this(input, null)
    {
    }

    /// <summary>
    /// Creates a <see cref="TypeAsSynonymFilter"/> over the given token stream,
    /// using the given prefix for the emitted type tokens.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    /// <param name="prefix">String placed in front of every token type emitted as token text.
    /// When <c>null</c>, the type is emitted as-is.</param>
    public TypeAsSynonymFilter(TokenStream input, string? prefix)
        : base(input)
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        this.prefix = prefix;
    }

    /// <summary>
    /// Alternates between passing through the next input token and emitting the
    /// type of the previously returned token at the same position.
    /// </summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> once the input
    /// is exhausted and no type token is pending.</returns>
    public override bool IncrementToken()
    {
        if (savedToken is null)
        {
            // No pending token type to emit: advance the input and remember the
            // token so that its type can be emitted on the following call.
            if (!m_input.IncrementToken())
            {
                return false;
            }

            savedToken = CaptureState();
            return true;
        }

        // Emit the last token's type at the same position.
        RestoreState(savedToken);
        savedToken = null;

        termAtt.SetEmpty();
        if (prefix != null)
        {
            termAtt.Append(prefix);
        }
        termAtt.Append(typeAtt.Type);

        posIncrAtt.PositionIncrement = 0;
        return true;
    }

    /// <summary>
    /// Resets the base filter and discards any pending type token.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        savedToken = null;
    }
}
} |
62 changes: 62 additions & 0 deletions
62
src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilterFactory.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
// Lucene version compatibility level 8.2.0 | ||
// LUCENENET NOTE: Ported because Lucene.Net.Analysis.OpenNLP requires this to be useful. | ||
using Lucene.Net.Analysis.Util; | ||
using System; | ||
using System.Collections.Generic; | ||
#nullable enable | ||
|
||
namespace Lucene.Net.Analysis.Miscellaneous | ||
{ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/// <summary>
/// Factory for <see cref="TypeAsSynonymFilter"/>.
/// <code>
/// &lt;fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.UAX29URLEmailTokenizerFactory"/&gt;
///     &lt;filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" /&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;
/// </code>
///
/// <para/>
/// If the optional <c>prefix</c> parameter is used, the specified value will be prepended
/// to the type, e.g. with prefix = "_type_", for a token "example.com" with type "&lt;URL&gt;",
/// the emitted synonym will have text "_type_&lt;URL&gt;".
/// </summary>
public class TypeAsSynonymFilterFactory : TokenFilterFactory
{
    // Optional prefix handed to every created filter; null when the "prefix"
    // argument was not supplied, hence the nullable annotation.
    private readonly string? prefix;

    /// <summary>
    /// Creates a new <see cref="TypeAsSynonymFilterFactory"/> from the given arguments.
    /// </summary>
    /// <param name="args">Factory arguments. The only argument read here is the
    /// optional <c>prefix</c>.</param>
    /// <exception cref="ArgumentException">Thrown if <paramref name="args"/> still
    /// contains entries after the base constructor has run and <c>prefix</c> has
    /// been read; those entries are reported as unknown parameters.</exception>
    public TypeAsSynonymFilterFactory(IDictionary<string, string> args)
        : base(args)
    {
        prefix = Get(args, "prefix"); // default value is null
        if (args.Count > 0)
        {
            throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
        }
    }

    /// <summary>
    /// Wraps <paramref name="input"/> in a <see cref="TypeAsSynonymFilter"/>
    /// configured with this factory's prefix.
    /// </summary>
    /// <param name="input">Token stream to wrap.</param>
    /// <returns>The new <see cref="TypeAsSynonymFilter"/>.</returns>
    public override TokenStream Create(TokenStream input)
    {
        return new TypeAsSynonymFilter(input, prefix);
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
54 changes: 54 additions & 0 deletions
54
...Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestTypeAsSynonymFilterFactory.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
using Lucene.Net.Analysis.Util; | ||
using NUnit.Framework; | ||
|
||
namespace Lucene.Net.Analysis.Miscellaneous | ||
{ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/// <summary>
/// Tests that the "TypeAsSynonym" token filter factory produces a filter which
/// emits each token's type as a zero-increment synonym, with and without a prefix.
/// </summary>
public class TestTypeAsSynonymFilterFactory : BaseTokenStreamFactoryTestCase
{
    // Shared input: one <ALPHANUM> token followed by one <URL> token.
    private static readonly Token[] TOKENS = { token("Visit", "<ALPHANUM>"), token("example.com", "<URL>") };

    /// <summary>
    /// Without a prefix, the synonym text is the bare token type.
    /// </summary>
    [Test]
    public void TestBasic()
    {
        string[] expectedTerms = new string[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" };
        string[] expectedTypes = new string[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" };
        int[] expectedPosIncrements = new int[] { 1, 0, 1, 0 };

        TokenStream source = new CannedTokenStream(TOKENS);
        TokenStream filtered = TokenFilterFactory("TypeAsSynonym").Create(source);

        AssertTokenStreamContents(filtered, expectedTerms, null, null, expectedTypes, expectedPosIncrements);
    }

    /// <summary>
    /// With a prefix configured, the synonym text is the prefix followed by the token type.
    /// </summary>
    [Test]
    public void TestPrefix()
    {
        string[] expectedTerms = new string[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" };
        string[] expectedTypes = new string[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" };
        int[] expectedPosIncrements = new int[] { 1, 0, 1, 0 };

        TokenStream source = new CannedTokenStream(TOKENS);
        TokenStream filtered = TokenFilterFactory("TypeAsSynonym", "prefix", "_type_").Create(source);

        AssertTokenStreamContents(filtered, expectedTerms, null, null, expectedTypes, expectedPosIncrements);
    }

    /// <summary>
    /// Builds a <see cref="Token"/> with the given term text and type.
    /// </summary>
    private static Token token(string term, string type)
    {
        Token result = new Token { Type = type };
        result.SetEmpty();
        result.Append(term);
        return result;
    }
}
} |