diff --git a/README.md b/README.md index 23fd639..4a4bdf2 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -[![Build Status](https://travis-ci.com/haeungun/index4j.svg?branch=master)](https://travis-ci.com/haeungun/index4j) -[![codecov](https://codecov.io/gh/haeungun/index4j/branch/master/graph/badge.svg)](https://codecov.io/gh/haeungun/index4j) +[![Build Status](https://travis-ci.com/haeungun/index4j.svg?branch=master)](https://travis-ci.com/haeungun/indexer4j) +[![codecov](https://codecov.io/gh/haeungun/index4j/branch/master/graph/badge.svg)](https://codecov.io/gh/haeungun/indexer4j) -# index4j +# indexer4j Simple full text indexing and searching library for Java ## Install @@ -9,7 +9,7 @@ ``` gradle repositories { maven { - url "https://dl.bintray.com/haeungun/index4j" + url "https://dl.bintray.com/haeungun/indexer4j" } } ``` @@ -22,6 +22,7 @@ repositories { - Support ngram, wordgram - Parrallel build and search - Support JDK 11 CI on travis CI (Jacoco not supports yet) +- Improve saving and loading features ## Examples ```java diff --git a/build.gradle b/build.gradle index c5c9f62..2699955 100644 --- a/build.gradle +++ b/build.gradle @@ -1,4 +1,4 @@ -group 'index4j' +group 'indexer4j' version '0.1.0-SNAPSHOT' apply plugin: 'java' @@ -25,15 +25,15 @@ dependencies { } publish { - repoName = 'index4j' + repoName = 'indexer4j' userOrg = 'haeungun' - groupId = 'com.haeungun.index4j' - artifactId = 'index4j' + groupId = 'com.haeungun.indexer4j' + artifactId = 'indexer4j' publishVersion = '0.1.0' desc = 'Simple full text indexing and searching library for Java' - website = 'https://github.com/haeungun/index4j' - issueTracker = 'https://github.com/haeungun/index4j/issues' - repository = 'https://github.com/haeungun/index4j.git' + website = 'https://github.com/haeungun/indexer4j' + issueTracker = 'https://github.com/haeungun/indexer4j/issues' + repository = 'https://github.com/haeungun/indexer4j.git' } jacocoTestReport { diff --git a/settings.gradle b/settings.gradle index f36a625..dec7bb5 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1,2 +1,2 @@ -rootProject.name = 'index4j' +rootProject.name = 'indexer4j' diff --git a/src/main/java/com/haeungun/index4j/DocumentMeta.java b/src/main/java/com/haeungun/index4j/DocumentMeta.java deleted file mode 100644 index bad16f1..0000000 --- a/src/main/java/com/haeungun/index4j/DocumentMeta.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.haeungun.index4j; - -import java.util.List; - -public class DocumentMeta { - - private final String docId; - private final List tokenizedWords; - - public DocumentMeta(String docId, List tokenizedWords) { - this.docId = docId; - this.tokenizedWords = tokenizedWords; - } - - public String getDocId() { - return this.docId; - } - - public List getTokenizedWords() { - return this.tokenizedWords; - } - - @Override - public String toString() { - return "DocumentMeta{docId=" + this.docId - + ", tokenizedWords=" + this.tokenizedWords.toString() + "}"; - } -} diff --git a/src/main/java/com/haeungun/index4j/annotation/Document.java b/src/main/java/com/haeungun/index4j/annotation/Document.java deleted file mode 100644 index 9d58f85..0000000 --- a/src/main/java/com/haeungun/index4j/annotation/Document.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.haeungun.index4j.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Target(ElementType.TYPE) -@Retention(RetentionPolicy.RUNTIME) -public @interface Document { -} diff --git a/src/main/java/com/haeungun/index4j/annotation/DocumentField.java b/src/main/java/com/haeungun/index4j/annotation/DocumentField.java deleted file mode 100644 index 7f9e66e..0000000 --- a/src/main/java/com/haeungun/index4j/annotation/DocumentField.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.haeungun.index4j.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Target(ElementType.FIELD) -@Retention(RetentionPolicy.RUNTIME) -public @interface DocumentField { -} diff --git a/src/main/java/com/haeungun/index4j/annotation/DocumentId.java b/src/main/java/com/haeungun/index4j/annotation/DocumentId.java deleted file mode 100644 index a377ed0..0000000 --- a/src/main/java/com/haeungun/index4j/annotation/DocumentId.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.haeungun.index4j.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Target(ElementType.FIELD) -@Retention(RetentionPolicy.RUNTIME) -public @interface DocumentId { -} diff --git a/src/main/java/com/haeungun/index4j/core/relevance/Relevance.java b/src/main/java/com/haeungun/index4j/core/relevance/Relevance.java deleted file mode 100644 index 90fb2ee..0000000 --- a/src/main/java/com/haeungun/index4j/core/relevance/Relevance.java +++ /dev/null @@ -1,6 +0,0 @@ -package com.haeungun.index4j.core.relevance; - -public enum Relevance { - TFIDF, - BM25 -} diff --git a/src/main/java/com/haeungun/index4j/core/relevance/RelevanceFactory.java b/src/main/java/com/haeungun/index4j/core/relevance/RelevanceFactory.java deleted file mode 100644 index e1e1bdd..0000000 --- a/src/main/java/com/haeungun/index4j/core/relevance/RelevanceFactory.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.haeungun.index4j.core.relevance; - -import com.haeungun.index4j.exceptions.UnsupportedRelevanceException; - -public class RelevanceFactory { - - public static RelevanceRanker createRanker(Relevance relevance) throws UnsupportedRelevanceException { - switch (relevance) { - case TFIDF: - return new TFIDFRanker(); - case BM25: - return new BM25Ranker(); - default: - throw new UnsupportedRelevanceException(relevance.name()); - } - } -} diff --git a/src/main/java/com/haeungun/index4j/core/relevance/RelevanceRanker.java b/src/main/java/com/haeungun/index4j/core/relevance/RelevanceRanker.java deleted file mode 100644 index f6e58da..0000000 --- a/src/main/java/com/haeungun/index4j/core/relevance/RelevanceRanker.java +++ /dev/null @@ -1,9 +0,0 @@ -package com.haeungun.index4j.core.relevance; - -import java.util.List; - -public interface RelevanceRanker { - - double rank(String term, List doc, List> docs); - -} diff --git a/src/main/java/com/haeungun/index4j/core/tokenizer/RegexTokenizer.java b/src/main/java/com/haeungun/index4j/core/tokenizer/RegexTokenizer.java deleted file mode 100644 index a81becd..0000000 --- a/src/main/java/com/haeungun/index4j/core/tokenizer/RegexTokenizer.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.haeungun.index4j.core.tokenizer; - -import java.util.Arrays; -import java.util.List; - -public class RegexTokenizer implements Tokenizer { - - private final String regex; - - public RegexTokenizer(String regex) { - this.regex = regex; - } - - @Override - public List tokenizing(String str) { - return Arrays.asList(str.split(regex)); - } - -} diff --git a/src/main/java/com/haeungun/index4j/core/tokenizer/Tokenizer.java b/src/main/java/com/haeungun/index4j/core/tokenizer/Tokenizer.java deleted file mode 100644 index 79c04c2..0000000 --- a/src/main/java/com/haeungun/index4j/core/tokenizer/Tokenizer.java +++ /dev/null @@ -1,13 +0,0 @@ -package com.haeungun.index4j.core.tokenizer; - -import java.util.List; - -public interface Tokenizer { - - /** - * Parses the document and returns it as a term list - * @param str to tokenized - * @return split term list - */ - List tokenizing(String str); -} diff --git a/src/main/java/com/haeungun/index4j/exceptions/DuplicatedDocumentException.java b/src/main/java/com/haeungun/index4j/exceptions/DuplicatedDocumentException.java deleted file mode 100644 index 38ae5be..0000000 --- a/src/main/java/com/haeungun/index4j/exceptions/DuplicatedDocumentException.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.haeungun.index4j.exceptions; - -public class DuplicatedDocumentException extends RuntimeException { - - private static final String errorMessage = "Duplicated document key {docId=%s}"; - - public DuplicatedDocumentException(String docId) { - super(String.format(errorMessage, docId)); - } -} diff --git a/src/main/java/com/haeungun/index4j/exceptions/UndefinedDocumentIdException.java b/src/main/java/com/haeungun/index4j/exceptions/UndefinedDocumentIdException.java deleted file mode 100644 index e3413a8..0000000 --- a/src/main/java/com/haeungun/index4j/exceptions/UndefinedDocumentIdException.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.haeungun.index4j.exceptions; - -public class UndefinedDocumentIdException extends Exception { - - private static final String errMsg = "Document must have a documentID field defined"; - - public UndefinedDocumentIdException() { - super(errMsg); - } -} diff --git a/src/main/java/com/haeungun/index4j/exceptions/UnsupportedRelevanceException.java b/src/main/java/com/haeungun/index4j/exceptions/UnsupportedRelevanceException.java deleted file mode 100644 index cbd5c80..0000000 --- a/src/main/java/com/haeungun/index4j/exceptions/UnsupportedRelevanceException.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.haeungun.index4j.exceptions; - -public class UnsupportedRelevanceException extends Exception { - - private static final String errMsg = "Unsupported relevance [%s]"; - - public UnsupportedRelevanceException(String input) { - super(String.format(errMsg, input)); - } - -} diff --git a/src/main/java/com/haeungun/indexer4j/DocumentMeta.java b/src/main/java/com/haeungun/indexer4j/DocumentMeta.java new file mode 100644 index 0000000..8514c9e --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/DocumentMeta.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j; + +import java.util.List; + +public class DocumentMeta { + + private final String docId; + private final List tokenizedWords; + + public DocumentMeta(String docId, List tokenizedWords) { + this.docId = docId; + this.tokenizedWords = tokenizedWords; + } + + public String getDocId() { + return this.docId; + } + + public List getTokenizedWords() { + return this.tokenizedWords; + } + + @Override + public String toString() { + return "DocumentMeta{docId=" + this.docId + + ", tokenizedWords=" + this.tokenizedWords.toString() + "}"; + } +} diff --git a/src/main/java/com/haeungun/index4j/Indexer.java b/src/main/java/com/haeungun/indexer4j/Indexer.java similarity index 57% rename from src/main/java/com/haeungun/index4j/Indexer.java rename to src/main/java/com/haeungun/indexer4j/Indexer.java index 09ff2e6..38d57b1 100644 --- a/src/main/java/com/haeungun/index4j/Indexer.java +++ b/src/main/java/com/haeungun/indexer4j/Indexer.java @@ -1,14 +1,28 @@ -package com.haeungun.index4j; - -import com.haeungun.index4j.core.DocumentExtractor; -import com.haeungun.index4j.core.tokenizer.Tokenizer; -import com.haeungun.index4j.core.tokenizer.RegexTokenizer; -import com.haeungun.index4j.exceptions.UndefinedDocumentIdException; -import com.haeungun.index4j.exceptions.UnsupportedRelevanceException; -import com.haeungun.index4j.core.relevance.Relevance; -import com.haeungun.index4j.core.relevance.RelevanceFactory; -import com.haeungun.index4j.core.relevance.RelevanceRanker; -import com.haeungun.index4j.utils.Serializer; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j; + +import com.haeungun.indexer4j.core.DocumentExtractor; +import com.haeungun.indexer4j.core.relevance.BM25Ranker; +import com.haeungun.indexer4j.core.tokenizer.Tokenizer; +import com.haeungun.indexer4j.core.tokenizer.RegexTokenizer; +import com.haeungun.indexer4j.exceptions.UndefinedDocumentIdException; +import com.haeungun.indexer4j.core.relevance.RelevanceRanker; +import com.haeungun.indexer4j.utils.Serializer; import java.util.*; import java.util.stream.Collectors; @@ -23,23 +37,39 @@ public class Indexer { private RelevanceRanker ranker; private Serializer>> serializer; - public Indexer() throws UnsupportedRelevanceException { - this(Relevance.BM25, new RegexTokenizer("\\W+")); + /** + * Constructor to create a Indexer object + * by using a default options(BM25, RegexTokenizer) + */ + public Indexer() { + this(new BM25Ranker(), new RegexTokenizer("\\W+")); } - public Indexer(Relevance relevance) throws UnsupportedRelevanceException { - this(relevance, new RegexTokenizer("\\W+")); + /** + * Constructor to create a Indexer object + * @param ranker for ranking algorithm(BM25, TFIDF, etc..) + */ + public Indexer(RelevanceRanker ranker) { + this(ranker, new RegexTokenizer("\\W+")); } - public Indexer(Relevance relevance, Tokenizer tokenizer) throws UnsupportedRelevanceException { + /** + * Constructor to create a Indexer object + * @param ranker for ranking algorithm(BM25, TFIDF, etc..) + * @param tokenizer for term and document + */ + public Indexer(RelevanceRanker ranker, Tokenizer tokenizer) { this.documents = new HashMap<>(); this.serializer = new Serializer<>(); - this.ranker = RelevanceFactory.createRanker(relevance); + this.ranker = ranker; this.extractor = new DocumentExtractor<>(tokenizer); this.tokenizer = tokenizer; } + /** + * Building a inverted index + */ public void build() { this.index = new HashMap<>(); @@ -62,11 +92,24 @@ public void build() { } } + /** + * Adding a doucment. + * @param doc to add + * @return true if success to add + * @throws UndefinedDocumentIdException when the DocumentId is not designated + */ public boolean add(T doc) throws UndefinedDocumentIdException { boolean allowOverwrite = true; return this.add(doc, allowOverwrite); } + /** + * Adding a doucment. + * @param doc doc to add + * @param allowOverwrite true if you want to allow overwrite document for same document id + * @return true if success to add + * @throws UndefinedDocumentIdException when the DocumentId is not designated + */ public boolean add(T doc, boolean allowOverwrite) throws UndefinedDocumentIdException { if (!this.extractor.isDocument(doc)) return false; @@ -84,6 +127,11 @@ public boolean add(T doc, boolean allowOverwrite) throws UndefinedDocumentIdExce return true; } + /** + * Saving a index into local disk. + * @param fileName to save a index into the local disk + * @return true if success to save + */ public boolean save(String fileName) { assert this.index != null; try { @@ -95,6 +143,11 @@ public boolean save(String fileName) { return true; } + /** + * Loading a index from the designate file path. + * @param fileName to load a index from the local disk + * @return true if success to load + */ public boolean load(String fileName) { try { this.index = this.serializer.deserializing(fileName); @@ -106,6 +159,12 @@ public boolean load(String fileName) { return true; } + /** + * + * Retrieve search results from a inverted index. + * @param query to search documents + * @return the list of SearchResult + */ public List search(String query) { List queries = this.tokenizer.tokenizing(query); Map results = new HashMap<>(); // {docId, score} diff --git a/src/main/java/com/haeungun/index4j/SearchResult.java b/src/main/java/com/haeungun/indexer4j/SearchResult.java similarity index 52% rename from src/main/java/com/haeungun/index4j/SearchResult.java rename to src/main/java/com/haeungun/indexer4j/SearchResult.java index 397f122..e1b9a16 100644 --- a/src/main/java/com/haeungun/index4j/SearchResult.java +++ b/src/main/java/com/haeungun/indexer4j/SearchResult.java @@ -1,4 +1,20 @@ -package com.haeungun.index4j; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j; public class SearchResult implements Comparable { diff --git a/src/main/java/com/haeungun/indexer4j/annotation/Document.java b/src/main/java/com/haeungun/indexer4j/annotation/Document.java new file mode 100644 index 0000000..976067f --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/annotation/Document.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Target(ElementType.TYPE) +@Retention(RetentionPolicy.RUNTIME) +public @interface Document { +} diff --git a/src/main/java/com/haeungun/indexer4j/annotation/DocumentField.java b/src/main/java/com/haeungun/indexer4j/annotation/DocumentField.java new file mode 100644 index 0000000..42a7dd3 --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/annotation/DocumentField.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Target(ElementType.FIELD) +@Retention(RetentionPolicy.RUNTIME) +public @interface DocumentField { +} diff --git a/src/main/java/com/haeungun/indexer4j/annotation/DocumentId.java b/src/main/java/com/haeungun/indexer4j/annotation/DocumentId.java new file mode 100644 index 0000000..deffab0 --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/annotation/DocumentId.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Target(ElementType.FIELD) +@Retention(RetentionPolicy.RUNTIME) +public @interface DocumentId { +} diff --git a/src/main/java/com/haeungun/index4j/core/DocumentExtractor.java b/src/main/java/com/haeungun/indexer4j/core/DocumentExtractor.java similarity index 69% rename from src/main/java/com/haeungun/index4j/core/DocumentExtractor.java rename to src/main/java/com/haeungun/indexer4j/core/DocumentExtractor.java index ebb3e77..09eea0e 100644 --- a/src/main/java/com/haeungun/index4j/core/DocumentExtractor.java +++ b/src/main/java/com/haeungun/indexer4j/core/DocumentExtractor.java @@ -1,9 +1,25 @@ -package com.haeungun.index4j.core; - -import com.haeungun.index4j.DocumentMeta; -import com.haeungun.index4j.annotation.*; -import com.haeungun.index4j.core.tokenizer.Tokenizer; -import com.haeungun.index4j.exceptions.UndefinedDocumentIdException; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core; + +import com.haeungun.indexer4j.DocumentMeta; +import com.haeungun.indexer4j.annotation.*; +import com.haeungun.indexer4j.core.tokenizer.Tokenizer; +import com.haeungun.indexer4j.exceptions.UndefinedDocumentIdException; import java.lang.reflect.Field; import java.util.ArrayList; diff --git a/src/main/java/com/haeungun/index4j/core/relevance/BM25Ranker.java b/src/main/java/com/haeungun/indexer4j/core/relevance/BM25Ranker.java similarity index 78% rename from src/main/java/com/haeungun/index4j/core/relevance/BM25Ranker.java rename to src/main/java/com/haeungun/indexer4j/core/relevance/BM25Ranker.java index 8c72ae8..0289280 100644 --- a/src/main/java/com/haeungun/index4j/core/relevance/BM25Ranker.java +++ b/src/main/java/com/haeungun/indexer4j/core/relevance/BM25Ranker.java @@ -1,4 +1,20 @@ -package com.haeungun.index4j.core.relevance; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core.relevance; import java.util.List; diff --git a/src/main/java/com/haeungun/indexer4j/core/relevance/RelevanceRanker.java b/src/main/java/com/haeungun/indexer4j/core/relevance/RelevanceRanker.java new file mode 100644 index 0000000..a4c1314 --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/core/relevance/RelevanceRanker.java @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core.relevance; + +import java.util.List; + +public interface RelevanceRanker { + + /** + * Calculate a ranking score for given term and documents + * @param term given term + * @param doc given document + * @param docs all documents term list + * @return + */ + double rank(String term, List doc, List> docs); + +} diff --git a/src/main/java/com/haeungun/index4j/core/relevance/TFIDFRanker.java b/src/main/java/com/haeungun/indexer4j/core/relevance/TFIDFRanker.java similarity index 58% rename from src/main/java/com/haeungun/index4j/core/relevance/TFIDFRanker.java rename to src/main/java/com/haeungun/indexer4j/core/relevance/TFIDFRanker.java index c73ee41..e3cb09f 100644 --- a/src/main/java/com/haeungun/index4j/core/relevance/TFIDFRanker.java +++ b/src/main/java/com/haeungun/indexer4j/core/relevance/TFIDFRanker.java @@ -1,4 +1,20 @@ -package com.haeungun.index4j.core.relevance; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core.relevance; import java.util.List; diff --git a/src/main/java/com/haeungun/indexer4j/core/tokenizer/RegexTokenizer.java b/src/main/java/com/haeungun/indexer4j/core/tokenizer/RegexTokenizer.java new file mode 100644 index 0000000..95444a4 --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/core/tokenizer/RegexTokenizer.java @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core.tokenizer; + +import java.util.Arrays; +import java.util.List; + +public class RegexTokenizer implements Tokenizer { + + private final String regex; + + public RegexTokenizer(String regex) { + this.regex = regex; + } + + @Override + public List tokenizing(String str) { + return Arrays.asList(str.split(regex)); + } + +} diff --git a/src/main/java/com/haeungun/indexer4j/core/tokenizer/Tokenizer.java b/src/main/java/com/haeungun/indexer4j/core/tokenizer/Tokenizer.java new file mode 100644 index 0000000..e69bd7b --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/core/tokenizer/Tokenizer.java @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core.tokenizer; + +import java.util.List; + +public interface Tokenizer { + + /** + * Parses the document and returns it as a term list + * @param str to tokenized + * @return split term list + */ + List tokenizing(String str); +} diff --git a/src/main/java/com/haeungun/indexer4j/exceptions/DuplicatedDocumentException.java b/src/main/java/com/haeungun/indexer4j/exceptions/DuplicatedDocumentException.java new file mode 100644 index 0000000..ad9a876 --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/exceptions/DuplicatedDocumentException.java @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.exceptions; + +public class DuplicatedDocumentException extends RuntimeException { + + private static final String errorMessage = "Duplicated document key {docId=%s}"; + + public DuplicatedDocumentException(String docId) { + super(String.format(errorMessage, docId)); + } +} diff --git a/src/main/java/com/haeungun/indexer4j/exceptions/UndefinedDocumentIdException.java b/src/main/java/com/haeungun/indexer4j/exceptions/UndefinedDocumentIdException.java new file mode 100644 index 0000000..fee21bd --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/exceptions/UndefinedDocumentIdException.java @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.exceptions; + +public class UndefinedDocumentIdException extends Exception { + + private static final String errMsg = "Document must have a documentID field defined"; + + public UndefinedDocumentIdException() { + super(errMsg); + } +} diff --git a/src/main/java/com/haeungun/indexer4j/exceptions/UnsupportedRelevanceException.java b/src/main/java/com/haeungun/indexer4j/exceptions/UnsupportedRelevanceException.java new file mode 100644 index 0000000..fc164e6 --- /dev/null +++ b/src/main/java/com/haeungun/indexer4j/exceptions/UnsupportedRelevanceException.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.exceptions; + +public class UnsupportedRelevanceException extends Exception { + + private static final String errMsg = "Unsupported relevance [%s]"; + + public UnsupportedRelevanceException(String input) { + super(String.format(errMsg, input)); + } + +} diff --git a/src/main/java/com/haeungun/index4j/utils/Serializer.java b/src/main/java/com/haeungun/indexer4j/utils/Serializer.java similarity index 95% rename from src/main/java/com/haeungun/index4j/utils/Serializer.java rename to src/main/java/com/haeungun/indexer4j/utils/Serializer.java index 859ada2..b0c92b6 100644 --- a/src/main/java/com/haeungun/index4j/utils/Serializer.java +++ b/src/main/java/com/haeungun/indexer4j/utils/Serializer.java @@ -1,4 +1,4 @@ -package com.haeungun.index4j.utils; +package com.haeungun.indexer4j.utils; import java.io.*; diff --git a/src/test/java/com/haeungun/index4j/DocumentScoreTest.java b/src/test/java/com/haeungun/index4j/DocumentScoreTest.java deleted file mode 100644 index 69eb056..0000000 --- a/src/test/java/com/haeungun/index4j/DocumentScoreTest.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.haeungun.index4j; - -import org.junit.Test; - -import static org.junit.Assert.*; - -public class DocumentScoreTest { - - @Test - public void compareTo_test() { - SearchResult score = new SearchResult("doc1", 2.0); - SearchResult score2 = new SearchResult("doc2", 1.0); - - assertEquals(1, score.compareTo(score2)); - } - - @Test - public void toString_test() { - SearchResult score = new SearchResult("doc1", 2.0); - - assertEquals("DocumentScore{docId=doc1, score=2.0}", score.toString()); - } -} \ No newline at end of file diff --git a/src/test/java/com/haeungun/index4j/core/relevance/RelevanceFactoryTest.java b/src/test/java/com/haeungun/index4j/core/relevance/RelevanceFactoryTest.java deleted file mode 100644 index 631c81c..0000000 --- a/src/test/java/com/haeungun/index4j/core/relevance/RelevanceFactoryTest.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.haeungun.index4j.core.relevance; - -import static org.junit.Assert.*; - -import com.haeungun.index4j.exceptions.UnsupportedRelevanceException; -import org.junit.Test; - -public class RelevanceFactoryTest { - - @Test - public void createTFIDFRanker_test() throws UnsupportedRelevanceException { - RelevanceRanker ranker = RelevanceFactory.createRanker(Relevance.TFIDF); - assertTrue(ranker instanceof TFIDFRanker); - } - - @Test - public void testBM25Ranker_test() throws UnsupportedRelevanceException { - RelevanceRanker ranker = RelevanceFactory.createRanker(Relevance.BM25); - assertTrue(ranker instanceof BM25Ranker); - } - -} diff --git a/src/test/java/com/haeungun/index4j/example/ExampleDocument.java b/src/test/java/com/haeungun/index4j/example/ExampleDocument.java deleted file mode 100644 index 50ceca5..0000000 --- a/src/test/java/com/haeungun/index4j/example/ExampleDocument.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.haeungun.index4j.example; - -import com.haeungun.index4j.annotation.*; - -@Document -public class ExampleDocument { - - @DocumentId - private String id; - - @DocumentField - private String title; - - @DocumentField - private String contents; - - public ExampleDocument(String id, String title, String contents) { - this.id = id; - this.title = title; - this.contents = contents; - } - - public String getId() { - return id; - } - - public String getTitle() { - return title; - } - - public String getContents() { - return contents; - } -} diff --git a/src/test/java/com/haeungun/index4j/example/WrongDocument.java b/src/test/java/com/haeungun/index4j/example/WrongDocument.java deleted file mode 100644 index fa84b22..0000000 --- a/src/test/java/com/haeungun/index4j/example/WrongDocument.java +++ /dev/null @@ -1,4 +0,0 @@ -package com.haeungun.index4j.example; - -public class WrongDocument { -} diff --git a/src/test/java/com/haeungun/indexer4j/DocumentScoreTest.java b/src/test/java/com/haeungun/indexer4j/DocumentScoreTest.java new file mode 100644 index 0000000..dcc05e0 --- /dev/null +++ b/src/test/java/com/haeungun/indexer4j/DocumentScoreTest.java @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j; + +import org.junit.Test; + +import static org.junit.Assert.*; + +public class DocumentScoreTest { + + @Test + public void compareTo_test() { + SearchResult score = new SearchResult("doc1", 2.0); + SearchResult score2 = new SearchResult("doc2", 1.0); + + assertEquals(1, score.compareTo(score2)); + } + + @Test + public void toString_test() { + SearchResult score = new SearchResult("doc1", 2.0); + + assertEquals("DocumentScore{docId=doc1, score=2.0}", score.toString()); + } +} \ No newline at end of file diff --git a/src/test/java/com/haeungun/index4j/IndexerTest.java b/src/test/java/com/haeungun/indexer4j/IndexerTest.java similarity index 74% rename from src/test/java/com/haeungun/index4j/IndexerTest.java rename to src/test/java/com/haeungun/indexer4j/IndexerTest.java index 845ff13..abf1864 100644 --- a/src/test/java/com/haeungun/index4j/IndexerTest.java +++ b/src/test/java/com/haeungun/indexer4j/IndexerTest.java @@ -1,10 +1,26 @@ -package com.haeungun.index4j; - -import com.haeungun.index4j.core.relevance.Relevance; -import com.haeungun.index4j.example.ExampleDocument; -import com.haeungun.index4j.example.WrongDocument; -import com.haeungun.index4j.exceptions.UndefinedDocumentIdException; -import com.haeungun.index4j.exceptions.UnsupportedRelevanceException; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j; + +import com.haeungun.indexer4j.core.relevance.TFIDFRanker; +import com.haeungun.indexer4j.example.ExampleDocument; +import com.haeungun.indexer4j.example.WrongDocument; +import com.haeungun.indexer4j.exceptions.UndefinedDocumentIdException; +import com.haeungun.indexer4j.exceptions.UnsupportedRelevanceException; import org.junit.After; import org.junit.Test; @@ -26,13 +42,13 @@ public class IndexerTest { ); @Test - public void addDocument_test() throws UndefinedDocumentIdException, UnsupportedRelevanceException { + public void addDocument_test() throws UndefinedDocumentIdException { Indexer index = new Indexer<>(); assertFalse(index.add(new WrongDocument())); } @Test - public void BM25_test() throws UndefinedDocumentIdException, UnsupportedRelevanceException { + public void BM25_test() throws UndefinedDocumentIdException { Indexer index = new Indexer<>(); for (ExampleDocument document : this.documents) { index.add(document); @@ -58,7 +74,7 @@ public void BM25_test() throws UndefinedDocumentIdException, UnsupportedRelevanc @Test public void TFIDF_test() throws UnsupportedRelevanceException, UndefinedDocumentIdException { - Indexer index = new Indexer<>(Relevance.TFIDF); + Indexer index = new Indexer<>(new TFIDFRanker()); for (ExampleDocument document : this.documents) { index.add(document); } @@ -82,7 +98,7 @@ public void TFIDF_test() throws UnsupportedRelevanceException, UndefinedDocument } @Test - public void save_test() throws UnsupportedRelevanceException, UndefinedDocumentIdException { + public void save_test() throws UndefinedDocumentIdException { Indexer index = new Indexer<>(); for (ExampleDocument document : this.documents) { index.add(document); diff --git a/src/test/java/com/haeungun/index4j/core/DocumentExtractorTest.java b/src/test/java/com/haeungun/indexer4j/core/DocumentExtractorTest.java similarity index 64% rename from src/test/java/com/haeungun/index4j/core/DocumentExtractorTest.java rename to src/test/java/com/haeungun/indexer4j/core/DocumentExtractorTest.java index e7ebcc4..44d3abb 100644 --- a/src/test/java/com/haeungun/index4j/core/DocumentExtractorTest.java +++ b/src/test/java/com/haeungun/indexer4j/core/DocumentExtractorTest.java @@ -1,12 +1,28 @@ -package com.haeungun.index4j.core; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.core; import static org.junit.Assert.*; -import com.haeungun.index4j.DocumentMeta; -import com.haeungun.index4j.core.tokenizer.RegexTokenizer; -import com.haeungun.index4j.core.tokenizer.Tokenizer; -import com.haeungun.index4j.example.ExampleDocument; -import com.haeungun.index4j.example.WrongDocument; +import com.haeungun.indexer4j.DocumentMeta; +import com.haeungun.indexer4j.core.tokenizer.RegexTokenizer; +import com.haeungun.indexer4j.core.tokenizer.Tokenizer; +import com.haeungun.indexer4j.example.ExampleDocument; +import com.haeungun.indexer4j.example.WrongDocument; import org.junit.Before; import org.junit.Test; diff --git a/src/test/java/com/haeungun/indexer4j/example/ExampleDocument.java b/src/test/java/com/haeungun/indexer4j/example/ExampleDocument.java new file mode 100644 index 0000000..0a8f99a --- /dev/null +++ b/src/test/java/com/haeungun/indexer4j/example/ExampleDocument.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.example; + +import com.haeungun.indexer4j.annotation.*; + +@Document +public class ExampleDocument { + + @DocumentId + private String id; + + @DocumentField + private String title; + + @DocumentField + private String contents; + + public ExampleDocument(String id, String title, String contents) { + this.id = id; + this.title = title; + this.contents = contents; + } + + public String getId() { + return id; + } + + public String getTitle() { + return title; + } + + public String getContents() { + return contents; + } +} diff --git a/src/test/java/com/haeungun/indexer4j/example/WrongDocument.java b/src/test/java/com/haeungun/indexer4j/example/WrongDocument.java new file mode 100644 index 0000000..b030b75 --- /dev/null +++ b/src/test/java/com/haeungun/indexer4j/example/WrongDocument.java @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.example; + +public class WrongDocument { +} diff --git a/src/test/java/com/haeungun/index4j/utils/SerializerTest.java b/src/test/java/com/haeungun/indexer4j/utils/SerializerTest.java similarity index 51% rename from src/test/java/com/haeungun/index4j/utils/SerializerTest.java rename to src/test/java/com/haeungun/indexer4j/utils/SerializerTest.java index 1ca61e3..496f5ad 100644 --- a/src/test/java/com/haeungun/index4j/utils/SerializerTest.java +++ b/src/test/java/com/haeungun/indexer4j/utils/SerializerTest.java @@ -1,4 +1,20 @@ -package com.haeungun.index4j.utils; +/* + * Copyright (C) 2019 The Indexer4j Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.haeungun.indexer4j.utils; import org.junit.After; import org.junit.Test;