-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext-retrieval.py
40 lines (30 loc) · 1004 Bytes
/
text-retrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import argparse
import os
import nltk
from algorithms.document_ranker import Ranker
def build_parser():
    """Build the command-line parser for this retrieval script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts ``--query`` (aliases
        ``--Q`` and ``-Q``) and ``--algorithm`` (alias ``-A``).
    """
    parser = argparse.ArgumentParser()
    # The query is tokenized unconditionally below, so a missing query is
    # rejected here with a usage message instead of failing later on None.
    # "--Q" is kept for backward compatibility; "-Q" mirrors the "-A" flag.
    parser.add_argument(
        "--query", "--Q", "-Q",
        help="Query to submit to the \"database\"",
        required=True,
    )
    parser.add_argument(
        "--algorithm", "-A",
        help="Search algorithm to use. Options: bm25, jm, dirichlet, pln, default",
        required=False,
    )
    return parser


def main(argv=None):
    """Rank the NLTK ``webtext`` documents against a query and print the result.

    Args:
        argv: Argument strings to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    data = nltk.corpus.webtext

    # tokenize the query
    query_tokens = nltk.tokenize.word_tokenize(args.query)
    print("Tokenized query:")
    print(query_tokens)

    # read list of files
    documents = data.fileids()
    # sanity check, list file IDs
    print("found files")
    print(documents)

    # One word sequence per document, in the order returned by fileids().
    corpus = []
    for document in documents:
        word_list = data.words(document)
        print(type(word_list))
        print("words in doc:")
        print(word_list)
        corpus.append(word_list)

    engine = Ranker(corpus, alpha=0.25, b=0.75, k=1.2, mu=0.75)
    # args.algorithm is None when the flag is omitted; it is passed through
    # unchanged so Ranker.search decides how to handle that.
    result = engine.search(query_tokens, algorithm=args.algorithm)
    print(result)


if __name__ == "__main__":
    main()