class PorterStemmer:
    """Porter (1980) suffix-stripping stemmer, ported from the author's ANSI C code.

    The word being stemmed lives in ``self.b`` between offsets ``self.k0`` and
    ``self.k`` (both inclusive).  ``self.k`` moves left as suffixes are
    stripped and ``self.j`` is a scratch offset set by :meth:`ends`.

    Only lower-case sequences are stemmed; callers must lower-case the word
    before calling :meth:`stem`.  Places that deliberately differ from the
    published algorithm are marked ``--DEPARTURE--``.
    """

    # step2: penultimate letter -> ordered (suffix, replacement) rules.
    # The first suffix that matches wins, exactly like an if/elif chain.
    _STEP2_RULES = {
        'a': (("ational", "ate"), ("tional", "tion")),
        'c': (("enci", "ence"), ("anci", "ance")),
        'e': (("izer", "ize"),),
        # --DEPARTURE-- the published algorithm uses ("abli", "able") here.
        'l': (("bli", "ble"), ("alli", "al"), ("entli", "ent"),
              ("eli", "e"), ("ousli", "ous")),
        'o': (("ization", "ize"), ("ation", "ate"), ("ator", "ate")),
        's': (("alism", "al"), ("iveness", "ive"), ("fulness", "ful"),
              ("ousness", "ous")),
        't': (("aliti", "al"), ("iviti", "ive"), ("biliti", "ble")),
        # --DEPARTURE-- the published algorithm has no "logi" rule.
        'g': (("logi", "log"),),
    }

    # step3: last letter -> ordered (suffix, replacement) rules.
    _STEP3_RULES = {
        'e': (("icate", "ic"), ("ative", ""), ("alize", "al")),
        'i': (("iciti", "ic"),),
        'l': (("ical", "ic"), ("ful", "")),
        's': (("ness", ""),),
    }

    # step4: penultimate letter -> ordered suffixes that are simply removed.
    # The 'o' case ("ion" after s/t, or "ou") is handled inline in step4().
    _STEP4_SUFFIXES = {
        'a': ("al",),
        'c': ("ance", "ence"),
        'e': ("er",),
        'i': ("ic",),
        'l': ("able", "ible"),
        'n': ("ant", "ement", "ment", "ent"),
        's': ("ism",),
        't': ("ate", "iti"),
        'u': ("ous",),
        'v': ("ive",),
        'z': ("ize",),
    }

    def __init__(self):
        """Create a stemmer with an empty working buffer."""
        self.b = ""   # buffer for word to be stemmed
        self.k = 0    # offset of the last character of the current stem
        self.k0 = 0   # offset of the first character of the word
        self.j = 0    # general offset into the string, set by ends()

    def cons(self, i):
        """Return True if b[i] is a consonant.

        'y' counts as a consonant at the start of the word or when it
        follows a vowel.
        """
        ch = self.b[i]
        if ch in 'aeiou':
            return False
        if ch == 'y':
            if i == self.k0:
                return True
            return not self.cons(i - 1)
        return True

    def m(self):
        """Measure the number of consonant sequences between k0 and j.

        If c is a consonant sequence and v a vowel sequence, and <..>
        indicates arbitrary presence::

            <c><v>       gives 0
            <c>vc<v>     gives 1
            <c>vcvc<v>   gives 2
            <c>vcvcvc<v> gives 3
        """
        n = 0
        i = self.k0
        # Skip the optional leading consonant sequence.
        while True:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i += 1
        i += 1
        while True:
            # Consume a vowel sequence ...
            while True:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i += 1
            i += 1
            n += 1
            # ... followed by a consonant sequence: one full "vc" counted.
            while True:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i += 1
            i += 1

    def vowelinstem(self):
        """Return True if k0..j contains a vowel."""
        return any(not self.cons(i) for i in range(self.k0, self.j + 1))

    def doublec(self, j):
        """Return True if b[j-1], b[j] are the same consonant."""
        if j < self.k0 + 1:
            return False
        if self.b[j] != self.b[j - 1]:
            return False
        return self.cons(j)

    def cvc(self, i):
        """Return True if b[i-2..i] is consonant-vowel-consonant and the
        final consonant is not w, x or y.

        Used when deciding whether to restore an 'e' at the end of a short
        word, e.g. cav(e), lov(e), hop(e), crim(e), but not snow, box, tray.
        """
        if (i < self.k0 + 2 or not self.cons(i) or self.cons(i - 1)
                or not self.cons(i - 2)):
            return False
        return self.b[i] not in ('w', 'x', 'y')

    def ends(self, s):
        """Return True if k0..k ends with ``s``; on success set j to the
        offset just before the suffix."""
        length = len(s)
        if s[length - 1] != self.b[self.k]:  # tiny speed-up
            return False
        if length > (self.k - self.k0 + 1):
            return False
        if self.b[self.k - length + 1:self.k + 1] != s:
            return False
        self.j = self.k - length
        return True

    def setto(self, s):
        """Set b[j+1..] to the characters of ``s`` and readjust k."""
        length = len(s)
        self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
        self.k = self.j + length

    def r(self, s):
        """Replace the suffix found by ends() with ``s`` when m() > 0."""
        if self.m() > 0:
            self.setto(s)

    def _replace_first(self, rules):
        """Apply the first (suffix, replacement) rule whose suffix matches."""
        for suffix, replacement in rules:
            if self.ends(suffix):
                self.r(replacement)
                return

    def step1ab(self):
        """Get rid of plurals and -ed or -ing, e.g.

            caresses -> caress      feed     -> feed
            ponies   -> poni        agreed   -> agree
            ties     -> ti          disabled -> disable
            caress   -> caress
            cats     -> cat         matting  -> mat
                                    mating   -> mate
            meetings -> meet        meeting  -> meet
                                    milling  -> mill
                                    messing  -> mess
        """
        if self.b[self.k] == 's':
            if self.ends("sses"):
                self.k -= 2
            elif self.ends("ies"):
                self.setto("i")
            elif self.b[self.k - 1] != 's':
                self.k -= 1
        if self.ends("eed"):
            if self.m() > 0:
                self.k -= 1
        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
            self.k = self.j
            if self.ends("at"):
                self.setto("ate")
            elif self.ends("bl"):
                self.setto("ble")
            elif self.ends("iz"):
                self.setto("ize")
            elif self.doublec(self.k):
                self.k -= 1
                # Keep the doubled letter for -ll, -ss, -zz.
                if self.b[self.k] in ('l', 's', 'z'):
                    self.k += 1
            elif self.m() == 1 and self.cvc(self.k):
                self.setto("e")

    def step1c(self):
        """Turn terminal y to i when there is another vowel in the stem."""
        if self.ends("y") and self.vowelinstem():
            self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]

    def step2(self):
        """Map double suffixes to single ones, e.g. -ization -> -ize.

        The string before the suffix must give m() > 0.
        """
        self._replace_first(self._STEP2_RULES.get(self.b[self.k - 1], ()))

    def step3(self):
        """Deal with -ic-, -full, -ness etc.; similar strategy to step2."""
        self._replace_first(self._STEP3_RULES.get(self.b[self.k], ()))

    def step4(self):
        """Take off -ant, -ence etc., in context <c>vcvc<v>."""
        key = self.b[self.k - 1]
        if key == 'o':
            # "-ion" is only removed after s or t; "ou" takes care of -ous.
            matched = ((self.ends("ion") and self.b[self.j] in ('s', 't'))
                       or self.ends("ou"))
        else:
            matched = any(self.ends(suffix)
                          for suffix in self._STEP4_SUFFIXES.get(key, ()))
        if matched and self.m() > 1:
            self.k = self.j

    def step5(self):
        """Remove a final -e if m() > 1, and change -ll to -l if m() > 1."""
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
                self.k -= 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k -= 1

    def stem(self, p, i=0, j=None):
        """Return the stem of the word stored in ``p[i]`` .. ``p[j]`` inclusive.

        ``i`` defaults to 0 and ``j`` to the last offset of ``p``, so
        ``stem(word)`` stems the whole string.  Stemming never increases
        word length.
        """
        if j is None:
            j = len(p) - 1
        self.b = p
        self.k = j
        self.k0 = i
        if self.k <= self.k0 + 1:
            # --DEPARTURE-- strings of length 1 or 2 don't go through the
            # stemming process, although the published algorithm does not
            # mention this.  Return the requested slice (not the whole
            # buffer) so both return paths agree when i/j select a substring.
            return self.b[self.k0:self.k + 1]

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k + 1]
class MLStripper(HTMLParser):
    """HTMLParser subclass that collects text content and discards markup.

    Text found inside any tag listed in ``ignore_tags`` (default:
    ``constants.IGNORE_TAGS``) is skipped, as is any text chunk whose first
    line contains "http".

    Initial skeleton obtained from
    http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """

    # '.' does not match a newline and match() anchors at the start, so this
    # only detects "http" on the first line of a text chunk.
    _URL_RE = re.compile(r"(.*http.*)")

    def __init__(self, ignore_tags=None):
        # The base initialiser must run: on Python 3 it sets
        # ``convert_charrefs``, which feed() reads.  Calling only reset()
        # leaves it unset and feed() raises AttributeError.
        super().__init__(convert_charrefs=True)
        self.fed = []
        self.currentTag = ""
        self.currentAttrs = []
        self.ignoreTags = constants.IGNORE_TAGS if ignore_tags is None else ignore_tags

    def handle_starttag(self, tag, attrs):
        self.currentTag = tag
        self.currentAttrs = attrs

    def handle_endtag(self, tag):
        self.currentTag = ""
        self.currentAttrs = []

    def handle_data(self, d):
        if self.currentTag in self.ignoreTags:
            return
        if self._URL_RE.match(d.lower()):
            return
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into one string."""
        return ''.join(self.fed)


# Convenient function to quickly invoke our special HTML parser
def strip_tags(html, ignore_tags=None):
    """Return the text content of ``html`` with markup removed.

    ``html`` may be ``str`` or UTF-8 encoded ``bytes``; undecodable bytes are
    replaced rather than raising.  ``ignore_tags`` is forwarded to
    :class:`MLStripper`.
    """
    if isinstance(html, (bytes, bytearray)):
        html = html.decode('UTF-8', errors='replace')

    s = MLStripper(ignore_tags)
    s.feed(html)
    s.close()  # flush any text the parser is still buffering
    return s.get_data()
def printWeights(weightsMap, topX=10, skipTerms=None):
    """Print the highest scoring terms of a query vector (debugging aid).

    weightsMap -- dict mapping term -> numeric weight
    topX       -- maximum number of rows to print, or 'ALL' for no limit
    skipTerms  -- container of terms to leave out; defaults to
                  constants.QUERY_SKIP_TERMS
    """
    if skipTerms is None:
        # The constants module defines QUERY_SKIP_TERMS; it has no
        # STOP_WORDS_LIST attribute.
        skipTerms = constants.QUERY_SKIP_TERMS

    printed = 0
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        # Check the limit before printing so that topX=0 prints nothing.
        if topX != 'ALL' and printed >= topX:
            break
        if term in skipTerms:
            continue
        print("%-10s: %10f" % (term, weightsMap[term]))
        printed += 1
import os

NUM_INDEXER_THREADS = 2

# Raw string so the regex escapes (\s, \{, ...) are not read as (invalid)
# Python string escapes.  The pattern text is unchanged.
# NOTE(review): inside the character class, `"-;` is a RANGE from '"' to ';',
# which also makes the digits 0-9 and # $ % * + / delimiters.  This looks
# accidental (a literal '-' was probably intended) but is preserved because
# changing it alters tokenization -- confirm before narrowing.
# Alternative considered by the authors: DELIMITERS = r'\W+'
DELIMITERS = r"""[\s.,=?!:@<>()"-;'&_\{\}\|\[\]\\]+"""

# SECURITY: an account key should not live in source control.  The
# environment variable takes precedence; the literal is kept only as a
# backward-compatible fallback.
BING_ACCT_KEY = os.environ.get('BING_ACCT_KEY', 'EEss/QY1BWmE0o0fSsqvzmcsZ+2S/lTTT0xgvAy4Z8s')
BING_URL = 'https://api.datamarket.azure.com/Data.ashx/Bing/SearchWeb/v1/Web?'
STEM_TOKEN = False
ALPHA = 0
BETA = 1.0
GAMMA = 1.0
STEM_IN_ROCCHIO = False
IGNORE_TAGS = ["style", "script"]

# Terms never offered as query expansions.  Kept as a {term: True} dict so
# both `term in QUERY_SKIP_TERMS` and `QUERY_SKIP_TERMS[term]` keep working.
QUERY_SKIP_TERMS = dict.fromkeys((
    "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "aren", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by",
    "can", "cannot", "could", "couldn",
    "did", "didn", "do", "does", "doesn", "doing", "don", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "hadn", "has", "hasn", "have", "haven", "having", "he", "her",
    "here", "hers", "herself", "him", "himself", "his", "how",
    "if", "in", "into", "is", "isn", "it", "its", "itself",
    "let",
    "me", "more", "most", "mustn", "my", "myself",
    "no", "nor", "not",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own",
    "same", "shan", "she", "should", "shouldn", "so", "some", "such",
    "than", "that", "the", "their", "theirs", "them", "themselves", "then",
    "there", "these", "they", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "wasn", "we", "were", "weren", "what", "when", "where", "which",
    "while", "who", "whom", "why", "with", "would", "wouldn",
    "you", "your", "yours", "yourself", "yourselves",
), True)
class Indexer():
    """Concurrent document indexer that builds an in-memory inverted file.

    Documents are enqueued with indexDocument() and picked up by
    constants.NUM_INDEXER_THREADS daemon worker threads.  Access to
    ``invertedFile`` and ``termsFrequencies`` is guarded by ``ifile_lock``.
    """

    # Seconds to wait for a page before falling back to the result summary.
    # Without a timeout a single unresponsive host blocks a worker forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        logging.info("Initializing indexer")
        self.ifile_lock = threading.Lock()
        self.documents_queue = Queue()
        self.invertedFile = dict()
        self.termsFrequencies = dict()
        # Compiled once here instead of once per document.
        self._delimiters = re.compile(constants.DELIMITERS)

        # Start the workers last so every attribute they use already exists.
        for i in range(constants.NUM_INDEXER_THREADS):
            worker = Thread(target=self.index, args=(i, self.documents_queue), daemon=True)
            worker.start()

    # Enqueues a task in the indexer queue
    def indexDocument(self, document):
        self.documents_queue.put(document)

    def waitForIndexer(self):
        """Block until every enqueued document has been processed."""
        self.documents_queue.join()

    def clearIndex(self):
        """Drop all postings and term frequencies."""
        with self.ifile_lock:
            self.invertedFile = dict()
            self.termsFrequencies = dict()

    def index(self, i, q):
        """Worker loop: take documents from ``q`` forever and index them.

        i -- worker number, used only in log messages
        q -- queue of document dicts
        """
        while True:
            logging.info('Indexer-%s: Waiting for next document', i)
            document = q.get()
            try:
                self._indexOne(i, document)
            except Exception:
                # Keep the worker alive; one bad document must not stop it.
                logging.exception('Indexer-%s: Failed to index document', i)
            finally:
                # Always acknowledge the task, otherwise waitForIndexer()
                # (Queue.join) would block forever after a failure.
                q.task_done()

    def _fetchBody(self, i, document):
        """Return the page text for ``document``, or its summary on failure."""
        try:
            response = requests.get(document["Url"], timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            # Strip out HTML
            return strip_tags(response.text)
        except Exception:
            # Deliberate best-effort: any fetch/parse problem falls back to
            # the summary, but is logged instead of silently swallowed.
            logging.warning('Indexer-%s: Could not retrieve %s, using summary',
                            i, document["Url"], exc_info=True)
            return document["Description"]

    def _indexOne(self, i, document):
        """Tokenize one document and add its terms to the inverted file."""
        logging.info('Indexer-%s: Indexing document #%s (%s)', i, document["ID"], document["Url"])

        # Create key to hold tf weights
        tfVector = document["tfVector"] = {}
        document["Body"] = self._fetchBody(i, document)

        # Tokenizer
        logging.debug('Indexer-%s: Tokenizing document #%s', i, document["ID"])
        tokens = self._delimiters.split(document["Body"])
        logging.debug('Indexer-%s: Found %d tokens', i, len(tokens))

        stemmer = PorterStemmer() if constants.STEM_TOKEN else None
        position = 0  # position among the tokens that were kept

        for token in tokens:
            token = token.lower()
            if stemmer is not None:
                logging.debug('Indexer-%s: Stemming token: \'%s\'', i, token)
                token = stemmer.stem(token, 0, len(token) - 1)

            # Is token eligible to be indexed?
            if len(token) <= 1 or len(token) >= 10 or is_number(token):
                logging.debug('Indexer-%s: Discarding short or empty token \'%s\'', i, token)
                continue

            # Insert into invertedFile
            with self.ifile_lock:
                logging.debug('Indexer-%s: Updating postings for token: %s', i, token)
                self.termsFrequencies[token] = self.termsFrequencies.get(token, 0) + 1
                zones = self.invertedFile.setdefault(token, {}).setdefault(document["ID"], {})
                zones.setdefault("body", []).append(position)

            tfVector[token] = tfVector.get(token, 0) + 1
            position += 1

        logging.info('Indexer-%s: Finished indexing document %s', i, document["ID"])
# Number of results requested per round; precision@10 is relevant / this.
RESULTS_PER_ROUND = 10


def _askRelevance():
    """Prompt until the user answers Y or N; return True for relevant."""
    while True:
        sys.stdout.write('Relevant (Y/N)? ')
        sys.stdout.flush()
        value = input().strip().upper()
        if value == 'Y':
            return True
        if value == 'N':
            return False
        # Re-prompt: an unjudged document would be missing 'IsRelevant'
        # and would not appear in either feedback list.
        print('Invalid value entered!')


def main(arglist):
    """Run the relevance-feedback loop.

    arglist -- argv-style list: [program, target precision@10, query]
    """
    logging.basicConfig(level=logging.ERROR)

    if len(arglist) < 3:
        print("Usage: %s <precision@10> <query>" % arglist[0])
        sys.exit(1)  # exit interpreter

    print('Desired precision@10: {}'.format(arglist[1]))

    try:
        precisionTenTarg = float(arglist[1])  # must convert string to float
    except ValueError:
        print('Target precision must be a number, got: %s' % arglist[1])
        sys.exit(1)

    # create all singleton objects
    bingClient = bingclient.BingClient(constants.BING_ACCT_KEY)
    # Named docIndexer so the imported `indexer` module is not shadowed.
    docIndexer = indexer.Indexer()
    queryOptimizer = rocchio.RocchioOptimizeQuery(arglist[2])

    precisionAtK = 0.00
    expandedQuery = arglist[2]

    # while precision at 10 is less than desired amt issue a query, obtain
    # new precision metric, expand query, repeat
    while precisionAtK < precisionTenTarg:
        print('Parameters')
        print('%-20s= %s' % ("Query", expandedQuery))
        print('%-20s= %s' % ("Target Precision", precisionTenTarg))

        docIndexer.clearIndex()

        results = bingClient.webQuery(expandedQuery, RESULTS_PER_ROUND)
        DocumentList = [
            {'Url': wq.url, 'Description': wq.description, 'Title': wq.title}
            for wq in results
        ]

        print('Total number of results: %d' % len(DocumentList))

        # to calc precision@10 display documents to user and ask them to
        # categorize as Relevant or Non-Relevant
        print('======================')

        # Reset collections for relevant and nonrelevant documents
        relevantDocuments = []
        nonrelevantDocuments = []

        for i, document in enumerate(DocumentList):
            document["ID"] = i
            # Indexing runs in the background while the user judges results.
            docIndexer.indexDocument(document)

            print('Result %d' % (i + 1))
            print('[')
            print(' %-9s: %10s' % ("URL", document["Url"]))
            print(' %-9s: %10s' % ("Title", document["Title"]))
            print(' %-9s: %10s' % ("Summary", document["Description"]))
            print(']')

            print('')
            if _askRelevance():
                document['IsRelevant'] = 1  # 1 is true , 0 is false
                relevantDocuments.append(i)
            else:
                document['IsRelevant'] = 0
                nonrelevantDocuments.append(i)

        # final precision@10 per round
        precisionAtK = float(len(relevantDocuments)) / RESULTS_PER_ROUND

        print('')
        print('Precision@10 is: {}'.format(float(precisionAtK)))
        print('')

        # expand query here by indexing and weighting current document list
        if precisionAtK == 0:
            print('Below desired precision, but can no longer augment the query')
            sys.exit()

        print('Indexing results...')
        docIndexer.waitForIndexer()  # Will block until indexer is done indexing all documents

        # Dump the inverted file; skipped entirely unless INFO logging is on.
        if logging.getLogger().isEnabledFor(logging.INFO):
            invertedFile = docIndexer.invertedFile
            for term in sorted(invertedFile, key=lambda posting: len(invertedFile[posting])):
                logging.info(
                    "%-30s %-2s:%-3d %-2s:%-3d %-3s:%-10f",
                    term,
                    "TF",
                    docIndexer.termsFrequencies[term],
                    "DF",
                    len(invertedFile[term]),
                    "IDF",
                    math.log(float(len(DocumentList)) / len(invertedFile[term]), 10)
                )

        print('======================')
        print('FEEDBACK SUMMARY')

        if precisionAtK < precisionTenTarg:
            print('')
            print('Still below desired precision of %f' % precisionTenTarg)
            queryWeights = queryOptimizer.Rocchio(
                docIndexer.invertedFile,
                DocumentList,
                relevantDocuments,
                nonrelevantDocuments
            )  # optimize new query here

            newTerms = common.getTopTerms(expandedQuery, queryWeights, 2)
            if not newTerms:
                # Fewer than two candidates may come back; with none there
                # is nothing to add and the loop could never make progress.
                print('Below desired precision, but can no longer augment the query')
                sys.exit()

            augmentation = " ".join(newTerms)
            expandedQuery = expandedQuery + " " + augmentation

            print('Augmenting by %s' % augmentation)

    # precision@10 is >= desired, return query and results to user
    print('Desired precision reached, done')


# only if run as standalone script (not imported module) does the __name__
# attribute default to __main__
if __name__ == '__main__':
    main(sys.argv)
class RocchioOptimizeQuery:
    """Keeps a query weight vector and refines it with Rocchio feedback."""

    def __init__(self, firstQueryTerm):
        '''
        Constructor: the initial query string is stored as a single key
        with weight 1.
        '''
        self.query = {firstQueryTerm: 1}

    def Rocchio(self, invertedFile, documentsList, relevantDocs, nonrelevantDocs,
                *, alpha=None, beta=None, gamma=None, stem=None):
        '''
        Update and return the query weight vector (term -> weight).

        invertedFile    -- {term: {docId: postings}}
        documentsList   -- list of document dicts, each with a "tfVector"
        relevantDocs    -- indices into documentsList judged relevant
        nonrelevantDocs -- indices into documentsList judged non-relevant
        alpha, beta, gamma, stem -- optional overrides for constants.ALPHA,
            BETA, GAMMA and STEM_IN_ROCCHIO (None means use the constant)

        Documents that are in neither feedback list are ignored.
        '''
        alpha = constants.ALPHA if alpha is None else alpha
        beta = constants.BETA if beta is None else beta
        gamma = constants.GAMMA if gamma is None else gamma
        stem = constants.STEM_IN_ROCCHIO if stem is None else stem

        if stem:
            # Only build the stemmer when it is actually needed.
            stemmer = PorterStemmer.PorterStemmer()

            def normalize(term):
                return stemmer.stem(term.lower(), 0, len(term) - 1)
        else:
            def normalize(term):
                return term

        def sumTermFrequencies(docIds):
            # Total term frequency per (normalized) term over the given docs.
            totals = {}
            for docId in docIds:
                tfVector = documentsList[docId]["tfVector"]
                for term, tf in tfVector.items():
                    sterm = normalize(term)
                    totals[sterm] = totals.get(sterm, 0) + tf
            return totals

        # initialize weight vector for each key in inverted file
        weights = {normalize(term): 0.0 for term in invertedFile}
        print('')

        relevantDocsTFWeights = sumTermFrequencies(relevantDocs)
        nonrelevantDocsTFWeights = sumTermFrequencies(nonrelevantDocs)

        # Membership in the feedback lists decides relevance, so documents
        # without an 'IsRelevant' judgement cannot raise KeyError.
        relevantIds = set(relevantDocs)
        nonrelevantIds = set(nonrelevantDocs)

        # ------------------------------------- #
        # Compute Rocchio vector
        for term, postings in invertedFile.items():
            # Guard against a term with no postings (would divide by zero).
            idf = math.log(float(len(documentsList)) / float(len(postings)), 10) if postings else 0.0
            sterm = normalize(term)

            # Terms 2 and 3 of Rocchio algorithm
            # NOTE(review): the per-class TF total is added once per posting
            # document, so it is scaled by the number of such documents.
            # Preserved as-is; confirm this weighting is intended.
            for docId in postings:
                if docId in relevantIds:
                    # Term 2: relevant documents weights normalized and given BETA weight
                    weights[sterm] += beta * idf * (
                        relevantDocsTFWeights.get(sterm, 0) / len(relevantDocs)
                    )
                elif docId in nonrelevantIds:
                    # Term 3: non-relevant documents weights normalized and given GAMMA weight
                    weights[sterm] -= gamma * idf * (
                        nonrelevantDocsTFWeights.get(sterm, 0) / len(nonrelevantDocs)
                    )

            # Term 1 of Rocchio, query terms
            if term in self.query:
                # build new query vector of weights
                self.query[term] = alpha * self.query[term] + weights[sterm]
            elif weights[sterm] > 0:
                self.query[term] = weights[sterm]

        return self.query