From cc8ea72ee7883940f2d2210507e533e0a59c8d2c Mon Sep 17 00:00:00 2001
From: Aivin Solatorio
Date: Mon, 12 Sep 2016 09:51:19 +0800
Subject: [PATCH 1/2] Port codes to python3 to solve SSL problems.

---
 .gitignore             |   1 +
 p3/PorterStemmer_p3.py | 367 +++++++++++++++++++++++++++++++++++++++++
 p3/bingclient_p3.py    |  44 +++++
 p3/common_p3.py        | 112 +++++++++++++
 p3/constants_p3.py     | 171 +++++++++++++++++++
 p3/indexer_p3.py       | 173 +++++++++++++++++++
 p3/main_p3.py          | 157 ++++++++++++++++++
 p3/parser_p3.py        |  51 ++++++
 p3/rocchio_p3.py       | 100 +++++++++++
 9 files changed, 1176 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 p3/PorterStemmer_p3.py
 create mode 100644 p3/bingclient_p3.py
 create mode 100644 p3/common_p3.py
 create mode 100644 p3/constants_p3.py
 create mode 100644 p3/indexer_p3.py
 create mode 100644 p3/main_p3.py
 create mode 100644 p3/parser_p3.py
 create mode 100644 p3/rocchio_p3.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/p3/PorterStemmer_p3.py b/p3/PorterStemmer_p3.py
new file mode 100644
index 0000000..3b7af40
--- /dev/null
+++ b/p3/PorterStemmer_p3.py
@@ -0,0 +1,367 @@
+#!/usr/bin/env python
+
+"""Porter Stemming Algorithm
+This is the Porter stemming algorithm, ported to Python from the
+version coded up in ANSI C by the author. It may be regarded
+as canonical, in that it follows the algorithm presented in
+
+Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+no. 3, pp 130-137,
+
+only differing from it at the points marked --DEPARTURE-- below.
+
+See also http://www.tartarus.org/~martin/PorterStemmer
+
+The algorithm as described in the paper could be exactly replicated
+by adjusting the points of DEPARTURE, but this is barely necessary,
+because (a) the points of DEPARTURE are definitely improvements, and
+(b) no encoding of the Porter stemmer I have seen is anything like
+as exact as this version, even with the points of DEPARTURE!
+
+Vivake Gupta (v@nano.com)
+
+Release 1: January 2001
+
+Further adjustments by Santiago Bruno (bananabruno@gmail.com)
+to allow word input not restricted to one word per line, leading
+to:
+
+release 2: July 2008
+"""
+
+import sys
+
+class PorterStemmer:
+
+    def __init__(self):
+        """The main part of the stemming algorithm starts here.
+        b is a buffer holding a word to be stemmed. The letters are in b[k0],
+        b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
+        readjusted downwards as the stemming progresses. Zero termination is
+        not in fact used in the algorithm.
+
+        Note that only lower case sequences are stemmed. Forcing to lower case
+        should be done before stem(...) is called.
+        """
+
+        self.b = ""   # buffer for word to be stemmed
+        self.k = 0
+        self.k0 = 0
+        self.j = 0    # j is a general offset into the string
+
+    def cons(self, i):
+        """cons(i) is TRUE <=> b[i] is a consonant."""
+        if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
+            return 0
+        if self.b[i] == 'y':
+            if i == self.k0:
+                return 1
+            else:
+                return (not self.cons(i - 1))
+        return 1
+
+    def m(self):
+        """m() measures the number of consonant sequences between k0 and j.
+        if c is a consonant sequence and v a vowel sequence, and <..>
+        indicates arbitrary presence,
+
+           <c><v>       gives 0
+           <c>vc<v>     gives 1
+           <c>vcvc<v>   gives 2
+           <c>vcvcvc<v> gives 3
+           ....
+        """
+        n = 0
+        i = self.k0
+        while 1:
+            if i > self.j:
+                return n
+            if not self.cons(i):
+                break
+            i = i + 1
+        i = i + 1
+        while 1:
+            while 1:
+                if i > self.j:
+                    return n
+                if self.cons(i):
+                    break
+                i = i + 1
+            i = i + 1
+            n = n + 1
+            while 1:
+                if i > self.j:
+                    return n
+                if not self.cons(i):
+                    break
+                i = i + 1
+            i = i + 1
+
+    def vowelinstem(self):
+        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
+        for i in range(self.k0, self.j + 1):
+            if not self.cons(i):
+                return 1
+        return 0
+
+    def doublec(self, j):
+        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
+        if j < (self.k0 + 1):
+            return 0
+        if (self.b[j] != self.b[j-1]):
+            return 0
+        return self.cons(j)
+
+    def cvc(self, i):
+        """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
+        and also if the second c is not w,x or y. this is used when trying to
+        restore an e at the end of a short word. e.g.
+
+           cav(e), lov(e), hop(e), crim(e), but
+           snow, box, tray.
+        """
+        if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
+            return 0
+        ch = self.b[i]
+        if ch == 'w' or ch == 'x' or ch == 'y':
+            return 0
+        return 1
+
+    def ends(self, s):
+        """ends(s) is TRUE <=> k0,...k ends with the string s."""
+        length = len(s)
+        if s[length - 1] != self.b[self.k]:  # tiny speed-up
+            return 0
+        if length > (self.k - self.k0 + 1):
+            return 0
+        if self.b[self.k-length+1:self.k+1] != s:
+            return 0
+        self.j = self.k - length
+        return 1
+
+    def setto(self, s):
+        """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
+        length = len(s)
+        self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
+        self.k = self.j + length
+
+    def r(self, s):
+        """r(s) is used further down."""
+        if self.m() > 0:
+            self.setto(s)
+
+    def step1ab(self):
+        """step1ab() gets rid of plurals and -ed or -ing. e.g.
+
+           caresses  ->  caress
+           ponies    ->  poni
+           ties      ->  ti
+           caress    ->  caress
+           cats      ->  cat
+
+           feed      ->  feed
+           agreed    ->  agree
+           disabled  ->  disable
+
+           matting   ->  mat
+           mating    ->  mate
+           meeting   ->  meet
+           milling   ->  mill
+           messing   ->  mess
+
+           meetings  ->  meet
+        """
+        if self.b[self.k] == 's':
+            if self.ends("sses"):
+                self.k = self.k - 2
+            elif self.ends("ies"):
+                self.setto("i")
+            elif self.b[self.k - 1] != 's':
+                self.k = self.k - 1
+        if self.ends("eed"):
+            if self.m() > 0:
+                self.k = self.k - 1
+        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
+            self.k = self.j
+            if self.ends("at"): self.setto("ate")
+            elif self.ends("bl"): self.setto("ble")
+            elif self.ends("iz"): self.setto("ize")
+            elif self.doublec(self.k):
+                self.k = self.k - 1
+                ch = self.b[self.k]
+                if ch == 'l' or ch == 's' or ch == 'z':
+                    self.k = self.k + 1
+            elif (self.m() == 1 and self.cvc(self.k)):
+                self.setto("e")
+
+    def step1c(self):
+        """step1c() turns terminal y to i when there is another vowel in the stem."""
+        if (self.ends("y") and self.vowelinstem()):
+            self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
+
+    def step2(self):
+        """step2() maps double suffixes to single ones.
+        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
+        string before the suffix must give m() > 0.
+        """
+        if self.b[self.k - 1] == 'a':
+            if self.ends("ational"): self.r("ate")
+            elif self.ends("tional"): self.r("tion")
+        elif self.b[self.k - 1] == 'c':
+            if self.ends("enci"): self.r("ence")
+            elif self.ends("anci"): self.r("ance")
+        elif self.b[self.k - 1] == 'e':
+            if self.ends("izer"): self.r("ize")
+        elif self.b[self.k - 1] == 'l':
+            if self.ends("bli"): self.r("ble")  # --DEPARTURE--
+            # To match the published algorithm, replace this phrase with
+            #   if self.ends("abli"): self.r("able")
+            elif self.ends("alli"): self.r("al")
+            elif self.ends("entli"): self.r("ent")
+            elif self.ends("eli"): self.r("e")
+            elif self.ends("ousli"): self.r("ous")
+        elif self.b[self.k - 1] == 'o':
+            if self.ends("ization"): self.r("ize")
+            elif self.ends("ation"): self.r("ate")
+            elif self.ends("ator"): self.r("ate")
+        elif self.b[self.k - 1] == 's':
+            if self.ends("alism"): self.r("al")
+            elif self.ends("iveness"): self.r("ive")
+            elif self.ends("fulness"): self.r("ful")
+            elif self.ends("ousness"): self.r("ous")
+        elif self.b[self.k - 1] == 't':
+            if self.ends("aliti"): self.r("al")
+            elif self.ends("iviti"): self.r("ive")
+            elif self.ends("biliti"): self.r("ble")
+        elif self.b[self.k - 1] == 'g':  # --DEPARTURE--
+            if self.ends("logi"): self.r("log")
+            # To match the published algorithm, delete this phrase
+
+    def step3(self):
+        """step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
+        if self.b[self.k] == 'e':
+            if self.ends("icate"): self.r("ic")
+            elif self.ends("ative"): self.r("")
+            elif self.ends("alize"): self.r("al")
+        elif self.b[self.k] == 'i':
+            if self.ends("iciti"): self.r("ic")
+        elif self.b[self.k] == 'l':
+            if self.ends("ical"): self.r("ic")
+            elif self.ends("ful"): self.r("")
+        elif self.b[self.k] == 's':
+            if self.ends("ness"): self.r("")
+
+    def step4(self):
+        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
+        if self.b[self.k - 1] == 'a':
+            if self.ends("al"): pass
+            else: return
+        elif self.b[self.k - 1] == 'c':
+            if self.ends("ance"): pass
+            elif self.ends("ence"): pass
+            else: return
+        elif self.b[self.k - 1] == 'e':
+            if self.ends("er"): pass
+            else: return
+        elif self.b[self.k - 1] == 'i':
+            if self.ends("ic"): pass
+            else: return
+        elif self.b[self.k - 1] == 'l':
+            if self.ends("able"): pass
+            elif self.ends("ible"): pass
+            else: return
+        elif self.b[self.k - 1] == 'n':
+            if self.ends("ant"): pass
+            elif self.ends("ement"): pass
+            elif self.ends("ment"): pass
+            elif self.ends("ent"): pass
+            else: return
+        elif self.b[self.k - 1] == 'o':
+            if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
+            elif self.ends("ou"): pass
+            # takes care of -ous
+            else: return
+        elif self.b[self.k - 1] == 's':
+            if self.ends("ism"): pass
+            else: return
+        elif self.b[self.k - 1] == 't':
+            if self.ends("ate"): pass
+            elif self.ends("iti"): pass
+            else: return
+        elif self.b[self.k - 1] == 'u':
+            if self.ends("ous"): pass
+            else: return
+        elif self.b[self.k - 1] == 'v':
+            if self.ends("ive"): pass
+            else: return
+        elif self.b[self.k - 1] == 'z':
+            if self.ends("ize"): pass
+            else: return
+        else:
+            return
+        if self.m() > 1:
+            self.k = self.j
+
+    def step5(self):
+        """step5() removes a final -e if m() > 1, and changes -ll to -l if
+        m() > 1.
+        """
+        self.j = self.k
+        if self.b[self.k] == 'e':
+            a = self.m()
+            if a > 1 or (a == 1 and not self.cvc(self.k-1)):
+                self.k = self.k - 1
+        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
+            self.k = self.k - 1
+
+    def stem(self, p, i, j):
+        """In stem(p,i,j), p is a char pointer, and the string to be stemmed
+        is from p[i] to p[j] inclusive. Typically i is zero and j is the
+        offset to the last character of a string, (p[j+1] == '\0'). The
+        stemmer adjusts the characters p[i] ... p[j] and returns the new
+        end-point of the string, k. Stemming never increases word length, so
+        i <= k <= j. To turn the stemmer into a module, declare 'stem' as
+        extern, and delete the remainder of this file.
+        """
+        # copy the parameters into statics
+        self.b = p
+        self.k = j
+        self.k0 = i
+        if self.k <= self.k0 + 1:
+            return self.b  # --DEPARTURE--
+
+        # With this line, strings of length 1 or 2 don't go through the
+        # stemming process, although no mention is made of this in the
+        # published algorithm. Remove the line to match the published
+        # algorithm.
+
+        self.step1ab()
+        self.step1c()
+        self.step2()
+        self.step3()
+        self.step4()
+        self.step5()
+        return self.b[self.k0:self.k+1]
+
+
+if __name__ == '__main__':
+    p = PorterStemmer()
+    if len(sys.argv) > 1:
+        for f in sys.argv[1:]:
+            infile = open(f, 'r')
+            while 1:
+                output = ''
+                word = ''
+                line = infile.readline()
+                if line == '':
+                    break
+                for c in line:
+                    if c.isalpha():
+                        word += c.lower()
+                    else:
+                        if word:
+                            output += p.stem(word, 0, len(word) - 1)
+                            word = ''
+                        output += c.lower()
+                print(output, end='')  # py3 equivalent of the py2 `print output,` that the port had commented out
+            infile.close()
diff --git a/p3/bingclient_p3.py b/p3/bingclient_p3.py
new file mode 100644
index 0000000..f02f26d
--- /dev/null
+++ b/p3/bingclient_p3.py
@@ -0,0 +1,44 @@
+'''
+Created on Sep 21, 2012
+
+@author: johnterzis
+
+BingClient takes an Account Key in its ctor and exposes a web search query
+method that wraps the Bing Search API 1.0.
+
+Parameters are standardized based on the assignment requirements, and a query
+returns only the top 10 results, in JSON format.
+'''
+
+import logging
+from py_bing_search import PyBingWebSearch
+
+
+class BingClient:
+    '''
+    classdocs
+    '''
+    def __init__(self, AccountKey=None):
+        '''
+        Constructor
+        '''
+
+        # enforce pseudo-privacy of the account key member with the __ prefix
+        self.__i_accountKey = AccountKey
+
+        if self.__i_accountKey is None:
+            logging.error('Account Key is NULL!!!')
+
+    # send a web query to the Bing Search API, returning the top results as json
+    def webQuery(self, query, result_num=10):
+        # format the query based on the OData protocol and the desired JSON result format
+        full_query = query.replace(' ', '+')
+        logging.debug('Sending following URL query: ' + full_query)
+
+        print('%-20s= %s' % ("URL", full_query))
+
+        bing_web = PyBingWebSearch(self.__i_accountKey, full_query, web_only=False)
+        first_n_result = bing_web.search(limit=result_num, format='json')
+
+        return first_n_result
diff --git a/p3/common_p3.py b/p3/common_p3.py
new file mode 100644
index 0000000..bd46b6e
--- /dev/null
+++ b/p3/common_p3.py
@@ -0,0 +1,112 @@
+'''
+
+@author: aiman.najjar
+
+Functions that are commonly used across the project
+
+'''
+
+import constants_p3 as constants
+import re
+from html.parser import HTMLParser
+from PorterStemmer_p3 import PorterStemmer
+
+
+'''
+MLStripper:
+    An implementation of the HTMLParser class that returns only useful terms and discards other markup.
+    The initial skeleton of this implementation was obtained from the following StackOverflow page but was
+    modified as per our
+    needs:
+    http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
+'''
+
+
+class MLStripper(HTMLParser):
+    def __init__(self):
+        # py3: HTMLParser itself must be initialized (it sets convert_charrefs
+        # and other state that feed() relies on)
+        super().__init__()
+        self.reset()
+        self.fed = []
+        self.currentTag = ""
+        self.currentAttrs = []
+
+    def handle_starttag(self, tag, attrs):
+        self.currentTag = tag
+        self.currentAttrs = attrs
+
+    def handle_endtag(self, tag):
+        self.currentTag = ""
+        self.currentAttrs = []
+
+    def handle_data(self, d):
+        if self.currentTag not in constants.IGNORE_TAGS:
+            res = re.match(r"(.*http.*)", d.lower())
+            if not res:
+                self.fed.append(d)
+
+    def get_data(self):
+        return ''.join(self.fed)
+
+
+# Convenient function to quickly invoke our special HTML parser
+def strip_tags(html):
+    s = MLStripper()
+    # py3: str has no decode() method, so only decode when handed raw bytes
+    if isinstance(html, bytes):
+        html = html.decode('utf-8')
+    s.feed(html)
+    return s.get_data()
+
+
+def is_number(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+
+'''
+getTopTerms:
+    Given the current query and the new query vector, return the highest scoring terms (default 2 terms)
+    The current query is used to ensure that returned terms are actually new
+'''
+
+
+def getTopTerms(currentQuery, weightsMap, topX=2):
+
+    p = PorterStemmer()
+    current_terms = []
+    # for term in currentQuery.split():
+    #     term = p.stem(term.lower(), 0, len(term) - 1)
+    #     current_terms.append(term)
+
+    i = 0
+    terms = []
+    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
+        if term in constants.QUERY_SKIP_TERMS or p.stem(term.lower(), 0, len(term) - 1) in current_terms:
+            continue
+        terms.append(term)
+        current_terms.append(p.stem(term.lower(), 0, len(term) - 1))
+        i = i + 1
+        if (topX != 'ALL' and i >= topX):
+            break
+
+    return terms
+
+
+'''
+printWeights:
+    Given the new query vector, print out the highest scoring terms (default 10 terms)
+    Used for debugging purposes only
+'''
+
+
+def printWeights(weightsMap, topX=10):
+    i = 0
+    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
+        # constants_p3 defines QUERY_SKIP_TERMS; there is no STOP_WORDS_LIST
+        if term in constants.QUERY_SKIP_TERMS:
+            continue
+        print("%-10s: %10f" % (term, weightsMap[term]))
+        i = i + 1
+        if (topX != 'ALL' and i >= topX):
+            break
diff --git a/p3/constants_p3.py b/p3/constants_p3.py
new file mode 100644
index 0000000..e4dfe6d
--- /dev/null
+++ b/p3/constants_p3.py
@@ -0,0 +1,171 @@
+'''
+
+@author: aiman.najjar
+
+Constants and Setting Variables.
+
+NUM_INDEXER_THREADS : How many indexer worker threads should work concurrently
+DELIMITERS          : RegEx expression that words are separated (tokenized) on
+BING_ACCT_KEY       : Bing Account Key required to invoke the Bing API
+BING_URL            : Prefix of the Bing Query API URL
+STEM_TOKEN          : Flag indicating whether tokens should be stemmed in the invertedFile (useful to experiment around)
+ALPHA               : Weight for previous query vector terms used while computing the expanded query vector (first term in Rocchio Alg.)
+BETA                : Weight for relevant document vector terms used while computing the expanded query vector (second term in Rocchio Alg.)
+GAMMA               : Weight for non-relevant document vector terms used while computing the expanded query vector (third term in Rocchio Alg.)
+STEM_IN_ROCCHIO     : Flag indicating whether terms should be stemmed before summing their weights in the Rocchio formula
+IGNORE_TAGS         : A list of HTML tags whose content must be ignored (e.g. tags that contain only css or javascript code)
+QUERY_SKIP_TERMS    : A list of terms that should not be considered in the expanded query even with high scores (e.g.
+                      stop words)
+'''
+
+
+NUM_INDEXER_THREADS = 2
+DELIMITERS = r'[\s.,=?!:@<>()"-;\'&_\{\}\|\[\]\\]+'  # raw string so py3 does not warn about invalid escapes; DELIMITERS = r'\W+'
+BING_ACCT_KEY = 'EEss/QY1BWmE0o0fSsqvzmcsZ+2S/lTTT0xgvAy4Z8s'
+BING_URL = 'https://api.datamarket.azure.com/Data.ashx/Bing/SearchWeb/v1/Web?'
+STEM_TOKEN = False
+ALPHA = 0
+BETA = 1.0
+GAMMA = 1.0
+STEM_IN_ROCCHIO = False
+IGNORE_TAGS = ["style", "script"]
+QUERY_SKIP_TERMS = { "about" : True,
+"above" : True,
+"after" : True,
+"again" : True,
+"against" : True,
+"all" : True,
+"am" : True,
+"an" : True,
+"and" : True,
+"any" : True,
+"are" : True,
+"aren" : True,
+"as" : True,
+"at" : True,
+"be" : True,
+"because" : True,
+"been" : True,
+"before" : True,
+"being" : True,
+"below" : True,
+"between" : True,
+"both" : True,
+"but" : True,
+"by" : True,
+"can" : True,
+"cannot" : True,
+"could" : True,
+"couldn" : True,
+"did" : True,
+"didn" : True,
+"do" : True,
+"does" : True,
+"doesn" : True,
+"doing" : True,
+"don" : True,
+"down" : True,
+"during" : True,
+"each" : True,
+"few" : True,
+"for" : True,
+"from" : True,
+"further" : True,
+"had" : True,
+"hadn" : True,
+"has" : True,
+"hasn" : True,
+"have" : True,
+"haven" : True,
+"having" : True,
+"he" : True,
+"her" : True,
+"here" : True,
+"hers" : True,
+"herself" : True,
+"him" : True,
+"himself" : True,
+"his" : True,
+"how" : True,
+"if" : True,
+"in" : True,
+"into" : True,
+"is" : True,
+"isn" : True,
+"it" : True,
+"its" : True,
+"itself" : True,
+"let" : True,
+"me" : True,
+"more" : True,
+"most" : True,
+"mustn" : True,
+"my" : True,
+"myself" : True,
+"no" : True,
+"nor" : True,
+"not" : True,
+"of" : True,
+"off" : True,
+"on" : True,
+"once" : True,
+"only" : True,
+"or" : True,
+"other" : True,
+"ought" : True,
+"our" : True,
+"ours" : True,
+"ourselves" : True,
+"out" : True,
+"over" : True,
+"own" : True,
+"same" : True,
+"shan" : True,
+"she" : True,
+"should" : True,
+"shouldn" : True,
+"so" : True,
+"some" : True,
+"such" : True,
+"than" : True,
+"that" : True,
+"the" : True,
+"their" : True,
+"theirs" : True,
+"them" : True,
+"themselves" : True,
+"then" : True,
+"there" : True,
+"these" : True,
+"they" : True,
+"this" : True,
+"those" : True,
+"through" : True,
+"to" : True,
+"too" : True,
+"under" : True,
+"until" : True,
+"up" : True,
+"very" : True,
+"was" : True,
+"wasn" : True,
+"we" : True,
+"were" : True,
+"weren" : True,
+"what" : True,
+"when" : True,
+"where" : True,
+"which" : True,
+"while" : True,
+"who" : True,
+"whom" : True,
+"why" : True,
+"with" : True,
+"would" : True,
+"wouldn" : True,
+"you" : True,
+"your" : True,
+"yours" : True,
+"yourself" : True,
+"yourselves" : True }
diff --git a/p3/indexer_p3.py b/p3/indexer_p3.py
new file mode 100644
index 0000000..fac6a00
--- /dev/null
+++ b/p3/indexer_p3.py
@@ -0,0 +1,173 @@
+'''
+Created on Sep 21, 2012
+
+@author: aiman.najjar
+
+This class is responsible for indexing the documents; it performs the following steps:
+    1. Retrieves the body content of the document; if the HTTP request fails, the body
+       summary returned by the Bing API is used
+    2. Tokenizes the document text based on the constants.DELIMITERS regular expression
+    3. OPTIONAL: Stems each token (default is False; the setting can be changed in constants.py)
+    4. Throws out terms that are likely to be useless (e.g. length is 1 or numerical only)
+    5. Inserts the term into the invertedFile
+    6.
+       In the same pass, we compute term frequencies for each term in the document and store the
+       weight in document["tfVector"][term]; this is useful later for Rocchio
+
+Note that this indexer is set up to work concurrently and build the index dynamically, as opposed
+to indexing the document collection all at once.
+To index a document, the document object should be enqueued in documents_queue, and one of the
+worker threads will pick it up and process it.
+Therefore, a mutex lock is necessary while accessing the invertedFile to ensure dictionary consistency.
+
+Here is the invertedFile structure:
+
+    invertedFile =
+    {
+        "Term 1" : {
+            "DocID 1" :
+            {
+                "body": [0,3,4,2,1]   # List of positions
+                .
+                .
+                other zones (currently only indexing body)
+            }
+            .
+            .
+            .
+            other documents
+        }
+        .
+        .
+        .
+        .
+        other terms
+    }
+
+
+You will notice our liberal usage of hash maps, which are convenient for quick access but consume
+more memory; we explain our design choice in the README file.
+
+
+'''
+
+import threading
+import re
+import requests
+import logging
+import constants_p3 as constants
+from PorterStemmer_p3 import PorterStemmer
+from common_p3 import *
+from queue import Queue
+from threading import Thread
+
+
+class Indexer():
+
+    def __init__(self):
+        logging.info("Initializing indexer")
+        self.ifile_lock = threading.Lock()
+        self.documents_queue = Queue()
+        self.invertedFile = dict()
+        self.termsFrequencies = dict()
+
+        for i in range(constants.NUM_INDEXER_THREADS):
+            worker = Thread(target=self.index, args=(i, self.documents_queue,))
+            worker.daemon = True  # py3: preferred over the deprecated setDaemon()
+            worker.start()
+
+    # Enqueues a task in the indexer queue
+    def indexDocument(self, document):
+        self.documents_queue.put(document)
+
+    def waitForIndexer(self):
+        self.documents_queue.join()
+
+    def clearIndex(self):
+        with self.ifile_lock:
+            self.invertedFile = dict()
+            self.termsFrequencies = dict()
+
+    def index(self, i, q):
+        while True:
+            logging.info('Indexer-%s: Waiting for next document' % i)
+            document = q.get()
+
+            logging.info('Indexer-%s: Indexing document #%s (%s)' % (i, document["ID"], document["Url"]))
+
+            # Create key to hold tf weights
+            document["tfVector"] = {}
+
+            # Retrieve the entire document
+            url = document["Url"]
+
+            try:
+                response = requests.get(url)
+                body = response.text  # py3: requests already decodes the body to str
+                # Strip out HTML
+                document["Body"] = strip_tags(body)
+            except Exception:
+                document["Body"] = document["Description"]
+
+            # Terms List
+            terms = []
+
+            # Tokenizer
+            logging.debug('Indexer-%s: Tokenizing document #%s' % (i, document["ID"]))
+            tokens = re.compile(constants.DELIMITERS).split(document["Body"])
+            logging.debug('Indexer-%s: Found %d tokens' % (i, len(tokens)))
+            j = 0
+
+            # Process Tokens
+            p = PorterStemmer()
+            for token in tokens:
+
+                # Stem Token
+                if (constants.STEM_TOKEN):
+                    logging.debug('Indexer-%s: Stemming token: \'%s\'' % (i, token))
+                    token = p.stem(token.lower(), 0, len(token) - 1)
+                else:
+                    token = token.lower()
+
+                # Is the token eligible to be indexed?
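+                # We keep only tokens that look like useful index terms:
+                # non-empty, longer than one character, shorter than ten
+                # characters (a heuristic cap that drops noise such as long
+                # URL fragments and hex strings), and not a pure number.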
+                if (token == '' or len(token) <= 1 or len(token) >= 10 or is_number(token)):
+                    logging.debug('Indexer-%s: Discarding empty, too short/long, or numeric token \'%s\'' % (i, token))
+                    continue
+
+                terms.append(token)
+
+                # Insert into invertedFile
+                with self.ifile_lock:
+                    logging.debug('Indexer-%s: Updating postings for token: %s' % (i, token))
+
+                    if token not in self.termsFrequencies:
+                        self.termsFrequencies[token] = 1
+                    else:
+                        self.termsFrequencies[token] = self.termsFrequencies[token] + 1
+
+                    if token not in self.invertedFile:
+                        self.invertedFile[token] = {}
+
+                    if document["ID"] not in self.invertedFile[token]:
+                        self.invertedFile[token][document["ID"]] = {}
+
+                    body_postings = []
+                    if "body" in self.invertedFile[token][document["ID"]]:
+                        body_postings = self.invertedFile[token][document["ID"]]["body"]
+                        body_postings.append(j)
+                    else:
+                        self.invertedFile[token][document["ID"]]["body"] = [j]
+
+                    if (token in document["tfVector"]):
+                        document["tfVector"][token] = document["tfVector"][token] + 1
+                    else:
+                        document["tfVector"][token] = 1
+
+                j = j + 1
+
+            logging.info('Indexer-%s: Finished indexing document %s' % (i, document["ID"]))
+            q.task_done()
diff --git a/p3/main_p3.py b/p3/main_p3.py
new file mode 100644
index 0000000..8efea39
--- /dev/null
+++ b/p3/main_p3.py
@@ -0,0 +1,157 @@
+'''
+Created on Sep 21, 2012
+
+@author: johnterzis
+
+arguments: <target precision@10> <query>
+
+Contains the main loop of the application
+
+'''
+
+import sys
+import bingclient_p3 as bingclient
+import constants_p3 as constants
+import logging
+import indexer_p3 as indexer
+import rocchio_p3 as rocchio
+import common_p3 as common
+import math
+
+# __name__ defaults to '__main__' only when run as a standalone script (not as an imported module);
+# assume the first arg is the target precision@10 and the second is the query
+if __name__ == '__main__':
+
+    logging.basicConfig(level=logging.ERROR)
+
+    # create all singleton objects
+    arglist = sys.argv
+    if len(arglist) < 3:
+        print("Usage: <target precision@10> <query>")
+        sys.exit(1)  # exit interpreter
+
+    print('Desired precision@10: {}'.format(arglist[1]))
+
+    precisionTenTarg = float(arglist[1])  # must convert string to float
+    # 'eECeOiLBFOie0G3C03YjoHSqb1aMhEfqk8qe7Xi2YMs='
+    # connect to the client with the account key from constants and post the query in arglist[2]
+
+    bingClient = bingclient.BingClient(constants.BING_ACCT_KEY)
+    indexer = indexer.Indexer()  # note: rebinds the module name to the singleton instance
+    queryOptimizer = rocchio.RocchioOptimizeQuery(arglist[2])
+
+    firstPass = 1
+    precisionAtK = 0.00
+    expandedQuery = arglist[2]
+    queryWeights = {}
+
+    # while precision at 10 is less than the desired amount: issue a query, obtain a new
+    # precision metric, expand the query, repeat
+    while (precisionAtK < precisionTenTarg):
+        precisionAtK = 0.00  # reset precision each round
+        # PROCESS A QUERY
+
+        print('Parameters')
+        print('%-20s= %s' % ("Query", expandedQuery))
+        print('%-20s= %s' % ("Target Precision", precisionTenTarg))
+
+        indexer.clearIndex()
+
+        if firstPass == 1:
+            results = bingClient.webQuery(arglist[2])
+        else:
+            results = bingClient.webQuery(expandedQuery)
+
+        DocumentList = []
+
+        for wq in results:
+            d = {}
+            d['Url'] = wq.url
+            d['Description'] = wq.description
+            d['Title'] = wq.title
+            DocumentList.append(d)
+
+        print('Total number of results: %d' % len(DocumentList))
+
+        # to calculate precision@10, display the documents to the user and ask them to
+        # categorize each as Relevant or Non-Relevant
+        print('======================')
+
+        # Reset collections for relevant and nonrelevant documents
+        relevantDocuments = []
+        nonrelevantDocuments = []
+
+        for i in range(len(DocumentList)):
+
+            DocumentList[i]["ID"] = i
+            indexer.indexDocument(DocumentList[i])
+
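+            # indexDocument() only enqueues the page; a daemon worker thread
+            # fetches and tokenizes it in the background while the user is
+            # prompted for relevance judgments below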
+            print('Result %d' % (i + 1))
+            print('[')
+            print(' %-9s: %10s' % ("URL", DocumentList[i]["Url"]))
+            print(' %-9s: %10s' % ("Title", DocumentList[i]["Title"]))
+            print(' %-9s: %10s' % ("Summary", DocumentList[i]["Description"]))
+            print(']')
+
+            print('')
+            sys.stdout.write('Relevant (Y/N)? ')
+            value = input()
+            if value.upper() == 'Y':
+                DocumentList[i]['IsRelevant'] = 1  # 1 is true, 0 is false
+                precisionAtK = precisionAtK + 1
+                relevantDocuments.append(i)
+            elif value.upper() == 'N':
+                DocumentList[i]['IsRelevant'] = 0  # 1 is true, 0 is false
+                nonrelevantDocuments.append(i)
+            else:
+                print('Invalid value entered!')
+
+        precisionAtK = float(precisionAtK) / 10  # final precision@10 per round (always out of 10 results)
+
+        print('')
+        print('Precision@10 is: {}'.format(float(precisionAtK)))
+        print('')
+
+        # expand the query here by indexing and weighting the current document list
+        if (precisionAtK == 0):
+            print('Below desired precision, but can no longer augment the query')
+            sys.exit()
+
+        print('Indexing results...')
+        indexer.waitForIndexer()  # will block until the indexer is done indexing all documents
+
+        # print inverted file statistics (TF, DF, IDF), sorted by document frequency
+        for term in sorted(indexer.invertedFile, key=lambda posting: len(indexer.invertedFile[posting].keys())):
+            logging.info(
+                "%-30s %-2s:%-3d %-2s:%-3d %-3s:%-10f" % (
+                    term,
+                    "TF",
+                    indexer.termsFrequencies[term],
+                    "DF",
+                    len(indexer.invertedFile[term]),
+                    "IDF",
+                    math.log(float(len(DocumentList)) / len(indexer.invertedFile[term].keys()), 10)
+                )
+            )
+
+        print('======================')
+        print('FEEDBACK SUMMARY')
+
+        if (precisionAtK < precisionTenTarg):
+            print('')
+            print('Still below desired precision of %f' % precisionTenTarg)
+            queryWeights = queryOptimizer.Rocchio(
+                indexer.invertedFile,
+                DocumentList,
+                relevantDocuments,
+                nonrelevantDocuments
+            )  # optimize the new query here
+
+            newTerms = common.getTopTerms(expandedQuery, queryWeights, 2)
+            expandedQuery = expandedQuery + " " + newTerms[0] + " " + newTerms[1]
+            firstPass = 0
+
+            print('Augmenting by %s %s' % (newTerms[0], newTerms[1]))
+
+    # precision@10 is >= desired; return the query and results to the user
+    print('Desired precision reached, done')
diff --git a/p3/parser_p3.py b/p3/parser_p3.py
new file mode 100644
index 0000000..7481de1
--- /dev/null
+++ b/p3/parser_p3.py
@@ -0,0 +1,51 @@
+'''
+Created on Sep 25, 2012
+
+@author: johnterzis
+
+Parser takes raw json output from BingClient and parses the result list of dictionaries, placing
+significant components into a Document List
+
+e.g.
+if a json document, exampleResults, is passed into the constructor, then
+exampleResults['d']['results'] is a list of 10 dictionaries, each a result
+
+'''
+
+
+class Parser:
+    '''
+    classdocs
+    '''
+
+    def __init__(self, rawJSON):
+
+        self.rawJSON = rawJSON
+        self.DocumentsList = []
+
+    def parser(self):
+
+        results = self.rawJSON
+
+        resultLength = len(results)
+
+        # generate a list of dictionaries, one for each doc
+        self.DocumentsList = [
+            {
+                'Description': results[k]['Description'],
+                'Title': results[k]['Title'],
+                'Url': results[k]['Url'],
+                'IsRelevant': None,
+                'Body': None,
+                'URLBody': None
+            } for k in range(resultLength)
+        ]
+
+    def getDocList(self):
+
+        if self.DocumentsList is None:
+            print('Document List Empty!')
+            return
+
+        return self.DocumentsList
diff --git a/p3/rocchio_p3.py b/p3/rocchio_p3.py
new file mode 100644
index 0000000..80091e8
--- /dev/null
+++ b/p3/rocchio_p3.py
@@ -0,0 +1,100 @@
+'''
+Implements the Rocchio algorithm on a corpus of relevant documents,
+weighting terms by tf-idf to iteratively form a new query vector of weights
+for each unique term across all dictionaries (from invertedFiles) passed into Rocchio
+'''
+import constants_p3 as constants
+import math
+import PorterStemmer_p3 as PorterStemmer
+
+
+class RocchioOptimizeQuery:
+
+    def __init__(self, firstQueryTerm):
+        '''
+        Constructor
+        '''
+        self.query = {}
+        self.query[firstQueryTerm] = 1
+
+    def Rocchio(self, invertedFile, documentsList, relevantDocs, nonrelevantDocs):
+        '''
+        Outputs the new query vector:
+        calculates the IDF per inverted file term and the summations of the
+        relevant and non-relevant document weights
+        '''
+        p = PorterStemmer.PorterStemmer()
+
+        weights = {}
+        for term in invertedFile:
+            sterm = term
+            if constants.STEM_IN_ROCCHIO:
+                sterm = p.stem(term.lower(), 0, len(term) - 1)
+            weights[sterm] = 0.0  # initialize the weight vector for each key in the inverted file
+        print('')
+
+        relevantDocsTFWeights = {}
+        nonrelevantDocsTFWeights = {}
+
+        # ------------------------------------- #
+        # Compute the relevantDocsTFWeights and nonrelevantDocsTFWeights vectors
+        for docId in relevantDocs:
+            doc = documentsList[docId]
+            for term in doc["tfVector"]:
+                sterm = term
+                if constants.STEM_IN_ROCCHIO:
+                    sterm = p.stem(term.lower(), 0, len(term) - 1)
+
+                if sterm in relevantDocsTFWeights:
+                    relevantDocsTFWeights[sterm] = relevantDocsTFWeights[sterm] + doc["tfVector"][term]
+                else:
+                    relevantDocsTFWeights[sterm] = doc["tfVector"][term]
+
+        for docId in nonrelevantDocs:
+            doc = documentsList[docId]
+            for term in doc["tfVector"]:
+                sterm = term
+                if constants.STEM_IN_ROCCHIO:
+                    sterm = p.stem(term.lower(), 0, len(term) - 1)
+
+                if sterm in nonrelevantDocsTFWeights:
+                    nonrelevantDocsTFWeights[sterm] = nonrelevantDocsTFWeights[sterm] + doc["tfVector"][term]
+                else:
+                    nonrelevantDocsTFWeights[sterm] = doc["tfVector"][term]
+
+        # ------------------------------------- #
+        # Compute the Rocchio vector
+        for term in invertedFile:
+            idf = math.log(float(len(documentsList)) / float(len(invertedFile[term].keys())), 10)
+
+            sterm = term
+            if constants.STEM_IN_ROCCHIO:
+                sterm = p.stem(term.lower(), 0, len(term) - 1)
+
+            # Terms 2 and 3 of the Rocchio algorithm
+            for docId in invertedFile[term]:
+                if documentsList[docId]['IsRelevant'] == 1:
+                    # Term 2: relevant document weights, normalized and given the BETA weight
+                    weights[sterm] = (
+                        weights[sterm] + constants.BETA * idf * (relevantDocsTFWeights[sterm] / len(relevantDocs))
+                    )
+                else:
+                    # Term 3: nonrelevant document weights, normalized and given the GAMMA weight
+                    weights[sterm] = (
+                        weights[sterm] - constants.GAMMA * idf * (
+                            nonrelevantDocsTFWeights[sterm] / len(nonrelevantDocs)
+                        )
+                    )
+
+            # Term 1 of Rocchio: the original query terms, weighted by ALPHA
+            if term in self.query:
+                self.query[term] = (
+                    constants.ALPHA * self.query[term] + weights[sterm]  # build the new query vector of weights
+                )
+            elif weights[sterm] > 0:
+                self.query[term] = weights[sterm]
+
+        return self.query

From 1c21b0afef46fb8b745faee61fc5633a70f2c4a5 Mon Sep 17 00:00:00 2001
From: Aivin Solatorio
Date: Mon, 12 Sep 2016 09:53:44 +0800
Subject: [PATCH 2/2] Add bing search api python wrapper.

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..254ba5d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+py-bing-search==0.2.6
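For reference, a minimal sketch of one relevance-feedback round driven by the ported modules (not part of either patch). The example query and the hardcoded relevance judgments are illustrative assumptions; it also expects a valid key in constants_p3.BING_ACCT_KEY and the py-bing-search dependency pinned above:

    # sketch: drive one feedback round by hand (hypothetical driver, not in this patch)
    import bingclient_p3 as bingclient
    import constants_p3 as constants
    import indexer_p3
    import rocchio_p3 as rocchio
    import common_p3 as common

    query = 'information retrieval'  # assumed example query
    client = bingclient.BingClient(constants.BING_ACCT_KEY)
    idx = indexer_p3.Indexer()
    optimizer = rocchio.RocchioOptimizeQuery(query)

    docs = []
    for i, r in enumerate(client.webQuery(query)):
        # the indexer needs ID, Url and Description; indexDocument() queues the page
        doc = {'ID': i, 'Url': r.url, 'Title': r.title, 'Description': r.description}
        docs.append(doc)
        idx.indexDocument(doc)

    # pretend the first three results were judged relevant (normally read from the user)
    relevant = [0, 1, 2]
    nonrelevant = [d['ID'] for d in docs if d['ID'] not in relevant]
    for d in docs:
        d['IsRelevant'] = 1 if d['ID'] in relevant else 0

    idx.waitForIndexer()  # block until the worker threads drain the queue
    weights = optimizer.Rocchio(idx.invertedFile, docs, relevant, nonrelevant)
    print(common.getTopTerms(query, weights, 2))  # two candidate expansion terms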