import sys

import config
from lib.database import featurebase_query, create_database
from lib.util import embeddings, get_pdf_filename
from lib.ai import ai

# create the databases
databases = []
databases.append({"name": "doc_answers", "schema": "(_id string, filename string, title string, answer string, keyterms stringset, page_id string, answer_location int, answer_embedding vector(768));"})

for database in databases:
    create_database(database.get('name'), database.get('schema'))
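
# note: vector(768) must match the dimension of the embeddings returned by
# lib.util.embeddings; the insert near the end of this script checks for 768
# (assumes create_database is a no-op when the table already exists)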

# select the file
filename = get_pdf_filename()
if filename:
    print("Selected PDF:", filename)
else:
    print("No PDF selected.")
    sys.exit()

# select current questions
sql = f"SELECT * FROM doc_questions WHERE filename = '{filename}';"
fb_questions = featurebase_query({"sql": sql}).get('results')
if not fb_questions:
    print(f"Please run `python3 index_tandqs.py` first to extract questions for `{filename}`.")
    sys.exit()

# iterate over questions
for question in fb_questions:
    # cleanup for bad data (currently disabled)
    """
    if question.get('question') == "None" or question.get('question') == "null":
        print("Removing empty question.")
        remove_uuid = question.get('_id')
        sql = "DELETE FROM doc_questions WHERE _id = '%s'" % remove_uuid
        featurebase_query({"sql": sql})
    """

    # check if we have the answer already
    sql = f"SELECT * FROM doc_answers WHERE _id = '{question.get('_id')}';"
    results = featurebase_query({"sql": sql}).get('results')
    if results and results[0].get('answer_location') > 0:
        print("system> Skipping question because it already has an answer.")
        continue

    # answer only questions whose stored answer is missing or null-ish
    if question.get('answer') in (None, "", "None", "null"):
        print("system>", question.get('question'))

        # get the question's original text fragment
        uuid = question.get('_id')
        keyterms = question.get('keyterms')

        # get the middle fragment (questions share an _id with their source fragment)
        sql = f"SELECT * FROM doc_fragments WHERE _id = '{uuid}';"
        middle_fragment = featurebase_query({"sql": sql}).get('results')[0]  # just one entry

        # get the next fragment, whose prev_id points at the middle fragment
        sql = f"SELECT * FROM doc_fragments WHERE prev_id = '{uuid}';"
        try:
            next_fragment = featurebase_query({"sql": sql}).get('results')[0]
        except (IndexError, TypeError):
            # probably the last fragment
            next_fragment = {"fragment": ""}

        # get the previous fragment, whose _id is the middle fragment's prev_id
        sql = f"SELECT * FROM doc_fragments WHERE _id = '{middle_fragment.get('prev_id')}';"
        try:
            prev_fragment = featurebase_query({"sql": sql}).get('results')[0]
        except (IndexError, TypeError):
            # probably the first fragment
            prev_fragment = {"fragment": ""}

        # add the back half of the previous fragment, the middle fragment, and
        # the front half of the next fragment so the context stays contiguous
        prev_text = prev_fragment.get('fragment')
        next_text = next_fragment.get('fragment')
        fragment_string = prev_text[len(prev_text) // 2:] + " " + middle_fragment.get('fragment') + " " + next_text[:len(next_text) // 2]

        # get related fragments
        related_uuids = []

        # tanimoto query on keyterms in doc_questions
        sql = f"SELECT *, tanimoto_coefficient(keyterms, (SELECT keyterms FROM doc_questions WHERE _id = '{uuid}')) AS distance FROM doc_questions ORDER BY distance DESC;"
        results = featurebase_query({"sql": sql}).get('results')
        for i, result in enumerate(results):
            if result.get('_id') not in related_uuids and result.get('_id') != uuid:
                related_uuids.append(result.get('_id'))
            if i > 4:  # just grab the top few
                break
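
        # tanimoto_coefficient measures keyterm-set overlap, |A ∩ B| / |A ∪ B|:
        # e.g. {"gpu","ram","cpu"} vs {"gpu","ram","ssd"} -> 2/4 = 0.5, so the
        # "distance" alias is really a similarity, hence the DESC ordering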

        # vector embedding for the question's middle fragment
        qf_embedding = embeddings([middle_fragment.get('fragment')])[0]  # one fragment in, one embedding out

        # query with the fragment embedding to find other related questions
        sql = f"SELECT _id, question, cosine_distance({qf_embedding.get('embedding')}, question_embedding) AS distance FROM doc_questions ORDER BY distance ASC;"
        results = featurebase_query({"sql": sql}).get('results')
        for i, result in enumerate(results):
            if result.get('_id') not in related_uuids and result.get('_id') != uuid:
                related_uuids.append(result.get('_id'))
            if i > 4:  # just grab the top few
                break
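
        # unlike the tanimoto score, cosine_distance is a true distance, so
        # ORDER BY distance ASC puts the most similar questions first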

        # pull the fragments for the related questions
        for _uuid in related_uuids:
            if len(fragment_string) < 2048:
                sql = f"SELECT fragment FROM doc_fragments WHERE _id = '{_uuid}';"
                results = featurebase_query({"sql": sql}).get('results')
                fragment_string = fragment_string + results[0].get('fragment')
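
        # the 2048-character check bounds the prompt text; it is tested before
        # each append, so the final string can run one fragment over the cap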

        # build a document for sending to the ai
        document = {
            "origin_id": uuid,
            "question": question.get('question'),
            "text": fragment_string.strip(),
            "title": question.get('title'),
            "filename": question.get('filename'),
            "page_id": question.get('page_id')
        }
        document = ai("answer_question", document)
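
        # judging from the checks below, the "answer_question" prompt is expected
        # to return 'answer' and 'word_lock_on' keys, plus 'error' on failure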

        # use the word lock to find the answer's location in the text
        try:
            word_lock = document.get('word_lock_on').replace("'", "").replace('"', '')
            location = fragment_string.replace("'", "").replace('"', '').find(word_lock)
            if location < 0:
                # fall back to scanning for the first individual word that matches
                for _word in word_lock.split(' '):
                    location = fragment_string.find(_word)
                    if location < 0:
                        print(f"system> Failed to find {_word} in text. Scanning....")
                    else:
                        break
            if location < 0:
                print("system> Failed to find an answer lock-on word. This entry should be discarded for training and its location is set to -1.")
        except Exception as ex:
            print("system> Location not found:", ex)
            location = -1
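
        # the character offset ties the generated answer back to its supporting
        # text; -1 marks entries that should be dropped from any training set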

        # update if we have a good answer
        if document.get('error') is None:
            # print our results
            print("bot>", document.get('answer'))

            # embed the answer
            answer_embedding = embeddings([document.get('answer')])[0]

            # insert if we have a good answer and vector
            if len(answer_embedding.get('embedding')) == 768:
                # write to doc_answers, escaping single quotes in the answer
                answer = document.get('answer').replace("'", "''")
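                # note: only the answer is escaped; a title or filename containing
                # a single quote would still break this interpolated statement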
sql = f"INSERT INTO doc_answers VALUES('{uuid}', '{question.get('filename')}', '{question.get('title')}', '{answer}', {keyterms}, '{question.get('page_id')}', {location}, {answer_embedding.get('embedding')});"
featurebase_query({"sql": sql}).get('explain')
else:
print("System> Got a bad vector size for the embedding.")
else:
print("bot> ", document.get('error'), document.get('answer'))
print("bot> ")