-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
109 lines (87 loc) · 2.72 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import PyPDF2
import os
import sys
""" TODO:
- work on the argv system to allow the user to:
    * select the folder
    * select a specific file, or several files
    * select a query, or several queries
    * select the output location
    * select a no-output option
- maybe work on a bold system, but it seems a bit hard...
"""
def askForOutput(question):
    """Ask a yes/no question on stdin until a recognized answer is given.

    The answer is compared case-insensitively and with surrounding
    whitespace ignored, so "Y", " yes " and "No" are all accepted.

    Args:
        question: Prompt text passed to ``input()``.

    Returns:
        True for "y"/"yes", False for "n"/"no".
    """
    yes = ("y", "yes")
    no = ("n", "no")
    # Keep prompting until the reply is one of the recognized answers.
    while True:
        answer = input(question).strip().lower()
        if answer in yes:
            return True
        if answer in no:
            return False
def leave():
    """Print a goodbye message and terminate the program.

    Raises:
        SystemExit: always, raised by ``sys.exit()``.
    """
    print("~ bye")
    # sys.exit() rather than the site-provided exit(): the latter is meant
    # for the interactive shell and is not available under `python -S`.
    sys.exit()
# Resolve the folder to scan (`where`) and the text to look for (`query`)
# from the command line:
#   no argument     -> current directory, empty query (matches every page)
#   one argument    -> that folder, empty query, after user confirmation
#   two or more     -> that folder and the given query
path = os.getcwd()
argLen = len(sys.argv)

if argLen == 1:
    where, query = './', ''
elif argLen == 2:
    # A folder but no query: confirm before merging every page found there.
    if not askForOutput("No query specified. Query all files? "):
        leave()
    where, query = os.path.join(path, sys.argv[1]), ''
else:
    # Standard behavior: search every PDF for the query and merge the hits.
    where, query = os.path.join(path, sys.argv[1]), sys.argv[2]

# Bytes form of the folder, as consumed by the directory listing later on.
directory = os.fsencode(where)

# Make sure the folder that receives the merged PDF exists.
resultsDir = os.path.join(where, 'pdfquery_results')
if not os.path.exists(resultsDir):
    os.makedirs(resultsDir)
def queryPDFs(query):
    """Collect every page containing ``query`` from the PDFs in the folder.

    Scans each ``.pdf`` entry of the module-level ``directory`` (opened
    relative to the module-level ``where``), extracts the text of every page
    and keeps the pages whose text contains ``query``. Progress is printed
    to stdout. A file that cannot be processed is reported and skipped.

    Args:
        query: Substring to look for; an empty string matches every page.

    Returns:
        A list of the matching PyPDF2 page objects, in listing order.
    """
    pages = []
    print()
    print("> Looking for " + '"' + query + '"')
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if not filename.endswith(".pdf"):
            continue
        print()
        print(" || " + filename + " || ")
        print()
        pdfFile = open(os.path.join(where, filename), 'rb')
        matched = False
        try:
            reader = PyPDF2.PdfFileReader(pdfFile)
            for pageNum in range(reader.numPages):
                page = reader.getPage(pageNum)
                if query in page.extractText():
                    # pageNum is the zero-based index within the file.
                    print("\t -> match on page " + str(pageNum))
                    pages.append(page)
                    matched = True
        except Exception:
            # Best effort: report the unreadable file and move on. Ctrl-C and
            # SystemExit are no longer swallowed here.
            print(filename + " can't be added, sorry :(")
        finally:
            # NOTE(review): handles of files that contributed pages are left
            # open on purpose -- the pages are written out later by the
            # caller and PyPDF2 presumably still reads from the stream at
            # that point; confirm. Files without any kept page are closed.
            if not matched:
                pdfFile.close()
    return pages
# Run the search, then optionally merge every matching page into a single
# PDF named after the query inside the `pdfquery_results` folder.
writer = PyPDF2.PdfFileWriter()
pages = queryPDFs(query)

if not pages:
    print("no match, sorry :(")
    # sys.exit() rather than the site-provided exit(), which is meant for
    # the interactive shell.
    sys.exit()

print()
if askForOutput("Do you want to create a file? "):
    # `pages` is known to be non-empty here (see the early exit above).
    for page in pages:
        writer.addPage(page)
    # NOTE(review): with an empty query the output file is named ".pdf".
    outputPath = os.path.join(where, 'pdfquery_results/' + query + ".pdf")
    # The context manager closes the file even if writing fails.
    with open(outputPath, 'wb') as outputfile:
        writer.write(outputfile)
    print()
    print('\t––––––––––––––––––––––––––––––––')
    print("\t" + query + ".pdf" + " created!")
    print('\t––––––––––––––––––––––––––––––––')
    print()
leave()