forked from lesquoyb/pdf_converter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
110 lines (84 loc) · 3.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# coding: utf8
import os
import subprocess
import pdfkit
# Interpreter command used to launch the bundled unoconv.py script.
PYTHON_INTERP = "python"
# Root directory that main() walks recursively looking for files to convert.
PATH_CONV = r'/home/lt/Documents/pdf_converter'
# Absolute paths of the files whose conversion raised an exception.
error_list = []
# When True, progress and error details are printed while converting.
verbose = False
# Options dictionary forwarded to pdfkit for the HTML -> PDF conversions.
options = {'encoding': "UTF-8", '--load-error-handling': 'ignore'}
def convert_ps_to_pdf(in_file):
    """Convert the PostScript file *in_file* to a PDF stored next to it.

    The output path is passed to ps2pdf explicitly: when it is omitted,
    ps2pdf derives the name from the input's base name and writes it into the
    current working directory, which is not where ``main`` checks for the
    resulting PDF.

    Args:
        in_file: path of the PostScript file to convert.
    """
    out_file = change_ext_to_pdf(in_file)
    # Discard the tool's output through os.devnull: a subprocess.PIPE that is
    # never read can block the child once the OS pipe buffer fills up.
    with open(os.devnull, "wb") as devnull:
        subprocess.call(["ps2pdf", "-dEPSCrop", in_file, out_file], stdout=devnull)
def convert_doc_to_pdf(in_file):
    """Convert an office-style document to PDF by running ./unoconv.py.

    The script is launched as a separate process with the interpreter named
    by ``PYTHON_INTERP``; unoconv.py is treated as a black box rather than
    being imported.

    Args:
        in_file: path of the document to convert.
    """
    command = [PYTHON_INTERP, './unoconv.py', "-f", "pdf", in_file]
    # Discard the tool's output through os.devnull: a subprocess.PIPE that is
    # never read can block the child once the OS pipe buffer fills up.
    with open(os.devnull, "wb") as devnull:
        subprocess.call(command, stdout=devnull)
def convert_chm(in_file):
    """Unpack the CHM archive *in_file* with 7z into a sibling directory.

    The target directory is the archive path without its extension. No PDF is
    produced here; the extracted HTML files are meant to be picked up by a
    later HTML conversion pass.

    Args:
        in_file: path of the .chm file to extract.
    """
    directory = os.path.splitext(in_file)[0]
    # Every argv element must be a separate list item ("7z -y" as one string
    # would be looked up as the executable name), and 7z takes the output
    # directory through the -o switch, not as a positional argument.
    # NOTE(review): "e" flattens the archive's folder tree; "x" would keep it.
    command = ["7z", "e", "-y", "-o" + directory, in_file]
    # 7z lists every extracted file; send that to os.devnull because an
    # unread subprocess.PIPE can block the child once its buffer fills up.
    with open(os.devnull, "wb") as devnull:
        subprocess.call(command, stdout=devnull)
def change_ext_to_pdf(in_file):
    """Return *in_file* with its final extension replaced by ``.pdf``.

    A path without an extension simply gets ``.pdf`` appended.
    """
    stem, _old_ext = os.path.splitext(in_file)
    return stem + ".pdf"
def convert_web_to_pdf(in_file):
    """Render the HTML file *in_file* to a PDF with the same base name.

    The destination comes from ``change_ext_to_pdf`` and the module-level
    ``options`` dictionary is forwarded to pdfkit.
    """
    target = change_ext_to_pdf(in_file)
    pdfkit.from_file(in_file, target, options=options)
def get_clean_ext(filename):
    """Return the lower-cased extension of *filename*, leading dot included.

    An empty string is returned when the name has no extension.
    """
    _stem, extension = os.path.splitext(filename)
    return extension.lower()
def _needs_conversion(path, convert):
    """Return True if *path* has a handled extension and no sibling PDF yet."""
    return (get_clean_ext(path) in convert
            and not os.path.isfile(change_ext_to_pdf(path)))


def main(convert):
    """Walk ``PATH_CONV`` and convert every handled file that has no PDF yet.

    Args:
        convert: mapping from a lower-cased extension (leading dot included)
            to a callable that receives the absolute path of the file.

    Exceptions raised while handling a file are not propagated: the file's
    absolute path is appended to the module-level ``error_list`` and the walk
    continues with the next file.
    """
    count = 0
    if verbose:
        # The total is only needed for the progress display, so the extra
        # directory walk is skipped in quiet mode.
        print("initializing")
        for dirname, _dirnames, filenames in os.walk(PATH_CONV):
            for filename in filenames:
                if _needs_conversion(os.path.join(dirname, filename), convert):
                    count += 1
        print(str(count) + " files to process")

    current = 0
    for dirname, _dirnames, filenames in os.walk(PATH_CONV):
        if verbose:
            # Guard the percentage: with nothing to convert, count is 0 and
            # the division would raise ZeroDivisionError.
            percent = 100.0 * current / count if count else 100.0
            print(str(current) + "/" + str(count) + " (" + str(percent) + "%)")
        for filename in filenames:
            if filename == "index.html":
                # Updates the shared pdfkit options with this directory; the
                # value stays in effect until another index.html is met.
                options["--cache-dir"] = os.path.abspath(dirname)
            abs_path = os.path.abspath(os.path.join(dirname, filename))
            try:
                if _needs_conversion(abs_path, convert):
                    if verbose:
                        print(abs_path)
                    current += 1
                    convert[get_clean_ext(filename)](abs_path)
            except Exception as e:
                # Best-effort batch run: record the failure and keep going.
                error_list.append(abs_path)
                if verbose:
                    # str(e) instead of e.message, which Python 3 exceptions
                    # do not provide.
                    print("[ERROR] " + abs_path + " :" + str(e))
if __name__ == '__main__':
    # Outside verbose mode, add the 'quiet' entry to the options handed to
    # pdfkit.
    if not verbose:
        options['quiet'] = ''

    # Formats delegated to the unoconv-based converter.
    office_extensions = (
        ".doc", ".wri", ".wps", ".odt", ".pps", ".wpd", ".ppt",
        ".rtf", ".xls", ".eps", ".psd", ".pcx", ".xlsx", ".docx",
    )
    first = dict.fromkeys(office_extensions, convert_doc_to_pdf)
    first[".ps"] = convert_ps_to_pdf
    first[".chm"] = convert_chm

    second = dict.fromkeys((".htm", ".html"), convert_web_to_pdf)

    # Two passes are needed: extracting "chm" files in the first pass creates
    # new directories of HTML files, which only the second pass converts.
    for conversion_table in (first, second):
        main(conversion_table)

    print("all work is done")
    if error_list:
        print("errors: ")
        for failed_path in error_list:
            print(failed_path)