-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpdf_to_txt.py
122 lines (99 loc) · 4.61 KB
/
pdf_to_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
#### with pytesseract and PIL.Image
# Great!!! It correctly converts even the most complicated documents to text
import cv2 ##pip install opencv-python
import numpy as np
import pytesseract #pip install pytesseract #installer tesseract et mettre le lien vers le exe dans le code
### prerequisites
# install tesseract software:
# put tesseract in the path (C:\Program Files\Tesseract-OCR)
# install ghostscript software:
# put ghostscript in the path (C:\Program Files\gs\gs9.54.0)
# pip install pytesseract, pillow
# Set the tesseract path in the script before calling image_to_string
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def get_string_way1(list_img_path):
    """Run Tesseract OCR on every page image and return the recognised text.

    Args:
        list_img_path: Iterable of image locations (anything accepted by
            ``PIL.Image.open``), one entry per page.

    Returns:
        list[str]: One OCR result per input image, in input order.

    Progress is reported on stdout (``page0 page1 ...``).
    """
    from PIL import Image as PILImage

    print('--getting string from imagePages--', end='\n ')
    Liste_string = []
    for i, img_path in enumerate(list_img_path):
        print(f'page{i}', end=' ')
        # Open the image through a context manager so its file handle is
        # released as soon as the page has been recognised.
        with PILImage.open(img_path) as page_img:
            result = pytesseract.image_to_string(page_img)
        Liste_string.append(result)
    print('\n')
    return Liste_string
# This variant tries to remove noise first: it just seems to make the result stranger, doesn't it??
# Set the tesseract path in the script before calling image_to_string
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def get_string_way2(list_img_path):
    """OCR every page image after an OpenCV pre-processing pass.

    Each image is read with OpenCV, converted to grayscale, dilated and
    eroded, written back to disk and then recognised with Tesseract.

    Side effects:
        * Every input image file is OVERWRITTEN with its pre-processed
          (grayscale) version.
        * A copy of the pre-processed page is written to
          ``removed_noise.png`` in the current working directory; it is
          overwritten on every iteration, so only the last page remains.

    Args:
        list_img_path: Iterable of image file paths, one entry per page.

    Returns:
        list[str]: One OCR result per input image, in input order.

    Raises:
        OSError: If OpenCV cannot read one of the images.
    """
    from PIL import Image as PILImage

    print('--getting string from imagePages--', end='\n ')
    Liste_string = []
    # Structuring element for dilate/erode; it never changes, so build it once.
    # NOTE(review): with a 1x1 kernel, dilation and erosion should leave the
    # image unchanged, so the "noise removal" step is effectively a no-op --
    # confirm whether a larger kernel was intended.
    kernel = np.ones((1, 1), np.uint8)
    for i, img_path in enumerate(list_img_path):
        print(f'page{i}', end=' ')
        # Read image with opencv; imread signals failure by returning None
        # instead of raising, so check explicitly for a clear error message.
        img = cv2.imread(img_path)
        if img is None:
            raise OSError(f'cv2.imread could not read image: {img_path!r}')
        # Convert to gray
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Apply dilation and erosion to remove some noise
        img = cv2.dilate(img, kernel, iterations=1)
        img = cv2.erode(img, kernel, iterations=1)
        # Write image after removed noise (debug copy)
        cv2.imwrite("removed_noise.png", img)
        # Thresholding to pure black and white is currently disabled:
        # img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
        # Overwrite the original page image with the pre-processed one
        cv2.imwrite(img_path, img)
        # Recognize text with tesseract for python; the context manager
        # releases the file handle once the page has been read.
        with PILImage.open(img_path) as page_img:
            result = pytesseract.image_to_string(page_img)
        Liste_string.append(result)
    print('\n')
    return Liste_string
########## with pytesseract and pyocr # gives odd results on pdf1
#https://xiaofeima1990.github.io/2016/12/19/extract-text-from-sanned-pdf/
from wand.image import Image #import wand #installer ImageMagicket ghostscript, put them in the path
from PIL import Image as PI #pip install pillow
import pyocr #import pyocr mais il marche pas (pyocr.get_available_tools() is empty list) sans des way liées à tesseract dans le path (et là encore). ç amarche avec le override d'en bas
import pyocr.builders
import io
# override the tesseract executable path
pyocr.tesseract.TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
#https://stackoverflow.com/questions/49162994/pyocr-get-availables-tools-returns-an-empty-list-can-access-tesseract-from
def get_string_way3(list_img_path):
    """OCR every page image through pyocr, loading the pages with wand.

    The work is done in two passes: first every page is loaded with wand and
    converted to an in-memory JPEG blob, then every blob is recognised with
    the first OCR tool pyocr reports as available.

    Args:
        list_img_path: Iterable of image file paths, one entry per page.

    Returns:
        list[str]: One OCR result per input image, in input order.

    Raises:
        RuntimeError: If pyocr finds no OCR tool (e.g. tesseract is not
            reachable).
    """
    print('--getting tools--')
    tools = pyocr.get_available_tools()
    if not tools:
        # Previously this surfaced as a bare IndexError on ``[0]``.
        raise RuntimeError(
            'pyocr found no OCR tool; check that tesseract is installed and '
            'that pyocr.tesseract.TESSERACT_CMD points to it'
        )
    tool = tools[0]
    # The original author notes that index 0 is "eng" on their install; this
    # is not guaranteed elsewhere -- TODO confirm / make configurable.
    lang = tool.get_available_languages()[0]

    req_image = []
    final_list_of_text = []

    print('--getting images from path--', end='\n ')
    for i, img_path in enumerate(list_img_path):
        print(f'page{i}', end=' ')
        # wand images hold native resources; release them right after the
        # JPEG blob has been produced.
        with Image(filename=img_path) as img_page:
            req_image.append(img_page.make_blob('jpeg'))
    print('\n')

    print('--getting text--', end='\n ')
    for i, img in enumerate(req_image):
        print(f'page{i}', end=' ')
        # Re-open the JPEG blob with PIL, which is what pyocr consumes.
        with PI.open(io.BytesIO(img)) as pil_page:
            txt = tool.image_to_string(
                pil_page,
                lang=lang,
                builder=pyocr.builders.TextBuilder()
            )
        final_list_of_text.append(txt)
    print('\n')
    return final_list_of_text
def pdf_to_txt(pdf_abspath):
    """Convert a PDF into OCR text.

    The PDF is first rendered to images by the project-local
    ``pdf_to_img.pdf_to_img`` helper, and the resulting images are then
    recognised with ``get_string_way1``.

    Args:
        pdf_abspath: Location of the PDF, forwarded unchanged to
            ``pdf_to_img`` (presumably an absolute path -- verify against
            that helper).

    Returns:
        Whatever ``get_string_way1`` returns for the rendered images: a list
        of recognised strings, one per image.
    """
    # Imported lazily: the converter lives in a sibling project module.
    from pdf_to_img import pdf_to_img

    page_images = pdf_to_img(pdf_abspath)
    return get_string_way1(page_images)
if __name__ == '__main__':
    # Sample input. Another sample that was used previously:
    # "data/pdf/pdf1.pdf"
    pdf = os.path.abspath("data/pdf/FactureSNM.pdf")  # get abs path

    # Run the whole PDF -> images -> OCR pipeline before touching the output
    # file, so a failure does not leave a truncated/empty .txt behind.
    pages_text = pdf_to_txt(pdf)

    # Page separator: five dashed rules of 100 characters, blank line between.
    sep = '\n\n'.join(5*[100*'-'])

    # Explicit UTF-8: OCR output may contain characters the platform's
    # default encoding cannot represent.
    with open(pdf+'.txt', 'w', encoding='utf-8') as f:
        f.write(sep.join(pages_text))