"""
Helper functions related to scraping images.
"""
from aqt.qt import QApplication
from typing import NamedTuple, List, Tuple
import io
import re
import time
import requests
from bs4 import BeautifulSoup
import concurrent.futures
from anki.utils import checksum
from io import BytesIO
from math import ceil
from PIL import Image
from .logging import logger


class QueryResult(NamedTuple):
    """
    Encapsulates all of the information and configs needed to process a query
    result and apply the changes back into the Anki database.
    """
    note_id: str
    query: str
    target_field: str
    overwrite: str
    max_results: int
    width: int
    height: int
    # (filename, image data)
    images: List[Tuple[str, bytes]]
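

# Illustrative example (hypothetical values; the overwrite semantics are
# defined by the caller): a QueryResult is built per note before scraping,
# with `images` starting empty and filled in by the scraper, e.g.
#
#   QueryResult(note_id="1510000000000", query="golden retriever",
#               target_field="Picture", overwrite="overwrite",
#               max_results=3, width=320, height=0, images=[])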


def sleep(seconds):
    """
    Sleep for a certain amount of time to throttle request rates.
    """
    start = time.time()
    # Sleep in small slices and pump the Qt event loop so the UI stays
    # responsive during long throttling waits.
    while time.time() - start < seconds:
        time.sleep(0.01)
        QApplication.instance().processEvents()


def strip_html_clozes(w: str) -> str:
    """
    Strips a string of any HTML and clozes.

    In particular, this is used because note fields can have a lot of random
    stuff in them that we don't want to enter into the search query.
    """
    # This code is copy-pasted straight from
    # batch-download-pictures-from-google-images.
    # Unfortunately it's more or less unreadable/unmaintainable and I'm just
    # going to trust that it works.
    w = re.sub(r'</?(b|i|u|strong|span)(?: [^>]+)?>', '', w)
    w = re.sub(r'\[sound:.*?\]', '', w)
    if '<' in w:
        soup = BeautifulSoup(w, "html.parser")
        for s in soup.stripped_strings:
            w = s
            break
    else:
        w = re.sub(r'<br ?/?>[\s\S]+$', ' ', w)
        w = re.sub(r'<[^>]+>', '', w)
    clozes = re.findall(r'{{c\d+::(.*?)(?::.*?)?}}', w)
    if clozes:
        w = ' '.join(clozes)
    return w
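

# Illustrative example (assuming the regexes above behave as described):
#
#   strip_html_clozes('<b>le chien</b> {{c1::dog::hint}}')  ->  'dog'
#
# The simple tags are stripped inline, and the cloze answer then replaces
# the whole string because cloze matches take precedence at the end.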


class Scraper:
    # Taken from the source code of bing-image-downloader (Python).
    # Note: It's actually really important that these headers have certain
    # values, though I can't say what they are.
    # https://github.com/gurugaurav/bing_image_downloader/commit/062e0f4e6ec31b0ab66ea10e0a187d1d2d80de63
    # https://github.com/gurugaurav/bing_image_downloader/issues/19
    # But as it turns out, using the headers from the first commit there solves
    # the problem.
    SPOOFED_HEADER = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}

    def __init__(self, executor: concurrent.futures.ThreadPoolExecutor, mw):
        self._executor = executor
        self._mw = mw

    def push_scrape_job(self, result: QueryResult):
        """
        Pushes a new job (future) into the executor using the query result.

        Returns a Future.
        """
        raise NotImplementedError("Unimplemented abstract method.")


class BingScraper(Scraper):
    """
    A scraper that targets Bing Images.

    This can be refactored if we ever choose to add another source. Things
    such as retry logic can be extracted into a common class.
    """
    SEARCH_FORMAT_URL = "https://www.bing.com/images/async?q={}"
    TIMEOUT_SEC = 15
    MAX_RETRIES = 3
    # Number of seconds to sleep per retry on rate limit error.
    THROTTLE_SLEEP_SEC = 30
    # Number of seconds to sleep per retry on timeout error.
    TIMEOUT_SLEEP_SEC = 5
    # Taken from bing-image-downloader.
    IMAGE_URL_REGEX = 'murl":"(.*?)"'
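    # The async results page embeds per-image JSON blobs along the lines of
    # {"murl":"https://example.com/cat.jpg", ...} (URL illustrative); the
    # regex above pulls those media ("murl") URLs out of the raw HTML.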

    def __init__(self, executor: concurrent.futures.ThreadPoolExecutor, mw):
        super().__init__(executor, mw)

    def push_scrape_job(self, result: QueryResult):
        """
        Fire off a request to the image search page, then queue up a job to
        scrape the images from the resulting text and resize them.
        """
        # Note that the REQUEST is not multithreaded, but parsing/extracting
        # images is (disputable whether this is the correct architecture, but
        # I'm just going to copy this guy's code).
        # In case of a status exception, retry.
        search_url = BingScraper.SEARCH_FORMAT_URL.format(result.query)
        retry_count = 0
        while retry_count < BingScraper.MAX_RETRIES:
            try:
                req = requests.get(search_url, headers=Scraper.SPOOFED_HEADER,
                                   timeout=BingScraper.TIMEOUT_SEC)
                req.raise_for_status()
                future = self._executor.submit(
                    self._parse_and_download_images, req.text, result)
                return future
            except requests.exceptions.RequestException as e:
                logger.exception(e)
                retry_count += 1
                if retry_count == BingScraper.MAX_RETRIES:
                    raise Exception(
                        "Exceeded max retries. Unable to scrape for query: %s" %
                        result.query)
                if (isinstance(e, requests.exceptions.HTTPError)
                        and e.response.status_code == 429):
                    # Retry on 429: we were rate limited. Back off linearly.
                    sleep_sec = retry_count * BingScraper.THROTTLE_SLEEP_SEC
                elif isinstance(e, (requests.exceptions.ReadTimeout,
                                    requests.exceptions.ConnectionError)):
                    # Timeout or connection error: retry after a short sleep.
                    sleep_sec = retry_count * BingScraper.TIMEOUT_SLEEP_SEC
                else:
                    raise e
                self._mw.progress.update(
                    f"Sleeping for {sleep_sec} seconds...")
                QApplication.instance().processEvents()
                sleep(sleep_sec)
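
    # Illustrative usage (hypothetical caller; the executor, mw, and result
    # objects are assumed to come from the add-on's setup code):
    #
    #   scraper = BingScraper(executor, mw)
    #   future = scraper.push_scrape_job(result)
    #   filled = future.result()  # the same QueryResult, images now populated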

    def _parse_and_download_images(
            self,
            html: str,
            result: QueryResult) -> QueryResult:
        """
        Parses the image URLs out of the HTML. Processes and resizes them.

        This function **mutates** `result` and also returns it.
        """
        image_urls = re.findall(BingScraper.IMAGE_URL_REGEX, html)
        num_processed = 0
        if len(image_urls) == 0:
            logger.debug("Found 0 image URLs for query: %s" % result.query)
        for url in image_urls:
            if num_processed == result.max_results:
                break
            try:
                req = requests.get(url,
                                   headers=Scraper.SPOOFED_HEADER,
                                   timeout=BingScraper.TIMEOUT_SEC)
                req.raise_for_status()
            except requests.packages.urllib3.exceptions.LocationParseError:
                continue
            except requests.exceptions.RequestException:
                continue
            # Ignore SVGs. Dunno, the last guy did it too, maybe they won't
            # work with Anki.
            if 'image/svg+xml' in req.headers.get('content-type', ''):
                continue
            try:
                buf = _maybe_resize_image(io.BytesIO(req.content),
                                          result.width, result.height)
            except UnicodeError:
                # UnicodeError: encoding with 'idna' codec failed (label
                # empty or too long): https://bugs.python.org/issue32958
                continue
            # Deterministic filename derived from the URL and query.
            filename = checksum(url + result.query)
            result.images.append((filename, buf.getvalue()))
            num_processed += 1
        return result


def _maybe_resize_image(image_data: BytesIO, user_width: int,
                        user_height: int) -> BytesIO:
    should_resize = user_width > 0 or user_height > 0
    if should_resize:
        img_data = BytesIO(image_data.read())
        img_data.seek(0)
        img = Image.open(img_data)
        width, height = img.size
        if user_width > 0 and user_height > 0:
            img = resize_proportional(img, user_width, user_height)
        elif user_width > 0:
            img = resize_proportional(img, user_width, height)
        elif user_height > 0:
            img = resize_proportional(img, width, user_height)
        # JPEG can't store an alpha channel, so flatten paletted/RGBA images
        # before saving.
        if img.mode != "RGB":
            img = img.convert("RGB")
        img_data_resized = BytesIO()
        img.save(img_data_resized, format="JPEG")
        img_data_resized.seek(0)
        logger.debug("Image resized")
        return img_data_resized
    else:
        # No resize requested; return the original bytes untouched.
        logger.debug("No need to resize")
        return image_data
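

# Illustrative example of the width-only path above: with user_width=320 and
# user_height=0, a 1280x960 image goes through resize_proportional(img, 320,
# 960), giving ratio = min(320/1280, 960/960) = 0.25 and a 320x240 result.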


def resize_proportional(img, new_width, new_height):
    width, height = img.size
    ratio = min(new_width / width, new_height / height)
    # LANCZOS is Pillow's high-quality resampling filter (formerly ANTIALIAS).
    return img.resize((ceil(width * ratio), ceil(height * ratio)),
                      Image.LANCZOS)
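
# Illustrative example: a 1600x900 image constrained to 800x800 gets
# ratio = min(800/1600, 800/900) = 0.5 and comes out 800x450, so the aspect
# ratio is preserved and both dimensions fit within the requested bounds.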


def image_resize(image_data: BytesIO, width: int, height: int) -> BytesIO:
    img_data = BytesIO(image_data.read())
    img_data.seek(0)
    img = Image.open(img_data)
    # thumbnail() resizes in place, preserving aspect ratio, and never
    # enlarges the image.
    img.thumbnail((width, height))
    # Flatten paletted/RGBA images so they can be saved as JPEG.
    if img.mode != "RGB":
        img = img.convert("RGB")
    img_resized = BytesIO()
    img.save(img_resized, format="JPEG")
    img_resized.seek(0)
    return img_resized
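
# Illustrative example: image_resize(buf, 800, 800) on a 1600x900 source also
# yields 800x450, but because thumbnail() never upscales, a 400x225 source
# would pass through at its original size (unlike resize_proportional, which
# would scale it up to 800x450).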