-
Notifications
You must be signed in to change notification settings - Fork 601
/
Copy pathscraper.py
71 lines (55 loc) · 2.05 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import pypandoc
import pathlib
from requests import get
from bs4 import BeautifulSoup
from mercury_parser import ParserAPI
# Mercury Web Parser client; save_url() uses it to fetch each page's content.
# Get the API key at https://mercury.postlight.com/web-parser/ and provide it
# through the MERCURY_API_KEY environment variable so the secret stays out of
# the source file. The placeholder is used when the variable is not set.
mercury = ParserAPI(api_key=os.environ.get('MERCURY_API_KEY', '<your API key>'))

# Page whose ordered lists get_toc() reads as the table of contents; it is
# also saved itself as the 'Preface'.
index_page = 'https://bartoszmilewski.com/2014/10/28/category-theory-for-programmers-the-preface/'
def get_toc():
    """Build the table of contents from the ordered lists on the index page.

    Downloads ``index_page``, extracts its post content and walks every
    ``<ol>`` found there; each ``<li>`` inside a list yields one entry.

    Returns:
        A list of dicts with the keys ``'chapter'`` (``'<part>.<chapter>'``,
        both counted from 1, the chapter count restarting for every list),
        ``'title'`` (text of the item's link) and ``'url'`` (its ``href``).
    """
    post = get_content(get(index_page).content)
    entries = []
    for part_no, part_list in enumerate(post.find_all('ol'), start=1):
        for chapter_no, item in enumerate(part_list.find_all('li'), start=1):
            link = item.a
            entries.append({
                'chapter': '{}.{}'.format(part_no, chapter_no),
                'title': link.text,
                'url': link['href'],
            })
    return entries
def get_content(html):
    """Parse *html* and return the first element with class ``post-content``.

    The markup is parsed with Beautiful Soup's ``html.parser`` backend and the
    result of ``find`` is returned unchanged.
    """
    return BeautifulSoup(html, 'html.parser').find(class_='post-content')
def write_content(file, content):
    """Write *content* to *file* as UTF-8 text, replacing any existing file.

    Args:
        file: A ``pathlib.Path``-like object providing ``open``.
        content: The text to write.
    """
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) cannot
    # represent every character and would raise UnicodeEncodeError.
    with file.open('w', encoding='utf-8') as f:
        f.write(content)
def save_images(markup, path):
    """Download the images of a post and point the markup at the local copies.

    The post content is extracted from *markup* with ``get_content``. For
    every ``<img>`` the image is downloaded into *path* and the tag's ``src``
    is rewritten to ``images/<file name>``. When the image is wrapped in a
    link, the link target is downloaded and the wrapping element is removed
    so that only the image remains; otherwise the image's own ``src`` is
    downloaded. Images without a usable URL are left untouched.

    Args:
        markup: HTML of the page.
        path: ``pathlib.Path`` of the directory receiving the image files.

    Returns:
        The (modified) element returned by ``get_content``.
    """
    import urllib.parse
    import urllib.request

    html = get_content(markup)
    for img in html.find_all('img'):
        wrapper = img.parent
        # Not every image is wrapped in a link; indexing ['href'] blindly
        # raises KeyError for those.
        linked = wrapper is not None and bool(wrapper.get('href'))
        image_url = wrapper['href'] if linked else img.get('src')
        if not image_url:
            continue

        # Take the name from the URL path only, so a query string such as
        # '?w=300' does not end up in the file name.
        image_file = urllib.parse.urlsplit(image_url).path.split('/')[-1]
        if not image_file:
            continue

        target_file = path.joinpath(image_file)
        urllib.request.urlretrieve(image_url, target_file)
        img['src'] = 'images/' + target_file.name
        if linked:
            # unwrap() is the current name of the legacy replaceWithChildren():
            # it removes the wrapping element and keeps the image in its place.
            wrapper.unwrap()
    return html
def save_url(chapter, title, url):
    """Fetch one page, store its images and write it out as a TeX file.

    Creates ``content/<chapter>/images``, parses *url* with the Mercury
    client, saves the page's images into that directory, converts the
    resulting HTML to TeX with pandoc and writes the result to
    ``content/<chapter>/<file name>.tex``.

    Args:
        chapter: Directory name below ``content`` (e.g. ``'1.2'``).
        title: Page title; used as the file name after replacing ``/`` with
            a backslash and ``:`` with `` -``.
        url: Address of the page to fetch.
    """
    # NOTE(review): '/' is replaced with a backslash, which is itself a path
    # separator on Windows - confirm this is the intended file name.
    file_name = '{}.tex'.format(title.replace('/', '\\').replace(':', ' -'))
    path = pathlib.Path('content', chapter, 'images')
    path.mkdir(parents=True, exist_ok=True)

    p = mercury.parse(url)
    html = save_images(p.content, path)
    # save_images returns a parsed element; pandoc needs the markup as text.
    content = pypandoc.convert_text(str(html), 'tex', format='html')
    write_content(path.parent.joinpath(file_name), content)
# Save every page listed in the table of contents, echoing each entry first.
for toc in get_toc():
    print(toc)
    save_url(toc['chapter'], toc['title'], toc['url'])

# The index page itself is stored as chapter 0.0, 'Preface'.
save_url('0.0', 'Preface', index_page)