-
Notifications
You must be signed in to change notification settings - Fork 9
/
medium_to_jekyll.py
123 lines (108 loc) · 4.1 KB
/
medium_to_jekyll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
from __future__ import print_function
from lxml import etree
import lxml.html
import lxml.html.soupparser
from markdownify import markdownify
import os
import requests
import shutil
import sys
import re
def usage():
    """Print a one-line summary of the expected command-line arguments."""
    script_name = sys.argv[0]
    template = 'Usage: %s <path-to-medium-articles> <path-to-jekyll-root-directory>'
    print(template % script_name)
def get_unique_file_name(full_path_file_name):
    """Return a path that does not exist yet, derived from the given one.

    If nothing exists at *full_path_file_name* it is returned unchanged.
    Otherwise ``_1``, ``_2``, ... is inserted before the extension until a
    path is found that does not exist.
    """
    if not os.path.exists(full_path_file_name):
        return full_path_file_name

    stem, suffix = os.path.splitext(full_path_file_name)
    counter = 1
    while True:
        candidate = "%s_%d%s" % (stem, counter, suffix)
        if not os.path.exists(candidate):
            return candidate
        counter += 1
def sanatize_file_name(file_name):
    """Return *file_name* with a fixed set of punctuation characters removed.

    The characters stripped are backslash, slash, asterisk, question mark,
    colon, double quote, angle brackets and pipe; everything else is kept.
    """
    forbidden = re.compile(r'[\\/*?:"<>|]')
    return forbidden.sub("", file_name)
def save_images(doc, image_directory, timeout=30):
    """Download every image referenced by the document and relink it locally.

    Each ``<img>`` element that has a ``src`` attribute is fetched with
    ``requests`` and written into *image_directory* under a sanitized file
    name that does not clash with an existing file.  Its ``src`` is then
    rewritten to ``/<image-directory-name>/<file-name>``.  Images that
    cannot be downloaded are reported on stdout and keep their original
    ``src``.

    Args:
        doc: Parsed HTML tree supporting ``xpath``; modified in place.
        image_directory: Directory that receives the downloaded files.
        timeout: Seconds to wait on the server before giving up on an image.
    """
    # Name of the image directory itself, used as the first URL segment.
    # normpath drops a trailing separator so the name is never empty.
    site_directory = os.path.basename(os.path.normpath(image_directory))

    for img in doc.xpath('//img'):
        url = img.attrib.get('src')
        if url is None:
            continue

        # An unusable URL or a network failure is handled like a bad HTTP
        # status: report it and carry on with the remaining images.
        try:
            r = requests.get(url, stream=True, timeout=timeout)
        except requests.RequestException as error:
            print('Error processing image (%s): %s' % (url, error))
            continue

        # With stream=True the connection stays open until the response is
        # closed, so always release it, whatever happens below.
        try:
            if r.status_code != 200:
                print('Error processing image (%s): %d' % (url, r.status_code))
                continue

            filename = sanatize_file_name(url.split('/')[-1])
            filepath = get_unique_file_name(os.path.join(image_directory, filename))
            # The unique-name step may have added a suffix; link to the real name.
            filename = os.path.basename(filepath)

            with open(filepath, 'wb') as w:
                # Have urllib3 undo gzip/deflate transfer encoding while copying.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, w)
        finally:
            r.close()

        img.attrib['src'] = '/%s/%s' % (site_directory, filename)
def extract_metadata(doc):
    """Return ``(title, date)`` read from a parsed post.

    The title is the text content of the first ``<title>`` element.  The
    date is the first ten characters of the first ``<time datetime=...>``
    attribute value.  An ``IndexError`` is raised if either is missing.
    """
    title_element = doc.xpath('//title')[0]
    title = etree.tostring(title_element, method='text', encoding='unicode')

    timestamp = doc.xpath('//time/@datetime')[0]
    date = timestamp[:10]

    return title, date
def convert_post(doc):
    """Strip page furniture from a parsed post and return it as Markdown.

    For each XPath below, the first matching element (if any) is removed
    from *doc* in place.  The remaining tree is serialized to HTML and
    passed through ``markdownify``.
    """
    unwanted = (
        '//head',
        '//header',
        '//*[contains(@class, "graf--title")]',
        '//section[@data-field="subtitle"]',
        '//footer',
    )
    for expression in unwanted:
        matches = doc.xpath(expression)
        if not matches:
            continue
        # Only the first match is dropped; later matches stay in the tree.
        matches[0].drop_tree()

    remaining_html = etree.tostring(doc, encoding='unicode')
    return markdownify(remaining_html)
def format_frontmatter(markdown, title, date):
    """Return the post body prefixed with Jekyll YAML front matter.

    Args:
        markdown: Converted post body, placed after the front matter.
        title: Post title; written as a YAML double-quoted string.
        date: Publication date string, written verbatim.

    Returns:
        The complete post text: a ``---`` delimited block with ``layout``,
        ``title`` and ``date`` keys, a blank line, then *markdown*.
    """
    # In a YAML double-quoted scalar the backslash is the escape character
    # and a bare quote ends the value.  Escape backslashes first so the
    # ones added in front of quotes are not doubled again.
    escaped_title = title.replace('\\', '\\\\').replace('"', '\\"')

    return (
        '---\n'
        'layout:\tpost\n'
        'title:\t"%s"\n'
        'date:\t%s\n'
        '---\n\n%s'
    ) % (escaped_title, date, markdown)
def format_output_filename(filename):
    """Turn an exported HTML file name into a ``.markdown`` post file name.

    The name is lower-cased and underscores become hyphens, because Jekyll
    expects all separators to be hyphens.  The trailing word that Medium
    appends to its URLs is then removed together with the ``.html``
    extension, and ``.markdown`` is appended.
    """
    hyphenated = filename.lower().replace('_', '-')
    # Strip the extra characters Medium has at the end of its URLs.
    stem = re.sub(r'-*?\w*?\.html$', '', hyphenated)
    return stem + '.markdown'
def main():
    """Convert every exported Medium post into a Jekyll post.

    Expects two command-line arguments: the directory holding the exported
    Medium HTML files and the root directory of the Jekyll site.  Images
    are saved under ``<jekyll-root>/img`` and posts are written to
    ``<jekyll-root>/_posts``; both directories are created when missing.
    Files whose name starts with ``draft`` or does not end in ``.html``
    are skipped.  On invalid arguments the usage text is printed and the
    process exits with status -1.
    """
    if len(sys.argv) != 3:
        usage()
        sys.exit(-1)

    medium_directory = sys.argv[1]
    if not os.path.isdir(medium_directory):
        usage()
        print('Invalid Medium directory')
        sys.exit(-1)

    jekyll_directory = sys.argv[2]
    if not os.path.isdir(jekyll_directory):
        usage()
        print('Invalid Jekyll directory')
        sys.exit(-1)

    img_directory = os.path.join(jekyll_directory, 'img')
    # Test for a plain file first: checking isdir() first would send a file
    # named `img` straight into os.mkdir(), which fails on an existing path.
    if os.path.isfile(img_directory):
        usage()
        print('Jekyll directory contains `img` file instead of directory')
        sys.exit(-1)
    if not os.path.isdir(img_directory):
        os.mkdir(img_directory)

    # Make sure the output directory exists before any image is downloaded,
    # so a missing `_posts` does not fail the run halfway through.
    posts_directory = os.path.join(jekyll_directory, '_posts')
    if not os.path.isdir(posts_directory):
        os.mkdir(posts_directory)

    for filename in os.listdir(medium_directory):
        if filename.startswith('draft') or not filename.endswith('.html'):
            continue

        with open(os.path.join(medium_directory, filename), encoding="utf8") as f:
            html = f.read()

        doc = lxml.html.soupparser.fromstring(html)
        # Metadata must be read before convert_post() strips <head>/<header>.
        title, date = extract_metadata(doc)
        save_images(doc, img_directory)
        markdown = convert_post(doc)
        post = format_frontmatter(markdown, title, date)

        output_filename = format_output_filename(filename)
        with open(os.path.join(posts_directory, output_filename), 'wb') as out:
            out.write(post.encode('utf-8'))

        print('Converted %s (Published %s)' % (title, date))
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()