xmldump2files.py
#!/usr/bin/python
# Split a Wikipedia XML dump into individual files. The files are stored in a
# directory tree based on hashing the title of the article.
#
# Evan Jones <[email protected]>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html
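#
# Usage (inferred from the sys.argv handling at the bottom of this script):
#   ./xmldump2files.py <pages-articles.xml> <output-root>
# Note: this is Python 2 code (md5 module, urllib.quote). Progress is appended
# to xmldump2files.log in the working directory.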
import md5
import os
import sys
import urllib
import xml.sax
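
# Running totals for the progress line that is periodically appended to the log.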
deletedTotal = 0
redirectsTotal = 0
bytesTotal = 0
bytesOut = 0
articleSkip = 0
articleWrite = 0
log = open("xmldump2files.log", "a")

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.2f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.2f%s%s" % (num, 'Yi', suffix)

def writeArticle(root, title, text):
    global articleSkip
    global articleWrite
    global deletedTotal
    global bytesOut
    global bytesTotal
    global log
    global redirectsTotal
    # ~5.5 million articles at the moment
    # assuming an even distribution, we want 2 levels of 2 character directories:
    # 5.5 million / 256 / 256 = 83
    # Thus we won't have too many items in any directory
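    # Example (hypothetical digest, not a real hash): a title whose MD5 hex
    # digest starts with "ab03..." would be stored as <root>/ab/03/<quoted_title>.txt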
    title = title.encode("UTF-8")
    hash = md5.new(title).hexdigest()
    level1 = os.path.join(root, hash[0:2])
    level2 = os.path.join(level1, hash[2:4])
    # Wikipedia-ize the title for the file name
    title = title.replace(" ", "_")
    title = urllib.quote(title)
    # Special case for /: "%x" % ord("/") == 2f
    title = title.replace("/", "%2F")
    # remove file using old filename
    oldTitle = title
    if len(oldTitle) < 256 and len(oldTitle) > 123:
        oldTitle += ".txt"
        oldFilename = os.path.join(level2, oldTitle)
        if os.path.exists(oldFilename):
            deletedTotal = deletedTotal + 1
            os.remove(oldFilename)
            #log.write("Deleting oldFilename")
            #log.write(oldFilename)
            #log.write("\n")
        return
    if len(title) > 123:
        title = hash
    title += ".txt"
    # print title
    filename = os.path.join(level2, title)
    if not os.path.exists(level1):
        os.mkdir(level1)
    if not os.path.exists(level2):
        os.mkdir(level2)
    if text.startswith("#REDIRECT [[") or text.startswith("#REDIRECT[["):
        redirectsTotal = redirectsTotal + 1
        #if os.path.exists(filename):
        #    deletedTotal = deletedTotal + 1
        #    os.remove(filename)
        #    log.write("Deleting redirect")
        #    log.write(filename)
        #    log.write("\n")
        return
    encoded = text.encode("UTF-8")
    bytesTotal = bytesTotal + len(encoded)
    if not os.path.exists(filename):
        out = open(filename, "w")
        out.write(encoded)
        out.close()
        bytesOut = bytesOut + len(encoded)
        articleWrite = articleWrite + 1
        #log.write("Writing missing file")
        #log.write(filename)
        #log.write("\n")
    else:
        articleSkip = articleSkip + 1
    if (articleSkip + articleWrite) % 1000 == 0:
        percentComplete = (articleSkip + articleWrite) * 100 / 5500000
        string = "Redirects %d Deleted %d Skipped %d Wrote %d %s Total %d %s (%d%%)\n" % (redirectsTotal, deletedTotal, articleSkip, articleWrite, sizeof_fmt(bytesOut), articleWrite + articleSkip, sizeof_fmt(bytesTotal), percentComplete)
        # log = open("xmldump2files.log", "a")
        log.write(string)
        log.flush()
        # log.close()
        # print string

class WikiPageSplitter(xml.sax.ContentHandler):
    def __init__(self, root):
        self.root = root
        self.stack = []
        self.text = None
        self.title = None

    def startElement(self, name, attributes):
        #~ print "start", name
        if name == "page":
            assert self.stack == []
            self.text = None
            self.title = None
        elif name == "title":
            assert self.stack == ["page"]
            assert self.title is None
            self.title = ""
        elif name == "text":
            assert self.stack == ["page"]
            assert self.text is None
            self.text = ""
        else:
            assert len(self.stack) == 0 or self.stack[-1] == "page"
            return
        self.stack.append(name)

    def endElement(self, name):
        #~ print "end", name
        if len(self.stack) > 0 and name == self.stack[-1]:
            del self.stack[-1]
        if name == "text":
            # We have the complete article: write it out
            writeArticle(self.root, self.title, self.text)

    def characters(self, content):
        assert content is not None and len(content) > 0
        if len(self.stack) == 0:
            return
        if self.stack[-1] == "title":
            self.title += content
        elif self.stack[-1] == "text":
            assert self.title is not None
            self.text += content
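
# Entry point: stream-parse the dump named by argv[1] and write one file per
# article under the output root given as argv[2].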
xml.sax.parse(sys.argv[1], WikiPageSplitter(sys.argv[2]))
log.write("done\n")
log.close()
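
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: mirror the hashing
# above to locate the file for a given title when reading the tree back.
# The helper name article_path is ours; it ignores the old-filename cleanup
# branch in writeArticle() and assumes the article was actually written.
#
# def article_path(root, title):
#     title = title.encode("UTF-8")
#     digest = md5.new(title).hexdigest()
#     name = urllib.quote(title.replace(" ", "_")).replace("/", "%2F")
#     if len(name) > 123:
#         name = digest
#     return os.path.join(root, digest[0:2], digest[2:4], name + ".txt")
#
# Example: open(article_path("articles", u"Anarchism")).read()
# ---------------------------------------------------------------------------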