Skip to content

Commit

Permalink
Update WikiExtractor.py
Browse files Browse the repository at this point in the history
Fix for "TypeError: cannot pickle '_io.TextIOWrapper' object" on macOS.
Allow -b0 for saving a single article per file.
  • Loading branch information
attardi authored Oct 14, 2021
1 parent 0242d58 commit 1053fe2
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions wikiextractor/WikiExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
import re # TODO use regex when it will be standard
import sys
from io import StringIO
from multiprocessing import Queue, Process, cpu_count
from multiprocessing import Queue, get_context, cpu_count
from timeit import default_timer

from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
Expand Down Expand Up @@ -103,7 +103,7 @@

# def expandTemplates(text):
# """Expand templates invoking MediaWiki API"""
# text = urlib.urlencodew(text.encode('utf-8'))
# text = urlib.urlencodew(text)
# base = urlbase[:urlbase.rfind('/')]
# url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text
# exp = json.loads(urllib.urlopen(url))
Expand Down Expand Up @@ -170,7 +170,7 @@ def reserve(self, size):
def write(self, data):
self.reserve(len(data))
if self.compress:
self.file.write(data.encode('utf-8'))
self.file.write(data)
else:
self.file.write(data)

Expand Down Expand Up @@ -247,11 +247,11 @@ def load_templates(file, output_file=None):
if output_file and (title.startswith(templatePrefix) or
title.startswith(modulePrefix)):
output.write('<page>\n')
output.write(' <title>%s</title>\n' % title.encode('utf-8'))
output.write(' <title>%s</title>\n' % title)
output.write(' <ns>10</ns>\n')
output.write(' <text>')
for line in page:
output.write(line.encode('utf-8'))
output.write(line)
output.write(' </text>\n')
output.write('</page>\n')
page = []
Expand Down Expand Up @@ -355,6 +355,9 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# - pages to be processed are dispatched to workers
# - a reduce process collects the results, sorts them and prints them.

# fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object
Process = get_context("fork").Process

maxsize = 10 * process_count
# output queue
output_queue = Queue(maxsize=maxsize)
Expand Down Expand Up @@ -524,7 +527,7 @@ def main():
groupO.add_argument("-o", "--output", default="text",
help="directory for extracted files (or '-' for dumping to stdout)")
groupO.add_argument("-b", "--bytes", default="1M",
help="maximum bytes per output file (default %(default)s)",
help="maximum bytes per output file (default %(default)s); 0 means to put a single article per file",
metavar="n[KMG]")
groupO.add_argument("-c", "--compress", action="store_true",
help="compress output files using bzip")
Expand Down Expand Up @@ -571,8 +574,9 @@ def main():

try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1
file_size = int(args.bytes[:-1]) * 1024 ** power
if file_size < minFileSize:
# 0 bytes means put a single article per file.
file_size = 0 if args.bytes == '0' else int(args.bytes[:-1]) * 1024 ** power
if file_size and file_size < minFileSize:
raise ValueError()
except ValueError:
logging.error('Insufficient or invalid size: %s', args.bytes)
Expand Down

0 comments on commit 1053fe2

Please sign in to comment.