diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index 830235d..9c5f945 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -56,11 +56,12 @@ import argparse import bz2 import logging -import os.path +import os import re # TODO use regex when it will be standard import sys from io import StringIO from multiprocessing import Queue, get_context, cpu_count +from multiprocessing.dummy import Process as WinProcess from timeit import default_timer from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces @@ -180,7 +181,8 @@ def open(self, filename): if self.compress: return bz2.BZ2File(filename + '.bz2', 'w') else: - return open(filename, 'w') + # Fixes default encoding on Windows + return open(filename, 'w', encoding="utf-8") # ---------------------------------------------------------------------- @@ -414,7 +416,13 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, # - a reduce process collects the results, sort them and print them. # fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object - Process = get_context("fork").Process + # fixes Windows errors: + # 1. (when using "fork"):ValueError: cannot find context for 'fork' + # 2. 
(when using "spawn"):TypeError: cannot pickle '_io.TextIOWrapper' object + if os.name == 'nt': + Process = WinProcess + else: + Process = get_context("fork").Process maxsize = 10 * process_count # output queue diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index a00e23d..a5a0af4 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -380,11 +380,11 @@ def dropSpans(spans, text): # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052 EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]' ExtLinkBracketedRegex = re.compile( - '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', + '(?i)\[((' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', re.S | re.U) EXT_IMAGE_REGEX = re.compile( - r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) - /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""", + r"""(?i)^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) + /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.(gif|png|jpg|jpeg)$""", re.X | re.S | re.U) @@ -1220,7 +1220,12 @@ def expandTemplate(self, body): template = Template.parse(templates[title]) # add it to cache templateCache[title] = template - del templates[title] + try: + del templates[title] + except KeyError: + # Duplicated articles + # Noticed with article 'Ашаблон:Акарточка аполитик' from the ab-wiki dump + pass else: # The page being included could not be identified return ''