From a26546e10db2af9a2afc74ed084186d16a806248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sat, 3 Jun 2023 18:16:09 +0200 Subject: [PATCH 1/3] Add support for Windows --- wikiextractor/WikiExtractor.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index 830235d..9c5f945 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -56,11 +56,12 @@ import argparse import bz2 import logging -import os.path +import os import re # TODO use regex when it will be standard import sys from io import StringIO from multiprocessing import Queue, get_context, cpu_count +from multiprocessing.dummy import Process as WinProcess from timeit import default_timer from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces @@ -180,7 +181,8 @@ def open(self, filename): if self.compress: return bz2.BZ2File(filename + '.bz2', 'w') else: - return open(filename, 'w') + # Fixes default encoding on Windows + return open(filename, 'w', encoding="utf-8") # ---------------------------------------------------------------------- @@ -414,7 +416,13 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, # - a reduce process collects the results, sort them and print them. # fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object - Process = get_context("fork").Process + # fixes Windows errors: + # 1. (when using "fork"):ValueError: cannot find context for 'fork' + # 2. 
(when using "spawn"):TypeError: cannot pickle '_io.TextIOWrapper' object + if os.name == 'nt': + Process = WinProcess + else: + Process = get_context("fork").Process maxsize = 10 * process_count # output queue From 1e54c452a9578a51a700a25716bd6f47958dbb4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sat, 3 Jun 2023 18:18:13 +0200 Subject: [PATCH 2/3] Fix regex error for python>=3.10 --- wikiextractor/extract.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index a00e23d..ff7b982 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -380,11 +380,11 @@ def dropSpans(spans, text): # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052 EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]' ExtLinkBracketedRegex = re.compile( - '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', + '(?i)\[((' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', re.S | re.U) EXT_IMAGE_REGEX = re.compile( - r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) - /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""", + r"""(?i)^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) + /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.(gif|png|jpg|jpeg)$""", re.X | re.S | re.U) From e538ad422822830d5042eec37932791d5f327c35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sat, 3 Jun 2023 18:36:27 +0200 Subject: [PATCH 3/3] Fix for duplicated article titles --- wikiextractor/extract.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index ff7b982..a5a0af4 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -1220,7 +1220,12 @@ def expandTemplate(self, body): template = Template.parse(templates[title]) # add it to cache templateCache[title] = template - del templates[title] + 
try: + del templates[title] + except KeyError: + # Duplicated articles + # Noticed with article 'Ашаблон:Акарточка аполитик' from the ab-wiki dump + pass else: # The page being included could not be identified return ''