From a26546e10db2af9a2afc74ed084186d16a806248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= <radek.gryta@gmail.com>
Date: Sat, 3 Jun 2023 18:16:09 +0200
Subject: [PATCH 1/3] Add support for Windows

---
 wikiextractor/WikiExtractor.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 830235d..9c5f945 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -56,11 +56,12 @@
 import argparse
 import bz2
 import logging
-import os.path
+import os
 import re  # TODO use regex when it will be standard
 import sys
 from io import StringIO
 from multiprocessing import Queue, get_context, cpu_count
+from multiprocessing.dummy import Process as WinProcess
 from timeit import default_timer
 
 from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
@@ -180,7 +181,8 @@ def open(self, filename):
         if self.compress:
             return bz2.BZ2File(filename + '.bz2', 'w')
         else:
-            return open(filename, 'w')
+            # Force UTF-8: the default text encoding on Windows is locale-dependent
+            return open(filename, 'w', encoding="utf-8")
 
 
 # ----------------------------------------------------------------------
@@ -414,7 +416,13 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # - a reduce process collects the results, sort them and print them.
 
     # fixes MacOS error: TypeError: cannot pickle '_io.TextIOWrapper' object
-    Process = get_context("fork").Process
+    # fixes Windows errors:
+    # 1. (when using "fork"): ValueError: cannot find context for 'fork'
+    # 2. (when using "spawn"): TypeError: cannot pickle '_io.TextIOWrapper' object
+    if os.name == 'nt':
+        Process = WinProcess
+    else:
+        Process = get_context("fork").Process
 
     maxsize = 10 * process_count
     # output queue

From 1e54c452a9578a51a700a25716bd6f47958dbb4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= <radek.gryta@gmail.com>
Date: Sat, 3 Jun 2023 18:18:13 +0200
Subject: [PATCH 2/3] Fix regex error for python>=3.10

---
 wikiextractor/extract.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index a00e23d..ff7b982 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -380,11 +380,11 @@ def dropSpans(spans, text):
 # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
 EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
 ExtLinkBracketedRegex = re.compile(
-    '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
+    '(?i)\[((' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
     re.S | re.U)
 EXT_IMAGE_REGEX = re.compile(
-    r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
-    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""",
+    r"""(?i)^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
+    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.(gif|png|jpg|jpeg)$""",
     re.X | re.S | re.U)
 
 

From e538ad422822830d5042eec37932791d5f327c35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= <radek.gryta@gmail.com>
Date: Sat, 3 Jun 2023 18:36:27 +0200
Subject: [PATCH 3/3] Fix for duplicated article titles

---
 wikiextractor/extract.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index ff7b982..a5a0af4 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -1220,7 +1220,12 @@ def expandTemplate(self, body):
             template = Template.parse(templates[title])
             # add it to cache
             templateCache[title] = template
-            del templates[title]
+            try:
+                del templates[title]
+            except KeyError:
+                # Duplicated articles
+                # Noticed with article 'Ашаблон:Акарточка аполитик' from the ab-wiki dump
+                pass
         else:
             # The page being included could not be identified
             return ''