attardi · rogerioacp · Aug 12, 2020 · Aug 13, 2020 · Aug 14, 2020 · Aug 14, 2020
diff --git a/.gitignore b/.gitignore
@@ -90,4 +90,9 @@ ENV/
 .spyderproject
 
 # Editor files
-*.idea
+*.idea
+
+#Wikipedia Test Files
+
+*.xml
+*.xml.bz2
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # WikiExtractor
 [WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
 
-The tool is written in Python and requires Python 2.7 or Python 3.3+ but no additional library.
+The tool is written in Python and requires Python 3.6+ but no additional library.
 
 For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
 
@@ -33,7 +33,7 @@ The script is invoked with a Wikipedia dump file as an argument.
 The output is stored in several files of similar size in a given directory.
 Each file will contains several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).
 
-    usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
+    usage: wikiextractor [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
                             [-l] [-s] [--lists] [-ns ns1,ns2]
                             [--templates TEMPLATES] [--no-templates] [-r]
                             [--min_text_length MIN_TEXT_LENGTH]

diff --git a/extract.sh b/extract.sh
@@ -14,7 +14,17 @@ PROCESSES=$2
 TEMPLATES=$3
 OUTPUT=$4
 
-python WikiExtractor.py $INPUT \
+
+# Abort early with a hint if the wikiextractor command is not on PATH (portable redirection, no bash-only `&>`).
+if ! command -v wikiextractor > /dev/null 2>&1
+then
+
+    echo "WikiExtractor is not installed. Please install it to use the script."
+    echo "More details on the installation process can be found in README."
+    exit 1
+fi
+
+wikiextractor $INPUT \
        --json \
        --processes $PROCESSES \
        --templates $TEMPLATES \

diff --git a/setup.py b/setup.py
@@ -0,0 +1,40 @@
+from setuptools import setup, find_packages
+import re
+
+from wikiextractor.WikiExtractor import version
+
+
+def get_version(version):  # normalize the version string handed to setup()
+    if re.match(r'^\d+\.\d+$', version):  # only two numeric parts, "X.Y"
+        return version + '.0'  # pad to three parts: "X.Y" -> "X.Y.0"
+    return version  # any other form is returned unchanged
+
+with open("README.md", "r", encoding="utf-8") as fh:  # explicit encoding: do not depend on the locale default
+    long_description = fh.read()  # README text is reused as the package's long description
+
+setup(
+    name='wikiextractor',
+    version=get_version(version),  # version imported from wikiextractor.WikiExtractor; "X.Y" is padded to "X.Y.0"
+    author='Giuseppe Attardi',
+    author_email='[email protected]',
+    description='A tool for extracting plain text from Wikipedia dumps',
+    long_description=long_description,  # contents of README.md read above
+    long_description_content_type="text/markdown",
+    license='GNU Affero General Public License',
+    install_requires=[],  # no third-party dependencies declared
+    url="https://github.com/attardi/wikiextractor",
+    packages=find_packages(include=["wikiextractor"]),  # ship only the wikiextractor package
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'Intended Audience :: Developers',
+        'Topic :: Text Processing :: Linguistic',
+        'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)',
+        'Programming Language :: Python :: 3'
+     ],
+    entry_points={
+        "console_scripts": [
+            "wikiextractor = wikiextractor.WikiExtractor:main"  # `wikiextractor` command runs main() in wikiextractor/WikiExtractor.py
+            ]
+        },
+    python_requires='>=3.6',
+)
diff --git a/WikiExtractor.py → wikiextractor/WikiExtractor.py b/WikiExtractor.py → wikiextractor/WikiExtractor.py
@@ -2873,6 +2873,14 @@ def hook(filename, mode):
                 return gzip.open(filename, mode, encoding=encoding)
             elif ext == '.bz2':
                 import bz2
+                # FileInput opens files with mode 'r', which bz2.open treats as binary ('rb'); the wiki dumps are text, so the mode must be 'rt'.
+                try:
+                    if mode != 'r':
+                        raise ValueError()
+                    mode = mode +'t'
+                except ValueError:
+                    logging.error('File read mode is invalid %s', mode)
+                    return
                 return bz2.open(filename, mode, encoding=encoding)
             else:
                 return open(filename, mode, encoding=encoding)

diff --git a/wikiextractor/__init__.py b/wikiextractor/__init__.py