Skip to content

Commit

Permalink
Merge pull request #183 from albertvillanova/fix-encoding
Browse files Browse the repository at this point in the history
Force 'utf-8' encoding without relying on platform-dependent default
  • Loading branch information
attardi authored Jul 22, 2020
2 parents 08985ca + ff9a70c commit 6408a43
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion WikiExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2865,10 +2865,23 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
:param process_count: number of extraction processes to spawn.
"""

def hook_compressed_encoded(encoding):
    """Build a ``fileinput`` open hook that decodes text with *encoding*.

    The returned hook picks the opener from the file extension: ``.gz`` uses
    ``gzip.open``, ``.bz2`` uses ``bz2.open``, and anything else uses the
    builtin ``open``.

    :param encoding: text encoding applied when the file is opened in text
        mode (for example ``'utf-8'``).
    :return: a callable ``hook(filename, mode)`` suitable for the
        ``openhook`` argument of ``fileinput.FileInput``.
    """
    def hook(filename, mode):
        """Open *filename* in *mode*, decompressing by extension if needed."""
        ext = os.path.splitext(filename)[1]
        if ext == '.gz':
            import gzip
            opener = gzip.open
        elif ext == '.bz2':
            import bz2
            opener = bz2.open
        else:
            opener = open

        # Binary streams carry no text encoding; all three openers raise
        # ValueError if one is supplied together with a 'b' mode.
        if 'b' in mode:
            return opener(filename, mode)

        # fileinput passes a bare 'r'. gzip.open and bz2.open treat that as
        # *binary* and then reject the encoding argument, so text mode has
        # to be requested explicitly for the compressed openers.
        if opener is not open and 't' not in mode:
            mode += 't'
        return opener(filename, mode, encoding=encoding)
    return hook

if input_file == '-':
input = sys.stdin
else:
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
input = fileinput.FileInput(input_file, openhook=hook_compressed_encoded('utf-8'))

# collect siteinfo
for line in input:
Expand Down

0 comments on commit 6408a43

Please sign in to comment.