-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathunicode2ascii.py
209 lines (168 loc) · 6.54 KB
/
unicode2ascii.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
# Replaces Unicode characters in input text with ASCII
# approximations based on file with mappings between the two.
import sys
import os
import codecs
import re
from io import StringIO
from logging import warn
# The name of the file from which to read the replacements. Each line
# should contain the hex code for the Unicode character, a TAB, and
# the replacement string. read_u2a_data() looks for this name first
# as given (relative to the current directory) and then next to this
# script.
MAPPING_FILE_NAME = "entities.dat"
# Cached Unicode-to-ASCII mapping; None until read_u2a_data() loads it.
u2a_mapping = None
# For statistics and summary of missing mappings in verbose mode.
# Both are filled in by convert_u2a().
# map_count: character -> number of times it was replaced.
map_count = {}
# missing_mapping: character -> number of times no mapping was found.
missing_mapping = {}
# Kept for backward compatibility. This helper originally worked
# around narrow Python 2 builds (from @marcovzla, see
# https://github.com/spyysalo/nxml2txt/pull/4). On Python 3 chr()
# covers the whole Unicode range, so no fallback is needed; the old
# fallback called str.decode(), which does not exist on Python 3.
def wide_unichr(i):
    """
    Returns the one-character string for the Unicode code point i.

    Raises ValueError if i is outside the range 0..0x10FFFF
    (OverflowError if i does not even fit in a C int).
    """
    return chr(i)
def read_mapping(f, fn="mapping data"):
    """
    Reads in mapping from Unicode to ASCII from the given input stream
    and returns a dictionary keyed by Unicode characters with the
    corresponding ASCII characters as values. The expected mapping
    format defines a single mapping per line, each with the format
    CODE\tASC where CODE is the Unicode code point as a hex number and
    ASC is the replacement ASCII string ("\t" is the literal tab
    character). Any lines beginning with "#" are skipped as comments.

    f is any iterable of text lines; fn is only used to name the
    source in error messages.

    Raises ValueError for a line that does not match the format, for
    a CODE that is not a valid code point, and for two different
    replacements given for the same character.
    """
    # CODE (four or more hex digits), TAB, replacement string
    linere = re.compile(r'^([0-9A-Fa-f]{4,})\t(.*)$')
    mapping = {}
    for lineno, line in enumerate(f, start=1):
        # ignore lines starting with "#" as comments
        if line.startswith("#"):
            continue
        # drop the line terminator so that a "\r" from CRLF data does
        # not end up in the replacement string
        text = line.rstrip("\r\n")
        m = linere.match(text)
        # explicit raise instead of assert: must survive "python -O"
        if m is None:
            raise ValueError("Format error in %s line %s: '%s'"
                             % (fn, lineno, text))
        code, repl = m.groups()
        try:
            # chr() handles the full Unicode range on Python 3
            char = chr(int(code, 16))
        except (ValueError, OverflowError) as err:
            raise ValueError("Format error in %s line %s: invalid code "
                             "point '%s'" % (fn, lineno, code)) from err
        # exception: literal '\n' maps to newline. Normalize before
        # the conflict check so that a repeated, identical "\n" entry
        # is not reported as conflicting with itself.
        if repl == '\\n':
            repl = '\n'
        if char in mapping and mapping[char] != repl:
            raise ValueError("ERROR: conflicting mappings for %.4X: "
                             "'%s' and '%s'"
                             % (ord(char), mapping[char], repl))
        mapping[char] = repl
    return mapping
def convert_u2a(f, out=None, mapping=None):
    """
    Applies the given mapping to replace characters other than 7-bit
    ASCII from the given input stream f, writing the mapped text to
    the given output stream out.

    f may be a text stream or a plain string. If mapping is None, the
    module-level u2a_mapping is used. Characters without a mapping
    are written as "<XXXX>" with their hex code point. Replacements
    and misses are counted in map_count and missing_mapping.

    Returns the converted text as a string if out is None, and out
    itself otherwise.
    """
    if mapping is None:
        mapping = u2a_mapping
    if isinstance(f, str):
        f = StringIO(f)
    return_text = out is None
    if return_text:
        out = StringIO()
    pieces = []
    for c in f.read():
        if ord(c) >= 128:
            # higher than 7-bit ASCII, might wish to map
            if c in mapping:
                map_count[c] = map_count.get(c, 0) + 1
                c = mapping[c]
            else:
                missing_mapping[c] = missing_mapping.get(c, 0) + 1
                # escape into numeric Unicode codepoint
                c = "<%.4X>" % ord(c)
        pieces.append(c)
    # out is a text stream, so write str (not UTF-8 encoded bytes),
    # and do it in one call rather than once per character
    out.write("".join(pieces))
    if return_text:
        return out.getvalue()
    return out
def print_summary(out, mapping):
    """
    Prints human-readable summary of statistics and missing mappings
    for the input into the given output stream.

    out is a text stream; mapping is the character mapping that was
    used for the conversion (the module-level u2a_mapping is used if
    it is None). Both lists are ordered by descending count as
    recorded in map_count and missing_mapping.
    """
    if mapping is None:
        mapping = u2a_mapping
    print("Characters replaced \t%d" % sum(map_count.values()), file=out)
    # most frequent first; sorting is stable, so ties keep their
    # first-seen order
    for c in sorted(map_count, key=map_count.get, reverse=True):
        try:
            print("\t%.4X\t%s\t'%s'\t%d"
                  % (ord(c), c, mapping[c], map_count[c]), file=out)
        except UnicodeError:
            # out cannot encode the character itself: leave it out
            print("\t%.4X\t'%s'\t%d"
                  % (ord(c), mapping[c], map_count[c]), file=out)
    print("Characters without mapping\t%d"
          % sum(missing_mapping.values()), file=out)
    for c in sorted(missing_mapping, key=missing_mapping.get, reverse=True):
        try:
            print("\t%.4X\t%s\t%d"
                  % (ord(c), c, missing_mapping[c]), file=out)
        except UnicodeError:
            # out cannot encode the character itself: show a placeholder
            print("\t%.4X\t?\t%d" % (ord(c), missing_mapping[c]), file=out)
def argparser():
    """
    Builds the command-line argument parser for the script: an
    optional output directory, a verbosity flag, and one or more
    input files.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Replaces Unicode characters in input text with ASCII approximations."
    )
    # (option strings, add_argument settings), in display order
    argument_specs = (
        (('-d', '--directory'),
         dict(default=None, help="Directory for output (stdout by default)")),
        (('-v', '--verbose'),
         dict(default=False, action='store_true', help="Verbose output")),
        (('file',),
         dict(nargs='+', help='Input text file')),
    )
    for names, settings in argument_specs:
        parser.add_argument(*names, **settings)
    return parser
def read_u2a_data():
    """
    Loads the Unicode-to-ASCII mapping from MAPPING_FILE_NAME, caches
    it in the module-level u2a_mapping and returns it. Later calls
    return the cached mapping without reading the file again.

    The file is looked up first as given and, if that does not exist,
    in the directory of this script. Raises IOError (OSError) if the
    file cannot be opened.
    """
    global u2a_mapping
    # don't read twice
    if u2a_mapping is not None:
        return u2a_mapping
    mapfn = MAPPING_FILE_NAME
    if not os.path.exists(mapfn):
        # fall back to trying in script dir
        mapfn = os.path.join(os.path.dirname(__file__),
                             os.path.basename(MAPPING_FILE_NAME))
    # Built-in open() instead of codecs.open(): it is the supported
    # way to read text, translates CRLF line ends, and only breaks
    # lines at real newlines (codecs readers also split at other
    # Unicode line separators).
    with open(mapfn, encoding="utf-8") as f:
        u2a_mapping = read_mapping(f, mapfn)
    return u2a_mapping
def log_missing_ascii_mappings(write=warn):
    """
    Reports the characters recorded in missing_mapping, most frequent
    first, by calling write once for a header line and once per
    character. Does nothing if no character lacked a mapping.

    write is any callable taking a single message string (by default
    the logging warn function imported at the top of the file).
    """
    if not missing_mapping:
        return
    write("Characters without ASCII mapping: %d" %
          sum(missing_mapping.values()))
    # most frequent first; sorting is stable, so ties keep their
    # first-seen order
    for c in sorted(missing_mapping, key=missing_mapping.get, reverse=True):
        try:
            write("\t%.4X\t%s\t%d" % (ord(c), c, missing_mapping[c]))
        except UnicodeError:
            # write could not handle the character itself: use "?"
            write("\t%.4X\t?\t%d" % (ord(c), missing_mapping[c]))
def main(argv):
    """
    Command-line entry point: converts each input file named in argv
    and writes the result to stdout or, with -d, to a file of the
    same base name in the given directory.

    argv is the full argument vector including the program name.
    Returns 1 if the mapping data cannot be read and 0 otherwise;
    failures on individual input files are reported on stderr and
    the remaining files are still processed.
    """
    options = argparser().parse_args(argv[1:])
    # read in mapping
    try:
        mapping = read_u2a_data()
    except IOError as e:
        print("Error reading mapping from %s: %s" % (MAPPING_FILE_NAME, e), file=sys.stderr)
        return 1
    # primary processing
    for fn in options.file:
        try:
            # Built-in open() with newline="" keeps line ends exactly
            # as they are in the input, like the binary-mode
            # codecs.open() used previously.
            with open(fn, encoding="utf-8", newline="") as f:
                if options.directory is None:
                    convert_u2a(f, sys.stdout, mapping)
                else:
                    bfn = os.path.basename(fn)
                    ofn = os.path.join(options.directory, bfn)
                    # codecs.open(ofn, 'wt', ...) fails on Python 3:
                    # it appends "b" to the mode, and "wtb" is
                    # rejected by open().
                    with open(ofn, 'w', encoding="utf-8", newline="") as out:
                        convert_u2a(f, out, mapping)
        except (IOError, UnicodeError) as e:
            # UnicodeError: input that is not valid UTF-8 should be
            # reported like any other per-file failure instead of
            # aborting the whole run
            print("Error processing %s: %s" % (fn, e), file=sys.stderr)
    # optionally print summary of mappings
    if options.verbose:
        print_summary(sys.stderr, mapping)
    return 0
# When run as a script, use the return value of main() as the process
# exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)