-
Notifications
You must be signed in to change notification settings - Fork 1
/
encodings.py
executable file
·289 lines (241 loc) · 9.52 KB
/
encodings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import encodings
from pkgutil import iter_modules
from collections import defaultdict
from textwrap import TextWrapper
from platform import python_version, uname
from time import strftime
import re
# Force undefined to sort last.
# UNDEFINED is the key under which get_mappings() groups the codecs that
# have no mapping for a byte.  It begins with U+1FFFF, which compares greater
# than any Basic Multilingual Plane character, so sorting the keys puts this
# entry after them.
# NOTE(review): "\U" consumes exactly eight hex digits, so the value is
# U+1FFFF followed by the literal text "fundefined" -- the ninth "f" looks
# like a typo, but the value is only compared and sorted here, so it appears
# harmless.
UNDEFINED = '\U0001fffffundefined'
class Formatter:
    """Plain-text renderer for the encoding table.

    Every method except ``emit`` returns the text for one piece of the
    table (or ``None`` when there is nothing to output); ``emit`` prints it.

    ``wrapper`` is an optional callable that receives a two-item list
    ``[prefix, text]`` and returns the text to print (``wraplines`` in this
    file has that shape).  When it is ``None`` the prefix and text are simply
    concatenated.
    """

    def __init__(self, wrapper=None):
        self.wrapper = wrapper

    def emit(self, message):
        """Print ``message``; ``None`` means "nothing to output"."""
        if message is not None:
            print(message)

    def header(self, codeclist):
        """Return the line(s) listing every codec name in ``codeclist``."""
        encs = ', '.join(codeclist)
        # The prefix carries its own trailing space in both code paths so
        # that the wrapped output matches the unwrapped
        # "Supported encodings: cp037, ..." form.
        prefix = 'Supported encodings: '
        if self.wrapper is not None:
            return self.wrapper([prefix, encs])
        return '%s%s' % (prefix, encs)

    def item(self, char):
        """Return the heading for one byte value; ``char`` is its hex text."""
        return '0x%s' % char

    def row(self, char, code, encs):
        """Return one table row.

        ``char`` is the decoded character (or ``UNDEFINED``), ``code`` its
        code point (unused for ``UNDEFINED``) and ``encs`` the already
        comma-joined names of the codecs that produce it.
        """
        if char == UNDEFINED:
            # Terminate the label like the regular rows do; otherwise the
            # first codec name runs straight into "(undefined)".
            header = '(undefined): '
        else:
            header = ' %s (U+%04x): ' % (char, code)
        if self.wrapper is not None:
            return self.wrapper([header, encs])
        return '%s%s' % (header, encs)

    def enditem(self):
        """Return the text that closes one byte value's rows (none here)."""
        return None

    def endsection(self):
        """Return the separator printed between the two byte ranges."""
        return '-' * 72

    def footer(self):
        """Return the text that closes the whole table (none here)."""
        return None
class HtmlFormatter(Formatter):
    """Formatter that renders the encoding table as a single HTML page.

    ``header`` opens the document, ``item``/``row``/``enditem`` produce one
    small table per byte value, ``endsection`` separates the two byte ranges
    and ``footer`` closes the document.
    """

    # Ordered (pattern, replacement) pairs that turn a codec name into the
    # path of a Wikipedia article; the first pattern that matches wins.
    # A ">>" marker in a replacement means "upper-case the next character",
    # which is applied after the back-references have been substituted.
    _WIKIPEDIA_LINKS = tuple(
        (re.compile(pattern), target) for pattern, target in (
            (r'^cp037$', 'Code_page_37'),
            (r'^cp(273|500|1140)$', r'Code_page_37#\1'),
            (r'^cp(437|7\d{2}|8(?!7[45])\d{2}|1006)$', r'Code_page_\1'),
            (r'^cp(125\d)$', r'Windows-\1'),
            (r'^iso8859_(\d{1,2})$', r'ISO/IEC_8859-\1'),
            (r'^hp_roman8', r'HP_Roman#Roman-8'),
            (r'^koi8_([rtu])$', r'KOI8->>\1'),
            (r'^kz1048$', 'Windows-1251#Kazakh_variant'),
            (r'^latin_1', 'ISO/IEC_8859-1'),
            (r'^mac_(armenian|roman)$', r'Mac_OS_>>\1'),
            (r'^mac_(arabic|farsi|greek)$', r'Mac>>\1_encoding'),
            (r'^mac_latin2$', 'Mac_OS_Central_European_encoding'),
            (r'^mac_(croatian|cyrillic|romanian|turkish)$',
             r'Mac_OS_>>\1_encoding'),
            (r'^mac_iceland$', 'Mac_OS_Icelandic_encoding'),
            (r'^palmos$', 'Windows-1252#Palm_OS_variant'),
            (r'tis_620$', 'Thai_Industrial_Standard_620-2533'),
        ))

    def header(self, codeclist):
        """Return the HTML prologue: document head, intro text, codec list.

        The intro embeds the generation time, the Python version and a
        ``uname -a`` style system description.
        """
        encs = self.encodingtable(codeclist)
        # Simulate uname(1) -a output
        system = uname()
        sysinfo = ' '.join(
            getattr(system, field)
            for field in ('system', 'node', 'release', 'version', 'machine'))
        # The covered ranges are range(0, 32) and range(128, 256), i.e.
        # 0x00-0x1F and 0x80-0xFF (the text used to claim 0x00-0x31).
        return '''<!DOCTYPE html>
<html lang="en" class="">
<head>
<meta charset='utf-8'>
<meta http-equiv="Content-Encoding" content="utf-8">
<meta http-equiv="Content-Language" content="en">
<title>Table of Legacy 8-bit Encodings</title>
<style>
th { white-space: nowrap; text-align: left; vertical-align: top; }
td { vertical-align: text-top; }
</style>
</head>
<body>
<h1>Table of Legacy 8-bit Encodings</h1>
<p>This table was generated from
<a href="https://github.com/tripleee/8bit/">
https://github.com/tripleee/8bit/</a>
and contains a map of the character codes 0x00-0x1F and 0x80-0xFF
in the various 8-bit encodings known by the Python version
which generated this page.</p>
<p>Section headlines like <a href="#0x80">0x80</a>
are clickable links so you can link to or bookmark
an individual character code.</p>
<p>This page was generated on %s by Python %s<br/>
<tt>%s</tt>.</p>
<p><table><tr><th>Supported encodings:</th><td>\n%s</td></tr></table></p>
<hr>
''' % (strftime('%c'), python_version(), sysinfo, encs)

    def encodingtable(self, encs):
        """Return the codec names joined by ",\\n", linked where possible.

        A name matching one of ``_WIKIPEDIA_LINKS`` becomes an ``<a>``
        element pointing at the corresponding Wikipedia article; any other
        name is emitted as plain text.
        """
        cells = []
        for enc in encs:
            for pattern, target in self._WIKIPEDIA_LINKS:
                if pattern.match(enc):
                    article = pattern.sub(target, enc)
                    # Resolve the ">>x" markers into an upper-case "X".
                    article = re.sub(
                        r'>>(.)', lambda m: m.group(1).upper(), article)
                    cells.append(
                        '<a href="https://en.wikipedia.org/wiki/%s">%s</a>'
                        % (article, enc))
                    break
            else:
                cells.append(enc)
        return ',\n'.join(cells)

    def item(self, char):
        """Return the heading for one byte value and open its table.

        ``char`` is the two-digit hex text of the byte.  The table opened
        here is closed by ``enditem``.
        """
        # Keep <a name="0xFF"> as a synonym for legacy links in this syntax
        return '<h3><a name="%s">•</a>' \
            '<a name="0x%s"> </a>' \
            '<a href="#%s">0x%s</a>' \
            '</h3>\n<p><table>' % (char, char, char, char)

    def row(self, char, code, encs):
        """Return one ``<tr>``: the glyph, its code point link, the codecs.

        Control characters are shown through their U+24xx "control picture"
        counterparts so that they are visible in the rendered page.
        """
        if char == UNDEFINED:
            header = '</th><th>(undefined)'
        else:
            if code == 0x7f:
                shown = 0x2421            # picture for DEL
            elif code <= 32:
                shown = code + 0x2400     # pictures for U+0000..U+0020
            else:
                shown = code
            header = '&#%i;</th><th>(%s)' % (shown, self.rep(code))
        # NOTE: the first <th> below is not empty -- it holds an invisible
        # zero-width character, which must be preserved.
        return '<tr><th>‌</th><th>%s</th><td>%s</td>' % (
            header, encs)

    def enditem(self):
        """Return the markup closing the table opened by ``item``."""
        return '</table></p>'

    def rep(self, code):
        """Return a "U+XXXX" link to the fileformat.info page for ``code``."""
        return '<a href="http://www.fileformat.info/' \
            'info/unicode/char/%04X/">U+%04X</a>' % (code, code)

    def endsection(self):
        """Return the rule separating the two byte ranges."""
        return '\n<hr/>\n'

    def footer(self):
        """Return the markup closing the document."""
        return '</body></html>'
def get_encodings():
    """Return the sorted names of the codecs the table should cover.

    The candidates are the modules found in the ``encodings`` package
    (technique from http://stackoverflow.com/a/1728414/874188).  Names on
    the explicit skip list below, and every name that is a key of
    ``encodings.aliases.aliases``, are removed.
    """
    skipped = {
        'aliases',
        # Exclude binary encodings, ascii encodings, reserved encodings, etc
        'bz2_codec', 'punycode', 'hex_codec', 'uu_codec', 'unicode_internal',
        'quopri_codec', 'raw_unicode_escape', 'unicode_escape',
        'base64_codec', 'zlib_codec', 'charmap', 'ascii', 'string_escape',
        'rot_13', 'undefined', 'idna', 'oem',
        # Exclude multi-byte encodings: UTF-xx
        'mbcs', 'utf_7', 'utf_8', 'utf_8_sig',
        'utf_16', 'utf_16_be', 'utf_16_le',
        'utf_32', 'utf_32_be', 'utf_32_le',
        'cp65001',
        # Chinese
        'big5hkscs', 'gbk', 'gb2312', 'hz', 'big5', 'gb18030', 'cp950',
        # Japanese
        'euc_jp', 'euc_jisx0213', 'euc_jis_2004',
        'iso2022_jp', 'iso2022_jp_ext', 'iso2022_jp_1',
        'iso2022_jp_2', 'iso2022_jp_3', 'iso2022_jp_2004',
        'shift_jis', 'shift_jis_2004', 'shift_jisx0213', 'cp932',
        # Korean
        'euc_kr', 'iso2022_kr', 'johab', 'cp949',
    }
    skipped |= set(encodings.aliases.aliases)

    available = {
        name for _finder, name, _ispkg in iter_modules(encodings.__path__)}

    def padded(name):
        # Zero-pad a trailing number so that 13 sorts before 110 and after 2.
        return re.sub(
            r'(?<=\D)(\d+)$', lambda m: "%04i" % int(m.group(1)), name)

    return sorted(available - skipped, key=padded)
def wraplines(lines):
    """Wrap text behind a prefix, aligning continuation lines under it.

    ``lines[0]`` is the prefix placed at the start of the first output line;
    the remaining item is the text to wrap.  Continuation lines are indented
    by as many spaces as the prefix is long.  Returns the wrapped lines
    joined with newlines.
    """
    prefix = lines[0]
    wrapper = TextWrapper(
        initial_indent=prefix,
        subsequent_indent=' ' * len(prefix))
    wrapped = wrapper.wrap(*lines[1:])
    return '\n'.join(wrapped)
def get_mappings(ch, codecs):
    """Yield ``(glyph, codec_names)`` pairs for the single byte ``ch``.

    The byte is decoded with every codec in ``codecs`` and the codec names
    are grouped by the character they produce.  Codecs that report
    "character maps to <undefined>" are grouped under ``UNDEFINED``; any
    other decoding error is re-raised.  Pairs are yielded in sorted glyph
    order.
    """
    raw = bytes([ch])
    by_glyph = defaultdict(list)
    for name in codecs:
        try:
            glyph = raw.decode(name)
        except UnicodeDecodeError as exc:
            if 'character maps to <undefined>' not in str(exc):
                raise
            glyph = UNDEFINED
        by_glyph[glyph].append(name)
    for glyph in sorted(by_glyph):
        yield glyph, by_glyph[glyph]
def printrange(start, end, codecs, formatter=None):
    """Emit the table entries for the byte values ``start`` .. ``end - 1``.

    For each byte this emits an item heading, one row per distinct decoding
    (as produced by ``get_mappings``) and the item terminator.

    ``formatter`` is the object whose ``item``/``row``/``enditem``/``emit``
    methods are used.  When it is omitted the module-level name
    ``formatter`` is used, which is how this function worked before the
    parameter existed; ``NameError`` is raised if that name is not defined.
    """
    if formatter is None:
        # Legacy call style: rely on the global that the command-line entry
        # point sets up.
        try:
            formatter = globals()['formatter']
        except KeyError:
            raise NameError("name 'formatter' is not defined") from None
    for ch in range(start, end):
        formatter.emit(formatter.item('%02x' % ch))
        for glyph, encs in get_mappings(ch, codecs):
            code = None if glyph == UNDEFINED else ord(glyph)
            formatter.emit(formatter.row(glyph, code, ', '.join(encs)))
        formatter.emit(formatter.enditem())


def table(formatter, codecs=None):
    """
    Render a table of all the character codes we support using the
    provided formatter.

    ``codecs`` is the list of codec names to tabulate.  When omitted, the
    module-level ``codecs`` set up by the command-line entry point is used
    if present; otherwise the list is computed with ``get_encodings()``.
    The byte ranges covered are 0-31 and 128-255.
    """
    if codecs is None:
        codecs = globals().get('codecs')
        if codecs is None:
            codecs = get_encodings()
    formatter.emit(formatter.header(codecs))
    # Pass the formatter explicitly so the one given to this function is
    # the one that is actually used for the rows.
    printrange(0, 32, codecs, formatter)
    formatter.emit(formatter.endsection())
    printrange(128, 256, codecs, formatter)
    formatter.emit(formatter.footer())
def renderings(codecs, string):
    """
    Print a string in all the encodings which can interpret it.

    ``string`` is encoded with every codec in ``codecs``; codecs that cannot
    encode it are skipped.  One line is printed per distinct byte sequence,
    followed by the list of codecs that produced it.

    Finally, if the string can be round-tripped through Latin-1 bytes and
    decoded as UTF-8 (i.e. it may be UTF-8 text that was mis-read as
    Latin-1), that decoding is printed as well.
    """
    seen = defaultdict(list)
    for codec in codecs:
        try:
            seen[string.encode(codec)].append(codec)
        except UnicodeEncodeError:
            # This codec has no representation for the string; skip it.
            continue
    for encoded, names in seen.items():
        # Drop the leading "b" of the bytes repr.
        print(f"{repr(encoded)[1:]}: {names}")
    try:
        recovered = string.encode('latin-1').decode('utf-8')
    except UnicodeError:
        # Either the string does not fit in Latin-1 (encode error) or the
        # resulting bytes are not valid UTF-8 (decode error); in both cases
        # there is simply nothing more to report.
        return
    # ``seen`` is keyed by bytes, so the decoded str can never already be
    # in it; the line is therefore always printed.
    print(f"{recovered!r}: ['utf-8']")
# Command-line entry point: either print the full table (--table) or show
# how the given strings encode in every supported codec.
if __name__ == "__main__":
    from argparse import ArgumentParser, REMAINDER

    parser = ArgumentParser(
        prog='8bit',
        description='8-bit character encoding mapping and information')
    parser.add_argument(
        '-t', '--table', dest='table', choices=['html', 'text'],
        help='Generate tabular output (specify "html" or text)')
    parser.add_argument(
        '-w', '--wrap', dest='wrap', action='store_true',
        help='Wrap text table output for limited column width')
    parser.add_argument(
        'strings', metavar='s', nargs=REMAINDER,
        help='Strings to map to various encodings')
    args = parser.parse_args()

    # Reject the unsupported combination before doing any work, and report
    # it as a usage error (message plus exit status 2) instead of a
    # traceback.
    if args.wrap and args.table == 'html':
        parser.error('Cannot specify --wrap with --table html')

    # NOTE: ``codecs`` and ``formatter`` are deliberately module-level
    # names; other functions in this file look them up as globals.
    codecs = get_encodings()
    if args.table:
        if args.table == 'html':
            formatter = HtmlFormatter()
        else:
            formatter = Formatter(wrapper=wraplines if args.wrap else None)
        table(formatter)
    else:
        renderings(codecs, ' '.join(args.strings))