diff --git a/Tools/unicode/gen_expat_table.py b/Tools/unicode/gen_expat_table.py new file mode 100644 index 000000000000000..ff08fb4c87dab39 --- /dev/null +++ b/Tools/unicode/gen_expat_table.py @@ -0,0 +1,107 @@ +import sys + + +def info(msg, *args): + print(msg % args, file=sys.stderr) + +def fail(msg, *args): + info(msg, *args) + sys.exit(2) + +# ASCII characters that can appear in a well-formed XML document +# except the characters "$@\^`{}~". +compulsory_chars = (set(b'\t\n\r') | set(range(32, 127))) - set(br'$@\^`{}~') + +def check_compatibility(encoding): + for i in compulsory_chars: + c = chr(i) + b = bytes([i]) + try: + decoded = b.decode(encoding) + except UnicodeDecodeError: + info('Cannot decode byte %#04x (%a)', i, c) + return False + if decoded != c: + info('Incompatible encoding: %#04x (%a) -> %a', i, c, decoded) + return False + return True + +def create_table(encoding): + mapping = [-1] * 256 + + non_bmp = None + for i in range(0x110000): + char = chr(i) + try: + encoded = char.encode(encoding) + except UnicodeEncodeError: + continue + if i >= 0x10000 and non_bmp is None: + non_bmp = i + info('Non-BMP character: %r (U+%04X)', char, i) + length = len(encoded) + k = encoded[0] + v = mapping[k] + if length == 1: + if v == -1: + mapping[k] = i + elif v < 0: + fail('Ambiguous mapping for %#04x: ' + '%r (U+%04X) and %d-byte sequence', + k, c, i, -v) + else: + info('Ambiguous mapping for %#04x: ' + '%r (U+%04X) and %r (U+%04X)', + k, chr(v), v, char, i) + else: + if length > 4: + fail('Too long encoding for %r (U+%04X): %r', + char, i, encoded) + if v == -1: + mapping[k] = -length + elif v != -length: + if v < 0: + fail('Ambiguous mapping for %#04x: ' + '%d-byte sequence and %d-byte sequence %r', + k, -v, length, encoded) + else: + fail('Ambiguous mapping for %#04x: ' + '%r (U+%04X) and %d-byte sequence %r', + k, chr(v), v, length, encoded) + return mapping + +def print_table(mapping): + print(' '*8 + '_expat_decoding_table=(', end='') + if mapping[:128] == list(range(128)): + print('*range(128),') + start = 128 + else: + print() + start = 0 + line = '' + pos = 0 + i = start + while True: + if i % 8 == 0: + if len(line) > 68: + print(' '*12 + line[:pos].rstrip()) + line = line[pos:] + pos = len(line) + if i == 256: + break + v = mapping[i] + if v >= 0: + v = hex(v) + line += f'{v}, ' + i += 1 + print(' '*12 + line.rstrip().rstrip(',') + '),') + + +encoding = sys.argv[1] + +if not check_compatibility(encoding): + print(' '*8 + '_expat_decoding_table=False,') + sys.exit(1) + +mapping = create_table(encoding) +print_table(mapping)