diff --git a/src/etc/char_private.py b/src/etc/char_private.py index 3566d143529..9d15f98e067 100644 --- a/src/etc/char_private.py +++ b/src/etc/char_private.py @@ -11,11 +11,16 @@ # except according to those terms. # This script uses the following Unicode tables: -# - Categories.txt +# - UnicodeData.txt + +from collections import namedtuple +import csv import os import subprocess +NUM_CODEPOINTS=0x110000 + def to_ranges(iter): current = None for i in iter: @@ -28,10 +33,10 @@ def to_ranges(iter): if current is not None: yield tuple(current) -def get_escaped(dictionary): - for i in range(0x110000): - if dictionary.get(i, "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and i != ord(' '): - yield i +def get_escaped(codepoints): + for c in codepoints: + if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '): + yield c.value def get_file(f): try: @@ -40,10 +45,41 @@ def get_file(f): subprocess.run(["curl", "-O", f], check=True) return open(os.path.basename(f)) -def main(): - file = get_file("http://www.unicode.org/notes/tn36/Categories.txt") +Codepoint = namedtuple('Codepoint', 'value class_') - dictionary = {int(line.split()[0], 16): line.split()[1] for line in file} +def get_codepoints(f): + r = csv.reader(f, delimiter=";") + prev_codepoint = 0 + class_first = None + for row in r: + codepoint = int(row[0], 16) + name = row[1] + class_ = row[2] + + if class_first is not None: + if not name.endswith("Last>"): + raise ValueError("Missing Last after First") + + for c in range(prev_codepoint + 1, codepoint): + yield Codepoint(c, class_first) + + class_first = None + if name.endswith("First>"): + class_first = class_ + + yield Codepoint(codepoint, class_) + prev_codepoint = codepoint + + if class_first != None: + raise ValueError("Missing Last after First") + + for c in range(prev_codepoint + 1, NUM_CODEPOINTS): + yield Codepoint(c, None) + +def main(): + file = get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt") + + codepoints = get_codepoints(file) CUTOFF=0x10000 singletons0 = [] @@ -52,7 +88,7 @@ def main(): normal1 = [] extra = [] - for a, b in to_ranges(get_escaped(dictionary)): + for a, b in to_ranges(get_escaped(codepoints)): if a > 2 * CUTOFF: extra.append((a, b - a)) elif a == b - 1: diff --git a/src/libcore/char_private.rs b/src/libcore/char_private.rs index 708e7cc15e7..ddc473592a2 100644 --- a/src/libcore/char_private.rs +++ b/src/libcore/char_private.rs @@ -11,6 +11,8 @@ // NOTE: The following code was generated by "src/etc/char_private.py", // do not edit directly! +use slice::SliceExt; + fn check(x: u16, singletons: &[u16], normal: &[u16]) -> bool { for &s in singletons { if x == s { @@ -42,7 +44,16 @@ pub fn is_printable(x: char) -> bool { } else if x < 0x20000 { check(lower, SINGLETONS1, NORMAL1) } else { - if 0x20000 <= x && x < 0x2f800 { + if 0x2a6d7 <= x && x < 0x2a700 { + return false; + } + if 0x2b735 <= x && x < 0x2b740 { + return false; + } + if 0x2b81e <= x && x < 0x2b820 { + return false; + } + if 0x2cea2 <= x && x < 0x2f800 { return false; } if 0x2fa1e <= x && x < 0xe0100 { @@ -62,10 +73,13 @@ const SINGLETONS0: &'static [u16] = &[ 0x38b, 0x38d, 0x3a2, + 0x530, 0x557, 0x558, 0x560, 0x588, + 0x58b, + 0x58c, 0x590, 0x61c, 0x61d, @@ -79,10 +93,8 @@ const SINGLETONS0: &'static [u16] = &[ 0x83f, 0x85c, 0x85d, - 0x8a1, - 0x8ff, - 0x978, - 0x980, + 0x8b5, + 0x8e2, 0x984, 0x98d, 0x98e, @@ -154,14 +166,11 @@ const SINGLETONS0: &'static [u16] = &[ 0xc0d, 0xc11, 0xc29, - 0xc34, 0xc45, 0xc49, 0xc57, 0xc64, 0xc65, - 0xc80, - 0xc81, 0xc84, 0xc8d, 0xc91, @@ -193,6 +202,8 @@ const SINGLETONS0: &'static [u16] = &[ 0xdbf, 0xdd5, 0xdd7, + 0xdf0, + 0xdf1, 0xe83, 0xe85, 0xe86, @@ -245,6 +256,10 @@ const SINGLETONS0: &'static [u16] = &[ 0x1317, 0x135b, 0x135c, + 0x13f6, + 0x13f7, + 0x13fe, + 0x13ff, 0x1680, 0x170d, 0x176d, @@ -253,6 +268,7 @@ const SINGLETONS0: &'static [u16] = &[ 0x17df, 0x180e, 0x180f, + 0x191f, 0x196e, 0x196f, 0x1a1c, @@ -260,6 +276,9 @@ const SINGLETONS0: &'static [u16] = &[ 0x1a5f, 0x1a7d, 0x1a7e, + 0x1aae, + 0x1aaf, + 0x1cf7, 0x1f16, 0x1f17, 0x1f1e, @@ -285,7 +304,12 @@ const SINGLETONS0: &'static [u16] = &[ 0x2072, 0x2073, 0x208f, - 0x2700, + 0x23ff, + 0x2b74, + 0x2b75, + 0x2b96, + 0x2b97, + 0x2bc9, 0x2c2f, 0x2c5f, 0x2d26, @@ -306,8 +330,11 @@ const SINGLETONS0: &'static [u16] = &[ 0x318f, 0x321f, 0x32ff, - 0xa78f, + 0xa7af, + 0xa8fe, + 0xa8ff, 0xa9ce, + 0xa9ff, 0xaa4e, 0xaa4f, 0xaa5a, @@ -317,6 +344,7 @@ const SINGLETONS0: &'static [u16] = &[ 0xab0f, 0xab10, 0xab27, + 0xab2f, 0xabee, 0xabef, 0xfa6e, @@ -350,7 +378,7 @@ const SINGLETONS1: &'static [u16] = &[ 0x3e, 0x4e, 0x4f, - 0x31f, + 0x18f, 0x39e, 0x49e, 0x49f, @@ -361,6 +389,9 @@ const SINGLETONS1: &'static [u16] = &[ 0x83d, 0x83e, 0x856, + 0x8f3, + 0x9d0, + 0x9d1, 0xa04, 0xa14, 0xa18, @@ -368,6 +399,49 @@ const SINGLETONS1: &'static [u16] = &[ 0xb57, 0x10bd, 0x1135, + 0x11ce, + 0x11cf, + 0x11e0, + 0x1212, + 0x1287, + 0x1289, + 0x128e, + 0x129e, + 0x1304, + 0x130d, + 0x130e, + 0x1311, + 0x1312, + 0x1329, + 0x1331, + 0x1334, + 0x133a, + 0x133b, + 0x1345, + 0x1346, + 0x1349, + 0x134a, + 0x134e, + 0x134f, + 0x1364, + 0x1365, + 0x145a, + 0x145c, + 0x15b6, + 0x15b7, + 0x1c09, + 0x1c37, + 0x1c90, + 0x1c91, + 0x1ca8, + 0x246f, + 0x6a5f, + 0x6aee, + 0x6aef, + 0x6b5a, + 0x6b62, + 0xbc9a, + 0xbc9b, 0xd127, 0xd128, 0xd455, @@ -395,6 +469,14 @@ const SINGLETONS1: &'static [u16] = &[ 0xd6a7, 0xd7cc, 0xd7cd, + 0xdaa0, + 0xe007, + 0xe019, + 0xe01a, + 0xe022, + 0xe025, + 0xe8c5, + 0xe8c6, 0xee04, 0xee20, 0xee23, @@ -429,31 +511,25 @@ const SINGLETONS1: &'static [u16] = &[ 0xeeaa, 0xf0af, 0xf0b0, - 0xf0bf, 0xf0c0, 0xf0d0, 0xf12f, - 0xf336, - 0xf3c5, - 0xf43f, - 0xf441, - 0xf4f8, - 0xf53e, - 0xf53f, + 0xf91f, + 0xf931, + 0xf932, + 0xf93f, ]; const NORMAL0: &'static [u16] = &[ 0x0, 0x20, 0x7f, 0x22, - 0x37f, 0x5, - 0x528, 0x9, - 0x58b, 0x4, + 0x380, 0x4, 0x5c8, 0x8, 0x5eb, 0x5, 0x5f5, 0x11, 0x7b2, 0xe, 0x7fb, 0x5, 0x85f, 0x41, - 0x8ad, 0x37, + 0x8be, 0x16, 0x9b3, 0x3, 0x9cf, 0x8, 0x9d8, 0x4, @@ -465,7 +541,8 @@ const NORMAL0: &'static [u16] = &[ 0xa5f, 0x7, 0xa76, 0xb, 0xad1, 0xf, - 0xaf2, 0xf, + 0xaf2, 0x7, + 0xafa, 0x7, 0xb4e, 0x8, 0xb58, 0x4, 0xb78, 0xa, @@ -478,21 +555,19 @@ const NORMAL0: &'static [u16] = &[ 0xbc3, 0x3, 0xbd1, 0x6, 0xbd8, 0xe, - 0xbfb, 0x6, + 0xbfb, 0x5, 0xc3a, 0x3, 0xc4e, 0x7, - 0xc5a, 0x6, + 0xc5b, 0x5, 0xc70, 0x8, 0xcce, 0x7, 0xcd7, 0x7, - 0xcf3, 0xf, - 0xd4f, 0x8, - 0xd58, 0x8, - 0xd76, 0x3, + 0xcf3, 0xe, + 0xd50, 0x4, 0xd97, 0x3, 0xdc7, 0x3, 0xdcb, 0x4, - 0xde0, 0x12, + 0xde0, 0x6, 0xdf5, 0xc, 0xe3b, 0x4, 0xe5c, 0x25, @@ -503,9 +578,8 @@ const NORMAL0: &'static [u16] = &[ 0x10c8, 0x5, 0x137d, 0x3, 0x139a, 0x6, - 0x13f5, 0xb, 0x169d, 0x3, - 0x16f1, 0xf, + 0x16f9, 0x7, 0x1715, 0xb, 0x1737, 0x9, 0x1754, 0xc, @@ -516,7 +590,6 @@ const NORMAL0: &'static [u16] = &[ 0x1878, 0x8, 0x18ab, 0x5, 0x18f6, 0xa, - 0x191d, 0x3, 0x192c, 0x4, 0x193c, 0x4, 0x1941, 0x3, @@ -526,34 +599,34 @@ const NORMAL0: &'static [u16] = &[ 0x19db, 0x3, 0x1a8a, 0x6, 0x1a9a, 0x6, - 0x1aae, 0x52, + 0x1abf, 0x41, 0x1b4c, 0x4, 0x1b7d, 0x3, 0x1bf4, 0x8, 0x1c38, 0x3, 0x1c4a, 0x3, - 0x1c80, 0x40, + 0x1c89, 0x37, 0x1cc8, 0x8, - 0x1cf7, 0x9, - 0x1de7, 0x15, + 0x1cfa, 0x6, + 0x1df6, 0x5, 0x1fff, 0x11, 0x2028, 0x8, 0x205f, 0x11, 0x209d, 0x3, - 0x20ba, 0x16, + 0x20bf, 0x11, 0x20f1, 0xf, - 0x218a, 0x6, - 0x23f4, 0xc, + 0x218c, 0x4, 0x2427, 0x19, 0x244b, 0x15, - 0x2b4d, 0x3, - 0x2b5a, 0xa6, + 0x2bba, 0x3, + 0x2bd2, 0x1a, + 0x2bf0, 0x10, 0x2cf4, 0x5, 0x2d28, 0x5, 0x2d68, 0x7, 0x2d71, 0xe, 0x2d97, 0x9, - 0x2e3c, 0x44, + 0x2e45, 0x3b, 0x2ef4, 0xc, 0x2fd6, 0x1a, 0x2ffc, 0x5, @@ -561,32 +634,28 @@ const NORMAL0: &'static [u16] = &[ 0x312e, 0x3, 0x31bb, 0x5, 0x31e4, 0xc, - 0x3400, 0x19c0, - 0x4e00, 0x5200, + 0x4db6, 0xa, + 0x9fd6, 0x2a, 0xa48d, 0x3, 0xa4c7, 0x9, 0xa62c, 0x14, - 0xa698, 0x7, 0xa6f8, 0x8, - 0xa794, 0xc, - 0xa7ab, 0x4d, + 0xa7b8, 0x3f, 0xa82c, 0x4, 0xa83a, 0x6, 0xa878, 0x8, - 0xa8c5, 0x9, + 0xa8c6, 0x8, 0xa8da, 0x6, - 0xa8fc, 0x4, 0xa954, 0xb, 0xa97d, 0x3, 0xa9da, 0x4, - 0xa9e0, 0x20, 0xaa37, 0x9, - 0xaa7c, 0x4, 0xaac3, 0x18, 0xaaf7, 0xa, 0xab17, 0x9, - 0xab2f, 0x91, - 0xabfa, 0x2bb6, + 0xab66, 0xa, + 0xabfa, 0x6, + 0xd7a4, 0xc, 0xd7c7, 0x4, 0xd7fc, 0x2104, 0xfada, 0x26, @@ -596,7 +665,6 @@ const NORMAL0: &'static [u16] = &[ 0xfd40, 0x10, 0xfdc8, 0x28, 0xfe1a, 0x6, - 0xfe27, 0x9, 0xfe6c, 0x4, 0xfefd, 0x4, 0xffbf, 0x3, @@ -608,61 +676,123 @@ const NORMAL1: &'static [u16] = &[ 0xfb, 0x5, 0x103, 0x4, 0x134, 0x3, - 0x18b, 0x5, - 0x19c, 0x34, + 0x19c, 0x4, + 0x1a1, 0x2f, 0x1fe, 0x82, 0x29d, 0x3, - 0x2d1, 0x2f, + 0x2d1, 0xf, + 0x2fc, 0x4, 0x324, 0xc, - 0x34b, 0x35, + 0x34b, 0x5, + 0x37b, 0x5, 0x3c4, 0x4, 0x3d6, 0x2a, - 0x4aa, 0x356, + 0x4aa, 0x6, + 0x4d4, 0x4, + 0x4fc, 0x4, + 0x528, 0x8, + 0x564, 0xb, + 0x570, 0x90, + 0x737, 0x9, + 0x756, 0xa, + 0x768, 0x98, 0x839, 0x3, - 0x860, 0xa0, + 0x89f, 0x8, + 0x8b0, 0x30, + 0x8f6, 0x5, 0x91c, 0x3, 0x93a, 0x5, 0x940, 0x40, - 0x9b8, 0x6, - 0x9c0, 0x40, + 0x9b8, 0x4, 0xa07, 0x5, 0xa34, 0x4, 0xa3b, 0x4, 0xa48, 0x8, 0xa59, 0x7, - 0xa80, 0x80, + 0xaa0, 0x20, + 0xae7, 0x4, + 0xaf7, 0x9, 0xb36, 0x3, 0xb73, 0x5, - 0xb80, 0x80, - 0xc49, 0x217, + 0xb92, 0x7, + 0xb9d, 0xc, + 0xbb0, 0x50, + 0xc49, 0x37, + 0xcb3, 0xd, + 0xcf3, 0x7, + 0xd00, 0x160, 0xe7f, 0x181, 0x104e, 0x4, - 0x1070, 0x10, + 0x1070, 0xf, 0x10c2, 0xe, 0x10e9, 0x7, 0x10fa, 0x6, - 0x1144, 0x3c, - 0x11c9, 0x7, - 0x11da, 0x4a6, + 0x1144, 0xc, + 0x1177, 0x9, + 0x11f5, 0xb, + 0x123f, 0x41, + 0x12aa, 0x6, + 0x12eb, 0x5, + 0x12fa, 0x6, + 0x1351, 0x6, + 0x1358, 0x5, + 0x136d, 0x3, + 0x1375, 0x8b, + 0x145e, 0x22, + 0x14c8, 0x8, + 0x14da, 0xa6, + 0x15de, 0x22, + 0x1645, 0xb, + 0x165a, 0x6, + 0x166d, 0x13, 0x16b8, 0x8, - 0x16ca, 0x936, - 0x236f, 0x91, - 0x2463, 0xd, - 0x2474, 0xb8c, - 0x342f, 0x33d1, - 0x6a39, 0x4c7, + 0x16ca, 0x36, + 0x171a, 0x3, + 0x172c, 0x4, + 0x1740, 0x160, + 0x18f3, 0xc, + 0x1900, 0x1c0, + 0x1af9, 0x107, + 0x1c46, 0xa, + 0x1c6d, 0x3, + 0x1cb7, 0x349, + 0x239a, 0x66, + 0x2475, 0xb, + 0x2544, 0xabc, + 0x342f, 0xfd1, + 0x4647, 0x21b9, + 0x6a39, 0x7, + 0x6a6a, 0x4, + 0x6a70, 0x60, + 0x6af6, 0xa, + 0x6b46, 0xa, + 0x6b78, 0x5, + 0x6b90, 0x370, 0x6f45, 0xb, 0x6f7f, 0x10, - 0x6fa0, 0x4060, - 0xb002, 0x1ffe, + 0x6fa0, 0x40, + 0x6fe1, 0x1f, + 0x87ed, 0x13, + 0x8af3, 0x250d, + 0xb002, 0xbfe, + 0xbc6b, 0x5, + 0xbc7d, 0x3, + 0xbc89, 0x7, + 0xbca0, 0x1360, 0xd0f6, 0xa, 0xd173, 0x8, - 0xd1de, 0x22, + 0xd1e9, 0x17, 0xd246, 0xba, 0xd357, 0x9, 0xd372, 0x8e, 0xd547, 0x3, - 0xd800, 0x1600, + 0xda8c, 0xf, + 0xdab0, 0x550, + 0xe02b, 0x7d5, + 0xe8d7, 0x29, + 0xe94b, 0x5, + 0xe95a, 0x4, + 0xe960, 0x4a0, 0xee3c, 0x6, 0xee43, 0x4, 0xee9c, 0x5, @@ -670,24 +800,27 @@ const NORMAL1: &'static [u16] = &[ 0xeef2, 0x10e, 0xf02c, 0x4, 0xf094, 0xc, - 0xf0e0, 0x20, - 0xf10b, 0x5, + 0xf0f6, 0xa, + 0xf10d, 0x3, 0xf16c, 0x4, - 0xf19b, 0x4b, + 0xf1ad, 0x39, 0xf203, 0xd, - 0xf23b, 0x5, + 0xf23c, 0x4, 0xf249, 0x7, 0xf252, 0xae, - 0xf321, 0xf, - 0xf37d, 0x3, - 0xf394, 0xc, - 0xf3cb, 0x15, - 0xf3f1, 0xf, - 0xf4fd, 0x3, - 0xf544, 0xc, - 0xf568, 0x93, - 0xf641, 0x4, - 0xf650, 0x30, - 0xf6c6, 0x3a, - 0xf774, 0x88c, + 0xf6d3, 0xd, + 0xf6ed, 0x3, + 0xf6f7, 0x9, + 0xf774, 0xc, + 0xf7d5, 0x2b, + 0xf80c, 0x4, + 0xf848, 0x8, + 0xf85a, 0x6, + 0xf888, 0x8, + 0xf8ae, 0x62, + 0xf928, 0x8, + 0xf94c, 0x4, + 0xf95f, 0x21, + 0xf992, 0x2e, + 0xf9c1, 0x63f, ]; diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs index 7da0b6902f2..b4088ffbf89 100644 --- a/src/libcoretest/char.rs +++ b/src/libcoretest/char.rs @@ -162,6 +162,8 @@ fn test_escape_debug() { assert_eq!(s, "~"); let s = string('é'); assert_eq!(s, "é"); + let s = string('文'); + assert_eq!(s, "文"); let s = string('\x00'); assert_eq!(s, "\\u{0}"); let s = string('\x1f');