Support Unicode in string literals.

2015-09-03 23:38:12 -04:00 · 2015-09-03 23:38:12 -04:00 · 61f642f6f8
commit 61f642f6f8
parent b0baa3d06e
8 changed files with 52 additions and 51 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8,6 +8,7 @@ dependencies = [
 "strings 0.0.1 (git+https://github.com/nrc/strings.rs.git)",
 "term 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
 "toml 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
@@ -87,6 +88,11 @@ dependencies = [
 "rustc-serialize 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

+[[package]]
+name = "unicode-segmentation"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "winapi"
 version = "0.2.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,8 @@ git = "https://github.com/nrc/strings.rs.git"
 [dependencies]
 toml = "0.1.20"
 rustc-serialize = "0.3.14"
+unicode-segmentation = "0.1.2"
+regex = "0.1.41"

 [dev-dependencies]
 diff = "0.1.0"
--- a/src/expr.rs
+++ b/src/expr.rs
@@ -28,8 +28,8 @@ impl Rewrite for ast::Expr {
        match self.node {
            ast::Expr_::ExprLit(ref l) => {
                match l.node {
-                    ast::Lit_::LitStr(ref is, ast::StrStyle::CookedStr) => {
-                        rewrite_string_lit(context, &is, l.span, width, offset)
+                    ast::Lit_::LitStr(_, ast::StrStyle::CookedStr) => {
+                        rewrite_string_lit(context, l.span, width, offset)
                    }
                    _ => Some(context.snippet(self.span)),
                }
@@ -823,7 +823,6 @@ fn rewrite_pat_expr(context: &RewriteContext,
 }

 fn rewrite_string_lit(context: &RewriteContext,
-                      s: &str,
                      span: Span,
                      width: usize,
                      offset: usize)
@@ -842,7 +841,10 @@ fn rewrite_string_lit(context: &RewriteContext,
        trim_end: false,
    };

-    Some(rewrite_string(&s.escape_default(), &fmt))
+    let string_lit = context.snippet(span);
+    let str_lit = &string_lit[1..string_lit.len() - 1]; // Remove the quote characters.
+
+    Some(rewrite_string(str_lit, &fmt))
 }

 fn rewrite_call(context: &RewriteContext,
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,8 +9,6 @@
 // except according to those terms.

 #![feature(rustc_private)]
-#![feature(str_escape)]
-#![feature(str_char)]
 #![feature(custom_attribute)]
 #![allow(unused_attributes)]

@@ -30,6 +28,9 @@ extern crate rustc_serialize;

 extern crate strings;

+extern crate unicode_segmentation;
+extern crate regex;
+
 use rustc::session::Session;
 use rustc::session::config as rustc_config;
 use rustc::session::config::Input;
--- a/src/string.rs
+++ b/src/string.rs
@@ -10,7 +10,12 @@

 // Format string literals.

-use utils::{make_indent, next_char, prev_char, round_up_to_power_of_two};
+
+
+use unicode_segmentation::UnicodeSegmentation;
+use regex::Regex;
+
+use utils::{make_indent, round_up_to_power_of_two};

 use MIN_STRING;

@@ -26,8 +31,12 @@ pub struct StringFormat<'a> {

 // TODO: simplify this!
 pub fn rewrite_string<'a>(s: &str, fmt: &StringFormat<'a>) -> String {
-    // FIXME I bet this stomps unicode escapes in the source string
    // TODO if lo.col > IDEAL - 10, start a new line (need cur indent for that)
+    // Strip line breaks.
+    let re = Regex::new(r"(\\[:space:]+)").unwrap();
+    let stripped_str = re.replace_all(s, "");
+
+    let graphemes = UnicodeSegmentation::graphemes(&*stripped_str, false).collect::<Vec<&str>>();

    let indent = make_indent(fmt.offset);
    let indent = &indent;
@@ -39,41 +48,36 @@ pub fn rewrite_string<'a>(s: &str, fmt: &StringFormat<'a>) -> String {
    let ender_length = fmt.line_end.len();
    let max_chars = fmt.width.checked_sub(fmt.opener.len()).unwrap_or(0)
                             .checked_sub(ender_length).unwrap_or(1);
-
    loop {
        let mut cur_end = cur_start + max_chars;

-        if cur_end >= s.len() {
-            result.push_str(&s[cur_start..]);
+        if cur_end >= graphemes.len() {
+            let line = &graphemes[cur_start..].join("");
+            result.push_str(line);
            break;
        }
-
-        // Make sure we're on a char boundary.
-        cur_end = next_char(&s, cur_end);
-
        // Push cur_end left until we reach whitespace.
-        while !s.char_at(cur_end - 1).is_whitespace() {
-            cur_end = prev_char(&s, cur_end);
-
+        while !(graphemes[cur_end - 1].trim().len() == 0) {
+            cur_end -= 1;
            if cur_end - cur_start < MIN_STRING {
                // We can't break at whitespace, fall back to splitting
                // anywhere that doesn't break an escape sequence.
-                cur_end = next_char(&s, cur_start + max_chars);
-                while s.char_at(prev_char(&s, cur_end)) == '\\' {
-                    cur_end = prev_char(&s, cur_end);
+                cur_end = cur_start + max_chars;
+                while graphemes[cur_end - 1] == "\\" {
+                    cur_end -= 1;
                }
                break;
            }
        }
        // Make sure there is no whitespace to the right of the break.
-        while cur_end < s.len() && s.char_at(cur_end).is_whitespace() {
-            cur_end = next_char(&s, cur_end + 1);
+        while cur_end < s.len() && graphemes[cur_end].trim().len() == 0 {
+            cur_end += 1;
        }
-
+        let raw_line = graphemes[cur_start..cur_end].join("");
        let line: &str = if fmt.trim_end {
-            &s[cur_start..cur_end].trim_right_matches(char::is_whitespace)
+            &(raw_line.trim())
        } else {
-            &s[cur_start..cur_end]
+            &raw_line
        };

        result.push_str(line);
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -32,31 +32,6 @@ pub fn span_after(original: Span, needle: &str, codemap: &CodeMap) -> BytePos {
    original.lo + BytePos(snippet.find_uncommented(needle).unwrap() as u32 + 1)
 }

-#[inline]
-pub fn prev_char(s: &str, mut i: usize) -> usize {
-    if i == 0 {
-        return 0;
-    }
-
-    i -= 1;
-    while !s.is_char_boundary(i) {
-        i -= 1;
-    }
-    i
-}
-
-#[inline]
-pub fn next_char(s: &str, mut i: usize) -> usize {
-    if i >= s.len() {
-        return s.len();
-    }
-
-    while !s.is_char_boundary(i) {
-        i += 1;
-    }
-    i
-}
-
 #[inline]
 pub fn make_indent(width: usize) -> String {
    let mut indent = String::with_capacity(width);
--- a/tests/source/string-lit.rs
+++ b/tests/source/string-lit.rs
@@ -25,5 +25,10 @@ formatting"#;
    let xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx =
        funktion("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
        
+    let unicode = "a̐éö̲\r\n";
+    let unicode2 = "Löwe 老虎 Léopard";
+    let unicode3 = "中华Việt Nam";
+    let unicode4 = "☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃";
+
    "stuff"
 }
--- a/tests/target/string-lit.rs
+++ b/tests/target/string-lit.rs
@@ -30,5 +30,11 @@ formatting"#;
                                                                            yyyyyyyyyyyyyyyyyyyyy\
                                                                            yyyyyyyyyy");

+    let unicode = "a̐éö̲\r\n";
+    let unicode2 = "Löwe 老虎 Léopard";
+    let unicode3 = "中华Việt Nam";
+    let unicode4 = "☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃\
+                    ☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃";
+
    "stuff"
 }