Support unicode in string literals.

This commit is contained in:
Sinh Pham 2015-09-03 23:38:12 -04:00
parent b0baa3d06e
commit 61f642f6f8
8 changed files with 52 additions and 51 deletions

6
Cargo.lock generated
View file

@ -8,6 +8,7 @@ dependencies = [
"strings 0.0.1 (git+https://github.com/nrc/strings.rs.git)",
"term 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"toml 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -87,6 +88,11 @@ dependencies = [
"rustc-serialize 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unicode-segmentation"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.2.2"

View file

@ -15,6 +15,8 @@ git = "https://github.com/nrc/strings.rs.git"
[dependencies]
toml = "0.1.20"
rustc-serialize = "0.3.14"
unicode-segmentation = "0.1.2"
regex = "0.1.41"
[dev-dependencies]
diff = "0.1.0"

View file

@ -28,8 +28,8 @@ impl Rewrite for ast::Expr {
match self.node {
ast::Expr_::ExprLit(ref l) => {
match l.node {
ast::Lit_::LitStr(ref is, ast::StrStyle::CookedStr) => {
rewrite_string_lit(context, &is, l.span, width, offset)
ast::Lit_::LitStr(_, ast::StrStyle::CookedStr) => {
rewrite_string_lit(context, l.span, width, offset)
}
_ => Some(context.snippet(self.span)),
}
@ -823,7 +823,6 @@ fn rewrite_pat_expr(context: &RewriteContext,
}
fn rewrite_string_lit(context: &RewriteContext,
s: &str,
span: Span,
width: usize,
offset: usize)
@ -842,7 +841,10 @@ fn rewrite_string_lit(context: &RewriteContext,
trim_end: false,
};
Some(rewrite_string(&s.escape_default(), &fmt))
let string_lit = context.snippet(span);
let str_lit = &string_lit[1..string_lit.len() - 1]; // Remove the quote characters.
Some(rewrite_string(str_lit, &fmt))
}
fn rewrite_call(context: &RewriteContext,

View file

@ -9,8 +9,6 @@
// except according to those terms.
#![feature(rustc_private)]
#![feature(str_escape)]
#![feature(str_char)]
#![feature(custom_attribute)]
#![allow(unused_attributes)]
@ -30,6 +28,9 @@ extern crate rustc_serialize;
extern crate strings;
extern crate unicode_segmentation;
extern crate regex;
use rustc::session::Session;
use rustc::session::config as rustc_config;
use rustc::session::config::Input;

View file

@ -10,7 +10,12 @@
// Format string literals.
use utils::{make_indent, next_char, prev_char, round_up_to_power_of_two};
use unicode_segmentation::UnicodeSegmentation;
use regex::Regex;
use utils::{make_indent, round_up_to_power_of_two};
use MIN_STRING;
@ -26,8 +31,12 @@ pub struct StringFormat<'a> {
// TODO: simplify this!
pub fn rewrite_string<'a>(s: &str, fmt: &StringFormat<'a>) -> String {
// FIXME I bet this stomps unicode escapes in the source string
// TODO if lo.col > IDEAL - 10, start a new line (need cur indent for that)
// Strip line breaks.
let re = Regex::new(r"(\\[:space:]+)").unwrap();
let stripped_str = re.replace_all(s, "");
let graphemes = UnicodeSegmentation::graphemes(&*stripped_str, false).collect::<Vec<&str>>();
let indent = make_indent(fmt.offset);
let indent = &indent;
@ -39,41 +48,36 @@ pub fn rewrite_string<'a>(s: &str, fmt: &StringFormat<'a>) -> String {
let ender_length = fmt.line_end.len();
let max_chars = fmt.width.checked_sub(fmt.opener.len()).unwrap_or(0)
.checked_sub(ender_length).unwrap_or(1);
loop {
let mut cur_end = cur_start + max_chars;
if cur_end >= s.len() {
result.push_str(&s[cur_start..]);
if cur_end >= graphemes.len() {
let line = &graphemes[cur_start..].join("");
result.push_str(line);
break;
}
// Make sure we're on a char boundary.
cur_end = next_char(&s, cur_end);
// Push cur_end left until we reach whitespace.
while !s.char_at(cur_end - 1).is_whitespace() {
cur_end = prev_char(&s, cur_end);
while !(graphemes[cur_end - 1].trim().len() == 0) {
cur_end -= 1;
if cur_end - cur_start < MIN_STRING {
// We can't break at whitespace, fall back to splitting
// anywhere that doesn't break an escape sequence.
cur_end = next_char(&s, cur_start + max_chars);
while s.char_at(prev_char(&s, cur_end)) == '\\' {
cur_end = prev_char(&s, cur_end);
cur_end = cur_start + max_chars;
while graphemes[cur_end - 1] == "\\" {
cur_end -= 1;
}
break;
}
}
// Make sure there is no whitespace to the right of the break.
while cur_end < s.len() && s.char_at(cur_end).is_whitespace() {
cur_end = next_char(&s, cur_end + 1);
while cur_end < s.len() && graphemes[cur_end].trim().len() == 0 {
cur_end += 1;
}
let raw_line = graphemes[cur_start..cur_end].join("");
let line: &str = if fmt.trim_end {
&s[cur_start..cur_end].trim_right_matches(char::is_whitespace)
&(raw_line.trim())
} else {
&s[cur_start..cur_end]
&raw_line
};
result.push_str(line);

View file

@ -32,31 +32,6 @@ pub fn span_after(original: Span, needle: &str, codemap: &CodeMap) -> BytePos {
original.lo + BytePos(snippet.find_uncommented(needle).unwrap() as u32 + 1)
}
/// Returns the byte index of the nearest char boundary in `s` that lies
/// strictly before byte index `i`.
///
/// Returns `0` when `i` is `0`, since there is nothing before the start of
/// the string.
#[inline]
pub fn prev_char(s: &str, mut i: usize) -> usize {
    // Step left one byte at a time and stop on the first boundary we land on.
    // Index 0 is always a char boundary, so the walk cannot underflow.
    while i > 0 {
        i -= 1;
        if s.is_char_boundary(i) {
            break;
        }
    }
    i
}
/// Returns the byte index of the first char boundary in `s` at or after byte
/// index `i`.
///
/// If `i` is already on a boundary it is returned unchanged. Any `i` at or
/// beyond the end of the string yields `s.len()`.
#[inline]
pub fn next_char(s: &str, mut i: usize) -> usize {
    let end = s.len();
    // Step right until we land on a boundary; `end` itself is always a
    // boundary, so the walk never runs past the string.
    while i < end && !s.is_char_boundary(i) {
        i += 1;
    }
    // Clamp out-of-range starting indices to the end of the string.
    i.min(end)
}
#[inline]
pub fn make_indent(width: usize) -> String {
let mut indent = String::with_capacity(width);

View file

@ -24,6 +24,11 @@ formatting"#;
let xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx =
funktion("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
let unicode = "a̐éö̲\r\n";
let unicode2 = "Löwe 老虎 Léopard";
let unicode3 = "中华Việt Nam";
let unicode4 = "☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃";
"stuff"
}

View file

@ -30,5 +30,11 @@ formatting"#;
yyyyyyyyyyyyyyyyyyyyy\
yyyyyyyyyy");
let unicode = "a̐éö̲\r\n";
let unicode2 = "Löwe 老虎 Léopard";
let unicode3 = "中华Việt Nam";
let unicode4 = "☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃\
";
"stuff"
}