initial step towards implementing C string literals

This commit is contained in:
Deadbeef 2023-03-05 15:03:22 +00:00
parent 7b99493492
commit 8ff3903643
17 changed files with 312 additions and 82 deletions

View file

@ -1814,6 +1814,8 @@ pub enum LitKind {
/// A byte string (`b"foo"`). Not stored as a symbol because it might be
/// non-utf8, and symbols only allow utf8 strings.
ByteStr(Lrc<[u8]>, StrStyle),
/// A C String (`c"foo"`).
CStr(Lrc<[u8]>, StrStyle),
/// A byte char (`b'f'`).
Byte(u8),
/// A character literal (`'a'`).
@ -1868,6 +1870,7 @@ impl LitKind {
// unsuffixed variants
LitKind::Str(..)
| LitKind::ByteStr(..)
| LitKind::CStr(..)
| LitKind::Byte(..)
| LitKind::Char(..)
| LitKind::Int(_, LitIntType::Unsuffixed)

View file

@ -74,6 +74,8 @@ pub enum LitKind {
StrRaw(u8), // raw string delimited by `n` hash symbols
ByteStr,
ByteStrRaw(u8), // raw byte string delimited by `n` hash symbols
CStr,
CStrRaw(u8),
Err,
}
@ -141,6 +143,10 @@ impl fmt::Display for Lit {
delim = "#".repeat(n as usize),
string = symbol
)?,
CStr => write!(f, "c\"{symbol}\"")?,
CStrRaw(n) => {
write!(f, "cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize))?
}
Integer | Float | Bool | Err => write!(f, "{symbol}")?,
}
@ -170,6 +176,7 @@ impl LitKind {
Float => "float",
Str | StrRaw(..) => "string",
ByteStr | ByteStrRaw(..) => "byte string",
CStr | CStrRaw(..) => "C string",
Err => "error",
}
}

View file

@ -2,7 +2,10 @@
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token};
use rustc_lexer::unescape::{byte_from_char, unescape_byte, unescape_char, unescape_literal, Mode};
use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
Mode,
};
use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span;
use std::{ascii, fmt, str};
@ -158,6 +161,52 @@ impl LitKind {
LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
}
token::CStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_c_string(s, Mode::CStr, &mut |span, c| match c {
Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
error = Err(LitError::NulInCStr(span));
}
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
error?;
buf.push(b'\0');
LitKind::CStr(buf.into(), StrStyle::Cooked)
}
token::CStrRaw(n) => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
error = Err(LitError::NulInCStr(span));
}
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
error?;
buf.push(b'\0');
LitKind::CStr(buf.into(), StrStyle::Raw(n))
}
token::Err => LitKind::Err,
})
}
@ -191,6 +240,8 @@ impl fmt::Display for LitKind {
string = symbol
)?;
}
// TODO need to reescape
LitKind::CStr(..) => todo!(),
LitKind::Int(n, ty) => {
write!(f, "{n}")?;
match ty {
@ -237,6 +288,8 @@ impl MetaItemLit {
LitKind::Str(_, ast::StrStyle::Raw(n)) => token::StrRaw(n),
LitKind::ByteStr(_, ast::StrStyle::Cooked) => token::ByteStr,
LitKind::ByteStr(_, ast::StrStyle::Raw(n)) => token::ByteStrRaw(n),
LitKind::CStr(_, ast::StrStyle::Cooked) => token::CStr,
LitKind::CStr(_, ast::StrStyle::Raw(n)) => token::CStrRaw(n),
LitKind::Byte(_) => token::Byte,
LitKind::Char(_) => token::Char,
LitKind::Int(..) => token::Integer,

View file

@ -210,6 +210,8 @@ pub fn literal_to_string(lit: token::Lit) -> String {
token::ByteStrRaw(n) => {
format!("br{delim}\"{string}\"{delim}", delim = "#".repeat(n as usize), string = symbol)
}
// TODO
token::CStr | token::CStrRaw(_) => todo!(),
token::Integer | token::Float | token::Bool | token::Err => symbol.to_string(),
};

View file

@ -32,6 +32,10 @@ pub fn expand_concat(
Ok(ast::LitKind::Bool(b)) => {
accumulator.push_str(&b.to_string());
}
Ok(ast::LitKind::CStr(..)) => {
cx.span_err(e.span, "cannot concatenate a C string literal");
has_errors = true;
}
Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
cx.emit_err(errors::ConcatBytestr { span: e.span });
has_errors = true;

View file

@ -18,6 +18,10 @@ fn invalid_type_err(
};
let snippet = cx.sess.source_map().span_to_snippet(span).ok();
match ast::LitKind::from_token_lit(token_lit) {
Ok(ast::LitKind::CStr(_, _)) => {
    // C string literals are rejected with a plain span error.
    // TODO(review): the sibling arms build structured suggestions; presumably this
    // arm should eventually do the same — confirm before extending.
    cx.span_err(span, "cannot concatenate C string literals");
}
Ok(ast::LitKind::Char(_)) => {
let sugg =
snippet.map(|snippet| ConcatBytesInvalidSuggestion::CharLit { span, snippet });

View file

@ -61,6 +61,8 @@ impl FromInternal<token::LitKind> for LitKind {
token::StrRaw(n) => LitKind::StrRaw(n),
token::ByteStr => LitKind::ByteStr,
token::ByteStrRaw(n) => LitKind::ByteStrRaw(n),
// TODO
token::CStr | token::CStrRaw(_) => todo!(),
token::Err => LitKind::Err,
token::Bool => unreachable!(),
}
@ -436,6 +438,8 @@ impl server::FreeFunctions for Rustc<'_, '_> {
| token::LitKind::StrRaw(_)
| token::LitKind::ByteStr
| token::LitKind::ByteStrRaw(_)
| token::LitKind::CStr
| token::LitKind::CStrRaw(_)
| token::LitKind::Err => return Err(()),
token::LitKind::Integer | token::LitKind::Float => {}
}

View file

@ -332,6 +332,7 @@ language_item_table! {
RangeTo, sym::RangeTo, range_to_struct, Target::Struct, GenericRequirement::None;
String, sym::String, string, Target::Struct, GenericRequirement::None;
CStr, sym::CStr, c_str, Target::Struct, GenericRequirement::None;
}
pub enum GenericRequirement {

View file

@ -1300,6 +1300,11 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> {
opt_ty.unwrap_or_else(|| self.next_float_var())
}
ast::LitKind::Bool(_) => tcx.types.bool,
ast::LitKind::CStr(_, _) => tcx.mk_imm_ref(
tcx.lifetimes.re_static,
tcx.type_of(tcx.require_lang_item(hir::LangItem::CStr, Some(lit.span)))
.skip_binder(),
),
ast::LitKind::Err => tcx.ty_error_misc(),
}
}

View file

@ -186,12 +186,16 @@ pub enum LiteralKind {
Str { terminated: bool },
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// `c"abc"`, `c"abc`
CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
/// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
RawCStr { n_hashes: Option<u8> },
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@ -391,6 +395,32 @@ impl Cursor<'_> {
_ => self.ident_or_unknown_prefix(),
},
// TODO deduplicate this code
// c-string literal, raw c-string literal or identifier.
'c' => match (self.first(), self.second()) {
('"', _) => {
self.bump();
let terminated = self.double_quoted_string();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
let kind = CStr { terminated };
Literal { kind, suffix_start }
}
('r', '"') | ('r', '#') => {
self.bump();
let res = self.raw_double_quoted_string(2);
let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
let kind = RawCStr { n_hashes: res.ok() };
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
},
// Identifier (this should be checked after other variant that can
// start as identifier).
c if is_id_start(c) => self.ident_or_unknown_prefix(),

View file

@ -90,6 +90,39 @@ where
Mode::RawStr | Mode::RawByteStr => {
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
}
Mode::CStr | Mode::RawCStr => unreachable!(),
}
}
/// One unit reported to the callback of `unescape_c_string`.
///
/// A C string literal can yield either a single raw byte or a full Unicode
/// scalar value; the consumer decides how each one is stored.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CStrUnit {
    /// A single byte value.
    Byte(u8),
    /// A Unicode scalar value.
    Char(char),
}
impl From<u8> for CStrUnit {
fn from(value: u8) -> Self {
CStrUnit::Byte(value)
}
}
impl From<char> for CStrUnit {
fn from(value: char) -> Self {
CStrUnit::Char(value)
}
}
/// Unescapes the contents of a C string literal and reports every unit, or
/// every error, to `callback` together with the range of `src` it came from.
///
/// For `Mode::RawCStr` the raw-string unescaper is used and each character it
/// produces is wrapped in `CStrUnit::Char`. Every other mode is handed to
/// `unescape_str_common` unchanged.
pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
{
    match mode {
        Mode::RawCStr => {
            // The raw unescaper only yields `char`s; adapt them to `CStrUnit`.
            unescape_raw_str_or_raw_byte_str(
                src,
                mode.characters_should_be_ascii(),
                &mut |range, unit| callback(range, unit.map(CStrUnit::Char)),
            );
        }
        _ => {
            unescape_str_common(src, mode, callback);
        }
    }
}
@ -114,19 +147,26 @@ pub enum Mode {
ByteStr,
RawStr,
RawByteStr,
CStr,
RawCStr,
}
impl Mode {
/// Returns `true` for every string-like mode (`Str`, `ByteStr`, `RawStr`,
/// `RawByteStr`, `CStr`, `RawCStr`) and `false` for `Char` and `Byte`.
pub fn in_double_quotes(self) -> bool {
    // Each variant is listed exactly once so the match stays exhaustive and
    // free of unreachable alternatives.
    match self {
        Mode::Str
        | Mode::ByteStr
        | Mode::RawStr
        | Mode::RawByteStr
        | Mode::CStr
        | Mode::RawCStr => true,
        Mode::Char | Mode::Byte => false,
    }
}
/// Returns `true` for `Byte`, `ByteStr`, `RawByteStr`, `CStr` and `RawCStr`,
/// and `false` for `Char`, `Str` and `RawStr`.
pub fn is_byte(self) -> bool {
    // Each variant is listed exactly once so the match stays exhaustive and
    // free of unreachable alternatives.
    match self {
        Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => true,
        Mode::Char | Mode::Str | Mode::RawStr => false,
    }
}
@ -163,64 +203,65 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError
value as char
}
'u' => {
// We've parsed '\u', now we have to parse '{..}'.
if chars.next() != Some('{') {
return Err(EscapeError::NoBraceInUnicodeEscape);
}
// First character must be a hexadecimal digit.
let mut n_digits = 1;
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
'}' => return Err(EscapeError::EmptyUnicodeEscape),
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
};
// First character is valid, now parse the rest of the number
// and closing brace.
loop {
match chars.next() {
None => return Err(EscapeError::UnclosedUnicodeEscape),
Some('_') => continue,
Some('}') => {
if n_digits > 6 {
return Err(EscapeError::OverlongUnicodeEscape);
}
// Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal.
if is_byte {
return Err(EscapeError::UnicodeEscapeInByte);
}
break std::char::from_u32(value).ok_or_else(|| {
if value > 0x10FFFF {
EscapeError::OutOfRangeUnicodeEscape
} else {
EscapeError::LoneSurrogateUnicodeEscape
}
})?;
}
Some(c) => {
let digit: u32 =
c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
n_digits += 1;
if n_digits > 6 {
// Stop updating value since we're sure that it's incorrect already.
continue;
}
value = value * 16 + digit;
}
};
}
}
'u' => scan_unicode(chars, is_byte)?,
_ => return Err(EscapeError::InvalidEscape),
};
Ok(res)
}
/// Parses the `{...}` part of a `\u{...}` escape; the `\u` itself has already
/// been consumed from `chars`.
///
/// Returns the decoded character, or the first `EscapeError` encountered.
/// Syntax errors are reported before value errors: a malformed or overlong
/// escape wins over "unicode escape in a byte literal", which in turn wins
/// over an out-of-range or surrogate value.
fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
    // An opening brace must follow `\u`.
    match chars.next() {
        Some('{') => {}
        _ => return Err(EscapeError::NoBraceInUnicodeEscape),
    }

    // The body must start with a hexadecimal digit.
    let mut value: u32 = match chars.next() {
        None => return Err(EscapeError::UnclosedUnicodeEscape),
        Some('_') => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
        Some('}') => return Err(EscapeError::EmptyUnicodeEscape),
        Some(first) => first.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
    };
    let mut n_digits = 1;

    // Consume the remaining digits (underscores are separators) up to `}`.
    loop {
        let c = match chars.next() {
            None => return Err(EscapeError::UnclosedUnicodeEscape),
            Some('_') => continue,
            Some('}') => break,
            Some(c) => c,
        };
        let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
        n_digits += 1;
        // Past six digits the escape is already known to be invalid, so the
        // value is no longer updated (which also keeps it from overflowing).
        if n_digits <= 6 {
            value = value * 16 + digit;
        }
    }

    if n_digits > 6 {
        return Err(EscapeError::OverlongUnicodeEscape);
    }
    // Incorrect syntax has higher priority for error reporting than a value
    // that is not allowed in this kind of literal.
    if is_byte {
        return Err(EscapeError::UnicodeEscapeInByte);
    }
    std::char::from_u32(value).ok_or_else(|| {
        if value > 0x10FFFF {
            EscapeError::OutOfRangeUnicodeEscape
        } else {
            EscapeError::LoneSurrogateUnicodeEscape
        }
    })
}
#[inline]
fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
if is_byte && !c.is_ascii() {
@ -266,7 +307,9 @@ where
// if unescaped '\' character is followed by '\n'.
// For details see [Rust language reference]
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
skip_ascii_whitespace(&mut chars, start, callback);
skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
callback(range, Err(err))
});
continue;
}
_ => scan_escape(&mut chars, is_byte),
@ -281,32 +324,32 @@ where
let end = src.len() - chars.as_str().len();
callback(start..end, res);
}
}
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
let tail = chars.as_str();
let first_non_space = tail
.bytes()
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
.unwrap_or(tail.len());
if tail[1..first_non_space].contains('\n') {
// The +1 accounts for the escaping slash.
let end = start + first_non_space + 1;
callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning));
}
let tail = &tail[first_non_space..];
if let Some(c) = tail.chars().nth(0) {
if c.is_whitespace() {
// For error reporting, we would like the span to contain the character that was not
// skipped. The +1 is necessary to account for the leading \ that started the escape.
let end = start + first_non_space + c.len_utf8() + 1;
callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning));
}
}
*chars = tail.chars();
/// Advances `chars` past a leading run of ASCII whitespace (space, tab, `\n`,
/// `\r`), reporting warnings through `callback`.
///
/// `start` is the offset at which the escape began; the `+ 1` in the reported
/// ranges accounts for the escaping backslash. The first byte of the remaining
/// input is excluded from the "multiple lines" check — presumably it is the
/// newline that directly follows the backslash (confirm against the caller).
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
where
    F: FnMut(Range<usize>, EscapeError),
{
    let rest = chars.as_str();
    // Length in bytes of the leading whitespace run.
    let skipped_len = rest
        .bytes()
        .position(|b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(rest.len());

    // A further newline inside the run means more than one line was skipped.
    if rest[1..skipped_len].contains('\n') {
        let end = start + skipped_len + 1;
        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
    }

    let remainder = &rest[skipped_len..];
    match remainder.chars().next() {
        // Non-ASCII whitespace is not skipped; the reported range includes
        // that character so the diagnostic can point at it.
        Some(c) if c.is_whitespace() => {
            let end = start + skipped_len + c.len_utf8() + 1;
            callback(start..end, EscapeError::UnskippedWhitespaceWarning);
        }
        _ => {}
    }
    *chars = remainder.chars();
}
/// Takes a contents of a string literal (without quotes) and produces a

View file

@ -415,6 +415,16 @@ impl<'a> StringReader<'a> {
}
self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
}
rustc_lexer::LiteralKind::CStr { terminated } => {
if !terminated {
self.sess.span_diagnostic.span_fatal_with_code(
self.mk_sp(start + BytePos(1), end),
"unterminated C string",
error_code!(E0767),
)
}
self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
}
rustc_lexer::LiteralKind::RawStr { n_hashes } => {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
@ -433,6 +443,15 @@ impl<'a> StringReader<'a> {
self.report_raw_str_error(start, 2);
}
}
rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::CStrRaw(n_hashes);
self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
} else {
self.report_raw_str_error(start, 2);
}
}
rustc_lexer::LiteralKind::Int { base, empty_int } => {
if empty_int {
let span = self.mk_sp(start, end);
@ -692,6 +711,51 @@ impl<'a> StringReader<'a> {
(token::Err, self.symbol_from_to(start, end))
}
}
/// Checks the contents of a C string literal for escape errors and returns
/// the token kind plus the symbol to store for it.
///
/// `start..end` spans the whole literal; `prefix_len` and `postfix_len` are
/// the numbers of bytes stripped from the front and back to obtain the
/// contents. Every error reported by the unescaper is emitted as a diagnostic.
/// If any of them is fatal the result is `token::Err` with the full literal
/// text as its symbol; otherwise it is `kind` with the contents interned.
fn cook_c_string(
    &self,
    kind: token::LitKind,
    mode: Mode,
    start: BytePos,
    end: BytePos,
    prefix_len: u32,
    postfix_len: u32,
) -> (token::LitKind, Symbol) {
    let content_start = start + BytePos(prefix_len);
    let content_end = end - BytePos(postfix_len);
    let lit_content = self.str_from_to(content_start, content_end);

    let mut has_fatal_err = false;
    unescape::unescape_c_string(lit_content, mode, &mut |range, result| {
        // Only errors matter here; the actual unescaping is done later.
        let err = match result {
            Ok(_) => return,
            Err(err) => err,
        };

        let span_with_quotes = self.mk_sp(start, end);
        // Translate the range inside the contents into an absolute span.
        let (rel_lo, rel_hi) = (range.start as u32, range.end as u32);
        let lo = content_start + BytePos(rel_lo);
        let hi = lo + BytePos(rel_hi - rel_lo);
        let span = self.mk_sp(lo, hi);

        has_fatal_err |= err.is_fatal();
        emit_unescape_error(
            &self.sess.span_diagnostic,
            lit_content,
            span_with_quotes,
            span,
            mode,
            range,
            err,
        );
    });

    if has_fatal_err {
        // For errors the symbol includes the quotes, which gives clearer
        // error messages.
        (token::Err, self.symbol_from_to(start, end))
    } else {
        // Normally the quotes are excluded from the symbol.
        (kind, Symbol::intern(lit_content))
    }
}
}
pub fn nfc_normalize(string: &str) -> Symbol {

View file

@ -82,6 +82,7 @@ use crate::str;
#[cfg_attr(not(test), rustc_diagnostic_item = "CStr")]
#[stable(feature = "core_c_str", since = "1.64.0")]
#[rustc_has_incoherent_inherent_impls]
#[cfg_attr(not(bootstrap), lang = "CStr")]
// FIXME:
// `fn from` in `impl From<&CStr> for Box<CStr>` current implementation relies
// on `CStr` being layout-compatible with `[u8]`.

View file

@ -811,7 +811,9 @@ impl<'src> Classifier<'src> {
| LiteralKind::Str { .. }
| LiteralKind::ByteStr { .. }
| LiteralKind::RawStr { .. }
| LiteralKind::RawByteStr { .. } => Class::String,
| LiteralKind::RawByteStr { .. }
| LiteralKind::CStr { .. }
| LiteralKind::RawCStr { .. } => Class::String,
// Number literals.
LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
},

View file

@ -284,6 +284,7 @@ impl<'a> NormalizedPat<'a> {
LitKind::Str(sym, _) => Self::LitStr(sym),
LitKind::ByteStr(ref bytes, _) => Self::LitBytes(bytes),
LitKind::Byte(val) => Self::LitInt(val.into()),
LitKind::CStr(ref bytes, _) => Self::LitBytes(bytes),
LitKind::Char(val) => Self::LitInt(val.into()),
LitKind::Int(val, _) => Self::LitInt(val),
LitKind::Bool(val) => Self::LitBool(val),

View file

@ -304,6 +304,11 @@ impl<'a, 'tcx> PrintVisitor<'a, 'tcx> {
kind!("ByteStr(ref {vec})");
chain!(self, "let [{:?}] = **{vec}", vec.value);
},
LitKind::CStr(ref vec, _) => {
bind!(self, vec);
kind!("CStr(ref {vec})");
chain!(self, "let [{:?}] = **{vec}", vec.value);
}
LitKind::Str(s, _) => {
bind!(self, s);
kind!("Str({s}, _)");

View file

@ -211,6 +211,7 @@ pub fn lit_to_mir_constant(lit: &LitKind, ty: Option<Ty<'_>>) -> Constant {
LitKind::Str(ref is, _) => Constant::Str(is.to_string()),
LitKind::Byte(b) => Constant::Int(u128::from(b)),
LitKind::ByteStr(ref s, _) => Constant::Binary(Lrc::clone(s)),
LitKind::CStr(ref s, _) => Constant::Binary(Lrc::clone(s)),
LitKind::Char(c) => Constant::Char(c),
LitKind::Int(n, _) => Constant::Int(n),
LitKind::Float(ref is, LitFloatType::Suffixed(fty)) => match fty {