scalar/escape: various touchups

- Renamed fn flow_double_unescape -> flow_unescape
- Added unit tests for flow_unescape return
- Improved existing test coverage for hex escapes
- Documented module + flow_unescape
- Slight refactoring of module layout for readability
This commit is contained in:
Paul Stemmet 2021-06-18 19:30:21 +00:00 committed by Paul Stemmet
parent a750275b47
commit 4dc5efae33
1 changed files with 127 additions and 22 deletions

View File

@ -1,11 +1,21 @@
//! This module exports function(s) for handling scalar
//! escapes in YAML documents.
use crate::scanner::error::{ScanError, ScanResult as Result};
const NEL: [u8; 2] = [b'\xC2', b'\x85'];
const NBS: [u8; 2] = [b'\xC2', b'\xA0'];
const LS: [u8; 3] = [b'\xE2', b'\x80', b'\xA8'];
const PS: [u8; 3] = [b'\xE2', b'\x80', b'\xA9'];
fn flow_double_unescape<'b, 'c>(base: &'b str, scratch: &'c mut Vec<u8>) -> Result<usize>
/// Unescape a given YAML escape sequence as defined in
/// [Section 5.7][Link]. Specifically, YAML defines 18
/// 'special' escapes, and 3 methods of encoding 8, 16 and
/// 32 bit unicode points.
///
/// It writes the unescaped character to .scratch, returning
/// the length of .buffer advanced, or an error if the
/// escape sequence is invalid. It expects .buffer->0 is a
/// backslash (\\), as this is the only valid start of an
/// escape sequence.
///
/// [Link]: https://yaml.org/spec/1.2/spec.html#c-escape
pub(super) fn flow_unescape(base: &str, scratch: &mut Vec<u8>) -> Result<usize>
{
let mut buffer = base;
let mut escape_len: Option<u8> = None;
@ -77,13 +87,16 @@ fn write_unicode_point<'b, 'c>(
{
match buffer.as_bytes().first()
{
Some(c) if !c.is_ascii_hexdigit() => return Err(ScanError::UnknownEscape),
None => return Err(ScanError::UnexpectedEOF),
Some(c) if !c.is_ascii_hexdigit() => return Err(ScanError::UnknownEscape),
Some(b) => value = (value << 4) + as_hex(*b),
}
advance!(buffer, 1, i);
}
// Bit shift the value into the correct byte configuration
// for UTF8
match value
{
// v <= 127 (ASCII)
@ -91,20 +104,20 @@ fn write_unicode_point<'b, 'c>(
// v <= 2047
v if v <= 0x7FF =>
{
scratch.extend_from_slice(&[0xC0 + (v >> 6) as u8, 0x80 + (value & 0x3F) as u8])
scratch.extend_from_slice(&[0xC0 | (v >> 6) as u8, 0x80 | (v & 0x3F) as u8])
},
// v <= 65535
v if v <= 0xFFFF => scratch.extend_from_slice(&[
0xE0 + (v >> 12) as u8,
0x80 + ((v >> 6) & 0x3F) as u8,
0x80 + (v & 0x3F) as u8,
0xE0 | (v >> 12) as u8,
0x80 | ((v >> 6) & 0x3F) as u8,
0x80 | (v & 0x3F) as u8,
]),
// Otherwise it must be a full 4 byte code point
v => scratch.extend_from_slice(&[
0xF0 + (v >> 18) as u8,
0x80 + ((v >> 12) & 0x3F) as u8,
0x80 + ((v >> 6) & 0x3F) as u8,
0x80 + (v & 0x3F) as u8,
0xF0 | (v >> 18) as u8,
0x80 | ((v >> 12) & 0x3F) as u8,
0x80 | ((v >> 6) & 0x3F) as u8,
0x80 | (v & 0x3F) as u8,
]),
}
@ -130,6 +143,15 @@ fn as_hex(b: u8) -> u32
ret as u32
}
/// <Next Line> (U+0085)
const NEL: [u8; 2] = [b'\xC2', b'\x85'];
/// <No-Break Space> (U+00A0)
const NBS: [u8; 2] = [b'\xC2', b'\xA0'];
/// <Line Separator> (U+2028)
const LS: [u8; 3] = [b'\xE2', b'\x80', b'\xA8'];
/// <Paragraph Separator> (U+2029)
const PS: [u8; 3] = [b'\xE2', b'\x80', b'\xA9'];
#[cfg(test)]
mod tests
{
@ -141,7 +163,7 @@ mod tests
type TestResult = anyhow::Result<()>;
#[test]
fn flow_d_escape_special() -> TestResult
fn flow_escape_special() -> TestResult
{
let mut s = Vec::new();
let scratch = &mut s;
@ -181,7 +203,7 @@ mod tests
for (i, (&t, &ex)) in data.into_iter().zip(expected).enumerate()
{
scratch.clear();
flow_double_unescape(t, scratch)
flow_unescape(t, scratch)
.map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?;
assert_eq!(scratch, ex, "on iteration {}", i)
@ -191,12 +213,45 @@ mod tests
}
#[test]
fn flow_d_escape_hex() -> TestResult
fn flow_escape_hex() -> TestResult
{
let mut s = Vec::new();
let scratch = &mut s;
let data = &[r#"\x64"#];
let expected: &[&[u8]] = &[&[100]];
#[rustfmt::skip]
let data = &[
// === 1 byte
r#"\x64"#, // 0
r#"\x65"#, // 1
// === 2 bytes
r#"\x7f"#, // 2
r#"\xF7"#, // 3
r#"\xB6"#, // 4
r#"\xFF"#, // 5
r#"\xC6"#, // 6
r#"\u2c61"#, // 7
r#"\u30C4"#, // 8
r#"\ua026"#, // 9
// === 4 bytes
r#"\U000111E1"#, // 10
];
#[rustfmt::skip]
let expected = &[
// === 1 byte
'd', // 0
'e', // 1
// === 2 bytes
'\u{7f}', // 2
'÷', // 3
'¶', // 4
'ÿ', // 5
'Æ', // 6
// === 3 bytes
'ⱡ', // 7
'ツ', // 8
'ꀦ', // 9
// === 4 bytes
'𑇡' // 10
];
assert_eq!(
data.len(),
@ -206,11 +261,61 @@ mod tests
for (i, (&t, &ex)) in data.into_iter().zip(expected).enumerate()
{
let mut c: [u8; 4] = [0; 4];
scratch.clear();
flow_double_unescape(t, scratch)
flow_unescape(t, scratch)
.map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?;
assert_eq!(scratch, ex, "on iteration {}", i)
assert_eq!(
scratch,
ex.encode_utf8(&mut c).as_bytes(),
"on iteration {}, codepoint '{}'",
i,
ex
)
}
Ok(())
}
#[test]
fn flow_escape_consumed() -> TestResult
{
let mut s = Vec::new();
let scratch = &mut s;
let data = &[
// === 1 byte
r#"\x64"#, // 0
r#"\x65"#, // 1
// === 2 bytes
r#"\x7f"#, // 2
r#"\xF7"#, // 3
r#"\xB6"#, // 4
r#"\xFF"#, // 5
r#"\xC6"#, // 6
r#"\u2c61"#, // 7
r#"\u30C4"#, // 8
r#"\ua026"#, // 9
// === 4 bytes
r#"\U000111E1"#, // 10
];
for (i, &t) in data.into_iter().enumerate()
{
scratch.clear();
let consumed = flow_unescape(t, scratch)
.map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?;
assert_eq!(
consumed,
t.len(),
"on iteration {}, expected to consume {}, got {}",
i,
t.len(),
consumed
)
}
Ok(())