From b7575fa0da53da968c5c705b978cb0f7f374916c Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 07:24:57 +0000 Subject: [PATCH 1/8] scanner/macros: fix error path syntax --- src/scanner/macros.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/scanner/macros.rs b/src/scanner/macros.rs index 97c892c..4de3a78 100644 --- a/src/scanner/macros.rs +++ b/src/scanner/macros.rs @@ -70,7 +70,7 @@ macro_rules! check { (@priv $buffer:expr, $offset:expr => $( $match:tt )|+) => { match $buffer.get($offset..) { Some(buffer) => check!(@priv buffer => $( $match )|+), - None => check!(@eofck $( $match )|+ ) + None => check!(@eofck $( $match )|+ ), } }; (@priv $buffer:expr => $( $match:tt )|+) => { @@ -81,16 +81,16 @@ macro_rules! check { }; (@priv $buffer:expr, $offset:expr => $( $match:tt )|+, else $error:expr) => { match $buffer.get($offset..) { - Some(buffer) => check!(@priv buffer => $( $match )|+ else $error), - None if check!(@eofck $( $match )|+ ) => Ok(()) - _ => Err($crate::scanner::error::ScanError::UnexpectedEOF) + Some(buffer) => check!(@priv buffer => $( $match )|+, else $error), + None if check!(@eofck $( $match )|+ ) => Ok(()), + _ => Err($crate::scanner::error::ScanError::UnexpectedEOF), } }; (@priv $buffer:expr => $( $match:tt )|+, else $error:expr) => { match $buffer { $( check!(@ptn $match) )|+ => Ok(()), [] => Err($crate::scanner::error::ScanError::UnexpectedEOF), - _ => Err($error) + _ => Err($error), } }; -- 2.40.1 From 8719af5ecf48ab31f4950b22730176a642ea6a00 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 07:34:37 +0000 Subject: [PATCH 2/8] scanner/macros: add isHex! --- src/scanner/macros.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/scanner/macros.rs b/src/scanner/macros.rs index 4de3a78..7a73ebb 100644 --- a/src/scanner/macros.rs +++ b/src/scanner/macros.rs @@ -186,6 +186,29 @@ macro_rules! isBlankZ { }; } +/// Checks if byte (@ .offset) in .buffer is hexadecimal +/// +/// Modifiers: +/// ~ .buffer := .buffer.as_bytes() +/// +/// Variants: +/// /1 .buffer := /2 .buffer, 0 +/// /2 .buffer, .offset +/// /3 .buffer, else .error +/// := /4 .buffer, 0, else .error +/// /4 .buffer, .offset, else .error +macro_rules! isHex { + (~ $buffer:expr $(, $offset:expr )? $(, else $error:expr )? ) => { + isHex!($buffer.as_bytes() $(, $offset)? $(, else $error)? ) + }; + ($buffer:expr $(, $offset:expr )? $(, else $error:expr)? ) => { + check!($buffer $(, $offset)? => + [b'0'..=b'9', ..] | [b'A'..=b'F', ..] | [b'a'..=b'f', ..] + $(, else $error )? + ) + }; +} + #[cfg(test)] mod tests { -- 2.40.1 From 1802bdbba4595681561885689182cb8ce11c0d80 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 07:36:23 +0000 Subject: [PATCH 3/8] scalar/escape: as_hex returns u8 This allows the caller to decide if a cast is necessary --- src/scanner/scalar/escape.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index 7b50be4..4ece140 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -86,7 +86,7 @@ fn write_unicode_point(base: &str, scratch: &mut Vec, codepoint_len: u8) -> None => return Err(ScanError::UnexpectedEOF), Some(c) if !c.is_ascii_hexdigit() => return Err(ScanError::UnknownEscape), - Some(b) => value = (value << 4) + as_hex(*b), + Some(b) => value = (value << 4) + as_hex(*b) as u32, } advance!(buffer, 1, i); } @@ -128,7 +128,7 @@ fn write_unicode_point(base: &str, scratch: &mut Vec, codepoint_len: u8) -> */ #[allow(clippy::manual_range_contains)] #[inline] -fn as_hex(b: u8) -> u32 +fn as_hex(b: u8) -> u8 { let ret = if b >= b'A' && b <= b'F' { @@ -143,7 +143,7 @@ fn as_hex(b: u8) -> u32 b - b'0' }; - ret as u32 + ret } /// (U+0085) -- 2.40.1 From c7a2b028f01a685c8aef44c0f3e0e6bb98986383 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 07:37:52 +0000 Subject: [PATCH 4/8] scalar/escape: add tag_uri_unescape This function unescapes percent encoded tag URIs, in accordance with Section 5.6 Miscellaneous Characters #ns-uri-char - Also add unit test for function --- src/scanner/scalar/escape.rs | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index 4ece140..6662b3c 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -120,6 +120,61 @@ fn write_unicode_point(base: &str, scratch: &mut Vec, codepoint_len: u8) -> Ok(codepoint_len as usize) } +fn tag_uri_unescape(base: &str, scratch: &mut Vec, _directive: bool) -> Result +{ + let mut buffer = base; + let mut codepoint_len: i8 = 0; + + while { + if !(buffer.len() >= 3) + { + return Err(ScanError::UnexpectedEOF); + } + + if !(check!(~buffer => b'%') && isHex!(~buffer, 1) && isHex!(~buffer, 2)) + { + return Err(ScanError::UnknownEscape); + } + + // Safety: we just checked that there are at least three + // bytes in the buffer + let octet: u8 = (as_hex(buffer.as_bytes()[1]) << 4) + as_hex(buffer.as_bytes()[2]); + + match codepoint_len + { + // First time through, determine how many octets this codepoint has + 0 => + { + codepoint_len = match octet + { + o if (o & 0x80) == 0x00 => 1, + o if (o & 0xE0) == 0xC0 => 2, + o if (o & 0xF0) == 0xE0 => 3, + o if (o & 0xF8) == 0xF0 => 4, + _ => return Err(ScanError::UnknownEscape), + } + }, + // Else ensure that the trailing octet is valid + _ => + { + if (octet & 0xC0) != 0x80 + { + return Err(ScanError::UnknownEscape); + } + }, + } + + scratch.push(octet); + codepoint_len -= 1; + advance!(buffer, 3); + + codepoint_len > 0 + } + {} + + Ok(base.len() - buffer.len()) +} + /* * Inclusive range suggested by clippy here is 5-10% * slower than doing it by hand, see @@ -323,4 +378,57 @@ mod tests Ok(()) } + + #[test] + fn tag_uri_unescape_codepoint() -> TestResult + { + let data = &[ + r#"%C2%85"#, + r#"%c5%b4"#, + r#"%E2%B1%bf"#, + r#"%E2%B8%BF"#, + r#"%f0%90%8f%95"#, + r#"%F0%90%AD%81"#, + ]; + let expected: &[&[u8]] = &[ + &[0xC2, 0x85], + &[0xC5, 0xB4], + &[0xE2, 0xB1, 0xBF], + &[0xE2, 0xB8, 0xBF], + &[0xF0, 0x90, 0x8F, 0x95], + &[0xF0, 0x90, 0xAD, 0x81], + ]; + let scratch = &mut Vec::new(); + + assert_eq!( + data.len(), + expected.len(), + "test data and expected data are not the same length" + ); + + for (i, (&t, &e)) in data.into_iter().zip(expected).enumerate() + { + scratch.clear(); + + let consumed = tag_uri_unescape(t, scratch, true) + .map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?; + + assert_eq!( + &*scratch, e, + "on iteration {}, expected byte sequence {:?}, got {:?}", + i, e, &*scratch + ); + + assert_eq!( + consumed, + t.len(), + "on iteration {}, expected to consume {}, got {}", + i, + t.len(), + consumed + ) + } + + Ok(()) + } } -- 2.40.1 From 20760ba9cd2977afa8cd29ce3fceed209e696d4b Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 07:50:44 +0000 Subject: [PATCH 5/8] scalar/escape: move exported fns to top, document tag_uri_unescape --- src/scanner/scalar/escape.rs | 114 ++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index 6662b3c..c131f45 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -67,60 +67,13 @@ pub(super) fn flow_unescape(base: &str, scratch: &mut Vec) -> Result Ok(base.len() - buffer.len()) } -/// Writes a UTF8 codepoint to the scratch space -fn write_unicode_point(base: &str, scratch: &mut Vec, codepoint_len: u8) -> Result -{ - let mut buffer = base; - let mut i = 0; - let mut value: u32 = 0; - - if codepoint_len < 1 - { - return Ok(0); - } - - while i < codepoint_len - { - match buffer.as_bytes().first() - { - None => return Err(ScanError::UnexpectedEOF), - Some(c) if !c.is_ascii_hexdigit() => return Err(ScanError::UnknownEscape), - - Some(b) => value = (value << 4) + as_hex(*b) as u32, - } - advance!(buffer, 1, i); - } - - // Bit shift the value into the correct byte configuration - // for UTF8 - match value - { - // v <= 127 (ASCII) - v if v <= 0x7F => scratch.push(v as u8), - // v <= 2047 - v if v <= 0x7FF => - { - scratch.extend_from_slice(&[0xC0 | (v >> 6) as u8, 0x80 | (v & 0x3F) as u8]) - }, - // v <= 65535 - v if v <= 0xFFFF => scratch.extend_from_slice(&[ - 0xE0 | (v >> 12) as u8, - 0x80 | ((v >> 6) & 0x3F) as u8, - 0x80 | (v & 0x3F) as u8, - ]), - // Otherwise it must be a full 4 byte code point - v => scratch.extend_from_slice(&[ - 0xF0 | (v >> 18) as u8, - 0x80 | ((v >> 12) & 0x3F) as u8, - 0x80 | ((v >> 6) & 0x3F) as u8, - 0x80 | (v & 0x3F) as u8, - ]), - } - - Ok(codepoint_len as usize) -} - -fn tag_uri_unescape(base: &str, scratch: &mut Vec, _directive: bool) -> Result +/// Unescape a percent encoded UTF8 tag escape sequence as +/// defined in [Section 5.6][Link], writing the code point +/// to the scratch, returning the length of .base consumed. +/// +/// [Link]: https://yaml.org/spec/1.2/spec.html#ns-uri-char +pub(super) fn tag_uri_unescape(base: &str, scratch: &mut Vec, _directive: bool) + -> Result { let mut buffer = base; let mut codepoint_len: i8 = 0; @@ -175,6 +128,59 @@ fn tag_uri_unescape(base: &str, scratch: &mut Vec, _directive: bool) -> Resu Ok(base.len() - buffer.len()) } +/// Writes a UTF8 codepoint to the scratch space +fn write_unicode_point(base: &str, scratch: &mut Vec, codepoint_len: u8) -> Result +{ + let mut buffer = base; + let mut i = 0; + let mut value: u32 = 0; + + if codepoint_len < 1 + { + return Ok(0); + } + + while i < codepoint_len + { + match buffer.as_bytes().first() + { + None => return Err(ScanError::UnexpectedEOF), + Some(c) if !c.is_ascii_hexdigit() => return Err(ScanError::UnknownEscape), + + Some(b) => value = (value << 4) + as_hex(*b) as u32, + } + advance!(buffer, 1, i); + } + + // Bit shift the value into the correct byte configuration + // for UTF8 + match value + { + // v <= 127 (ASCII) + v if v <= 0x7F => scratch.push(v as u8), + // v <= 2047 + v if v <= 0x7FF => + { + scratch.extend_from_slice(&[0xC0 | (v >> 6) as u8, 0x80 | (v & 0x3F) as u8]) + }, + // v <= 65535 + v if v <= 0xFFFF => scratch.extend_from_slice(&[ + 0xE0 | (v >> 12) as u8, + 0x80 | ((v >> 6) & 0x3F) as u8, + 0x80 | (v & 0x3F) as u8, + ]), + // Otherwise it must be a full 4 byte code point + v => scratch.extend_from_slice(&[ + 0xF0 | (v >> 18) as u8, + 0x80 | ((v >> 12) & 0x3F) as u8, + 0x80 | ((v >> 6) & 0x3F) as u8, + 0x80 | (v & 0x3F) as u8, + ]), + } + + Ok(codepoint_len as usize) +} + /* * Inclusive range suggested by clippy here is 5-10% * slower than doing it by hand, see -- 2.40.1 From ff76d34662f33edac7c4aede6e05e2d019549d3f Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 08:03:25 +0000 Subject: [PATCH 6/8] scalar/escape: fix flow_unescape documentation --- src/scanner/scalar/escape.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index c131f45..5e935e7 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -9,8 +9,8 @@ use crate::scanner::error::{ScanError, ScanResult as Result}; /// 32 bit unicode points. /// /// It writes the unescaped character to .scratch, returning -/// the length of .buffer advanced, or an error if the -/// escape sequence is invalid. It expects .buffer->0 is a +/// the length of .base advanced, or an error if the +/// escape sequence is invalid. It expects .base->0 is a /// backslash (\\), as this is the only valid start of an /// escape sequence. /// -- 2.40.1 From 8f75fa047ee53044f97a47df0d3fd7a027b8404b Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 08:04:03 +0000 Subject: [PATCH 7/8] scalar/escape: add more unit tests for tag_uri_unescape --- src/scanner/scalar/escape.rs | 42 +++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index 5e935e7..c202f30 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -219,7 +219,7 @@ const PS: [u8; 3] = [b'\xE2', b'\x80', b'\xA9']; #[cfg(test)] mod tests { - use anyhow::anyhow; + use anyhow::{anyhow, bail}; use pretty_assertions::assert_eq; use super::*; @@ -437,4 +437,44 @@ mod tests Ok(()) } + + #[test] + fn tag_uri_unescape_eof() -> TestResult + { + let data = r#"%C2%8"#; + let scratch = &mut Vec::new(); + let expected = ScanError::UnexpectedEOF; + + match tag_uri_unescape(data, scratch, true) + { + Err(e) if e == expected => Ok(()), + + Err(e) => bail!("expected error: {}, got different error: {}", expected, e), + Ok(amt) => bail!( + "expected error: {}, got unexpected value: {}", + expected, + amt + ), + } + } + + #[test] + fn tag_uri_unescape_invalid() -> TestResult + { + let data = r#"\xC285"#; + let scratch = &mut Vec::new(); + let expected = ScanError::UnknownEscape; + + match tag_uri_unescape(data, scratch, true) + { + Err(e) if e == expected => Ok(()), + + Err(e) => bail!("expected error: {}, got different error: {}", expected, e), + Ok(amt) => bail!( + "expected error: {}, got unexpected value: {}", + expected, + amt + ), + } + } } -- 2.40.1 From 93d45391c1a6268c74c7374df62861c36152b8ed Mon Sep 17 00:00:00 2001 From: Bazaah Date: Sat, 26 Jun 2021 08:21:32 +0000 Subject: [PATCH 8/8] scalar/escape: clippy lints --- src/scanner/scalar/escape.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index c202f30..72f651a 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -79,7 +79,7 @@ pub(super) fn tag_uri_unescape(base: &str, scratch: &mut Vec, _directive: bo let mut codepoint_len: i8 = 0; while { - if !(buffer.len() >= 3) + if buffer.len() < 3 { return Err(ScanError::UnexpectedEOF); } @@ -191,7 +191,7 @@ fn write_unicode_point(base: &str, scratch: &mut Vec, codepoint_len: u8) -> #[inline] fn as_hex(b: u8) -> u8 { - let ret = if b >= b'A' && b <= b'F' + if b >= b'A' && b <= b'F' { b - b'A' + 10 } @@ -202,9 +202,7 @@ fn as_hex(b: u8) -> u8 else { b - b'0' - }; - - ret + } } /// (U+0085) -- 2.40.1