From 6d09555178035015870224d83d3af70279d3be93 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 08:04:10 +0000 Subject: [PATCH 01/19] Cargo: add dependencies.bitflags = 1 --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + 2 files changed, 8 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index f45dc2d..71fbb85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "ctor" version = "0.1.20" @@ -139,5 +145,6 @@ version = "0.1.0" dependencies = [ "anyhow", "atoi", + "bitflags", "pretty_assertions", ] diff --git a/Cargo.toml b/Cargo.toml index 2196b1e..4cc4d1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" [dependencies] atoi = "0.4" +bitflags = "1" [dev-dependencies] anyhow = "1" -- 2.43.5 From a0ad61b7ba18ebefeddb28913a6023b7d5d37c4b Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 08:07:59 +0000 Subject: [PATCH 02/19] scanner/flag: add Flags for Scanner control This struct is a C style bitflag container, which controls various aspects of Scanner functionality. The initial flags available are O_ZEROED, O_EXTENDABLE and O_LAZY. Read each's documentation for an explanation. --- src/scanner/flag.rs | 35 +++++++++++++++++++++++++++++++++++ src/scanner/mod.rs | 1 + 2 files changed, 36 insertions(+) create mode 100644 src/scanner/flag.rs diff --git a/src/scanner/flag.rs b/src/scanner/flag.rs new file mode 100644 index 0000000..4a868da --- /dev/null +++ b/src/scanner/flag.rs @@ -0,0 +1,35 @@ +use bitflags::bitflags; + +/// An empty, zeroed flag set. This is the default set, with +/// all other flags disabled. 
+pub const O_ZEROED: Flags = Flags::empty(); +/// Hints to the Scanner if the given byte slice can be +/// extended. Typically used when processing data in chunks, +/// or in circumstances when there may be more data in the +/// future. +/// +/// If this flag is set the Scanner will return a +/// ScanError::Extend if the byte stream terminates before a +/// token can be scanned. +pub const O_EXTENDABLE: Flags = Flags::EXTENDABLE; +/// Sets the Scanner to lazily process the underlying byte +/// stream. +/// +/// In particular, the Scanner will not fully process +/// scalars, only locating the start and end markers in the +/// stream. This means that any allocations, escape parsing +/// or line joins will be deferred until the caller +/// explicitly requests the token. This _also applies to +/// errors_ in the scalar itself, which will not be caught +/// until the caller requests the token! +pub const O_LAZY: Flags = Flags::LAZY; + +bitflags! { + /// Directives controlling various behaviors of the Scanner, + /// see each O_ variant for an explanation of how each works + #[derive(Default)] + pub struct Flags: u32 { + const EXTENDABLE = 0b00000001; + const LAZY = 0b00000010; + } +} diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index e31a8d7..3bf158c 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -8,6 +8,7 @@ mod context; mod directive; mod entry; mod error; +mod flag; mod key; mod scalar; mod stats; -- 2.43.5 From e7d68e85144b907198107d7970770eedc4f00a62 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:14:50 +0000 Subject: [PATCH 03/19] scanner/error: add variant Extend This variant suggests to the caller that they should extend the byte stream before calling the Scanner again. 
--- src/scanner/error.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scanner/error.rs b/src/scanner/error.rs index 73ceb9c..a89773b 100644 --- a/src/scanner/error.rs +++ b/src/scanner/error.rs @@ -72,6 +72,10 @@ pub enum ScanError /// An integer overflowed IntOverflow, + + /// The underlying buffer should be extended before + /// calling the Scanner again + Extend, } impl fmt::Display for ScanError -- 2.43.5 From c00e78a601f5449e02b188d9dc3c2495d9c13b24 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:06:25 +0000 Subject: [PATCH 04/19] scanner/macros: add cache! cache! allows the Scanner to state that it requires 'N' more codepoints before it can correctly process the byte stream. Its primary purpose is its interaction with O_EXTENDABLE, which allows the caller to hint to the Scanner that the buffer could grow; likewise, cache! returns an error that hints to the caller that they should extend the byte stream before calling the Scanner again -- or pass opts without O_EXTENDABLE. --- src/scanner/macros.rs | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/scanner/macros.rs b/src/scanner/macros.rs index a95801b..32fc0ef 100644 --- a/src/scanner/macros.rs +++ b/src/scanner/macros.rs @@ -87,6 +87,60 @@ macro_rules! cow { }; } +/// Check that the underlying .buffer has at least the given +/// number of UTF8 .codepoints available, returning an error +/// if O_EXTENDABLE is set in .opts. Returns the number of +/// _bytes_ read. +/// +/// Modifiers +/// ~ .buffer := .buffer.as_bytes() +/// +/// Variants +/// /1 .buffer, .codepoints +/// := /4 .buffer, @0, .codepoints, O_ZEROED +/// /2 .buffer, @.offset, .codepoints +/// := /4 .buffer, @.offset, .codepoints, O_ZEROED +/// /3 .buffer, .codepoints, .opts +/// := /4 .buffer @0, .codepoints, .opts +/// /4 .buffer, @.offset, .codepoints, .opts +macro_rules! cache { + (~$buffer:expr $(, @$offset:expr )?, $codepoints:expr $(, $opts:expr )?) 
=> { + cache!($buffer.as_bytes(), $( @$offset, )? $codepoints $(, $opts )?) + }; + ($buffer:expr $(, @$offset:expr )?, $codepoints:expr $(, $opts:expr )?) => { + cache!(@inner $buffer, $( @$offset, )? @0, $codepoints $(, $opts )?, $crate::scanner::flag::O_ZEROED) + }; + (@inner $buffer:expr, @$offset:expr, $( @$_:expr, )? $codepoints:expr, $opts:expr $(, $__:expr )?) => { + cache!(@priv $buffer, $offset, $codepoints, $opts.contains($crate::scanner::flag::O_EXTENDABLE)) + }; + (@priv $buffer:expr, $offset:expr, $codepoints:expr, $extend:expr) => {{ + let mut ret = Ok(0); + let mut bytes = $offset; + for _ in 0..$codepoints + { + match widthOf!($buffer, bytes) + { + 0 => + { + if $extend + { + ret = Err($crate::scanner::error::ScanError::Extend); + } + + break; + }, + n => + { + bytes += n; + ret = ret.map(|r| r + n); + }, + } + } + + ret + }}; +} + /// Check the .buffer (@ .offset) matches the given /// .pattern, optionally returning an .error. /// -- 2.43.5 From 71dd1a52d532ffbe7692d4ebfda4b148f16f8e39 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 11:56:32 +0000 Subject: [PATCH 05/19] lib/scanner: add opts to scan_tokens, eat_whitespace cache! This commit adds initial support for cache!-ing characters in the Scanner, starting with eat_whitespace. --- src/scanner/mod.rs | 36 +++++++++++++++++++++++---------- src/scanner/tests/whitespace.rs | 4 ++-- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 3bf158c..e3d9965 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -22,6 +22,7 @@ use crate::{ directive::{scan_directive, DirectiveKind}, entry::TokenEntry, error::{ScanError, ScanResult as Result}, + flag::*, key::{Key, KeyPossible}, scalar::{block::scan_block_scalar, flow::scan_flow_scalar, plain::scan_plain_scalar}, stats::MStats, @@ -67,7 +68,12 @@ impl Scanner /// Scan some tokens from the given .base into .tokens /// returning the number added. 
- pub fn scan_tokens<'de>(&mut self, base: &'de str, tokens: &mut Tokens<'de>) -> Result + pub fn scan_tokens<'de>( + &mut self, + opts: Flags, + base: &'de str, + tokens: &mut Tokens<'de>, + ) -> Result { let mut num_tokens = 0; let starting_tokens = tokens.len(); @@ -77,7 +83,7 @@ impl Scanner { if let Some(mut buffer) = base.get(self.offset..) { - self.scan_next_token(&mut buffer, tokens)?; + self.scan_next_token(opts, &mut buffer, tokens)?; self.offset = base.len() - buffer.len(); @@ -88,8 +94,12 @@ impl Scanner Ok(num_tokens) } - fn scan_next_token<'de>(&mut self, base: &mut &'de str, tokens: &mut Tokens<'de>) - -> Result<()> + fn scan_next_token<'de>( + &mut self, + opts: Flags, + base: &mut &'de str, + tokens: &mut Tokens<'de>, + ) -> Result<()> { // Is it the beginning of the stream? if self.state == StreamState::Start @@ -99,7 +109,7 @@ impl Scanner } // Eat whitespace to the next delimiter - self.eat_whitespace(base, COMMENTS); + self.eat_whitespace(opts, base, COMMENTS)?; // Remove any saved key positions that cannot contain keys // anymore @@ -870,11 +880,11 @@ impl Scanner /// Chomp whitespace and optionally comments until we /// reach the next token, updating buffer[0] to the /// beginning of the new token - fn eat_whitespace(&mut self, buffer: &mut &str, comments: bool) -> usize + fn eat_whitespace(&mut self, opts: Flags, buffer: &mut &str, comments: bool) -> Result { let mut stats = MStats::new(); - let amt = eat_whitespace(*buffer, &mut stats, comments); + let amt = eat_whitespace(opts, *buffer, &mut stats, comments)?; // A new line may start a key in the block context // @@ -888,7 +898,7 @@ impl Scanner advance!(*buffer, amt); self.stats += stats; - amt + Ok(amt) } } @@ -903,7 +913,7 @@ enum StreamState /// Chomp whitespace and .comments if allowed until a non /// whitespace character is encountered, returning the /// amount chomped -fn eat_whitespace(base: &str, stats: &mut MStats, comments: bool) -> usize +fn eat_whitespace(opts: Flags, base: 
&str, stats: &mut MStats, comments: bool) -> Result { let mut buffer = base; let mut chomp_line = false; @@ -911,6 +921,8 @@ fn eat_whitespace(base: &str, stats: &mut MStats, comments: bool) -> usize loop { + cache!(~buffer, 1, opts)?; + let (blank, brk) = (isBlank!(~buffer), isBreak!(~buffer)); match (blank, brk) @@ -944,7 +956,7 @@ fn eat_whitespace(base: &str, stats: &mut MStats, comments: bool) -> usize } } - base.len() - buffer.len() + Ok(base.len() - buffer.len()) } /// Roll the indentation level and push a block collection @@ -1106,7 +1118,9 @@ mod tests { if (!self.done) && self.tokens.is_empty() { - if let 0 = self.scan.scan_tokens(self.data, &mut self.tokens)? + if let 0 = self + .scan + .scan_tokens(O_ZEROED, self.data, &mut self.tokens)? { self.done = true } diff --git a/src/scanner/tests/whitespace.rs b/src/scanner/tests/whitespace.rs index 88499a7..8198714 100644 --- a/src/scanner/tests/whitespace.rs +++ b/src/scanner/tests/whitespace.rs @@ -27,7 +27,7 @@ fn eat() let mut buffer = data; let mut s = Scanner::new(); - s.eat_whitespace(&mut buffer, false); + s.eat_whitespace(O_ZEROED, &mut buffer, false).unwrap(); assert_eq!(buffer, "abc"); assert_eq!(s.stats, (3, 0, 3)) @@ -40,7 +40,7 @@ fn eat_none() let mut buffer = data; let mut s = Scanner::new(); - s.eat_whitespace(&mut buffer, false); + s.eat_whitespace(O_ZEROED, &mut buffer, false).unwrap(); assert_eq!(buffer, "abc"); assert_eq!(s.stats, (0, 0, 0)) -- 2.43.5 From 55b8f31b7b3727f5423e3ecbadbab26ae019f4a7 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:18:41 +0000 Subject: [PATCH 06/19] lib/scanner: cache! 
before fetch in scan_next_token --- src/scanner/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index e3d9965..c53a1c1 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -125,6 +125,12 @@ impl Scanner return self.fetch_stream_end(*base, tokens); } + // 4 characters is the longest token we can encounter, one + // of: + // - '--- ' + // - '... ' + cache!(~base, 4, opts)?; + // Fetch the next token(s) match base.as_bytes() { -- 2.43.5 From dc10680862549185708b63be3a7127613fe76a06 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:22:21 +0000 Subject: [PATCH 07/19] scanner/directive: cache! before fetch --- src/scanner/directive.rs | 34 ++++++++++++++++++++++------------ src/scanner/mod.rs | 12 ++++++++---- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/scanner/directive.rs b/src/scanner/directive.rs index ee3e00d..91da8f8 100644 --- a/src/scanner/directive.rs +++ b/src/scanner/directive.rs @@ -5,13 +5,14 @@ use super::{ stats::MStats, }; use crate::{ - scanner::{eat_whitespace, tag::scan_tag_directive, COMMENTS}, + scanner::{eat_whitespace, flag::Flags, tag::scan_tag_directive, COMMENTS}, token::Token, }; /// Scans a version or tag directive from .buffer, based on /// the .kind of directive, returning the relevant Token. pub(in crate::scanner) fn scan_directive<'de>( + opts: Flags, buffer: &mut &'de str, mut stats: &mut MStats, kind: &DirectiveKind, @@ -22,21 +23,25 @@ pub(in crate::scanner) fn scan_directive<'de>( DirectiveKind::Version => { // Chomp any preceding whitespace - advance!(*buffer, eat_whitespace(buffer, &mut stats, !COMMENTS)); + advance!( + *buffer, + eat_whitespace(opts, buffer, &mut stats, !COMMENTS)? 
+ ); // %YAML 1.1 // ^ - let (major, skip) = scan_directive_version(buffer)?; + let (major, skip) = scan_directive_version(opts, buffer)?; advance!(*buffer, :stats, skip); // %YAML 1.1 // ^ + cache!(~buffer, 1, opts)?; check!(~buffer => b'.', else ScanError::InvalidVersion)?; advance!(*buffer, :stats, 1); // %YAML 1.1 // ^ - let (minor, skip) = scan_directive_version(buffer)?; + let (minor, skip) = scan_directive_version(opts, buffer)?; advance!(*buffer, :stats, skip); Ok(Token::VersionDirective(major, minor)) @@ -44,10 +49,13 @@ pub(in crate::scanner) fn scan_directive<'de>( DirectiveKind::Tag => { // Chomp any spaces up to the handle - advance!(*buffer, eat_whitespace(buffer, &mut stats, !COMMENTS)); + advance!( + *buffer, + eat_whitespace(opts, buffer, &mut stats, !COMMENTS)? + ); // Scan the directive, copying if necessary - let (token, amt) = scan_tag_directive(buffer, &mut stats)?; + let (token, amt) = scan_tag_directive(opts, buffer, &mut stats)?; advance!(*buffer, amt); Ok(token) @@ -97,15 +105,15 @@ impl DirectiveKind } } -fn scan_directive_version(b: &str) -> Result<(u8, usize)> +fn scan_directive_version(opts: Flags, b: &str) -> Result<(u8, usize)> { - let v_slice = take_while(b.as_bytes(), u8::is_ascii_digit); + let v_slice = take_while(opts, b.as_bytes(), u8::is_ascii_digit)?; let v = atoi(v_slice).ok_or(ScanError::InvalidVersion)?; Ok((v, v_slice.len())) } -fn take_while(b: &[u8], f: F) -> &[u8] +fn take_while(opts: Flags, base: &[u8], f: F) -> Result<&[u8]> where F: Fn(&u8) -> bool, { @@ -113,10 +121,12 @@ where loop { - match b.get(index) + let i = cache!(base, @index, 1, opts)?; + + match base.get(index) { - Some(b) if f(b) => index += 1, - _ => return &b[..index], + Some(b) if f(b) => index += i, + _ => return Ok(&base[..index]), } } } diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index c53a1c1..8778164 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -135,7 +135,7 @@ impl Scanner match base.as_bytes() { // Is it a directive? 
- [DIRECTIVE, ..] if self.stats.column == 0 => self.fetch_directive(base, tokens), + [DIRECTIVE, ..] if self.stats.column == 0 => self.fetch_directive(opts, base, tokens), // Is it a document marker? [b @ b'-', b'-', b'-', ..] | [b @ b'.', b'.', b'.', ..] @@ -270,8 +270,12 @@ impl Scanner Ok(()) } - fn fetch_directive<'de>(&mut self, base: &mut &'de str, tokens: &mut Tokens<'de>) - -> Result<()> + fn fetch_directive<'de>( + &mut self, + opts: Flags, + base: &mut &'de str, + tokens: &mut Tokens<'de>, + ) -> Result<()> { let mut buffer = *base; let mut stats = MStats::new(); @@ -299,7 +303,7 @@ impl Scanner advance!(buffer, :stats, 1 + kind.len()); // Scan the directive token from the .buffer - let token = scan_directive(&mut buffer, &mut stats, &kind)?; + let token = scan_directive(opts, &mut buffer, &mut stats, &kind)?; // A key cannot follow a directive (a newline is required) self.simple_key_allowed = false; -- 2.43.5 From 0191169ad6a92bb9dd9b2d2d68a493b553dcccfb Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:24:53 +0000 Subject: [PATCH 08/19] scanner/tag: cache! before fetch note that this commit also fixes code that fetch_directive uses --- src/scanner/mod.rs | 11 ++++++--- src/scanner/scalar/escape.rs | 8 ++++++- src/scanner/tag.rs | 43 +++++++++++++++++++++++++----------- 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 8778164..5086625 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -181,7 +181,7 @@ impl Scanner [ANCHOR, ..] | [ALIAS, ..] => self.fetch_anchor(base, tokens), // Is it a tag? - [TAG, ..] => self.fetch_tag(base, tokens), + [TAG, ..] => self.fetch_tag(opts, base, tokens), // Is it a block scalar? [c @ LITERAL, ..] | [c @ FOLDED, ..] 
if self.context.is_block() => @@ -319,7 +319,12 @@ impl Scanner Ok(()) } - fn fetch_tag<'de>(&mut self, base: &mut &'de str, tokens: &mut Tokens<'de>) -> Result<()> + fn fetch_tag<'de>( + &mut self, + opts: Flags, + base: &mut &'de str, + tokens: &mut Tokens<'de>, + ) -> Result<()> { let mut buffer = *base; let mut stats = MStats::new(); @@ -331,7 +336,7 @@ impl Scanner self.save_key(!REQUIRED)?; - let (token, amt) = scan_node_tag(buffer, &mut stats)?; + let (token, amt) = scan_node_tag(opts, buffer, &mut stats)?; advance!(buffer, amt); // A key may not start after a tag (only before) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index 7b87022..9ec6e60 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -1,7 +1,10 @@ //! This module exports function(s) for handling scalar //! escapes in YAML documents. -use crate::scanner::error::{ScanError, ScanResult as Result}; +use crate::scanner::{ + error::{ScanError, ScanResult as Result}, + flag::Flags, +}; /// Unescape a given YAML escape sequence as defined in /// [Section 5.7][Link]. 
Specifically, YAML defines 18 @@ -73,6 +76,7 @@ pub(in crate::scanner) fn flow_unescape(base: &str, scratch: &mut Vec) -> Re /// /// [Link]: https://yaml.org/spec/1.2/spec.html#ns-uri-char pub(in crate::scanner) fn tag_uri_unescape( + opts: Flags, base: &str, scratch: &mut Vec, _directive: bool, @@ -82,6 +86,8 @@ pub(in crate::scanner) fn tag_uri_unescape( let mut codepoint_len: i8 = 0; while { + cache!(~buffer, 3, opts)?; + if buffer.len() < 3 { return Err(ScanError::UnexpectedEOF); diff --git a/src/scanner/tag.rs b/src/scanner/tag.rs index dbe1b9c..7d04ec3 100644 --- a/src/scanner/tag.rs +++ b/src/scanner/tag.rs @@ -66,6 +66,7 @@ use crate::{ scanner::{ eat_whitespace, error::{ScanError, ScanResult as Result}, + flag::Flags, scalar::escape::tag_uri_unescape, stats::MStats, }, @@ -79,6 +80,7 @@ use crate::{ /// possible, but may also copy the directive's handle and /// prefix into .scratch if borrowing is not possible. pub(in crate::scanner) fn scan_tag_directive<'de>( + opts: Flags, base: &'de str, stats: &mut MStats, ) -> Result<(Token<'de>, usize)> @@ -88,7 +90,7 @@ pub(in crate::scanner) fn scan_tag_directive<'de>( // %TAG !named! :tag:prefix # a comment\n // ^^^^^^^ - let (handle, amt) = match scan_tag_handle(buffer, stats)? + let (handle, amt) = match scan_tag_handle(opts, buffer, stats)? { Some((handle, amt)) => (handle.into_inner(), amt), None => return Err(ScanError::InvalidTagHandle), @@ -99,14 +101,15 @@ pub(in crate::scanner) fn scan_tag_directive<'de>( // ^ // Check that there is >= 1 whitespace between handle and // prefix + cache!(~buffer, 1, opts)?; isBlank!(~buffer, else ScanError::InvalidTagPrefix)?; // Chomp whitespace to prefix - advance!(buffer, eat_whitespace(buffer, stats, false)); + advance!(buffer, eat_whitespace(opts, buffer, stats, false)?); // %TAG !named! 
:tag:prefix # a comment\n // ^^^^^^^^^^^ - let (prefix, amt) = scan_tag_uri(buffer, stats, &mut can_borrow, false)?; + let (prefix, amt) = scan_tag_uri(opts, buffer, stats, &mut can_borrow, false)?; // %TAG !named! tag-prefix # a comment\n // ^ @@ -146,6 +149,7 @@ pub(in crate::scanner) fn scan_tag_directive<'de>( /// ("!", "") => A non resolving tag /// (handle, suffix) => A primary, secondary or named tag pub(in crate::scanner) fn scan_node_tag<'de>( + opts: Flags, base: &'de str, stats: &mut MStats, ) -> Result<(Token<'de>, usize)> @@ -163,6 +167,8 @@ pub(in crate::scanner) fn scan_node_tag<'de>( * a zero length sub-slice out. */ + cache!(~buffer, 2, opts)?; + // ! "node" // ^^ // If its a verbatim tag scan it @@ -172,10 +178,11 @@ pub(in crate::scanner) fn scan_node_tag<'de>( // ! "node" // ^^^^^^^^^^^^^^^^^^^^ - let (verbatim, amt) = scan_tag_uri(buffer, stats, &mut can_borrow, true)?; + let (verbatim, amt) = scan_tag_uri(opts, buffer, stats, &mut can_borrow, true)?; // ! "node" // ^ + cache!(~buffer, @amt + 1, 1, opts)?; check!(~buffer, amt + 1 => b'>', else ScanError::InvalidTagSuffix)?; let token = assemble_tag(&buffer[0..0], verbatim, can_borrow); @@ -185,7 +192,7 @@ pub(in crate::scanner) fn scan_node_tag<'de>( // Otherwise scan it as a normal tag else { - match scan_tag_handle(buffer, stats)? + match scan_tag_handle(opts, buffer, stats)? { // ! 
"node" // ^ @@ -200,7 +207,7 @@ pub(in crate::scanner) fn scan_node_tag<'de>( // !!global "node" OR !named!global "node" // ^^^^^^ ^^^^^^ - let (suffix, amt) = scan_tag_uri(buffer, stats, &mut can_borrow, false)?; + let (suffix, amt) = scan_tag_uri(opts, buffer, stats, &mut can_borrow, false)?; let token = assemble_tag(h, suffix, can_borrow); @@ -210,6 +217,8 @@ pub(in crate::scanner) fn scan_node_tag<'de>( // Handle scan couldn't find a closing !, meaning this is a local tag None => { + cache!(~buffer, 1, opts)?; + // !local "node" // ^ let handle = &buffer[..1]; @@ -217,7 +226,7 @@ pub(in crate::scanner) fn scan_node_tag<'de>( // !local "node" // ^^^^^ - let (suffix, amt) = scan_tag_uri(buffer, stats, &mut can_borrow, false)?; + let (suffix, amt) = scan_tag_uri(opts, buffer, stats, &mut can_borrow, false)?; let token = assemble_tag(handle, suffix, can_borrow); @@ -240,6 +249,7 @@ pub(in crate::scanner) fn scan_node_tag<'de>( /// /// [Link]: https://yaml.org/spec/1.2/spec.html#ns-global-tag-prefix pub(in crate::scanner) fn scan_tag_uri<'de>( + opts: Flags, base: &'de str, stats: &mut MStats, can_borrow: &mut bool, @@ -251,6 +261,8 @@ pub(in crate::scanner) fn scan_tag_uri<'de>( loop { + cache!(~buffer, 1, opts)?; + match buffer.as_bytes() { // If its a normal allowed character, add it @@ -293,7 +305,7 @@ pub(in crate::scanner) fn scan_tag_uri<'de>( *can_borrow = false; } - let amt = tag_uri_unescape(buffer, &mut scratch, true)?; + let amt = tag_uri_unescape(opts, buffer, &mut scratch, true)?; advance!(buffer, :stats, amt); }, // EOF before loop end is an error @@ -320,6 +332,7 @@ pub(in crate::scanner) fn scan_tag_uri<'de>( /// Scans a tag handle from .base, attempting to return the /// fragment if the handle is unambiguous. 
pub(in crate::scanner) fn scan_tag_handle<'b>( + opts: Flags, base: &'b str, stats: &mut MStats, ) -> Result, usize)>> @@ -331,6 +344,7 @@ pub(in crate::scanner) fn scan_tag_handle<'b>( // !!tag // ^ // Check that we are indeed starting a handle + cache!(~buffer, 1, opts)?; check!(~buffer => b'!', else ScanError::InvalidTagHandle)?; // %TAG !handle! tag-prefix # a comment \n @@ -338,9 +352,10 @@ pub(in crate::scanner) fn scan_tag_handle<'b>( // !handle!tag // ^^^^^^ // Safety: we just proved above we have >= 1 byte ('!') - let name = take_while(buffer[1..].as_bytes(), u8::is_ascii_alphanumeric); + let name = take_while(opts, buffer[1..].as_bytes(), u8::is_ascii_alphanumeric)?; let mut offset = 1 + name.len(); + cache!(~buffer, @offset, 1, opts)?; match buffer.as_bytes().get(offset) { // If we find a closing '!', then it must either be a secondary or named handle @@ -391,7 +406,7 @@ impl<'a> TagHandle<'a> } } -fn take_while(b: &[u8], f: F) -> &[u8] +fn take_while(opts: Flags, base: &[u8], f: F) -> Result<&[u8]> where F: Fn(&u8) -> bool, { @@ -399,10 +414,12 @@ where loop { - match b.get(index) + let i = cache!(base, @index, 1, opts)?; + + match base.get(index) { - Some(b) if f(b) => index += 1, - _ => return &b[..index], + Some(b) if f(b) => index += i, + _ => return Ok(&base[..index]), } } } -- 2.43.5 From 8b68d2f8e4297a9c4168c369cd903325d734a2cf Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:29:12 +0000 Subject: [PATCH 09/19] scalar/flow: cache! before fetch also fixes the call stack in lib/scanner --- src/scanner/mod.rs | 5 +++-- src/scanner/scalar/escape.rs | 11 ++++++++- src/scanner/scalar/flow.rs | 43 +++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 5086625..ed3c7f4 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -190,7 +190,7 @@ impl Scanner }, // Is it a flow scalar? - [SINGLE, ..] | [DOUBLE, ..] 
=> self.fetch_flow_scalar(base, tokens), + [SINGLE, ..] | [DOUBLE, ..] => self.fetch_flow_scalar(opts, base, tokens), // Is it a plain scalar? _ if self.is_plain_scalar(*base) => self.fetch_plain_scalar(base, tokens), @@ -391,6 +391,7 @@ impl Scanner fn fetch_flow_scalar<'de>( &mut self, + opts: Flags, base: &mut &'de str, tokens: &mut Tokens<'de>, ) -> Result<()> @@ -406,7 +407,7 @@ impl Scanner self.save_key(!REQUIRED)?; - let (range, amt) = scan_flow_scalar(buffer, &mut stats, single)?; + let (range, amt) = scan_flow_scalar(opts, buffer, &mut stats, single)?; let token = range.into_token(buffer)?; // A key cannot follow a flow scalar, as we're either diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index 9ec6e60..d0379af 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -18,12 +18,17 @@ use crate::scanner::{ /// escape sequence. /// /// [Link]: https://yaml.org/spec/1.2/spec.html#c-escape -pub(in crate::scanner) fn flow_unescape(base: &str, scratch: &mut Vec) -> Result +pub(in crate::scanner) fn flow_unescape( + opts: Flags, + base: &str, + scratch: &mut Vec, +) -> Result { let mut buffer = base; let mut escape_len: Option = None; // Not an escape sequence, early exit + cache!(~buffer, 1, opts)?; if !check!(~buffer => b'\\') { return Ok(0); @@ -33,6 +38,7 @@ pub(in crate::scanner) fn flow_unescape(base: &str, scratch: &mut Vec) -> Re // See 5.7: Escaped Characters // yaml.org/spec/1.2/spec.html#id2776092 + cache!(~buffer, 1, opts)?; match buffer.as_bytes() { [b'0', ..] 
=> scratch.push(b'\0'), @@ -63,6 +69,9 @@ pub(in crate::scanner) fn flow_unescape(base: &str, scratch: &mut Vec) -> Re if let Some(sequence) = escape_len { + // Note that we cache the _entire_ escape sequence before + // calling write_unicode_point + cache!(~buffer, sequence, opts)?; let amt = write_unicode_point(buffer, scratch, sequence)?; advance!(buffer, amt); } diff --git a/src/scanner/scalar/flow.rs b/src/scanner/scalar/flow.rs index 6db5200..8c498ac 100644 --- a/src/scanner/scalar/flow.rs +++ b/src/scanner/scalar/flow.rs @@ -3,6 +3,7 @@ use std::ops::Range; use crate::{ scanner::{ error::{ScanError, ScanResult as Result}, + flag::Flags, scalar::escape::flow_unescape, stats::MStats, }, @@ -15,6 +16,7 @@ use crate::{ /// the underlying .base, however it may be required to copy /// into .scratch and borrow from that lifetime. pub(in crate::scanner) fn scan_flow_scalar( + opts: Flags, base: &str, stats: &mut MStats, single: bool, @@ -35,6 +37,7 @@ pub(in crate::scanner) fn scan_flow_scalar( }; // Eat left quote + cache!(~buffer, 1, opts)?; advance!(buffer, :stats, 1); 'scalar: loop @@ -44,6 +47,7 @@ pub(in crate::scanner) fn scan_flow_scalar( // Even in a scalar context, YAML prohibits starting a line // with document stream tokens followed by a blank // character + cache!(~buffer, 4, opts)?; if isDocumentIndicator!(~buffer, :stats) { return Err(ScanError::InvalidFlowScalar); @@ -55,9 +59,14 @@ pub(in crate::scanner) fn scan_flow_scalar( return Err(ScanError::UnexpectedEOF); } + cache!(~buffer, 1, opts)?; + // Consume non whitespace characters while !isWhiteSpaceZ!(~buffer) { + // Longest sequence we can hit is 2 characters ('') + cache!(~buffer, 2, opts)?; + // if we encounter an escaped quote we can no longer borrow // from .base, we must unescape the quote into .scratch if kind == SingleQuote && check!(~buffer => [SINGLE, SINGLE, ..]) @@ -88,7 +97,7 @@ pub(in crate::scanner) fn scan_flow_scalar( { set_no_borrow(&mut can_borrow, base, buffer, &mut scratch); - 
let read = flow_unescape(buffer, &mut scratch)?; + let read = flow_unescape(opts, buffer, &mut scratch)?; advance!(buffer, :stats, read); } // Its a non blank character, add it @@ -126,6 +135,8 @@ pub(in crate::scanner) fn scan_flow_scalar( // Consume whitespace loop { + cache!(~buffer, 1, opts)?; + match (isBlank!(~buffer), isBreak!(~buffer)) { // No more whitespace, exit loop @@ -206,6 +217,7 @@ pub(in crate::scanner) fn scan_flow_scalar( }; // Eat the right quote + cache!(~buffer, 1, opts)?; advance!(buffer, :stats, 1); let advance = base.len() - buffer.len(); @@ -296,6 +308,7 @@ mod tests use pretty_assertions::assert_eq; use super::*; + use crate::scanner::flag::O_ZEROED; type TestResult = anyhow::Result<()>; @@ -308,7 +321,7 @@ mod tests let stats = &mut MStats::new(); let expected = Token::Scalar(cow!(""), ScalarStyle::SingleQuote); - let (range, read) = scan_flow_scalar(data, stats, true)?; + let (range, read) = scan_flow_scalar(O_ZEROED, data, stats, true)?; let scalar = range.into_token(data)?; assert_eq!(read, 2); @@ -328,7 +341,7 @@ mod tests let stats = &mut MStats::new(); let expected = Token::Scalar(cow!("hello world"), ScalarStyle::SingleQuote); - let (range, read) = scan_flow_scalar(data, stats, true)?; + let (range, read) = scan_flow_scalar(O_ZEROED, data, stats, true)?; let scalar = range.into_token(data)?; assert_eq!(read, 13); @@ -352,7 +365,7 @@ fourth'"#; let cmp = "first second third fourth"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::SingleQuote); - let (range, _read) = scan_flow_scalar(data, stats, true)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, true)?; let scalar = range.into_token(data)?; if !(scalar == expected) @@ -372,7 +385,7 @@ fourth'"#; let cmp = "first second"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::SingleQuote); - let (range, _read) = scan_flow_scalar(data, stats, true)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, true)?; let scalar = range.into_token(data)?; if 
!(scalar == expected) @@ -395,7 +408,7 @@ fourth'"#; let cmp = "first second third\nfourth"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::SingleQuote); - let (range, _read) = scan_flow_scalar(data, stats, true)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, true)?; let scalar = range.into_token(data)?; if !(scalar == expected) @@ -417,7 +430,7 @@ fourth'"#; { stats = MStats::new(); - match scan_flow_scalar(t, &mut stats, true) + match scan_flow_scalar(O_ZEROED, t, &mut stats, true) { Err(e) => assert_eq!( e, expected, @@ -443,7 +456,7 @@ fourth'"#; { stats = MStats::new(); - match scan_flow_scalar(t, &mut stats, true) + match scan_flow_scalar(O_ZEROED, t, &mut stats, true) { Err(e) => assert_eq!( e, expected, @@ -467,7 +480,7 @@ fourth'"#; let stats = &mut MStats::new(); let expected = Token::Scalar(cow!(""), ScalarStyle::DoubleQuote); - let (range, read) = scan_flow_scalar(data, stats, false)?; + let (range, read) = scan_flow_scalar(O_ZEROED, data, stats, false)?; let scalar = range.into_token(data)?; assert_eq!(read, 2); @@ -487,7 +500,7 @@ fourth'"#; let stats = &mut MStats::new(); let expected = Token::Scalar(cow!("hello world"), ScalarStyle::DoubleQuote); - let (range, read) = scan_flow_scalar(data, stats, false)?; + let (range, read) = scan_flow_scalar(O_ZEROED, data, stats, false)?; let scalar = range.into_token(data)?; assert_eq!(read, 13); @@ -507,7 +520,7 @@ fourth'"#; let stats = &mut MStats::new(); let expected = Token::Scalar(cow!("hello α Ω ッ"), ScalarStyle::DoubleQuote); - let (range, read) = scan_flow_scalar(data, stats, false)?; + let (range, read) = scan_flow_scalar(O_ZEROED, data, stats, false)?; let scalar = range.into_token(data)?; if !(scalar == expected) @@ -537,7 +550,7 @@ fourth""#; let cmp = "first second third fourth"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::DoubleQuote); - let (range, _read) = scan_flow_scalar(data, stats, false)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, 
false)?; let scalar = range.into_token(data)?; if !(scalar == expected) @@ -560,7 +573,7 @@ fourth""#; let cmp = "first second third\nfourth"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::DoubleQuote); - let (range, _read) = scan_flow_scalar(data, stats, false)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, false)?; let scalar = range.into_token(data)?; if !(scalar == expected) @@ -580,7 +593,7 @@ fourth""#; let cmp = "first second"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::DoubleQuote); - let (range, _read) = scan_flow_scalar(data, stats, false)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, false)?; let scalar = range.into_token(data)?; if !(scalar == expected) @@ -604,7 +617,7 @@ rst \ let cmp = "first second third\nfourth"; let expected = Token::Scalar(cow!(cmp), ScalarStyle::DoubleQuote); - let (range, _read) = scan_flow_scalar(data, stats, false)?; + let (range, _read) = scan_flow_scalar(O_ZEROED, data, stats, false)?; let scalar = range.into_token(data)?; if !(scalar == expected) -- 2.43.5 From 39b33ccd7efc0b02fa4cd09bcbfcc863f2fc7ff4 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:33:43 +0000 Subject: [PATCH 10/19] scalar/escape: fix tests --- src/scanner/scalar/escape.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/scanner/scalar/escape.rs b/src/scanner/scalar/escape.rs index d0379af..d4f52d7 100644 --- a/src/scanner/scalar/escape.rs +++ b/src/scanner/scalar/escape.rs @@ -239,6 +239,7 @@ mod tests use pretty_assertions::assert_eq; use super::*; + use crate::scanner::flag::O_ZEROED; type TestResult = anyhow::Result<()>; @@ -283,7 +284,7 @@ mod tests for (i, (&t, &ex)) in data.into_iter().zip(expected).enumerate() { scratch.clear(); - flow_unescape(t, scratch) + flow_unescape(O_ZEROED, t, scratch) .map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?; assert_eq!(scratch, ex, "on iteration {}", i) @@ -344,7 +345,7 @@ mod tests let mut 
c: [u8; 4] = [0; 4]; scratch.clear(); - flow_unescape(t, scratch) + flow_unescape(O_ZEROED, t, scratch) .map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?; assert_eq!( @@ -385,7 +386,7 @@ mod tests { scratch.clear(); - let consumed = flow_unescape(t, scratch) + let consumed = flow_unescape(O_ZEROED, t, scratch) .map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?; assert_eq!( @@ -432,7 +433,7 @@ mod tests { scratch.clear(); - let consumed = tag_uri_unescape(t, scratch, true) + let consumed = tag_uri_unescape(O_ZEROED, t, scratch, true) .map_err(|e| anyhow!("on iteration {}, test errored with {}", i, e))?; assert_eq!( @@ -461,7 +462,7 @@ mod tests let scratch = &mut Vec::new(); let expected = ScanError::UnexpectedEOF; - match tag_uri_unescape(data, scratch, true) + match tag_uri_unescape(O_ZEROED, data, scratch, true) { Err(e) if e == expected => Ok(()), @@ -481,7 +482,7 @@ mod tests let scratch = &mut Vec::new(); let expected = ScanError::UnknownEscape; - match tag_uri_unescape(data, scratch, true) + match tag_uri_unescape(O_ZEROED, data, scratch, true) { Err(e) if e == expected => Ok(()), -- 2.43.5 From bca953716f9e831c509026fb562b75cec2245289 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:26:53 +0000 Subject: [PATCH 11/19] scanner/anchor: cache! 
before fetch --- src/scanner/anchor.rs | 15 ++++++++++----- src/scanner/mod.rs | 11 ++++++++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/scanner/anchor.rs b/src/scanner/anchor.rs index fbb4426..031940d 100644 --- a/src/scanner/anchor.rs +++ b/src/scanner/anchor.rs @@ -1,5 +1,6 @@ use super::{ error::{ScanError, ScanResult as Result}, + flag::Flags, stats::MStats, ALIAS, ANCHOR, }; @@ -8,6 +9,7 @@ use crate::token::Token; /// Scan an anchor or alias from the underlying .buffer /// returning the relevant Token pub(in crate::scanner) fn scan_anchor<'de>( + opts: Flags, buffer: &mut &'de str, stats: &mut MStats, kind: &AnchorKind, @@ -17,7 +19,7 @@ pub(in crate::scanner) fn scan_anchor<'de>( // *anchor 'rest of the line' // ^^^^^^ - let anchor = take_while(buffer.as_bytes(), u8::is_ascii_alphanumeric); + let anchor = take_while(opts, buffer.as_bytes(), u8::is_ascii_alphanumeric)?; let anchor = advance!(<- *buffer, :stats, anchor.len()); @@ -33,6 +35,7 @@ pub(in crate::scanner) fn scan_anchor<'de>( // There does not necessarily need to be a whitespace so we // also check against a list of valid starting // tokens + cache!(~buffer, 1, opts)?; check!(~buffer => b' ' | b'\n' | b'?' | b',' | b']' | b'}' | b'%' | b'@' | b'`', else ScanError::InvalidAnchorName @@ -73,7 +76,7 @@ impl AnchorKind } } -fn take_while(b: &[u8], f: F) -> &[u8] +fn take_while(opts: Flags, base: &[u8], f: F) -> Result<&[u8]> where F: Fn(&u8) -> bool, { @@ -81,10 +84,12 @@ where loop { - match b.get(index) + let i = cache!(base, @index, 1, opts)?; + + match base.get(index) { - Some(b) if f(b) => index += 1, - _ => return &b[..index], + Some(b) if f(b) => index += i, + _ => return Ok(&base[..index]), } } } diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index ed3c7f4..0a41fbe 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -178,7 +178,7 @@ impl Scanner }, // Is it an anchor or alias? - [ANCHOR, ..] | [ALIAS, ..] 
=> self.fetch_anchor(base, tokens), + [ANCHOR, ..] | [ALIAS, ..] => self.fetch_anchor(opts, base, tokens), // Is it a tag? [TAG, ..] => self.fetch_tag(opts, base, tokens), @@ -353,7 +353,12 @@ impl Scanner Ok(()) } - fn fetch_anchor<'de>(&mut self, base: &mut &'de str, tokens: &mut Tokens<'de>) -> Result<()> + fn fetch_anchor<'de>( + &mut self, + opts: Flags, + base: &mut &'de str, + tokens: &mut Tokens<'de>, + ) -> Result<()> { let mut buffer = *base; let mut stats = MStats::new(); @@ -373,7 +378,7 @@ impl Scanner self.save_key(!REQUIRED)?; // Scan the token from the .buffer - let token = scan_anchor(&mut buffer, &mut stats, &kind)?; + let token = scan_anchor(opts, &mut buffer, &mut stats, &kind)?; // A key may not start after an anchor (only before) self.simple_key_allowed = false; -- 2.43.5 From 89225e1481e6397af607a2c2532fb60d7c6f30dd Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:31:13 +0000 Subject: [PATCH 12/19] scalar/plain: cache! before fetch also fix the call stack in lib/scanner --- src/scanner/mod.rs | 5 +++-- src/scanner/scalar/plain.rs | 42 ++++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 0a41fbe..4658ebb 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -193,7 +193,7 @@ impl Scanner [SINGLE, ..] | [DOUBLE, ..] => self.fetch_flow_scalar(opts, base, tokens), // Is it a plain scalar? 
- _ if self.is_plain_scalar(*base) => self.fetch_plain_scalar(base, tokens), + _ if self.is_plain_scalar(*base) => self.fetch_plain_scalar(opts, base, tokens), // Otherwise its an error _ => return Err(ScanError::UnknownDelimiter), @@ -431,6 +431,7 @@ impl Scanner fn fetch_plain_scalar<'de>( &mut self, + opts: Flags, base: &mut &'de str, tokens: &mut Tokens<'de>, ) -> Result<()> @@ -440,7 +441,7 @@ impl Scanner self.save_key(!REQUIRED)?; - let (token, amt) = scan_plain_scalar(buffer, &mut stats, &self.context)?; + let (token, amt) = scan_plain_scalar(opts, buffer, &mut stats, &self.context)?; // A simple key cannot follow a plain scalar, there must be // an indicator or new line before a key is valid diff --git a/src/scanner/scalar/plain.rs b/src/scanner/scalar/plain.rs index 22a63d6..6bbf646 100644 --- a/src/scanner/scalar/plain.rs +++ b/src/scanner/scalar/plain.rs @@ -2,6 +2,7 @@ use crate::{ scanner::{ context::Context, error::{ScanError, ScanResult as Result}, + flag::Flags, stats::MStats, }, token::{ScalarStyle, Token}, @@ -17,6 +18,7 @@ use crate::{ /// YAML 1.2: Section 7.3.3 /// yaml.org/spec/1.2/spec.html#ns-plain-first(c) pub(in crate::scanner) fn scan_plain_scalar<'de>( + opts: Flags, base: &'de str, stats: &mut MStats, cxt: &Context, @@ -53,6 +55,7 @@ pub(in crate::scanner) fn scan_plain_scalar<'de>( // Inside flow contexts you *may not* start a plain scalar // with a ':', '?', or '-' followed by a flow indicator + cache!(~buffer, 2, opts)?; if flow_context && check!(~buffer => b':' | b'?' 
| b'-') && flow_indicator(buffer, 1) { return Err(ScanError::InvalidPlainScalar); @@ -60,6 +63,10 @@ pub(in crate::scanner) fn scan_plain_scalar<'de>( 'scalar: loop { + // 4 is the largest character sequence we can encounter + // (document indicators) + cache!(~buffer, 4, opts)?; + if buffer.is_empty() { break 'scalar; @@ -110,6 +117,8 @@ pub(in crate::scanner) fn scan_plain_scalar<'de>( // Handle non whitespace characters while !isWhiteSpaceZ!(~buffer) { + cache!(~buffer, 2, opts)?; + if (check!(~buffer => b':') && isWhiteSpaceZ!(~buffer, 1)) || flow_context && flow_indicator(buffer, 0) { @@ -128,6 +137,8 @@ pub(in crate::scanner) fn scan_plain_scalar<'de>( // Handle whitespace characters loop { + cache!(~buffer, 1, opts)?; + match (isBlank!(~buffer), isBreak!(~buffer)) { // No more whitespace, exit loop @@ -242,6 +253,7 @@ mod tests use ScalarStyle::Plain; use super::*; + use crate::scanner::flag::O_ZEROED; type TestResult = anyhow::Result<()>; @@ -280,7 +292,7 @@ mod tests for (i, &data) in tests.iter().enumerate() { - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt) + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt) .map_err(|e| anyhow!("iteration {}: {}", i, e))?; assert_eq!(token, expected, "on iteration {}", i); @@ -301,7 +313,7 @@ mod tests for (i, &data) in tests.iter().enumerate() { - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt) + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt) .map_err(|e| anyhow!("iteration {}: {}", i, e))?; assert_eq!(token, expected, "on iteration {}", i); @@ -320,7 +332,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!(""), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -339,7 +351,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("hello"), Plain); - let (token, amt) = 
scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -356,7 +368,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("hello, world!"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -379,7 +391,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("hello this is a multi-line scalar"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -406,7 +418,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("this is\n\na scalar\nwith line#breaks"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -423,7 +435,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("hello"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -442,7 +454,7 @@ mod tests let cxt = cxt!(flow -> 1); let expected = Token::Scalar(cow!("hello"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -461,7 +473,7 @@ mod tests for (i, &data) in tests.iter().enumerate() { - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt) + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt) .map_err(|e| anyhow!("iteration {}: {}", i, e))?; assert_eq!(token, expected, "on iteration {}", i); @@ -485,7 +497,7 @@ string!"; let cxt = cxt!(flow -> 1); let expected 
= Token::Scalar(cow!("hello this is a multi-line string!"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -514,7 +526,7 @@ breaks let cxt = cxt!(flow -> 1); let expected = Token::Scalar(cow!("hello this\nbig\nstring\nhas\nline\nbreaks"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -531,7 +543,7 @@ breaks let cxt = cxt!(flow -> 1); let expected = Token::Scalar(cow!("hello"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -548,7 +560,7 @@ breaks let cxt = cxt!(flow -> 1); let expected = Token::Scalar(cow!("hello"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); @@ -569,7 +581,7 @@ breaks let cxt = cxt!(flow -> 1); let expected = Token::Scalar(cow!("hello"), Plain); - let (token, amt) = scan_plain_scalar(data, &mut stats, &cxt)?; + let (token, amt) = scan_plain_scalar(O_ZEROED, data, &mut stats, &cxt)?; assert_eq!(token, expected); -- 2.43.5 From fe79f121fba1194b26ad78688ab7db1667838216 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 12:32:22 +0000 Subject: [PATCH 13/19] scalar/block: cache! before fetch also fix call stack in lib/scanner --- src/scanner/mod.rs | 5 ++- src/scanner/scalar/block.rs | 80 ++++++++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 26 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 4658ebb..b19936b 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -186,7 +186,7 @@ impl Scanner // Is it a block scalar? [c @ LITERAL, ..] | [c @ FOLDED, ..] 
if self.context.is_block() => { - self.fetch_block_scalar(base, tokens, *c == FOLDED) + self.fetch_block_scalar(opts, base, tokens, *c == FOLDED) }, // Is it a flow scalar? @@ -458,6 +458,7 @@ impl Scanner fn fetch_block_scalar<'de>( &mut self, + opts: Flags, base: &mut &'de str, tokens: &mut Tokens<'de>, fold: bool, @@ -473,7 +474,7 @@ impl Scanner // always follow a block scalar. self.simple_key_allowed = true; - let (token, amt) = scan_block_scalar(buffer, &mut stats, &self.context, fold)?; + let (token, amt) = scan_block_scalar(opts, buffer, &mut stats, &self.context, fold)?; advance!(*base, amt); self.stats = stats; diff --git a/src/scanner/scalar/block.rs b/src/scanner/scalar/block.rs index 7395335..608af2d 100644 --- a/src/scanner/scalar/block.rs +++ b/src/scanner/scalar/block.rs @@ -26,6 +26,7 @@ use crate::{ scanner::{ context::Context, error::{ScanError, ScanResult as Result}, + flag::Flags, stats::MStats, }, token::{ScalarStyle, Slice, Token}, @@ -40,6 +41,7 @@ use crate::{ /// YAML 1.2: Section 8.1 /// yaml.org/spec/1.2/#c-b-block-header(m,t) pub(in crate::scanner) fn scan_block_scalar<'de>( + opts: Flags, base: &'de str, stats: &mut MStats, cxt: &Context, @@ -78,14 +80,16 @@ pub(in crate::scanner) fn scan_block_scalar<'de>( }; // Eat the '|' or '>' + cache!(~buffer, 1, opts)?; advance!(buffer, :local_stats, 1); // Calculate any headers this scalar may have - let (chomp, explicit) = scan_headers(&mut buffer, &mut local_stats)?; + let (chomp, explicit) = scan_headers(opts, &mut buffer, &mut local_stats)?; // The header line must contain nothing after the headers // excluding a comment until the line ending - skip_blanks(&mut buffer, &mut local_stats, COMMENTS)?; + skip_blanks(opts, &mut buffer, &mut local_stats, COMMENTS)?; + cache!(~buffer, 1, opts)?; if !isWhiteSpaceZ!(~buffer) { return Err(ScanError::InvalidBlockScalar); @@ -102,6 +106,7 @@ pub(in crate::scanner) fn scan_block_scalar<'de>( None => { indent = detect_indent_level( + opts, &mut buffer, 
&mut local_stats, cxt, @@ -201,8 +206,11 @@ pub(in crate::scanner) fn scan_block_scalar<'de>( } // Eat the line's content until the line break (or EOF) + cache!(~buffer, 1, opts)?; while !isBreakZ!(~buffer) { + cache!(~buffer, 1, opts)?; + if !can_borrow { scratch.push(buffer.as_bytes()[0]) @@ -218,6 +226,7 @@ pub(in crate::scanner) fn scan_block_scalar<'de>( } // Eat the line break (if not EOF) + cache!(~buffer, 1, opts)?; if isBreak!(~buffer) { advance!(buffer, :local_stats, @line); @@ -226,6 +235,7 @@ pub(in crate::scanner) fn scan_block_scalar<'de>( // Chomp indentation until the next indented line scan_indent( + opts, &mut buffer, &mut local_stats, &mut lines, @@ -246,12 +256,18 @@ pub(in crate::scanner) fn scan_block_scalar<'de>( } /// Retrieve a block scalar's headers -fn scan_headers(buffer: &mut &str, stats: &mut MStats) -> Result<(ChompStyle, IndentHeader)> +fn scan_headers( + opts: Flags, + buffer: &mut &str, + stats: &mut MStats, +) -> Result<(ChompStyle, IndentHeader)> { let mut skip = 0; let mut indent = None; let mut chomp = ChompStyle::Clip; + cache!(~buffer, 2, opts)?; + // Set the explicit indent if it exists. 
// // Note that we silently eat an invalid indent (0) rather @@ -291,6 +307,7 @@ fn scan_headers(buffer: &mut &str, stats: &mut MStats) -> Result<(ChompStyle, In /// Chomp the indentation spaces of a block scalar fn scan_indent( + opts: Flags, buffer: &mut &str, stats: &mut MStats, lines: &mut usize, @@ -303,6 +320,8 @@ fn scan_indent( return Ok(false); } + cache!(~buffer, 1, opts)?; + while stats.column < indent && isWhiteSpace!(~buffer) { // Indentation space, chomp @@ -321,6 +340,8 @@ fn scan_indent( *lines += 1; advance!(*buffer, :stats, @line); } + + cache!(~buffer, 1, opts)?; } Ok(true) @@ -433,6 +454,7 @@ fn scan_chomp<'de>( /// Auto-detect the indentation level from the first non /// header line of a block scalar fn detect_indent_level( + opts: Flags, buffer: &mut &str, stats: &mut MStats, cxt: &Context, @@ -444,9 +466,13 @@ fn detect_indent_level( loop { + cache!(~buffer, 1, opts)?; + // Chomp indentation spaces, erroring on a tab while isBlank!(~buffer) { + cache!(~buffer, 1, opts)?; + if check!(~buffer => b'\t') { return Err(ScanError::InvalidTab); @@ -467,6 +493,7 @@ fn detect_indent_level( } // If its not a line break we're done, exit the loop + cache!(~buffer, 1, opts)?; if !isBreak!(~buffer) { break; @@ -489,10 +516,13 @@ fn detect_indent_level( /// Skip any blanks (and .comments) until we reach a line /// ending or non blank character -fn skip_blanks(buffer: &mut &str, stats: &mut MStats, comments: bool) -> Result<()> +fn skip_blanks(opts: Flags, buffer: &mut &str, stats: &mut MStats, comments: bool) -> Result<()> { + cache!(~buffer, 1, opts)?; + while isBlank!(~buffer) { + cache!(~buffer, 1, opts)?; advance!(*buffer, :stats, 1); } @@ -500,6 +530,7 @@ fn skip_blanks(buffer: &mut &str, stats: &mut MStats, comments: bool) -> Result< { while !isBreakZ!(~buffer) { + cache!(~buffer, 1, opts)?; advance!(*buffer, :stats, 1); } } @@ -577,6 +608,7 @@ mod tests use ScalarStyle::{Folded, Literal}; use super::*; + use crate::scanner::flag::O_ZEROED; type 
TestResult = anyhow::Result<()>; @@ -615,7 +647,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("this is a simple block scalar"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -630,7 +662,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("trailing lines...\n"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -645,7 +677,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("trailing lines..."), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -660,7 +692,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("trailing lines...\n\n\n"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -679,7 +711,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("some folded\nlines\nhere\n"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -700,7 +732,7 @@ mod tests let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("\n\nsome folded\nlines\nhere"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -721,7 +753,7 @@ mod tests let 
cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("some folded\nlines\nhere\n\n\n"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -742,7 +774,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("some folded\nlines\nhere\n\n\n"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -763,7 +795,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("this\n\nhas\n\nbreaks"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -778,7 +810,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("simple block scalar"), Literal); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, LITERAL)?; assert_eq!(token, expected); @@ -795,7 +827,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("this is a simple block scalar"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -810,7 +842,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("trailing lines...\n"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -825,7 +857,7 @@ 
some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("trailing lines..."), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -840,7 +872,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("trailing lines...\n\n\n"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -859,7 +891,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("some folded lines here\n"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -880,7 +912,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("\n\nsome folded lines here"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -901,7 +933,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("some folded lines here\n\n\n"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -922,7 +954,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("some folded lines here\n\n\n"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, 
expected); @@ -943,7 +975,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("this\nhas\nbreaks"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); @@ -958,7 +990,7 @@ some.other.key: value"; let cxt = cxt!(block -> [0]); let expected = Token::Scalar(cow!("simple block scalar"), Folded); - let (token, _amt) = scan_block_scalar(data, &mut stats, &cxt, !LITERAL)?; + let (token, _amt) = scan_block_scalar(O_ZEROED, data, &mut stats, &cxt, !LITERAL)?; assert_eq!(token, expected); -- 2.43.5 From e6ad1e8434556de0b44ebb2ba64f838df81599bc Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 13:37:34 +0000 Subject: [PATCH 14/19] lib/scanner: clippy --- src/scanner/anchor.rs | 6 +++--- src/scanner/mod.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scanner/anchor.rs b/src/scanner/anchor.rs index 031940d..01816af 100644 --- a/src/scanner/anchor.rs +++ b/src/scanner/anchor.rs @@ -65,10 +65,10 @@ impl AnchorKind /// starts from the given .byte pub fn new(byte: &u8) -> Option { - let s = match byte + let s = match *byte { - &ALIAS => Self::Alias, - &ANCHOR => Self::Anchor, + ALIAS => Self::Alias, + ANCHOR => Self::Anchor, _ => return None, }; diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index b19936b..4e5b22f 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -196,7 +196,7 @@ impl Scanner _ if self.is_plain_scalar(*base) => self.fetch_plain_scalar(opts, base, tokens), // Otherwise its an error - _ => return Err(ScanError::UnknownDelimiter), + _ => Err(ScanError::UnknownDelimiter), } } -- 2.43.5 From 998c86c1e4d1e2518637a03cf6634cdbb91742e2 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 18:28:03 +0000 Subject: [PATCH 15/19] lib/scanner: save any changes that may occur after a ScanError::Extend before there was a subtle error 
when eating whitespace wherein the whitespace could be eaten twice, which corrupts the Scanner.stats. Now we ensure that any movement is captured before returning the error to the caller. --- src/scanner/mod.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 4e5b22f..f7cb395 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -83,9 +83,14 @@ impl Scanner { if let Some(mut buffer) = base.get(self.offset..) { - self.scan_next_token(opts, &mut buffer, tokens)?; + let run = self.scan_next_token(opts, &mut buffer, tokens); - self.offset = base.len() - buffer.len(); + if matches!(run, Err(ScanError::Extend) | Ok(_)) + { + self.offset = base.len() - buffer.len(); + } + + run?; num_tokens = tokens.len() - starting_tokens; } -- 2.43.5 From 2272f1d9b1413f0de5ea958096d16b65a02aed35 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 18:53:19 +0000 Subject: [PATCH 16/19] lib/scanner: place state mutation after any O_EXTENDABLE events This simply prevents state corruption in the Scanner by waiting to make the changes until _after_ any errors would have been returned. While this works, it's not immediately obvious in the code why the operations are ordered the way they are. I should probably document this.
--- src/scanner/mod.rs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index f7cb395..86fa5d2 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -290,11 +290,8 @@ impl Scanner return Ok(()); } - // Reset indent to starting level - self.unroll_indent(tokens, STARTING_INDENT)?; - - // Reset saved key - self.remove_saved_key()?; + // Ensure we can read the 'YAML' or 'TAG' identifiers + cache!(~buffer, @1, 4, opts)?; // Safety: we check above that we have len >= 1 (e.g a '%') // @@ -310,6 +307,12 @@ impl Scanner // Scan the directive token from the .buffer let token = scan_directive(opts, &mut buffer, &mut stats, &kind)?; + // Reset indent to starting level + self.unroll_indent(tokens, STARTING_INDENT)?; + + // Reset saved key + self.remove_saved_key()?; + // A key cannot follow a directive (a newline is required) self.simple_key_allowed = false; @@ -339,11 +342,11 @@ impl Scanner return Ok(()); } - self.save_key(!REQUIRED)?; - let (token, amt) = scan_node_tag(opts, buffer, &mut stats)?; advance!(buffer, amt); + self.save_key(!REQUIRED)?; + // A key may not start after a tag (only before) self.simple_key_allowed = false; @@ -379,12 +382,12 @@ impl Scanner _ => return Ok(()), }; - // An anchor / alias may start a simple key - self.save_key(!REQUIRED)?; - // Scan the token from the .buffer let token = scan_anchor(opts, &mut buffer, &mut stats, &kind)?; + // An anchor / alias may start a simple key + self.save_key(!REQUIRED)?; + // A key may not start after an anchor (only before) self.simple_key_allowed = false; @@ -415,11 +418,11 @@ impl Scanner return Ok(()); } - self.save_key(!REQUIRED)?; - let (range, amt) = scan_flow_scalar(opts, buffer, &mut stats, single)?; let token = range.into_token(buffer)?; + self.save_key(!REQUIRED)?; + // A key cannot follow a flow scalar, as we're either // currently in a key (which should be followed by a // value), or a value which needs 
a separator (e.g line @@ -444,10 +447,10 @@ impl Scanner let buffer = *base; let mut stats = self.stats.clone(); - self.save_key(!REQUIRED)?; - let (token, amt) = scan_plain_scalar(opts, buffer, &mut stats, &self.context)?; + self.save_key(!REQUIRED)?; + // A simple key cannot follow a plain scalar, there must be // an indicator or new line before a key is valid // again. -- 2.43.5 From 0c58500a9b0786440245947641166704e8e949a8 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 18:55:54 +0000 Subject: [PATCH 17/19] Cargo: dev-dependencies.cfg-if = 1 --- Cargo.lock | 7 +++++++ Cargo.toml | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 71fbb85..59f358a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,6 +36,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "ctor" version = "0.1.20" @@ -146,5 +152,6 @@ dependencies = [ "anyhow", "atoi", "bitflags", + "cfg-if", "pretty_assertions", ] diff --git a/Cargo.toml b/Cargo.toml index 4cc4d1b..9808e35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,13 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +# PRIVATE! FOR USE IN TEST ONLY! 
+test_buffer = [] +test_buffer_large = ["test_buffer"] +test_buffer_medium = ["test_buffer"] +test_buffer_small = ["test_buffer"] + [dependencies] atoi = "0.4" bitflags = "1" @@ -13,3 +20,4 @@ bitflags = "1" [dev-dependencies] anyhow = "1" pretty_assertions = "0.7" +cfg-if = "1" -- 2.43.5 From 0e5a0f79494d20496336ce9e68fecf7de074a592 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 18:56:55 +0000 Subject: [PATCH 18/19] lib/scanner: add feature-gated test harness for tokens! In essence, this allows us to test the Scanner's ability to handle chunked byte streams, hooking directly into the existing test suite. It has three levels: large, medium and small, where large is probably the smallest buffer size + increment that could be considered reasonable (4k/64), with the smaller two testing absurd buffers (8/8 and 1/1). --- src/scanner/mod.rs | 72 +++++++++++++++++++++++++++++---- src/scanner/tests/str_reader.rs | 65 +++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 7 deletions(-) create mode 100644 src/scanner/tests/str_reader.rs diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 86fa5d2..468cd84 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -1121,12 +1121,22 @@ mod tests mod tag; mod whitespace; + #[cfg(feature = "test_buffer")] + mod str_reader; + use super::*; use crate::token::{ScalarStyle::*, Token::*}; struct ScanIter<'de> { - data: &'de str, + #[cfg(feature = "test_buffer")] + data: str_reader::StrReader<'de>, + #[cfg(feature = "test_buffer")] + opts: Flags, + + #[cfg(not(feature = "test_buffer"))] + data: &'de str, + scan: Scanner, tokens: Tokens<'de>, @@ -1138,7 +1148,14 @@ mod tests pub fn new(data: &'de str) -> Self { Self { + #[cfg(feature = "test_buffer")] + data: str_reader::StrReader::new(data, str_reader::StrReader::BUF_SIZE), + #[cfg(feature = "test_buffer")] + opts: O_ZEROED | O_EXTENDABLE, + + #[cfg(not(feature = "test_buffer"))] + data, + scan: Scanner::new(), tokens: Tokens::new(), done: false, @@ 
-1149,12 +1166,7 @@ mod tests { if (!self.done) && self.tokens.is_empty() { - if let 0 = self - .scan - .scan_tokens(O_ZEROED, self.data, &mut self.tokens)? - { - self.done = true - } + self.get_next_token()?; } if !self.done @@ -1166,6 +1178,52 @@ mod tests Ok(None) } } + + #[cfg(feature = "test_buffer")] + fn get_next_token(&mut self) -> Result<()> + { + let count = loop + { + match self + .scan + .scan_tokens(self.opts, self.data.read(), &mut self.tokens) + { + Ok(count) => break count, + Err(e) if e == ScanError::Extend => + { + self.data.expand(str_reader::StrReader::BUF_EXTEND); + + if !self.data.expandable() + { + self.opts.remove(O_EXTENDABLE) + } + + continue; + }, + Err(e) => return Err(e), + }; + }; + + if count == 0 + { + self.done = true + } + + Ok(()) + } + + #[cfg(not(feature = "test_buffer"))] + fn get_next_token(&mut self) -> Result<()> + { + if let 0 = self + .scan + .scan_tokens(O_ZEROED, self.data, &mut self.tokens)? + { + self.done = true + } + + Ok(()) + } } impl<'de> Iterator for ScanIter<'de> diff --git a/src/scanner/tests/str_reader.rs b/src/scanner/tests/str_reader.rs new file mode 100644 index 0000000..1e3376a --- /dev/null +++ b/src/scanner/tests/str_reader.rs @@ -0,0 +1,65 @@ +use cfg_if::cfg_if; + +#[derive(Debug, Clone)] +pub(super) struct StrReader<'de> +{ + s: &'de str, + size: usize, +} + +impl<'de> StrReader<'de> +{ + cfg_if! 
{ + if #[cfg(feature = "test_buffer_large")] + { + pub const BUF_SIZE: usize = 4 * 1024; + pub const BUF_EXTEND: usize = 64; + } + else if #[cfg(feature = "test_buffer_medium")] + { + pub const BUF_SIZE: usize = 8; + pub const BUF_EXTEND: usize = 8; + } + else if #[cfg(feature = "test_buffer_small")] + { + pub const BUF_SIZE: usize = 1; + pub const BUF_EXTEND: usize = 1; + } + } + + pub fn new(s: &'de str, size: usize) -> Self + { + let size = std::cmp::min(s.len(), size); + + Self { s, size } + } + + pub fn read(&self) -> &'de str + { + &self.s[..self.size] + } + + pub fn expand(&mut self, size: usize) + { + let new = self.size + size; + + match self.s.len() > new + { + true => self.size = new, + false => self.size = self.s.len(), + } + } + + pub fn expandable(&self) -> bool + { + self.size < self.s.len() + } +} + +impl std::fmt::Display for StrReader<'_> +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result + { + self.s.fmt(f) + } +} -- 2.43.5 From c79bcc8114db858c77a0a58fe177c6fecd11d492 Mon Sep 17 00:00:00 2001 From: Bazaah Date: Thu, 9 Sep 2021 19:11:22 +0000 Subject: [PATCH 19/19] ci/prtasks: add matrix test for feature.test_buffer_small --- .github/workflows/pr.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 48674c3..b4113df 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,6 +23,9 @@ jobs: - name: "Unit Tests" cmd: test args: --lib --bins + - name: "Unit Tests: feature.test_buffer_small" + cmd: test + args: --lib --features=test_buffer_small include: - os: ubuntu-latest sccache-path: /home/runner/.cache/sccache -- 2.43.5