Feature/directive/version #6

Closed
bazaah wants to merge 11 commits from feature/directive/version into master
7 changed files with 518 additions and 40 deletions

25
Cargo.lock generated
View File

@ -15,6 +15,21 @@ version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b"
[[package]]
name = "atoi"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "616896e05fc0e2649463a93a15183c6a16bf03413a7af88ef1285ddedfa9cda5"
dependencies = [
"num-traits",
]
[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "ctor"
version = "0.1.20"
@ -31,6 +46,15 @@ version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
[[package]]
name = "num-traits"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
]
[[package]]
name = "output_vt100"
version = "0.1.2"
@ -114,5 +138,6 @@ name = "yary"
version = "0.1.0"
dependencies = [
"anyhow",
"atoi",
"pretty_assertions",
]

View File

@ -9,5 +9,7 @@ edition = "2018"
[dependencies]
anyhow = "1"
atoi = "0.4"
[dev-dependencies]
pretty_assertions = "0.7"

43
src/scanner/error.rs Normal file
View File

@ -0,0 +1,43 @@
use std::fmt;
/// Result alias used by the scanner; the error side is always a
/// [`ScanError`].
pub type ScanResult<T> = std::result::Result<T, ScanError>;

/// Errors that can be reported while scanning a stream into tokens.
#[derive(Debug, PartialEq, Eq)]
pub enum ScanError
{
    /// Directive was neither YAML nor TAG
    UnknownDirective,

    /// The major version number of a %YAML directive is missing
    ///
    /// ```text
    /// %YAML 1.1
    ///       ^
    /// ```
    MissingMajor,

    /// The minor version number of a %YAML directive is missing
    ///
    /// ```text
    /// %YAML 1.1
    ///         ^
    /// ```
    MissingMinor,

    /// A directive major or minor digit was not 0..=9
    InvalidVersion,

    /// Tag handle was not primary (!), secondary (!!) or
    /// named (!alphanumeric!)
    InvalidTagHandle,

    /// Tag prefix was not separated from the handle by one
    /// or more spaces
    InvalidTagPrefix,

    /// Got end of stream while parsing a token
    UnexpectedEOF,
}

impl fmt::Display for ScanError
{
    /// Writes a short, human readable description of the error.
    ///
    /// `Debug` remains available (via the derive) for diagnostics
    /// that want the variant name itself.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
    {
        // Exhaustive on purpose: adding a variant must force a
        // decision about its user facing message.
        let message = match self
        {
            Self::UnknownDirective => "unknown directive, expected %YAML or %TAG",
            Self::MissingMajor => "missing major version in %YAML directive",
            Self::MissingMinor => "missing minor version in %YAML directive",
            Self::InvalidVersion => "invalid version number in %YAML directive",
            Self::InvalidTagHandle => "invalid tag handle in %TAG directive",
            Self::InvalidTagPrefix => "invalid tag prefix in %TAG directive",
            Self::UnexpectedEOF => "unexpected end of stream while scanning a token",
        };

        f.write_str(message)
    }
}

impl std::error::Error for ScanError {}

63
src/scanner/macros.rs Normal file
View File

@ -0,0 +1,63 @@
/// Moves the head of $buffer forward by $amount, i.e. drops
/// the first $amount bytes (of a `&str`) or elements (of a
/// slice) from its front.
///
/// Forms:
///   advance!(buffer, n)     discard the skipped part
///   advance!(<- buffer, n)  evaluate to the skipped part
///
/// $buffer must be an assignable place, as it is overwritten
/// with the remainder. Splitting is done with `split_at`, so
/// its panics on a bad $amount apply here too.
macro_rules! advance {
    ($buffer:expr, $amount:expr) => {
        $buffer = $buffer.split_at($amount).1
    };
    (<- $buffer:expr, $amount:expr) => {{
        let (head, tail) = $buffer.split_at($amount);
        $buffer = tail;
        head
    }};
}
/// Builds a `Cow` out of the given expression, using
/// whichever `From` conversion `Cow` provides for its type
macro_rules! cow {
    ($from:expr) => {
        ::std::borrow::Cow::from($from)
    };
}
/// Check whether the first byte of $buffer matches (`is`) or
/// does not match (`not`) any of the given $byte patterns.
///
/// Forms:
///   check!(buf, is b'a' | b'b')           evaluates to a bool
///   check!(buf, not b'a')                 evaluates to a bool
///   check!(buf, @n, is b'a')              as above, but looks at buf[n..]
///   check!(buf, is b'a', else $error)     evaluates to a Result<(), ScanError>
///
/// Without an `else`, an empty slice counts as "no match":
/// `is` evaluates to false and `not` evaluates to true.
///
/// With an `else`, the result is Ok(()) if the check holds and
/// Err($error) if it does not. Note that the error path is
/// special cased to return an UnexpectedEOF if it encounters an
/// empty slice.
///
/// The `@n` offset is applied by slicing (`$buffer[n..]`), so n
/// must not be larger than the buffer's length.
macro_rules! check {
// Positive check: does the slice start with one of $byte?
($buffer:expr, $(@$pos:expr,)? is $( $byte:pat )|+ $(, else $error:expr)? ) => {
{
// Err(..) marks the empty slice (EOF) case, carrying the
// answer to use when no $error was given
let b = match $buffer$([$pos..])? {
[] => Err(false),
$([$byte, ..])|+ => Ok(true),
_ => Ok(false)
};
check!(@priv b $(=> $error)? )
}
};
// Negative check: does the slice start with anything but $byte?
($buffer:expr, $(@$pos:expr,)? not $( $byte:pat )|+ $(, else $error:expr)? ) => {
{
// As above, but with the answers inverted
let b = match $buffer$([$pos..])? {
[] => Err(true),
$([$byte, ..])|+ => Ok(false),
_ => Ok(true)
};
check!(@priv b $(=> $error)? )
}
};
// Internal: no $error given, collapse to the carried bool
(@priv $bool:expr) => {
match $bool {
Ok(b) | Err(b) => b
}
};
// Internal: $error given, convert to a Result, reporting EOF
// in preference to $error
(@priv $bool:expr => $error:expr) => {
match $bool {
Ok(true) => Ok(()),
Ok(false) => Err($error),
Err(_) => Err($crate::scanner::error::ScanError::UnexpectedEOF),
}
}
}

View File

@ -1,3 +1,10 @@
mod error;
#[macro_use]
mod macros;
use atoi::atoi;
use self::error::{ScanError, ScanResult as Result};
use crate::token::{StreamEncoding, Token};
#[derive(Debug)]
@ -17,6 +24,33 @@ impl<'a> Scanner<'a>
}
}
/// Scan the next token out of the buffer, if there is one.
///
/// Candidates are tried in a fixed order: stream start, then
/// (after skipping whitespace and comments) stream end,
/// document markers and finally directives. The first one
/// that produces a token wins; Ok(None) means none did.
fn next_token(&mut self) -> Result<Option<Token<'a>>>
{
    let begin = self.start_stream();
    if begin.is_some()
    {
        return Ok(begin);
    }

    Self::eat_whitespace(&mut self.buffer, true);

    let end = self.stream_end();
    if end.is_some()
    {
        return Ok(end);
    }

    let document = self.document_marker();
    if document.is_some()
    {
        return Ok(document);
    }

    // Directives are the last candidate, so whatever this
    // produces (a token, nothing, or an error) is our result
    self.directive()
}
fn start_stream(&mut self) -> Option<Token<'a>>
{
match self.state
@ -46,9 +80,12 @@ impl<'a> Scanner<'a>
}
}
fn eat_whitespace(&mut self) -> usize
/// Chomp whitespace and optionally comments until we
/// reach the next token, updating buffer[0] to the
/// beginning of the new token
fn eat_whitespace(buffer: &mut &str, comments: bool) -> usize
{
let mut slice = self.buffer.char_indices().peekable();
let mut slice = buffer.bytes().enumerate().peekable();
let mut chomped = None;
let mut chomp_line = false;
@ -57,12 +94,12 @@ impl<'a> Scanner<'a>
match c
{
// Eat spaces
' ' =>
b' ' =>
{},
// If we are starting a comment, chomp the entire line
'#' => chomp_line = true,
b'#' if comments => chomp_line = true,
// Reset line chomp after eating one
'\n' => chomp_line = false,
b'\n' => chomp_line = false,
// Chomp anything if we're eating the whole line
_ if chomp_line =>
{},
@ -78,7 +115,7 @@ impl<'a> Scanner<'a>
// Adjust our buffer by the chomped length
if let Some(index) = chomped
{
self.buffer = split_at(self.buffer, index)
advance!(*buffer, index);
}
// Handle EOF
@ -87,8 +124,8 @@ impl<'a> Scanner<'a>
// chomped in the while loop
if slice.peek().is_none()
{
chomped = self.buffer.len().into();
self.buffer = ""
chomped = buffer.len().into();
*buffer = ""
}
chomped.unwrap_or(0)
@ -98,13 +135,13 @@ impl<'a> Scanner<'a>
{
if self.buffer.starts_with("---")
{
self.buffer = split_at(self.buffer, 3);
advance!(self.buffer, 3);
Token::DocumentStart.into()
}
else if self.buffer.starts_with("...")
{
self.buffer = split_at(self.buffer, 3);
advance!(self.buffer, 3);
Token::DocumentEnd.into()
}
@ -113,39 +150,176 @@ impl<'a> Scanner<'a>
None
}
}
}
#[inline(always)]
fn split_at(b: &str, at: usize) -> &str
{
let (_, rest) = b.split_at(at);
rest
/// Scan a %YAML or %TAG directive from the head of the buffer.
///
/// Returns Ok(None) if the buffer is empty or does not start
/// with a '%'. On success returns the directive's token and
/// moves self.buffer past the directive; anything after it
/// on the line (e.g. a comment) is left in the buffer.
///
/// Scanning happens on a local copy of the buffer, which is
/// only written back at the very end, so self.buffer is left
/// untouched when an error is returned.
///
/// Errors:
///   UnknownDirective  the '%' is not followed by YAML or TAG
///   InvalidVersion    a version number or the '.' between them is malformed
///   InvalidTagHandle  the tag handle is malformed
///   InvalidTagPrefix  the tag prefix is missing, or not delimited by whitespace
///   UnexpectedEOF     the buffer ended in the middle of the directive
fn directive(&mut self) -> Result<Option<Token<'a>>>
{
// Work on a copy, self.buffer is only advanced on success
let mut buffer = self.buffer;
// Note that an empty buffer also counts as "not a '%'" here
if check!(buffer.as_bytes(), not b'%')
{
return Ok(None);
}
// Invariant: we checked above that we have len >= 1 (i.e a
// '%'), so slicing from 1 cannot go out of bounds
//
// %YAML 1.1
//  ^^^^
// %TAG
//  ^^^
let kind = DirectiveKind::new(&buffer[1..])?;
// '%' + 'YAML' or 'TAG'
advance!(buffer, 1 + kind.len());
let token = match kind
{
DirectiveKind::Version =>
{
// Chomp any preceding whitespace
Self::eat_whitespace(&mut buffer, false);
// %YAML 1.1
//       ^
let (major, skip) = scan_directive_version(buffer)?;
advance!(buffer, skip);
// %YAML 1.1
//        ^
// The '.' is mandatory, skip over it or bail
match buffer.as_bytes()
{
[b'.', ..] =>
{
advance!(buffer, 1);
Ok(())
},
[] => Err(ScanError::UnexpectedEOF),
_ => Err(ScanError::InvalidVersion),
}?;
// %YAML 1.1
//         ^
let (minor, skip) = scan_directive_version(buffer)?;
advance!(buffer, skip);
Token::VersionDirective(major, minor)
},
DirectiveKind::Tag =>
{
// Number of '!' that are part of the handle (1 or 2)
let mut markers = 0;
// Chomp any spaces up to the handle
Self::eat_whitespace(&mut buffer, false);
// %TAG !handle! tag-prefix # a comment \n
//      ^
check!(buffer.as_bytes(), is b'!', else ScanError::InvalidTagHandle)?;
markers += 1;
// %TAG !handle! tag-prefix # a comment \n
//       ^^^^^^
// Invariant: we just proved above we have >= 1 byte ('!'),
// so slicing from 1 cannot go out of bounds
let name = take_while(buffer[1..].as_bytes(), u8::is_ascii_alphanumeric);
// Look at the byte directly after the '!' and the name
match buffer.as_bytes().get(markers + name.len())
{
// %TAG !! tag-prefix # a comment \n
//       ^
// Either a secondary handle (!!) or named (!:alphanumeric:!)
Some(b'!') => markers += 1,
// %TAG ! tag-prefix # a comment \n
//       ^
// If no name, and no second ! this is a primary handle
_ if name.is_empty() =>
{},
// Otherwise its an error
Some(_) => Err(ScanError::InvalidTagHandle)?,
None => Err(ScanError::UnexpectedEOF)?,
}
// The handle includes its '!' marker(s)
let handle = advance!(<- buffer, markers + name.len());
// Check that there is >= 1 whitespace between handle and
// prefix
check!(buffer.as_bytes(), is b' ', else ScanError::InvalidTagPrefix)?;
Self::eat_whitespace(&mut buffer, false);
// %TAG !named! :tag:prefix # a comment\n
//              ^^^^^^^^^^^
// An empty prefix is an error
let prefix = match scan_directive_tag_prefix(buffer.as_bytes())
{
[] => Err(ScanError::InvalidTagPrefix)?,
prefix @ [..] => prefix,
};
// Re-take the prefix from the str buffer (rather than the
// bytes scanned above), advancing past it
let prefix = advance!(<- buffer, prefix.len());
// %TAG !named! tag-prefix # a comment\n
//                        ^
// Check there is whitespace or a newline after the tag
check!(buffer.as_bytes(), is b' ' | b'\n', else ScanError::InvalidTagPrefix)?;
Token::TagDirective(cow!(handle), cow!(prefix))
},
};
// Commit: advance the real buffer by what we consumed locally
//
// %YAML 1.1 # some comment\n
//          ^^^^^^^^^^^^^^^^^ buffer
// ^^^^^^^^^ self.buffer.len - buffer.len
advance!(self.buffer, self.buffer.len() - buffer.len());
Ok(Some(token))
}
}
impl<'a> Iterator for Scanner<'a>
{
type Item = Token<'a>;
type Item = Result<Token<'a>>;
fn next(&mut self) -> Option<Self::Item>
{
if let Some(begin) = self.start_stream()
self.next_token().transpose()
}
}
/// The kinds of directive that can follow a '%'
enum DirectiveKind
{
/// A %YAML (version) directive
Version,
/// A %TAG directive
Tag,
}
impl DirectiveKind
{
const V_LEN: usize = 4;
const T_LEN: usize = 3;
fn new(b: &str) -> Result<Self>
{
if b.starts_with("YAML")
{
return Some(begin);
Ok(Self::Version)
}
self.eat_whitespace();
if let Some(end) = self.stream_end()
else if b.starts_with("TAG")
{
return Some(end);
Ok(Self::Tag)
}
if let Some(document) = self.document_marker()
else
{
return Some(document);
Err(ScanError::UnknownDirective)
}
}
None
fn len(&self) -> usize
{
match self
{
Self::Version => Self::V_LEN,
Self::Tag => Self::T_LEN,
}
}
}
@ -157,6 +331,50 @@ enum StreamState
Done,
}
/// Returns the leading run of bytes in .b that are allowed
/// in a %TAG directive's prefix; empty if .b does not start
/// with such a byte
fn scan_directive_tag_prefix(b: &[u8]) -> &[u8]
{
    let prefix = take_while(b, valid_in_tag_prefix);

    prefix
}
/// Is the byte .b allowed to appear in a %TAG directive's
/// prefix?
///
/// Panics if .b is a '%', as url escapes are not handled yet.
fn valid_in_tag_prefix(b: &u8) -> bool
{
    // Every non alphanumeric byte that may appear in a prefix:
    // !, $, &, ', (, ), *, +, ',', -, ., /, :, ;, =, ?, @, _, ~
    const PUNCTUATION: &[u8] = b"!$&'()*+,-./:;=?@_~";

    let byte = *b;

    assert_ne!(byte, b'%', "FIXME: url escape decode not implemented yet!");

    byte.is_ascii_alphanumeric() || PUNCTUATION.contains(&byte)
}
/// Parse the run of ASCII digits at the start of .b as a
/// version number, returning it together with the number of
/// bytes the digits occupied.
///
/// Returns InvalidVersion if the digits cannot be parsed.
fn scan_directive_version(b: &str) -> Result<(u8, usize)>
{
    let digits = take_while(b.as_bytes(), u8::is_ascii_digit);

    match atoi(digits)
    {
        Some(version) => Ok((version, digits.len())),
        None => Err(ScanError::InvalidVersion),
    }
}
/// Returns the longest prefix of .b in which every byte
/// satisfies .f; empty if the first byte does not (or .b is
/// itself empty)
fn take_while<F>(b: &[u8], f: F) -> &[u8]
where
    F: Fn(&u8) -> bool,
{
    // Index of the first byte that fails .f, or the slice's
    // end if none do
    let end = b.iter().position(|byte| !f(byte)).unwrap_or(b.len());

    &b[..end]
}
#[cfg(test)]
mod tests
{
@ -222,4 +440,120 @@ mod tests
@ None => "expected stream to be finished"
);
}
/// A %YAML directive with a trailing comment scans to a
/// (1, 1) version token
#[test]
fn directive_version()
{
    let mut scanner = Scanner::new("%YAML 1.1 # a comment\n");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        | Token::VersionDirective(1, 1) => "expected version directive (1, 1)",
        | Token::StreamEnd => "expected end of stream",
        @ None => "expected stream to be finished"
    );
}
/// Multi digit major and minor numbers are scanned in full
#[test]
fn directive_version_large()
{
    let mut scanner = Scanner::new("%YAML 121.80 # a comment\n");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        | Token::VersionDirective(121, 80) => "expected version directive (121, 80)",
        | Token::StreamEnd => "expected end of stream",
        @ None => "expected stream to be finished"
    );
}
/// Non numeric version components are reported as an
/// InvalidVersion error
#[test]
fn directive_version_invalid()
{
    let mut scanner = Scanner::new("%YAML foo.bar # a comment\n");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        > Result::<Token>::Err(ScanError::InvalidVersion) => "expected an version directive error"
    );
}
/// A named handle (!name!) is scanned including both of its
/// '!' markers
#[test]
fn directive_tag_named()
{
    let mut scanner = Scanner::new("%TAG !named! my:cool:tag # a comment\n");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        | Token::TagDirective(cow!("!named!"), cow!("my:cool:tag")) => "expected named tag directive",
        | Token::StreamEnd => "expected end of stream",
        @ None => "expected stream to be finished"
    );
}
/// A lone '!' is scanned as the primary handle
#[test]
fn directive_tag_primary()
{
    let mut scanner = Scanner::new("%TAG ! my:cool:tag\n");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        | Token::TagDirective(cow!("!"), cow!("my:cool:tag")) => "expected primary tag directive",
        | Token::StreamEnd => "expected end of stream",
        @ None => "expected stream to be finished"
    );
}
/// A '!!' is scanned as the secondary handle, and
/// punctuation is accepted in the prefix
#[test]
fn directive_tag_secondary()
{
    let mut scanner = Scanner::new("%TAG !! @my/crazy&tag: \n");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        | Token::TagDirective(cow!("!!"), cow!("@my/crazy&tag:")) => "expected secondary tag directive",
        | Token::StreamEnd => "expected end of stream",
        @ None => "expected stream to be finished"
    );
}
/// A tag prefix that runs straight into the end of the
/// stream, with no whitespace after it, is an
/// UnexpectedEOF error
#[test]
fn directive_tag_ending_ws()
{
    let mut scanner = Scanner::new("%TAG !! @my/crazy&tag:");

    tokens!(scanner =>
        | Token::StreamStart(StreamEncoding::UTF8) => "expected start of stream",
        > Result::<Token>::Err(ScanError::UnexpectedEOF) => "expected an eof error"
    );
}
/// Leading whitespace is removed from the buffer
#[test]
fn eat_whitespace()
{
    let mut scanner = Scanner::new(" abc");

    Scanner::eat_whitespace(&mut scanner.buffer, false);

    assert_eq!(scanner.buffer, "abc");
}
/// A buffer without leading whitespace is left as is
#[test]
fn eat_whitespace_none()
{
    let mut scanner = Scanner::new("abc");

    Scanner::eat_whitespace(&mut scanner.buffer, false);

    assert_eq!(scanner.buffer, "abc");
}
}

View File

@ -1,9 +1,10 @@
/// Macro for asserting token streams
/// Used as: events!(Scanner => <sigil> <expected> [=> <message>] [, ..])
/// Where:
/// <sigil> '|' for a Token, or '@' for an Option<Token>
/// <expected> Either Token or Option<Token>
/// <message> A message to print on failure
/// Used as: tokens!(Scanner => <sigil> <expected> [=> <message>] [, ..])
/// Where:
/// <sigil> '|' for a Token, '@' for an Option<Token>, or '>' for a Result<Token>
/// <expected> Either a Token, an Option<Token> or a Result<Token>, matching the sigil
/// <message> A message to print on failure
macro_rules! tokens {
($scanner:expr => $($id:tt $expected:expr $(=> $msg:tt)?),+ ) => {
let mut f = || -> std::result::Result<(), ::anyhow::Error> {
@ -25,7 +26,15 @@ macro_rules! tokens {
};
// Variant for option assert
(@unwrap @ $scanner:expr => $expected:expr $(=> $msg:tt)? ) => {
assert_eq!($scanner.next(), $expected $(, $msg)? )
assert_eq!($scanner.next().transpose()?, $expected $(, $msg)? )
};
(@unwrap > $scanner:expr => $expected:expr $(=> $msg:tt)? ) => {
let event = $scanner
.next()
.ok_or_else(
|| anyhow::anyhow!("Unexpected end of tokens, was expecting: {:?} ~{}", $expected, $scanner.buffer)
)?;
assert_eq!(event, $expected $(, $msg)? )
};
// Forward to option assert any unknown sigils
(@unwrap $any:tt $scanner:expr => $expected:expr $(=> $msg:tt)? ) => {
@ -35,19 +44,21 @@ macro_rules! tokens {
(@token $scanner:expr => $expected:expr) => {
let event = $scanner
.next()
.map(|r| r.map_err(|e| anyhow::anyhow!("{} ~{}", e, $scanner.buffer)))
.ok_or_else(
|| anyhow::anyhow!("Unexpected end of tokens, was expecting: {:?} ~{}", $expected, $scanner.buffer)
)?;
)??;
assert_eq!(event, $expected)
};
// Variant for token assert, no with message
// Variant for token assert, with message
(@token $scanner:expr => $expected:expr, $msg:tt) => {
let event = $scanner
.next()
.map(|r| r.map_err(|e| anyhow::anyhow!("{} ~{}", e, $scanner.buffer)))
.ok_or_else(
|| anyhow::anyhow!("Unexpected end of tokens, {}: {:?} ~{}", $msg, $expected, $scanner.buffer)
)?;
)??;
assert_eq!(event, $expected, $msg)
};

View File

@ -9,10 +9,10 @@ pub enum Token<'a>
StreamStart(StreamEncoding),
/// The stream's end [virtual]
StreamEnd,
/// The %YAML directive
VersionDirective(Slice<'a>, Slice<'a>),
/// The %YAML directive, (major,minor)
VersionDirective(u8, u8),
/// The %TAG directive
TagDirective,
TagDirective(Slice<'a>, Slice<'a>),
/// A ---
DocumentStart,
/// A ...