From 3f5d49819fd75b5ee9d8ef7d9470ec498c0c2947 Mon Sep 17 00:00:00 2001
From: KrosFire
Date: Wed, 28 Aug 2024 18:55:29 +0200
Subject: [PATCH 1/4] FEAT - Created static Lexer and feature flags

---
 Cargo.toml                           |  11 +
 README.md                            |  10 +
 src/compiling/compiler.rs            |   3 +-
 src/compiling/lexing/lexer.rs        | 174 ++++-----
 src/compiling/lexing/lexer_static.rs | 511 +++++++++++++++++++++++++++
 src/compiling/lexing/mod.rs          |  24 +-
 src/compiling/mod.rs                 |   3 +
 7 files changed, 631 insertions(+), 105 deletions(-)
 create mode 100644 src/compiling/lexing/lexer_static.rs

diff --git a/Cargo.toml b/Cargo.toml
index 3fa20a4..84f604f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,17 @@ authors = ["pawel.karas@icloud.com"]
 keywords = ["heraclitus", "compiler", "parser"]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[features]
+default = ["compiler"]
+
+# Include the compiler and lexer
+compiler = ["lexer_dynamic"]
+
+# Include the lexer
+lexer_dynamic = []
+
+# Include the static lexer
+lexer_static = []
 
 [dependencies]
 colored = "2.0.0"
diff --git a/README.md b/README.md
index f97b8e8..2a411b1 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,16 @@ let cc = Compiler::new("HerbScript", rules);
 let tokens = cc.tokenize()?;
 ```
 
+## Features
+
+You can enable specific features of Heraclitus. The available options are:
+
+- `compiler` - Includes Compiler and dynamic Lexer
+- `lexer_dynamic` - Includes just the dynamic Lexer
+- `lexer_static` - Includes just the static Lexer
+
+The difference between the `static` and `dynamic` Lexer is that the `static` Lexer doesn't mutate its own state while tokenizing the input.
+
 # Change log 🚀
 
 ## Version 1.7.8
diff --git a/src/compiling/compiler.rs b/src/compiling/compiler.rs
index 43c434e..a690e70 100644
--- a/src/compiling/compiler.rs
+++ b/src/compiling/compiler.rs
@@ -2,11 +2,12 @@ use capitalize::Capitalize;
 use std::fs::File;
 use std::io::prelude::*;
 use crate::compiling_rules::Rules;
-use crate::compiling::{Token, Lexer, LexerError, LexerErrorType, Metadata, SyntaxModule};
+use crate::compiling::{Token, LexerError, LexerErrorType, Metadata, SyntaxModule};
 use crate::compiling::failing::message::Message;
 use crate::compiling::failing::failure::Failure;
 use crate::error_pos;
 
+use super::lexer::Lexer;
 
 /// How do you want to separate expressions?
 ///
diff --git a/src/compiling/lexing/lexer.rs b/src/compiling/lexing/lexer.rs
index e22da83..2fe4a53 100644
--- a/src/compiling/lexing/lexer.rs
+++ b/src/compiling/lexing/lexer.rs
@@ -1,25 +1,18 @@
-use crate::compiling::{ Compiler, Token, SeparatorMode, ScopingMode };
+//! Dynamic Lexer
+//!
+//! This module contains the dynamic lexer that is used to tokenize the source code.
+
 use super::compound_handler::{CompoundHandler, CompoundReaction};
-use super::region_handler::{ RegionHandler, RegionReaction };
 use super::reader::Reader;
+use super::region_handler::{RegionHandler, RegionReaction};
+use super::{LexerError, LexerErrorType};
 use crate::compiling::failing::position_info::PositionInfo;
+use crate::compiling::{Compiler, ScopingMode, SeparatorMode, Token};
 
 // This is just an estimation of token amount
 // inside of a typical 200-lined file. 
const AVG_TOKEN_AMOUNT: usize = 1024; -/// Lexer's error type -#[derive(Debug)] -pub enum LexerErrorType { - /// Unspillable region has been spilled - Singleline, - /// Given region left unclosed - Unclosed -} - -/// Type containing full error of lexer -pub type LexerError = (LexerErrorType, PositionInfo); - /// The Lexer /// /// Lexer takes source code in a form of a string and translates it to a list of tokens. @@ -40,7 +33,7 @@ pub struct Lexer<'a> { is_escaped: bool, position: (usize, usize), index: usize, - token_start_index: usize + token_start_index: usize, } impl<'a> Lexer<'a> { @@ -60,7 +53,7 @@ impl<'a> Lexer<'a> { is_escaped: false, position: (0, 0), index: 0, - token_start_index: 0 + token_start_index: 0, } } @@ -79,7 +72,9 @@ impl<'a> Lexer<'a> { }); self.position = (0, 0); String::new() - } else { word } + } else { + word + } } /// Add word that has been completed in previous iteration to the lexem @@ -89,12 +84,13 @@ impl<'a> Lexer<'a> { self.lexem.push(Token { word, pos: self.position, - start: self.token_start_index + start: self.token_start_index, }); self.position = (0, 0); String::new() + } else { + word } - else { word } } /// Add word that has been completed in current iteration to the lexem @@ -104,12 +100,13 @@ impl<'a> Lexer<'a> { self.lexem.push(Token { word, pos: self.position, - start: self.token_start_index + start: self.token_start_index, }); self.position = (0, 0); String::new() + } else { + word } - else { word } } /// Checks whether this is a nontokenizable region @@ -117,8 +114,9 @@ impl<'a> Lexer<'a> { pub fn is_tokenized_region(&self, reaction: &RegionReaction) -> bool { if let Some(region) = self.region.get_region().as_ref() { region.tokenize && *reaction == RegionReaction::Pass + } else { + false } - else { false } } /// Pattern code for adding a symbol @@ -208,7 +206,7 @@ impl<'a> Lexer<'a> { // Normally start a new region word = self.pattern_begin(word, letter); } - }, + } // If the region has been closed // Add the closing region and finish the word RegionReaction::End(tokenize) => { @@ -230,7 +228,11 @@ impl<'a> Lexer<'a> { } } RegionReaction::Pass => { - match self.compound.handle_compound(letter, &self.reader, self.is_tokenized_region(&reaction)) { + match self.compound.handle_compound( + letter, + &self.reader, + self.is_tokenized_region(&reaction), + ) { CompoundReaction::Begin => word = self.pattern_begin(word, letter), CompoundReaction::Keep => self.word_push(&mut word, letter), CompoundReaction::End => word = self.pattern_end(word, letter), @@ -239,7 +241,8 @@ impl<'a> Lexer<'a> { if !self.is_tokenized_region(&reaction) { let region = self.region.get_region().unwrap(); // Flip escaped key - self.is_escaped = (!self.is_escaped && letter == self.escape_symbol) + self.is_escaped = (!self.is_escaped + && letter == self.escape_symbol) .then(|| !self.is_escaped) .unwrap_or(false); // Handle singleline attribute @@ -247,13 +250,12 @@ impl<'a> Lexer<'a> { let pos = self.reader.get_position(); return Err(( LexerErrorType::Singleline, - PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name.clone()) - )) + PositionInfo::at_pos(self.path.clone(), pos, 0) + .data(region.name.clone()), + )); } self.word_push(&mut word, letter); - } - else { - + } else { /******************/ /* Mode modifiers */ /******************/ @@ -278,14 +280,14 @@ impl<'a> Lexer<'a> { is_indenting = false; } } - continue + continue; } } // Skip newline character if we want to manually insert semicolons if let SeparatorMode::Manual = self.separator_mode { if letter == '\n' 
{ word = self.add_word(word); - continue + continue; } } @@ -316,7 +318,7 @@ impl<'a> Lexer<'a> { if let Err((pos, region)) = self.region.is_region_closed(&self.reader) { return Err(( LexerErrorType::Unclosed, - PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name) + PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name), )); } Ok(()) @@ -325,24 +327,22 @@ impl<'a> Lexer<'a> { #[cfg(test)] mod test { - use crate::compiling_rules::{ Region, Rules }; + use crate::compiling::{Compiler, ScopingMode}; + use crate::compiling_rules::{Region, Rules}; use crate::reg; - use crate::compiling::{ Compiler, ScopingMode }; #[test] fn test_lexer_base() { let symbols = vec!['(', ')']; - let regions = reg![ - reg!(string as "String literal" => { - begin: "'", - end: "'" - } => [ - reg!(array as "Array Literal" => { - begin: "[", - end: "]" - }) - ]) - ]; + let regions = reg![reg!(string as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(array as "Array Literal" => { + begin: "[", + end: "]" + }) + ])]; let expected = vec![ ("let".to_string(), 1, 1), ("a".to_string(), 1, 5), @@ -351,7 +351,7 @@ mod test { ("12".to_string(), 1, 10), ("+".to_string(), 1, 13), ("32".to_string(), 1, 15), - (")".to_string(), 1, 17) + (")".to_string(), 1, 17), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("TestScript", rules); @@ -370,18 +370,16 @@ mod test { #[test] fn test_lexer_string_interp() { let symbols = vec!['(', ')']; - let regions = reg![ - reg!(string_literal as "String literal" => { - begin: "'", - end: "'" - } => [ - reg!(string_interp as "String interpolation" => { - begin: "{", - end: "}", - tokenize: true - } ref global) - ]) - ]; + let regions = reg![reg!(string_literal as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(string_interp as "String interpolation" => { + begin: "{", + end: "}", + tokenize: true + } ref global) + ])]; let expected = vec![ ("let".to_string(), 1, 1), ("a".to_string(), 1, 5), @@ -394,7 +392,7 @@ mod test { ("}".to_string(), 1, 30), (" long'".to_string(), 1, 31), ("}".to_string(), 1, 37), - (" 🎉 text'".to_string(), 1, 38) + (" 🎉 text'".to_string(), 1, 38), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("TestScript", rules); @@ -423,16 +421,12 @@ mod test { ("subcondition".to_string(), (2, 8), 21), (":".to_string(), (2, 20), 33), ("\n ".to_string(), (3, 1), 34), - ("pass".to_string(), (3, 9), 43) + ("pass".to_string(), (3, 9), 43), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("Testhon", rules); cc.scoping_mode = ScopingMode::Indent; - cc.load(vec![ - "if condition:", - " if subcondition:", - " pass" - ].join("\n")); + cc.load(vec!["if condition:", " if subcondition:", " pass"].join("\n")); let mut lexer = super::Lexer::new(&cc); let mut result = vec![]; // Simulate lexing @@ -455,15 +449,11 @@ mod test { ("12".to_string(), 1, 11), ("+".to_string(), 2, 1), ("12".to_string(), 3, 1), - (";".to_string(), 3, 3) + (";".to_string(), 3, 3), ]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("Testhon", rules); - cc.load(vec![ - "let age = 12", - "+", - "12;" - ].join("\n")); + cc.load(vec!["let age = 12", "+", "12;"].join("\n")); let mut lexer = super::Lexer::new(&cc); let mut result = vec![]; // Simulate lexing @@ -478,24 +468,14 @@ mod test { #[test] fn test_lexer_multiline_regions() { let symbols = vec![';', '+', '=']; - let regions = reg![ - reg!(string as "String" => { - 
begin: "'", - end: "'" - }) - ]; - let expected = vec![ - ("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1) - ]; + let regions = reg![reg!(string as "String" => { + begin: "'", + end: "'" + })]; + let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("Test", rules); - cc.load(vec![ - "'this", - "is", - "a", - "multiline", - "string'", - ].join("\n")); + cc.load(vec!["'this", "is", "a", "multiline", "string'"].join("\n")); let mut lexer = super::Lexer::new(&cc); let mut result = vec![]; // Simulate lexing @@ -510,20 +490,14 @@ mod test { #[test] fn test_lexer_escaped_regions() { let symbols = vec![';', '+', '=']; - let regions = reg![ - reg!(string as "String" => { - begin: "\"", - end: "\"" - }) - ]; - let expected = vec![ - ("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1) - ]; + let regions = reg![reg!(string as "String" => { + begin: "\"", + end: "\"" + })]; + let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)]; let rules = Rules::new(symbols, vec![], regions); let mut cc: Compiler = Compiler::new("Test", rules); - cc.load(vec![ - "\"this is \\\"escaped\\\" string\"" - ].join("\n")); + cc.load(vec!["\"this is \\\"escaped\\\" string\""].join("\n")); let mut lexer = super::Lexer::new(&cc); let mut result = vec![]; // Simulate lexing diff --git a/src/compiling/lexing/lexer_static.rs b/src/compiling/lexing/lexer_static.rs new file mode 100644 index 0000000..254e6bb --- /dev/null +++ b/src/compiling/lexing/lexer_static.rs @@ -0,0 +1,511 @@ +//! Static Lexer +//! +//! This module contains the static lexer that is used to tokenize the source code + +use crate::{ + compiling_rules::Rules, + prelude::{PositionInfo, ScopingMode, SeparatorMode, Token}, +}; + +use super::{ + compound_handler::{CompoundHandler, CompoundReaction}, + reader::Reader, + region_handler::{RegionHandler, RegionReaction}, + LexerError, LexerErrorType, +}; + +/// Static Lexer +pub struct StaticLexer { + rules: Rules, + /// Path to the lexed file + pub path: Option, + /// Separator mode for this lexer + pub separator_mode: SeparatorMode, + /// Escape symbol for this lexer. 
Default is '\\' + pub escape_symbol: char, + /// Scoping mode for this lexer + pub scoping_mode: ScopingMode, +} + +struct LexState<'a> { + word: String, + is_indenting: bool, + is_escaped: bool, + token_start_index: usize, + position: (usize, usize), + reader: Reader<'a>, + lexem: Vec, + region_handler: RegionHandler, + compound_handler: CompoundHandler, +} + +impl StaticLexer { + /// Create a new Lexer based on the compiler metadata + pub fn new(rules: Rules) -> Self { + StaticLexer { + rules, + path: None, + separator_mode: SeparatorMode::Manual, + escape_symbol: '\\', + scoping_mode: ScopingMode::Block, + } + } + + /// Add indentation to the lexem + #[inline] + fn add_indent(&self, lex_state: &mut LexState) { + if lex_state.word.is_empty() { + return; + } + + // Getting position by word here would attempt to + // substract with overflow since the new line character + // technically belongs to the previous line + let (row, _col) = lex_state.reader.get_position(); + lex_state.lexem.push(Token { + word: lex_state.word.clone(), + pos: (row, 1), + start: lex_state.token_start_index, + }); + lex_state.position = (0, 0); + lex_state.word = String::new(); + } + + /// Add word that has been completed in previous iteration to the lexem + #[inline] + fn add_word(&self, lex_state: &mut LexState) { + if lex_state.word.is_empty() { + return; + } + + lex_state.lexem.push(Token { + word: lex_state.word.clone(), + pos: lex_state.position, + start: lex_state.token_start_index, + }); + lex_state.position = (0, 0); + lex_state.word = String::new(); + } + + /// Add word that has been completed in current iteration to the lexem + #[inline] + fn add_word_inclusively(&self, lex_state: &mut LexState) { + if lex_state.word.is_empty() { + return; + } + + lex_state.lexem.push(Token { + word: lex_state.word.clone(), + pos: lex_state.position, + start: lex_state.token_start_index, + }); + lex_state.position = (0, 0); + lex_state.word = String::new() + } + + /// Checks whether this is a nontokenizable region + #[inline] + fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool { + if let Some(region) = lex_state.region_handler.get_region() { + region.tokenize && *reaction == RegionReaction::Pass + } else { + false + } + } + + /// Pattern code for adding a symbol + /// **[*]** + #[inline] + fn pattern_add_symbol(&self, lex_state: &mut LexState, letter: char) { + self.add_word(lex_state); + + if lex_state.word.is_empty() { + lex_state.token_start_index = lex_state.reader.get_index(); + } + self.word_push(lex_state, letter); + lex_state.position = lex_state.reader.get_position(); + + self.add_word_inclusively(lex_state); + } + + /// Pattern code for beginning a new region + /// **[** + #[inline] + fn pattern_begin(&self, lex_state: &mut LexState, letter: char) { + self.add_word(lex_state); + self.word_push(lex_state, letter); + } + + /// Pattern code for ending current region + /// **]** + #[inline] + fn pattern_end(&self, lex_state: &mut LexState, letter: char) { + self.word_push(lex_state, letter); + self.add_word_inclusively(lex_state); + } + + /// Push letter to the word and set token start index + fn word_push(&self, lex_state: &mut LexState, letter: char) { + if lex_state.word.is_empty() { + lex_state.token_start_index = lex_state.reader.get_index(); + } + lex_state.word.push(letter); + } + + /// Tokenize source code + /// + /// Run lexer and tokenize code. 
The result is stored in the lexem attribute + pub fn tokenize(&self, input: &str) -> Result, LexerError> { + let code = input.to_string(); + + let mut lex_state = LexState { + word: String::new(), + is_indenting: false, + is_escaped: false, + token_start_index: 0, + position: (0, 0), + lexem: Vec::new(), + reader: Reader::new(&code), + region_handler: RegionHandler::new(&self.rules), + compound_handler: CompoundHandler::new(&self.rules), + }; + + while let Some(letter) = lex_state.reader.next() { + /****************/ + /* Set Position */ + /****************/ + + // If the new position hasn't been set yet, set it + if lex_state.position == (0, 0) { + // If separator mode is set to Manual and the letter is a separator, + // then skip finding a new position + if SeparatorMode::Manual != self.separator_mode || letter != '\n' { + let region = lex_state.region_handler.get_region().unwrap(); + // If the region is tokenized, then check if the letter is a separator + if !region.tokenize || !vec![' ', '\t'].contains(&letter) { + lex_state.position = lex_state.reader.get_position(); + } + } + } + + // Reaction stores the reaction of the region handler + // Have we just opened or closed some region? + let reaction = lex_state + .region_handler + .handle_region(&lex_state.reader, lex_state.is_escaped); + match reaction { + // If the region has been opened + // Finish the part that we have been parsing + RegionReaction::Begin(tokenize) => { + // Also if the new region is an interpolation that tokenizes + // the inner content - separate the region from the content + if tokenize { + self.pattern_add_symbol(&mut lex_state, letter); + } + // Regular region case + else { + // This is supposed to prevent overshadowing new line + // character if region rule opens with newline + if letter == '\n' { + // This additionally creates a new token + self.pattern_add_symbol(&mut lex_state, letter); + } + // Normally start a new region + self.pattern_begin(&mut lex_state, letter); + } + } + // If the region has been closed + // Add the closing region and finish the word + RegionReaction::End(tokenize) => { + // Also if the new region is an interpolation that tokenizes + // the inner content - separate the region from the content + if tokenize { + self.pattern_add_symbol(&mut lex_state, letter); + } + // Regular region case + else { + // Normally close the region + self.pattern_end(&mut lex_state, letter); + // This is supposed to prevent overshadowing new line + // character if region rule closes with newline + if letter == '\n' { + // This additionally creates a new token + self.pattern_add_symbol(&mut lex_state, letter); + } + } + } + RegionReaction::Pass => { + let is_tokenized_region = self.is_tokenized_region(&reaction, &mut lex_state); + match lex_state.compound_handler.handle_compound( + letter, + &lex_state.reader, + is_tokenized_region, + ) { + CompoundReaction::Begin => self.pattern_begin(&mut lex_state, letter), + CompoundReaction::Keep => self.word_push(&mut lex_state, letter), + CompoundReaction::End => self.pattern_end(&mut lex_state, letter), + CompoundReaction::Pass => { + // Handle region scope + if !self.is_tokenized_region(&reaction, &mut lex_state) { + let region = lex_state.region_handler.get_region().unwrap(); + // Flip escaped key + lex_state.is_escaped = (!lex_state.is_escaped + && letter == self.escape_symbol) + .then(|| !lex_state.is_escaped) + .unwrap_or(false); + // Handle singleline attribute + if letter == '\n' && region.singleline { + let pos = lex_state.reader.get_position(); + return 
Err(( + LexerErrorType::Singleline, + PositionInfo::at_pos(self.path.clone(), pos, 0) + .data(region.name.clone()), + )); + } + self.word_push(&mut lex_state, letter); + } else { + /******************/ + /* Mode modifiers */ + /******************/ + + // Create indent regions: '\n ' + if let ScopingMode::Indent = self.scoping_mode { + // If we are still in the indent region - proceed + if lex_state.is_indenting && vec![' ', '\t'].contains(&letter) { + self.word_push(&mut lex_state, letter); + } + // If it's the new line - start indent region + if letter == '\n' { + lex_state.is_indenting = true; + self.pattern_begin(&mut lex_state, letter); + } + // Check if the current letter + // concludes current indent region + if lex_state.is_indenting { + if let Some(next_char) = lex_state.reader.peek() { + if !vec![' ', '\t'].contains(&next_char) { + self.add_indent(&mut lex_state); + lex_state.is_indenting = false; + } + } + continue; + } + } + // Skip newline character if we want to manually insert semicolons + if let SeparatorMode::Manual = self.separator_mode { + if letter == '\n' { + self.add_word(&mut lex_state); + continue; + } + } + + /*****************/ + /* Regular Lexer */ + /*****************/ + + // Skip whitespace + if vec![' ', '\t'].contains(&letter) { + self.add_word(&mut lex_state); + } + // Handle special symbols + else if self.rules.symbols.contains(&letter) || letter == '\n' { + self.pattern_add_symbol(&mut lex_state, letter); + } + // Handle word + else { + self.word_push(&mut lex_state, letter); + } + } + } + } + } + } + } + self.add_word(&mut lex_state); + // If some region exists that was not closed + if let Err((pos, region)) = lex_state.region_handler.is_region_closed(&lex_state.reader) { + return Err(( + LexerErrorType::Unclosed, + PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name), + )); + } + + Ok(lex_state.lexem) + } +} + +#[cfg(test)] +mod test { + use crate::compiling::ScopingMode; + use crate::compiling_rules::{Region, Rules}; + use crate::reg; + + #[test] + fn test_lexer_base() { + let symbols = vec!['(', ')']; + let regions = reg![reg!(string as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(array as "Array Literal" => { + begin: "[", + end: "]" + }) + ])]; + let expected = vec![ + ("let".to_string(), 1, 1), + ("a".to_string(), 1, 5), + ("=".to_string(), 1, 7), + ("(".to_string(), 1, 9), + ("12".to_string(), 1, 10), + ("+".to_string(), 1, 13), + ("32".to_string(), 1, 15), + (")".to_string(), 1, 17), + ]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize("let a = (12 + 32)"); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_string_interp() { + let symbols = vec!['(', ')']; + let regions = reg![reg!(string_literal as "String literal" => { + begin: "'", + end: "'" + } => [ + reg!(string_interp as "String interpolation" => { + begin: "{", + end: "}", + tokenize: true + } ref global) + ])]; + let expected = vec![ + ("let".to_string(), 1, 1), + ("a".to_string(), 1, 5), + ("=".to_string(), 1, 7), + ("'this ".to_string(), 1, 9), + ("{".to_string(), 1, 15), + ("'is ".to_string(), 1, 16), + ("{".to_string(), 1, 20), + ("adjective".to_string(), 1, 21), + ("}".to_string(), 1, 30), + (" long'".to_string(), 1, 31), + ("}".to_string(), 1, 37), + (" 🎉 text'".to_string(), 1, 38), + ]; + let rules = 
Rules::new(symbols, vec![], regions); + + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize("let a = 'this {'is {adjective} long'} 🎉 text'"); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_indent_scoping_mode() { + let symbols = vec![':']; + let regions = reg![]; + let expected = vec![ + ("if".to_string(), (1, 1), 0), + ("condition".to_string(), (1, 4), 3), + (":".to_string(), (1, 13), 12), + ("\n ".to_string(), (2, 1), 13), + ("if".to_string(), (2, 5), 18), + ("subcondition".to_string(), (2, 8), 21), + (":".to_string(), (2, 20), 33), + ("\n ".to_string(), (3, 1), 34), + ("pass".to_string(), (3, 9), 43), + ]; + let rules = Rules::new(symbols, vec![], regions); + + let mut lexer = super::StaticLexer::new(rules); + lexer.scoping_mode = ScopingMode::Indent; + let mut result = vec![]; + // Simulate lexing + let res = lexer + .tokenize(&vec!["if condition:", " if subcondition:", " pass"].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, (lex.pos.0, lex.pos.1), lex.start)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_manual_separator_mode() { + let symbols = vec![';', '+', '=']; + let regions = reg![]; + let expected = vec![ + ("let".to_string(), 1, 1), + ("age".to_string(), 1, 5), + ("=".to_string(), 1, 9), + ("12".to_string(), 1, 11), + ("+".to_string(), 2, 1), + ("12".to_string(), 3, 1), + (";".to_string(), 3, 3), + ]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize(&vec!["let age = 12", "+", "12;"].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_multiline_regions() { + let symbols = vec![';', '+', '=']; + let regions = reg![reg!(string as "String" => { + begin: "'", + end: "'" + })]; + let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize(&vec!["'this", "is", "a", "multiline", "string'"].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } + + #[test] + fn test_lexer_escaped_regions() { + let symbols = vec![';', '+', '=']; + let regions = reg![reg!(string as "String" => { + begin: "\"", + end: "\"" + })]; + let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)]; + let rules = Rules::new(symbols, vec![], regions); + let lexer = super::StaticLexer::new(rules); + let mut result = vec![]; + // Simulate lexing + let res = lexer.tokenize(&vec!["\"this is \\\"escaped\\\" string\""].join("\n")); + assert!(res.is_ok()); + for lex in res.unwrap() { + result.push((lex.word, lex.pos.0, lex.pos.1)); + } + assert_eq!(expected, result); + } +} diff --git a/src/compiling/lexing/mod.rs b/src/compiling/lexing/mod.rs index 7257ebc..b53f82e 100644 --- a/src/compiling/lexing/mod.rs +++ b/src/compiling/lexing/mod.rs @@ -1,9 +1,25 @@ //! Lexer module -//! +//! //! 
This module holds all the lexer related modules +use crate::prelude::PositionInfo; + mod compound_handler; -mod region_handler; +#[cfg(feature = "lexer_dynamic")] +pub mod lexer; +#[cfg(feature = "lexer_static")] +pub mod lexer_static; mod reader; -mod lexer; -pub use lexer::*; \ No newline at end of file +mod region_handler; + +/// Lexer's error type +#[derive(Debug)] +pub enum LexerErrorType { + /// Unspillable region has been spilled + Singleline, + /// Given region left unclosed + Unclosed, +} + +/// Type containing full error of lexer +pub type LexerError = (LexerErrorType, PositionInfo); diff --git a/src/compiling/mod.rs b/src/compiling/mod.rs index a305a2f..534a476 100644 --- a/src/compiling/mod.rs +++ b/src/compiling/mod.rs @@ -4,12 +4,15 @@ //! that helps you tokenize your code or even parse it entirely. mod lexing; + +#[cfg(feature = "compiler")] mod compiler; mod token; mod parser; pub mod failing; pub use lexing::*; +#[cfg(feature = "compiler")] pub use compiler::*; pub use token::*; pub use parser::*; From c38b650261b98fa6dc4234d9881a79d61d3982d3 Mon Sep 17 00:00:00 2001 From: KrosFire Date: Wed, 28 Aug 2024 19:19:41 +0200 Subject: [PATCH 2/4] Added --all-features flag in ci --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 70a416e..0015078 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -15,6 +15,6 @@ jobs: steps: - uses: actions/checkout@v3 - name: Build - run: cargo build --verbose + run: cargo build --verbose --all-features - name: Run tests - run: cargo test --verbose + run: cargo test --verbose --all-features From dd3920d1bad0585704116bfe42828cc00cda71fc Mon Sep 17 00:00:00 2001 From: KrosFire Date: Thu, 5 Sep 2024 19:42:22 +0200 Subject: [PATCH 3/4] Moved fully to static lexer --- .github/workflows/rust.yml | 4 +- Cargo.toml | 11 - src/compiling/compiler.rs | 27 +- src/compiling/lexing/lexer.rs | 365 +++++++++---------- src/compiling/lexing/lexer_static.rs | 511 --------------------------- src/compiling/lexing/mod.rs | 3 - src/compiling/mod.rs | 2 - 7 files changed, 197 insertions(+), 726 deletions(-) delete mode 100644 src/compiling/lexing/lexer_static.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 0015078..70a416e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -15,6 +15,6 @@ jobs: steps: - uses: actions/checkout@v3 - name: Build - run: cargo build --verbose --all-features + run: cargo build --verbose - name: Run tests - run: cargo test --verbose --all-features + run: cargo test --verbose diff --git a/Cargo.toml b/Cargo.toml index 84f604f..3fa20a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,17 +9,6 @@ authors = ["pawel.karas@icloud.com"] keywords = ["heraclitus", "compiler", "parser"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html -[features] -default = ["compiler"] - -# Include the compiler and lexer -compiler = ["lexer_dynamic"] - -# Include the lexer -lexer_dynamic = [] - -# Include the static lexer -lexer_static = [] [dependencies] colored = "2.0.0" diff --git a/src/compiling/compiler.rs b/src/compiling/compiler.rs index a690e70..f531156 100644 --- a/src/compiling/compiler.rs +++ b/src/compiling/compiler.rs @@ -65,18 +65,14 @@ pub enum ScopingMode { pub struct Compiler { /// Name of your language pub name: String, - /// Rules that describe your language - pub rules: Rules, /// Source code in a form of string pub 
code: Option<String>,
     /// Path to the compiled file if exists
     pub path: Option<String>,
-    /// Separator mode for this compiler
-    pub separator_mode: SeparatorMode,
-    /// Scoping mode for this compiler
-    pub scoping_mode: ScopingMode,
     // Check if user wants to debug parser
-    debug: bool
+    debug: bool,
+    /// Lexer to tokenize the code
+    lexer: Lexer
 }
 
 impl Compiler {
@@ -84,18 +80,21 @@ impl Compiler {
     pub fn new<T: AsRef<str>>(name: T, rules: Rules) -> Self {
         Compiler {
             name: String::from(name.as_ref()),
-            rules,
             code: None,
             path: None,
-            separator_mode: SeparatorMode::Manual,
-            scoping_mode: ScopingMode::Block,
-            debug: false
+            debug: false,
+            lexer: Lexer::new(rules)
         }
     }
 
     /// Set the language to use indentations
     pub fn use_indents(&mut self) {
-        self.scoping_mode = ScopingMode::Indent
+        self.lexer.scoping_mode = ScopingMode::Indent
+    }
+
+    /// Set the language separator mode
+    pub fn set_separator(&mut self, mode: SeparatorMode) {
+        self.lexer.separator_mode = mode
     }
 
     /// Load file from path
@@ -120,9 +119,7 @@ impl Compiler {
 
     /// Run just lexer
     pub fn tokenize(&self) -> Result<Vec<Token>, LexerError> {
-        let mut lexer = Lexer::new(self);
-        lexer.run()?;
-        Ok(lexer.lexem)
+        self.lexer.tokenize(&self.code.clone().unwrap())
     }
 
     /// Parser will display information about the call stack
diff --git a/src/compiling/lexing/lexer.rs b/src/compiling/lexing/lexer.rs
index 2fe4a53..14ffea0 100644
--- a/src/compiling/lexing/lexer.rs
+++ b/src/compiling/lexing/lexer.rs
@@ -1,118 +1,113 @@
-//! Dynamic Lexer
-//!
-//! This module contains the dynamic lexer that is used to tokenize the source code.
-
-use super::compound_handler::{CompoundHandler, CompoundReaction};
-use super::reader::Reader;
-use super::region_handler::{RegionHandler, RegionReaction};
-use super::{LexerError, LexerErrorType};
-use crate::compiling::failing::position_info::PositionInfo;
-use crate::compiling::{Compiler, ScopingMode, SeparatorMode, Token};
-
-// This is just an estimation of token amount
-// inside of a typical 200-lined file.
-const AVG_TOKEN_AMOUNT: usize = 1024;
-
-/// The Lexer
-///
-/// Lexer takes source code in a form of a string and translates it to a list of tokens.
-/// This particular implementation requires additional metadata such as like regions or symbols.
-/// These can be supplied by the `Compiler` in a one cohesive package. Hence the API requires to
-/// pass a reference to the `Compiler`.
-pub struct Lexer<'a> {
-    symbols: Vec<char>,
-    escape_symbol: char,
-    compound: CompoundHandler,
-    region: RegionHandler,
-    reader: Reader<'a>,
-    path: Option<String>,
-    /// This attribute stores parsed tokens by the lexer
-    pub lexem: Vec<Token>,
-    separator_mode: SeparatorMode,
-    scoping_mode: ScopingMode,
+//! Lexer
+//!
+//! This module contains the lexer that is used to tokenize the source code
+
+use crate::{
+    compiling_rules::Rules,
+    prelude::{PositionInfo, ScopingMode, SeparatorMode, Token},
+};
+
+use super::{
+    compound_handler::{CompoundHandler, CompoundReaction},
+    reader::Reader,
+    region_handler::{RegionHandler, RegionReaction},
+    LexerError, LexerErrorType,
+};
+
+/// Lexer
+#[derive(Debug, Clone, PartialEq)]
+pub struct Lexer {
+    rules: Rules,
+    /// Path to the lexed file
+    pub path: Option<String>,
+    /// Separator mode for this lexer
+    pub separator_mode: SeparatorMode,
+    /// Escape symbol for this lexer. Default is '\\'
+    pub escape_symbol: char,
+    /// Scoping mode for this lexer
+    pub scoping_mode: ScopingMode,
+}
+
+struct LexState<'a> {
+    word: String,
+    is_indenting: bool,
     is_escaped: bool,
-    position: (usize, usize),
-    index: usize,
     token_start_index: usize,
+    position: (usize, usize),
+    reader: Reader<'a>,
+    lexem: Vec<Token>,
+    region_handler: RegionHandler,
+    compound_handler: CompoundHandler,
 }
 
-impl<'a> Lexer<'a> {
+impl Lexer {
     /// Create a new Lexer based on the compiler metadata
-    pub fn new(cc: &'a Compiler) -> Self {
-        let code: &'a String = cc.code.as_ref().unwrap();
         Lexer {
-            symbols: cc.rules.symbols.clone(),
-            escape_symbol: cc.rules.escape_symbol,
-            compound: CompoundHandler::new(&cc.rules),
-            region: RegionHandler::new(&cc.rules),
-            reader: Reader::new(code),
-            path: cc.path.clone(),
-            lexem: Vec::with_capacity(AVG_TOKEN_AMOUNT),
-            separator_mode: cc.separator_mode.clone(),
-            scoping_mode: cc.scoping_mode.clone(),
-            is_escaped: false,
-            position: (0, 0),
-            index: 0,
-            token_start_index: 0,
+    pub fn new(rules: Rules) -> Self {
+        Lexer {
+            rules,
+            path: None,
+            separator_mode: SeparatorMode::Manual,
+            escape_symbol: '\\',
+            scoping_mode: ScopingMode::Block,
         }
     }
 
     /// Add indentation to the lexem
     #[inline]
-    fn add_indent(&mut self, word: String) -> String {
-        if !word.is_empty() {
-            // Getting position by word here would attempt to
-            // substract with overflow since the new line character
-            // technically belongs to the previous line
-            let (row, _col) = self.reader.get_position();
-            self.lexem.push(Token {
-                word,
-                pos: (row, 1),
-                start: self.token_start_index,
-            });
-            self.position = (0, 0);
-            String::new()
-        } else {
-            word
+    fn add_indent(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
         }
+
+        // Getting position by word here would attempt to
+        // subtract with overflow since the new line character
+        // technically belongs to the previous line
+        let (row, _col) = lex_state.reader.get_position();
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: (row, 1),
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
     }
 
     /// Add word that has been completed in previous iteration to the lexem
     #[inline]
-    fn add_word(&mut self, word: String) -> String {
-        if !word.is_empty() {
-            self.lexem.push(Token {
-                word,
-                pos: self.position,
-                start: self.token_start_index,
-            });
-            self.position = (0, 0);
-            String::new()
-        } else {
-            word
+    fn add_word(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
         }
+
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: lex_state.position,
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
     }
 
     /// Add word that has been completed in current iteration to the lexem
     #[inline]
-    fn add_word_inclusively(&mut self, word: String) -> String {
-        if !word.is_empty() {
-            self.lexem.push(Token {
-                word,
-                pos: self.position,
-                start: self.token_start_index,
-            });
-            self.position = (0, 0);
-            String::new()
-        } else {
-            word
+    fn add_word_inclusively(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
         }
+
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: lex_state.position,
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new()
     }
 
     /// Checks whether this is a nontokenizable region
     #[inline]
-    pub fn is_tokenized_region(&self, reaction: &RegionReaction) -> bool {
-        if let Some(region) = 
self.region.get_region().as_ref() { + fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool { + if let Some(region) = lex_state.region_handler.get_region() { region.tokenize && *reaction == RegionReaction::Pass } else { false @@ -122,70 +117,83 @@ impl<'a> Lexer<'a> { /// Pattern code for adding a symbol /// **[*]** #[inline] - fn pattern_add_symbol(&mut self, mut word: String, letter: char) -> String { - word = self.add_word(word); - if word.is_empty() { - self.token_start_index = self.index; + fn pattern_add_symbol(&self, lex_state: &mut LexState, letter: char) { + self.add_word(lex_state); + + if lex_state.word.is_empty() { + lex_state.token_start_index = lex_state.reader.get_index(); } - self.word_push(&mut word, letter); - self.position = self.reader.get_position(); - self.add_word_inclusively(word) + self.word_push(lex_state, letter); + lex_state.position = lex_state.reader.get_position(); + + self.add_word_inclusively(lex_state); } /// Pattern code for beginning a new region /// **[** #[inline] - fn pattern_begin(&mut self, mut word: String, letter: char) -> String { - word = self.add_word(word); - self.word_push(&mut word, letter); - word + fn pattern_begin(&self, lex_state: &mut LexState, letter: char) { + self.add_word(lex_state); + self.word_push(lex_state, letter); } /// Pattern code for ending current region /// **]** #[inline] - fn pattern_end(&mut self, mut word: String, letter: char) -> String { - self.word_push(&mut word, letter); - self.add_word_inclusively(word) + fn pattern_end(&self, lex_state: &mut LexState, letter: char) { + self.word_push(lex_state, letter); + self.add_word_inclusively(lex_state); } /// Push letter to the word and set token start index - fn word_push(&mut self, word: &mut String, letter: char) { - if word.is_empty() { - self.token_start_index = self.index; + fn word_push(&self, lex_state: &mut LexState, letter: char) { + if lex_state.word.is_empty() { + lex_state.token_start_index = lex_state.reader.get_index(); } - word.push(letter); + lex_state.word.push(letter); } /// Tokenize source code /// /// Run lexer and tokenize code. 
The result is stored in the lexem attribute
-    pub fn run(&mut self) -> Result<(), LexerError> {
-        let mut word = String::new();
-        let mut is_indenting = false;
-        while let Some(letter) = self.reader.next() {
-            self.index = self.reader.get_index();
+    pub fn tokenize(&self, input: &str) -> Result<Vec<Token>, LexerError> {
+        let code = input.to_string();
+        let mut lex_state = LexState {
+            word: String::new(),
+            is_indenting: false,
+            is_escaped: false,
+            token_start_index: 0,
+            position: (0, 0),
+            lexem: Vec::new(),
+            reader: Reader::new(&code),
+            region_handler: RegionHandler::new(&self.rules),
+            compound_handler: CompoundHandler::new(&self.rules),
+        };
+
+        while let Some(letter) = lex_state.reader.next() {
             /****************/
             /* Set Position */
             /****************/
 
             // If the new position hasn't been set yet, set it
-            if self.position == (0, 0) {
+            if lex_state.position == (0, 0) {
                 // If separator mode is set to Manual and the letter is a separator,
                 // then skip finding a new position
                 if SeparatorMode::Manual != self.separator_mode || letter != '\n' {
-                    let region = self.region.get_region().unwrap();
+                    let region = lex_state.region_handler.get_region().unwrap();
                     // If the region is tokenized, then check if the letter is a separator
                     if !region.tokenize || !vec![' ', '\t'].contains(&letter) {
-                        self.position = self.reader.get_position();
+                        lex_state.position = lex_state.reader.get_position();
                     }
                 }
             }
 
             // Reaction stores the reaction of the region handler
             // Have we just opened or closed some region?
-            let reaction = self.region.handle_region(&self.reader, self.is_escaped);
+            let reaction = lex_state
+                .region_handler
+                .handle_region(&lex_state.reader, lex_state.is_escaped);
             match reaction {
                 // If the region has been opened
                 // Finish the part that we have been parsing
                 RegionReaction::Begin(tokenize) => {
                     // Also if the new region is an interpolation that tokenizes
                     // the inner content - separate the region from the content
                     if tokenize {
-                        word = self.pattern_add_symbol(word, letter);
+                        self.pattern_add_symbol(&mut lex_state, letter);
                     }
                     // Regular region case
                     else {
                         // This is supposed to prevent overshadowing new line
                         // character if region rule opens with newline
                         if letter == '\n' {
                             // This additionally creates a new token
-                            word = self.pattern_add_symbol(word, letter);
+                            self.pattern_add_symbol(&mut lex_state, letter);
                         }
                         // Normally start a new region
-                        word = self.pattern_begin(word, letter);
+                        self.pattern_begin(&mut lex_state, letter);
                     }
                 }
                 // If the region has been closed
                 // Add the closing region and finish the word
                 RegionReaction::End(tokenize) => {
                     // Also if the new region is an interpolation that tokenizes
                     // the inner content - separate the region from the content
                     if tokenize {
-                        word = self.pattern_add_symbol(word, letter);
+                        self.pattern_add_symbol(&mut lex_state, letter);
                     }
                     // Regular region case
                     else {
                         // Normally close the region
-                        word = self.pattern_end(word, letter);
+                        self.pattern_end(&mut lex_state, letter);
                         // This is supposed to prevent overshadowing new line
                         // character if region rule closes with newline
                         if letter == '\n' {
                             // This additionally creates a new token
-                            word = self.pattern_add_symbol(word, letter);
+                            self.pattern_add_symbol(&mut lex_state, letter);
                         }
                     }
                 }
                 RegionReaction::Pass => {
-                    match self.compound.handle_compound(
+                    let is_tokenized_region = self.is_tokenized_region(&reaction, &mut lex_state);
+                    match lex_state.compound_handler.handle_compound(
                         letter,
-                        &self.reader,
-                        self.is_tokenized_region(&reaction),
+                        &lex_state.reader,
+                        is_tokenized_region,
                     ) {
-                        CompoundReaction::Begin => word = self.pattern_begin(word, 
letter), - CompoundReaction::Keep => self.word_push(&mut word, letter), - CompoundReaction::End => word = self.pattern_end(word, letter), + CompoundReaction::Begin => self.pattern_begin(&mut lex_state, letter), + CompoundReaction::Keep => self.word_push(&mut lex_state, letter), + CompoundReaction::End => self.pattern_end(&mut lex_state, letter), CompoundReaction::Pass => { // Handle region scope - if !self.is_tokenized_region(&reaction) { - let region = self.region.get_region().unwrap(); + if !self.is_tokenized_region(&reaction, &mut lex_state) { + let region = lex_state.region_handler.get_region().unwrap(); // Flip escaped key - self.is_escaped = (!self.is_escaped + lex_state.is_escaped = (!lex_state.is_escaped && letter == self.escape_symbol) - .then(|| !self.is_escaped) + .then(|| !lex_state.is_escaped) .unwrap_or(false); // Handle singleline attribute if letter == '\n' && region.singleline { - let pos = self.reader.get_position(); + let pos = lex_state.reader.get_position(); return Err(( LexerErrorType::Singleline, PositionInfo::at_pos(self.path.clone(), pos, 0) .data(region.name.clone()), )); } - self.word_push(&mut word, letter); + self.word_push(&mut lex_state, letter); } else { /******************/ /* Mode modifiers */ @@ -263,21 +272,21 @@ impl<'a> Lexer<'a> { // Create indent regions: '\n ' if let ScopingMode::Indent = self.scoping_mode { // If we are still in the indent region - proceed - if is_indenting && vec![' ', '\t'].contains(&letter) { - self.word_push(&mut word, letter); + if lex_state.is_indenting && vec![' ', '\t'].contains(&letter) { + self.word_push(&mut lex_state, letter); } // If it's the new line - start indent region if letter == '\n' { - is_indenting = true; - word = self.pattern_begin(word, letter); + lex_state.is_indenting = true; + self.pattern_begin(&mut lex_state, letter); } // Check if the current letter // concludes current indent region - if is_indenting { - if let Some(next_char) = self.reader.peek() { + if lex_state.is_indenting { + if let Some(next_char) = lex_state.reader.peek() { if !vec![' ', '\t'].contains(&next_char) { - word = self.add_indent(word); - is_indenting = false; + self.add_indent(&mut lex_state); + lex_state.is_indenting = false; } } continue; @@ -286,7 +295,7 @@ impl<'a> Lexer<'a> { // Skip newline character if we want to manually insert semicolons if let SeparatorMode::Manual = self.separator_mode { if letter == '\n' { - word = self.add_word(word); + self.add_word(&mut lex_state); continue; } } @@ -297,15 +306,15 @@ impl<'a> Lexer<'a> { // Skip whitespace if vec![' ', '\t'].contains(&letter) { - word = self.add_word(word); + self.add_word(&mut lex_state); } // Handle special symbols - else if self.symbols.contains(&letter) || letter == '\n' { - word = self.pattern_add_symbol(word, letter); + else if self.rules.symbols.contains(&letter) || letter == '\n' { + self.pattern_add_symbol(&mut lex_state, letter); } // Handle word else { - self.word_push(&mut word, letter); + self.word_push(&mut lex_state, letter); } } } @@ -313,21 +322,22 @@ impl<'a> Lexer<'a> { } } } - self.add_word(word); + self.add_word(&mut lex_state); // If some region exists that was not closed - if let Err((pos, region)) = self.region.is_region_closed(&self.reader) { + if let Err((pos, region)) = lex_state.region_handler.is_region_closed(&lex_state.reader) { return Err(( LexerErrorType::Unclosed, PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name), )); } - Ok(()) + + Ok(lex_state.lexem) } } #[cfg(test)] mod test { - use crate::compiling::{Compiler, 
ScopingMode}; + use crate::compiling::ScopingMode; use crate::compiling_rules::{Region, Rules}; use crate::reg; @@ -354,14 +364,12 @@ mod test { (")".to_string(), 1, 17), ]; let rules = Rules::new(symbols, vec![], regions); - let mut cc: Compiler = Compiler::new("TestScript", rules); - cc.load("let a = (12 + 32)"); - let mut lexer = super::Lexer::new(&cc); + let lexer = super::Lexer::new(rules); let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer.tokenize("let a = (12 + 32)"); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, lex.pos.0, lex.pos.1)); } assert_eq!(expected, result); @@ -395,14 +403,13 @@ mod test { (" 🎉 text'".to_string(), 1, 38), ]; let rules = Rules::new(symbols, vec![], regions); - let mut cc: Compiler = Compiler::new("TestScript", rules); - cc.load("let a = 'this {'is {adjective} long'} 🎉 text'"); - let mut lexer = super::Lexer::new(&cc); + + let lexer = super::Lexer::new(rules); let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer.tokenize("let a = 'this {'is {adjective} long'} 🎉 text'"); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, lex.pos.0, lex.pos.1)); } assert_eq!(expected, result); @@ -424,15 +431,15 @@ mod test { ("pass".to_string(), (3, 9), 43), ]; let rules = Rules::new(symbols, vec![], regions); - let mut cc: Compiler = Compiler::new("Testhon", rules); - cc.scoping_mode = ScopingMode::Indent; - cc.load(vec!["if condition:", " if subcondition:", " pass"].join("\n")); - let mut lexer = super::Lexer::new(&cc); + + let mut lexer = super::Lexer::new(rules); + lexer.scoping_mode = ScopingMode::Indent; let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer + .tokenize(&vec!["if condition:", " if subcondition:", " pass"].join("\n")); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, (lex.pos.0, lex.pos.1), lex.start)); } assert_eq!(expected, result); @@ -452,14 +459,12 @@ mod test { (";".to_string(), 3, 3), ]; let rules = Rules::new(symbols, vec![], regions); - let mut cc: Compiler = Compiler::new("Testhon", rules); - cc.load(vec!["let age = 12", "+", "12;"].join("\n")); - let mut lexer = super::Lexer::new(&cc); + let lexer = super::Lexer::new(rules); let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer.tokenize(&vec!["let age = 12", "+", "12;"].join("\n")); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, lex.pos.0, lex.pos.1)); } assert_eq!(expected, result); @@ -474,14 +479,12 @@ mod test { })]; let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)]; let rules = Rules::new(symbols, vec![], regions); - let mut cc: Compiler = Compiler::new("Test", rules); - cc.load(vec!["'this", "is", "a", "multiline", "string'"].join("\n")); - let mut lexer = super::Lexer::new(&cc); + let lexer = super::Lexer::new(rules); let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer.tokenize(&vec!["'this", "is", "a", "multiline", "string'"].join("\n")); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, lex.pos.0, lex.pos.1)); } assert_eq!(expected, result); @@ -496,14 +499,12 @@ mod test { })]; let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)]; let rules = Rules::new(symbols, vec![], regions); - let mut cc: Compiler = 
Compiler::new("Test", rules); - cc.load(vec!["\"this is \\\"escaped\\\" string\""].join("\n")); - let mut lexer = super::Lexer::new(&cc); + let lexer = super::Lexer::new(rules); let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer.tokenize(&vec!["\"this is \\\"escaped\\\" string\""].join("\n")); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, lex.pos.0, lex.pos.1)); } assert_eq!(expected, result); diff --git a/src/compiling/lexing/lexer_static.rs b/src/compiling/lexing/lexer_static.rs deleted file mode 100644 index 254e6bb..0000000 --- a/src/compiling/lexing/lexer_static.rs +++ /dev/null @@ -1,511 +0,0 @@ -//! Static Lexer -//! -//! This module contains the static lexer that is used to tokenize the source code - -use crate::{ - compiling_rules::Rules, - prelude::{PositionInfo, ScopingMode, SeparatorMode, Token}, -}; - -use super::{ - compound_handler::{CompoundHandler, CompoundReaction}, - reader::Reader, - region_handler::{RegionHandler, RegionReaction}, - LexerError, LexerErrorType, -}; - -/// Static Lexer -pub struct StaticLexer { - rules: Rules, - /// Path to the lexed file - pub path: Option, - /// Separator mode for this lexer - pub separator_mode: SeparatorMode, - /// Escape symbol for this lexer. Default is '\\' - pub escape_symbol: char, - /// Scoping mode for this lexer - pub scoping_mode: ScopingMode, -} - -struct LexState<'a> { - word: String, - is_indenting: bool, - is_escaped: bool, - token_start_index: usize, - position: (usize, usize), - reader: Reader<'a>, - lexem: Vec, - region_handler: RegionHandler, - compound_handler: CompoundHandler, -} - -impl StaticLexer { - /// Create a new Lexer based on the compiler metadata - pub fn new(rules: Rules) -> Self { - StaticLexer { - rules, - path: None, - separator_mode: SeparatorMode::Manual, - escape_symbol: '\\', - scoping_mode: ScopingMode::Block, - } - } - - /// Add indentation to the lexem - #[inline] - fn add_indent(&self, lex_state: &mut LexState) { - if lex_state.word.is_empty() { - return; - } - - // Getting position by word here would attempt to - // substract with overflow since the new line character - // technically belongs to the previous line - let (row, _col) = lex_state.reader.get_position(); - lex_state.lexem.push(Token { - word: lex_state.word.clone(), - pos: (row, 1), - start: lex_state.token_start_index, - }); - lex_state.position = (0, 0); - lex_state.word = String::new(); - } - - /// Add word that has been completed in previous iteration to the lexem - #[inline] - fn add_word(&self, lex_state: &mut LexState) { - if lex_state.word.is_empty() { - return; - } - - lex_state.lexem.push(Token { - word: lex_state.word.clone(), - pos: lex_state.position, - start: lex_state.token_start_index, - }); - lex_state.position = (0, 0); - lex_state.word = String::new(); - } - - /// Add word that has been completed in current iteration to the lexem - #[inline] - fn add_word_inclusively(&self, lex_state: &mut LexState) { - if lex_state.word.is_empty() { - return; - } - - lex_state.lexem.push(Token { - word: lex_state.word.clone(), - pos: lex_state.position, - start: lex_state.token_start_index, - }); - lex_state.position = (0, 0); - lex_state.word = String::new() - } - - /// Checks whether this is a nontokenizable region - #[inline] - fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool { - if let Some(region) = lex_state.region_handler.get_region() { - region.tokenize && *reaction == RegionReaction::Pass - 
} else { - false - } - } - - /// Pattern code for adding a symbol - /// **[*]** - #[inline] - fn pattern_add_symbol(&self, lex_state: &mut LexState, letter: char) { - self.add_word(lex_state); - - if lex_state.word.is_empty() { - lex_state.token_start_index = lex_state.reader.get_index(); - } - self.word_push(lex_state, letter); - lex_state.position = lex_state.reader.get_position(); - - self.add_word_inclusively(lex_state); - } - - /// Pattern code for beginning a new region - /// **[** - #[inline] - fn pattern_begin(&self, lex_state: &mut LexState, letter: char) { - self.add_word(lex_state); - self.word_push(lex_state, letter); - } - - /// Pattern code for ending current region - /// **]** - #[inline] - fn pattern_end(&self, lex_state: &mut LexState, letter: char) { - self.word_push(lex_state, letter); - self.add_word_inclusively(lex_state); - } - - /// Push letter to the word and set token start index - fn word_push(&self, lex_state: &mut LexState, letter: char) { - if lex_state.word.is_empty() { - lex_state.token_start_index = lex_state.reader.get_index(); - } - lex_state.word.push(letter); - } - - /// Tokenize source code - /// - /// Run lexer and tokenize code. The result is stored in the lexem attribute - pub fn tokenize(&self, input: &str) -> Result, LexerError> { - let code = input.to_string(); - - let mut lex_state = LexState { - word: String::new(), - is_indenting: false, - is_escaped: false, - token_start_index: 0, - position: (0, 0), - lexem: Vec::new(), - reader: Reader::new(&code), - region_handler: RegionHandler::new(&self.rules), - compound_handler: CompoundHandler::new(&self.rules), - }; - - while let Some(letter) = lex_state.reader.next() { - /****************/ - /* Set Position */ - /****************/ - - // If the new position hasn't been set yet, set it - if lex_state.position == (0, 0) { - // If separator mode is set to Manual and the letter is a separator, - // then skip finding a new position - if SeparatorMode::Manual != self.separator_mode || letter != '\n' { - let region = lex_state.region_handler.get_region().unwrap(); - // If the region is tokenized, then check if the letter is a separator - if !region.tokenize || !vec![' ', '\t'].contains(&letter) { - lex_state.position = lex_state.reader.get_position(); - } - } - } - - // Reaction stores the reaction of the region handler - // Have we just opened or closed some region? 
- let reaction = lex_state - .region_handler - .handle_region(&lex_state.reader, lex_state.is_escaped); - match reaction { - // If the region has been opened - // Finish the part that we have been parsing - RegionReaction::Begin(tokenize) => { - // Also if the new region is an interpolation that tokenizes - // the inner content - separate the region from the content - if tokenize { - self.pattern_add_symbol(&mut lex_state, letter); - } - // Regular region case - else { - // This is supposed to prevent overshadowing new line - // character if region rule opens with newline - if letter == '\n' { - // This additionally creates a new token - self.pattern_add_symbol(&mut lex_state, letter); - } - // Normally start a new region - self.pattern_begin(&mut lex_state, letter); - } - } - // If the region has been closed - // Add the closing region and finish the word - RegionReaction::End(tokenize) => { - // Also if the new region is an interpolation that tokenizes - // the inner content - separate the region from the content - if tokenize { - self.pattern_add_symbol(&mut lex_state, letter); - } - // Regular region case - else { - // Normally close the region - self.pattern_end(&mut lex_state, letter); - // This is supposed to prevent overshadowing new line - // character if region rule closes with newline - if letter == '\n' { - // This additionally creates a new token - self.pattern_add_symbol(&mut lex_state, letter); - } - } - } - RegionReaction::Pass => { - let is_tokenized_region = self.is_tokenized_region(&reaction, &mut lex_state); - match lex_state.compound_handler.handle_compound( - letter, - &lex_state.reader, - is_tokenized_region, - ) { - CompoundReaction::Begin => self.pattern_begin(&mut lex_state, letter), - CompoundReaction::Keep => self.word_push(&mut lex_state, letter), - CompoundReaction::End => self.pattern_end(&mut lex_state, letter), - CompoundReaction::Pass => { - // Handle region scope - if !self.is_tokenized_region(&reaction, &mut lex_state) { - let region = lex_state.region_handler.get_region().unwrap(); - // Flip escaped key - lex_state.is_escaped = (!lex_state.is_escaped - && letter == self.escape_symbol) - .then(|| !lex_state.is_escaped) - .unwrap_or(false); - // Handle singleline attribute - if letter == '\n' && region.singleline { - let pos = lex_state.reader.get_position(); - return Err(( - LexerErrorType::Singleline, - PositionInfo::at_pos(self.path.clone(), pos, 0) - .data(region.name.clone()), - )); - } - self.word_push(&mut lex_state, letter); - } else { - /******************/ - /* Mode modifiers */ - /******************/ - - // Create indent regions: '\n ' - if let ScopingMode::Indent = self.scoping_mode { - // If we are still in the indent region - proceed - if lex_state.is_indenting && vec![' ', '\t'].contains(&letter) { - self.word_push(&mut lex_state, letter); - } - // If it's the new line - start indent region - if letter == '\n' { - lex_state.is_indenting = true; - self.pattern_begin(&mut lex_state, letter); - } - // Check if the current letter - // concludes current indent region - if lex_state.is_indenting { - if let Some(next_char) = lex_state.reader.peek() { - if !vec![' ', '\t'].contains(&next_char) { - self.add_indent(&mut lex_state); - lex_state.is_indenting = false; - } - } - continue; - } - } - // Skip newline character if we want to manually insert semicolons - if let SeparatorMode::Manual = self.separator_mode { - if letter == '\n' { - self.add_word(&mut lex_state); - continue; - } - } - - /*****************/ - /* Regular Lexer */ - 
-
-                                // Skip whitespace
-                                if vec![' ', '\t'].contains(&letter) {
-                                    self.add_word(&mut lex_state);
-                                }
-                                // Handle special symbols
-                                else if self.rules.symbols.contains(&letter) || letter == '\n' {
-                                    self.pattern_add_symbol(&mut lex_state, letter);
-                                }
-                                // Handle word
-                                else {
-                                    self.word_push(&mut lex_state, letter);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        self.add_word(&mut lex_state);
-        // If some region exists that was not closed
-        if let Err((pos, region)) = lex_state.region_handler.is_region_closed(&lex_state.reader) {
-            return Err((
-                LexerErrorType::Unclosed,
-                PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name),
-            ));
-        }
-
-        Ok(lex_state.lexem)
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use crate::compiling::ScopingMode;
-    use crate::compiling_rules::{Region, Rules};
-    use crate::reg;
-
-    #[test]
-    fn test_lexer_base() {
-        let symbols = vec!['(', ')'];
-        let regions = reg![reg!(string as "String literal" => {
-            begin: "'",
-            end: "'"
-        } => [
-            reg!(array as "Array Literal" => {
-                begin: "[",
-                end: "]"
-            })
-        ])];
-        let expected = vec![
-            ("let".to_string(), 1, 1),
-            ("a".to_string(), 1, 5),
-            ("=".to_string(), 1, 7),
-            ("(".to_string(), 1, 9),
-            ("12".to_string(), 1, 10),
-            ("+".to_string(), 1, 13),
-            ("32".to_string(), 1, 15),
-            (")".to_string(), 1, 17),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize("let a = (12 + 32)");
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_string_interp() {
-        let symbols = vec!['(', ')'];
-        let regions = reg![reg!(string_literal as "String literal" => {
-            begin: "'",
-            end: "'"
-        } => [
-            reg!(string_interp as "String interpolation" => {
-                begin: "{",
-                end: "}",
-                tokenize: true
-            } ref global)
-        ])];
-        let expected = vec![
-            ("let".to_string(), 1, 1),
-            ("a".to_string(), 1, 5),
-            ("=".to_string(), 1, 7),
-            ("'this ".to_string(), 1, 9),
-            ("{".to_string(), 1, 15),
-            ("'is ".to_string(), 1, 16),
-            ("{".to_string(), 1, 20),
-            ("adjective".to_string(), 1, 21),
-            ("}".to_string(), 1, 30),
-            (" long'".to_string(), 1, 31),
-            ("}".to_string(), 1, 37),
-            (" 🎉 text'".to_string(), 1, 38),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize("let a = 'this {'is {adjective} long'} 🎉 text'");
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_indent_scoping_mode() {
-        let symbols = vec![':'];
-        let regions = reg![];
-        let expected = vec![
-            ("if".to_string(), (1, 1), 0),
-            ("condition".to_string(), (1, 4), 3),
-            (":".to_string(), (1, 13), 12),
-            ("\n    ".to_string(), (2, 1), 13),
-            ("if".to_string(), (2, 5), 18),
-            ("subcondition".to_string(), (2, 8), 21),
-            (":".to_string(), (2, 20), 33),
-            ("\n        ".to_string(), (3, 1), 34),
-            ("pass".to_string(), (3, 9), 43),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-
-        let mut lexer = super::StaticLexer::new(rules);
-        lexer.scoping_mode = ScopingMode::Indent;
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer
-            .tokenize(&vec!["if condition:", "    if subcondition:", "        pass"].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, (lex.pos.0, lex.pos.1), lex.start));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_manual_separator_mode() {
-        let symbols = vec![';', '+', '='];
-        let regions = reg![];
-        let expected = vec![
-            ("let".to_string(), 1, 1),
-            ("age".to_string(), 1, 5),
-            ("=".to_string(), 1, 9),
-            ("12".to_string(), 1, 11),
-            ("+".to_string(), 2, 1),
-            ("12".to_string(), 3, 1),
-            (";".to_string(), 3, 3),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize(&vec!["let age = 12", "+", "12;"].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_multiline_regions() {
-        let symbols = vec![';', '+', '='];
-        let regions = reg![reg!(string as "String" => {
-            begin: "'",
-            end: "'"
-        })];
-        let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize(&vec!["'this", "is", "a", "multiline", "string'"].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_escaped_regions() {
-        let symbols = vec![';', '+', '='];
-        let regions = reg![reg!(string as "String" => {
-            begin: "\"",
-            end: "\""
-        })];
-        let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize(&vec!["\"this is \\\"escaped\\\" string\""].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-}
diff --git a/src/compiling/lexing/mod.rs b/src/compiling/lexing/mod.rs
index b53f82e..704573b 100644
--- a/src/compiling/lexing/mod.rs
+++ b/src/compiling/lexing/mod.rs
@@ -5,10 +5,7 @@
 
 use crate::prelude::PositionInfo;
 
 mod compound_handler;
-#[cfg(feature = "lexer_dynamic")]
 pub mod lexer;
-#[cfg(feature = "lexer_static")]
-pub mod lexer_static;
 mod reader;
 mod region_handler;
diff --git a/src/compiling/mod.rs b/src/compiling/mod.rs
index 534a476..52624ab 100644
--- a/src/compiling/mod.rs
+++ b/src/compiling/mod.rs
@@ -5,14 +5,12 @@
 
 mod lexing;
 
-#[cfg(feature = "compiler")]
 mod compiler;
 mod token;
 mod parser;
 pub mod failing;
 
 pub use lexing::*;
-#[cfg(feature = "compiler")]
 pub use compiler::*;
 pub use token::*;
 pub use parser::*;

From bf5c5cdf2d3da969078ba91b4c54e6009e9ef788 Mon Sep 17 00:00:00 2001
From: KrosFire
Date: Fri, 6 Sep 2024 20:16:45 +0200
Subject: [PATCH 4/4] Removed old readme

---
 README.md | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/README.md b/README.md
index 2a411b1..f97b8e8 100644
--- a/README.md
+++ b/README.md
@@ -43,16 +43,6 @@ let cc = Compiler::new("HerbScript", rules);
 let tokens = cc.tokenize()?;
 ```
 
-## Features
-
-You can import specific features from Heraclitus. Available options are:
-
-- `compiler` - Includes Compiler and dynamic Lexer
-- `lexer_dynamic` - Includes just the dynamic Lexer
-- `lexer_static` - Includes just the static Lexer
-
-The difference between `static` and `dynamic` Lexer is that the `static` Lexer doesn't mutate it's own state while tokenizing the input.
- # Change log 🚀 ## Version 1.7.8
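
While the `compiler`, `lexer_dynamic`, and `lexer_static` feature flags described above existed, a downstream crate selected a lexer through Cargo features. Below is a minimal consumer-side sketch of that configuration; the dependency name and version are assumptions made for illustration, not values stated in these patches.

```toml
# Hypothetical consumer Cargo.toml opting into only the static lexer.
# Assumed for illustration: the crate is published as "heraclitus-compiler"
# and a 1.x version is available; neither is confirmed by the patches above.
[dependencies]
heraclitus-compiler = { version = "1.7", default-features = false, features = ["lexer_static"] }
```

Disabling default features switches off the `compiler` feature (and, through it, `lexer_dynamic`), so only the code behind the `lexer_static` gates removed above would have been compiled in.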