diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 0015078..70a416e 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -15,6 +15,6 @@ jobs:
     steps:
     - uses: actions/checkout@v3
     - name: Build
-      run: cargo build --verbose --all-features
+      run: cargo build --verbose
     - name: Run tests
-      run: cargo test --verbose --all-features
+      run: cargo test --verbose
diff --git a/Cargo.toml b/Cargo.toml
index 84f604f..3fa20a4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,17 +9,6 @@ authors = ["pawel.karas@icloud.com"]
 keywords = ["heraclitus", "compiler", "parser"]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
-[features]
-default = ["compiler"]
-
-# Include the compiler and lexer
-compiler = ["lexer_dynamic"]
-
-# Include the lexer
-lexer_dynamic = []
-
-# Include the static lexer
-lexer_static = []
 
 [dependencies]
 colored = "2.0.0"
diff --git a/src/compiling/compiler.rs b/src/compiling/compiler.rs
index a690e70..f531156 100644
--- a/src/compiling/compiler.rs
+++ b/src/compiling/compiler.rs
@@ -65,18 +65,14 @@ pub enum ScopingMode {
 pub struct Compiler {
     /// Name of your language
     pub name: String,
-    /// Rules that describe your language
-    pub rules: Rules,
     /// Source code in a form of string
     pub code: Option<String>,
     /// Path to the compiled file if exists
     pub path: Option<String>,
-    /// Separator mode for this compiler
-    pub separator_mode: SeparatorMode,
-    /// Scoping mode for this compiler
-    pub scoping_mode: ScopingMode,
     // Check if user wants to debug parser
-    debug: bool
+    debug: bool,
+    /// Lexer to tokenize the code
+    lexer: Lexer
 }
 
 impl Compiler {
@@ -84,18 +80,21 @@ impl Compiler {
     pub fn new<T: AsRef<str>>(name: T, rules: Rules) -> Self {
         Compiler {
             name: String::from(name.as_ref()),
-            rules,
             code: None,
             path: None,
-            separator_mode: SeparatorMode::Manual,
-            scoping_mode: ScopingMode::Block,
-            debug: false
+            debug: false,
+            lexer: Lexer::new(rules)
         }
     }
 
     /// Set the language to use indentations
     pub fn use_indents(&mut self) {
-        self.scoping_mode = ScopingMode::Indent
+        self.lexer.scoping_mode = ScopingMode::Indent
+    }
+
+    /// Set the language separator mode
+    pub fn set_separator(&mut self, mode: SeparatorMode) {
+        self.lexer.separator_mode = mode
     }
 
     /// Load file from path
@@ -120,9 +119,7 @@ impl Compiler {
 
     /// Run just lexer
     pub fn tokenize(&self) -> Result<Vec<Token>, LexerError> {
-        let mut lexer = Lexer::new(self);
-        lexer.run()?;
-        Ok(lexer.lexem)
+        self.lexer.tokenize(&self.code.clone().unwrap())
     }
 
     /// Parser will display information about the call stack
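With the feature flags gone and `rules`, `separator_mode`, and `scoping_mode` no longer public fields, callers now configure the `Compiler` through methods that forward to its owned `Lexer`. A minimal sketch of the resulting call pattern; the crate/prelude import path, and the assumption that `Rules`, `reg!`, `Token`, and `LexerError` are reachable through it, are taken from the tests further below rather than confirmed by this diff:

```rust
// Hypothetical caller of the reworked API (import paths are assumed;
// adjust them to the actual crate layout).
use heraclitus_compiler::prelude::*;

fn lex_source() -> Result<Vec<Token>, LexerError> {
    let rules = Rules::new(vec![';', '+', '='], vec![], reg![]);
    let mut cc = Compiler::new("MyLang", rules);
    // Field writes such as `cc.separator_mode = ...` no longer compile;
    // the modes now live on the Compiler's private Lexer.
    cc.set_separator(SeparatorMode::Manual);
    cc.load("let age = 12 + 12;");
    // Delegates to Lexer::tokenize; note it unwraps self.code, so load first.
    cc.tokenize()
}
```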
diff --git a/src/compiling/lexing/lexer.rs b/src/compiling/lexing/lexer.rs
index 2fe4a53..14ffea0 100644
--- a/src/compiling/lexing/lexer.rs
+++ b/src/compiling/lexing/lexer.rs
@@ -1,118 +1,113 @@
-//! Dynamic Lexer
-//!
-//! This module contains the dynamic lexer that is used to tokenize the source code.
-
-use super::compound_handler::{CompoundHandler, CompoundReaction};
-use super::reader::Reader;
-use super::region_handler::{RegionHandler, RegionReaction};
-use super::{LexerError, LexerErrorType};
-use crate::compiling::failing::position_info::PositionInfo;
-use crate::compiling::{Compiler, ScopingMode, SeparatorMode, Token};
-
-// This is just an estimation of token amount
-// inside of a typical 200-lined file.
-const AVG_TOKEN_AMOUNT: usize = 1024;
-
-/// The Lexer
-///
-/// Lexer takes source code in a form of a string and translates it to a list of tokens.
-/// This particular implementation requires additional metadata such as like regions or symbols.
-/// These can be supplied by the `Compiler` in a one cohesive package. Hence the API requires to
-/// pass a reference to the `Compiler`.
-pub struct Lexer<'a> {
-    symbols: Vec<char>,
-    escape_symbol: char,
-    compound: CompoundHandler,
-    region: RegionHandler,
-    reader: Reader<'a>,
-    path: Option<String>,
-    /// This attribute stores parsed tokens by the lexer
-    pub lexem: Vec<Token>,
-    separator_mode: SeparatorMode,
-    scoping_mode: ScopingMode,
+//! Lexer
+//!
+//! This module contains the lexer that is used to tokenize the source code
+
+use crate::{
+    compiling_rules::Rules,
+    prelude::{PositionInfo, ScopingMode, SeparatorMode, Token},
+};
+
+use super::{
+    compound_handler::{CompoundHandler, CompoundReaction},
+    reader::Reader,
+    region_handler::{RegionHandler, RegionReaction},
+    LexerError, LexerErrorType,
+};
+
+/// Lexer
+#[derive(Debug, Clone, PartialEq)]
+pub struct Lexer {
+    rules: Rules,
+    /// Path to the lexed file
+    pub path: Option<String>,
+    /// Separator mode for this lexer
+    pub separator_mode: SeparatorMode,
+    /// Escape symbol for this lexer. Default is '\\'
+    pub escape_symbol: char,
+    /// Scoping mode for this lexer
+    pub scoping_mode: ScopingMode,
+}
+
+struct LexState<'a> {
+    word: String,
+    is_indenting: bool,
     is_escaped: bool,
-    position: (usize, usize),
-    index: usize,
     token_start_index: usize,
+    position: (usize, usize),
+    reader: Reader<'a>,
+    lexem: Vec<Token>,
+    region_handler: RegionHandler,
+    compound_handler: CompoundHandler,
 }
 
-impl<'a> Lexer<'a> {
+impl Lexer {
     /// Create a new Lexer based on the compiler metadata
-    pub fn new(cc: &'a Compiler) -> Self {
-        let code: &'a String = cc.code.as_ref().unwrap();
+    pub fn new(rules: Rules) -> Self {
         Lexer {
-            symbols: cc.rules.symbols.clone(),
-            escape_symbol: cc.rules.escape_symbol,
-            compound: CompoundHandler::new(&cc.rules),
-            region: RegionHandler::new(&cc.rules),
-            reader: Reader::new(code),
-            path: cc.path.clone(),
-            lexem: Vec::with_capacity(AVG_TOKEN_AMOUNT),
-            separator_mode: cc.separator_mode.clone(),
-            scoping_mode: cc.scoping_mode.clone(),
-            is_escaped: false,
-            position: (0, 0),
-            index: 0,
-            token_start_index: 0,
+            rules,
+            path: None,
+            separator_mode: SeparatorMode::Manual,
+            escape_symbol: '\\',
+            scoping_mode: ScopingMode::Block,
         }
     }
 
     /// Add indentation to the lexem
     #[inline]
-    fn add_indent(&mut self, word: String) -> String {
-        if !word.is_empty() {
-            // Getting position by word here would attempt to
-            // substract with overflow since the new line character
-            // technically belongs to the previous line
-            let (row, _col) = self.reader.get_position();
-            self.lexem.push(Token {
-                word,
-                pos: (row, 1),
-                start: self.token_start_index,
-            });
-            self.position = (0, 0);
-            String::new()
-        } else {
-            word
+    fn add_indent(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
         }
+
+        // Getting position by word here would attempt to
+        // subtract with overflow since the new line character
+        // technically belongs to the previous line
+        let (row, _col) = lex_state.reader.get_position();
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: (row, 1),
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
     }
 
     /// Add word that has been completed in previous iteration to the lexem
     #[inline]
-    fn add_word(&mut self, word: String) -> String {
-        if !word.is_empty() {
-            self.lexem.push(Token {
-                word,
-                pos: self.position,
-                start: self.token_start_index,
-            });
-            self.position = (0, 0);
-            String::new()
-        } else {
-            word
+    fn add_word(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
         }
+
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: lex_state.position,
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new();
     }
 
     /// Add word that has been completed in current iteration to the lexem
     #[inline]
-    fn add_word_inclusively(&mut self, word: String) -> String {
-        if !word.is_empty() {
-            self.lexem.push(Token {
-                word,
-                pos: self.position,
-                start: self.token_start_index,
-            });
-            self.position = (0, 0);
-            String::new()
-        } else {
-            word
+    fn add_word_inclusively(&self, lex_state: &mut LexState) {
+        if lex_state.word.is_empty() {
+            return;
         }
+
+        lex_state.lexem.push(Token {
+            word: lex_state.word.clone(),
+            pos: lex_state.position,
+            start: lex_state.token_start_index,
+        });
+        lex_state.position = (0, 0);
+        lex_state.word = String::new()
     }
 
     /// Checks whether this is a nontokenizable region
     #[inline]
-    pub fn is_tokenized_region(&self, reaction: &RegionReaction) -> bool {
-        if let Some(region) = self.region.get_region().as_ref() {
+    fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool {
+        if let Some(region) = lex_state.region_handler.get_region() {
             region.tokenize && *reaction == RegionReaction::Pass
         } else {
             false
@@ -122,70 +117,83 @@ impl<'a> Lexer<'a> {
     /// Pattern code for adding a symbol
     /// **[*]**
     #[inline]
-    fn pattern_add_symbol(&mut self, mut word: String, letter: char) -> String {
-        word = self.add_word(word);
-        if word.is_empty() {
-            self.token_start_index = self.index;
+    fn pattern_add_symbol(&self, lex_state: &mut LexState, letter: char) {
+        self.add_word(lex_state);
+
+        if lex_state.word.is_empty() {
+            lex_state.token_start_index = lex_state.reader.get_index();
         }
-        self.word_push(&mut word, letter);
-        self.position = self.reader.get_position();
-        self.add_word_inclusively(word)
+        self.word_push(lex_state, letter);
+        lex_state.position = lex_state.reader.get_position();
+
+        self.add_word_inclusively(lex_state);
     }
 
     /// Pattern code for beginning a new region
     /// **[**
     #[inline]
-    fn pattern_begin(&mut self, mut word: String, letter: char) -> String {
-        word = self.add_word(word);
-        self.word_push(&mut word, letter);
-        word
+    fn pattern_begin(&self, lex_state: &mut LexState, letter: char) {
+        self.add_word(lex_state);
+        self.word_push(lex_state, letter);
    }
 
     /// Pattern code for ending current region
     /// **]**
     #[inline]
-    fn pattern_end(&mut self, mut word: String, letter: char) -> String {
-        self.word_push(&mut word, letter);
-        self.add_word_inclusively(word)
+    fn pattern_end(&self, lex_state: &mut LexState, letter: char) {
+        self.word_push(lex_state, letter);
+        self.add_word_inclusively(lex_state);
     }
 
     /// Push letter to the word and set token start index
-    fn word_push(&mut self, word: &mut String, letter: char) {
-        if word.is_empty() {
-            self.token_start_index = self.index;
+    fn word_push(&self, lex_state: &mut LexState, letter: char) {
+        if lex_state.word.is_empty() {
+            lex_state.token_start_index = lex_state.reader.get_index();
         }
-        word.push(letter);
+        lex_state.word.push(letter);
     }
 
     /// Tokenize source code
     ///
     /// Run lexer and tokenize code. The result is stored in the lexem attribute
-    pub fn run(&mut self) -> Result<(), LexerError> {
-        let mut word = String::new();
-        let mut is_indenting = false;
-        while let Some(letter) = self.reader.next() {
-            self.index = self.reader.get_index();
+    pub fn tokenize(&self, input: &str) -> Result<Vec<Token>, LexerError> {
+        let code = input.to_string();
+        let mut lex_state = LexState {
+            word: String::new(),
+            is_indenting: false,
+            is_escaped: false,
+            token_start_index: 0,
+            position: (0, 0),
+            lexem: Vec::new(),
+            reader: Reader::new(&code),
+            region_handler: RegionHandler::new(&self.rules),
+            compound_handler: CompoundHandler::new(&self.rules),
+        };
+
+        while let Some(letter) = lex_state.reader.next() {
             /****************/
             /* Set Position */
             /****************/
 
             // If the new position hasn't been set yet, set it
-            if self.position == (0, 0) {
+            if lex_state.position == (0, 0) {
                 // If separator mode is set to Manual and the letter is a separator,
                 // then skip finding a new position
                 if SeparatorMode::Manual != self.separator_mode || letter != '\n' {
-                    let region = self.region.get_region().unwrap();
+                    let region = lex_state.region_handler.get_region().unwrap();
                     // If the region is tokenized, then check if the letter is a separator
                     if !region.tokenize || !vec![' ', '\t'].contains(&letter) {
-                        self.position = self.reader.get_position();
+                        lex_state.position = lex_state.reader.get_position();
                     }
                 }
             }
 
             // Reaction stores the reaction of the region handler
             // Have we just opened or closed some region?
-            let reaction = self.region.handle_region(&self.reader, self.is_escaped);
+            let reaction = lex_state
+                .region_handler
+                .handle_region(&lex_state.reader, lex_state.is_escaped);
             match reaction {
                 // If the region has been opened
                 // Finish the part that we have been parsing
@@ -193,7 +201,7 @@
                     // Also if the new region is an interpolation that tokenizes
                     // the inner content - separate the region from the content
                     if tokenize {
-                        word = self.pattern_add_symbol(word, letter);
+                        self.pattern_add_symbol(&mut lex_state, letter);
                     }
                     // Regular region case
                     else {
@@ -201,10 +209,10 @@
                         // character if region rule opens with newline
                         if letter == '\n' {
                             // This additionally creates a new token
-                            word = self.pattern_add_symbol(word, letter);
+                            self.pattern_add_symbol(&mut lex_state, letter);
                         }
                         // Normally start a new region
-                        word = self.pattern_begin(word, letter);
+                        self.pattern_begin(&mut lex_state, letter);
                     }
                 }
                 // If the region has been closed
@@ -213,48 +221,49 @@
                     // Also if the new region is an interpolation that tokenizes
                     // the inner content - separate the region from the content
                     if tokenize {
-                        word = self.pattern_add_symbol(word, letter);
+                        self.pattern_add_symbol(&mut lex_state, letter);
                     }
                     // Regular region case
                     else {
                         // Normally close the region
-                        word = self.pattern_end(word, letter);
+                        self.pattern_end(&mut lex_state, letter);
                         // This is supposed to prevent overshadowing new line
                         // character if region rule closes with newline
                         if letter == '\n' {
                             // This additionally creates a new token
-                            word = self.pattern_add_symbol(word, letter);
+                            self.pattern_add_symbol(&mut lex_state, letter);
                         }
                     }
                 }
                 RegionReaction::Pass => {
-                    match self.compound.handle_compound(
+                    let is_tokenized_region = self.is_tokenized_region(&reaction, &mut lex_state);
+                    match lex_state.compound_handler.handle_compound(
                         letter,
-                        &self.reader,
-                        self.is_tokenized_region(&reaction),
+                        &lex_state.reader,
+                        is_tokenized_region,
                     ) {
-                        CompoundReaction::Begin => word = self.pattern_begin(word, letter),
-                        CompoundReaction::Keep => self.word_push(&mut word, letter),
-                        CompoundReaction::End => word = self.pattern_end(word, letter),
+                        CompoundReaction::Begin => self.pattern_begin(&mut lex_state, letter),
+                        CompoundReaction::Keep => self.word_push(&mut lex_state, letter),
+                        CompoundReaction::End => self.pattern_end(&mut lex_state, letter),
                         CompoundReaction::Pass => {
                             // Handle region scope
-                            if !self.is_tokenized_region(&reaction) {
-                                let region = self.region.get_region().unwrap();
+                            if !self.is_tokenized_region(&reaction, &mut lex_state) {
+                                let region = lex_state.region_handler.get_region().unwrap();
                                 // Flip escaped key
-                                self.is_escaped = (!self.is_escaped
+                                lex_state.is_escaped = (!lex_state.is_escaped
                                     && letter == self.escape_symbol)
-                                    .then(|| !self.is_escaped)
+                                    .then(|| !lex_state.is_escaped)
                                     .unwrap_or(false);
                                 // Handle singleline attribute
                                 if letter == '\n' && region.singleline {
-                                    let pos = self.reader.get_position();
+                                    let pos = lex_state.reader.get_position();
                                     return Err((
                                         LexerErrorType::Singleline,
                                         PositionInfo::at_pos(self.path.clone(), pos, 0)
                                             .data(region.name.clone()),
                                     ));
                                 }
-                                self.word_push(&mut word, letter);
+                                self.word_push(&mut lex_state, letter);
                             } else {
                                 /******************/
                                 /* Mode modifiers */
@@ -263,21 +272,21 @@
                                 // Create indent regions: '\n   '
                                 if let ScopingMode::Indent = self.scoping_mode {
                                     // If we are still in the indent region - proceed
-                                    if is_indenting && vec![' ', '\t'].contains(&letter) {
-                                        self.word_push(&mut word, letter);
+                                    if lex_state.is_indenting && vec![' ', '\t'].contains(&letter) {
+                                        self.word_push(&mut lex_state, letter);
                                     }
                                     // If it's the new line - start indent region
                                     if letter == '\n' {
-                                        is_indenting = true;
-                                        word = self.pattern_begin(word, letter);
+                                        lex_state.is_indenting = true;
+                                        self.pattern_begin(&mut lex_state, letter);
                                     }
                                     // Check if the current letter
                                     // concludes current indent region
-                                    if is_indenting {
-                                        if let Some(next_char) = self.reader.peek() {
+                                    if lex_state.is_indenting {
+                                        if let Some(next_char) = lex_state.reader.peek() {
                                             if !vec![' ', '\t'].contains(&next_char) {
-                                                word = self.add_indent(word);
-                                                is_indenting = false;
+                                                self.add_indent(&mut lex_state);
+                                                lex_state.is_indenting = false;
                                             }
                                         }
                                         continue;
@@ -286,7 +295,7 @@
                                 // Skip newline character if we want to manually insert semicolons
                                 if let SeparatorMode::Manual = self.separator_mode {
                                     if letter == '\n' {
-                                        word = self.add_word(word);
+                                        self.add_word(&mut lex_state);
                                         continue;
                                     }
                                 }
@@ -297,15 +306,15 @@
 
                                 // Skip whitespace
                                 if vec![' ', '\t'].contains(&letter) {
-                                    word = self.add_word(word);
+                                    self.add_word(&mut lex_state);
                                 }
                                 // Handle special symbols
-                                else if self.symbols.contains(&letter) || letter == '\n' {
-                                    word = self.pattern_add_symbol(word, letter);
+                                else if self.rules.symbols.contains(&letter) || letter == '\n' {
+                                    self.pattern_add_symbol(&mut lex_state, letter);
                                 }
                                 // Handle word
                                 else {
-                                    self.word_push(&mut word, letter);
+                                    self.word_push(&mut lex_state, letter);
                                 }
                             }
                         }
@@ -313,21 +322,22 @@
                 }
             }
         }
-        self.add_word(word);
+        self.add_word(&mut lex_state);
         // If some region exists that was not closed
-        if let Err((pos, region)) = self.region.is_region_closed(&self.reader) {
+        if let Err((pos, region)) = lex_state.region_handler.is_region_closed(&lex_state.reader) {
             return Err((
                 LexerErrorType::Unclosed,
                 PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name),
             ));
         }
-        Ok(())
+
+        Ok(lex_state.lexem)
     }
 }
 
 #[cfg(test)]
 mod test {
-    use crate::compiling::{Compiler, ScopingMode};
+    use crate::compiling::ScopingMode;
     use crate::compiling_rules::{Region, Rules};
     use crate::reg;
 
@@ -354,14 +364,12 @@
             (")".to_string(), 1, 17),
         ];
         let rules = Rules::new(symbols, vec![], regions);
-        let mut cc: Compiler = Compiler::new("TestScript", rules);
-        cc.load("let a = (12 + 32)");
-        let mut lexer = super::Lexer::new(&cc);
+        let lexer = super::Lexer::new(rules);
         let mut result = vec![];
         // Simulate lexing
-        let res = lexer.run();
+        let res = lexer.tokenize("let a = (12 + 32)");
         assert!(res.is_ok());
-        for lex in lexer.lexem {
+        for lex in res.unwrap() {
             result.push((lex.word, lex.pos.0, lex.pos.1));
         }
         assert_eq!(expected, result);
@@ -395,14 +403,13 @@
             (" 🎉 text'".to_string(), 1, 38),
         ];
         let rules = Rules::new(symbols, vec![], regions);
-        let mut cc: Compiler = Compiler::new("TestScript", rules);
-        cc.load("let a = 'this {'is {adjective} long'} 🎉 text'");
-        let mut lexer = super::Lexer::new(&cc);
+
+        let lexer = super::Lexer::new(rules);
         let mut result = vec![];
         // Simulate lexing
-        let res = lexer.run();
+        let res = lexer.tokenize("let a = 'this {'is {adjective} long'} 🎉 text'");
         assert!(res.is_ok());
-        for lex in lexer.lexem {
+        for lex in res.unwrap() {
             result.push((lex.word, lex.pos.0, lex.pos.1));
         }
         assert_eq!(expected, result);
@@ -424,15 +431,15 @@
             ("pass".to_string(), (3, 9), 43),
         ];
         let rules = Rules::new(symbols, vec![], regions);
-        let mut cc: Compiler = Compiler::new("Testhon", rules);
-        cc.scoping_mode = ScopingMode::Indent;
-        cc.load(vec!["if condition:", "    if subcondition:", "        pass"].join("\n"));
-        let mut lexer = super::Lexer::new(&cc);
+
+        let mut lexer = super::Lexer::new(rules);
+        lexer.scoping_mode = ScopingMode::Indent;
         let mut result = vec![];
         // Simulate lexing
-        let res = lexer.run();
+        let res = lexer
+            .tokenize(&vec!["if condition:", "    if subcondition:", "        pass"].join("\n"));
         assert!(res.is_ok());
-        for lex in lexer.lexem {
+        for lex in res.unwrap() {
             result.push((lex.word, (lex.pos.0, lex.pos.1), lex.start));
         }
         assert_eq!(expected, result);
@@ -452,14 +459,12 @@
             (";".to_string(), 3, 3),
         ];
         let rules = Rules::new(symbols, vec![], regions);
-        let mut cc: Compiler = Compiler::new("Testhon", rules);
-        cc.load(vec!["let age = 12", "+", "12;"].join("\n"));
-        let mut lexer = super::Lexer::new(&cc);
+        let lexer = super::Lexer::new(rules);
         let mut result = vec![];
         // Simulate lexing
-        let res = lexer.run();
+        let res = lexer.tokenize(&vec!["let age = 12", "+", "12;"].join("\n"));
         assert!(res.is_ok());
-        for lex in lexer.lexem {
+        for lex in res.unwrap() {
             result.push((lex.word, lex.pos.0, lex.pos.1));
         }
         assert_eq!(expected, result);
@@ -474,14 +479,12 @@
         })];
         let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)];
         let rules = Rules::new(symbols, vec![], regions);
-        let mut cc: Compiler = Compiler::new("Test", rules);
-        cc.load(vec!["'this", "is", "a", "multiline", "string'"].join("\n"));
-        let mut lexer = super::Lexer::new(&cc);
+        let lexer = super::Lexer::new(rules);
        let mut result = vec![];
         // Simulate lexing
-        let res = lexer.run();
+        let res = lexer.tokenize(&vec!["'this", "is", "a", "multiline", "string'"].join("\n"));
         assert!(res.is_ok());
-        for lex in lexer.lexem {
+        for lex in res.unwrap() {
             result.push((lex.word, lex.pos.0, lex.pos.1));
         }
         assert_eq!(expected, result);
@@ -496,14 +499,12 @@
         })];
         let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)];
         let rules = Rules::new(symbols, vec![], regions);
-        let mut cc: Compiler = Compiler::new("Test", rules);
Compiler::new("Test", rules); - cc.load(vec!["\"this is \\\"escaped\\\" string\""].join("\n")); - let mut lexer = super::Lexer::new(&cc); + let lexer = super::Lexer::new(rules); let mut result = vec![]; // Simulate lexing - let res = lexer.run(); + let res = lexer.tokenize(&vec!["\"this is \\\"escaped\\\" string\""].join("\n")); assert!(res.is_ok()); - for lex in lexer.lexem { + for lex in res.unwrap() { result.push((lex.word, lex.pos.0, lex.pos.1)); } assert_eq!(expected, result); diff --git a/src/compiling/lexing/lexer_static.rs b/src/compiling/lexing/lexer_static.rs deleted file mode 100644 index 254e6bb..0000000 --- a/src/compiling/lexing/lexer_static.rs +++ /dev/null @@ -1,511 +0,0 @@ -//! Static Lexer -//! -//! This module contains the static lexer that is used to tokenize the source code - -use crate::{ - compiling_rules::Rules, - prelude::{PositionInfo, ScopingMode, SeparatorMode, Token}, -}; - -use super::{ - compound_handler::{CompoundHandler, CompoundReaction}, - reader::Reader, - region_handler::{RegionHandler, RegionReaction}, - LexerError, LexerErrorType, -}; - -/// Static Lexer -pub struct StaticLexer { - rules: Rules, - /// Path to the lexed file - pub path: Option, - /// Separator mode for this lexer - pub separator_mode: SeparatorMode, - /// Escape symbol for this lexer. Default is '\\' - pub escape_symbol: char, - /// Scoping mode for this lexer - pub scoping_mode: ScopingMode, -} - -struct LexState<'a> { - word: String, - is_indenting: bool, - is_escaped: bool, - token_start_index: usize, - position: (usize, usize), - reader: Reader<'a>, - lexem: Vec, - region_handler: RegionHandler, - compound_handler: CompoundHandler, -} - -impl StaticLexer { - /// Create a new Lexer based on the compiler metadata - pub fn new(rules: Rules) -> Self { - StaticLexer { - rules, - path: None, - separator_mode: SeparatorMode::Manual, - escape_symbol: '\\', - scoping_mode: ScopingMode::Block, - } - } - - /// Add indentation to the lexem - #[inline] - fn add_indent(&self, lex_state: &mut LexState) { - if lex_state.word.is_empty() { - return; - } - - // Getting position by word here would attempt to - // substract with overflow since the new line character - // technically belongs to the previous line - let (row, _col) = lex_state.reader.get_position(); - lex_state.lexem.push(Token { - word: lex_state.word.clone(), - pos: (row, 1), - start: lex_state.token_start_index, - }); - lex_state.position = (0, 0); - lex_state.word = String::new(); - } - - /// Add word that has been completed in previous iteration to the lexem - #[inline] - fn add_word(&self, lex_state: &mut LexState) { - if lex_state.word.is_empty() { - return; - } - - lex_state.lexem.push(Token { - word: lex_state.word.clone(), - pos: lex_state.position, - start: lex_state.token_start_index, - }); - lex_state.position = (0, 0); - lex_state.word = String::new(); - } - - /// Add word that has been completed in current iteration to the lexem - #[inline] - fn add_word_inclusively(&self, lex_state: &mut LexState) { - if lex_state.word.is_empty() { - return; - } - - lex_state.lexem.push(Token { - word: lex_state.word.clone(), - pos: lex_state.position, - start: lex_state.token_start_index, - }); - lex_state.position = (0, 0); - lex_state.word = String::new() - } - - /// Checks whether this is a nontokenizable region - #[inline] - fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool { - if let Some(region) = lex_state.region_handler.get_region() { - region.tokenize && *reaction == RegionReaction::Pass - 
diff --git a/src/compiling/lexing/lexer_static.rs b/src/compiling/lexing/lexer_static.rs
deleted file mode 100644
index 254e6bb..0000000
--- a/src/compiling/lexing/lexer_static.rs
+++ /dev/null
@@ -1,511 +0,0 @@
-//! Static Lexer
-//!
-//! This module contains the static lexer that is used to tokenize the source code
-
-use crate::{
-    compiling_rules::Rules,
-    prelude::{PositionInfo, ScopingMode, SeparatorMode, Token},
-};
-
-use super::{
-    compound_handler::{CompoundHandler, CompoundReaction},
-    reader::Reader,
-    region_handler::{RegionHandler, RegionReaction},
-    LexerError, LexerErrorType,
-};
-
-/// Static Lexer
-pub struct StaticLexer {
-    rules: Rules,
-    /// Path to the lexed file
-    pub path: Option<String>,
-    /// Separator mode for this lexer
-    pub separator_mode: SeparatorMode,
-    /// Escape symbol for this lexer. Default is '\\'
-    pub escape_symbol: char,
-    /// Scoping mode for this lexer
-    pub scoping_mode: ScopingMode,
-}
-
-struct LexState<'a> {
-    word: String,
-    is_indenting: bool,
-    is_escaped: bool,
-    token_start_index: usize,
-    position: (usize, usize),
-    reader: Reader<'a>,
-    lexem: Vec<Token>,
-    region_handler: RegionHandler,
-    compound_handler: CompoundHandler,
-}
-
-impl StaticLexer {
-    /// Create a new Lexer based on the compiler metadata
-    pub fn new(rules: Rules) -> Self {
-        StaticLexer {
-            rules,
-            path: None,
-            separator_mode: SeparatorMode::Manual,
-            escape_symbol: '\\',
-            scoping_mode: ScopingMode::Block,
-        }
-    }
-
-    /// Add indentation to the lexem
-    #[inline]
-    fn add_indent(&self, lex_state: &mut LexState) {
-        if lex_state.word.is_empty() {
-            return;
-        }
-
-        // Getting position by word here would attempt to
-        // substract with overflow since the new line character
-        // technically belongs to the previous line
-        let (row, _col) = lex_state.reader.get_position();
-        lex_state.lexem.push(Token {
-            word: lex_state.word.clone(),
-            pos: (row, 1),
-            start: lex_state.token_start_index,
-        });
-        lex_state.position = (0, 0);
-        lex_state.word = String::new();
-    }
-
-    /// Add word that has been completed in previous iteration to the lexem
-    #[inline]
-    fn add_word(&self, lex_state: &mut LexState) {
-        if lex_state.word.is_empty() {
-            return;
-        }
-
-        lex_state.lexem.push(Token {
-            word: lex_state.word.clone(),
-            pos: lex_state.position,
-            start: lex_state.token_start_index,
-        });
-        lex_state.position = (0, 0);
-        lex_state.word = String::new();
-    }
-
-    /// Add word that has been completed in current iteration to the lexem
-    #[inline]
-    fn add_word_inclusively(&self, lex_state: &mut LexState) {
-        if lex_state.word.is_empty() {
-            return;
-        }
-
-        lex_state.lexem.push(Token {
-            word: lex_state.word.clone(),
-            pos: lex_state.position,
-            start: lex_state.token_start_index,
-        });
-        lex_state.position = (0, 0);
-        lex_state.word = String::new()
-    }
-
-    /// Checks whether this is a nontokenizable region
-    #[inline]
-    fn is_tokenized_region(&self, reaction: &RegionReaction, lex_state: &mut LexState) -> bool {
-        if let Some(region) = lex_state.region_handler.get_region() {
-            region.tokenize && *reaction == RegionReaction::Pass
-        } else {
-            false
-        }
-    }
-
-    /// Pattern code for adding a symbol
-    /// **[*]**
-    #[inline]
-    fn pattern_add_symbol(&self, lex_state: &mut LexState, letter: char) {
-        self.add_word(lex_state);
-
-        if lex_state.word.is_empty() {
-            lex_state.token_start_index = lex_state.reader.get_index();
-        }
-        self.word_push(lex_state, letter);
-        lex_state.position = lex_state.reader.get_position();
-
-        self.add_word_inclusively(lex_state);
-    }
-
-    /// Pattern code for beginning a new region
-    /// **[**
-    #[inline]
-    fn pattern_begin(&self, lex_state: &mut LexState, letter: char) {
-        self.add_word(lex_state);
-        self.word_push(lex_state, letter);
-    }
-
-    /// Pattern code for ending current region
-    /// **]**
-    #[inline]
-    fn pattern_end(&self, lex_state: &mut LexState, letter: char) {
-        self.word_push(lex_state, letter);
-        self.add_word_inclusively(lex_state);
-    }
-
-    /// Push letter to the word and set token start index
-    fn word_push(&self, lex_state: &mut LexState, letter: char) {
-        if lex_state.word.is_empty() {
-            lex_state.token_start_index = lex_state.reader.get_index();
-        }
-        lex_state.word.push(letter);
-    }
-
-    /// Tokenize source code
-    ///
-    /// Run lexer and tokenize code. The result is stored in the lexem attribute
-    pub fn tokenize(&self, input: &str) -> Result<Vec<Token>, LexerError> {
-        let code = input.to_string();
-
-        let mut lex_state = LexState {
-            word: String::new(),
-            is_indenting: false,
-            is_escaped: false,
-            token_start_index: 0,
-            position: (0, 0),
-            lexem: Vec::new(),
-            reader: Reader::new(&code),
-            region_handler: RegionHandler::new(&self.rules),
-            compound_handler: CompoundHandler::new(&self.rules),
-        };
-
-        while let Some(letter) = lex_state.reader.next() {
-            /****************/
-            /* Set Position */
-            /****************/
-
-            // If the new position hasn't been set yet, set it
-            if lex_state.position == (0, 0) {
-                // If separator mode is set to Manual and the letter is a separator,
-                // then skip finding a new position
-                if SeparatorMode::Manual != self.separator_mode || letter != '\n' {
-                    let region = lex_state.region_handler.get_region().unwrap();
-                    // If the region is tokenized, then check if the letter is a separator
-                    if !region.tokenize || !vec![' ', '\t'].contains(&letter) {
-                        lex_state.position = lex_state.reader.get_position();
-                    }
-                }
-            }
-
-            // Reaction stores the reaction of the region handler
-            // Have we just opened or closed some region?
-            let reaction = lex_state
-                .region_handler
-                .handle_region(&lex_state.reader, lex_state.is_escaped);
-            match reaction {
-                // If the region has been opened
-                // Finish the part that we have been parsing
-                RegionReaction::Begin(tokenize) => {
-                    // Also if the new region is an interpolation that tokenizes
-                    // the inner content - separate the region from the content
-                    if tokenize {
-                        self.pattern_add_symbol(&mut lex_state, letter);
-                    }
-                    // Regular region case
-                    else {
-                        // This is supposed to prevent overshadowing new line
-                        // character if region rule opens with newline
-                        if letter == '\n' {
-                            // This additionally creates a new token
-                            self.pattern_add_symbol(&mut lex_state, letter);
-                        }
-                        // Normally start a new region
-                        self.pattern_begin(&mut lex_state, letter);
-                    }
-                }
-                // If the region has been closed
-                // Add the closing region and finish the word
-                RegionReaction::End(tokenize) => {
-                    // Also if the new region is an interpolation that tokenizes
-                    // the inner content - separate the region from the content
-                    if tokenize {
-                        self.pattern_add_symbol(&mut lex_state, letter);
-                    }
-                    // Regular region case
-                    else {
-                        // Normally close the region
-                        self.pattern_end(&mut lex_state, letter);
-                        // This is supposed to prevent overshadowing new line
-                        // character if region rule closes with newline
-                        if letter == '\n' {
-                            // This additionally creates a new token
-                            self.pattern_add_symbol(&mut lex_state, letter);
-                        }
-                    }
-                }
-                RegionReaction::Pass => {
-                    let is_tokenized_region = self.is_tokenized_region(&reaction, &mut lex_state);
-                    match lex_state.compound_handler.handle_compound(
-                        letter,
-                        &lex_state.reader,
-                        is_tokenized_region,
-                    ) {
-                        CompoundReaction::Begin => self.pattern_begin(&mut lex_state, letter),
-                        CompoundReaction::Keep => self.word_push(&mut lex_state, letter),
-                        CompoundReaction::End => self.pattern_end(&mut lex_state, letter),
-                        CompoundReaction::Pass => {
-                            // Handle region scope
-                            if !self.is_tokenized_region(&reaction, &mut lex_state) {
-                                let region = lex_state.region_handler.get_region().unwrap();
-                                // Flip escaped key
-                                lex_state.is_escaped = (!lex_state.is_escaped
-                                    && letter == self.escape_symbol)
-                                    .then(|| !lex_state.is_escaped)
-                                    .unwrap_or(false);
-                                // Handle singleline attribute
-                                if letter == '\n' && region.singleline {
-                                    let pos = lex_state.reader.get_position();
-                                    return Err((
-                                        LexerErrorType::Singleline,
-                                        PositionInfo::at_pos(self.path.clone(), pos, 0)
-                                            .data(region.name.clone()),
-                                    ));
-                                }
-                                self.word_push(&mut lex_state, letter);
-                            } else {
-                                /******************/
-                                /* Mode modifiers */
-                                /******************/
-
-                                // Create indent regions: '\n   '
-                                if let ScopingMode::Indent = self.scoping_mode {
-                                    // If we are still in the indent region - proceed
-                                    if lex_state.is_indenting && vec![' ', '\t'].contains(&letter) {
-                                        self.word_push(&mut lex_state, letter);
-                                    }
-                                    // If it's the new line - start indent region
-                                    if letter == '\n' {
-                                        lex_state.is_indenting = true;
-                                        self.pattern_begin(&mut lex_state, letter);
-                                    }
-                                    // Check if the current letter
-                                    // concludes current indent region
-                                    if lex_state.is_indenting {
-                                        if let Some(next_char) = lex_state.reader.peek() {
-                                            if !vec![' ', '\t'].contains(&next_char) {
-                                                self.add_indent(&mut lex_state);
-                                                lex_state.is_indenting = false;
-                                            }
-                                        }
-                                        continue;
-                                    }
-                                }
-                                // Skip newline character if we want to manually insert semicolons
-                                if let SeparatorMode::Manual = self.separator_mode {
-                                    if letter == '\n' {
-                                        self.add_word(&mut lex_state);
-                                        continue;
-                                    }
-                                }
-
-                                /*****************/
-                                /* Regular Lexer */
-                                /*****************/
-
-                                // Skip whitespace
-                                if vec![' ', '\t'].contains(&letter) {
-                                    self.add_word(&mut lex_state);
-                                }
-                                // Handle special symbols
-                                else if self.rules.symbols.contains(&letter) || letter == '\n' {
-                                    self.pattern_add_symbol(&mut lex_state, letter);
-                                }
-                                // Handle word
-                                else {
-                                    self.word_push(&mut lex_state, letter);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        self.add_word(&mut lex_state);
-        // If some region exists that was not closed
-        if let Err((pos, region)) = lex_state.region_handler.is_region_closed(&lex_state.reader) {
-            return Err((
-                LexerErrorType::Unclosed,
-                PositionInfo::at_pos(self.path.clone(), pos, 0).data(region.name),
-            ));
-        }
-
-        Ok(lex_state.lexem)
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use crate::compiling::ScopingMode;
-    use crate::compiling_rules::{Region, Rules};
-    use crate::reg;
-
-    #[test]
-    fn test_lexer_base() {
-        let symbols = vec!['(', ')'];
-        let regions = reg![reg!(string as "String literal" => {
-            begin: "'",
-            end: "'"
-        } => [
-            reg!(array as "Array Literal" => {
-                begin: "[",
-                end: "]"
-            })
-        ])];
-        let expected = vec![
-            ("let".to_string(), 1, 1),
-            ("a".to_string(), 1, 5),
-            ("=".to_string(), 1, 7),
-            ("(".to_string(), 1, 9),
-            ("12".to_string(), 1, 10),
-            ("+".to_string(), 1, 13),
-            ("32".to_string(), 1, 15),
-            (")".to_string(), 1, 17),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize("let a = (12 + 32)");
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_string_interp() {
-        let symbols = vec!['(', ')'];
-        let regions = reg![reg!(string_literal as "String literal" => {
-            begin: "'",
-            end: "'"
-        } => [
-            reg!(string_interp as "String interpolation" => {
-                begin: "{",
-                end: "}",
-                tokenize: true
-            } ref global)
-        ])];
-        let expected = vec![
-            ("let".to_string(), 1, 1),
-            ("a".to_string(), 1, 5),
-            ("=".to_string(), 1, 7),
-            ("'this ".to_string(), 1, 9),
-            ("{".to_string(), 1, 15),
-            ("'is ".to_string(), 1, 16),
-            ("{".to_string(), 1, 20),
-            ("adjective".to_string(), 1, 21),
-            ("}".to_string(), 1, 30),
-            (" long'".to_string(), 1, 31),
-            ("}".to_string(), 1, 37),
-            (" 🎉 text'".to_string(), 1, 38),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize("let a = 'this {'is {adjective} long'} 🎉 text'");
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_indent_scoping_mode() {
-        let symbols = vec![':'];
-        let regions = reg![];
-        let expected = vec![
-            ("if".to_string(), (1, 1), 0),
-            ("condition".to_string(), (1, 4), 3),
-            (":".to_string(), (1, 13), 12),
-            ("\n    ".to_string(), (2, 1), 13),
-            ("if".to_string(), (2, 5), 18),
-            ("subcondition".to_string(), (2, 8), 21),
-            (":".to_string(), (2, 20), 33),
-            ("\n        ".to_string(), (3, 1), 34),
-            ("pass".to_string(), (3, 9), 43),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-
-        let mut lexer = super::StaticLexer::new(rules);
-        lexer.scoping_mode = ScopingMode::Indent;
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer
-            .tokenize(&vec!["if condition:", "    if subcondition:", "        pass"].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, (lex.pos.0, lex.pos.1), lex.start));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_manual_separator_mode() {
-        let symbols = vec![';', '+', '='];
-        let regions = reg![];
-        let expected = vec![
-            ("let".to_string(), 1, 1),
-            ("age".to_string(), 1, 5),
-            ("=".to_string(), 1, 9),
-            ("12".to_string(), 1, 11),
-            ("+".to_string(), 2, 1),
-            ("12".to_string(), 3, 1),
-            (";".to_string(), 3, 3),
-        ];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize(&vec!["let age = 12", "+", "12;"].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_multiline_regions() {
-        let symbols = vec![';', '+', '='];
-        let regions = reg![reg!(string as "String" => {
-            begin: "'",
-            end: "'"
-        })];
-        let expected = vec![("'this\nis\na\nmultiline\nstring'".to_string(), 1, 1)];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize(&vec!["'this", "is", "a", "multiline", "string'"].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-
-    #[test]
-    fn test_lexer_escaped_regions() {
-        let symbols = vec![';', '+', '='];
-        let regions = reg![reg!(string as "String" => {
-            begin: "\"",
-            end: "\""
-        })];
-        let expected = vec![("\"this is \\\"escaped\\\" string\"".to_string(), 1, 1)];
-        let rules = Rules::new(symbols, vec![], regions);
-        let lexer = super::StaticLexer::new(rules);
-        let mut result = vec![];
-        // Simulate lexing
-        let res = lexer.tokenize(&vec!["\"this is \\\"escaped\\\" string\""].join("\n"));
-        assert!(res.is_ok());
-        for lex in res.unwrap() {
-            result.push((lex.word, lex.pos.0, lex.pos.1));
-        }
-        assert_eq!(expected, result);
-    }
-}
diff --git a/src/compiling/lexing/mod.rs b/src/compiling/lexing/mod.rs
index b53f82e..704573b 100644
--- a/src/compiling/lexing/mod.rs
+++ b/src/compiling/lexing/mod.rs
@@ -5,10 +5,7 @@
 use crate::prelude::PositionInfo;
 
 mod compound_handler;
-#[cfg(feature = "lexer_dynamic")]
 pub mod lexer;
-#[cfg(feature = "lexer_static")]
-pub mod lexer_static;
 mod reader;
 mod region_handler;
 
diff --git a/src/compiling/mod.rs b/src/compiling/mod.rs
index 534a476..52624ab 100644
--- a/src/compiling/mod.rs
+++ b/src/compiling/mod.rs
@@ -5,14 +5,12 @@
 
 mod lexing;
-#[cfg(feature = "compiler")]
 mod compiler;
 mod token;
 mod parser;
 
 pub mod failing;
 
 pub use lexing::*;
-#[cfg(feature = "compiler")]
 pub use compiler::*;
 pub use token::*;
 pub use parser::*;
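With the feature gates removed from both `mod.rs` files, the compiler and the single merged lexer are always compiled in, and the mode switches are always reachable through the `Compiler` facade. A hypothetical indentation-scoped setup that combines the two setters from the `compiler.rs` hunk, using the same assumed imports as the earlier sketches and mirroring the "Testhon" test case:

```rust
use heraclitus_compiler::prelude::*;

// Hypothetical: a Python-like language where blocks are indentation-scoped.
// use_indents() and set_separator() forward to the Compiler's inner Lexer.
fn python_like() -> Result<Vec<Token>, LexerError> {
    let rules = Rules::new(vec![':'], vec![], reg![]);
    let mut cc = Compiler::new("Testhon", rules);
    cc.use_indents(); // ScopingMode::Indent on the inner lexer
    cc.load("if condition:\n    pass");
    cc.tokenize()
}
```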