From e0985b0b17531bc5f49a23ebae935c191356b82e Mon Sep 17 00:00:00 2001 From: Mike Brashler Date: Fri, 26 Jul 2024 15:40:27 -0700 Subject: [PATCH 1/4] Add Util/Lexer.pas --- MorseRunner.dpr | 3 +- MorseRunner.dproj | 7 ++ Util/Lexer.pas | 226 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 Util/Lexer.pas diff --git a/MorseRunner.dpr b/MorseRunner.dpr index f379004..d3c6d07 100644 --- a/MorseRunner.dpr +++ b/MorseRunner.dpr @@ -47,7 +47,8 @@ uses ACAG in 'ACAG.pas', IaruHf in 'IaruHf.pas', ExchFields in 'ExchFields.pas', - SerNRGen in 'SerNRGen.pas'; + SerNRGen in 'SerNRGen.pas', + Lexer in 'Util\Lexer.pas'; {$R *.RES} diff --git a/MorseRunner.dproj b/MorseRunner.dproj index 563f39b..b8dbfb9 100644 --- a/MorseRunner.dproj +++ b/MorseRunner.dproj @@ -166,6 +166,7 @@ + Base @@ -279,6 +280,12 @@ + + + MorseRunner.exe + true + + diff --git a/Util/Lexer.pas b/Util/Lexer.pas new file mode 100644 index 0000000..83c8753 --- /dev/null +++ b/Util/Lexer.pas @@ -0,0 +1,226 @@ +unit Lexer; + +interface + +uses + Generics.Defaults, + Generics.Collections, // for TList<> + SysUtils, // Exception + PerlRegEx; // for regular expression support (TPerlRegEx, TPerlRegExList) + +type + { + Lexer rules are defined as a pair, consisting of a RegEx string and a + corresponding type. An array of TTokenRuleDef records will be passed + into TLexer.Create. + + Example: + LexerRules: array[0..2] of TTokenRuleDef = ( + (R: '[A-Z]+'; T: Ord(ttAlpha)), + (R: '\d+'; T: Ord(ttNumeric)), + (R: '[A-Z][A-Z\d]*'; T: Ord(ttAlphaNumeric)) + ); + + Perl-Compatible Regular Expressions ... + - https://pcre.org/original/doc/html/index.html + - https://pcre.org/original/doc/html/pcrepattern.html#SEC27 + } + TTokenRuleDef = record + R: PCREString; + T: Integer; + end; + + { + Returned by TLexer.NextToken(out tok: TExchToken). 
+ } + TExchToken = record + TokenType: Integer; + Value: string; + Pos: integer; + + procedure Init(AType: Integer; aValue: string; aPos: integer); + end; + + { + A simple regex-based lexer/tokenizer. + + The basic idea is to search a set of rules (regular expressions) looking + for a match, where each expression represents a different token. Whitespace + is handled in one of two ways: it can be automatically skipped by the Lexer, + or user can provide additional rules to manage whitespace. + + Inspiration and design is based on this article: + https://eli.thegreenplace.net/2013/06/25/regex-based-lexical-analysis-in-python-and-javascript + } + TLexer = class + private + protected + type + { + Hold a single token rule for the Lexer, including its type and + corresponding regular expression. A set of rules are passed to + the Lexer as an array of TTokenRuleDef records. + } + TTokenRule = packed record + tokenType: Integer; + regex: TPerlRegEx; + + constructor init(AType: Integer; ARegEx: TPerlRegEx); + end; + + var + SkipWhitespace: Boolean; + Pos: Integer; + Buf: string; + ReSkipWhitespace: TPerlRegEx; + Rules: TList; + public + type + ELexerError = class(SysUtils.Exception); + EInvalidData = class(ELexerError); + + constructor Create(const ARules: array of TTokenRuleDef; + ASkipWhitespace: Boolean = True); + destructor Destroy; override; + + procedure Input(const ABuf: string); + function NextToken(var AToken: TExchToken): Boolean; virtual; + end; + +implementation + +uses + System.Classes; + +constructor TLexer.TTokenRule.init(AType: Integer; ARegEx: TPerlRegEx); +begin + Self.tokenType := AType; + Self.regex := ARegEx; +end; + +procedure TExchToken.Init(AType: Integer; aValue: string; aPos: integer); +begin + Self.TokenType := AType; + Self.Value := aValue; + Self.Pos := aPos; +end; + +{ + Create a Lexer... + ARules + An array of TTokenRuleDef's. Each rule contains a regex + and a Token type value. 
`Regex` is the regular expression used + to recognize the token and `type` is the type of the token to return + when it's recognized. + + ASkipWhitespace + If True, whitespace will be skipped and not reported by the lexer. + Otherwise, you have to specify your rules for whitespace, or it will be + flagged as an error. +} +constructor TLexer.Create(const ARules: array of TTokenRuleDef; + ASkipWhitespace: Boolean = True); +var + Def: TTokenRuleDef; + Rule: TTokenRule; + Reg: TPerlRegEx; +begin + ReSkipWhitespace := TPerlRegEx.Create; + Rules := TList.Create; + + SkipWhitespace := ASkipWhitespace; + ReSkipWhitespace.Options := [preAnchored]; + ReSkipWhitespace.RegEx := '\s*'; //'\s+'; + ReSkipWhitespace.Compile; + + for Def in ARules do + begin + Reg := TPerlRegEx.Create; + Reg.Options := [preAnchored]; + Reg.RegEx := Def.R; + Rule.regex := Reg; + Rule.tokenType := Def.T; + Rules.Add(Rule); + Reg := nil; + end; +end; + + +destructor TLexer.Destroy; +var + Rule: TTokenRule; +begin + for Rule in Rules do + Rule.regex.Free; + FreeAndNil(Rules); + FreeAndNil(ReSkipWhitespace); +end; + + +procedure TLexer.Input(const ABuf: string); +var + Rule: TTokenRule; +begin + Buf := ABuf; + Pos := 1; + ReSkipWhitespace.Subject := Self.Buf; + ReSkipWhitespace.Start := 1; + ReSkipWhitespace.Stop := Self.Buf.Length; + for Rule in Rules do + begin + Rule.regex.Subject := Self.Buf; + Rule.regex.Start := 1; + Rule.regex.Stop := Self.Buf.Length; + end; +end; + + +function TLexer.NextToken(var AToken: TExchToken): Boolean; +var + Rule: TTokenRule; + Matched: boolean; +begin + Result := self.Pos <= buf.length; + if not Result then + begin + AToken.Init(-1, self.Buf, self.Pos); + Exit; + end; + + if SkipWhitespace then + begin + assert(ReSkipWhitespace.Subject = self.Buf); + assert(ReSkipWhitespace.Stop = Self.Buf.Length); + ReSkipWhitespace.Start := self.Pos; + if ReSkipWhitespace.MatchAgain then + self.Pos := ReSkipWhitespace.Start; + + Result := self.Pos <= buf.length; + if not 
Result then + begin + AToken.Init(-1, self.Buf, self.Pos); + Exit; + end; + end; + + for Rule in Rules do + begin + assert(Rule.regex.Subject = Self.Buf); + assert(Rule.regex.Stop = Self.Buf.Length); + Rule.regex.Start := Self.Pos; + Result := Rule.regex.MatchAgain; + if Result then + begin + AToken.Init(Rule.tokenType, Rule.regex.MatchedText, Self.Pos); + Self.Pos := Rule.regex.Start; + Exit; + end; + end; + + // if we're here, no rule matched + raise EInvalidData.CreateFmt('Invalid data (%s) at position %d', + [Self.Buf.Substring(Self.Pos-1,1), Self.Pos]); +end; + + +end. From 0b45b0d89cbc068a400402f0b7cc50e5b4c6b98e Mon Sep 17 00:00:00 2001 From: Mike Brashler Date: Thu, 22 Aug 2024 15:53:48 -0700 Subject: [PATCH 2/4] Update comments --- Util/Lexer.pas | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Util/Lexer.pas b/Util/Lexer.pas index 83c8753..70868d1 100644 --- a/Util/Lexer.pas +++ b/Util/Lexer.pas @@ -12,13 +12,14 @@ interface { Lexer rules are defined as a pair, consisting of a RegEx string and a corresponding type. An array of TTokenRuleDef records will be passed - into TLexer.Create. + into TLexer.Create. Example: LexerRules: array[0..2] of TTokenRuleDef = ( (R: '[A-Z]+'; T: Ord(ttAlpha)), (R: '\d+'; T: Ord(ttNumeric)), - (R: '[A-Z][A-Z\d]*'; T: Ord(ttAlphaNumeric)) + (R: '\+'; T: Ord(ttPlus)), + (R: '-'; T: Ord(ttMinus)) ); Perl-Compatible Regular Expressions ... - https://pcre.org/original/doc/html/index.html @@ -49,6 +50,9 @@ TExchToken = record is handled in one of two ways: it can be automatically skipped by the Lexer, or user can provide additional rules to manage whitespace. + The initial application of this class will be to support the ARRL + Sweepstakes Contest. 
+ Inspiration and design is based on this article: https://eli.thegreenplace.net/2013/06/25/regex-based-lexical-analysis-in-python-and-javascript } From 0e9afeb37e46d26385b5bd4f25ca7b91d098659c Mon Sep 17 00:00:00 2001 From: Mike Brashler Date: Thu, 22 Aug 2024 15:56:25 -0700 Subject: [PATCH 3/4] update comments --- Util/Lexer.pas | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Util/Lexer.pas b/Util/Lexer.pas index 70868d1..82a247c 100644 --- a/Util/Lexer.pas +++ b/Util/Lexer.pas @@ -15,7 +15,7 @@ interface into TLexer.Create. Example: - LexerRules: array[0..2] of TTokenRuleDef = ( + LexerRules: array[0..3] of TTokenRuleDef = ( (R: '[A-Z]+'; T: Ord(ttAlpha)), (R: '\d+'; T: Ord(ttNumeric)), (R: '\+'; T: Ord(ttPlus)), From a8d4dc73522b7d9152c64373f083519650163f34 Mon Sep 17 00:00:00 2001 From: Mike Brashler Date: Sat, 24 Aug 2024 00:22:51 -0700 Subject: [PATCH 4/4] fix compiler warnings --- Util/Lexer.pas | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Util/Lexer.pas b/Util/Lexer.pas index 82a247c..54f678a 100644 --- a/Util/Lexer.pas +++ b/Util/Lexer.pas @@ -167,12 +167,12 @@ procedure TLexer.Input(const ABuf: string); begin Buf := ABuf; Pos := 1; - ReSkipWhitespace.Subject := Self.Buf; + ReSkipWhitespace.Subject := UTF8String(Self.Buf); ReSkipWhitespace.Start := 1; ReSkipWhitespace.Stop := Self.Buf.Length; for Rule in Rules do begin - Rule.regex.Subject := Self.Buf; + Rule.regex.Subject := UTF8String(Self.Buf); Rule.regex.Start := 1; Rule.regex.Stop := Self.Buf.Length; end; @@ -182,7 +182,6 @@ procedure TLexer.Input(const ABuf: string); function TLexer.NextToken(var AToken: TExchToken): Boolean; var Rule: TTokenRule; - Matched: boolean; begin Result := self.Pos <= buf.length; if not Result then