forked from VE3NEA/MorseRunner
Add Util/Lexer.pas #337
Merged
Util/Lexer.pas

unit Lexer;

interface

uses
  Generics.Defaults,
  Generics.Collections,  // for TList<>
  SysUtils,              // Exception
  PerlRegEx;             // for regular expression support (TPerlRegEx, TPerlRegExList)

type
  {
    Lexer rules are defined as a pair, consisting of a RegEx string and a
    corresponding token type. An array of TTokenRuleDef records is passed
    into TLexer.Create.

    Example:
      LexerRules: array[0..2] of TTokenRuleDef = (
        (R: '[A-Z]+';        T: Ord(ttAlpha)),
        (R: '\d+';           T: Ord(ttNumeric)),
        (R: '[A-Z][A-Z\d]*'; T: Ord(ttAlphaNumeric))
      );

    Perl-Compatible Regular Expressions...
    - https://pcre.org/original/doc/html/index.html
    - https://pcre.org/original/doc/html/pcrepattern.html#SEC27
  }
  TTokenRuleDef = record
    R: PCREString;
    T: Integer;
  end;

  {
    Returned by TLexer.NextToken(var AToken: TExchToken).
  }
  TExchToken = record
    TokenType: Integer;
    Value: string;
    Pos: integer;

    procedure Init(AType: Integer; aValue: string; aPos: integer);
  end;

  {
    A simple regex-based lexer/tokenizer.

    The basic idea is to search a set of rules (regular expressions) looking
    for a match, where each expression represents a different token. Whitespace
    is handled in one of two ways: it can be skipped automatically by the lexer,
    or the user can provide additional rules to manage whitespace.

    Inspiration and design are based on this article:
    https://eli.thegreenplace.net/2013/06/25/regex-based-lexical-analysis-in-python-and-javascript
  }
  TLexer = class
  protected
    type
      {
        Holds a single token rule for the lexer, including its type and
        corresponding regular expression. A set of rules is passed to
        the lexer as an array of TTokenRuleDef records.
      }
      TTokenRule = packed record
        tokenType: Integer;
        regex: TPerlRegEx;

        constructor init(AType: Integer; ARegEx: TPerlRegEx);
      end;

    var
      SkipWhitespace: Boolean;
      Pos: Integer;
      Buf: string;
      ReSkipWhitespace: TPerlRegEx;
      Rules: TList<TTokenRule>;
  public
    type
      ELexerError = class(SysUtils.Exception);
      EInvalidData = class(ELexerError);

    constructor Create(const ARules: array of TTokenRuleDef;
      ASkipWhitespace: Boolean = True);
    destructor Destroy; override;

    procedure Input(const ABuf: string);
    function NextToken(var AToken: TExchToken): Boolean; virtual;
  end;

implementation

uses
  System.Classes;

constructor TLexer.TTokenRule.init(AType: Integer; ARegEx: TPerlRegEx);
begin
  Self.tokenType := AType;
  Self.regex := ARegEx;
end;

procedure TExchToken.Init(AType: Integer; aValue: string; aPos: integer);
begin
  Self.TokenType := AType;
  Self.Value := aValue;
  Self.Pos := aPos;
end;

{
  Create a lexer.

  ARules
    An array of TTokenRuleDef records, each containing a regular expression
    and a token type value: `R` is the regular expression used to recognize
    the token and `T` is the type of the token to return when it is
    recognized.

  ASkipWhitespace
    If True, whitespace is skipped and not reported by the lexer.
    Otherwise, you have to provide your own rules for whitespace, or it
    will be flagged as an error.
}
constructor TLexer.Create(const ARules: array of TTokenRuleDef;
  ASkipWhitespace: Boolean = True);
var
  Def: TTokenRuleDef;
  Rule: TTokenRule;
  Reg: TPerlRegEx;
begin
  ReSkipWhitespace := TPerlRegEx.Create;
  Rules := TList<TTokenRule>.Create;

  SkipWhitespace := ASkipWhitespace;
  ReSkipWhitespace.Options := [preAnchored];
  ReSkipWhitespace.RegEx := '\s*'; // '\s+';
  ReSkipWhitespace.Compile;

  for Def in ARules do
  begin
    Reg := TPerlRegEx.Create;
    Reg.Options := [preAnchored];
    Reg.RegEx := Def.R;
    Rule.regex := Reg;
    Rule.tokenType := Def.T;
    Rules.Add(Rule);
    Reg := nil;
  end;
end;

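{ Destroy the lexer, freeing each per-rule regex, the rule list, and the
  whitespace regex. }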
destructor TLexer.Destroy;
var
  Rule: TTokenRule;
begin
  for Rule in Rules do
    Rule.regex.Free;
  FreeAndNil(Rules);
  FreeAndNil(ReSkipWhitespace);
  inherited;
end;

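{ Assign a new input buffer to be tokenized. Resets the scan position and
  primes the whitespace and per-rule regexes against the new subject. }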
procedure TLexer.Input(const ABuf: string);
var
  Rule: TTokenRule;
begin
  Buf := ABuf;
  Pos := 1;
  ReSkipWhitespace.Subject := Self.Buf;
  ReSkipWhitespace.Start := 1;
  ReSkipWhitespace.Stop := Self.Buf.Length;
  for Rule in Rules do
  begin
    Rule.regex.Subject := Self.Buf;
    Rule.regex.Start := 1;
    Rule.regex.Stop := Self.Buf.Length;
  end;
end;

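{ Scan for the next token, returned in AToken. Returns False once the
  buffer is exhausted; raises EInvalidData if no rule matches at the
  current position. }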
function TLexer.NextToken(var AToken: TExchToken): Boolean;
var
  Rule: TTokenRule;
begin
  Result := Self.Pos <= Buf.Length;
  if not Result then
  begin
    AToken.Init(-1, Self.Buf, Self.Pos);
    Exit;
  end;

  if SkipWhitespace then
  begin
    Assert(ReSkipWhitespace.Subject = Self.Buf);
    Assert(ReSkipWhitespace.Stop = Self.Buf.Length);
    ReSkipWhitespace.Start := Self.Pos;
    if ReSkipWhitespace.MatchAgain then
      Self.Pos := ReSkipWhitespace.Start;

    Result := Self.Pos <= Buf.Length;
    if not Result then
    begin
      AToken.Init(-1, Self.Buf, Self.Pos);
      Exit;
    end;
  end;

  for Rule in Rules do
  begin
    Assert(Rule.regex.Subject = Self.Buf);
    Assert(Rule.regex.Stop = Self.Buf.Length);
    Rule.regex.Start := Self.Pos;
    Result := Rule.regex.MatchAgain;
    if Result then
    begin
      AToken.Init(Rule.tokenType, Rule.regex.MatchedText, Self.Pos);
      Self.Pos := Rule.regex.Start;
      Exit;
    end;
  end;

  // if we're here, no rule matched
  raise EInvalidData.CreateFmt('Invalid data (%s) at position %d',
    [Self.Buf.Substring(Self.Pos - 1, 1), Self.Pos]);
end;

end.
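
A minimal usage sketch, for review context only (not part of this diff). The TTokenType enum, DemoRules table, and Demo procedure are invented for illustration, following the example in the unit header:

uses
  SysUtils, Lexer;

type
  TTokenType = (ttAlpha, ttNumeric);

const
  // Rule table from the unit-header example, minus ttAlphaNumeric.
  DemoRules: array[0..1] of TTokenRuleDef = (
    (R: '[A-Z]+'; T: Ord(ttAlpha)),
    (R: '\d+';    T: Ord(ttNumeric))
  );

procedure Demo;
var
  Lex: TLexer;
  Tok: TExchToken;
begin
  Lex := TLexer.Create(DemoRules, True);  // True: skip whitespace
  try
    Lex.Input('CQ 599 TEST');
    // Emits CQ and TEST as ttAlpha and 599 as ttNumeric.
    while Lex.NextToken(Tok) do
      WriteLn(Format('type=%d value=%s pos=%d',
        [Tok.TokenType, Tok.Value, Tok.Pos]));
  finally
    Lex.Free;
  end;
end;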
Unless I'm missing something, it must start with a letter, so it's not "AlphaNumeric" per se; "1A" won't match, for example. I understand these are only examples, but it's a bit confusing.
Good point. I initially thought the ttAlphaNumeric and ttNumericAlpha tokens would be useful, but they weren't. In the end, for the ARRL SS Contest, I converged on three token types: ttAlpha, ttDigits (numeric), and ttCallsign. I will update the example above to remove ttAlphaNumeric later today or tonight.
In the next few days, I'll post the ARRL SS Contest for review. It will show the tables used for SS and will hopefully make things clearer. Parsing has never been my strength, so my examples may not be the best.
Thank you for the feedback.
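Roughly, the updated rule table might look like this (the ttCallsign pattern below is illustrative only; the real table will come with the SS changes). Since NextToken tries the rules in order, the most specific rule is listed first:

LexerRules: array[0..2] of TTokenRuleDef = (
  (R: '\d?[A-Z]+\d+[A-Z]+'; T: Ord(ttCallsign)),  // illustrative callsign pattern
  (R: '[A-Z]+';             T: Ord(ttAlpha)),
  (R: '\d+';                T: Ord(ttDigits))
);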
@f6fvy
I updated the comments and pushed an updated version.