-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparserrequirements.hs
549 lines (533 loc) · 20.2 KB
/
parserrequirements.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
{-|
Module : ParserRequirements
Description : Library containing all the requirements for a parser
Copyright : (c) Samuel Williams, 2021
Graham Hutton, 2016
License : GPL-3
Maintainer : [email protected]
Stability : release
This module is copied to any generated parser and contains 5 sections of code:
[@ParseState@] Data type and helper functions for keeping track of source position
[@Result@] Maybe-like data type including error string for failure
[@Parser a@] Modified @Parser@ library from Graham Huttons Programming in Haskell
[@Scanner@] Configurable scanner/lexer taking a String and returning a list of @Token@s
[@languageDefsParser@] A set of @Parser@s specifically for the gmr file, and any other similar meta-languages
-}
module ParserRequirements (
-- * ParseState
ParseState (..),
showPos,
parseState,
inc,
incColumn,
incLine,
nextChar,
hasNextChar,
fromPos,
lineStr,
-- * Result
Result (..),
eitherToResult,
-- * Parser
Parser (..),
parseToResult,
parse,
item,
peek,
canPeek,
tryPeek,
getState,
mustFail,
sat,
digit,
lower,
upper,
letter,
alphanum,
char,
string,
anyString,
ident,
upperIdent,
nat,
int,
float,
-- * Scanner
Scanner (..),
Token (..),
TokenType (..),
scanner,
space,
tokenWithState,
asToken,
formattedString,
formattedStringRaw,
operator,
keyword,
getCharParser,
nonAlphaNext,
stringToken,
intToken,
floatToken,
lowerIdentifierToken,
upperIdentifierToken,
identifierToken,
brackets,
getIdentifierTokenParser,
commentParser,
blockCommentParser,
lineCommentParser,
scanParser,
scan,
-- * Language definition parsers
unpackCustom,
codeBlock,
directive,
identifierPrime,
languageDefsParser
) where
import Data.Char
import Data.List
import Data.Foldable
import Control.Applicative
import Control.Monad.State
-- | Data type for storing the current parse position within the input code. Used in the Parser library.
data ParseState = ParseState { line :: Int
, column :: Int
, pos :: Int
, code :: String
, rest :: String
} deriving Show
{-|
Generates a human readable parse position string, as shown below:
@
Line 1, Column 14
function text#()
^
@
-}
showPos :: ParseState -> String
showPos pos = "Line " ++ (show $ line pos) ++ ", Column " ++ (show $ column pos)
++ "\n " ++ lineStr pos ++ "\n " ++ replicate (column pos) ' ' ++ "^"
-- | Builds a starting @ParseState@ from an input string
parseState :: String -> ParseState
parseState code = ParseState 1 1 1 code code
-- | Increments a ParseState by looking at the next character, and adjusting the line/column numbers accordingly
inc :: ParseState -> ParseState
inc ps = if nextChar ps == '\n' then incLine ps else incColumn ps
-- | Increases the column number and global position of the @ParseState@, consumes a character
incColumn :: ParseState -> ParseState
incColumn (ParseState line col pos code rest) = ParseState line (col+1) (pos+1) code (tail rest)
-- | Increases the line number and global position, and resets the column number of a @ParseState@. Consumes a character.
incLine :: ParseState -> ParseState
incLine (ParseState line _ pos code rest) = ParseState (line+1) 1 (pos+1) code (tail rest)
-- | Gets the next character of the ParseState
nextChar :: ParseState -> Char
nextChar ps = head $ rest ps
-- | Checks if the ParseState has another character.
hasNextChar :: ParseState -> Bool
hasNextChar ps = length (rest ps) > 0
-- | Builds a parse state from an input string and global position, automatically working out the line and column numbers
fromPos :: String -> Int -> ParseState
fromPos code pos = ParseState { code = code
, pos = clampedPos
, line = (length $ filter (== '\n') prePosCode) + 1
, column = column
, rest = drop clampedPos code }
where
clampedPos = min pos $ length code
prePosCode = take clampedPos code
column = length $ takeWhile (/= '\n') $ reverse prePosCode
-- | Gets the current full line of the @ParseState@
lineStr :: ParseState -> String
lineStr (ParseState _ column pos code _) = takeWhile (/= '\n') (drop (pos - column) code)
-- | Result data type, similar to Maybe (with definitions for Monad) but with an error string
data Result a = Result a | Error String
instance Show a => Show (Result a) where
show (Result a) = show a
show (Error str) = "Error: " ++ str
instance Functor Result where
fmap f r = case r of
Result a -> Result $ f a
Error e -> Error e
instance Applicative Result where
pure v = Result v
pg <*> px = case pg of
Result g -> fmap g px
Error e -> Error e
instance Monad Result where
p >>= f = case p of
Result a -> f a
Error e -> Error e
-- | Converts an @Either String a@ computation to a @Result a@
eitherToResult :: Either String a -> Result a
eitherToResult (Left s) = Error s
eitherToResult (Right c) = Result c
-- Parsing
-- | Parser data type, which transforms a @ParseState@ to another @ParseState@, with the possibility to fail on the output
newtype Parser a = P (ParseState -> (Maybe a, ParseState))
-- | Takes a @Parser@ and String, and returns a @Result@ from the @Parser@. Fails if the @Parser@ fails to return a value, or if the input is not fully used up.
-- The @ParseState@ will be used to give meaningful errors on failure.
parseToResult :: Parser a -> String -> Result a
parseToResult p inp = case parse p $ parseState inp of
(Nothing, ps) -> err ps
(Just a, ps) -> if hasNextChar ps then err ps else Result a
where err ps = Error $ "Malformed token at " ++ (showPos ps)
-- | Simply runs a @Parser@ on an input
parse :: Parser a -> ParseState -> (Maybe a, ParseState)
parse (P p) s = p s
-- | Gets the next character, or fails if there aren't any
item :: Parser Char
item = P (\s -> if hasNextChar s then
(Just $ nextChar s, inc s)
else
(Nothing, s))
-- | Gets the next character without consuming it from the input, or fails if there isn't one
peek :: Parser Char
peek = P (\s -> if hasNextChar s then
(Just $ nextChar s, s)
else
(Nothing, s))
-- | Checks if the @ParseState@ has a next input, as a @Parser@ computation
canPeek :: Parser Bool
canPeek = P (\s -> (Just $ hasNextChar s, s))
-- | Attempts to peek, returning a @Just Char@ if successful, or @Nothing@ otherwise. This cannot fail.
tryPeek :: Parser (Maybe Char)
tryPeek = do able <- canPeek
if able then fmap Just peek else return Nothing
-- | Gets the current @ParseState@
getState :: Parser ParseState
getState = P (\s -> (Just s, s))
-- | Enforces a @Parser@ to fail, essentially reversing the default failure behaviour.
mustFail :: Parser a -> Parser ()
mustFail p = P (\s -> case parse p s of
(Nothing, s') -> (Just (), s)
(Just _, s') -> (Nothing, s))
instance Functor Parser where
fmap g p = P (\s -> case parse p s of
(Nothing, s') -> (Nothing, s')
(Just v, s') -> (Just (g v), s'))
instance Applicative Parser where
pure v = P (\s -> (Just v, s))
pg <*> px = P (\s -> case parse pg s of
(Nothing, s') -> (Nothing, s')
(Just g, s') -> parse (fmap g px) s')
instance Monad Parser where
p >>= f = P (\s -> case parse p s of
(Nothing, s') -> (Nothing, s)
(Just v, s') -> parse (f v) s')
instance Alternative Parser where
empty = P (\s -> (Nothing, s))
p <|> q = P (\s -> case parse p s of
(Nothing, _) -> parse q s
(Just v, s') -> (Just v, s'))
-- | Ensures the next character satisfying a predicate, returning the character if it does.
sat :: (Char -> Bool) -> Parser Char
sat p = do x <- item
if p x then return x else empty
-- | Gets a digit
digit :: Parser Char
digit = sat isDigit
-- | Gets a lower-case character
lower :: Parser Char
lower = sat isLower
-- | Gets an upper-case character
upper :: Parser Char
upper = sat isUpper
-- | Gets a letter as defined by @isAlpha@
letter :: Parser Char
letter = sat isAlpha
-- | Gets an alphanumeric character
alphanum :: Parser Char
alphanum = sat isAlphaNum
-- | Attempts to read a specific character
char :: Char -> Parser Char
char x = sat (== x)
-- | Attempts to read a specific string
string :: String -> Parser String
string [] = return []
string (x:xs) = do char x
string xs
return (x:xs)
-- | Attempts to read any of a list of strings
anyString :: [String] -> Parser String
anyString xs = asum $ map string xs
-- | Attempts to read an identity, as defined by an alphanumeric string starting with a lower-case letter
ident :: Parser String
ident = do x <- lower
xs <- many alphanum
return (x:xs)
-- | Attempts to read an upper-identity, as defined by an alphanumeric string starting with a upper-case letter
upperIdent :: Parser String
upperIdent = do x <- upper
xs <- many alphanum
return (x:xs)
-- | Attempts to read a natural number
nat :: Parser Int
nat = do xs <- some digit
return (read xs)
-- | Attempts to read an integer value
int :: Parser Int
int = do char '-'
n <- nat
return (-n)
<|> nat
-- | Attempts to read a float value, which can also interpret integers
float :: Parser Float
float = do n <- int
char '.'
d <- some digit
return $ read $ show n ++ '.':d
<|> fmap fromIntegral int
-- Scanner
-- | Scanner data type containing the configuration for a scanner
data Scanner = Scanner
{ separateCasedIdentifiers :: Bool
, ignoreWhitespace :: Bool
, ignoreComments :: Bool
, operators :: [String]
, keywords :: [String]
, blockComment :: Maybe (String, String)
, lineComment :: Maybe String
, includeEOF :: Bool
, parserMap :: ([Parser TokenType] -> [Parser TokenType])
}
-- | Instance for show which does not include the @parserMap@ field, as it is not showable
instance Show Scanner where
show (Scanner sep whsp cmts ops kwds block line eof _) =
"Scanner {separateCasedIdentifiers = " ++ show sep ++
", ignoreWhitespace = " ++ show whsp ++
", ignoreComments = " ++ show cmts ++
", operators = " ++ show ops ++
", keywords = " ++ show kwds ++
", blockComment = " ++ show block ++
", lineComment = " ++ show line ++
", includeEOF = " ++ show eof ++ "}"
-- | Token data type storing the token itself and the ParseState it was taken from
data Token = Token ParseState TokenType
instance Show Token where
show (Token _ tt) = show tt
-- | Various tokens for the scanner, including a TokenCustom constructor for any extras.
data TokenType = TokenIdentifier String
| TokenUpperIdentifier String
| TokenOperator String
| TokenStringLit String
| TokenIntLit Int
| TokenFloatLit Float
| TokenKeyword String
| TokenWhitespace Char
| TokenBlockComment String
| TokenLineComment String
| TokenCustom String String
| TokenOpenParen | TokenCloseParen
| TokenOpenSquare | TokenCloseSquare
| TokenOpenCurly | TokenCloseCurly
| TokenEOF
deriving (Show)
-- | Default definition of Scanner, for record modification
scanner = Scanner{ separateCasedIdentifiers=True
, ignoreWhitespace=True
, ignoreComments=True
, operators=["+", "-", "*", "/", "="]
, keywords=["for", "if"]
, blockComment=Just ("/*", "*/")
, lineComment=Just "//"
, includeEOF=False
, parserMap=id
}
-- | Matches a single space, and wraps the result in a @TokenWhiteSpace@
space :: Parser TokenType
space = fmap TokenWhitespace $ sat isSpace
-- | Takes a @TokenType@ parser, gets the current @ParseState@, and returns an equivalent @Token@ parser
tokenWithState :: Parser TokenType -> Parser Token
tokenWithState p = do
ps <- getState
t <- p
return $ Token ps t
-- | Throws away @TokenLineComment@ and @TokenBlockComment@ based on the @Scanner@ configuration
asTokenComment :: Scanner -> Token -> [Token]
asTokenComment conf t@(Token _ (TokenBlockComment _)) = if ignoreComments conf then [] else [t]
asTokenComment conf t@(Token _ (TokenLineComment _)) = if ignoreComments conf then [] else [t]
asTokenComment conf t = [t]
-- | Allows a token to be preceeded or proceeded by any amount of spaces, keeping them if the @Scanner@ configuration says to.
asToken :: Scanner -> Parser TokenType -> Parser [Token]
asToken conf p = do
preSpaces <- many $ tokenWithState space
token <- tokenWithState p
postSpaces <- many $ tokenWithState space
let tokens = asTokenComment conf token
if ignoreWhitespace conf then
return tokens
else
return $ preSpaces ++ tokens ++ postSpaces
-- | Reads a string literal, defined as any amount of tokens between two quotes, allowing them to be escaped with a \ if needed.
formattedString :: Parser String
formattedString = do
quote <- char '\'' <|> char '"'
str <- many $ (char '\\' >> item)
<|> sat (/= quote)
char quote
return str
-- | Similar to formattedString, but keeps the quotes and escape characters in the outputted string
formattedStringRaw :: Parser String
formattedStringRaw = do
quote <- char '\'' <|> char '"'
str <- fmap concat $ many $
fmap (\x -> ['\\', x]) (char '\\' >> item) <|>
fmap pure (sat (/= quote))
char quote
return $ quote:str ++ [quote]
-- | Sorts a list by the length of the elements of the list, starting with the longest
backLengthSorter :: [[a]] -> [[a]]
backLengthSorter = sortOn $ negate . length
-- | Creates a @Parser@ for operators from the @Scanner@ configuration
operator :: Scanner -> Parser TokenType
operator conf = fmap TokenOperator $ anyString $ backLengthSorter $ operators conf
-- | Creates a @Parser@ for keywords from the @Scanner@ configuration
keyword :: Scanner -> Parser TokenType
keyword conf = fmap TokenKeyword $ anyString $ backLengthSorter $ keywords conf
-- | Reads a character and returns a specific TokenType on read
getCharParser :: Char -> TokenType -> Parser TokenType
getCharParser c t = char c >> return t
-- | Ensures the next character is not alpha
nonAlphaNext :: Parser ()
nonAlphaNext = do
next <- tryPeek
case next of
Just v -> if isAlpha v then empty else return ()
Nothing -> return ()
-- | Parser for string literals, wrapped in @TokenStringLit@
stringToken :: Parser TokenType
stringToken = do
v <- formattedString
return $ TokenStringLit v
-- | Parser for integers, wrapped in @TokenIntLit@
intToken :: Parser TokenType
intToken = do
v <- int
nonAlphaNext
return $ TokenIntLit v
-- | Parser for floats, wrapped in @TokenFloatLit@
floatToken :: Parser TokenType
floatToken = do
v <- float
nonAlphaNext
return $ TokenFloatLit v
-- | Parser for lower identifiers, wrapped in @TokenIdentifier@
lowerIdentifierToken :: Parser TokenType
lowerIdentifierToken = do
id <- ident
return $ TokenIdentifier id
-- | Parser for upper identifiers, wrapped in @TokenUpperIdentifier@
upperIdentifierToken :: Parser TokenType
upperIdentifierToken = do
id <- upperIdent
return $ TokenUpperIdentifier id
-- | Parser for both identifiers and upper identifiers, both wrapped in @TokenIdentifier@
identifierToken :: Parser TokenType
identifierToken = do
id <- ident <|> upperIdent
return $ TokenIdentifier id
-- | Parsers for all of the 6 brackets: @[] {} ()@
brackets :: Parser TokenType
brackets = getCharParser '(' TokenOpenParen <|>
getCharParser ')' TokenCloseParen <|>
getCharParser '{' TokenOpenCurly <|>
getCharParser '}' TokenCloseCurly <|>
getCharParser '[' TokenOpenSquare <|>
getCharParser ']' TokenCloseSquare
-- | Gets parsers for identifiers, either separated by upper and lower identifiers, or both treated as lower identifiers, based on the boolean input
getIdentifierTokenParser :: Bool -> Parser TokenType
getIdentifierTokenParser isSeperate = if isSeperate then lowerIdentifierToken <|> upperIdentifierToken else identifierToken
-- | Parser for comments taking a starting and ending token, along with a constructor.
commentParser :: String -> String -> (String -> TokenType) -> Parser TokenType
commentParser openStr closeStr const = do
string openStr
commentBody <- many $ mustFail (string closeStr) >> item
nextToken <- tryPeek
if nextToken /= Nothing then string closeStr
else return empty
return $ const commentBody
-- | Parser for block comments based on @Scanner@ configuration
blockCommentParser :: Scanner -> Parser TokenType
blockCommentParser conf = case blockComment conf of
Nothing -> empty
Just (openStr, closeStr) -> commentParser openStr closeStr TokenBlockComment
-- | Parser for line comments, using the @Scanner@ for the starting token, and a new line for the ending token.
lineCommentParser :: Scanner -> Parser TokenType
lineCommentParser conf = case lineComment conf of
Nothing -> empty
Just str -> commentParser str "\n" TokenLineComment
-- | Creates a full scanner from a @Scanner@ configuration
scanParser :: Scanner -> Parser [Token]
scanParser conf = fmap concat $ many $ asum tokenedParserList
where
parserList = [ stringToken
, blockCommentParser conf
, lineCommentParser conf
, operator conf
, brackets
, keyword conf
, getIdentifierTokenParser $ separateCasedIdentifiers conf
, intToken
, floatToken ]
modifiedParserList = parserMap conf $ parserList
tokenedParserList = map (asToken conf) modifiedParserList
-- | Runs a Scanner on a string, returning a @Result@ of the token list
scan :: Scanner -> String -> Result [Token]
scan conf str = do
tokens <- parseToResult (scanParser conf) str
if includeEOF conf then
return $ tokens ++ [Token (fromPos str $ length str) TokenEOF]
else
return tokens
-- language defs parser
-- | Unpacks the data value from a @TokenCustom@
unpackCustom :: TokenType -> String
unpackCustom (TokenCustom _ str) = str
unpackCustom _ = ""
nonCurly :: Parser String
nonCurly = do
c <- item
if (c == '}') || (c == '{') then
empty
else
return [c]
nonCurlyStr :: Parser String
nonCurlyStr = fmap concat $ many $ (fmap (:[]) $ char '\'') <|> formattedStringRaw <|> nonCurly
innerCodeBlock :: Parser String
innerCodeBlock = fmap unpackCustom (codeBlock' True) <|> return ""
codeBlockAux :: Parser String
codeBlockAux = do
preStr <- nonCurlyStr
innerBlock <- innerCodeBlock
let out = preStr ++ innerBlock
if out == "" then empty else return out
withBraces :: Bool -> String -> String
withBraces True str = "{" ++ str ++ "}"
withBraces False str = str
codeBlock' :: Bool -> Parser TokenType
codeBlock' includeBraces = do
char '{'
str <- fmap concat $ many codeBlockAux
postStr <- nonCurlyStr
char '}'
return $ TokenCustom "CodeBlock" $ withBraces includeBraces $ str ++ postStr
-- | Parser for directives of the form @%mydirective@
directive :: Parser TokenType
directive = do
char '%'
str <- ident
return $ TokenCustom "Directive" str
-- | Haskell identifiers, defined as lower identifiers followed by zero or more @'@s
identifierPrime :: Parser TokenType
identifierPrime = do
name <- ident
primes <- some (char '\'')
return $ TokenCustom "IdentifierPrime" (name ++ primes)
-- | Haskell code block, defined as follows: @{ haskellCode }@
codeBlock :: Parser TokenType
codeBlock = codeBlock' False
-- | All three above parsers combined into one parser
languageDefsParser :: Parser TokenType
languageDefsParser = codeBlock <|> directive <|> identifierPrime