From ae525e34edf017544e8ef4457d7e57cf2081dcf9 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Sun, 26 Jan 2020 20:15:51 +0100 Subject: [PATCH] [ fixed #119 ] latin1 encoding: each byte counts as 1 char The computation of the length component of AlexToken was tailored to the utf8 encoding, and didn't work correctly for latin1. This is fixed by having a new flag ALEX_LATIN1 in templates/GenericTemplate.hs that turns on code that increases the length by 1 for each byte, while for utf8 something more sophisticated is done. The fix requires more template instances to be generated. To streamline the instance generation, now all 2^4 = 16 template instances are generated for the 4 flags - ghc - latin1 - nopred - debug To ensure consistent reference to the template instance, a function templateFileName residing both in src/Main and gen-alex-sdist/Main needs to be kept consistent, should more dimensions be added to the template. (Putting this function into a separate file that is included by both modules could be an option, but seemed not enough in the spirit of cabal-organized projects.) --- alex.cabal | 16 +++++++- gen-alex-sdist/Main.hs | 39 +++++++++++++++---- src/AbsSyn.hs | 2 + src/Main.hs | 41 +++++++++++--------- templates/GenericTemplate.hs | 12 ++++-- tests/Makefile | 1 + tests/issue_119.x | 73 ++++++++++++++++++++++++++++++++++++ 7 files changed, 154 insertions(+), 30 deletions(-) create mode 100644 tests/issue_119.x diff --git a/alex.cabal b/alex.cabal index 33b6307..561f676 100644 --- a/alex.cabal +++ b/alex.cabal @@ -38,10 +38,21 @@ data-dir: data/ data-files: AlexTemplate + AlexTemplate-debug + AlexTemplate-nopred + AlexTemplate-nopred-debug + AlexTemplate-latin1 + AlexTemplate-latin1-debug + AlexTemplate-latin1-nopred + AlexTemplate-latin1-nopred-debug AlexTemplate-ghc - AlexTemplate-ghc-nopred AlexTemplate-ghc-debug - AlexTemplate-debug + AlexTemplate-ghc-nopred + AlexTemplate-ghc-nopred-debug + AlexTemplate-ghc-latin1 + AlexTemplate-ghc-latin1-debug + AlexTemplate-ghc-latin1-nopred + AlexTemplate-ghc-latin1-nopred-debug AlexWrapper-basic AlexWrapper-basic-bytestring AlexWrapper-strict-bytestring @@ -110,6 +121,7 @@ extra-source-files: tests/strict_typeclass.x tests/unicode.x tests/issue_71.x + tests/issue_119.x source-repository head type: git diff --git a/gen-alex-sdist/Main.hs b/gen-alex-sdist/Main.hs index 189fb97..7465e52 100644 --- a/gen-alex-sdist/Main.hs +++ b/gen-alex-sdist/Main.hs @@ -1,6 +1,7 @@ module Main (main) where import Control.Monad +import qualified Data.List as List import Language.Preprocessor.Cpphs import System.Directory import System.FilePath @@ -51,13 +52,37 @@ all_template_files :: [FilePath] all_template_files = map fst (templates ++ wrappers) templates :: [(FilePath,[String])] -templates = [ - ("AlexTemplate", []), - ("AlexTemplate-ghc", ["ALEX_GHC"]), - ("AlexTemplate-ghc-nopred",["ALEX_GHC", "ALEX_NOPRED"]), - ("AlexTemplate-ghc-debug", ["ALEX_GHC","ALEX_DEBUG"]), - ("AlexTemplate-debug", ["ALEX_DEBUG"]) - ] +templates = + [ ( templateFileName ghc latin1 nopred debug + , templateFlags ghc latin1 nopred debug + ) + | ghc <- allBool + , latin1 <- allBool + , nopred <- allBool + , debug <- allBool + ] + where + allBool = [False, True] + +-- Keep this function in sync with its twin in src/Main.hs. +templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath +templateFileName ghc latin1 nopred debug = + List.intercalate "-" $ concat + [ [ "AlexTemplate" ] + , [ "ghc" | ghc ] + , [ "latin1" | latin1 ] + , [ "nopred" | nopred ] + , [ "debug" | debug ] + ] + +templateFlags :: Bool -> Bool -> Bool -> Bool -> [String] +templateFlags ghc latin1 nopred debug = + map ("ALEX_" ++) $ concat + [ [ "GHC" | ghc ] + , [ "LATIN1" | latin1 ] + , [ "NOPRED" | nopred ] + , [ "DEBUG" | debug ] + ] wrappers :: [(FilePath,[String])] wrappers = [ diff --git a/src/AbsSyn.hs b/src/AbsSyn.hs index bf580ea..3eb8f6b 100644 --- a/src/AbsSyn.hs +++ b/src/AbsSyn.hs @@ -170,6 +170,7 @@ type StartCode = Int -- we can generate somewhat faster code in the case that -- the lexer doesn't use predicates data UsesPreds = UsesPreds | DoesntUsePreds + deriving Eq usesPreds :: DFA s a -> UsesPreds usesPreds dfa @@ -390,3 +391,4 @@ extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str . -- Code generation targets data Target = GhcTarget | HaskellTarget + deriving Eq diff --git a/src/Main.hs b/src/Main.hs index 8245ced..dcd06d0 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -33,6 +33,7 @@ import Control.Exception ( bracketOnError ) import Control.Monad ( when, liftM ) import Data.Char ( chr ) import Data.List ( isSuffixOf, nub ) +import qualified Data.List as List import Data.Maybe ( isJust, fromJust ) import Data.Version ( showVersion ) import System.Console.GetOpt ( getOpt, usageInfo, ArgOrder(..), OptDescr(..), ArgDescr(..) ) @@ -218,7 +219,7 @@ alex cli file basename script = do hPutStr out_h (actions "") -- add the template - let template_name = templateFile template_dir target usespreds cli + let template_name = templateFile template_dir target encoding usespreds cli tmplt <- alexReadFile template_name hPutStr out_h tmplt @@ -399,23 +400,27 @@ templateDir def cli [] -> def ds -> return (last ds) -templateFile :: FilePath -> Target -> UsesPreds -> [CLIFlags] -> FilePath -templateFile dir target usespreds cli - = dir ++ "/AlexTemplate" ++ maybe_ghc ++ maybe_debug ++ maybe_nopred - where - maybe_ghc = case target of - GhcTarget -> "-ghc" - _ -> "" - - maybe_debug - | OptDebugParser `elem` cli = "-debug" - | otherwise = "" - - maybe_nopred = - case usespreds of - DoesntUsePreds | not (null maybe_ghc) - && null maybe_debug -> "-nopred" - _ -> "" +-- Keep this function in sync with its twin in gen-alex-sdist/Main.hs. +templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath +templateFileName ghc latin1 nopred debug = + List.intercalate "-" $ concat + [ [ "AlexTemplate" ] + , [ "ghc" | ghc ] + , [ "latin1" | latin1 ] + , [ "nopred" | nopred ] + , [ "debug" | debug ] + ] + +templateFile :: FilePath -> Target -> Encoding -> UsesPreds -> [CLIFlags] -> FilePath +templateFile dir target encoding usespreds cli = concat + [ dir + , "/" + , templateFileName + (target == GhcTarget) + (encoding == Latin1) + (usespreds == DoesntUsePreds) + (OptDebugParser `elem` cli) + ] wrapperFile :: FilePath -> Scheme -> Maybe FilePath wrapperFile dir scheme = diff --git a/templates/GenericTemplate.hs b/templates/GenericTemplate.hs index ff71440..bc6bf26 100644 --- a/templates/GenericTemplate.hs +++ b/templates/GenericTemplate.hs @@ -175,9 +175,15 @@ alex_scan_tkn user__ orig_input len input__ s last_acc = ILIT(-1) -> (new_acc, input__) -- on an error, we want to keep the input *before* the -- character that failed, not after. - _ -> alex_scan_tkn user__ orig_input (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len) - -- note that the length is increased ONLY if this is the 1st byte in a char encoding) - new_input new_s new_acc + _ -> alex_scan_tkn user__ orig_input +#ifdef ALEX_LATIN1 + PLUS(len,ILIT(1)) + -- issue 119: in the latin1 encoding, *each* byte is one character +#else + (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len) + -- note that the length is increased ONLY if this is the 1st byte in a char encoding) +#endif + new_input new_s new_acc } where check_accs (AlexAccNone) = last_acc diff --git a/tests/Makefile b/tests/Makefile index 1f7c0b6..5d21425 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -41,6 +41,7 @@ TESTS = \ default_typeclass.x \ gscan_typeclass.x \ issue_71.x \ + issue_119.x \ monad_typeclass.x \ monad_typeclass_bytestring.x \ monadUserState_typeclass.x \ diff --git a/tests/issue_119.x b/tests/issue_119.x new file mode 100644 index 0000000..2229c66 --- /dev/null +++ b/tests/issue_119.x @@ -0,0 +1,73 @@ +-- -*- haskell -*- +{ +-- Issue 119, +-- reported 2017-10-11 by Herbert Valerio Riedel, +-- fixed 2020-01-26 by Andreas Abel. +-- +-- Problem was: the computed token length (in number of characters) +-- attached to AlexToken is tailored to UTF8 encoding and wrong +-- for LATIN1 encoding. + +module Main where + +import Control.Monad (unless) +import qualified Data.ByteString as B +import Data.Word +import System.Exit (exitFailure) +} + +%encoding "latin1" + +:- + +[\x01-\xff]+ { False } +[\x00] { True } + +{ +type AlexInput = B.ByteString + +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte = B.uncons + +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar = undefined + +-- generated by @alex@ +alexScan :: AlexInput -> Int -> AlexReturn Bool + +{- + +GOOD cases: + +("012\NUL3","012","\NUL3",3,3,False) +("\NUL0","\NUL","0",1,1,True) +("012","012","",3,3,False) + +BAD case: + +("0@P`p\128\144\160","0@P`p","",5,8,False) + +expected: + +("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False) + +-} +main :: IO () +main = do + go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD + go (B.pack [0x00,0x30]) -- GOOD + go (B.pack [0x30,0x31,0x32]) -- GOOD + + go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- WAS: BAD + where + go inp = do + case (alexScan inp 0) of + -- expected invariant: len == B.length inp - B.length inp' + AlexToken inp' len b -> do + let diff = B.length inp - B.length inp' + unless (len == diff) $ do + putStrLn $ "ERROR: reported length and consumed length differ!" + print (inp, B.take len inp, inp', len, diff, b) + exitFailure + _ -> undefined +}