Skip to content

Commit

Permalink
[ fixed #119 ] latin1 encoding: each byte counts as 1 char
Browse files Browse the repository at this point in the history
The computation of the length component of AlexToken was tailored to
the utf8 encoding, and didn't work correctly for latin1.

This is fixed by having a new flag ALEX_LATIN1 in
templates/GenericTemplate.hs that turns on code that increases the
length by 1 for each byte, while for utf8 something more sophisticated
is done.

The fix requires more template instances to be generated.  To streamline
the instance generation, now all 2^4 = 16 template instances are
generated for the 4 flags

  - ghc
  - latin1
  - nopred
  - debug

To ensure consistent reference to the template instance, a function

  templateFileName

residing both in src/Main and gen-alex-sdist/Main needs to be kept
consistent, should more dimensions be added to the template.

(Putting this function into a separate file that is included by both
modules could be an option, but seemed not enough in the spirit of
cabal-organized projects.)
  • Loading branch information
andreasabel committed Jan 26, 2020
1 parent 0198f73 commit ae525e3
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 30 deletions.
16 changes: 14 additions & 2 deletions alex.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,21 @@ data-dir: data/

data-files:
AlexTemplate
AlexTemplate-debug
AlexTemplate-nopred
AlexTemplate-nopred-debug
AlexTemplate-latin1
AlexTemplate-latin1-debug
AlexTemplate-latin1-nopred
AlexTemplate-latin1-nopred-debug
AlexTemplate-ghc
AlexTemplate-ghc-nopred
AlexTemplate-ghc-debug
AlexTemplate-debug
AlexTemplate-ghc-nopred
AlexTemplate-ghc-nopred-debug
AlexTemplate-ghc-latin1
AlexTemplate-ghc-latin1-debug
AlexTemplate-ghc-latin1-nopred
AlexTemplate-ghc-latin1-nopred-debug
AlexWrapper-basic
AlexWrapper-basic-bytestring
AlexWrapper-strict-bytestring
Expand Down Expand Up @@ -110,6 +121,7 @@ extra-source-files:
tests/strict_typeclass.x
tests/unicode.x
tests/issue_71.x
tests/issue_119.x

source-repository head
type: git
Expand Down
39 changes: 32 additions & 7 deletions gen-alex-sdist/Main.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Main (main) where

import Control.Monad
import qualified Data.List as List
import Language.Preprocessor.Cpphs
import System.Directory
import System.FilePath
Expand Down Expand Up @@ -51,13 +52,37 @@ all_template_files :: [FilePath]
all_template_files = map fst (templates ++ wrappers)

templates :: [(FilePath,[String])]
templates = [
("AlexTemplate", []),
("AlexTemplate-ghc", ["ALEX_GHC"]),
("AlexTemplate-ghc-nopred",["ALEX_GHC", "ALEX_NOPRED"]),
("AlexTemplate-ghc-debug", ["ALEX_GHC","ALEX_DEBUG"]),
("AlexTemplate-debug", ["ALEX_DEBUG"])
]
templates =
[ ( templateFileName ghc latin1 nopred debug
, templateFlags ghc latin1 nopred debug
)
| ghc <- allBool
, latin1 <- allBool
, nopred <- allBool
, debug <- allBool
]
where
allBool = [False, True]

-- Keep this function in sync with its twin in src/Main.hs.
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
templateFileName ghc latin1 nopred debug =
List.intercalate "-" $ concat
[ [ "AlexTemplate" ]
, [ "ghc" | ghc ]
, [ "latin1" | latin1 ]
, [ "nopred" | nopred ]
, [ "debug" | debug ]
]

templateFlags :: Bool -> Bool -> Bool -> Bool -> [String]
templateFlags ghc latin1 nopred debug =
map ("ALEX_" ++) $ concat
[ [ "GHC" | ghc ]
, [ "LATIN1" | latin1 ]
, [ "NOPRED" | nopred ]
, [ "DEBUG" | debug ]
]

wrappers :: [(FilePath,[String])]
wrappers = [
Expand Down
2 changes: 2 additions & 0 deletions src/AbsSyn.hs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ type StartCode = Int
-- we can generate somewhat faster code in the case that
-- the lexer doesn't use predicates
data UsesPreds = UsesPreds | DoesntUsePreds
deriving Eq

usesPreds :: DFA s a -> UsesPreds
usesPreds dfa
Expand Down Expand Up @@ -390,3 +391,4 @@ extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str .
-- Code generation targets

data Target = GhcTarget | HaskellTarget
deriving Eq
41 changes: 23 additions & 18 deletions src/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import Control.Exception ( bracketOnError )
import Control.Monad ( when, liftM )
import Data.Char ( chr )
import Data.List ( isSuffixOf, nub )
import qualified Data.List as List
import Data.Maybe ( isJust, fromJust )
import Data.Version ( showVersion )
import System.Console.GetOpt ( getOpt, usageInfo, ArgOrder(..), OptDescr(..), ArgDescr(..) )
Expand Down Expand Up @@ -218,7 +219,7 @@ alex cli file basename script = do
hPutStr out_h (actions "")

-- add the template
let template_name = templateFile template_dir target usespreds cli
let template_name = templateFile template_dir target encoding usespreds cli
tmplt <- alexReadFile template_name
hPutStr out_h tmplt

Expand Down Expand Up @@ -399,23 +400,27 @@ templateDir def cli
[] -> def
ds -> return (last ds)

templateFile :: FilePath -> Target -> UsesPreds -> [CLIFlags] -> FilePath
templateFile dir target usespreds cli
= dir ++ "/AlexTemplate" ++ maybe_ghc ++ maybe_debug ++ maybe_nopred
where
maybe_ghc = case target of
GhcTarget -> "-ghc"
_ -> ""

maybe_debug
| OptDebugParser `elem` cli = "-debug"
| otherwise = ""

maybe_nopred =
case usespreds of
DoesntUsePreds | not (null maybe_ghc)
&& null maybe_debug -> "-nopred"
_ -> ""
-- Keep this function in sync with its twin in gen-alex-sdist/Main.hs.
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
templateFileName ghc latin1 nopred debug =
List.intercalate "-" $ concat
[ [ "AlexTemplate" ]
, [ "ghc" | ghc ]
, [ "latin1" | latin1 ]
, [ "nopred" | nopred ]
, [ "debug" | debug ]
]

templateFile :: FilePath -> Target -> Encoding -> UsesPreds -> [CLIFlags] -> FilePath
templateFile dir target encoding usespreds cli = concat
[ dir
, "/"
, templateFileName
(target == GhcTarget)
(encoding == Latin1)
(usespreds == DoesntUsePreds)
(OptDebugParser `elem` cli)
]

wrapperFile :: FilePath -> Scheme -> Maybe FilePath
wrapperFile dir scheme =
Expand Down
12 changes: 9 additions & 3 deletions templates/GenericTemplate.hs
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,15 @@ alex_scan_tkn user__ orig_input len input__ s last_acc =
ILIT(-1) -> (new_acc, input__)
-- on an error, we want to keep the input *before* the
-- character that failed, not after.
_ -> alex_scan_tkn user__ orig_input (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
new_input new_s new_acc
_ -> alex_scan_tkn user__ orig_input
#ifdef ALEX_LATIN1
PLUS(len,ILIT(1))
-- issue 119: in the latin1 encoding, *each* byte is one character
#else
(if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
#endif
new_input new_s new_acc
}
where
check_accs (AlexAccNone) = last_acc
Expand Down
1 change: 1 addition & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ TESTS = \
default_typeclass.x \
gscan_typeclass.x \
issue_71.x \
issue_119.x \
monad_typeclass.x \
monad_typeclass_bytestring.x \
monadUserState_typeclass.x \
Expand Down
73 changes: 73 additions & 0 deletions tests/issue_119.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
-- -*- haskell -*-
{
-- Issue 119,
-- reported 2017-10-11 by Herbert Valerio Riedel,
-- fixed 2020-01-26 by Andreas Abel.
--
-- Problem was: the computed token length (in number of characters)
-- attached to AlexToken is tailored to UTF8 encoding and wrong
-- for LATIN1 encoding.

module Main where

import Control.Monad (unless)
import qualified Data.ByteString as B
import Data.Word
import System.Exit (exitFailure)
}

%encoding "latin1"

:-

[\x01-\xff]+ { False }
[\x00] { True }

{
type AlexInput = B.ByteString

alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte = B.uncons

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar = undefined

-- generated by @alex@
alexScan :: AlexInput -> Int -> AlexReturn Bool

{-
GOOD cases:
("012\NUL3","012","\NUL3",3,3,False)
("\NUL0","\NUL","0",1,1,True)
("012","012","",3,3,False)
BAD case:
("0@P`p\128\144\160","0@P`p","",5,8,False)
expected:
("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False)
-}
main :: IO ()
main = do
go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD
go (B.pack [0x00,0x30]) -- GOOD
go (B.pack [0x30,0x31,0x32]) -- GOOD

go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- WAS: BAD
where
go inp = do
case (alexScan inp 0) of
-- expected invariant: len == B.length inp - B.length inp'
AlexToken inp' len b -> do
let diff = B.length inp - B.length inp'
unless (len == diff) $ do
putStrLn $ "ERROR: reported length and consumed length differ!"
print (inp, B.take len inp, inp', len, diff, b)
exitFailure
_ -> undefined
}

0 comments on commit ae525e3

Please sign in to comment.