Skip to content

Commit

Permalink
Fix haskell#277: Reword toCaseFold database generation
Browse files Browse the repository at this point in the history
- Add property and regression test that toCaseFold should be idempotent
- Add scripts/tests.sh to run tests with all GHCs.
  There are plenty of setup commands to pass.
- Rework CaseFolding.hs so it considers that toLower behaves
  differently with different GHCs, and therefore fallbacks to it
  only when it behaves consistently.
  For that purpose a helper `scripts/Dump.hs` is added.

Note: `toLower`, `toUpper`, and `toTitle` would benefit from using
dumped database as well.  This commit is already quite big, so that is
left for a follow up.
  • Loading branch information
phadej committed Aug 25, 2020
1 parent e07c149 commit 641581c
Show file tree
Hide file tree
Showing 11 changed files with 738 additions and 14 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@

# Test data repo ignored. Please see instruction in tests-and-benchmarks.markdown
/tests/text-test-data/

# Data for case functions
/scripts/CaseFolding.txt
/scripts/SpecialCasing.txt
/scripts/db-*.txt
5 changes: 5 additions & 0 deletions cabal.tests.project
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- this project doesn't have local 'text' package,
-- so tests build faster.

packages: tests
tests: True
110 changes: 103 additions & 7 deletions scripts/CaseFolding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ module CaseFolding

import Arsec

import Data.Char (ord)
import Data.Maybe (mapMaybe)
import qualified Data.List as L
import qualified Data.Set as Set
import qualified Data.Map as Map

data Fold = Fold {
code :: Char
, status :: Char
Expand All @@ -33,14 +39,104 @@ entries = CF <$> many comment <*> many (entry <* many comment)
parseCF :: FilePath -> IO (Either ParseError CaseFolding)
parseCF name = parse entries name <$> readFile name

mapCF :: CaseFolding -> [String]
mapCF (CF _ ms) = typ ++ (map nice . filter p $ ms) ++ [last]
-- We generate mapping trying to call toLower
mapCF :: [Map.Map Char (Char,Char,Char)] -> CaseFolding -> [String]
mapCF dbs (CF _ ms) = concat
[ typ
, mapMaybe nice [ minBound .. maxBound ]
, [last]
]
where
-- characters for which base's toLower has different results
different :: Set.Set Char
different
= Map.keysSet
$ Map.filter g
$ L.foldl' (alignWith f) (Map.map ToChar dbh) dbt
where
dbh : dbt = map (Map.mapMaybeWithKey h) dbs

-- Only valid case is when both lhs and rhs of alignment
-- have the same single character.
-- Everything else is consider as different.
--
f :: These Maps Char -> Maps
f (This _) = Differently
f (That _) = Differently
f (These Differently _) = Differently
f (These (ToChar l) l')
| l == l' = ToChar l
| otherwise = Differently

-- We are only interested in Differently
-- for these we cannot trust toLower
g :: Maps -> Bool
g Differently = True
g (ToChar _) = False

-- look for toLower data only
h c (_, l, _) | c == l = Nothing
| otherwise = Just l

-- we are only interested in C and F cases
-- * C: common case folding
-- * F: full case folding
--
-- Case Folding says
--
-- Usage:
-- A. To do a simple case folding, use the mappings with status C + S.
-- B. To do a full case folding, use the mappings with status C + F.
--
folds :: Map.Map Char Fold
folds = Map.fromList
$ map (\f -> (code f, f))
$ filter (\f -> status f `elem` "CF") ms

-- there are three cases:
--
nice :: Char -> Maybe String
nice c
-- not mapping to toLower, and toLower is same for all GHCs
| s /= [toLower c], not isDifferent
= Just
$ "-- " ++ n ++ "\n" ++
"foldMapping " ++ showC c ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"

-- when toLower cannot be trusted
| isDifferent
= Just
$ "-- " ++ n ++ "\n" ++
"foldMapping " ++ showC c ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"

-- otherwise omit, to be handled by catch all toLower case.
| otherwise
= Nothing
where
s :: [Char] -- mapping
n :: String -- name
(n, s) = maybe (defName, [c]) (\f -> (name f, mapping f)) (Map.lookup c folds)

isDifferent = Set.member c different

[x,y,z] = (map showC . take 3) (s ++ repeat '\0')

defName = "NOT FOLDED TO toLower " ++ showC c

typ = ["foldMapping :: forall s. Char -> s -> Step (CC s) Char"
,"{-# NOINLINE foldMapping #-}"]
last = "foldMapping c s = Yield (toLower c) (CC s '\\0' '\\0')"
nice c = "-- " ++ name c ++ "\n" ++
"foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")"
where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0')
p f = status f `elem` "CF" &&
mapping f /= [toLower (code f)]

-- auxiliary data type used to determine whether toLower is the same for a char
data Maps
= ToChar Char
| Differently
deriving Show

alignWith :: Ord k => (These a b -> c) -> Map.Map k a -> Map.Map k b -> Map.Map k c
alignWith f = Map.mergeWithKey
(\_ x y -> Just $ f $ These x y)
(Map.map (f . This))
(Map.map (f . That))

data These a b = This a | That b | These a b deriving Show
40 changes: 38 additions & 2 deletions scripts/CaseMapping.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,23 @@ import Arsec
import CaseFolding
import SpecialCasing

import qualified Data.Map as Map

-- 1. download SpecialCasing.txt and CaseFolding.txt files from unicode.org
--
-- 2. dump Char DB by running dump.sh
--
-- 3. run from scripts/ directory with
--
-- runghc-8.6.5 -package-env=- CaseMapping.hs
--

main = do
args <- getArgs
let oname = case args of
[] -> "../Data/Text/Internal/Fusion/CaseMapping.hs"
[] -> "../src/Data/Text/Internal/Fusion/CaseMapping.hs"
[o] -> o
dbs <- loadDBs
psc <- parseSC "SpecialCasing.txt"
pcf <- parseCF "CaseFolding.txt"
scs <- case psc of
Expand All @@ -34,5 +46,29 @@ main = do
mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs)
mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs)
mapM_ (hPutStrLn h) (mapSC "title" title toTitle scs)
mapM_ (hPutStrLn h) (mapCF cfs)
mapM_ (hPutStrLn h) (mapCF dbs cfs)
hClose h

loadDBs :: IO [Map.Map Char (Char,Char,Char)]
loadDBs = mapM loadDB
[ "7.0.4"
, "7.2.2"
, "7.4.2"
, "7.6.3"
, "7.8.4"
, "7.10.3"

, "8.0.2"
, "8.2.2"
, "8.4.4"
, "8.6.5"
, "8.8.4"
, "8.10.2"

-- , "9.0.1"
]
where
loadDB v = fmap (f . read) (readFile ("db-" ++ v ++ ".txt"))

f :: [(Char,Char,Char,Char)] -> Map.Map Char (Char,Char,Char)
f = Map.fromList . map (\(c,u,l,t) -> (c,(u,l,t)))
15 changes: 15 additions & 0 deletions scripts/Dump.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- This script is used to dump casing DB from GHCs base library

import Data.Char

main :: IO ()
main = print
[ (c, u, l, t)
| c <- [ minBound .. maxBound ]
, let u = toUpper c
, let l = toLower c
, let t = toTitle c

-- we dump only characters which have some transformations
, c /= u || c /= l || c /= t
]
23 changes: 23 additions & 0 deletions scripts/dump.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/sh

set -ex

dump() {
"runghc-$1" Dump.hs > "db-$1.txt"
}

dump 7.0.4
dump 7.2.2
dump 7.4.2
dump 7.6.3
dump 7.8.4
dump 7.10.3

dump 8.0.2
dump 8.2.2
dump 8.4.4
dump 8.6.5
dump 8.8.4
dump 8.10.2

# dump 9.0.1
30 changes: 30 additions & 0 deletions scripts/tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/sh

set -ex

runtest() {
HC=$1
shift

# EDIT last line to pass arguments

cabal run text-tests:test:tests \
--project-file=cabal.tests.project \
--builddir="dist-newstyle/$HC" \
--with-compiler="$HC" \
-- "$@"
}

runtest ghc-8.10.2 "$@"
runtest ghc-8.8.4 "$@"
runtest ghc-8.6.5 "$@"
runtest ghc-8.4.4 "$@"
runtest ghc-8.2.2 "$@"
runtest ghc-8.0.2 "$@"

runtest ghc-7.10.3 "$@"
runtest ghc-7.8.4 "$@"
runtest ghc-7.6.3 "$@"
runtest ghc-7.4.2 "$@"
runtest ghc-7.2.2 "$@"
runtest ghc-7.0.4 "$@"
Loading

0 comments on commit 641581c

Please sign in to comment.