Skip to content

Commit

Permalink
Implement filter
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodigrim committed Aug 22, 2021
1 parent 92fa4ed commit 730cbf6
Showing 1 changed file with 66 additions and 3 deletions.
69 changes: 66 additions & 3 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -223,14 +223,14 @@ import Data.Binary (Binary(get, put))
import Data.Monoid (Monoid(..))
import Data.Semigroup (Semigroup(..))
import Data.String (IsString(..))
import Data.Text.Internal.Encoding.Utf8 (chr3, utf8Length)
import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr2, chr3, chr4)
import qualified Data.Text.Internal.Fusion as S
import qualified Data.Text.Internal.Fusion.Common as S
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
import Data.Text.Internal.Private (span_)
import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8)
import Data.Text.Show (singleton, unpack, unpackCString#)
import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
Expand Down Expand Up @@ -1487,7 +1487,70 @@ partition p t = (filter p t, filter (not . p) t)
-- returns a 'Text' containing those characters that satisfy the
-- predicate.
filter :: (Char -> Bool) -> Text -> Text
filter p = unstream . S.filter p . stream
filter p = go
where
go (Text src (I# o) (I# l)) = runST $ do
-- It's tempting to allocate l elements at once and avoid resizing.
-- However, this can be unacceptable in scenarios where a huge array
-- is filtered with a rare predicate, resulting in a much shorter buffer.
let !(I# dstLen) = min (I# l) 64
dst <- A.new (I# dstLen)
outer dst dstLen o 0#
where
-- I expected that GHC did worker/wrapper transformation for me,
-- but apparently did not manage to convince it, so ended up explicitly
-- passing unboxed Int#.
outer :: forall s. A.MArray s -> Int# -> Int# -> Int# -> ST s Text
outer !dst !dstLen = inner
where
inner !srcOff !dstOff
| I# srcOff >= I# o + I# l = do
A.shrinkM dst (I# dstOff)
arr <- A.unsafeFreeze dst
return (Text arr 0 (I# dstOff))
| I# dstOff + 4 > I# dstLen = do
-- Double size of the buffer, unless it becomes longer than
-- source string. Ensure to extend it by least 4 bytes.
let !(I# dstLen') = I# dstLen + max 4 (min (I# l + I# o - I# srcOff) (I# dstLen))
dst' <- A.resizeM dst (I# dstLen')
outer dst' dstLen' srcOff dstOff
-- In case of success, filter writes exactly the same character
-- it just read (this is not a case for map, for example).
-- We leverage this fact below: no need to decode Char back into UTF8,
-- just copy bytes from input.
| otherwise = do
let m0 = A.unsafeIndex src (I# srcOff)
m1 = A.unsafeIndex src (I# srcOff + 1)
m2 = A.unsafeIndex src (I# srcOff + 2)
m3 = A.unsafeIndex src (I# srcOff + 3)
!(I# d) = utf8LengthByLeader m0
case d of
1# -> do
let c = unsafeChr8 m0
if not (p c) then inner (srcOff +# 1#) dstOff else do
A.unsafeWrite dst (I# dstOff) m0
inner (srcOff +# 1#) (dstOff +# 1#)
2# -> do
let c = chr2 m0 m1
if not (p c) then inner (srcOff +# 2#) dstOff else do
A.unsafeWrite dst (I# dstOff) m0
A.unsafeWrite dst (I# dstOff + 1) m1
inner (srcOff +# 2#) (dstOff +# 2#)
3# -> do
let c = chr3 m0 m1 m2
if not (p c) then inner (srcOff +# 3#) dstOff else do
A.unsafeWrite dst (I# dstOff) m0
A.unsafeWrite dst (I# dstOff + 1) m1
A.unsafeWrite dst (I# dstOff + 2) m2
inner (srcOff +# 3#) (dstOff +# 3#)
_ -> do
let c = chr4 m0 m1 m2 m3
if not (p c) then inner (srcOff +# 4#) dstOff else do
A.unsafeWrite dst (I# dstOff) m0
A.unsafeWrite dst (I# dstOff + 1) m1
A.unsafeWrite dst (I# dstOff + 2) m2
A.unsafeWrite dst (I# dstOff + 3) m3
inner (srcOff +# 4#) (dstOff +# 4#)
{-# INLINE [1] filter #-}

{-# RULES
Expand Down

0 comments on commit 730cbf6

Please sign in to comment.