Skip to content

Commit

Permalink
Bump version and update changelog
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodigrim committed Sep 6, 2021
1 parent 20b901d commit fd49707
Show file tree
Hide file tree
Showing 10 changed files with 65 additions and 6 deletions.
23 changes: 21 additions & 2 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
### 1.3

### 2.0

* [Switch internal representation of text from UTF-16 to UTF-8](https://github.com/haskell/text/pull/365):
* Functions in `Data.Text.Array` now operate over arrays of `Word8` instead of `Word16`.
* Rename constructors of `Array` and `MArray` to `ByteArray` and `MutableByteArray`.
* Rename functions and types in `Data.Text.Foreign` to reflect switch
from `Word16` to `Word8`.
* Rename slicing functions in `Data.Text.Unsafe` to reflect switch
from `Word16` to `Word8`.
* Rename `Data.Text.Internal.Unsafe.Char.unsafeChr` to `unsafeChr16`.
* Change semantics and order of arguments of `Data.Text.Array.copyI`:
pass length, not end offset.
* Extend `Data.Text.Internal.Encoding.Utf8` to provide more UTF-8 related routines.
* Extend interface of `Data.Text.Array` with more utility functions.
* Add `instance Show Data.Text.Unsafe.Iter`.
* Add `Data.Text.measureOff`.
* Extend `Data.Text.Unsafe` with `iterArray` and `reverseIterArray`.
* Export `Data.Text.Internal.Lazy.equal`.
* Export `Data.Text.Internal.append`.
* Add `Data.Text.Internal.Private.spanAscii_`.
* Replacement characters in `decodeUtf8With` are no longer limited to Basic Multilingual Plane.
* [Disable implicit fusion rules](https://github.com/haskell/text/pull/348)
* [Add `Data.Text.Encoding.decodeUtf8Lenient`](https://github.com/haskell/text/pull/342)
* [Remove `Data.Text.Internal.Unsafe.Shift`](https://github.com/haskell/text/pull/343)
Expand Down
2 changes: 2 additions & 0 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,8 @@ take n t@(Text arr off len)
--
-- This function is used to implement 'take', 'drop', 'splitAt' and 'length'
-- and is useful on its own in streaming and parsing libraries.
--
-- @since 2.0
measureOff :: Int -> Text -> Int
measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else
cSsizeToInt $ unsafeDupablePerformIO $
Expand Down
18 changes: 16 additions & 2 deletions src/Data/Text/Array.hs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ new (I# len#)
{-# INLINE new #-}

-- | Create an uninitialized mutable pinned array.
--
-- @since 2.0
newPinned :: forall s. Int -> ST s (MArray s)
newPinned (I# len#)
#if defined(ASSERTS)
Expand All @@ -90,13 +92,15 @@ newPinned (I# len#)
(# s2#, marr# #) -> (# s2#, MutableByteArray marr# #)
{-# INLINE newPinned #-}

-- | @since 2.0
newFilled :: Int -> Int -> ST s (MArray s)
newFilled (I# len#) (I# c#) = ST $ \s1# ->
case newByteArray# len# s1# of
(# s2#, marr# #) -> case setByteArray# marr# 0# len# c# s2# of
s3# -> (# s3#, MutableByteArray marr# #)
{-# INLINE newFilled #-}

-- | @since 2.0
tile :: MArray s -> Int -> ST s ()
tile marr tileLen = do
totalLen <- getSizeofMArray marr
Expand Down Expand Up @@ -130,8 +134,10 @@ unsafeIndex (ByteArray arr) i@(I# i#) =
case indexWord8Array# arr i# of r# -> (W8# r#)
{-# INLINE unsafeIndex #-}

-- sizeofMutableByteArray# is deprecated, because it is unsafe in the presence of
-- shrinkMutableByteArray# and resizeMutableByteArray#.
-- | 'sizeofMutableByteArray#' is deprecated, because it is unsafe in the presence of
-- 'shrinkMutableByteArray#' and 'resizeMutableByteArray#'.
--
-- @since 2.0
getSizeofMArray :: MArray s -> ST s Int
getSizeofMArray (MutableByteArray marr) = ST $ \s0# ->
case getSizeofMutableByteArray# marr s0# of
Expand Down Expand Up @@ -185,12 +191,14 @@ run2 k = runST (do
return (arr,b))
{-# INLINE run2 #-}

-- | @since 2.0
resizeM :: MArray s -> Int -> ST s (MArray s)
resizeM (MutableByteArray ma) i@(I# i#) = ST $ \s1# ->
case resizeMutableByteArray# ma i# s1# of
(# s2#, newArr #) -> (# s2#, MutableByteArray newArr #)
{-# INLINE resizeM #-}

-- | @since 2.0
shrinkM ::
#if defined(ASSERTS)
HasCallStack =>
Expand Down Expand Up @@ -253,6 +261,8 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s
{-# INLINE copyI #-}

-- | Copy from pointer.
--
-- @since 2.0
copyFromPointer
:: MArray s -- ^ Destination
-> Int -- ^ Destination offset
Expand All @@ -270,6 +280,8 @@ copyFromPointer (MutableByteArray dst#) dstOff@(I# dstOff#) (Ptr src#) count@(I#
{-# INLINE copyFromPointer #-}

-- | Copy to pointer.
--
-- @since 2.0
copyToPointer
:: Array -- ^ Source
-> Int -- ^ Source offset
Expand All @@ -293,6 +305,8 @@ equal src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count == 0
{-# INLINE equal #-}

-- | Compare portions of two arrays. No bounds checking is performed.
--
-- @since 2.0
compare :: Array -> Int -> Array -> Int -> Int -> Ordering
compare src1 off1 src2 off2 count = compareInternal src1 off1 src2 off2 count `Prelude.compare` 0
{-# INLINE compare #-}
Expand Down
2 changes: 1 addition & 1 deletion src/Data/Text/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
, bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
, isValidBS bs = do
withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
unsafeSTToIO $ A.copyP dst dstOff src (len1 + guessUtf8Boundary - srcOff)
unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))

| dstOff + 4 > dstLen = do
Expand Down
6 changes: 6 additions & 0 deletions src/Data/Text/Foreign.hs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ import qualified Data.Text.Array as A
-- the functions in the 'Data.Text.Encoding' module.

-- | A type representing a number of UTF-8 code units.
--
-- @since 2.0
newtype I8 = I8 Int
deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show)

Expand Down Expand Up @@ -86,6 +88,8 @@ fromPtr ptr (I8 len) = unsafeSTToIO $ do
-- If @n@ would cause the 'Text' to end inside a code point, the
-- end of the prefix will be advanced by several additional 'Word8' units
-- to maintain its validity.
--
-- @since 2.0
takeWord8 :: I8 -> Text -> Text
takeWord8 = (fst .) . splitAtWord8

Expand All @@ -95,6 +99,8 @@ takeWord8 = (fst .) . splitAtWord8
-- If @n@ would cause the 'Text' to begin inside a code point, the
-- beginning of the suffix will be advanced by several additional 'Word8'
-- unit to maintain its validity.
--
-- @since 2.0
dropWord8 :: I8 -> Text -> Text
dropWord8 = (snd .) . splitAtWord8

Expand Down
7 changes: 7 additions & 0 deletions src/Data/Text/Internal/Encoding/Utf8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ between x y z = x >= y && x <= z
-- | ord c < 0x800 = 2
-- | ord c < 0x10000 = 3
-- | otherwise = 4

-- | @since 2.0
utf8Length :: Char -> Int
utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800#) +# geChar# c (chr# 0x10000#)))
{-# INLINE utf8Length #-}
Expand All @@ -82,6 +84,8 @@ utf8Length (C# c) = I# ((1# +# geChar# c (chr# 0x80#)) +# (geChar# c (chr# 0x800
-- c `xor` I# (c# <=# 0#) is a branchless equivalent of c `max` 1.
-- It is crucial to write c# <=# 0# and not c# ==# 0#, otherwise
-- GHC is tempted to "optimize" by introduction of branches.

-- | @since 2.0
utf8LengthByLeader :: Word8 -> Int
utf8LengthByLeader w = c `xor` I# (c# <=# 0#)
where
Expand Down Expand Up @@ -256,11 +260,13 @@ updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)

newtype CodePoint = CodePoint Int

-- | @since 2.0
data DecoderResult
= Accept !Char
| Incomplete !DecoderState !CodePoint
| Reject

-- | @since 2.0
utf8DecodeStart :: Word8 -> DecoderResult
utf8DecodeStart w
| st == utf8AcceptState = Accept (chr (word8ToInt w))
Expand All @@ -271,6 +277,7 @@ utf8DecodeStart w
st = updateState cl utf8AcceptState
cp = word8ToInt $ (0xff `shiftR` word8ToInt cl') .&. w

-- | @since 2.0
utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult
utf8DecodeContinue w st (CodePoint cp)
| st' == utf8AcceptState = Accept (chr cp')
Expand Down
2 changes: 2 additions & 0 deletions src/Data/Text/Internal/Private.hs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ span_ p t@(Text arr off len) = (# hd,tl #)

-- | For the sake of performance this function does not check
-- that a char is in ASCII range; it is a responsibility of @p@.
--
-- @since 2.0
spanAscii_ :: (Word8 -> Bool) -> Text -> (# Text, Text #)
spanAscii_ p (Text arr off len) = (# hd, tl #)
where hd = text arr off k
Expand Down
1 change: 1 addition & 0 deletions src/Data/Text/Internal/Unsafe/Char.hs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ ord :: Char -> Int
ord (C# c#) = I# (ord# c#)
{-# INLINE ord #-}

-- | @since 2.0
unsafeChr16 :: Word16 -> Char
unsafeChr16 (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#)))
{-# INLINE unsafeChr16 #-}
Expand Down
8 changes: 8 additions & 0 deletions src/Data/Text/Unsafe.hs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ iter ::
iter (Text arr off _len) i = iterArray arr (off + i)
{-# INLINE iter #-}

-- | @since 2.0
iterArray :: A.Array -> Int -> Iter
iterArray arr j = Iter chr l
where m0 = A.unsafeIndex arr j
Expand Down Expand Up @@ -107,6 +108,7 @@ reverseIter :: Text -> Int -> Iter
reverseIter (Text arr off _len) i = reverseIterArray arr (off + i)
{-# INLINE reverseIter #-}

-- | @since 2.0
reverseIterArray :: A.Array -> Int -> Iter
reverseIterArray arr j
| m0 < 0x80 = Iter (unsafeChr8 m0) (-1)
Expand Down Expand Up @@ -139,16 +141,22 @@ reverseIter_ (Text arr off _len) i
-- | /O(1)/ Return the length of a 'Text' in units of 'Word8'. This
-- is useful for sizing a target array appropriately before using
-- 'unsafeCopyToPtr'.
--
-- @since 2.0
lengthWord8 :: Text -> Int
lengthWord8 (Text _arr _off len) = len
{-# INLINE lengthWord8 #-}

-- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'.
--
-- @since 2.0
takeWord8 :: Int -> Text -> Text
takeWord8 k (Text arr off _len) = Text arr off k
{-# INLINE takeWord8 #-}

-- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'.
--
-- @since 2.0
dropWord8 :: Int -> Text -> Text
dropWord8 k (Text arr off len) = Text arr (off+k) (len-k)
{-# INLINE dropWord8 #-}
2 changes: 1 addition & 1 deletion text.cabal
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cabal-version: 2.2
name: text
version: 1.2.5.0
version: 2.0

homepage: https://github.com/haskell/text
bug-reports: https://github.com/haskell/text/issues
Expand Down

0 comments on commit fd49707

Please sign in to comment.