Add comment to unsafe block in decode_utf8_lossy

servo · Nov 3, 2019 · f332e0c · f332e0c
1 parent 3144f86
commit f332e0c
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 14 deletions.
diff --git a/percent_encoding/lib.rs b/percent_encoding/lib.rs
@@ -430,15 +430,26 @@ fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow<str> {
     match input {
         Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
         Cow::Owned(bytes) => {
-            let raw_utf8: *const [u8];
             match String::from_utf8_lossy(&bytes) {
-                Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(),
-                Cow::Owned(s) => return s.into(),
+                Cow::Borrowed(utf8) => {
+                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
+                    // be sure our original bytes were valid UTF-8. This is because,
+                    // if the bytes were invalid UTF-8, from_utf8_lossy would have
+                    // had to allocate a new owned String to back the Cow so that
+                    // it could replace the invalid bytes with U+FFFD placeholders.
+
+                    // First, a debug_assert confirms that `utf8` is a borrow of `bytes` itself.
+                    let raw_utf8: *const [u8];
+                    raw_utf8 = utf8.as_bytes();
+                    debug_assert!(raw_utf8 == &*bytes as *const [u8]);
+
+                    // SAFETY: the original input bytes are known to be valid
+                    // UTF-8 (see above) and we own them, so we can re-use their
+                    // allocation and return a Cow::Owned here.
+                    Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
+                },
+                Cow::Owned(s) => Cow::Owned(s),
             }
-            // from_utf8_lossy returned a borrow of `bytes` unchanged.
-            debug_assert!(raw_utf8 == &*bytes as *const [u8]);
-            // Reuse the existing `Vec` allocation.
-            unsafe { String::from_utf8_unchecked(bytes) }.into()
         }
     }
 }
diff --git a/src/query_encoding.rs b/src/query_encoding.rs
@@ -18,18 +18,30 @@ pub(crate) fn encode<'a>(encoding_override: EncodingOverride, input: &'a str) ->
 }
 
 pub(crate) fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow<str> {
+    // Note: This function is duplicated in `percent_encoding/lib.rs`.
     match input {
         Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
         Cow::Owned(bytes) => {
-            let raw_utf8: *const [u8];
             match String::from_utf8_lossy(&bytes) {
-                Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(),
-                Cow::Owned(s) => return s.into(),
+                Cow::Borrowed(utf8) => {
+                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
+                    // be sure our original bytes were valid UTF-8. This is because,
+                    // if the bytes were invalid UTF-8, from_utf8_lossy would have
+                    // had to allocate a new owned String to back the Cow so that
+                    // it could replace the invalid bytes with U+FFFD placeholders.
+
+                    // First, a debug_assert confirms that `utf8` is a borrow of `bytes` itself.
+                    let raw_utf8: *const [u8];
+                    raw_utf8 = utf8.as_bytes();
+                    debug_assert!(raw_utf8 == &*bytes as *const [u8]);
+
+                    // SAFETY: the original input bytes are known to be valid
+                    // UTF-8 (see above) and we own them, so we can re-use their
+                    // allocation and return a Cow::Owned here.
+                    Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
+                },
+                Cow::Owned(s) => Cow::Owned(s),
             }
-            // from_utf8_lossy returned a borrow of `bytes` unchanged.
-            debug_assert!(raw_utf8 == &*bytes as *const [u8]);
-            // Reuse the existing `Vec` allocation.
-            unsafe { String::from_utf8_unchecked(bytes) }.into()
         }
     }
 }