diff --git a/README.md b/README.md index 26bc541..3289fb0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ 这里是 https://github.com/esp-rs/std-training 的简体中文翻译 -目前进度:已经翻译完成,跟踪到 f7ce2e7 +目前进度:已经翻译完成,跟踪到 1d2fc8a --- diff --git a/intro/http-client/examples/http_client.rs b/intro/http-client/examples/http_client.rs index a52dcc1..4720f09 100644 --- a/intro/http-client/examples/http_client.rs +++ b/intro/http-client/examples/http_client.rs @@ -61,18 +61,70 @@ fn get(url: impl AsRef) -> Result<()> { match status { 200..=299 => { // 5. If the status is OK, read response data chunk by chunk into a buffer and print it until done. + // + // NB. There is no guarantee that chunks will be split at the boundaries of valid UTF-8 + // sequences (in fact it is likely that they are not) so this edge case needs to be handled. + // However, for the purposes of clarity and brevity(?), the additional case of completely invalid + // UTF-8 sequences will not be handled here and is left as an exercise for the reader. let mut buf = [0_u8; 256]; + // Offset into the buffer to indicate that there may still be + // bytes at the beginning that have not been decoded yet + let mut offset = 0; + // Keep track of the total number of bytes read to print later + let mut total = 0; let mut reader = response; loop { - if let Ok(size) = Read::read(&mut reader, &mut buf) { + // read into the buffer starting at the offset to not overwrite + // the incomplete UTF-8 sequence we put there earlier + if let Ok(size) = Read::read(&mut reader, &mut buf[offset..]) { if size == 0 { + // It might be nice to check if we have any left over bytes here (ie. the offset > 0) + // as this would mean that the response ended with an invalid UTF-8 sequence, but for the + // purposes of this training we are assuming that the full response will be valid UTF-8 break; } + // Update the total number of bytes read + total += size; // 6. Try converting the bytes into a Rust (UTF-8) string and print it. - let response_text = str::from_utf8(&buf[..size])?; - println!("{}", response_text); + // Remember that we read into an offset and recalculate the real length + // of the bytes to decode. + let size_plus_offset = size + offset; + match str::from_utf8(&buf[..size_plus_offset]) { + Ok(text) => { + // buffer contains fully valid UTF-8 data, + // print it and reset the offset to 0. + print!("{}", text); + offset = 0; + }, + Err(error) => { + // The buffer contains incomplete UTF-8 data, we will + // print the valid part, copy the invalid sequence to + // the beginning of the buffer and set an offset for the + // next read. + // + // NB. There is actually an additional case here that should be + // handled in a real implementation. The Utf8Error may also contain + // an error_len field indicating that there is actually an invalid UTF-8 + // sequence in the middle of the buffer. Such an error would not be + // recoverable through our offset and copy mechanism. The result will be + // that the invalid sequence will be copied to the front of the buffer and + // eventually the buffer will be filled until no more bytes can be read when + // the offset == buf.len(). At this point the loop will exit without reading + // any more of the response. + let valid_up_to = error.valid_up_to(); + unsafe { + // It's ok to use unsafe here as the error code already told us that + // the UTF-8 data up to this point is valid, so we can tell the compiler + // it's fine. + print!("{}", str::from_utf8_unchecked(&buf[..valid_up_to])); + } + buf.copy_within(valid_up_to.., 0); + offset = size_plus_offset - valid_up_to; + } + } } } + println!("Total: {} bytes", total); } _ => bail!("Unexpected response code: {}", status), } diff --git a/intro/http-client/examples/https_client.rs b/intro/http-client/examples/https_client.rs index 87b8b6d..0b6bcd5 100644 --- a/intro/http-client/examples/https_client.rs +++ b/intro/http-client/examples/https_client.rs @@ -66,18 +66,39 @@ fn get(url: impl AsRef) -> Result<()> { match status { 200..=299 => { // 4. if the status is OK, read response data chunk by chunk into a buffer and print it until done + // + // NB. see http_client.rs for an explanation of the offset mechanism for handling chunks that are + // split in the middle of valid UTF-8 sequences. This case is encountered a lot with the given + // example URL. let mut buf = [0_u8; 256]; + let mut offset = 0; + let mut total = 0; let mut reader = response; loop { - if let Ok(size) = Read::read(&mut reader, &mut buf) { + if let Ok(size) = Read::read(&mut reader, &mut buf[offset..]) { if size == 0 { break; } + total += size; // 5. try converting the bytes into a Rust (UTF-8) string and print it - let response_text = str::from_utf8(&buf[..size])?; - println!("{}", response_text); + let size_plus_offset = size + offset; + match str::from_utf8(&buf[..size_plus_offset]) { + Ok(text) => { + print!("{}", text); + offset = 0; + }, + Err(error) => { + let valid_up_to = error.valid_up_to(); + unsafe { + print!("{}", str::from_utf8_unchecked(&buf[..valid_up_to])); + } + buf.copy_within(valid_up_to.., 0); + offset = size_plus_offset - valid_up_to; + } + } } } + println!("Total: {} bytes", total); } _ => bail!("Unexpected response code: {}", status), }