diff --git a/quic/s2n-quic-bench/Cargo.toml b/quic/s2n-quic-bench/Cargo.toml
index 8947b1dc22..e925360519 100644
--- a/quic/s2n-quic-bench/Cargo.toml
+++ b/quic/s2n-quic-bench/Cargo.toml
@@ -11,6 +11,7 @@ publish = false
 [dependencies]
 criterion = { version = "0.4", features = ["html_reports"] }
 crossbeam-channel = { version = "0.5" }
+internet-checksum = "0.2"
 s2n-codec = { path = "../../common/s2n-codec", features = ["testing"] }
 s2n-quic-core = { path = "../s2n-quic-core", features = ["testing"] }
 s2n-quic-crypto = { path = "../s2n-quic-crypto", features = ["testing"] }
diff --git a/quic/s2n-quic-bench/src/inet.rs b/quic/s2n-quic-bench/src/inet.rs
new file mode 100644
index 0000000000..81df632523
--- /dev/null
+++ b/quic/s2n-quic-bench/src/inet.rs
@@ -0,0 +1,36 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use core::hash::Hasher;
+use criterion::{black_box, BenchmarkId, Criterion, Throughput};
+use s2n_quic_core::inet::checksum::Checksum;
+
+/// Benchmarks the throughput of the s2n-quic checksum against the `internet-checksum` crate,
+/// feeding both implementations the same payloads
+pub fn benchmarks(c: &mut Criterion) {
+    let mut group = c.benchmark_group("inet");
+    // payload sizes in bytes; the throughput is reported per payload size
+    for len in [1500, 9000, 1 << 16] {
+        let payload = vec![123u8; len];
+        group.throughput(Throughput::Bytes(len as u64));
+
+        let s2n_id = BenchmarkId::new("s2n/checksum", len);
+        group.bench_with_input(s2n_id, &payload, |b, payload| {
+            // created outside of `iter`, so each iteration only copies it, writes and finishes
+            let initial = Checksum::default();
+            let input = black_box(&payload[..]);
+            b.iter(|| {
+                let mut checksum = initial;
+                checksum.write(input);
+                black_box(checksum.finish())
+            })
+        });
+
+        let fuchsia_id = BenchmarkId::new("fuchsia/checksum", len);
+        group.bench_with_input(fuchsia_id, &payload, |b, payload| {
+            let input = black_box(&payload[..]);
+            b.iter(|| black_box(internet_checksum::checksum(input)))
+        });
+    }
+    group.finish();
+}
diff --git a/quic/s2n-quic-bench/src/lib.rs b/quic/s2n-quic-bench/src/lib.rs
index 7facdc8fee..addbba3868 100644
--- a/quic/s2n-quic-bench/src/lib.rs
+++ b/quic/s2n-quic-bench/src/lib.rs
@@ -6,6 +6,7 @@ use criterion::Criterion;
 mod buffer;
 mod crypto;
 mod frame;
+mod inet;
 mod packet;
 mod sync;
 mod varint;
@@ -14,6 +15,7 @@ pub fn benchmarks(c: &mut Criterion) {
     buffer::benchmarks(c);
     crypto::benchmarks(c);
     frame::benchmarks(c);
+    inet::benchmarks(c);
     packet::benchmarks(c);
     sync::benchmarks(c);
     varint::benchmarks(c);
diff --git a/quic/s2n-quic-core/src/inet/__fuzz__/inet__checksum__tests__differential/corpus.tar.gz b/quic/s2n-quic-core/src/inet/__fuzz__/inet__checksum__tests__differential/corpus.tar.gz
new file mode 100644
index 0000000000..dd25d767d7
--- /dev/null
+++ b/quic/s2n-quic-core/src/inet/__fuzz__/inet__checksum__tests__differential/corpus.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:492fbbc7cf37e7de4a376bf6f2d4e021f7f2807a95524d163d3414b6c46eda73
+size 225280
diff --git a/quic/s2n-quic-core/src/inet/checksum.rs b/quic/s2n-quic-core/src/inet/checksum.rs
new file mode 100644
index 0000000000..c3052f5bf9
--- /dev/null
+++ b/quic/s2n-quic-core/src/inet/checksum.rs
@@ -0,0 +1,318 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use core::{fmt, hash::Hasher, num::Wrapping};
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod x86;
+
+/// Calculates the [IP checksum](https://www.rfc-editor.org/rfc/rfc1071) of a single slice of bytes
+#[inline]
+pub fn checksum(data: &[u8]) -> u16 {
+    let mut hasher = Checksum::default();
+    Hasher::write(&mut hasher, data);
+    Checksum::finish(hasher)
+}
+
+/// Payloads shorter than this many bytes are never handed to the platform-specific function
+const LARGE_WRITE_LEN: usize = 32;
+
+/// Platform-specific checksum function; adds a prefix of `bytes` to the state and returns the rest
+type LargeWriteFn = for<'a> unsafe fn(&mut Wrapping<u32>, bytes: &'a [u8]) -> &'a [u8];
+
+/// Portable checksum routine that consumes `bytes` in blocks of `LEN` and returns the unread tail
+#[inline]
+fn write_sized_generic<'a, const LEN: usize>(
+    state: &mut Wrapping<u32>,
+    mut bytes: &'a [u8],
+) -> &'a [u8] {
+    //= https://www.rfc-editor.org/rfc/rfc1071#section-4.1
+    //# The following "C" code algorithm computes the checksum with an inner
+    //# loop that sums 16-bits at a time in a 32-bit accumulator.
+    //#
+    //# in 6
+    //#    {
+    //#        /* Compute Internet Checksum for "count" bytes
+    //#         *         beginning at location "addr".
+    //#         */
+    //#    register long sum = 0;
+    //#
+    //#     while( count > 1 )  {
+    //#        /*  This is the inner loop */
+    //#            sum += * (unsigned short) addr++;
+    //#            count -= 2;
+    //#    }
+    //#
+    //#        /*  Add left-over byte, if any */
+    //#    if( count > 0 )
+    //#            sum += * (unsigned char *) addr;
+    //#
+    //#        /*  Fold 32-bit sum to 16 bits */
+    //#    while (sum>>16)
+    //#        sum = (sum & 0xffff) + (sum >> 16);
+    //#
+    //#    checksum = ~sum;
+    //# }
+
+    while bytes.len() >= LEN {
+        let (block, rest) = bytes.split_at(LEN);
+        bytes = rest;
+
+        // Interpret every pair of bytes in the block as a native-endian 16 bit integer,
+        // total the pairs up and only then add the block total to the running state.
+        // Anything shorter than `LEN` is left for the caller.
+        let block_sum: u32 = block
+            .chunks_exact(2)
+            .map(|pair| u16::from_ne_bytes([pair[0], pair[1]]) as u32)
+            .sum();
+        *state += block_sum;
+    }
+
+    bytes
+}
+
+/// Returns the most optimized function implementation for the current platform; the platform
+/// is only probed on first use and the cached function pointer is returned afterwards
+#[inline]
+#[cfg(all(feature = "once_cell", not(any(kani, miri))))]
+fn probe_write_large() -> LargeWriteFn {
+    static LARGE_WRITE_FN: once_cell::sync::Lazy<LargeWriteFn> = once_cell::sync::Lazy::new(|| {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        let optimized = x86::probe();
+        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+        let optimized: Option<LargeWriteFn> = None;
+
+        optimized.unwrap_or(write_sized_generic::<8>)
+    });
+
+    *LARGE_WRITE_FN
+}
+
+/// Without `once_cell`, or under kani/miri, nothing is probed and the generic function is used
+#[inline]
+#[cfg(not(all(feature = "once_cell", not(any(kani, miri)))))]
+fn probe_write_large() -> LargeWriteFn {
+    write_sized_generic::<8>
+}
+
+/// Computes the [IP checksum](https://www.rfc-editor.org/rfc/rfc1071) over an arbitrary set of inputs
+#[derive(Clone, Copy)]
+pub struct Checksum {
+    state: Wrapping<u32>, // running sum of 16 bit words; only folded to 16 bits by `carry`
+    partial_write: bool, // set when the previous `write` ended on the first byte of a word
+    write_large: LargeWriteFn, // used by `write` for payloads of `LARGE_WRITE_LEN` or more bytes
+}
+
+impl Default for Checksum {
+    fn default() -> Self {
+        Self {
+            state: Default::default(), // `Wrapping<u32>` defaults to a sum of zero
+            partial_write: false,
+            write_large: probe_write_large(), // optimized function for this platform, if any
+        }
+    }
+}
+
+impl fmt::Debug for Checksum {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // `finish` carries and consumes a copy of the state, so `self` is left untouched
+        let value: u16 = Checksum::finish(*self);
+        f.debug_tuple("Checksum").field(&value).finish()
+    }
+}
+
+impl Checksum {
+    /// Adds a lone byte to the state, in the upper half of a 16 bit word if `shift` is set
+    #[inline]
+    fn write_byte(&mut self, byte: u8, shift: bool) {
+        // The caller picks, based on the target's endianness, which half of the 16 bit
+        // word the byte belongs to
+        let word = u32::from(byte);
+        let word = if shift { word << 8 } else { word };
+        self.state += word;
+    }
+
+    /// Folds the upper 16 bits of the state into the lower 16 bits
+    #[inline]
+    fn carry(&mut self) {
+        // A single fold can itself produce a carry, so the fold is applied repeatedly. Two
+        // passes are already enough for any u32 (0xffff_ffff -> 0x1_fffe -> 0xffff); the
+        // third one leaves the value as it is.
+        let Wrapping(unfolded) = self.state;
+        let folded = (0..3).fold(unfolded, |sum, _| (sum & 0xffff) + (sum >> 16));
+
+        self.state = Wrapping(folded);
+    }
+
+    /// Produces the final checksum value from a copy of the state
+    #[inline]
+    fn finish(mut self) -> u16 {
+        self.carry();
+
+        // Once carried, the sum fits in the low 16 bits: take its one's complement and
+        // convert the result from native to big-endian byte order
+        let folded = self.state.0 as u16;
+        (!folded).to_be()
+    }
+}
+
+impl Hasher for Checksum {
+    #[inline]
+    fn write(&mut self, mut bytes: &[u8]) {
+        if bytes.is_empty() {
+            return; // nothing to add; a pending partial write stays pending
+        }
+
+        // A previous write ended mid-word: complete that word first (`take` clears the flag)
+        if core::mem::take(&mut self.partial_write) {
+            let (chunk, remaining) = bytes.split_at(1);
+            bytes = remaining;
+
+            // this byte is the second half of the word, so shift it if we're on little endian
+            self.write_byte(chunk[0], cfg!(target_endian = "little"));
+        }
+
+        // The platform function may only be called with large payloads, so check the length first
+        if bytes.len() >= LARGE_WRITE_LEN {
+            bytes = unsafe { (self.write_large)(&mut self.state, bytes) };
+        }
+
+        // The generic implementation sums up whatever full 16 bit words are left over
+        bytes = write_sized_generic::<2>(&mut self.state, bytes);
+
+        // if we only have a single byte left, write it to the state and mark it as a partial write
+        if let Some(byte) = bytes.first().copied() {
+            self.partial_write = true;
+            self.write_byte(byte, cfg!(target_endian = "big")); // first half of the next word
+        }
+    }
+
+    #[inline]
+    fn finish(&self) -> u64 {
+        Self::finish(*self) as _ // inherent `finish` on a copy; the u16 result is widened
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use bolero::check;
+
+    #[test]
+    fn rfc_example_test() {
+        //= https://www.rfc-editor.org/rfc/rfc1071#section-3
+        //= type=test
+        //# We now present explicit examples of calculating a simple 1's
+        //# complement sum on a 2's complement machine.  The examples show the
+        //# same sum calculated byte by bye, by 16-bits words in normal and
+        //# swapped order, and 32 bits at a time in 3 different orders.  All
+        //# numbers are in hex.
+        //#
+        //#               Byte-by-byte    "Normal"  Swapped
+        //#                                 Order    Order
+        //#
+        //#     Byte 0/1:    00   01        0001      0100
+        //#     Byte 2/3:    f2   03        f203      03f2
+        //#     Byte 4/5:    f4   f5        f4f5      f5f4
+        //#     Byte 6/7:    f6   f7        f6f7      f7f6
+        //#                 ---  ---       -----     -----
+        //#     Sum1:       2dc  1f0       2ddf0     1f2dc
+        //#
+        //#                  dc   f0        ddf0      f2dc
+        //#     Carrys:       1    2           2         1
+        //#                  --   --        ----      ----
+        //#     Sum2:        dd   f2        ddf2      f2dd
+        //#
+        //#     Final Swap:  dd   f2        ddf2      ddf2
+        let bytes = [0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7];
+
+        let mut checksum = Checksum::default();
+        checksum.write(&bytes);
+        checksum.carry();
+        // words are summed in native byte order, so compare the state's native-order bytes
+        assert_eq!((checksum.state.0 as u16).to_ne_bytes(), [0xdd, 0xf2]);
+        assert_eq!((!rfc_c_port(&bytes)).to_be_bytes(), [0xdd, 0xf2]);
+    }
+
+    fn rfc_c_port(data: &[u8]) -> u16 {
+        //= https://www.rfc-editor.org/rfc/rfc1071#section-4.1
+        //= type=test
+        //# The following "C" code algorithm computes the checksum with an inner
+        //# loop that sums 16-bits at a time in a 32-bit accumulator.
+        //#
+        //# in 6
+        //#    {
+        //#        /* Compute Internet Checksum for "count" bytes
+        //#         *         beginning at location "addr".
+        //#         */
+        //#    register long sum = 0;
+        //#
+        //#     while( count > 1 )  {
+        //#        /*  This is the inner loop */
+        //#            sum += * (unsigned short) addr++;
+        //#            count -= 2;
+        //#    }
+        //#
+        //#        /*  Add left-over byte, if any */
+        //#    if( count > 0 )
+        //#            sum += * (unsigned char *) addr;
+        //#
+        //#        /*  Fold 32-bit sum to 16 bits */
+        //#    while (sum>>16)
+        //#        sum = (sum & 0xffff) + (sum >> 16);
+        //#
+        //#    checksum = ~sum;
+        //# }
+
+        // Slice-based rendition of the pointer walk: whole 16 bit words, then the odd byte
+        let words = data.chunks_exact(2);
+        let trailing = words.remainder();
+
+        let mut sum = 0u32;
+
+        // This is the inner loop; every word is read in big-endian order
+        for word in words {
+            let value = u16::from_be_bytes([word[0], word[1]]);
+            sum = sum.wrapping_add(value as u32);
+        }
+
+        // Add left-over byte, if any, as the high byte of a zero-padded word
+        if let [last] = trailing {
+            let value = u16::from_be_bytes([*last, 0]);
+            sum = sum.wrapping_add(value as u32);
+        }
+
+        // Fold 32-bit sum to 16 bits
+        while sum >> 16 != 0 {
+            sum = (sum & 0xffff) + (sum >> 16);
+        }
+
+        !(sum as u16)
+    }
+
+    /// * Checks the implementation against the port of the RFC's C code for arbitrary inputs
+    /// * Splits every input in two at an arbitrary offset so that partial writes which do not
+    ///   end on a 16 bit boundary are exercised as well
+    #[test]
+    #[cfg_attr(kani, kani::proof, kani::unwind(8), kani::solver(kissat))]
+    fn differential() {
+        #[cfg(any(kani, miri))]
+        type Bytes = crate::testing::InlineVec<u8, 5>;
+        #[cfg(not(any(kani, miri)))]
+        type Bytes = Vec<u8>;
+
+        check!()
+            .with_type::<(usize, Bytes)>()
+            .for_each(|(index, bytes)| {
+                // wrap the split point into the input; an empty input can only be split at 0
+                let split = index.checked_rem(bytes.len()).unwrap_or(0);
+                let (head, tail) = bytes.split_at(split);
+
+                let mut actual = Checksum::default();
+                actual.write(head);
+                actual.write(tail);
+                let expected = rfc_c_port(bytes);
+                assert_eq!(expected.to_be_bytes(), actual.finish().to_be_bytes());
+            });
+    }
+}
diff --git a/quic/s2n-quic-core/src/inet/checksum/__fuzz__/inet__checksum__x86__tests__differential/corpus.tar.gz b/quic/s2n-quic-core/src/inet/checksum/__fuzz__/inet__checksum__x86__tests__differential/corpus.tar.gz
new file mode 100644
index 0000000000..d12bc1ec08
--- /dev/null
+++ b/quic/s2n-quic-core/src/inet/checksum/__fuzz__/inet__checksum__x86__tests__differential/corpus.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd691654a191d1282189e54f495f61e135f13d9130a0dc6fb28d7206daf8146d
+size 71680
diff --git a/quic/s2n-quic-core/src/inet/checksum/x86.rs b/quic/s2n-quic-core/src/inet/checksum/x86.rs
new file mode 100644
index 0000000000..760c6a0fe7
--- /dev/null
+++ b/quic/s2n-quic-core/src/inet/checksum/x86.rs
@@ -0,0 +1,187 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use super::{LargeWriteFn, LARGE_WRITE_LEN};
+use core::num::Wrapping;
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+/// Returns the optimized function for the running CPU, or `None` if there isn't one
+///
+/// With `std`, this relies on runtime feature detection, so the result should be cached.
+#[inline]
+pub fn probe() -> Option<LargeWriteFn> {
+    #[cfg(all(feature = "std", not(any(kani, miri))))]
+    {
+        if std::is_x86_feature_detected!("avx") {
+            return Some(write_sized_avx); // checked first, so avx wins when both are supported
+        }
+
+        if std::is_x86_feature_detected!("sse4.1") {
+            return Some(write_sized_sse);
+        }
+    }
+
+    // no way to reliably detect features in no_std
+    None
+}
+
+/// `write_sized` built with sse4.1 enabled; callers must make sure the CPU supports sse4.1
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn write_sized_sse<'a>(state: &mut Wrapping<u32>, bytes: &'a [u8]) -> &'a [u8] {
+    write_sized(state, bytes) // `#[inline(always)]`, so the body is inlined into this function
+}
+
+/// `write_sized` built with avx enabled; callers must make sure the CPU supports avx
+#[target_feature(enable = "avx")]
+pub unsafe fn write_sized_avx<'a>(state: &mut Wrapping<u32>, bytes: &'a [u8]) -> &'a [u8] {
+    write_sized(state, bytes) // `#[inline(always)]`, so the body is inlined into this function
+}
+
+#[inline(always)] // shared body of the `#[target_feature]` wrappers above
+unsafe fn write_sized<'a>(state: &mut Wrapping<u32>, mut bytes: &'a [u8]) -> &'a [u8] {
+    assume!(
+        bytes.len() >= LARGE_WRITE_LEN,
+        "large write function should only be called with large byte buffers"
+    );
+
+    // Maintain two different sums to increase instruction parallelism
+    let mut sum_a = Sum::new();
+    let mut sum_b = Sum::new();
+
+    // Unroll the loop to read 256 bytes at a time (16 16-byte blocks), split evenly across sums
+    while bytes.len() >= 256 {
+        let (chunks, remaining) = bytes.split_at(256);
+        bytes = remaining;
+
+        let ptr = chunks.as_ptr() as *const __m128i; // only read through unaligned loads
+
+        sum_a += _mm_loadu_si128(ptr);
+        sum_a += _mm_loadu_si128(ptr.add(1));
+        sum_a += _mm_loadu_si128(ptr.add(2));
+        sum_a += _mm_loadu_si128(ptr.add(3));
+        sum_a += _mm_loadu_si128(ptr.add(4));
+        sum_a += _mm_loadu_si128(ptr.add(5));
+        sum_a += _mm_loadu_si128(ptr.add(6));
+        sum_a += _mm_loadu_si128(ptr.add(7));
+
+        sum_b += _mm_loadu_si128(ptr.add(8));
+        sum_b += _mm_loadu_si128(ptr.add(9));
+        sum_b += _mm_loadu_si128(ptr.add(10));
+        sum_b += _mm_loadu_si128(ptr.add(11));
+        sum_b += _mm_loadu_si128(ptr.add(12));
+        sum_b += _mm_loadu_si128(ptr.add(13));
+        sum_b += _mm_loadu_si128(ptr.add(14));
+        sum_b += _mm_loadu_si128(ptr.add(15));
+    }
+
+    // Unroll the loop to read 64 bytes at a time (4 16-byte blocks)
+    while bytes.len() >= 64 {
+        let (chunks, remaining) = bytes.split_at(64);
+        bytes = remaining;
+
+        let ptr = chunks.as_ptr() as *const __m128i;
+
+        sum_a += _mm_loadu_si128(ptr);
+        sum_a += _mm_loadu_si128(ptr.add(1));
+        sum_b += _mm_loadu_si128(ptr.add(2));
+        sum_b += _mm_loadu_si128(ptr.add(3));
+    }
+
+    // Finish reading the full 16-byte blocks
+    while bytes.len() >= 16 {
+        let (chunks, remaining) = bytes.split_at(16);
+        bytes = remaining;
+
+        let ptr = chunks.as_ptr() as *const __m128i;
+        sum_a += _mm_loadu_si128(ptr);
+    }
+
+    // Add up all of the sums lane by lane, then merge the four 32-bit lanes into a single u32
+    let sum_a = _mm_add_epi32(sum_a.a, sum_a.b);
+    let sum_b = _mm_add_epi32(sum_b.a, sum_b.b);
+    let mut total = _mm_add_epi32(sum_a, sum_b);
+
+    total = _mm_hadd_epi32(total, _mm_setzero_si128()); // lanes: [0+1, 2+3, 0, 0]
+    total = _mm_hadd_epi32(total, _mm_setzero_si128()); // lane 0 now holds the sum of all lanes
+
+    *state += _mm_extract_epi32(total, 0) as u32;
+
+    bytes // fewer than 16 bytes are left for the caller to process
+}
+
+struct Sum {
+    a: __m128i, // 32-bit sums of the 16 bit words found in bytes 0..8 of each added block
+    b: __m128i, // 32-bit sums of the 16 bit words found in bytes 8..16 of each added block
+}
+
+impl Sum {
+    #[inline(always)]
+    unsafe fn new() -> Self {
+        // both accumulators start out with all of their 32-bit lanes at zero
+        let zero = _mm_setzero_si128();
+
+        Self { a: zero, b: zero }
+    }
+
+    #[inline(always)]
+    unsafe fn add(&mut self, rhs: __m128i) {
+        // Each mask widens four 16 bit words of the input into four 32-bit lanes; an index
+        // of -1 makes `_mm_shuffle_epi8` produce a zero byte.
+        //
+        // The input holds eight words, so it takes two masks (and two outputs) to cover it.
+        let low_words = _mm_setr_epi8(
+            0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1,
+        );
+        let high_words = _mm_setr_epi8(
+            0x8, 0x9, -1, -1, 0xa, 0xb, -1, -1, 0xc, 0xd, -1, -1, 0xe, 0xf, -1, -1,
+        );
+
+        self.a = _mm_add_epi32(self.a, _mm_shuffle_epi8(rhs, low_words));
+        self.b = _mm_add_epi32(self.b, _mm_shuffle_epi8(rhs, high_words));
+    }
+}
+
+impl core::ops::AddAssign<__m128i> for Sum {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: __m128i) { // lets `write_sized` accumulate blocks with `+=`
+        unsafe { self.add(rhs) } // NOTE(review): assumes the caller verified CPU support
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{super::write_sized_generic, *};
+    use bolero::check;
+
+    /// Ensures the x86-optimized function and the generic function compute the same sum
+    #[test]
+    fn differential() {
+        // if no optimized function is available there is nothing to compare against
+        if let Some(write_sized_opt) = probe() {
+            check!().for_each(|bytes| {
+                // the optimized function must only be called with large payloads
+                if bytes.len() < LARGE_WRITE_LEN {
+                    return;
+                }
+
+                // reference: sum every 16 bit word with the generic function
+                let mut expected = Wrapping(0);
+                write_sized_generic::<2>(&mut expected, bytes);
+
+                // candidate: the optimized function handles the bulk of the payload
+                let mut actual = Wrapping(0);
+                // SAFETY: `probe` only returns functions whose CPU feature was detected and
+                // the payload length was checked above
+                let remaining = unsafe { write_sized_opt(&mut actual, bytes) };
+                // finish the rest of the bytes
+                write_sized_generic::<2>(&mut actual, remaining);
+
+                assert_eq!(expected.0.to_ne_bytes(), actual.0.to_ne_bytes())
+            });
+        }
+    }
+}
diff --git a/quic/s2n-quic-core/src/inet/mod.rs b/quic/s2n-quic-core/src/inet/mod.rs
index 9271093be2..e389ac1d5a 100644
--- a/quic/s2n-quic-core/src/inet/mod.rs
+++ b/quic/s2n-quic-core/src/inet/mod.rs
@@ -1,11 +1,10 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-#![forbid(unsafe_code)]
-
 #[macro_use]
 mod macros;
 
+pub mod checksum;
 pub mod datagram;
 pub mod ecn;
 pub mod ethernet;
diff --git a/specs/www.rfc-editor.org/rfc/rfc1071.txt b/specs/www.rfc-editor.org/rfc/rfc1071.txt
new file mode 100644
index 0000000000..3b941084b9
--- /dev/null
+++ b/specs/www.rfc-editor.org/rfc/rfc1071.txt
@@ -0,0 +1,1417 @@
+
+
+
+
+Network Working Group                                         R.  Braden
+Request for Comments: 1071                                           ISI
+                                                              D.  Borman
+                                                           Cray Research
+                                                            C. Partridge
+                                                        BBN Laboratories
+                                                          September 1988
+
+
+                    Computing the Internet Checksum
+
+
+Status of This Memo
+
+   This memo summarizes techniques and algorithms for efficiently
+   computing the Internet checksum.  It is not a standard, but a set of
+   useful implementation techniques.  Distribution of this memo is
+   unlimited.
+
+1.  Introduction
+
+   This memo discusses methods for efficiently computing the Internet
+   checksum that is used by the standard Internet protocols IP, UDP, and
+   TCP.
+
+   An efficient checksum implementation is critical to good performance.
+   As advances in implementation techniques streamline the rest of the
+   protocol processing, the checksum computation becomes one of the
+   limiting factors on TCP performance, for example.  It is usually
+   appropriate to carefully hand-craft the checksum routine, exploiting
+   every machine-dependent trick possible; a fraction of a microsecond
+   per TCP data byte can add up to a significant CPU time savings
+   overall.
+
+   In outline, the Internet checksum algorithm is very simple:
+
+   (1)  Adjacent octets to be checksummed are paired to form 16-bit
+        integers, and the 1's complement sum of these 16-bit integers is
+        formed.
+
+   (2)  To generate a checksum, the checksum field itself is cleared,
+        the 16-bit 1's complement sum is computed over the octets
+        concerned, and the 1's complement of this sum is placed in the
+        checksum field.
+
+   (3)  To check a checksum, the 1's complement sum is computed over the
+        same set of octets, including the checksum field.  If the result
+        is all 1 bits (-0 in 1's complement arithmetic), the check
+        succeeds.
+
+        Suppose a checksum is to be computed over the sequence of octets
+
+
+
+Braden, Borman, & Partridge                                     [Page 1]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+        A, B, C, D, ... , Y, Z.  Using the notation [a,b] for the 16-bit
+        integer a*256+b, where a and b are bytes, then the 16-bit 1's
+        complement sum of these bytes is given by one of the following:
+
+            [A,B] +' [C,D] +' ... +' [Y,Z]              [1]
+
+            [A,B] +' [C,D] +' ... +' [Z,0]              [2]
+
+        where +' indicates 1's complement addition. These cases
+        correspond to an even or odd count of bytes, respectively.
+
+        On a 2's complement machine, the 1's complement sum must be
+        computed by means of an "end around carry", i.e., any overflows
+        from the most significant bits are added into the least
+        significant bits. See the examples below.
+
+        Section 2 explores the properties of this checksum that may be
+        exploited to speed its calculation.  Section 3 contains some
+        numerical examples of the most important implementation
+        techniques.  Finally, Section 4 includes examples of specific
+        algorithms for a variety of common CPU types.  We are grateful
+        to Van Jacobson and Charley Kline for their contribution of
+        algorithms to this section.
+
+        The properties of the Internet checksum were originally
+        discussed by Bill Plummer in IEN-45, entitled "Checksum Function
+        Design".  Since IEN-45 has not been widely available, we include
+        it as an extended appendix to this RFC.
+
+     2.  Calculating the Checksum
+
+        This simple checksum has a number of wonderful mathematical
+        properties that may be exploited to speed its calculation, as we
+        will now discuss.
+
+
+   (A)  Commutative and Associative
+
+        As long as the even/odd assignment of bytes is respected, the
+        sum can be done in any order, and it can be arbitrarily split
+        into groups.
+
+        For example, the sum [1] could be split into:
+
+           ( [A,B] +' [C,D] +' ... +' [J,0] )
+
+                  +' ( [0,K] +' ... +' [Y,Z] )               [3]
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                     [Page 2]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+   (B)  Byte Order Independence
+
+        The sum of 16-bit integers can be computed in either byte order.
+        Thus, if we calculate the swapped sum:
+
+           [B,A] +' [D,C] +' ... +' [Z,Y]                   [4]
+
+        the result is the same as [1], except the bytes are swapped in
+        the sum! To see why this is so, observe that in both orders the
+        carries are the same: from bit 15 to bit 0 and from bit 7 to bit
+        8.  In other words, consistently swapping bytes simply rotates
+        the bits within the sum, but does not affect their internal
+        ordering.
+
+        Therefore, the sum may be calculated in exactly the same way
+        regardless of the byte order ("big-endian" or "little-endian")
+        of the underlaying hardware.  For example, assume a "little-
+        endian" machine summing data that is stored in memory in network
+        ("big-endian") order.  Fetching each 16-bit word will swap
+        bytes, resulting in the sum [4]; however, storing the result
+        back into memory will swap the sum back into network byte order.
+
+        Byte swapping may also be used explicitly to handle boundary
+        alignment problems.  For example, the second group in [3] can be
+        calculated without concern to its odd/even origin, as:
+
+              [K,L] +' ... +' [Z,0]
+
+        if this sum is byte-swapped before it is added to the first
+        group.  See the example below.
+
+   (C)  Parallel Summation
+
+        On machines that have word-sizes that are multiples of 16 bits,
+        it is possible to develop even more efficient implementations.
+        Because addition is associative, we do not have to sum the
+        integers in the order they appear in the message.  Instead we
+        can add them in "parallel" by exploiting the larger word size.
+
+        To compute the checksum in parallel, simply do a 1's complement
+        addition of the message using the native word size of the
+        machine.  For example, on a 32-bit machine we can add 4 bytes at
+        a time: [A,B,C,D]+'... When the sum has been computed, we "fold"
+        the long sum into 16 bits by adding the 16-bit segments.  Each
+        16-bit addition may produce new end-around carries that must be
+        added.
+
+        Furthermore, again the byte order does not matter; we could
+        instead sum 32-bit words: [D,C,B,A]+'... or [B,A,D,C]+'... and
+        then swap the bytes of the final 16-bit sum as necessary.  See
+        the examples below.  Any permutation is allowed that collects
+
+
+
+Braden, Borman, & Partridge                                     [Page 3]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+        all the even-numbered data bytes into one sum byte and the odd-
+        numbered data bytes into the other sum byte.
+
+
+   There are further coding techniques that can be exploited to speed up
+   the checksum calculation.
+
+   (1)  Deferred Carries
+
+        Depending upon the machine, it may be more efficient to defer
+        adding end-around carries until the main summation loop is
+        finished.
+
+        One approach is to sum 16-bit words in a 32-bit accumulator, so
+        the overflows build up in the high-order 16 bits.  This approach
+        typically avoids a carry-sensing instruction but requires twice
+        as many additions as would adding 32-bit segments; which is
+        faster depends upon the detailed hardware architecture.
+
+   (2)  Unwinding Loops
+
+        To reduce the loop overhead, it is often useful to "unwind" the
+        inner sum loop, replicating a series of addition commands within
+        one loop traversal.  This technique often provides significant
+        savings, although it may complicate the logic of the program
+        considerably.
+
+   (3)  Combine with Data Copying
+
+        Like checksumming, copying data from one memory location to
+        another involves per-byte overhead.  In both cases, the
+        bottleneck is essentially the memory bus, i.e., how fast the
+        data can be fetched. On some machines (especially relatively
+        slow and simple micro-computers), overhead can be significantly
+        reduced by combining memory-to-memory copy and the checksumming,
+        fetching the data only once for both.
+
+   (4)  Incremental Update
+
+        Finally, one can sometimes avoid recomputing the entire checksum
+        when one header field is updated.  The best-known example is a
+        gateway changing the TTL field in the IP header, but there are
+        other examples (for example, when updating a source route).  In
+        these cases it is possible to update the checksum without
+        scanning the message or datagram.
+
+        To update the checksum, simply add the differences of the
+        sixteen bit integers that have been changed.  To see why this
+        works, observe that every 16-bit integer has an additive inverse
+        and that addition is associative.  From this it follows that
+        given the original value m, the new value m', and the old
+
+
+
+Braden, Borman, & Partridge                                     [Page 4]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+        checksum C, the new checksum C' is:
+
+                C' = C + (-m) + m' = C + (m' - m)
+
+
+3. Numerical Examples
+
+   We now present explicit examples of calculating a simple 1's
+   complement sum on a 2's complement machine.  The examples show the
+   same sum calculated byte by bye, by 16-bits words in normal and
+   swapped order, and 32 bits at a time in 3 different orders.  All
+   numbers are in hex.
+
+                  Byte-by-byte    "Normal"  Swapped
+                                    Order    Order
+
+        Byte 0/1:    00   01        0001      0100
+        Byte 2/3:    f2   03        f203      03f2
+        Byte 4/5:    f4   f5        f4f5      f5f4
+        Byte 6/7:    f6   f7        f6f7      f7f6
+                    ---  ---       -----     -----
+        Sum1:       2dc  1f0       2ddf0     1f2dc
+
+                     dc   f0        ddf0      f2dc
+        Carrys:       1    2           2         1
+                     --   --        ----      ----
+        Sum2:        dd   f2        ddf2      f2dd
+
+        Final Swap:  dd   f2        ddf2      ddf2
+
+
+        Byte 0/1/2/3:  0001f203     010003f2       03f20100
+        Byte 4/5/6/7:  f4f5f6f7     f5f4f7f6       f7f6f5f4
+                       --------     --------       --------
+        Sum1:         0f4f7e8fa    0f6f4fbe8      0fbe8f6f4
+
+        Carries:              0            0              0
+
+        Top half:          f4f7         f6f4           fbe8
+        Bottom half:       e8fa         fbe8           f6f4
+                          -----        -----          -----
+        Sum2:             1ddf1        1f2dc          1f2dc
+
+                           ddf1         f2dc           f2dc
+        Carrys:               1            1              1
+                           ----         ----           ----
+        Sum3:              ddf2         f2dd           f2dd
+
+        Final Swap:        ddf2         ddf2           ddf2
+
+
+
+
+
+Braden, Borman, & Partridge                                     [Page 5]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+   Finally, here is an example of breaking the sum into two groups,
+   with the second group starting on an odd boundary:
+
+
+                   Byte-by-byte    Normal
+                                    Order
+
+        Byte 0/1:    00   01        0001
+        Byte 2/ :    f2  (00)       f200
+                    ---  ---       -----
+        Sum1:        f2   01        f201
+
+        Byte 4/5:    03   f4        03f4
+        Byte 6/7:    f5   f6        f5f6
+        Byte 8/:     f7  (00)       f700
+                    ---  ---       -----
+        Sum2:                      1f0ea
+
+        Sum2:                       f0ea
+        Carry:                         1
+                                   -----
+        Sum3:                       f0eb
+
+        Sum1:                       f201
+        Sum3 byte swapped:          ebf0
+                                   -----
+        Sum4:                      1ddf1
+
+        Sum4:                       ddf1
+        Carry:                         1
+                                   -----
+        Sum5:                       ddf2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                     [Page 6]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+4.  Implementation Examples
+
+   In this section we show examples of Internet checksum implementation
+   algorithms that have been found to be efficient on a variety of
+   CPU's.  In each case, we show the core of the algorithm, without
+   including environmental code (e.g., subroutine linkages) or special-
+   case code.
+
+4.1  "C"
+
+   The following "C" code algorithm computes the checksum with an inner
+   loop that sums 16-bits at a time in a 32-bit accumulator.
+
+   in 6
+       {
+           /* Compute Internet Checksum for "count" bytes
+            *         beginning at location "addr".
+            */
+       register long sum = 0;
+
+        while( count > 1 )  {
+           /*  This is the inner loop */
+               sum += * (unsigned short *) addr++;
+               count -= 2;
+       }
+
+           /*  Add left-over byte, if any */
+       if( count > 0 )
+               sum += * (unsigned char *) addr;
+
+           /*  Fold 32-bit sum to 16 bits */
+       while (sum>>16)
+           sum = (sum & 0xffff) + (sum >> 16);
+
+       checksum = ~sum;
+   }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                     [Page 7]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+4.2  Motorola 68020
+
+   The following algorithm is given in assembler language for a Motorola
+   68020 chip.  This algorithm performs the sum 32 bits at a time, and
+   unrolls the loop with 16 replications.  For clarity, we have omitted
+   the logic to add the last fullword when the length is not a multiple
+   of 4.  The result is left in register d0.
+
+   With a 20MHz clock, this routine was measured at 134 usec/kB summing
+   random data.  This algorithm was developed by Van Jacobson.
+
+
+       movl    d1,d2
+       lsrl    #6,d1       | count/64 = # loop traversals
+       andl    #0x3c,d2    | Then find fractions of a chunk
+       negl    d2
+       andb    #0xf,cc     | Clear X (extended carry flag)
+
+       jmp     pc@(2$-.-2:b,d2)  | Jump into loop
+
+   1$:     | Begin inner loop...
+
+       movl    a0@+,d2     |  Fetch 32-bit word
+       addxl   d2,d0       |    Add word + previous carry
+       movl    a0@+,d2     |  Fetch 32-bit word
+       addxl   d2,d0       |    Add word + previous carry
+
+           | ... 14 more replications
+   2$:
+       dbra    d1,1$   | (NB- dbra doesn't affect X)
+
+       movl    d0,d1   | Fold 32 bit sum to 16 bits
+       swap    d1      | (NB- swap doesn't affect X)
+       addxw   d1,d0
+       jcc     3$
+       addw    #1,d0
+   3$:
+       andl    #0xffff,d0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                     [Page 8]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+4.3  Cray
+
+   The following example, in assembler language for a Cray CPU, was
+   contributed by Charley Kline.  It implements the checksum calculation
+   as a vector operation, summing up to 512 bytes at a time with a basic
+   summation unit of 32 bits.  This example omits many details having to
+   do with short blocks, for clarity.
+
+   Register A1 holds the address of a 512-byte block of memory to
+   checksum.  First two copies of the data are loaded into two vector
+   registers.  One is vector-shifted right 32 bits, while the other is
+   vector-ANDed with a 32 bit mask. Then the two vectors are added
+   together.  Since all these operations chain, it produces one result
+   per clock cycle.  Then it collapses the result vector in a loop that
+   adds each element to a scalar register.  Finally, the end-around
+   carry is performed and the result is folded to 16-bits.
+
+         EBM
+         A0      A1
+         VL      64            use full vectors
+         S1      <32           form 32-bit mask from the right.
+         A2      32
+         V1      ,A0,1            load packet into V1
+         V2      S1&V1            Form right-hand 32-bits in V2.
+         V3      V1>A2            Form left-hand 32-bits in V3.
+         V1      V2+V3            Add the two together.
+         A2      63            Prepare to collapse into a scalar.
+         S1      0
+         S4      <16           Form 16-bit mask from the right.
+         A4      16
+   CK$LOOP S2    V1,A2
+         A2      A2-1
+         A0      A2
+         S1      S1+S2
+         JAN     CK$LOOP
+         S2      S1&S4           Form right-hand 16-bits in S2
+         S1      S1>A4           Form left-hand 16-bits in S1
+         S1      S1+S2
+         S2      S1&S4           Form right-hand 16-bits in S2
+         S1      S1>A4           Form left-hand 16-bits in S1
+         S1      S1+S2
+         S1      #S1            Take one's complement
+         CMR            At this point, S1 contains the checksum.
+
+
+
+
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                     [Page 9]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+4.4  IBM 370
+
+   The following example, in assembler language for an IBM 370 CPU, sums
+   the data 4 bytes at a time.  For clarity, we have omitted the logic
+   to add the last fullword when the length is not a multiple of 4, and
+   to reverse the bytes when necessary.  The result is left in register
+   RCARRY.
+
+   This code has been timed on an IBM 3090 CPU at 27 usec/KB when
+   summing all one bits.  This time is reduced to 24.3 usec/KB if the
+   trouble is taken to word-align the addends (requiring special cases
+   at both the beginning and the end, and byte-swapping when necessary
+   to compensate for starting on an odd byte).
+
+   *      Registers RADDR and RCOUNT contain the address and length of
+   *              the block to be checksummed.
+   *
+   *      (RCARRY, RSUM) must be an even/odd register pair.
+   *      (RCOUNT, RMOD) must be an even/odd register pair.
+   *
+   CHECKSUM  SR    RSUM,RSUM       Clear working registers.
+             SR    RCARRY,RCARRY
+             LA    RONE,1          Set up constant 1.
+   *
+             SRDA  RCOUNT,6        Count/64 to RCOUNT.
+             AR    RCOUNT,RONE       +1 = # times in loop.
+             SRL   RMOD,26         Size of partial chunk to RMOD.
+             AR    RADDR,R3        Adjust addr to compensate for
+             S     RADDR,=F(64)      jumping into the loop.
+             SRL   RMOD,1          (RMOD/4)*2 is halfword index.
+             LH    RMOD,DOPEVEC9(RMOD) Use magic dope-vector for offset,
+             B     LOOP(RMOD)          and jump into the loop...
+   *
+   *             Inner loop:
+   *
+   LOOP      AL    RSUM,0(,RADDR)   Add Logical fullword
+             BC    12,*+6             Branch if no carry
+             AR    RCARRY,RONE        Add 1 end-around
+             AL    RSUM,4(,RADDR)   Add Logical fullword
+             BC    12,*+6             Branch if no carry
+             AR    RCARRY,RONE        Add 1 end-around
+   *
+   *                    ... 14 more replications ...
+   *
+             A     RADDR,=F'64'    Increment address ptr
+             BCT   RCOUNT,LOOP     Branch on Count
+    *
+    *            Add Carries into sum, and fold to 16 bits
+    *
+             ALR   RCARRY,RSUM      Add SUM and CARRY words
+             BC    12,*+6              and take care of carry
+
+
+
+Braden, Borman, & Partridge                                    [Page 10]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+             AR    RCARRY,RONE
+             SRDL  RCARRY,16        Fold 32-bit sum into
+             SRL   RSUM,16            16-bits
+             ALR   RCARRY,RSUM
+             C     RCARRY,=X'0000FFFF' and take care of any
+             BNH   DONE                     last carry
+             S     RCARRY,=X'0000FFFF'
+   DONE      X     RCARRY,=X'0000FFFF' 1's complement
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 11]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     IEN 45
+     Section 2.4.4.5
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+                       TCP Checksum Function Design
+
+
+
+                            William W. Plummer
+
+
+                       Bolt Beranek and Newman, Inc.
+                             50 Moulton Street
+                           Cambridge MA   02138
+
+
+                                5 June 1978
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 12]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     1.      Introduction
+
+     Checksums  are  included  in  packets  in   order   that   errors
+     encountered  during  transmission  may be detected.  For Internet
+     protocols such as TCP [1,9] this is especially important  because
+     packets  may  have  to cross wireless networks such as the Packet
+     Radio Network  [2]  and  Atlantic  Satellite  Network  [3]  where
+     packets  may  be  corrupted.  Internet protocols (e.g., those for
+     real time speech transmission) can tolerate a  certain  level  of
+     transmission  errors  and  forward error correction techniques or
+     possibly no checksum at all might be better.  The focus  in  this
+     paper  is  on  checksum functions for protocols such as TCP where
+     the required reliable delivery is achieved by retransmission.
+
+     Even if the checksum appears good on a  message  which  has  been
+     received, the message may still contain an undetected error.  The
+     probability of this is bounded by 2**(-C) where  C  is the number
+     of  checksum bits.  Errors can arise from hardware (and software)
+     malfunctions as well as transmission  errors.   Hardware  induced
+     errors  are  usually manifested in certain well known ways and it
+     is desirable to account for this in the design  of  the  checksum
+     function.  Ideally no error of the "common hardware failure" type
+     would go undetected.
+
+     An  example  of  a  failure  that  the  current checksum function
+     handles successfully is picking up a bit in the network interface
+     (or I/O buss, memory channel, etc.).  This will always render the
+     checksum bad.  For an example of  how  the  current  function  is
+     inadequate, assume that a control signal stops functioning in the
+     network  interface and the interface stores zeros in place of the
+     real data.  These  "all  zero"  messages  appear  to  have  valid
+     checksums.   Noise  on the "There's Your Bit" line of the ARPANET
+     Interface [4] may go undetected because the extra bits input  may
+     cause  the  checksum  to be perturbed (i.e., shifted) in the same
+     way as the data was.
+
+     Although messages containing undetected errors will  occasionally
+     be  passed  to  higher levels of protocol, it is likely that they
+     will not make sense at that level.  In the case of TCP most  such
+     messages will be ignored, but some could cause a connection to be
+     aborted.   Garbled  data could be viewed as a problem for a layer
+     of protocol above TCP which itself may have a checksumming scheme.
+
+     This paper is the first step in design of a new checksum function
+     for TCP  and  some  other  Internet  protocols.   Several  useful
+     properties  of  the current function are identified.  If possible
+
+                                   - 1 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 13]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     these should be retained  in  any  new  function.   A  number  of
+     plausible  checksum  schemes are investigated.  Of these only the
+     "product code" seems to be simple enough for consideration.
+
+     2.      The Current TCP Checksum Function
+
+     The current function is  oriented  towards  sixteen-bit  machines
+     such  as  the PDP-11 but can be computed easily on other machines
+     (e.g., PDP-10).  A packet is thought of as  a  string  of  16-bit
+     bytes  and the checksum function is the one's complement sum (add
+     with  end-around  carry)  of  those  bytes.   It  is  the   one's
+     complement  of  this sum which is stored in the checksum field of
+     the TCP header.  Before computing the checksum value, the  sender
+     places  a  zero  in  the  checksum  field  of the packet.  If the
+     checksum value computed by a receiver of the packet is zero,  the
+     packet  is  assumed  to  be  valid.  This is a consequence of the
+     "negative" number in the checksum field  exactly  cancelling  the
+     contribution of the rest of the packet.
+
+     Ignoring  the  difficulty  of  actually  evaluating  the checksum
+     function for a given  packet,  the  way  of  using  the  checksum
+     described  above  is quite simple, but it assumes some properties
+     of the checksum operator (one's complement addition, "+" in  what
+     follows):
+
+       (P1)    +  is commutative.  Thus, the  order  in  which
+             the   16-bit   bytes   are  "added"  together  is
+             unimportant.
+
+       (P2)    +  has  at  least  one  identity  element  (The
+             current  function  has  two:  +0  and  -0).  This
+             allows  the  sender  to  compute   the   checksum
+             function by placing a zero in the packet checksum
+             field before computing the value.
+
+       (P3)    +  has an  inverse.   Thus,  the  receiver  may
+             evaluate the checksum function and expect a zero.
+
+       (P4)    +  is associative, allowing the checksum  field
+             to be anywhere in the packet and the 16-bit bytes
+             to be scanned sequentially.
+
+     Mathematically, with these properties the binary operation "+" over
+     the set of 16-bit numbers forms an Abelian group [5].  Of course,
+     there  are  many Abelian groups but not all would be satisfactory
+     for  use  as  checksum  operators.   (Another  operator   readily
+
+                                   - 2 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 14]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     available  in  the  PDP-11  instruction set that has all of these
+     properties is exclusive-OR, but XOR is unsatisfactory  for  other
+     reasons.)
+
+     Albeit imprecise, another property which must be preserved in any
+     future checksum scheme is:
+
+       (P5)    +  is fast to compute on a variety of  machines
+             with limited storage requirements.
+
+     The  current  function  is  quite  good  in this respect.  On the
+     PDP-11 the inner loop looks like:
+
+             LOOP:   ADD (R1)+,R0    ; Add the next 16-bit byte
+                     ADC R0          ; Make carry be end-around
+                     SOB R2,LOOP     ; Loop over entire packet.
+
+              ( 4 memory cycles per 16-bit byte )
+
+     On the PDP-10 properties  P1-4  are  exploited  further  and  two
+     16-bit bytes per loop are processed:
+
+     LOOP: ILDB THIS,PTR   ; Get 2 16-bit bytes
+           ADD SUM,THIS    ; Add into current sum
+           JUMPGE SUM,CHKSU2  ; Jump if fewer than 8 carries
+           LDB THIS,[POINT 20,SUM,19] ; Get left 16 and carries
+           ANDI SUM,177777 ; Save just low 16 here
+           ADD SUM,THIS    ; Fold in carries
+     CHKSU2: SOJG COUNT,LOOP ; Loop over entire packet
+
+     ( 3.1 memory cycles per 16-bit byte )
+
+     The  "extra"  instructions  in  the  loops  above are required to
+     convert the two's complement  ADD  instruction(s)  into  a  one's
+     complement  add  by  making  the  carries  be  end-around.  One's
+     complement arithmetic is better than two's complement because  it
+     is  equally  sensitive  to errors in all bit positions.  If two's
+     complement addition were used, an even number  of  1's  could  be
+     dropped  (or  picked  up)  in  the  most  significant bit channel
+     without affecting the value of the checksum.   It  is  just  this
+     property  that makes some sort of addition preferable to a simple
+     exclusive-OR which is frequently used but permits an even  number
+     of drops (pick ups) in any bit channel.  RIM10B paper tape format
+     used  on PDP-10s [10] uses two's complement add because space for
+     the loader program is extremely limited.
+
+                                   - 3 -
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 15]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     Another property of the current checksum scheme is:
+
+       (P6)    Adding the checksum to a packet does not change
+             the information bytes.  Peterson [6] calls this a
+             "systematic" code.
+
+     This property  allows  intermediate  computers  such  as  gateway
+     machines  to  act  on  fields  (i.e.,  the  Internet  Destination
+     Address) without having to first  decode  the  packet.   Cyclical
+     Redundancy  Checks  used  for error correction are not systematic
+     either.  However, most applications of  CRCs  tend  to  emphasize
+     error  detection rather than correction and consequently can send
+     the message unchanged, with the CRC check bits being appended  to
+     the  end.   The  24-bit CRC used by ARPANET IMPs and Very Distant
+     Host Interfaces [4] and the ANSI standards for 800 and 6250  bits
+     per inch magnetic tapes (described in [11]) use this mode.
+
+     Note  that  the  operation  of higher level protocols is not  (by
+     design) affected by anything that may be done by a gateway acting
+     on possibly invalid packets.  It is permissible for  gateways  to
+     validate  the  checksum  on  incoming  packets,  but  in  general
+     gateways will not know how to  do  this  if  the  checksum  is  a
+     protocol-specific feature.
+
+     A final property of the current checksum scheme which is actually
+     a consequence of P1 and P4 is:
+
+       (P7)    The checksum may be incrementally modified.
+
+     This  property permits an intermediate gateway to add information
+     to a packet, for instance a timestamp, and "add"  an  appropriate
+     change  to  the  checksum  field  of  the  packet.  Note that the
+     checksum  will  still  be  end-to-end  since  it  was  not  fully
+     recomputed.
+
+     3.      Product Codes
+
+     Certain  "product  codes" are potentially useful for checksumming
+     purposes.  The following is a brief description of product  codes
+     in  the  context  of TCP.  More general treatment can be found in
+     Avizienis [7] and probably other more recent works.
+
+     The basic concept of this coding is that the message (packet)  to
+     be sent is formed by transforming the original source message and
+     adding  some  "check"  bits.   By  reading  this  and  applying a
+     (possibly different) transformation, a receiver  can  reconstruct
+
+                                   - 4 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 16]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     the  original  message  and  determine  if  it has been corrupted
+     during transmission.
+
+              Mo              Ms              Mr
+
+             -----           -----           -----
+             | A |  code     | 7 |   decode  | A |
+             | B |    ==>    | 1 |     ==>   | B |
+             | C |           | 4 |           | C |
+             -----           |...|           -----
+                             | 2 | check     plus "valid" flag
+                             ----- info
+
+             Original        Sent            Reconstructed
+
+     With product codes the transformation is  Ms = K * Mo .  That is,
+     the message sent is simply the product of  the  original  message
+     Mo   and  some  well known constant  K .  To decode, the received
+     Ms  is divided by  K  which will yield  Mr  as the  quotient  and
+     0   as the remainder if  Mr is to be considered the same as  Mo .
+
+     The first problem is selecting a "good" value for  K, the  "check
+     factor".   K  must  be  relatively  prime  to  the base chosen to
+     express  the  message.   (Example:  Binary   messages   with    K
+     incorrectly  chosen  to be 8.  This means that  Ms  looks exactly
+     like  Mo  except that three zeros have been appended.   The  only
+     way  the message could look bad to a receiver dividing by 8 is if
+     the error occurred in one of those three bits.)
+
+     For TCP the base  R  will be chosen to be 2**16.  That is,  every
+     16-bit byte (word on the PDP-11) will be considered as a digit of
+     a big number and that number is the message.  Thus,
+
+                     Mo =  SIGMA [ Bi * (R**i)]   ,   Bi is i-th byte
+                          i=0 to N
+
+                     Ms = K * Mo
+
+     Corrupting a single digit  of   Ms   will  yield   Ms' =  Ms +or-
+     C*(R**j)  for some radix position  j .  The receiver will compute
+     Ms'/K = Mo +or- C(R**j)/K. Since R  and  K  are relatively prime,
+     C*(R**j) cannot be any exact  multiple  of   K.   Therefore,  the
+     division will result in a non-zero remainder which indicates that
+     Ms'   is  a  corrupted  version  of  Ms.  As will be seen, a good
+     choice for  K  is (R**b - 1), for some  b  which  is  the  "check
+     length"  which  controls  the  degree  of detection to be had for
+
+                                   - 5 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 17]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     burst errors which affect a string of digits (i.e., 16-bit bytes)
+     in the message.  In fact  b  will be chosen to be  1, so  K  will
+     be  2**16 - 1 so that arithmetic operations will be simple.  This
+     means  that  all  bursts  of  15  or fewer bits will be detected.
+     According to [7] this choice for  b   results  in  the  following
+     expression for the fraction of undetected weight 2 errors:
+
+      f =  16(k-1)/[32(16k-3) + (6/k)]  where k is the message length.
+
+     For  large messages  f  approaches  3.125 per cent as  k  goes to
+     infinity.
+
+     Multiple precision multiplication and division are normally quite
+     complex operations, especially on small machines which  typically
+     lack  even  single precision multiply and divide operations.  The
+     exception to this is exactly the case being dealt  with  here  --
+     the  factor  is  2**16  - 1  on machines with a word length of 16
+     bits.  The reason for this is due to the following identity:
+
+             Q*(R**j)  =  Q, mod (R-1)     0 <= Q < R
+
+     That is, any digit  Q  in the selected  radix  (0,  1,  ...  R-1)
+     multiplied  by any power of the radix will have a remainder of  Q
+     when divided by the radix minus 1.
+
+     Example:  In decimal R = 10.  Pick  Q = 6.
+
+                     6  =   0 * 9  +  6  =  6, mod 9
+                    60  =   6 * 9  +  6  =  6, mod 9
+                   600  =  66 * 9  +  6  =  6, mod 9   etc.
+
+        More to the point, rem(31415/9) = rem((30000+1000+400+10+5)/9)
+           = (3 mod 9) + (1 mod 9) + (4 mod 9) + (1 mod 9) + (5 mod 9)
+           = (3+1+4+1+5) mod 9
+           = 14 mod 9
+           = 5
+
+     So, the remainder of a number divided by the radix minus one  can
+     be  found  by simply summing the digits of the number.  Since the
+     radix in the TCP case has been chosen to be  2**16 and the  check
+     factor is  2**16 - 1, a message can quickly be checked by summing
+     all  of  the  16-bit  words  (on  a  PDP-11),  with carries being
+     end-around.  If zero is the result, the message can be considered
+     valid.  Thus, checking a product coded  message  is  exactly  the
+     same complexity as with the current TCP checksum!
+
+                                   - 6 -
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 18]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     In  order  to  form   Ms,  the  sender must multiply the multiple
+     precision "number"  Mo  by  2**16 - 1.  Or,  Ms = (2**16)Mo - Mo.
+     This is performed by shifting  Mo   one  whole  word's  worth  of
+     precision  and  subtracting   Mo.   Since  carries must propagate
+     between digits, but it is only the  current  digit  which  is  of
+     interest, one's complement arithmetic is used.
+
+             (2**16)Mo =  Mo0 + Mo1 + Mo2 + ... + MoX +  0
+                 -  Mo =    - ( Mo0 + Mo1 + ......... + MoX)
+             ---------    ----------------------------------
+                Ms     =  Ms0 + Ms1 + ...             - MoX
+
+     A  loop  which  implements  this  function on a PDP-11 might look
+     like:
+             LOOP:   MOV -2(R2),R0   ; Next byte of (2**16)Mo
+                     SBC R0          ; Propagate carries from last SUB
+                     SUB (R2)+,R0    ; Subtract byte of  Mo
+                     MOV R0,(R3)+    ; Store in Ms
+                     SOB R1,LOOP     ; Loop over entire message
+                                     ; 8 memory cycles per 16-bit byte
+
+     Note that the coding procedure is not done in-place since  it  is
+     not  systematic.   In general the original copy, Mo, will have to
+     be  retained  by  the  sender  for  retransmission  purposes  and
+     therefore  must  remain  readable.   Thus  the  MOV  R0,(R3)+  is
+     required which accounts for 2 of the  8  memory cycles per  loop.
+
+     The  coding  procedure  will  add  exactly one 16-bit word to the
+     message since  Ms <  (2**16)Mo .  This additional 16 bits will be
+     at the tail of the message, but may be  moved  into  the  defined
+     location  in the TCP header immediately before transmission.  The
+     receiver will have to undo this to put  Ms   back  into  standard
+     format before decoding the message.
+
+     The  code  in  the receiver for fully decoding the message may be
+     inferred  by  observing  that  any  word  in   Ms   contains  the
+     difference between two successive words of  Mo  minus the carries
+     from the previous word, and the low order word contains minus the
+     low word of Mo.  So the low order (i.e., rightmost) word of Mr is
+     just  the negative of the low order byte of Ms.  The next word of
+     Mr is the next word of  Ms  plus the just computed  word  of   Mr
+     plus the carry from that previous computation.
+
+     A  slight  refinement  of  the  procedure is required in order to
+     protect against an all-zero message passing to  the  destination.
+     This  will  appear to have a valid checksum because Ms'/K  =  0/K
+
+                                   - 7 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 19]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     = 0 with 0 remainder.  The refinement is to make  the  coding  be
+     Ms  =  K*Mo + C where  C  is some arbitrary, well-known constant.
+     Adding this constant requires a second pass over the message, but
+     this will typically be very short since it can stop  as  soon  as
+     carries  stop propagating.  Choosing  C = 1 is sufficient in most
+     cases.
+
+     The product code checksum must  be  evaluated  in  terms  of  the
+     desired  properties  P1 - P7.  It has been shown that a factor of
+     two more machine cycles are consumed in computing or verifying  a
+     product code checksum (P5 satisfied?).
+
+     Although the code is not systematic, the checksum can be verified
+     quickly   without   decoding   the   message.   If  the  Internet
+     Destination Address is located at the least  significant  end  of
+     the packet (where the product code computation begins) then it is
+     possible  for  a  gateway to decode only enough of the message to
+     see this field without  having  to  decode  the  entire  message.
+     Thus,   P6  is  at  least  partially  satisfied.   The  algebraic
+     properties P1 through P4 are not  satisfied,  but  only  a  small
+     amount  of  computation  is  needed  to  account  for this -- the
+     message needs to be reformatted as previously mentioned.
+
+     P7  is  satisfied  since  the  product  code  checksum   can   be
+     incrementally  updated to account for an added word, although the
+     procedure is  somewhat  involved.    Imagine  that  the  original
+     message  has two halves, H1 and  H2.  Thus,  Mo = H1*(R**j) + H2.
+     The timestamp word is to be inserted between these halves to form
+     a modified  Mo' = H1*(R**(j+1)) + T*(R**j) + H2.  Since   K   has
+     been  chosen to be  R-1, the transmitted message  Ms' = Mo'(R-1).
+     Then,
+
+      Ms' =  Ms*R + T(R-1)(R**j) - H2((R-1)**2)
+
+          =  Ms*R + T*(R**(j+1))  - T*(R**j) - H2*(R**2) + 2*H2*R - H2
+
+     Recalling that  R   is  2**16,  the  word  size  on  the  PDP-11,
+     multiplying  by   R   means copying down one word in memory.  So,
+     the first term of  Ms' is simply the  unmodified  message  copied
+     down  one word.  The next term is the new data  T  added into the
+     Ms' being formed beginning at the (j+1)th word.  The addition  is
+     fairly  easy  here  since  after adding in T  all that is left is
+     propagating the carry, and that can stop as soon as no  carry  is
+     produced.  The other terms can be handled similarly.
+
+                                   - 8 -
+
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 20]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     4.      More Complicated Codes
+
+     There exists a wealth of theory on error detecting and correcting
+     codes.   Peterson  [6]  is an excellent reference.  Most of these
+     "CRC" schemes are  designed  to  be  implemented  using  a  shift
+     register  with  a  feedback  network  composed  of exclusive-ORs.
+     Simulating such a logic circuit with a program would be too  slow
+     to be useful unless some programming trick is discovered.
+
+     One  such  trick has been proposed by Kirstein [8].  Basically, a
+     few bits (four or eight) of the current shift register state  are
+     combined with bits from the input stream (from Mo) and the result
+     is  used  as  an  index  to  a  table  which yields the new shift
+     register state and, if the code is not systematic, bits  for  the
+     output  stream  (Ms).  A trial coding of an especially "good" CRC
+     function using  four-bit  bytes  showed  this  technique  to  be
+     about  four times as slow as the current checksum function.  This
+     was true for  both  the  PDP-10  and  PDP-11  machines.   Of  the
+     desirable  properties  listed  above, CRC schemes satisfy only P3
+     (It has an inverse.), and P6 (It is systematic.).   Placement  of
+     the  checksum  field in the packet is critical and the CRC cannot
+     be incrementally modified.
+
+     Although the bulk of coding theory deals with binary codes,  most
+     of  the theory works if the alphabet contains   q  symbols, where
+     q is a power of a prime number.  For instance  q  taken as  2**16
+     should  make  a great deal of the theory useful on a word-by-word
+     basis.
+
+     5.      Outboard Processing
+
+     When a function such as computing an involved  checksum  requires
+     extensive processing, one solution is to put that processing into
+     an  outboard processor.  In this way "encode message" and "decode
+     message" become single instructions which do  not  tax  the  main
+     host   processor.   The  Digital  Equipment  Corporation  VAX/780
+     computer is equipped with special  hardware  for  generating  and
+     checking  CRCs [13].  In general this is not a very good solution
+     since such a processor must be constructed  for  every  different
+     host machine which uses TCP messages.
+
+     It is conceivable that the gateway functions for a large host may
+     be  performed  entirely  in an "Internet Frontend Machine".  This
+     machine would be  responsible  for  forwarding  packets  received
+
+                                   - 9 -
+
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 21]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     either  from the network(s) or from the Internet protocol modules
+     in the connected host, and for  reassembling  Internet  fragments
+     into  segments and passing these to the host.  Another capability
+     of this machine would be  to  check  the  checksum  so  that  the
+     segments given to the host are known to be valid at the time they
+     leave the frontend.  Since computer cycles are assumed to be both
+     inexpensive and available in the frontend, this seems reasonable.
+
+     The problem with attempting to validate checksums in the frontend
+     is that it destroys the end-to-end character of the checksum.  If
+     anything,  this is the most powerful feature of the TCP checksum!
+     There is a way to make the host-to-frontend link  be  covered  by
+     the  end-to-end  checksum.   A  separate,  small protocol must be
+     developed to cover this link.  After having validated an incoming
+     packet from the network, the frontend would pass it to  the  host
+     saying "here is an Internet segment for you.  Call it #123".  The
+     host  would  save  this  segment,  and  send  a  copy back to the
+     frontend saying, "Here is what you gave me as #123.  Is it  OK?".
+     The  frontend  would  then  do a word-by-word comparison with the
+     first transmission, and  tell  the  host  either  "Here  is  #123
+     again",  or "You did indeed receive #123 properly.  Release it to
+     the appropriate module for further processing."
+
+     The headers on the messages crossing the host-frontend link would
+     most likely be covered  by  a  fairly  strong  checksum  so  that
+     information  like  which  function  is  being  performed  and the
+     message reference numbers are reliable.  These headers  would  be
+     quite  short,  maybe  only sixteen bits, so the checksum could be
+     quite strong.  The bulk of the message would not be checksummed of
+     course.
+     The reason this scheme reduces the computing burden on  the  host
+     is  that  all  that  is required in order to validate the message
+     using the end-to-end checksum is to send it back to the  frontend
+     machine.   In  the  case  of  the PDP-10, this requires only  0.5
+     memory cycles per 16-bit byte of Internet message, and only a few
+     processor cycles to setup the required transfers.
+
+     6.      Conclusions
+
+     There is an ordering of checksum functions: first and simplest is
+     none at all which provides  no  error  detection  or  correction.
+     Second,  is  sending a constant which is checked by the receiver.
+     This also is extremely weak.  Third, the exclusive-OR of the data
+     may be sent.  XOR takes the minimal amount of  computer  time  to
+     generate and check, but is not a good checksum.  Fourth, a  two's
+     complement sum of the data is somewhat better and takes  no  more
+
+                                  - 10 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 22]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     computer  time  to  compute.   Fifth, is the one's complement sum
+     which is what is currently used by  TCP.   It  is  slightly  more
+     expensive  in terms of computer time.  The next step is a product
+     code.  The product code is strongly related to  one's  complement
+     sum,  takes  still more computer time to use, provides a bit more
+     protection  against  common  hardware  failures,  but  has   some
+     objectionable properties.  Next is a genuine CRC polynomial code,
+     used  for  checking  purposes only.  This is very expensive for a
+     program to implement.  Finally, a full CRC error  correcting  and
+     detecting scheme may be used.
+
+     For  TCP  and  Internet  applications  the product code scheme is
+     viable.  It suffers mainly in that messages  must  be  (at  least
+     partially)  decoded  by  intermediate gateways in order that they
+     can be forwarded.  Should product  codes  not  be  chosen  as  an
+     improved  checksum,  some  slight  modification  to  the existing
+     scheme might be possible.  For  instance  the  "add  and  rotate"
+     function  used  for  paper  tape  by  the  PDP-6/10  group at the
+     Artificial Intelligence Laboratory at  M.I.T.  Project  MAC  [12]
+     could  be  useful  if it can be proved that it is better than the
+     current scheme and that it  can  be  computed  efficiently  on  a
+     variety of machines.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+                                  - 11 -
+
+
+
+
+
+
+
+Braden, Borman, & Partridge                                    [Page 23]
+
+RFC 1071            Computing the Internet Checksum       September 1988
+
+
+     Internet Experiment Note  45                          5 June 1978
+     TCP Checksum Function Design                   William W. Plummer
+
+     References
+
+     [1]  Cerf, V.G. and Kahn, Robert E., "A Protocol for Packet Network
+          Communications," IEEE Transactions on Communications, vol.
+          COM-22, No.  5, May 1974.
+
+     [2]  Kahn, Robert E., "The Organization of Computer Resources into
+          a Packet Radio Network", IEEE Transactions on Communications,
+          vol. COM-25, no. 1, pp. 169-178, January 1977.
+
+     [3]  Jacobs, Irwin, et al., "CPODA - A Demand Assignment Protocol
+          for SatNet", Fifth Data Communications Symposium, September
+          27-9, 1977, Snowbird, Utah
+
+     [4]  Bolt Beranek and Newman, Inc.  "Specifications for the
+          Interconnection of a Host and an IMP", Report 1822, January
+          1976 edition.
+
+     [5]  Dean, Richard A., "Elements of Abstract Algebra", John Wiley
+          and Sons, Inc., 1966
+
+     [6]  Peterson, W. Wesley, "Error Correcting Codes", M.I.T. Press
+          Cambridge MA, 4th edition, 1968.
+
+     [7]  Avizienis, Algirdas, "A Study of the Effectiveness of Fault-
+          Detecting Codes for Binary Arithmetic", Jet Propulsion
+          Laboratory Technical Report No. 32-711, September 1, 1965.
+
+     [8]  Kirstein, Peter, private communication
+
+     [9]  Cerf, V. G. and Postel, Jonathan B., "Specification of
+          Internetwork Transmission Control Program Version 3",
+          University of Southern California Information Sciences
+          Institute, January 1978.
+
+     [10] Digital Equipment Corporation, "PDP-10 Reference Handbook",
+          1970, pp. 114-5.
+
+     [11] Swanson, Robert, "Understanding Cyclic Redundancy Codes",
+          Computer Design, November, 1975, pp. 93-99.
+
+     [12] Clements, Robert C., private communication.
+
+     [13] Conklin, Peter F., and Rodgers, David P., "Advanced
+          Minicomputer Designed by Team Evaluation of Hardware/Software
+          Tradeoffs", Computer Design, April 1978, pp. 136-7.
+
+                                       - 12 -
+
+
+
+Braden, Borman, & Partridge                                    [Page 24]
+