rust can't serialize 11 fields efficiently #45068

Open
jrmuizel opened this issue Oct 6, 2017 · 17 comments
Labels

A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
C-enhancement Category: An issue proposing an enhancement or a PR with one.
C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such
I-slow Issue: Problems and improvements with respect to performance of generated code.
T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.
WG-llvm Working group: LLVM backend code generation

Comments

@jrmuizel
Contributor

jrmuizel commented Oct 6, 2017

Using noalias (#45012) lets Rust generate much better code for the serialization of the 10 fields in good_bake_bytes(); however, codegen falls back to terrible with the 11 fields of bad_bake_bytes().

use std::io::Write;
use std::{io, ptr};

// SAFETY contract: callers must reserve enough capacity up front;
// write() grows len and copies without any checks.
struct UnsafeVecWriter<'a>(&'a mut Vec<u8>);

impl<'a> Write for UnsafeVecWriter<'a> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        unsafe {
            let old_len = self.0.len();
            self.0.set_len(old_len + buf.len());
            ptr::copy_nonoverlapping(buf.as_ptr(), self.0.as_mut_ptr().offset(old_len as isize), buf.len());
        }
        Ok(buf.len())
    }
    fn flush(&mut self) -> io::Result<()> { Ok(()) }
}

struct Entity {
    o: (f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32),
}

use std::mem::transmute;
fn do_f32<W: Write>(w: &mut W, x: f32) {
    unsafe {
        // Reinterpret the f32 as its four raw bytes.
        let p: [u8; 4] = transmute(x);
        let _ = w.write(&p);
    }
}

#[inline(never)]
fn bad_bake_bytes(vec: &mut Vec<u8>, e: &Entity) {
    let w = &mut UnsafeVecWriter(vec);
    do_f32(w, e.o.0);
    do_f32(w, e.o.1);
    do_f32(w, e.o.2);
    do_f32(w, e.o.3);
    do_f32(w, e.o.4);
    do_f32(w, e.o.5);
    do_f32(w, e.o.6);
    do_f32(w, e.o.7);
    do_f32(w, e.o.8);
    do_f32(w, e.o.9);
    do_f32(w, e.o.10);
}

#[inline(never)]
fn good_bake_bytes(vec: &mut Vec<u8>, e: &Entity) {
    let w = &mut UnsafeVecWriter(vec);
    do_f32(w, e.o.0);
    do_f32(w, e.o.1);
    do_f32(w, e.o.2);
    do_f32(w, e.o.3);
    do_f32(w, e.o.4);
    do_f32(w, e.o.5);
    do_f32(w, e.o.6);
    do_f32(w, e.o.7);
    do_f32(w, e.o.8);
    do_f32(w, e.o.9);
    //do_f32(w, e.o.10);
}

fn main() {
    // Reserve up front: UnsafeVecWriter writes without checking capacity.
    let mut encoded = Vec::with_capacity(2 * 11 * 4);
    let decoded: Entity = unsafe { std::mem::uninitialized() };
    bad_bake_bytes(&mut encoded, &decoded);
    good_bake_bytes(&mut encoded, &decoded);
}

The generated assembly, bad_bake_bytes first and then good_bake_bytes:
__ZN10serde_fast14bad_bake_bytes17h506e94e6df0b1a3bE:
	.cfi_startproc
	pushq	%rbp
Lcfi0:
	.cfi_def_cfa_offset 16
Lcfi1:
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
Lcfi2:
	.cfi_def_cfa_register %rbp
	movl	(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	4(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	8(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	12(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	16(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	20(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	24(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	28(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	32(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	36(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	movl	40(%rsi), %eax
	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)
	movq	(%rdi), %rdx
	movl	%eax, (%rdx,%rcx)
	popq	%rbp
	retq
	.cfi_endproc

	.p2align	4, 0x90
__ZN10serde_fast15good_bake_bytes17h3098644f875a0da3E:
	.cfi_startproc
	pushq	%rbp
Lcfi3:
	.cfi_def_cfa_offset 16
Lcfi4:
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
Lcfi5:
	.cfi_def_cfa_register %rbp
	movl	(%rsi), %eax
	movq	(%rdi), %rcx
	movq	16(%rdi), %rdx
	movl	%eax, (%rcx,%rdx)
	movl	4(%rsi), %eax
	movl	%eax, 4(%rcx,%rdx)
	movl	8(%rsi), %eax
	movl	%eax, 8(%rcx,%rdx)
	movl	12(%rsi), %eax
	movl	%eax, 12(%rcx,%rdx)
	movl	16(%rsi), %eax
	movl	%eax, 16(%rcx,%rdx)
	movl	20(%rsi), %eax
	movl	%eax, 20(%rcx,%rdx)
	movl	24(%rsi), %eax
	movl	%eax, 24(%rcx,%rdx)
	movl	28(%rsi), %eax
	movl	%eax, 28(%rcx,%rdx)
	movl	32(%rsi), %eax
	movl	%eax, 32(%rcx,%rdx)
	movl	36(%rsi), %eax
	leaq	40(%rdx), %rsi
	movq	%rsi, 16(%rdi)
	movl	%eax, 36(%rcx,%rdx)
	popq	%rbp
	retq
	.cfi_endproc
@pcwalton added the A-LLVM and I-slow labels Oct 6, 2017
@pcwalton
Contributor

pcwalton commented Oct 6, 2017

Wild guess: A larger function trips the inlining thresholds differently, causing a pass ordering issue.

@jrmuizel
Contributor Author

jrmuizel commented Oct 6, 2017

Running LLVM trunk's opt doesn't fix it. @sunfishcode, any ideas what might be happening here?

@sunfishcode
Member

I'm looking into it (building a compiler with the appropriate patch now). In the meantime, just to be clear, the problem here is the

	movq	16(%rdi), %rcx
	leaq	4(%rcx), %rdx
	movq	%rdx, 16(%rdi)

updating the length field of the Vec after each element is written in the "bad" version. In the "good" version, LLVM figures out that the length field isn't written to by the other stores, so it's able to combine all the increments and do a single store at the end.

@jrmuizel
Contributor Author

jrmuizel commented Oct 6, 2017

Exactly.

@sunfishcode
Member

sunfishcode commented Oct 6, 2017

It's due to the hard-coded limit in CaptureTracking here.

The code does this:

  • load the buffer pointer from the Vec
  • store the first float to the buffer
  • load the length from the Vec

and the key question is whether that store writes to the memory read by the following load.

The noalias on the argument isn't sufficient; LLVM needs to be sure that the address of the Vec hasn't escaped, such that it could have been stored in the buffer pointer field. CaptureTracking walks through the uses of the Vec's address; however, in the bad_bake_bytes case it hits the threshold before it sees all of them and assumes the worst.
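
To make the aliasing question concrete, here is a minimal sketch (names invented for illustration; not the compiler's actual representation) of the inlined per-element sequence:

// Can the float store through `buf` modify the Vec's length field? Only if
// the Vec's own address escaped into its buffer. Once CaptureTracking gives
// up, LLVM must conservatively reload the length after every store instead
// of folding all the increments into a single store at the end.
unsafe fn two_elements(vec: &mut Vec<u8>, x0: f32, x1: f32) {
    let buf = vec.as_mut_ptr();                     // load the buffer pointer
    let len = vec.len();                            // load the length
    vec.set_len(len + 4);                           // store the length
    (buf.add(len) as *mut f32).write_unaligned(x0); // store the first float
    let len = vec.len();                            // must this reload happen?
    vec.set_len(len + 4);
    (buf.add(len) as *mut f32).write_unaligned(x1); // store the second float
}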

@jrmuizel
Contributor Author

jrmuizel commented Oct 6, 2017

Is there something we can do to make llvm's job easier?

@pcwalton
Contributor

pcwalton commented Oct 6, 2017

Could we raise the limit via a compiler option here?

For example, MemorySSA has a compiler option for this: http://llvm.org/doxygen/MemorySSA_8cpp.html#a5926ddc0f7c4225c6ced440baa2fb7a3

I know this is the kind of thing we can't select a good value for in general, but for WebRender we could select a value that we know always produces good serde codegen for us.
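
For reference, rustc can already forward LLVM cl::opt flags via -C llvm-args, so a project could pin such a limit itself once the option exists. A hypothetical invocation, modeled on the MemorySSA option linked above (the flag value and file name are made up for illustration):

    # Sketch: pass a backend tuning flag through rustc to LLVM.
    rustc -O -C llvm-args=-memssa-check-limit=200 serde_fast.rs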

@pcwalton
Contributor

pcwalton commented Oct 6, 2017

@jrmuizel Doing this optimization ahead of time in rustc on MIR would help. It's easier for rustc to tell that pointers don't alias.

Unfortunately, MIR optimizations are missing a lot of basic infrastructure to enable this kind of thing; for example, they can't do inlining or SROA yet…

@pcwalton
Contributor

pcwalton commented Oct 6, 2017

I'll write the LLVM patch to add an option if nobody objects.

@pcwalton
Contributor

pcwalton commented Oct 6, 2017

Patch is up: https://reviews.llvm.org/D38648

@jrmuizel
Contributor Author

jrmuizel commented Oct 6, 2017

In the meantime, I can work around this by making it so that, instead of modifying the length on every write, we just move the pointer and compute the length at the end.

// Now just a raw write cursor into the Vec's buffer; the Vec's length is
// fixed up once at the end.
struct UnsafeVecWriter(*mut u8);

impl Write for UnsafeVecWriter {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        unsafe {
            ptr::copy_nonoverlapping(buf.as_ptr(), self.0, buf.len());
            self.0 = self.0.offset(buf.len() as isize);
        }
        Ok(buf.len())
    }
    fn flush(&mut self) -> io::Result<()> { Ok(()) }
}
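
For completeness, a sketch of how a caller drives this (names invented; the key obligations are reserving capacity up front and restoring the Vec's length afterwards, as the commit below notes):

// Hypothetical caller for the pointer-based writer. Capacity must be
// reserved before any write, and the length set once all writes are done.
fn bake_with_cursor(vec: &mut Vec<u8>, e: &Entity) {
    vec.reserve(11 * 4);
    let start = unsafe { vec.as_mut_ptr().add(vec.len()) };
    let mut w = UnsafeVecWriter(start);
    do_f32(&mut w, e.o.0);
    // ... remaining ten fields ...
    let written = w.0 as usize - start as usize;
    unsafe { vec.set_len(vec.len() + written) };
}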

jrmuizel added a commit to jrmuizel/webrender that referenced this issue Oct 6, 2017
Instead of changing the length every write we just adjust the pointer.
This avoids rust-lang/rust#45068.
However we now need to ensure that we set the length when we are done.
@pcwalton
Contributor

pcwalton commented Oct 6, 2017

@jrmuizel Does WR have good serialization codegen now?

jrmuizel added a commit to jrmuizel/webrender that referenced this issue Oct 6, 2017

With this patch only 1.2% of WebRender display list building is spent
in serialization.
@jrmuizel
Contributor Author

jrmuizel commented Oct 7, 2017

It's pretty good, but it could be better. If I run opt -O2 on the IR, it improves:
before:

        movl    (%rsi), %ecx
        movl    %ecx, (%rax)
        movl    4(%rsi), %ecx
        movl    %ecx, 4(%rax)
        movl    8(%rsi), %ecx
        movl    %ecx, 8(%rax)
        movl    12(%rsi), %ecx
        movl    %ecx, 12(%rax)
        movl    16(%rsi), %ecx
        movl    %ecx, 16(%rax)
        movl    20(%rsi), %ecx
        movl    %ecx, 20(%rax)
        movl    24(%rsi), %ecx
        movl    %ecx, 24(%rax)
        movl    28(%rsi), %ecx
        movl    %ecx, 28(%rax)
        movl    32(%rsi), %ecx
        movl    %ecx, 32(%rax)
        movl    36(%rsi), %ecx
        movl    %ecx, 36(%rax)
        movl    40(%rsi), %ecx
        movl    %ecx, 40(%rax)

after:

        movups  (%rsi), %xmm0
        movups  %xmm0, (%rax)
        movups  16(%rsi), %xmm0
        movups  %xmm0, 16(%rax)
        movl    32(%rsi), %ecx
        movl    %ecx, 32(%rax)
        movl    36(%rsi), %ecx
        movl    %ecx, 36(%rax)
        movl    40(%rsi), %ecx
        movl    %ecx, 40(%rax)
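
One way to reproduce that comparison (file names illustrative):

    rustc -O --emit=llvm-ir serde_fast.rs    # writes serde_fast.ll
    opt -O2 serde_fast.ll -S -o opt.ll       # re-run LLVM's -O2 pipeline on the IR
    llc opt.ll -o opt.s                      # regenerate the assembly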

@pcwalton
Contributor

pcwalton commented Oct 7, 2017

Seems like a pass ordering issue. Perhaps the new pass manager in LLVM will help someday…

Or we could fiddle with the pass ordering ourselves.

bors-servo pushed a commit to servo/webrender that referenced this issue Oct 8, 2017
Make the writer even more unsafe.

Instead of changing the length every write we just adjust the pointer.
This avoids rust-lang/rust#45068.
However we now need to ensure that we set the length when we are done.

@TimNN added the C-enhancement label Oct 8, 2017
@jrmuizel
Contributor Author

FWIW, it looks like the movups vs. movl 24(%rsi), %ecx problem can be solved by building with opt-level=3.
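
(Note: rustc's -O shorthand means opt-level=2, so level 3 must be requested explicitly; file name illustrative:)

    rustc -C opt-level=3 serde_fast.rs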

@jrmuizel
Contributor Author

I've filed https://bugzilla.mozilla.org/show_bug.cgi?id=1413285 on the Mozilla side about this.

@Mark-Simulacrum added the WG-llvm label May 28, 2018
@klensy
Contributor

klensy commented May 30, 2022

https://rust.godbolt.org/z/hfG734fGf

Replacing the tuple with an array and iterating over it eliminates the bad codegen, even with -O.
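
A sketch of that variant, reusing the repro's helpers (per the godbolt link, this compiles to the combined-store form):

// Tuple replaced by a fixed-size array; iterating sidesteps the cliff.
struct Entity {
    o: [f32; 11],
}

#[inline(never)]
fn bake_bytes(vec: &mut Vec<u8>, e: &Entity) {
    let w = &mut UnsafeVecWriter(vec);
    for &x in e.o.iter() {
        do_f32(w, x);
    }
}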

@Noratrieb added the T-compiler label Apr 5, 2023
@workingjubilee added the C-optimization label Oct 8, 2023