
Constructing an iterator from a slice or Vec doesn't optimise completely #11751

Closed
huonw opened this issue Jan 23, 2014 · 11 comments
Labels
I-slow Issue: Problems and improvements with respect to performance of generated code.

Comments

@huonw
Member

huonw commented Jan 23, 2014

#![crate_type = "lib"]

pub fn slice(s: &[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

pub fn vec(s: Vec<uint>) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

pub fn owned(s: ~[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

Compiled with -O --lib --emit-llvm -S, this gives the following. The only major difference between the &[]/Vec versions and the ~[] version is the two lines marked THIS CHECK. We think this is because constructing an iterator from ~[] does a pointer offset and dereference, so LLVM knows the pointers are non-null, whereas in the slice/Vec case the match it.next() { None => ... } part of the for loop isn't removed (see the desugaring sketch after the IR listing).

; ModuleID = '11751.rs'
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"struct.std::vec::Vec<uint>[#1]" = type { i64, i64, i64* }

; Function Attrs: nounwind readonly uwtable
define i64 @_ZN5slice20h084d58a6edab0287daa4v0.0E({ i64*, i64 }* noalias nocapture nonnull readonly) unnamed_addr #0 {
entry-block:
  %1 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 0
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 1
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %match_else, %entry-block
  %6 = phi i64* [ %9, %match_else ], [ %2, %entry-block ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null     ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %return, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %return, label %loop_body

return:                                           ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  ret i64 %__make_return_pointer.0
}

; Function Attrs: uwtable
define i64 @_ZN3vec20h4963a1d1a9f58c9eUaa4v0.0E(%"struct.std::vec::Vec<uint>[#1]"* noalias nocapture nonnull readonly) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 2
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 0
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %2, %entry-block ], [ %9, %match_else ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null      ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %clean_custom_6, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %clean_custom_6, label %loop_body

clean_custom_6:                                   ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  %12 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 1
  %13 = load i64* %12, align 8
  %14 = icmp eq i64 %13, 0
  br i1 %14, label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit", label %then-block-549-.i.i

then-block-549-.i.i:                              ; preds = %clean_custom_6
  %15 = bitcast i64* %2 to i8*
  tail call void @je_dallocx(i8* %15, i32 3)
  br label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit"

"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit": ; preds = %clean_custom_6, %then-block-549-.i.i
  ret i64 %__make_return_pointer.0
}

declare void @je_dallocx(i8*, i32) unnamed_addr #2

; Function Attrs: uwtable
define i64 @_ZN5owned20h3f7b4426165c9e96Bba4v0.0E({ i64, i64, [0 x i64] }* noalias nonnull) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 0
  %2 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 0
  %3 = load i64* %2, align 8
  %4 = lshr i64 %3, 3
  %5 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %1, %entry-block ], [ %8, %match_else ]
  %7 = icmp eq i64* %6, %5
  br i1 %7, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %match_else

match_else:                                       ; preds = %loop_body
  %8 = getelementptr inbounds i64* %6, i64 1
  %9 = load i64* %6, align 8
  %10 = icmp ugt i64 %9, 10
  br i1 %10, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %loop_body

"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit": ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %9, %match_else ], [ 0, %loop_body ]
  %11 = bitcast { i64, i64, [0 x i64] }* %0 to i8*
  tail call void @je_dallocx(i8* %11, i32 3)
  ret i64 %__make_return_pointer.0
}

attributes #0 = { nounwind readonly uwtable "split-stack" }
attributes #1 = { uwtable "split-stack" }
attributes #2 = { "split-stack" }
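
For reference, the for loops above desugar to an explicit match on it.next(). Since Option<&uint> is represented as a nullable pointer (None is the null pointer), the None arm is exactly the null comparison that survives in the slice/Vec IR. A sketch of the desugaring (illustrative only, not actual compiler output):

pub fn slice_desugared(s: &[uint]) -> uint {
    let mut it = s.iter();
    loop {
        match it.next() {
            Some(&j) => if j > 10 { return j },
            None => break, // compiles to `ptr == null`: the THIS CHECK lines
        }
    }
    0
}
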
@thestinger
Contributor

Implementing #9546 (metadata indicating that an instruction produces a non-null value) would fix this.

huonw added a commit to huonw/rust that referenced this issue Feb 18, 2014
This uses a vector iterator to avoid the need for unsafe indexing,
and makes this function slightly faster. Unfortunately rust-lang#11751 means that
the iterator comes with repeated `null` checks, so the pure-ASCII case
still has room for significant improvement (the other cases too, but it
matters most for pure ASCII).

Before:

    is_utf8_100_ascii             ... bench:       143 ns/iter (+/- 6)
    is_utf8_100_multibyte         ... bench:       134 ns/iter (+/- 4)

After:

    is_utf8_100_ascii             ... bench:       123 ns/iter (+/- 4)
    is_utf8_100_multibyte         ... bench:       115 ns/iter (+/- 5)
@huonw huonw changed the title Constructing an iterator from a slice doesn't optimise completely Constructing an iterator from a slice or Vec doesn't optimise completely Jun 4, 2014
@aturon aturon added the A-libs label Jun 4, 2014
@zwarich

zwarich commented Jun 11, 2014

In theory, LLVM should be able to determine that this null check is unnecessary without additional metadata. There are two separate changes to LLVM's optimizer that are required:

  1. An inbounds GEP either produces a valid pointer into an allocated object or a poison value. In address space 0, there is no allocated object at the zero address, which implies that any inbounds GEP of a null base pointer in address space 0 produces a poison value. According to the LLVM LangRef, the icmp would then depend on the poison value, and any instruction that depends on a poison value exhibits undefined behavior. However, Dan Gohman (sunfish) tells me that this rule was only intended to apply to instructions with externally visible side effects, since otherwise any add instruction could potentially have undefined behavior. Any chain of inbounds GEPs and phis starting from a null base and ending with a load of the resulting value should be undefined behavior, because poison values behave like undef. We can't simply assume that an inbounds GEP produces a nonnull value, because that would imply that inbounds GEPs themselves can have undefined behavior when the runtime value is actually null; lots of optimizations depend on being able to hoist GEPs, e.g. out of loops, and that wouldn't be possible if they potentially had undefined behavior.
  2. The current LazyValueInfo / CorrelatedValuePropagation passes are not optimistic with respect to control flow. Since there is a phi here, the optimization opportunity would be missed even if LazyValueInfo understood that the pointers are non-null. Fixing this would improve other optimizations as well, but it would probably hurt compile time a bit. IIRC, changes along these lines have been proposed for LLVM in the past, but they have never gone in.

Correctly implementing the rule for poison values in LazyValueInfo would be quite difficult, because it requires reasoning about control dependence with respect to poison values. Also, the compile-time cost of making LazyValueInfo optimistic might be too high for the patch to land.

In #9546 there is a proposal to add metadata on LLVM instructions that indicates that the instruction produces a nonnull value. There are two reasons why this proposal would be a bit more difficult than it seems at first:

  1. If an instruction is marked with the nonnull metadata, what happens if the value actually is null at runtime? Is it undefined behavior, or is it a poison value? If it is undefined behavior, then any instruction with the nonnull metadata would potentially have side effects, and code motion that modifies the control dependence of this instruction would be prohibited. Dan and I realized that this is also a problem with the current range metadata in LLVM that has likely gone unnoticed. It isn't a problem with the existing nonnull attribute on function parameters and return values, because the attribute is erased upon inlining.
  2. The nonnull metadata would only tell you that the inputs to the phi are nonnull, but LazyValueInfo would still not be able to propagate that to the phi because of the inadequacy mentioned above. Another pass would have to propagate nonnull on phis.

Another option would be to write a pass that looks for chains of inbounds GEPs, nonnull parameters / return values, and phis of these that feed into loads and stores control-dependent on null checks of some intermediate value in the chain. The null checks could be replaced with false, and then hopefully other optimization passes would be able to clean everything up.
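
For context, the slice iterator that produces this IR is essentially a pair of pointers advanced by offset: both the current and end pointers are inbounds GEPs from the slice's base, which is exactly the chain of inbounds GEPs and phis such a pass would have to follow. A simplified sketch (written in later Rust syntax; not the exact libstd source):

use std::marker::PhantomData;

pub struct Items<'a, T> {
    ptr: *const T,              // current position: an inbounds offset from the base
    end: *const T,              // base + len: also an inbounds GEP from the base
    marker: PhantomData<&'a T>, // ties yielded references to the slice's lifetime
}

impl<'a, T> Iterator for Items<'a, T> {
    type Item = &'a T;
    fn next(&mut self) -> Option<&'a T> {
        if self.ptr == self.end {
            None // represented as a null pointer in Option<&T>
        } else {
            unsafe {
                let old = self.ptr;
                self.ptr = self.ptr.offset(1); // another inbounds GEP each iteration
                Some(&*old)
            }
        }
    }
}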

@pcwalton
Contributor

Sounds like the last option is the easiest. I also like the fact that it's a separate pass, meaning that if we have trouble getting it upstream we can maintain it in our branch for a while.

@zwarich

zwarich commented Jun 26, 2014

I wrote the optimization pass I described. It is able to optimize the first case (with &[uint]), but it is unable to optimize the second case (with Vec<uint>). I have an informal inductive argument for why the pointer should still always be nonnull under LLVM IR's poison value semantics (roughly: if the base pointer were null, every inbounds GEP derived from it would be poison, and a load through one would be undefined behavior, so any execution that reaches the load must have nonnull pointers), but implementing it as code will be trickier.

@zwarich

zwarich commented Jun 27, 2014

Unfortunately, my pass causes the compiled rustc to segfault when compiling liblibc. That might be a pain to track down. The problem could be in my code, or rustc could just be marking a GEP inbounds when it shouldn't.

@zwarich

zwarich commented Jun 28, 2014

I found the issue and put a first cut of my pass up as rust-lang/llvm#13.

@brson
Contributor

brson commented Jun 28, 2014

\o/

@zwarich

zwarich commented Jun 28, 2014

The pass that was landed handles the &[T] case. There are two obvious remaining things to do:

  1. Generalize to the case of multiple null checks involved in a single branch. This shouldn't be too hard.
  2. Handle the case where the base pointer is not known to be nonnull, but everything else in the recurrence is an inbounds GEP. This corresponds to the Vec<T> case. In theory, we could add nonnull metadata to the load (since loads are allowed to have undefined behavior), but applying the LLVM rules inductively actually lets the checks be removed without that.

@zwarich

zwarich commented Jun 28, 2014

Actually, that second point applies to the IR generated by &[T] as well. I must've oversimplified it when I made test cases. I'll try to solve that soon then.

@zwarich

zwarich commented Jun 28, 2014

I have local changes that fix those two items. In order to make this apply to zip(), I also need to track dominating conditions from other blocks. This shouldn't be that much more work, so maybe I'll hold off until I have that working.
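
For illustration, here is the zip() shape in the style of the functions at the top of this issue (a hypothetical example, not code from the thread); the loop now carries two pointer recurrences, so proving both non-null requires conditions established in other blocks:

pub fn zipped(xs: &[uint], ys: &[uint]) -> uint {
    // Two iterators advance in lock-step; each end/null check involves a
    // different pointer chain, so the pass must track dominating conditions.
    for (&x, &y) in xs.iter().zip(ys.iter()) {
        if x + y > 10 { return x + y }
    }
    0
}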

@zwarich zwarich self-assigned this Jun 28, 2014
@zwarich

zwarich commented Jun 29, 2014

Those changes are up as rust-lang/llvm#14. I'll need to add less conservative control dependence checking, and then it should be able to handle arbitrary chaining of zipped iterators.

dotdash added a commit to dotdash/rust that referenced this issue Jul 21, 2014
flip1995 pushed a commit to flip1995/rust that referenced this issue Feb 8, 2024

Add `lint_groups_priority` lint

Warns when a lint group in Cargo.toml's `[lints]` section shares the same priority as a lint. This is in the cargo section but is categorised as `correctness`, so it's on by default; it doesn't call `cargo metadata`, though, and instead parses `Cargo.toml` directly.

The lint should be temporary until rust-lang/cargo#12918 is resolved, but in the meantime this is a common issue to run into.

- rust-lang#11237
- rust-lang#11751
- rust-lang#11830

changelog: Add [`lint_groups_priority`] lint

r? `@flip1995`