
Constructing an iterator from a slice or Vec doesn't optimise completely #11751

Closed
huonw opened this issue Jan 23, 2014 · 11 comments
Labels
I-slow Issue: Problems and improvements with respect to performance of generated code.

Comments

@huonw
Member

huonw commented Jan 23, 2014

#![crate_type = "lib"]

pub fn slice(s: &[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

pub fn vec(s: Vec<uint>) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

pub fn owned(s: ~[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

Compiled with -O --lib --emit-llvm -S, this gives the following. The only major difference between the &[]/Vec versions and the ~[] version is the two lines marked THIS CHECK. We think this is because constructing an iterator from ~[] does a pointer offset and dereference, so LLVM knows the pointers are non-null, whereas in the slice/Vec case the match it.next() { None => ... } part of the for loop isn't removed (see the desugaring sketch after the IR listing).

; ModuleID = '11751.rs'
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"struct.std::vec::Vec<uint>[#1]" = type { i64, i64, i64* }

; Function Attrs: nounwind readonly uwtable
define i64 @_ZN5slice20h084d58a6edab0287daa4v0.0E({ i64*, i64 }* noalias nocapture nonnull readonly) unnamed_addr #0 {
entry-block:
  %1 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 0
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 1
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %match_else, %entry-block
  %6 = phi i64* [ %9, %match_else ], [ %2, %entry-block ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null     ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %return, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %return, label %loop_body

return:                                           ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  ret i64 %__make_return_pointer.0
}

; Function Attrs: uwtable
define i64 @_ZN3vec20h4963a1d1a9f58c9eUaa4v0.0E(%"struct.std::vec::Vec<uint>[#1]"* noalias nocapture nonnull readonly) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 2
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 0
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %2, %entry-block ], [ %9, %match_else ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null      ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %clean_custom_6, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %clean_custom_6, label %loop_body

clean_custom_6:                                   ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  %12 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 1
  %13 = load i64* %12, align 8
  %14 = icmp eq i64 %13, 0
  br i1 %14, label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit", label %then-block-549-.i.i

then-block-549-.i.i:                              ; preds = %clean_custom_6
  %15 = bitcast i64* %2 to i8*
  tail call void @je_dallocx(i8* %15, i32 3)
  br label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit"

"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit": ; preds = %clean_custom_6, %then-block-549-.i.i
  ret i64 %__make_return_pointer.0
}

declare void @je_dallocx(i8*, i32) unnamed_addr #2

; Function Attrs: uwtable
define i64 @_ZN5owned20h3f7b4426165c9e96Bba4v0.0E({ i64, i64, [0 x i64] }* noalias nonnull) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 0
  %2 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 0
  %3 = load i64* %2, align 8
  %4 = lshr i64 %3, 3
  %5 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %1, %entry-block ], [ %8, %match_else ]
  %7 = icmp eq i64* %6, %5
  br i1 %7, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %match_else

match_else:                                       ; preds = %loop_body
  %8 = getelementptr inbounds i64* %6, i64 1
  %9 = load i64* %6, align 8
  %10 = icmp ugt i64 %9, 10
  br i1 %10, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %loop_body

"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit": ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %9, %match_else ], [ 0, %loop_body ]
  %11 = bitcast { i64, i64, [0 x i64] }* %0 to i8*
  tail call void @je_dallocx(i8* %11, i32 3)
  ret i64 %__make_return_pointer.0
}

attributes #0 = { nounwind readonly uwtable "split-stack" }
attributes #1 = { uwtable "split-stack" }
attributes #2 = { "split-stack" }
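
For reference, the for loops above desugar to an explicit match on it.next(). Since Option<&uint> is represented as a nullable pointer (None is the null pointer), the None arm is exactly the null comparison that survives in the slice/Vec IR. A sketch of the desugaring (illustrative only, not actual compiler output):

pub fn slice_desugared(s: &[uint]) -> uint {
    let mut it = s.iter();
    loop {
        match it.next() {
            Some(&j) => if j > 10 { return j },
            None => break, // compiles to `ptr == null`: the THIS CHECK lines
        }
    }
    0
}
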
@thestinger
Contributor

Implementing #9546 (metadata indicating that an instruction produces a non-null value) would fix this.

huonw added a commit to huonw/rust that referenced this issue Feb 18, 2014
This uses a vector iterator to avoid the need for unsafe indexing,
and makes this function slightly faster. Unfortunately rust-lang#11751 means that
the iterator comes with repeated `null` checks, so the pure-ASCII case
still has room for significant improvement (the other cases too, but it
matters most for pure ASCII).

Before:

    is_utf8_100_ascii             ... bench:       143 ns/iter (+/- 6)
    is_utf8_100_multibyte         ... bench:       134 ns/iter (+/- 4)

After:

    is_utf8_100_ascii             ... bench:       123 ns/iter (+/- 4)
    is_utf8_100_multibyte         ... bench:       115 ns/iter (+/- 5)
@huonw huonw changed the title Constructing an iterator from a slice doesn't optimise completely Constructing an iterator from a slice or Vec doesn't optimise completely Jun 4, 2014
@aturon aturon added the A-libs label Jun 4, 2014
@zwarich

zwarich commented Jun 11, 2014

In theory, LLVM should be able to determine that this null check is unnecessary without additional metadata. There are two separate changes to LLVM's optimizer that are required:

  1. An inbounds GEP either produces a valid pointer into an allocated object or a poison value. In address space 0, there is no allocated object at the zero address, which implies that any inbounds GEP of a null base pointer in address space 0 produces a poison value. According to the LLVM LangRef, the icmp would then depend on the poison value, and any instruction that depends on a poison value exhibits undefined behavior. However, Dan Gohman (sunfish) tells me that this rule was only intended to apply to instructions with externally visible side effects, since otherwise any add instruction could potentially have undefined behavior. Any chain of inbounds GEPs and phis starting from a null base and ending with a load of the resulting value should be undefined behavior, because poison values behave like undef. We can't simply assume that an inbounds GEP produces a nonnull value, because that would imply that inbounds GEPs themselves can have undefined behavior when the runtime value is actually null; lots of optimizations depend on being able to hoist GEPs, e.g. out of loops, and that wouldn't be possible if they potentially had undefined behavior.
  2. The current LazyValueInfo / CorrelatedValuePropagation passes are not optimistic with respect to control flow. Since there is a phi here, the optimization opportunity would be missed even if LazyValueInfo understood that the pointers are non-null. Fixing this would improve other optimizations as well, but it would probably hurt compile time a bit. IIRC, changes along these lines have been proposed for LLVM in the past, but they have never gone in.

Correctly implementing the rule for poison values in LazyValueInfo would be quite difficult, because it requires reasoning about control dependence with respect to poison values. Also, the compile-time cost of making LazyValueInfo optimistic might be too high for the patch to land.

In #9546 there is a proposal to add metadata on LLVM instructions that indicates that the instruction produces a nonnull value. There are two reasons why this proposal would be a bit more difficult than it seems at first:

  1. If an instruction is marked with the nonnull metadata, what happens if the value actually is null at runtime? Is it undefined behavior, or is it a poison value? If it is undefined behavior, then any instruction with the nonnull metadata would potentially have side effects, and code motion that modifies the control dependence of this instruction would be prohibited. Dan and I realized that this is also a problem with the current range metadata in LLVM that has likely gone unnoticed. It isn't a problem with the existing nonnull attribute on function parameters and return values, because the attribute is erased upon inlining.
  2. The nonnull metadata would only tell you that the inputs to the phi are nonnull, but LazyValueInfo would still not be able to propagate that to the phi because of the inadequacy mentioned above. Another pass would have to propagate nonnull on phis.

Another option would be to write a pass that looks for chains of inbounds GEPs, nonnull parameters / return values, and phis of these that feed into loads and stores control-dependent on null checks of some intermediate value in the chain. The null checks could be replaced with false, and then hopefully other optimization passes would be able to clean everything up.
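
For context, the slice iterator that produces this IR is essentially a pair of pointers advanced by offset: both the current and end pointers are inbounds GEPs from the slice's base, which is exactly the chain of inbounds GEPs and phis such a pass would have to follow. A simplified sketch (written in later Rust syntax; not the exact libstd source):

use std::marker::PhantomData;

pub struct Items<'a, T> {
    ptr: *const T,              // current position: an inbounds offset from the base
    end: *const T,              // base + len: also an inbounds GEP from the base
    marker: PhantomData<&'a T>, // ties yielded references to the slice's lifetime
}

impl<'a, T> Iterator for Items<'a, T> {
    type Item = &'a T;
    fn next(&mut self) -> Option<&'a T> {
        if self.ptr == self.end {
            None // represented as a null pointer in Option<&T>
        } else {
            unsafe {
                let old = self.ptr;
                self.ptr = self.ptr.offset(1); // another inbounds GEP each iteration
                Some(&*old)
            }
        }
    }
}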

@pcwalton
Contributor

Sounds like the last option is the easiest. I also like the fact that it's a separate pass, meaning that if we have trouble getting it upstream we can maintain it in our branch for a while.

@zwarich

zwarich commented Jun 26, 2014

I wrote the optimization pass I described. It is able to optimize the first case (with &[uint]), but it is unable to optimize the second case (with Vec<uint>). I have an informal inductive argument for why the pointer should still always be nonnull under LLVM IR's poison value semantics (roughly: if the base pointer were null, every inbounds GEP derived from it would be poison, and a load through one would be undefined behavior, so any execution that reaches the load must have nonnull pointers), but implementing it as code will be trickier.

@zwarich

zwarich commented Jun 27, 2014

Unfortunately, my pass causes the compiled rustc to segfault when compiling liblibc. That might be a pain to track down. The problem could be in my code, or rustc could just be marking a GEP inbounds when it shouldn't.

@zwarich

zwarich commented Jun 28, 2014

I found the issue and put a first cut of my pass up as rust-lang/llvm#13.

@brson
Contributor

brson commented Jun 28, 2014

\o/

@zwarich

zwarich commented Jun 28, 2014

The pass that was landed handles the &[T] case. There are two obvious remaining things to do:

  1. Generalize to the case of multiple null checks involved in a single branch. This shouldn't be too hard.
  2. Handle the case where the base pointer is not known to be nonnull, but everything else in the recurrence is an inbounds GEP. This corresponds to the Vec<T> case. In theory, we could add nonnull metadata to the load (since loads are allowed to have undefined behavior), but applying the LLVM rules inductively actually lets the checks be removed without that.

@zwarich

zwarich commented Jun 28, 2014

Actually, that second point applies to the IR generated by &[T] as well. I must've oversimplified it when I made test cases. I'll try to solve that soon then.

@zwarich

zwarich commented Jun 28, 2014

I have local changes that fix those two items. In order to make this apply to zip(), I also need to track dominating conditions from other blocks. This shouldn't be that much more work, so maybe I'll hold off until I have that working.
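
For illustration, here is the zip() shape in the style of the functions at the top of this issue (a hypothetical example, not code from the thread); the loop now carries two pointer recurrences, so proving both non-null requires conditions established in other blocks:

pub fn zipped(xs: &[uint], ys: &[uint]) -> uint {
    // Two iterators advance in lock-step; each end/null check involves a
    // different pointer chain, so the pass must track dominating conditions.
    for (&x, &y) in xs.iter().zip(ys.iter()) {
        if x + y > 10 { return x + y }
    }
    0
}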

@zwarich zwarich self-assigned this Jun 28, 2014
@zwarich

zwarich commented Jun 29, 2014

Those changes are up as rust-lang/llvm#14. I'll need to add less conservative control dependence checking, and then it should be able to handle arbitrary chaining of zipped iterators.

dotdash added a commit to dotdash/rust that referenced this issue Jul 21, 2014
flip1995 pushed a commit to flip1995/rust that referenced this issue Feb 8, 2024

Add `lint_groups_priority` lint

Warns when a lint group in Cargo.toml's `[lints]` section shares the same priority as a lint. This is in the cargo section but is categorised as `correctness`, so it's on by default; it doesn't call `cargo metadata`, though, and instead parses `Cargo.toml` directly.

The lint should be temporary until rust-lang/cargo#12918 is resolved, but in the meantime this is a common issue to run into.

- rust-lang#11237
- rust-lang#11751
- rust-lang#11830

changelog: Add [`lint_groups_priority`] lint

r? `@flip1995`