-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
slice::rotate_left/right is not simplified for compile-time-constant small rotates #89714
Comments
#92967 should make it better in some cases (if |
@paolobarbolini you might be interested to know that this naive implementation outperforms (at least for u32) both the current std and the std fixed version: pub fn rorate_left_with_reverse(s: &mut [u32], mid: usize) {
assert!(mid <= s.len());
let _ = s[..mid].reverse();
let _ = s[mid..].reverse();
let _ = s.reverse();
} Benchmark with
x86_64
|
I know what's going on. My naive implementation doesn't use ASM of rorate_left_with_reverse (naive implementation)example::rorate_left_with_reverse:
push rbx
mov r8, rsi
sub r8, rdx
jb .LBB1_26
cmp rdx, 2
jb .LBB1_9
mov r9, rdx
shr r9
cmp r9, 1
jne .LBB1_4
xor ecx, ecx
test dl, 2
jne .LBB1_8
jmp .LBB1_9
.LBB1_4:
lea rax, [rdi + 4]
lea r10, [rdi + 4*rdx]
add r10, -4
and r9, -2
neg r9
xor ecx, ecx
.LBB1_5:
mov r11d, dword ptr [rax - 4]
mov ebx, dword ptr [r10 + 4*rcx]
mov dword ptr [rax - 4], ebx
mov dword ptr [r10 + 4*rcx], r11d
mov r11d, dword ptr [rax]
mov ebx, dword ptr [r10 + 4*rcx - 4]
mov dword ptr [rax], ebx
mov dword ptr [r10 + 4*rcx - 4], r11d
add rax, 8
add rcx, -2
cmp r9, rcx
jne .LBB1_5
neg rcx
test dl, 2
je .LBB1_9
.LBB1_8:
mov rax, rcx
not rax
add rax, rdx
mov r9d, dword ptr [rdi + 4*rcx]
mov ebx, dword ptr [rdi + 4*rax]
mov dword ptr [rdi + 4*rcx], ebx
mov dword ptr [rdi + 4*rax], r9d
.LBB1_9:
cmp r8, 2
jb .LBB1_17
mov r9, r8
shr r9
cmp r9, 1
jne .LBB1_12
xor ecx, ecx
test r8b, 2
jne .LBB1_16
jmp .LBB1_17
.LBB1_12:
lea rax, [rdi + 4*rdx]
add rax, 4
lea r10, [rdi + 4*rsi]
add r10, -4
and r9, -2
neg r9
xor ecx, ecx
.LBB1_13:
mov r11d, dword ptr [rax - 4]
mov ebx, dword ptr [r10 + 4*rcx]
mov dword ptr [rax - 4], ebx
mov dword ptr [r10 + 4*rcx], r11d
mov r11d, dword ptr [rax]
mov ebx, dword ptr [r10 + 4*rcx - 4]
mov dword ptr [rax], ebx
mov dword ptr [r10 + 4*rcx - 4], r11d
add rax, 8
add rcx, -2
cmp r9, rcx
jne .LBB1_13
neg rcx
test r8b, 2
je .LBB1_17
.LBB1_16:
lea rax, [rdi + 4*rdx]
mov rdx, rcx
not rdx
add r8, rdx
mov edx, dword ptr [rax + 4*rcx]
mov ebx, dword ptr [rax + 4*r8]
mov dword ptr [rax + 4*rcx], ebx
mov dword ptr [rax + 4*r8], edx
.LBB1_17:
cmp rsi, 2
jb .LBB1_25
mov r8, rsi
shr r8
cmp r8, 1
jne .LBB1_20
xor eax, eax
test sil, 2
jne .LBB1_24
jmp .LBB1_25
.LBB1_20:
lea rdx, [rdi + 4]
lea rcx, [rdi + 4*rsi]
add rcx, -4
and r8, -2
neg r8
xor eax, eax
.LBB1_21:
mov r9d, dword ptr [rdx - 4]
mov ebx, dword ptr [rcx + 4*rax]
mov dword ptr [rdx - 4], ebx
mov dword ptr [rcx + 4*rax], r9d
mov r9d, dword ptr [rdx]
mov ebx, dword ptr [rcx + 4*rax - 4]
mov dword ptr [rdx], ebx
mov dword ptr [rcx + 4*rax - 4], r9d
add rdx, 8
add rax, -2
cmp r8, rax
jne .LBB1_21
neg rax
test sil, 2
je .LBB1_25
.LBB1_24:
mov rcx, rax
not rcx
add rcx, rsi
mov edx, dword ptr [rdi + 4*rax]
mov esi, dword ptr [rdi + 4*rcx]
mov dword ptr [rdi + 4*rax], esi
mov dword ptr [rdi + 4*rcx], edx
.LBB1_25:
pop rbx
ret
.LBB1_26:
lea rdi, [rip + .L__unnamed_1]
lea rdx, [rip + .L__unnamed_2]
mov esi, 32
call qword ptr [rip + core::panicking::panic@GOTPCREL]
ud2 ASM of rotate_u8_right1 (fixed std version)example::rotate_u8_right1:
push rbp
push r14
push rbx
test rsi, rsi
je .LBB0_11
mov rdx, rsi
add rdx, -1
je .LBB0_10
mov r14, rdi
cmp rsi, 23
ja .LBB0_8
mov bpl, byte ptr [r14]
mov eax, 1
mov ecx, 1
sub rcx, rsi
xor esi, esi
mov edi, 1
jmp .LBB0_4
.LBB0_5:
add rdi, 1
.LBB0_4:
mov ebx, ebp
movzx ebp, byte ptr [r14 + rdi]
mov byte ptr [r14 + rdi], bl
cmp rdi, rdx
jb .LBB0_5
add rdi, rcx
je .LBB0_9
cmp rdi, rax
cmovb rax, rsi
cmovb rdi, rsi
jmp .LBB0_4
.LBB0_8:
lea rdi, [r14 + 1]
mov bpl, byte ptr [r14 + rdx]
mov rsi, r14
call qword ptr [rip + memmove@GOTPCREL]
.LBB0_9:
mov byte ptr [r14], bpl
.LBB0_10:
pop rbx
pop r14
pop rbp
ret
.LBB0_11:
lea rdi, [rip + .L__unnamed_1]
lea rdx, [rip + .L__unnamed_2]
mov esi, 34
call qword ptr [rip + core::panicking::panic@GOTPCREL]
ud2 |
I have a crate where the number of items to /// Same as `array[..=usize::from(n)].rotate_right(1);`
pub fn rotate_right1_fastest(array: &mut [u8; 256], n: u8) {
debug_assert!(n < 6);
let n = usize::from(n);
let b = array[n];
array[5 - usize::from(n < 5)] = array[4];
array[4 - usize::from(n < 4)] = array[3];
array[3 - usize::from(n < 3)] = array[2];
array[2 - usize::from(n < 2)] = array[1];
array[1 - usize::from(n < 1)] = array[0];
array[0] = b;
} ASMexample::rotate_right1_fastest:
movzx eax, sil
mov al, byte ptr [rdi + rax]
mov cl, byte ptr [rdi + 4]
cmp sil, 5
mov rdx, rdi
sbb rdx, 0
mov byte ptr [rdx + 5], cl
mov cl, byte ptr [rdi + 3]
cmp sil, 4
mov rdx, rdi
sbb rdx, 0
mov byte ptr [rdx + 4], cl
mov cl, byte ptr [rdi + 2]
cmp sil, 3
mov rdx, rdi
sbb rdx, 0
mov byte ptr [rdx + 3], cl
cmp sil, 2
mov rcx, rdi
sbb rcx, 0
mov dl, byte ptr [rdi + 1]
mov byte ptr [rcx + 2], dl
mov cl, byte ptr [rdi]
cmp sil, 1
mov rdx, rdi
sbb rdx, -1
mov byte ptr [rdx], cl
mov byte ptr [rdi], al
ret |
@Urgau Note that That said, I did recently rewrite Actually, maybe the |
@scottmcm Effectively, I didn't think of that. Current std:
Nearly original (ManuallyDrop + nits):
Nearly original + with original ptr_swap_n:
Naive implementation:
Conclusion, unsurprisingly the current implementation seems to be overall better. |
@rustbot label +I-slow |
There is another problem when the size of the type is big like for example struct BallastType {
ballast: [u64; 32], // the compiler keeps this, even if unused
value: usize,
} The algorithm will switch immediately to the I think the idea to use the I also noticed that when the I think, the |
The current (2021-10-08 nightly) and recent implementations of
slice::rotate_right
produce 15x more instructions than a manual special-case implementation for a small rotate by a compile-time constant offset. This results in a 5-50% performance difference, depending on the length of the slice in question. Godbolt demonstration: https://godbolt.org/z/a4YdqP1rn
I think the current implementation doesn't optimize well because it is a
loop {
surrounding three rotate algorithms with their own predicates, the first of which is almost never known at compile time:and it is the second algorithm that we want to run in this case:
Simply pasting a copy of the second algorithm above the loop removes the performance difference and most but not all of the code size difference.
cc @scottmcm I've finally gotten around to reporting this after months
The text was updated successfully, but these errors were encountered: