internal: use more core::hint::assert_unchecked #6132
Conversation
assert_unchecked to BoundTupleIterator and BorrowedTupleIterator len()assert_unchecked to BoundTupleIterator and BorrowedTupleIterator len()
assert_unchecked to BoundTupleIterator and BorrowedTupleIterator len()core::hint::assert_unchecked
|
Thanks for this PR. I am undecided how I feel about this. On the one hand, squeezing out efficiency and performance is always desirable. On the other hand, this is
Accordingly I'd prefer we were absolutely certain these make sense before merging. PyO3 has enough I'd value other maintainers' opinions on this. |
|
To clarify my position, I think I have a slight leaning that I'd prefer not to have these assertions, however can be persuaded to keep them if we can demonstrate they have a meaningful effect on final codegen. |
|
+1 to @davidhewitt Such assertions feels very much like "last percent of performance squeezing" and we should make sure we have done all the simpler things first and have proper benchmarking. Also I see often unsafe { core::hint::assert_unchecked(self.index <= self.length) };
self.length.saturating_sub(self.index)how is it better than just the plain self.length - self.index? |
|
I used cargo asm cargo install cargo-show-asmCargo.toml[package]
name = "pyo3_test"
version = "0.1.0"
edition = "2024"
[dependencies]
jiff = "0.2.28"
pyo3 = { version = "0.29.0", features = ["jiff-02"]}src/main.rs#![allow(dead_code)]
use jiff::civil::DateTime;
use jiff::tz::TimeZone;
use pyo3::types::PyDateTime;
use pyo3::{Bound, IntoPyObject, PyResult, Python};
#[inline(never)]
#[unsafe(no_mangle)]
fn datetime_to_pydatetime<'py>(
py: Python<'py>,
datetime: DateTime,
fold: bool,
timezone: Option<&TimeZone>,
) -> PyResult<Bound<'py, PyDateTime>> {
PyDateTime::new_with_fold(
py,
datetime.year().into(),
datetime.month().try_into()?,
datetime.day().try_into()?,
datetime.hour().try_into()?,
datetime.minute().try_into()?,
datetime.second().try_into()?,
(datetime.subsec_nanosecond() / 1000).try_into()?,
timezone
.map(|tz| tz.into_pyobject(py))
.transpose()?
.as_ref(),
fold,
)
}
#[inline(never)]
#[unsafe(no_mangle)]
fn datetime_to_pydatetime_assert_unchecked<'py>(
py: Python<'py>,
datetime: DateTime,
fold: bool,
timezone: Option<&TimeZone>,
) -> PyResult<Bound<'py, PyDateTime>> {
let micros = datetime.subsec_nanosecond() / 1000;
// SAFETY: `subsec_nanosecond()` [0, 999_999_999], after / 1000 always non-negative
unsafe { core::hint::assert_unchecked(micros >= 0) };
PyDateTime::new_with_fold(
py,
datetime.year().into(),
datetime.month().try_into()?,
datetime.day().try_into()?,
datetime.hour().try_into()?,
datetime.minute().try_into()?,
datetime.second().try_into()?,
micros as u32,
timezone
.map(|tz| tz.into_pyobject(py))
.transpose()?
.as_ref(),
fold,
)
}
fn main() {}Output❯ cargo asm
Finished `release` profile [optimized] target(s) in 0.10s
Try one of those by name or a sequence number
0 "<std::rt::lang_start<()>::{closure#0} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0}" [23]
1 "core::ptr::drop_glue::<core::option::Option<pyo3::instance::Bound<pyo3::types::datetime::PyTzInfo>>>" [36]
2 "datetime_to_pydatetime" [271]
3 "datetime_to_pydatetime_assert_unchecked" [264]
4 "main" [19]
5 "pyo3_test::main" [7]
6 "std::rt::lang_start::<()>" [31]
7 "std::rt::lang_start::<()>::{closure#0}" [18]
8 "std::sys::backtrace::__rust_begin_short_backtrace::<fn(), ()>" [28]
##################################
❯ cargo asm 2
Finished `release` profile [optimized] target(s) in 0.04s
.section .text,"xr",one_only,datetime_to_pydatetime,unique,6
.globl datetime_to_pydatetime
.p2align 4
datetime_to_pydatetime:
.cv_func_id 14
.seh_proc datetime_to_pydatetime
.seh_handler __CxxFrameHandler3, @unwind, @except
push rbp
.seh_pushreg rbp
push r15
.seh_pushreg r15
push r14
.seh_pushreg r14
push r13
.seh_pushreg r13
push r12
.seh_pushreg r12
push rsi
.seh_pushreg rsi
push rdi
.seh_pushreg rdi
push rbx
.seh_pushreg rbx
sub rsp, 168
.seh_stackalloc 168
lea rbp, [rsp + 128]
.seh_setframe rbp, 128
.seh_endprologue
mov qword ptr [rbp + 32], -2
mov rsi, rcx
movzx ebx, byte ptr [rdx + 10]
.cv_inline_site_id 15 within 14 inlined_at 8 19 0
.cv_inline_site_id 16 within 15 inlined_at 10 819 0
test bl, bl
js .LBB6_1
movzx edi, byte ptr [rdx + 11]
.cv_inline_site_id 17 within 14 inlined_at 8 20 0
.cv_inline_site_id 18 within 17 inlined_at 10 819 0
test dil, dil
js .LBB6_1
movzx r15d, byte ptr [rdx + 4]
.cv_inline_site_id 19 within 14 inlined_at 8 21 0
.cv_inline_site_id 20 within 19 inlined_at 10 819 0
test r15b, r15b
js .LBB6_1
movzx r12d, byte ptr [rdx + 5]
.cv_inline_site_id 21 within 14 inlined_at 8 22 0
.cv_inline_site_id 22 within 21 inlined_at 10 819 0
test r12b, r12b
js .LBB6_1
movzx r13d, byte ptr [rdx + 6]
.cv_inline_site_id 23 within 14 inlined_at 8 23 0
.cv_inline_site_id 24 within 23 inlined_at 10 819 0
test r13b, r13b
js .LBB6_1
movsxd r14, dword ptr [rdx]
cmp r14, -1000
jg .LBB6_7
.LBB6_1:
.cv_inline_site_id 25 within 14 inlined_at 8 0 0
lea rcx, [rsi + 8]
mov dl, 3
call <pyo3::err::PyErr as core::convert::From<core::num::error::TryFromIntError>>::from
.LBB6_17:
mov qword ptr [rsi], 1
.LBB6_18:
mov rax, rsi
.seh_startepilogue
add rsp, 168
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
pop rbp
.seh_endepilogue
ret
.LBB6_7:
.cv_inline_site_id 26 within 14 inlined_at 8 26 0
movsx edx, word ptr [rdx + 8]
test r9, r9
je .LBB6_8
.cv_inline_site_id 27 within 26 inlined_at 12 1162 0
mov dword ptr [rbp + 20], edx
mov byte ptr [rbp + 24], r8b
lea rcx, [rbp - 48]
mov rdx, r9
call <&jiff::tz::timezone::TimeZone as pyo3::conversion::IntoPyObject>::into_pyobject
.cv_inline_site_id 28 within 14 inlined_at 8 27 0
mov rax, qword ptr [rbp - 48]
cmp rax, -1
je .LBB6_10
test rax, rax
movzx r8d, byte ptr [rbp + 24]
jne .LBB6_16
mov r9, qword ptr [rbp - 40]
jmp .LBB6_13
.LBB6_8:
xor r9d, r9d
jmp .LBB6_14
.LBB6_10:
xor r9d, r9d
movzx r8d, byte ptr [rbp + 24]
.LBB6_13:
mov edx, dword ptr [rbp + 20]
.LBB6_14:
imul rax, r14, 274877907
mov rcx, rax
shr rcx, 63
sar rax, 38
add eax, ecx
mov qword ptr [rbp + 8], r9
.cv_inline_site_id 29 within 14 inlined_at 8 28 0
test r9, r9
lea rcx, [rbp + 8]
mov qword ptr [rbp + 24], r9
cmove rcx, r9
mov byte ptr [rsp + 72], r8b
mov qword ptr [rsp + 64], rcx
mov dword ptr [rsp + 56], eax
mov byte ptr [rsp + 48], r13b
mov byte ptr [rsp + 40], r12b
mov byte ptr [rsp + 32], r15b
mov rcx, rsi
mov r8d, ebx
mov r9d, edi
call <pyo3::types::datetime::PyDateTime>::new_with_fold
nop
mov rcx, qword ptr [rbp + 24]
call core::ptr::drop_glue::<core::option::Option<pyo3::instance::Bound<pyo3::types::datetime::PyTzInfo>>>
jmp .LBB6_18
.LBB6_16:
mov rax, qword ptr [rbp - 40]
mov rcx, qword ptr [rbp]
mov qword ptr [rsi + 48], rcx
movups xmm0, xmmword ptr [rbp - 16]
movups xmmword ptr [rsi + 32], xmm0
movups xmm0, xmmword ptr [rbp - 32]
movups xmmword ptr [rsi + 16], xmm0
.cv_inline_site_id 30 within 14 inlined_at 8 25 0
mov qword ptr [rsi + 8], rax
jmp .LBB6_17
.seh_handlerdata
.long $cppxdata$datetime_to_pydatetime@IMGREL
.section .text,"xr",one_only,datetime_to_pydatetime,unique,6
.seh_endproc
.def "?dtor$19@?0?datetime_to_pydatetime@4HA";
.scl 3;
.type 32;
.endef
.p2align 4
"?dtor$19@?0?datetime_to_pydatetime@4HA":
.seh_proc "?dtor$19@?0?datetime_to_pydatetime@4HA"
mov qword ptr [rsp + 16], rdx
push rbp
.seh_pushreg rbp
push r15
.seh_pushreg r15
push r14
.seh_pushreg r14
push r13
.seh_pushreg r13
push r12
.seh_pushreg r12
push rsi
.seh_pushreg rsi
push rdi
.seh_pushreg rdi
push rbx
.seh_pushreg rbx
sub rsp, 88
.seh_stackalloc 88
lea rbp, [rdx + 128]
.seh_endprologue
mov rcx, qword ptr [rbp + 24]
call core::ptr::drop_glue::<core::option::Option<pyo3::instance::Bound<pyo3::types::datetime::PyTzInfo>>>
nop
.seh_startepilogue
add rsp, 88
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
pop rbp
.seh_endepilogue
ret
#########################################################
❯ cargo asm 3
Finished `release` profile [optimized] target(s) in 0.04s
.section .text,"xr",one_only,datetime_to_pydatetime_assert_unchecked,unique,7
.globl datetime_to_pydatetime_assert_unchecked
.p2align 4
datetime_to_pydatetime_assert_unchecked:
.cv_func_id 31
.seh_proc datetime_to_pydatetime_assert_unchecked
.seh_handler __CxxFrameHandler3, @unwind, @except
push rbp
.seh_pushreg rbp
push r15
.seh_pushreg r15
push r14
.seh_pushreg r14
push r13
.seh_pushreg r13
push r12
.seh_pushreg r12
push rsi
.seh_pushreg rsi
push rdi
.seh_pushreg rdi
push rbx
.seh_pushreg rbx
sub rsp, 168
.seh_stackalloc 168
lea rbp, [rsp + 128]
.seh_setframe rbp, 128
.seh_endprologue
mov qword ptr [rbp + 32], -2
mov rsi, rcx
movzx ebx, byte ptr [rdx + 10]
.cv_inline_site_id 32 within 31 inlined_at 8 47 0
.cv_inline_site_id 33 within 32 inlined_at 10 819 0
test bl, bl
js .LBB7_1
movzx edi, byte ptr [rdx + 11]
.cv_inline_site_id 34 within 31 inlined_at 8 48 0
.cv_inline_site_id 35 within 34 inlined_at 10 819 0
test dil, dil
js .LBB7_1
movzx r15d, byte ptr [rdx + 4]
.cv_inline_site_id 36 within 31 inlined_at 8 49 0
.cv_inline_site_id 37 within 36 inlined_at 10 819 0
test r15b, r15b
js .LBB7_1
movzx r12d, byte ptr [rdx + 5]
.cv_inline_site_id 38 within 31 inlined_at 8 50 0
.cv_inline_site_id 39 within 38 inlined_at 10 819 0
test r12b, r12b
js .LBB7_1
movzx r13d, byte ptr [rdx + 6]
.cv_inline_site_id 40 within 31 inlined_at 8 51 0
.cv_inline_site_id 41 within 40 inlined_at 10 819 0
test r13b, r13b
js .LBB7_1
.cv_inline_site_id 42 within 31 inlined_at 8 54 0
movsxd rax, dword ptr [rdx]
movsx edx, word ptr [rdx + 8]
test r9, r9
je .LBB7_7
.cv_inline_site_id 43 within 42 inlined_at 12 1162 0
mov qword ptr [rbp + 16], rax
mov dword ptr [rbp + 24], edx
mov r14d, r8d
lea rcx, [rbp - 48]
mov rdx, r9
call <&jiff::tz::timezone::TimeZone as pyo3::conversion::IntoPyObject>::into_pyobject
.cv_inline_site_id 44 within 31 inlined_at 8 55 0
mov rax, qword ptr [rbp - 48]
cmp rax, -1
je .LBB7_11
test rax, rax
jne .LBB7_15
mov r8d, r14d
mov r9, qword ptr [rbp - 40]
jmp .LBB7_12
.LBB7_1:
.cv_inline_site_id 45 within 31 inlined_at 8 0 0
lea rcx, [rsi + 8]
mov dl, 3
call <pyo3::err::PyErr as core::convert::From<core::num::error::TryFromIntError>>::from
.LBB7_16:
mov qword ptr [rsi], 1
.LBB7_17:
mov rax, rsi
.seh_startepilogue
add rsp, 168
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
pop rbp
.seh_endepilogue
ret
.LBB7_7:
xor r9d, r9d
jmp .LBB7_8
.LBB7_11:
xor r9d, r9d
mov r8d, r14d
.LBB7_12:
mov edx, dword ptr [rbp + 24]
mov rax, qword ptr [rbp + 16]
.LBB7_8:
imul rax, rax, 274877907
mov rcx, rax
shr rcx, 63
sar rax, 38
add eax, ecx
mov qword ptr [rbp + 8], r9
.cv_inline_site_id 46 within 31 inlined_at 8 56 0
test r9, r9
lea rcx, [rbp + 8]
mov qword ptr [rbp + 24], r9
cmove rcx, r9
mov byte ptr [rsp + 72], r8b
mov qword ptr [rsp + 64], rcx
mov dword ptr [rsp + 56], eax
mov byte ptr [rsp + 48], r13b
mov byte ptr [rsp + 40], r12b
mov byte ptr [rsp + 32], r15b
mov rcx, rsi
mov r8d, ebx
mov r9d, edi
call <pyo3::types::datetime::PyDateTime>::new_with_fold
nop
mov rcx, qword ptr [rbp + 24]
call core::ptr::drop_glue::<core::option::Option<pyo3::instance::Bound<pyo3::types::datetime::PyTzInfo>>>
jmp .LBB7_17
.LBB7_15:
mov rax, qword ptr [rbp - 40]
mov rcx, qword ptr [rbp]
mov qword ptr [rsi + 48], rcx
movups xmm0, xmmword ptr [rbp - 16]
movups xmmword ptr [rsi + 32], xmm0
movups xmm0, xmmword ptr [rbp - 32]
movups xmmword ptr [rsi + 16], xmm0
.cv_inline_site_id 47 within 31 inlined_at 8 53 0
mov qword ptr [rsi + 8], rax
jmp .LBB7_16
.seh_handlerdata
.long $cppxdata$datetime_to_pydatetime_assert_unchecked@IMGREL
.section .text,"xr",one_only,datetime_to_pydatetime_assert_unchecked,unique,7
.seh_endproc
.def "?dtor$18@?0?datetime_to_pydatetime_assert_unchecked@4HA";
.scl 3;
.type 32;
.endef
.p2align 4
"?dtor$18@?0?datetime_to_pydatetime_assert_unchecked@4HA":
.seh_proc "?dtor$18@?0?datetime_to_pydatetime_assert_unchecked@4HA"
mov qword ptr [rsp + 16], rdx
push rbp
.seh_pushreg rbp
push r15
.seh_pushreg r15
push r14
.seh_pushreg r14
push r13
.seh_pushreg r13
push r12
.seh_pushreg r12
push rsi
.seh_pushreg rsi
push rdi
.seh_pushreg rdi
push rbx
.seh_pushreg rbx
sub rsp, 88
.seh_stackalloc 88
lea rbp, [rdx + 128]
.seh_endprologue
mov rcx, qword ptr [rbp + 24]
call core::ptr::drop_glue::<core::option::Option<pyo3::instance::Bound<pyo3::types::datetime::PyTzInfo>>>
nop
.seh_startepilogue
add rsp, 88
pop rbx
pop rdi
pop rsi
pop r12
pop r13
pop r14
pop r15
pop rbp
.seh_endepilogue
ret
movsxd r14, dword ptr [rdx]
cmp r14, -1000
jg .LBB6_7In movsxd rax, dword ptr [rdx]
movsx edx, word ptr [rdx + 8]Similarly, number of instructions is reduced from 271 to 264 |
For
|
I searched GitHub within pyo3 but couldn't find any such code
|
@chirizxc It's from your diff like |
Looking at the results again at https://godbolt.org, it seems that Instead of fn len(&self) -> usize {
self.length.0.saturating_sub(self.index.0)
}
pub fn len_pr2(&self) -> usize {
self.length.0 - self.index.0
}len:
mov rcx, qword ptr [rdi + 16]
xor eax, eax
sub rcx, qword ptr [rdi + 8]
cmovae rax, rcx
ret
len_pr2:
mov rax, qword ptr [rdi + 16]
sub rax, qword ptr [rdi + 8]
retBut could the invariant In If the list is reduced from the outside (for example, by another thread in free-threaded Python), then |
Sounds like something possible. It's why I would tend to prefer keeping the current code as it is. |
Icxolu
left a comment
There was a problem hiding this comment.
Are the assertion hints actually making any differences here? I quickly tested your examples on my machine and got no difference in the asm between having the hint and not having it (and having a debug_assertion instead for that matter). So I believe the difference you observed is simply caused by using an integer cast now (which is not checked in release mode) instead of going through TryInto which does check.
So I currently don't see any justification to introduce more unsafe here. If we really wanted to, we could keep the casts and add debug_assertions instead, but I'm not really convinced that it makes a measurable difference. In general I would prefer to actually have benchmarks that show the improvement of an optimization. We would also need at least some documentation everywhere that we do these optimizations so that we know in the future why we did it this way and it's not getting lost in maintenance. All in all I would also agree @davidhewitt and @Tpt that this is not sufficiently justified.
No description provided.