In a previous post I guessed that 91 bytes was close to the minimum size for implementing signed and unsigned saturating 32-bit addition and subtraction on x86. A commenter rightly pointed out that it should be possible to do better. Attached is my best shot: 83 bytes. Given the crappy x86 calling convention I’m going to guess it’s hard to do much better unless the saturating SSE instructions offer a way. Or can the branches in the signed functions be avoided? It seems like it, but I couldn’t figure out how.
# unsigned saturating addition (cdecl: arguments at 4(%esp) and 8(%esp),
# result in %eax); clamps to 2**32 - 1 when the addition carries out
sat_unsigned_add:
movl 4(%esp), %eax # eax = augend
xor %edx, %edx
not %edx # edx = ~0 = 2**32 - 1 (the clamp value)
addl 8(%esp), %eax # eax = augend + addend, CF = carry out
cmovb %edx, %eax # on carry replace the wrapped sum with the clamp
ret
# unsigned saturating subtraction; clamps to 0 when a borrow occurs
sat_unsigned_sub:
movl 4(%esp), %eax # eax = minuend
xor %edx, %edx # edx = 0 (the clamp value)
subl 8(%esp), %eax # eax = minuend - subtrahend, CF = borrow
cmovb %edx, %eax # on borrow replace the wrapped difference with 0
ret
# signed saturating addition; clamps to 2**31 - 1 respectively -2**31
# on overflow (both CMOVcc consume SF/OF left by the ADDL)
sat_signed_add:
movl 4(%esp), %eax # eax = augend
movl $2147483647, %edx # edx = 2**31 - 1
movl $-2147483648, %ecx # ecx = -2**31
addl 8(%esp), %eax # eax = sum, OF = signed overflow
jno out1 # no overflow: return the sum as is
cmovg %edx, %eax # SF = OF: positive overflow, clamp to 2**31 - 1
cmovl %ecx, %eax # SF <> OF: negative overflow, clamp to -2**31
out1: ret
# signed saturating subtraction; clamps to 2**31 - 1 respectively
# -2**31 on overflow (both CMOVcc consume SF/OF left by the SUBL)
sat_signed_sub:
movl 4(%esp), %eax # eax = minuend
movl $2147483647, %edx # edx = 2**31 - 1
movl $-2147483648, %ecx # ecx = -2**31
subl 8(%esp), %eax # eax = difference, OF = signed overflow
jno out2 # no overflow: return the difference as is
cmovg %edx, %eax # SF = OF: positive overflow, clamp to 2**31 - 1
cmovl %ecx, %eax # SF <> OF: negative overflow, clamp to -2**31
out2: ret
27 instructions in 84 bytes – not 83 bytes, as John Regehr
stated.
Note: due to the (ab)use of conditional move
instructions CMOVcc
introduced with the
Pentium Pro processor, this code does
not run on the original i386 32-bit
processor!
The 4 highlighted instructions common to both signed saturating
addition and subtraction can of course be relocated, thus saving 4
instructions and 16 bytes for a total of 23 instructions in 68
bytes; additionally the directions of both conditional branch
instructions Jcc
suit the branch predictor of
super-scalar processors better:
# signed saturating addition and subtraction with the 4 clamping
# instructions shared: both entry points reach 'common' only in the
# (rare) overflow case; MOVL does not alter the flags, so the CMOVcc
# pair still sees SF/OF from the ADDL respectively SUBL above
sat_signed_add:
movl 4(%esp), %eax # eax = augend
addl 8(%esp), %eax # eax = sum, OF = signed overflow
jo common # overflow: clamp
return: ret
sat_signed_sub:
movl 4(%esp), %eax # eax = minuend
subl 8(%esp), %eax # eax = difference, OF = signed overflow
jno return # no overflow: done
common: movl $2147483647, %edx # edx = 2**31 - 1
movl $-2147483648, %ecx # ecx = -2**31
cmovg %edx, %eax # SF = OF: positive overflow, clamp to 2**31 - 1
cmovl %ecx, %eax # SF <> OF: negative overflow, clamp to -2**31
ret
Finally the branch-free implementation, also using
a total of 23 instructions in 68 bytes:
# branch-free signed saturating addition: on overflow the wrapped sum
# has the inverted sign, so the clamp can be derived from it without
# a branch (CLTD and LEA leave the flags from the ADDL intact)
sat_signed_add:
movl 4(%esp), %eax # eax = augend
addl 8(%esp), %eax # eax = sum, OF = signed overflow
cltd # edx = (sum < 0) ? ~0 : 0
lea 2147483648(%edx), %edx # edx = (sum < 0) ? 2**31 - 1 : -2**31
cmovo %edx, %eax # on overflow replace the sum with the clamp
ret
# branch-free signed saturating subtraction, same technique as the
# addition above
sat_signed_sub:
movl 4(%esp), %eax # eax = minuend
subl 8(%esp), %eax # eax = difference, OF = signed overflow
cltd # edx = (difference < 0) ? ~0 : 0
lea 2147483648(%edx), %edx # edx = (difference < 0) ? 2**31 - 1 : -2**31
cmovo %edx, %eax # on overflow replace the difference with the clamp
ret
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
# Common "cdecl" calling and naming convention for i386 platform:
# - arguments are pushed on stack in reverse order (from right to left),
# 4-byte aligned;
# - 64-bit integer arguments are passed as pair of 32-bit integer arguments,
# low part below high part;
# - 64-bit integer result is returned in registers EAX (low part) and
# EDX (high part);
# - 32-bit integer or pointer result is returned in register EAX;
# - registers EAX, ECX and EDX are volatile and can be clobbered;
# - registers EBX, ESP, EBP, ESI and EDI must be preserved;
# - function names are prefixed with an underscore.
.file "i386.s"
.arch generic32
.code32
.intel_syntax noprefix
.text
# [esp+8] = addend
# [esp+4] = augend
# unsigned saturating addition: clamps to 2**32 - 1 on carry
sat_unsigned_add:
mov eax, [esp+4] # eax = augend
add eax, [esp+8] # eax = augend + addend, CF = carry out
sbb ecx, ecx # ecx = carry ? ~0 : 0
or eax, ecx # eax = carry ? 2**32 - 1 : sum
ret
.size sat_unsigned_add, .-sat_unsigned_add
.type sat_unsigned_add, @function
.global sat_unsigned_add
# [esp+8] = divisor
# [esp+4] = dividend
# NOTE: an unsigned quotient can never exceed the dividend, so no
# clamping is necessary; division by zero still raises the exception
sat_unsigned_div:
xor edx, edx # edx:eax = zero-extended dividend
mov eax, [esp+4]
div dword ptr [esp+8] # eax = quotient, edx = remainder
ret
.size sat_unsigned_div, .-sat_unsigned_div
.type sat_unsigned_div, @function
.global sat_unsigned_div
# [esp+8] = divisor
# [esp+4] = dividend
# NOTE: an unsigned remainder can never overflow, so no clamping is
# necessary; division by zero still raises the division exception
sat_unsigned_mod:
xor edx, edx # edx:eax = zero-extended dividend
mov eax, [esp+4]
div dword ptr [esp+8] # eax = quotient, edx = remainder
mov eax, edx # return the remainder, not the quotient
ret
.size sat_unsigned_mod, .-sat_unsigned_mod
.type sat_unsigned_mod, @function
.global sat_unsigned_mod
# [esp+8] = multiplier
# [esp+4] = multiplicand
# unsigned saturating multiplication: MUL sets CF (and OF) exactly
# when the high dword of the product (EDX) is non-zero
sat_unsigned_mul:
mov eax, [esp+4] # eax = multiplicand
mul dword ptr [esp+8] # edx:eax = product, CF = (product >= 2**32)
sbb ecx, ecx # ecx = overflow ? ~0 : 0
or eax, ecx # eax = overflow ? 2**32 - 1 : product
ret
.size sat_unsigned_mul, .-sat_unsigned_mul
.type sat_unsigned_mul, @function
.global sat_unsigned_mul
# [esp+8] = subtrahend
# [esp+4] = minuend
# unsigned saturating subtraction: clamps to 0 on borrow
sat_unsigned_sub:
mov eax, [esp+4] # eax = minuend
sub eax, [esp+8] # eax = difference, CF = borrow
cmc # CF = no borrow
sbb ecx, ecx # ecx = borrow ? 0 : ~0
and eax, ecx # eax = borrow ? 0 : difference
ret
.size sat_unsigned_sub, .-sat_unsigned_sub
.type sat_unsigned_sub, @function
.global sat_unsigned_sub
# [esp+8] = addend
# [esp+4] = augend
# signed saturating addition; shares the clamping code and the RET
# with sat_signed_sub below
sat_signed_add:
mov eax, [esp+4] # eax = augend
add eax, [esp+8] # eax = sum, OF = signed overflow
jo .Loverflow # overflow: clamp
.Lreturn1:
ret
# [esp+8] = subtrahend
# [esp+4] = minuend
# signed saturating subtraction; falls through into the shared
# clamping code on overflow
sat_signed_sub:
mov eax, [esp+4] # eax = minuend
sub eax, [esp+8] # eax = difference, OF = signed overflow
jno .Lreturn1 # no overflow: done
.Loverflow:
# on overflow the wrapped result's sign is the inverse of the true
# result's sign, so the clamp is derived from the wrapped value
.if 0
cdq # edx = (wrapped result < 0) ? ~0 : 0
mov eax, 2147483648
xor eax, edx # eax = (wrapped < 0) ? 2**31 - 1 : -2**31
.else
sar eax, 31 # eax = (wrapped result < 0) ? ~0 : 0
add eax, 2147483648 # eax = (wrapped < 0) ? 2**31 - 1 : -2**31
.endif
.Lreturn2:
ret
.size sat_signed_add, .-sat_signed_add
.type sat_signed_add, @function
.global sat_signed_add
.size sat_signed_sub, .-sat_signed_sub+1
.type sat_signed_sub, @function
.global sat_signed_sub
# [esp+8] = multiplier
# [esp+4] = multiplicand
# signed saturating multiplication; reuses .Lreturn2 (the RET above)
# in the common no-overflow case
sat_signed_mul:
mov eax, [esp+4] # eax = multiplicand
imul eax, [esp+8] # eax = product % 2**32, OF = overflow
jno .Lreturn2 # no overflow: done
mov eax, [esp+4]
xor eax, [esp+8] # eax = multiplicand ^ multiplier: its sign
# is the sign of the (true) product
.if 0
cdq # edx = (product < 0) ? ~0 : 0
mov eax, 2147483647
xor eax, edx # eax = (product < 0) ? -2**31 : 2**31 - 1
.else
sar eax, 31 # eax = (product < 0) ? ~0 : 0
xor eax, 2147483647 # eax = (product < 0) ? -2**31 : 2**31 - 1
.endif
ret
.size sat_signed_mul, .-sat_signed_mul+1
.type sat_signed_mul, @function
.global sat_signed_mul
# [esp+8] = divisor
# [esp+4] = dividend
# signed saturating division: -2**31 / -1 yields 2**31 - 1 instead of
# raising the division exception; division by zero still traps
sat_signed_div:
mov ecx, [esp+8] # ecx = divisor
mov eax, [esp+4] # eax = dividend
cmp ecx, -1
je .Lspecial # divisor = -1?
cdq # edx:eax = sign-extended dividend
idiv ecx # eax = quotient, edx = remainder
.Lreturn3:
ret
.Lspecial:
neg eax # eax = -dividend, OF = (dividend = -2**31)
jno .Lreturn3 # no overflow: -dividend is the quotient
not eax # eax = ~(-2**31) = 2**31 - 1 = clamped quotient
ret
.size sat_signed_div, .-sat_signed_div
.type sat_signed_div, @function
.global sat_signed_div
# [esp+8] = divisor
# [esp+4] = dividend
# signed saturating remainder: -2**31 % -1 is defined as 0 instead of
# raising the division exception; division by zero still traps
sat_signed_mod:
mov ecx, [esp+8] # ecx = divisor
mov eax, ecx
inc eax # eax = divisor + 1
jz .Ltrivial # divisor = -1? remainder is 0 (EAX already is)
mov eax, [esp+4]
cdq # edx:eax = sign-extended dividend
idiv ecx # eax = quotient, edx = remainder
mov eax, edx # return the remainder, not the quotient
.Ltrivial:
ret
.size sat_signed_mod, .-sat_signed_mod
.type sat_signed_mod, @function
.global sat_signed_mod
.end
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
.file "i686.s"
.code32
.intel_syntax noprefix
.text
# [esp+8] = addend
# [esp+4] = augend
# branch-free signed saturating addition: CDQ and LEA do not alter
# the flags, so CMOVO still consumes OF from the ADD
sat_signed_add:
mov eax, [esp+4] # eax = augend
add eax, [esp+8] # eax = sum, OF = signed overflow
cdq # edx = (sum < 0) ? ~0 : 0
lea edx, [edx+2147483648] # edx = (sum < 0) ? 2**31 - 1 : -2**31
cmovo eax, edx # on overflow replace the sum with the clamp
ret
.size sat_signed_add, .-sat_signed_add
.type sat_signed_add, @function
.global sat_signed_add
# [esp+8] = multiplier
# [esp+4] = multiplicand
# branch-free signed saturating multiplication: the clamp (following
# the sign of the product) is built before IMUL sets OF
sat_signed_mul:
mov eax, [esp+4] # eax = multiplicand
mov edx, [esp+8] # edx = multiplier
mov ecx, eax
xor ecx, edx # ecx = multiplicand ^ multiplier
sar ecx, 31 # ecx = (product < 0) ? ~0 : 0
xor ecx, 2147483647 # ecx = (product < 0) ? -2**31 : 2**31 - 1
imul eax, edx # eax = product % 2**32, OF = overflow
cmovo eax, ecx # on overflow replace the product with the clamp
ret
.size sat_signed_mul, .-sat_signed_mul
.type sat_signed_mul, @function
.global sat_signed_mul
# [esp+8] = subtrahend
# [esp+4] = minuend
# branch-free signed saturating subtraction: CDQ and LEA do not alter
# the flags, so CMOVO still consumes OF from the SUB
sat_signed_sub:
mov eax, [esp+4] # eax = minuend
sub eax, [esp+8] # eax = difference, OF = signed overflow
cdq # edx = (difference < 0) ? ~0 : 0
lea edx, [edx+2147483648] # edx = (difference < 0) ? 2**31 - 1 : -2**31
cmovo eax, edx # on overflow replace the difference with the clamp
ret
.size sat_signed_sub, .-sat_signed_sub
.type sat_signed_sub, @function
.global sat_signed_sub
.end
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
.file "amd64.s"
.arch generic64
.code64
.intel_syntax noprefix
.text
# Unix System V calling convention for AMD64 platform:
# - first 6 integer or pointer arguments (from left to right) are passed
# in registers RDI/R7, RSI/R6, RDX/R2, RCX/R1, R8 and R9
# (R10 is used as static chain pointer in case of nested functions);
# - surplus arguments are pushed on stack in reverse order (from right to
# left), 8-byte aligned;
# - 128-bit integer arguments are passed as pair of 64-bit integer arguments,
# low part before/below high part;
# - 128-bit integer result is returned in registers RAX/R0 (low part) and
# RDX/R2 (high part);
# - 64-bit integer or pointer result is returned in register RAX/R0;
# - 32-bit integer result is returned in register EAX;
# - registers RBX/R3, RSP/R4, RBP/R5, R12, R13, R14 and R15 must be
# preserved;
# - registers RAX/R0, RCX/R1, RDX/R2, RSI/R6, RDI/R7, R8, R9, R10 (in
# case of normal functions) and R11 are volatile and can be clobbered;
# - stack is 16-byte aligned: callee must decrement RSP by 8+n*16 bytes
# before calling other functions (CALL instruction pushes 8 bytes);
# - a "red zone" of 128 bytes below the stack pointer can be clobbered.
# rdi = augend
# rsi = addend
# unsigned saturating 64-bit addition: clamps to 2**64 - 1 on carry
sat_unsigned_add:
add rdi, rsi # rdi = augend + addend
# = sum
sbb rax, rax # rax = (sum < 2**64) ? 0 : ~0
or rax, rdi # rax = (sum < 2**64) ? sum : 2**64 - 1
ret
.size sat_unsigned_add, .-sat_unsigned_add
.type sat_unsigned_add, @function
.global sat_unsigned_add
# rdi = dividend
# rsi = divisor
# NOTE: an unsigned quotient can never exceed the dividend, so no
# clamping is necessary; division by zero still raises the exception
sat_unsigned_div:
xor edx, edx
mov rax, rdi # rdx:rax = dividend
div rsi # rax = dividend / divisor
# = quotient,
# rdx = dividend % divisor
# = remainder
ret
.size sat_unsigned_div, .-sat_unsigned_div
.type sat_unsigned_div, @function
.global sat_unsigned_div
.hidden sat_unsigned_div
# rdi = dividend
# rsi = divisor
# NOTE: an unsigned remainder can never overflow, so no clamping is
# necessary; division by zero still raises the exception
sat_unsigned_mod:
xor edx, edx
mov rax, rdi # rdx:rax = dividend
div rsi # rax = dividend / divisor
# = quotient,
# rdx = dividend % divisor
# = remainder
mov rax, rdx # rax = remainder
ret
.size sat_unsigned_mod, .-sat_unsigned_mod
.type sat_unsigned_mod, @function
.global sat_unsigned_mod
.hidden sat_unsigned_mod
# rdi = multiplicand
# rsi = multiplier
# unsigned saturating multiplication: MUL sets CF (and OF) exactly
# when the high qword of the product (RDX) is non-zero
sat_unsigned_mul:
mov rax, rdi # rax = multiplicand
mul rsi # rdx:rax = multiplicand * multiplier
# = product
sbb rdx, rdx # rdx = (product < 2**64) ? 0 : ~0
or rax, rdx # rax = (product < 2**64) ? product : 2**64 - 1
ret
.size sat_unsigned_mul, .-sat_unsigned_mul
.type sat_unsigned_mul, @function
.global sat_unsigned_mul
# rdi = minuend
# rsi = subtrahend
# unsigned saturating subtraction: clamps to 0 on borrow; the active
# (.else) variant replaces the CMC/SBB/AND idiom with a CMOVcc
sat_unsigned_sub:
.if 0
sub rdi, rsi # rdi = minuend - subtrahend
# = difference
cmc
sbb rax, rax # rax = (difference < 0) ? 0 : ~0
and rax, rdi # rax = (difference < 0) ? 0 : difference
.else
xor eax, eax
sub rdi, rsi # rdi = minuend - subtrahend
# = difference
cmovnb rax, rdi # rax = (difference < 0) ? 0 : difference
.endif
ret
.size sat_unsigned_sub, .-sat_unsigned_sub
.type sat_unsigned_sub, @function
.global sat_unsigned_sub
# rdi = augend
# rsi = addend
# branch-free signed saturating addition: the clamp (whose sign
# follows the augend) is built before ADD sets OF; BTC alters only CF
sat_signed_add:
mov rax, rdi # rax = augend
cqo # rdx = (augend < 0) ? ~0 : 0
not rdx # rdx = (augend < 0) ? 0 : ~0
btc rdx, 63 # rdx = (augend < 0) ? -2**63 : 2**63 - 1
add rax, rsi # rax = augend + addend
# = sum
cmovo rax, rdx # rax = clamped sum
ret
.size sat_signed_add, .-sat_signed_add
.type sat_signed_add, @function
.global sat_signed_add
# rdi = dividend
# rsi = divisor
# signed saturating division: -2**63 / -1 yields 2**63 - 1 instead of
# raising the division exception; division by zero still traps.
# In the special case the dividend is adjusted to -2**63 + 1, whose
# quotient -2**63 + 1 / -1 = 2**63 - 1 is exactly the clamped result.
sat_signed_div:
.if 0
mov rax, rdi # rax = dividend
cmp rsi, -1
je .Lspecial # divisor = -1?
cqo # rdx:rax = dividend
idiv rsi # rax = dividend / divisor
# = quotient,
# rdx = dividend % divisor
# = remainder
ret
.Lspecial:
neg rax # rax = -dividend
not rdi # rdi = ~dividend
# = -dividend - 1
cmovo rax, rdi # rax = (dividend = -2**63) ? ~dividend : -dividend
# = (dividend = -2**63) ? 2**63 - 1 : -dividend
# = clamped quotient
ret
.else
mov rax, rdi # rax = dividend
cqo # rdx:rax = dividend
btc rdi, 63 # rdi = (dividend = -2**63) ? 0 : *
xor rdx, rsi # rdx = (divisor = -1) | (divisor = 0) ? 0 : *
or rdx, rdi # rdx = (divisor = -1) & (dividend = -2**63) ? 0 : *
neg rdx # CF = (divisor <> -1) | (dividend <> -2**63)
sbb rax, -1 # rax = dividend + 1
# - ((divisor <> -1) | (dividend <> -2**63))
# = dividend'
# = (special case) ? -2**63 + 1 : dividend
cqo # rdx:rax = dividend'
idiv rsi # rax = dividend' / divisor
# = quotient,
# rdx = dividend' % divisor
# = remainder
ret
.endif
.size sat_signed_div, .-sat_signed_div
.type sat_signed_div, @function
.global sat_signed_div
# rdi = dividend
# rsi = divisor
# signed saturating remainder: -2**63 % -1 is defined as 0 instead of
# raising the division exception; division by zero still traps
sat_signed_mod:
.if 0
cmp rsi, -1
je .Ltrivial # divisor = -1?
mov rax, rdi # rax = dividend
cqo # rdx:rax = dividend
idiv rsi # rax = dividend / divisor
# = quotient,
# rdx = dividend % divisor
# = remainder
mov rax, rdx # rax = remainder
ret
.Ltrivial:
xor eax, eax # rax = remainder = 0
ret
.else
mov rax, rdi # rax = dividend
cqo # rdx:rax = dividend
btc rdi, 63 # rdi = (dividend = -2**63) ? 0 : *
xor rdx, rsi # rdx = (divisor = -1) | (divisor = 0) ? 0 : *
or rdx, rdi # rdx = (divisor = -1) & (dividend = -2**63) ? 0 : *
neg rdx # CF = (divisor <> -1) | (dividend <> -2**63)
adc rax, -1 # rax = dividend - 1
# + ((divisor <> -1) | (dividend <> -2**63))
# = dividend'
# NOTE(review): in the special case dividend' = dividend - 1 wraps to
# 2**63 - 1; any dividend' % -1 is 0, the clamped remainder, so the
# adjustment's direction is immaterial here (it only avoids the trap)
cqo # rdx:rax = dividend'
idiv rsi # rax = dividend' / divisor
# = quotient,
# rdx = dividend' % divisor
# = remainder
mov rax, rdx # rax = remainder
ret
.endif
.size sat_signed_mod, .-sat_signed_mod
.type sat_signed_mod, @function
.global sat_signed_mod
# rdi = multiplicand
# rsi = multiplier
# branch-free signed saturating multiplication: the clamp (following
# the sign of the product) is built before IMUL sets OF
sat_signed_mul:
mov rax, rdi # rax = multiplicand
xor rax, rsi # rax = multiplicand ^ multiplier
sar rax, 63 # rax = (product < 0) ? ~0 : 0
not rax # rax = (product < 0) ? 0 : ~0
btc rax, 63 # rax = (product < 0) ? -2**63 : 2**63 - 1
imul rdi, rsi # rdi = multiplicand * multiplier
# = product
cmovno rax, rdi # rax = clamped product
ret
.size sat_signed_mul, .-sat_signed_mul
.type sat_signed_mul, @function
.global sat_signed_mul
# rdi = minuend
# rsi = subtrahend
# branch-free signed saturating subtraction: the clamp (whose sign
# follows the minuend) is built before SUB sets OF
sat_signed_sub:
mov rax, rdi # rax = minuend
cqo # rdx = (minuend < 0) ? ~0 : 0
not rdx # rdx = (minuend < 0) ? 0 : ~0
btc rdx, 63 # rdx = (minuend < 0) ? -2**63 : 2**63 - 1
sub rax, rsi # rax = minuend - subtrahend
# = difference
cmovo rax, rdx # rax = clamped difference
ret
.size sat_signed_sub, .-sat_signed_sub
.type sat_signed_sub, @function
.global sat_signed_sub
.end
Note: in case of overflow, augend and saturated sum
respectively minuend and saturated difference have the same sign!
The fun begins with signed and unsigned 64-bit integers on 32-bit (i386 alias x86) processors as well as signed and unsigned 128-bit integers on 64-bit (AMD64 alias x64) processors, but ends unfortunately as soon as one compares the following properly optimised assembly to the code generated by so-called
optimising compilers:
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
.file "clamp_64.s"
.arch generic32
.code32
.intel_syntax noprefix
.extern __divdi3
.extern __moddi3
.text
# [esp+16] = high dword of addend
# [esp+12] = low dword of addend
# [esp+8] = high dword of augend
# [esp+4] = low dword of augend
# unsigned saturating 64-bit addition on i386: clamps to 2**64 - 1
# when the double-word addition carries out
sat_unsigned_add_64:
mov eax, [esp+4]
mov edx, [esp+8] # edx:eax = augend
add eax, [esp+12]
adc edx, [esp+16] # edx:eax = augend + addend
# = sum
sbb ecx, ecx # ecx = (sum < 2**64) ? 0 : ~0
or eax, ecx
or edx, ecx # edx:eax = (sum < 2**64) ? sum : 2**64 - 1
ret
.size sat_unsigned_add_64, .-sat_unsigned_add_64
.type sat_unsigned_add_64, @function
.global sat_unsigned_add_64
# [esp+16] = high dword of multiplier
# [esp+12] = low dword of multiplier
# [esp+8] = high dword of multiplicand
# [esp+4] = low dword of multiplicand
# unsigned saturating 64-bit multiplication via schoolbook partial
# products; every carry that escapes 2**64 is accumulated into EBX
# via ADC, and a non-zero EBX selects the clamp 2**64 - 1.
# NOTE: all [esp+...] offsets below are 4 larger than in the argument
# list above because of the pushed EBX.
sat_unsigned_mul_64:
push ebx
xor edx, edx # edx = 0
mov eax, [esp+12] # eax = high dword of multiplicand
cmp edx, eax
sbb ebx, ebx # ebx = (high dword of multiplicand = 0) ? 0 : ~0
# = (multiplicand < 2**32) ? 0 : ~0
mov ecx, [esp+20] # ecx = high dword of multiplier
cmp edx, ecx
sbb edx, edx # edx = (high dword of multiplier = 0) ? 0 : ~0
# = (multiplier < 2**32) ? 0 : ~0
and ebx, edx # ebx = (multiplicand < 2**32)
# & (multiplier < 2**32) ? 0 : ~0
# = (product < 2**64) ? 0 : ~0
mov edx, [esp+16] # edx = low dword of multiplier
mul edx # edx:eax = high dword of multiplicand
# * low dword of multiplier
adc ebx, ebx # ebx = (product < 2**64) ? 0 : *
xchg ecx, eax # ecx = high dword of multiplicand
# * low dword of multiplier,
# eax = high dword of multiplier
mov edx, [esp+8] # edx = low dword of multiplicand
mul edx # edx:eax = high dword of multiplier
# * low dword of multiplicand
adc ebx, ebx # ebx = (product < 2**64) ? 0 : *
add ecx, eax # ecx = high dword of multiplicand
# * low dword of multiplier
# + high dword of multiplier
# * low dword of multiplicand
# adc ebx, ebx # ebx = (product < 2**64) ? 0 : *
mov eax, [esp+8] # eax = low dword of multiplicand
mov edx, [esp+16] # edx = low dword of multiplier
mul edx # edx:eax = low dword of multiplicand
# * low dword of multiplier
add edx, ecx # edx:eax = product % 2**64
adc ebx, ebx # ebx = (product < 2**64) ? 0 : *
jnz .Loverflow # product >= 2**64?
pop ebx
ret
.Loverflow:
stc
sbb eax, eax
cdq # edx:eax = 2**64 - 1
pop ebx
ret
.size sat_unsigned_mul_64, .-sat_unsigned_mul_64
.type sat_unsigned_mul_64, @function
.global sat_unsigned_mul_64
# [esp+16] = high dword of subtrahend
# [esp+12] = low dword of subtrahend
# [esp+8] = high dword of minuend
# [esp+4] = low dword of minuend
# unsigned saturating 64-bit subtraction on i386: clamps to 0 when
# the double-word subtraction borrows
sat_unsigned_sub_64:
mov eax, [esp+4]
mov edx, [esp+8] # edx:eax = minuend
sub eax, [esp+12]
sbb edx, [esp+16] # edx:eax = minuend - subtrahend
# = difference
cmc # CF = no borrow
sbb ecx, ecx # ecx = (difference < 0) ? 0 : ~0
and eax, ecx
and edx, ecx # edx:eax = (difference < 0) ? 0 : difference
ret
.size sat_unsigned_sub_64, .-sat_unsigned_sub_64
.type sat_unsigned_sub_64, @function
.global sat_unsigned_sub_64
# [esp+16] = high dword of addend
# [esp+12] = low dword of addend
# [esp+8] = high dword of augend
# [esp+4] = low dword of augend
# signed saturating 64-bit addition; shares the clamping code and the
# RET with sat_signed_sub_64 below
sat_signed_add_64:
mov eax, [esp+4]
mov edx, [esp+8] # edx:eax = augend
add eax, [esp+12]
adc edx, [esp+16] # edx:eax = augend + addend
# = sum
# = result
jo .Loverflow1 # sum < -2**63?
# sum >= 2**63?
.Lreturn1:
ret
# [esp+16] = high dword of subtrahend
# [esp+12] = low dword of subtrahend
# [esp+8] = high dword of minuend
# [esp+4] = low dword of minuend
# signed saturating 64-bit subtraction; falls through into the shared
# clamping code on overflow (the wrapped result's sign is the inverse
# of the true result's sign, hence the clamp derivation below)
sat_signed_sub_64:
mov eax, [esp+4]
mov edx, [esp+8] # edx:eax = minuend
sub eax, [esp+12]
sbb edx, [esp+16] # edx:eax = minuend - subtrahend
# = difference
# = result
jno .Lreturn1 # -2**63 <= difference < 2**63?
.Loverflow1:
.if 0
mov eax, edx
cdq # edx = (result < 0) ? ~0 : 0
.else
sar edx, 31 # edx = (result < 0) ? ~0 : 0
.endif
mov eax, edx # eax = low dword of clamped result
xor edx, 2147483648 # edx = (result < 0) ? -2**31 : 2**31 - 1
# = high dword of clamped result
ret
.size sat_signed_add_64, .-sat_signed_add_64
.type sat_signed_add_64, @function
.global sat_signed_add_64
.size sat_signed_sub_64, .-sat_signed_sub_64+1
.type sat_signed_sub_64, @function
.global sat_signed_sub_64
# [esp+16] = high dword of multiplier
# [esp+12] = low dword of multiplier
# [esp+8] = high dword of multiplicand
# [esp+4] = low dword of multiplicand
# signed saturating 64-bit multiplication: computes the full 128-bit
# product (with signed correction of the high qword), then checks
# whether the high qword plus the low qword's sign bit matches the
# expected sign; on mismatch the result is clamped.
# NOTE: the [esp+...] offsets below shift as EBX and two intermediate
# dwords are pushed; each load's comment names the actual argument.
sat_signed_mul_64:
push ebx
mov ebx, [esp+16] # ebx = low dword of multiplier
mov eax, [esp+8] # eax = low dword of multiplicand
mul ebx # edx:eax = low dword of multiplicand
# * low dword of multiplier
push eax
mov ecx, edx # ecx = low dword of "inner" product
mov eax, [esp+16] # eax = high dword of multiplicand
mul ebx # edx:eax = high dword of multiplicand
# * low dword of multiplier
xor ebx, ebx # ebx = high dword of "inner" product
add ecx, eax
adc ebx, edx # ebx:ecx = intermediate "inner" product
mov eax, [esp+12] # eax = low dword of multiplicand
mul dword ptr [esp+24]
# edx:eax = low dword of multiplicand
# * high dword of multiplier
add ecx, eax
adc ebx, edx # ebx:ecx = final "inner" product
push ecx
sbb ecx, ecx # ecx = 0 - carry from "inner" product
mov eax, [esp+20] # eax = high dword of multiplicand
mul dword ptr [esp+28]
# edx:eax = high dword of multiplicand
# * high dword of multiplier
neg ecx # ecx = carry from "inner" product
add eax, ebx
adc edx, ecx # edx:eax = high dword of multiplicand
# * high dword of multiplier
# + high dword of "inner" product
# + carry from "inner" product
# = high qword of (unsigned) product"
xor ebx, ebx # ebx = sign of multiplier = 0
cmp ebx, [esp+28]
jle 0f # (high dword of) multiplier >= 0?
not ebx # ebx = 0 - sign of multiplier = -1
sub eax, [esp+16]
sbb edx, [esp+20] # edx:eax = high qword of product"
# - multiplicand
# = high qword of product'
0:
xor ecx, ecx
cmp ecx, [esp+20]
jle 1f # (high dword of) multiplicand >= 0?
not ebx # ebx = (0 - sign of multiplier)
# ^ (0 - sign of multiplicand)
sub eax, [esp+24]
sbb edx, [esp+28] # edx:eax = high qword of product'
# - multiplier
# = high qword of (signed) product
1:
xor eax, ebx
xor edx, ebx # edx:eax = high qword of product
# ^ (0 - sign of multiplier)
# ^ (0 - sign of multiplicand)
or eax, edx # eax = high qword of product
# <> (0 - sign of multiplier)
# ^ (0 - sign of multiplicand)
pop edx # edx = high dword of (low qword of) product
shld ecx, edx, 1 # ecx = sign of product
add ebx, ecx # ebx = sign of product
# <> sign of multiplier
# ^ sign of multiplicand
or eax, ebx # eax = (-2**63 <= product < 2**63) ? 0 : *
pop eax # edx:eax = (low qword of) product
pop ebx
jnz .Loverflow2 # product < -2**63?
# product >= 2**63?
ret
.Loverflow2:
mov eax, [esp+8] # eax = high dword of multiplicand
xor eax, [esp+16] # eax = high dword of multiplicand
# ^ high dword of multiplier
sar eax, 31 # eax = (multiplicand ^ multiplier < 0) ? ~0 : 0
# = (product < 0) ? ~0 : 0
# = low dword of clamped product
mov edx, 2147483648 # edx = ±2**31
xor edx, eax # edx = (product < 0) ? -2**31 : 2**31 - 1
# = high dword of clamped product
ret
.size sat_signed_mul_64, .-sat_signed_mul_64
.type sat_signed_mul_64, @function
.global sat_signed_mul_64
# [esp+16] = high dword of divisor
# [esp+12] = low dword of divisor
# [esp+8] = high dword of dividend
# [esp+4] = low dword of dividend
# signed saturating 64-bit remainder: -2**63 % -1 is defined as 0;
# every other case tail-calls the compiler runtime helper __moddi3
# with the arguments still in place on the stack
sat_signed_mod_64:
mov eax, [esp+12]
and eax, [esp+16] # eax = ~0 iff both dwords of the divisor are ~0
inc eax
jz .Ltrivial # divisor = -1?
jmp __moddi3
.Ltrivial:
cdq # edx:eax = remainder = 0 (eax is 0 here)
.Lreturn2:
ret
.size sat_signed_mod_64, .-sat_signed_mod_64
.type sat_signed_mod_64, @function
.global sat_signed_mod_64
# [esp+16] = high dword of divisor
# [esp+12] = low dword of divisor
# [esp+8] = high dword of dividend
# [esp+4] = low dword of dividend
# signed saturating 64-bit division: -2**63 / -1 yields 2**63 - 1;
# every other case tail-calls the compiler runtime helper __divdi3.
# Division by -1 is plain negation, so no division is performed then.
sat_signed_div_64:
mov eax, [esp+12]
and eax, [esp+16] # eax = ~0 iff both dwords of the divisor are ~0
inc eax
jz .Lspecial # divisor = -1?
jmp __divdi3
.Lspecial:
cdq # edx:eax = 0
sub eax, [esp+4]
sbb edx, [esp+8] # edx:eax = -dividend
# = quotient
jno .Lreturn2 # dividend <> -2**63 = 2**63?
.if 0
add eax, -1
adc edx, -1 # edx:eax = -dividend - 1
# = 2**63 - 1
# = clamped quotient
.else
not eax
not edx # edx:eax = ~dividend
# = 2**63 - 1
# = clamped quotient
.endif
ret
.size sat_signed_div_64, .-sat_signed_div_64+1
.type sat_signed_div_64, @function
.global sat_signed_div_64
.end
32 instructions in 96 bytes for signed and unsigned saturating
64-bit addition and subtraction.
Note: my related article
Fast(est) Double-Word Integer Division
presents properly optimised implementations of the signed 64-bit
division routines
__divdi3()
and
__moddi3()
called here, while my articles
Deficiencies in GCC’s code generator and optimiser
and
True Lies
– or What LLVM claims, but fails to deliver, and
Not quite so optimising
Microsoft® Visual C compiler
document their poor implementations in the compiler runtime
libraries of both
GCC
and
LLVM.
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
.file "clamp_128.s"
.arch generic64
.code64
.intel_syntax noprefix
.extern __divti3
.extern __modti3
.text
# rsi:rdi = augend
# rcx:rdx = addend
# unsigned saturating 128-bit addition: clamps to 2**128 - 1 when the
# double-qword addition carries out
sat_unsigned_add_128:
add rdi, rdx
adc rsi, rcx # rsi:rdi = augend + addend
# = sum
sbb rax, rax # rax = (sum < 2**128) ? 0 : ~0
cqo # rdx:rax = (sum < 2**128) ? 0 : 2**128 - 1
or rax, rdi
or rdx, rsi # rdx:rax = (sum < 2**128) ? sum : 2**128 - 1
ret
.size sat_unsigned_add_128, .-sat_unsigned_add_128
.type sat_unsigned_add_128, @function
.global sat_unsigned_add_128
# rsi:rdi = minuend
# rcx:rdx = subtrahend
# unsigned saturating 128-bit subtraction: clamps to 0 when the
# double-qword subtraction borrows
sat_unsigned_sub_128:
sub rdi, rdx
sbb rsi, rcx # rsi:rdi = minuend - subtrahend
# = difference
cmc # CF = no borrow
sbb rax, rax # rax = (difference < 0) ? 0 : ~0
cqo # rdx:rax = (difference < 0) ? 0 : 2**128 - 1
and rax, rdi
and rdx, rsi # rdx:rax = (difference < 0) ? 0 : difference
ret
.size sat_unsigned_sub_128, .-sat_unsigned_sub_128
.type sat_unsigned_sub_128, @function
.global sat_unsigned_sub_128
# rsi:rdi = augend
# rcx:rdx = addend
# branch-free signed saturating 128-bit addition: the clamp (whose
# sign follows the augend) is built before ADD/ADC set OF; the low
# qword of the addend is parked in R11 because CQO clobbers RDX
sat_signed_add_128:
mov r11, rdx
mov rax, rsi
cqo # rdx = (augend < 0) ? ~0 : 0
not rdx # rdx = (augend < 0) ? 0 : ~0
mov rax, rdx # rdx:rax = (augend < 0) ? 0 : ~0
btc rdx, 63 # rdx:rax = (augend < 0) ? -2**127 : 2**127 - 1
add rdi, r11
adc rsi, rcx # rsi:rdi = augend + addend
# = sum
cmovno rax, rdi
cmovno rdx, rsi # rdx:rax = clamped sum
ret
.size sat_signed_add_128, .-sat_signed_add_128
.type sat_signed_add_128, @function
.global sat_signed_add_128
# rsi:rdi = dividend
# rcx:rdx = divisor
# signed saturating 128-bit division: -2**127 / -1 yields 2**127 - 1;
# every other case tail-calls the compiler runtime helper __divti3
# with the arguments still in their registers
sat_signed_div_128:
mov rax, rcx
and rax, rdx # rax = ~0 iff both qwords of the divisor are ~0
inc rax
jnz __divti3 # divisor <> -1?
mov edx, eax # rdx:rax = 0
sub rax, rdi
sbb rdx, rsi # rdx:rax = -dividend
not rdi
not rsi # rsi:rdi = ~dividend
# = -dividend - 1
cmovo rax, rdi
cmovo rdx, rsi # rdx:rax = (dividend = -2**127) ? ~dividend : -dividend
# = (dividend = -2**127) ? 2**127 - 1 : -dividend
# = clamped quotient
ret
.size sat_signed_div_128, .-sat_signed_div_128
.type sat_signed_div_128, @function
.global sat_signed_div_128
# rsi:rdi = dividend
# rcx:rdx = divisor
# signed saturating 128-bit remainder: -2**127 % -1 is defined as 0;
# every other case tail-calls the compiler runtime helper __modti3
sat_signed_mod_128:
mov rax, rcx
and rax, rdx # rax = ~0 iff both qwords of the divisor are ~0
inc rax
jnz __modti3 # divisor <> -1?
mov edx, eax # rdx:rax = remainder = 0
ret
.size sat_signed_mod_128, .-sat_signed_mod_128
.type sat_signed_mod_128, @function
.global sat_signed_mod_128
# rsi:rdi = multiplicand
# rcx:rdx = multiplier
# signed saturating 128-bit multiplication: both operands are made
# non-negative first, the 256-bit magnitude product's overflow bits
# are accumulated in ECX via ADC, the sign is re-applied, and on any
# overflow the result is clamped to -2**127 respectively 2**127 - 1
sat_signed_mul_128:
mov r10, rdx # r10 = low qword of multiplier
mov r11, rcx # r11 = high qword of multiplier
mov rax, rcx # rax = high qword of multiplier
cqo # rdx = (multiplier < 0) ? -1 : 0
mov r9, rdx # r9 = (multiplier < 0) ? -1 : 0
xor r10, rdx
xor r11, rdx # r11:r10 = (multiplier < 0) ? ~multiplier : multiplier
sub r10, rdx
sbb r11, rdx # r11:r10 = (multiplier < 0) ? -multiplier : multiplier
# = |multiplier|
mov rax, rsi # rax = high qword of multiplicand
cqo # rdx = (multiplicand < 0) ? -1 : 0
xor r9, rdx # r9 = (multiplicand < 0) <> (multiplier < 0) ? -1 : 0
# = (product < 0) ? -1 : 0
xor rdi, rdx
xor rsi, rdx # rsi:rdi = (multiplicand < 0) ? ~multiplicand : multiplicand
sub rdi, rdx
sbb rsi, rdx # rsi:rdi = (multiplicand < 0) ? -multiplicand : multiplicand
# = |multiplicand|
xor ecx, ecx
cmp rcx, rsi
sbb edx, edx # edx = (high qword of |multiplicand| = 0) ? 0 : -1
# = (|multiplicand| < 2**64) ? 0 : -1
cmp rcx, r11
sbb ecx, ecx # ecx = (high qword of |multiplier| = 0) ? 0 : -1
# = (|multiplier| < 2**64) ? 0 : -1
and ecx, edx # ecx = (|multiplicand| < 2**64)
# & (|multiplier| < 2**64) ? 0 : -1
# = (|product| < 2**128) ? 0 : -1
mov rax, rsi
mul r10 # rdx:rax = high qword of |multiplicand|
# * low qword of |multiplier|
adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
mov rsi, rax
mov rax, rdi
mul r11 # rdx:rax = low qword of |multiplicand|
# * high qword of |multiplier|
adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
add rsi, rax # rsi = high qword of |multiplicand|
# * low qword of |multiplier|
# + low qword of |multiplicand|
# * high qword of |multiplier|
# adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
mov rax, rdi
mul r10 # rdx:rax = low qword of |multiplicand|
# * low qword of |multiplier|
add rdx, rsi # rdx:rax = |product % 2**128|
# = |product| % 2**128
adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
add rax, r9
adc rdx, r9 # rdx:rax = (product < 0)
# ? ~product % 2**128 : product % 2**128
mov rsi, rdx # rsi = (product % 2**128 < -2**127)
# | (product % 2**128 >= 2**127)
# ? negative : positive
xor rax, r9
xor rdx, r9 # rdx:rax = product % 2**128
not r9 # r9 = (product < 0) ? 0 : ~0
mov r8, r9 # r8:r9 = (product < 0) ? 0 : ~0
btc r8, 63 # r8:r9 = (product < 0) ? -2**127 : 2**127 - 1
add rsi, rsi
adc ecx, ecx # ecx = (-2**127 <= product < 2**127) ? 0 : *
cmovnz rax, r9
cmovnz rdx, r8 # rdx:rax = clamped product
ret
.size sat_signed_mul_128, .-sat_signed_mul_128
.type sat_signed_mul_128, @function
.global sat_signed_mul_128
# rsi:rdi = minuend
# rcx:rdx = subtrahend
# branch-free signed saturating 128-bit subtraction: the clamp (whose
# sign follows the minuend) is built before SUB/SBB set OF; the low
# qword of the subtrahend is parked in R11 because CQO clobbers RDX
sat_signed_sub_128:
mov r11, rdx
mov rax, rsi
cqo # rdx = (minuend < 0) ? ~0 : 0
not rdx # rdx = (minuend < 0) ? 0 : ~0
mov rax, rdx # rdx:rax = (minuend < 0) ? 0 : ~0
btc rdx, 63 # rdx:rax = (minuend < 0) ? -2**127 : 2**127 - 1
sub rdi, r11
sbb rsi, rcx # rsi:rdi = minuend - subtrahend
# = difference
cmovno rax, rdi
cmovno rdx, rsi # rdx:rax = clamped difference
ret
.size sat_signed_sub_128, .-sat_signed_sub_128
.type sat_signed_sub_128, @function
.global sat_signed_sub_128
.end
37 instructions in 105 bytes for signed and unsigned saturating
128-bit addition and subtraction.
Note: my related article
Fast(est) Double-Word Integer Division
presents implementations of the signed 128-bit division routines
__divti3()
and
__modti3()
called here, while my articles
Deficiencies in GCC’s code generator and optimiser
and
True Lies
– or What LLVM claims, but fails to deliver
document their poor implementations in the compiler runtime
libraries of both
GCC
and
LLVM.
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Unsigned saturating addition: clamps to 2**32 - 1 instead of
// wrapping around when the sum carries out.
unsigned int sat_unsigned_add(unsigned int augend, unsigned int addend)
{
    const unsigned int wrapped = augend + addend;
    // a carry occurred iff the wrapped sum is smaller than an operand
    if (wrapped < augend)
        return ~0U;
    return wrapped;
}
// Unsigned saturating division. An unsigned quotient can never
// exceed the dividend, so no clamping is needed; division by zero
// still raises the division exception.
unsigned int sat_unsigned_div(unsigned int dividend, unsigned int divisor)
{
    const unsigned int quotient = dividend / divisor;
    return quotient;
}
// Unsigned saturating remainder. An unsigned remainder can never
// overflow, so no clamping is needed; division by zero still raises
// the division exception.
unsigned int sat_unsigned_mod(unsigned int dividend, unsigned int divisor)
{
    const unsigned int remainder = dividend % divisor;
    return remainder;
}
// Unsigned saturating multiplication: clamps to 2**32 - 1 when the
// 64-bit product needs more than 32 bits.
unsigned int sat_unsigned_mul(unsigned int multiplicand, unsigned int multiplier)
{
    const unsigned long long wide = (unsigned long long) multiplicand * multiplier;
    if ((wide >> 32) != 0ULL)
        return ~0U;
    return (unsigned int) wide;
}
// Unsigned saturating subtraction: returns 0 instead of the wrapped
// difference when subtrahend > minuend.
unsigned int sat_unsigned_sub(unsigned int minuend, unsigned int subtrahend)
{
    const unsigned int difference = minuend - subtrahend;
    // no borrow occurred iff the (wrapped) difference did not grow
    // past the minuend: difference <= minuend <=> subtrahend <= minuend
    // (the original ">=" inverted this test, returning the wrapped
    // value on borrow and 0 for every ordinary subtraction)
    return -(difference <= minuend) & difference;
}
// Signed saturating addition: clamps to 2**31 - 1 respectively
// -2**31 on overflow.
int sat_signed_add(int augend, int addend)
{
    const int sum = augend + addend;
#ifdef UNDEFINED_BEHAVIOUR
    const int clamp = 2147483647 + (augend < 0);
#else
    // on overflow the result takes the sign opposite to the augend,
    // so the clamp is derived from the augend's sign alone
    const int clamp = 2147483647 ^ (augend >> 31);
#endif
    // overflow iff both operands disagree in sign with the sum,
    // i.e. the augend has the addend's sign and the sum the opposite
    if (((augend ^ sum) & (addend ^ sum)) < 0)
        return clamp;
    return sum;
}
// Signed saturating division: -2**31 / -1 yields 2**31 - 1 instead
// of raising the division exception; division by zero still traps.
int sat_signed_div(int dividend, int divisor)
{
#if 0
    dividend -= (divisor == -1) && (dividend == ~2147483647);
    return dividend / divisor;
#else
    if (divisor != -1)
        return dividend / divisor;
    // division by -1 is plain negation; only -2**31 overflows
    return dividend == ~2147483647 ? ~dividend : -dividend;
#endif
}
// Signed saturating remainder: -2**31 % -1 is defined as 0 instead
// of raising the division exception; division by zero still traps.
int sat_signed_mod(int dividend, int divisor)
{
#if 0
    dividend -= (divisor == -1) && (dividend == ~2147483647);
    return dividend % divisor;
#else
    if (divisor != -1)
        return dividend % divisor;
    // any value modulo -1 is 0; this also avoids the exception
    return 0;
#endif
}
// Signed saturating multiplication: clamps to 2**31 - 1 respectively
// -2**31 when the 64-bit product does not fit in 32 bits.
int sat_signed_mul(int multiplicand, int multiplier)
{
    const long long wide = (long long) multiplicand * multiplier;
#ifdef UNDEFINED_BEHAVIOUR
    const int clamp = 2147483647 + ((multiplicand ^ multiplier) < 0);
#else
    const int clamp = 2147483647 ^ ((multiplicand ^ multiplier) >> 31);
#endif
    const int high = (int) (wide >> 32);
    const int low = (int) wide;
    // the product fits in 32 bits iff its high half is just the
    // sign-extension of its low half
    if (high != (low >> 31))
        return clamp;
    return low;
}
// Signed saturating subtraction: clamps to 2**31 - 1 respectively
// -2**31 on overflow.
int sat_signed_sub(int minuend, int subtrahend)
{
    const int difference = minuend - subtrahend;
#ifdef UNDEFINED_BEHAVIOUR
    const int clamp = 2147483647 + (minuend < 0);
#else
    // on overflow the result takes the sign opposite to the minuend,
    // so the clamp is derived from the minuend's sign alone
    const int clamp = 2147483647 ^ (minuend >> 31);
#endif
    // overflow iff minuend and subtrahend differ in sign and the
    // difference does not have the minuend's sign
    if (((minuend ^ subtrahend) & (minuend ^ difference)) < 0)
        return clamp;
    return difference;
}
GCC 11.3: 100
instructions in 252 bytes.
# Compilation provided by Compiler Explorer at https://godbolt.org/ sat_unsigned_add(unsigned int, unsigned int): mov eax, [esp+8] add eax, [esp+4] sbb edx, edx or eax, edx ret sat_unsigned_div(unsigned int, unsigned int): mov eax, [esp+4] xor edx, edx div dword ptr [esp+8] ret sat_unsigned_mod(unsigned int, unsigned int): mov eax, [esp+4] xor edx, edx div dword ptr [esp+8] mov eax, edx ret sat_unsigned_mul(unsigned int, unsigned int): push ebx mov eax, [esp+12] mul dword ptr [esp+8] pop ebx neg edx mov ecx, eax sbb edx, edx mov eax, ecx or eax, edx ret sat_unsigned_sub(unsigned int, unsigned int): mov eax, [esp+4] mov edx, eax sub edx, [esp+8] cmp eax, edx setbe al movzx eax, al neg eax and eax, edx ret sat_signed_add(int, int): push ebx mov eax, [esp+8] mov ecx, [esp+12] mov ebx, eax lea edx, [eax+ecx] sar eax, 31 xor ebx, edx xor ecx, edx xor eax, 2147483647 test ebx, ecx pop ebx cmovns edx, eax mov eax, edx ret sat_signed_div(int, int): mov ecx, [esp+8] mov eax, [esp+4] cmp ecx, -1 je .L17 cdq idiv ecx ret .L17: mov edx, eax neg edx cmp eax, -2147483648 mov eax, 2147483647 cmovne eax, edx ret sat_signed_mod(int, int): mov ecx, [esp+8] xor edx, edx cmp ecx, -1 je .L18 mov eax, [esp+4] cdq idiv ecx .L18: mov eax, edx ret sat_signed_mul(int, int): push ebx mov eax, [esp+8] imul dword ptr [esp+12] mov ecx, eax mov ebx, eax sar ecx, 31 cmp ecx, edx je .L22 mov eax, [esp+8] xor eax, [esp+12] sar eax, 31 xor eax, 2147483647 mov ebx, eax .L22: mov eax, ebx pop ebx ret sat_signed_sub(int, int): push ebx mov eax, [esp+8] mov ecx, [esp+12] mov edx, eax mov ebx, eax sub edx, [esp+12] xor ebx, edx xor ecx, eax sar eax, 31 xor eax, 2147483647 test ecx, ebx pop ebx cmovs edx, eax mov eax, edx retClang 14.0.0: 90 instructions in 236 bytes.
# Compilation provided by Compiler Explorer at https://godbolt.org/ sat_unsigned_add(unsigned int, unsigned int): mov ecx, [esp+4] add ecx, [esp+8] mov eax, -1 cmovae eax, ecx ret sat_unsigned_div(unsigned int, unsigned int): mov eax, [esp+4] xor edx, edx div dword ptr [esp+8] ret sat_unsigned_mod(unsigned int, unsigned int): mov eax, [esp+4] xor edx, edx div dword ptr [esp+8] mov eax, edx ret sat_unsigned_mul(unsigned int, unsigned int): mov eax, [esp+4] mul dword ptr [esp+8] mov ecx, -1 cmovo eax, ecx ret sat_unsigned_sub(unsigned int, unsigned int): mov ecx, [esp+4] mov edx, ecx sub edx, [esp+8] xor eax, eax cmp edx, ecx cmovae eax, edx ret sat_signed_add(int, int): push esi mov eax, [esp+8] mov ecx, [esp+12] lea edx, [ecx + eax] mov esi, edx xor esi, eax shr eax, 31 add eax, 2147483647 xor ecx, edx test esi, ecx cmovns eax, edx pop esi ret sat_signed_div(int, int): mov ecx, [esp+8] mov eax, [esp+4] cmp ecx, -1 je .LBB6_1 cdq idiv ecx ret .LBB6_1: mov ecx, eax neg ecx cmp eax, -2147483648 mov eax, 2147483647 cmovne eax, ecx ret sat_signed_mod(int, int): mov ecx, [esp+8] cmp ecx, -1 je .LBB7_1 mov eax, [esp+4] cdq idiv ecx mov eax, edx ret .LBB7_1: xor eax, eax ret sat_signed_mul(int, int): push esi mov ecx, [esp+12] mov esi, [esp+8] mov eax, ecx imul esi xor ecx, esi shr ecx, 31 add ecx, 2147483647 mov esi, eax sar esi, 31 xor esi, edx cmovne eax, ecx pop esi ret sat_signed_sub(int, int): push esi mov ecx, [esp+12] mov edx, [esp+8] mov esi, edx sub esi, ecx mov eax, edx shr eax, 31 add eax, 2147483647 xor ecx, edx xor edx, esi test edx, ecx cmovns eax, esi pop esi ret
__builtin_*_overflow()
provided by
GCC
and
LLVM:
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
unsigned int sat_unsigned_add(unsigned int augend, unsigned int addend)
{
    // Saturating unsigned addition: all-ones (UINT_MAX) on carry-out,
    // implemented with the compiler's overflow-checking intrinsic.
    unsigned int sum;
    if (__builtin_add_overflow(augend, addend, &sum))
        return ~0U;
    return sum;
}
unsigned int sat_unsigned_mul(unsigned int multiplicand, unsigned int multiplier)
{
    // Saturating unsigned multiplication: all-ones (UINT_MAX) on overflow,
    // implemented with the compiler's overflow-checking intrinsic.
    unsigned int product;
    if (__builtin_mul_overflow(multiplicand, multiplier, &product))
        return ~0U;
    return product;
}
unsigned int sat_unsigned_sub(unsigned int minuend, unsigned int subtrahend)
{
    // Saturating unsigned subtraction: 0 on borrow, implemented with the
    // compiler's overflow-checking intrinsic.
    unsigned int difference;
    if (__builtin_sub_overflow(minuend, subtrahend, &difference))
        return 0U;
    return difference;
}
int sat_signed_add(int augend, int addend)
{
    // Saturating signed addition via the compiler's overflow intrinsic.
    // On overflow the exact result has the sign of the operands:
    // INT_MAX ^ (augend >> 31) yields INT_MAX for augend >= 0 and
    // INT_MIN for augend < 0.
    int sum;
    if (__builtin_add_overflow(augend, addend, &sum))
        return 2147483647 ^ (augend >> 31);
    return sum;
}
int sat_signed_mul(int multiplicand, int multiplier)
{
    // Saturating signed multiplication via the compiler's overflow
    // intrinsic.  The clamp follows the sign of the exact product:
    // INT_MAX when the operand signs agree, INT_MIN when they differ.
    int product;
    if (__builtin_mul_overflow(multiplicand, multiplier, &product))
        return 2147483647 ^ ((multiplicand ^ multiplier) >> 31);
    return product;
}
int sat_signed_sub(int minuend, int subtrahend)
{
    // Saturating signed subtraction via the compiler's overflow intrinsic.
    // On overflow the exact result has the sign of the minuend:
    // INT_MAX ^ (minuend >> 31) yields INT_MAX for minuend >= 0 and
    // INT_MIN for minuend < 0.
    int difference;
    if (__builtin_sub_overflow(minuend, subtrahend, &difference))
        return 2147483647 ^ (minuend >> 31);
    return difference;
}
GCC 11.3:
# Compilation provided by Compiler Explorer at https://godbolt.org/ sat_unsigned_add(unsigned int, unsigned int): mov eax, [esp+8] add eax, [esp+4] jc .L7 ret .L7: or eax, -1 ret sat_unsigned_mul(unsigned int, unsigned int): mov eax, [esp+4] mul dword ptr [esp+8] jo .L13 ret .L13: or eax, -1 ret sat_unsigned_sub(unsigned int, unsigned int): mov eax, [esp+4] mov edx, 0 sub eax, [esp+8] cmovb eax, edx ret sat_signed_add(int, int): mov eax, [esp+4] add eax, [esp+8] jo .L24 ret .L24: mov eax, [esp+4] sar eax, 31 xor eax, 2147483647 ret sat_signed_mul(int, int): mov eax, [esp+4] imul eax, [esp+8] jo .L30 ret .L30: mov eax, [esp+4] xor eax, [esp+8] sar eax, 31 add eax, 2147483647 ret sat_signed_sub(int, int): mov edx, [esp+4] mov eax, edx sub eax, [esp+8] jo .L36 ret .L36: mov eax, edx sar eax, 31 xor eax, 2147483648 ret43 instructions in 124 bytes instead of 68 instructions in 171 bytes, but not properly optimised; the first pair of highlighted lines can be replaced with the following branch-free instruction sequence:
sbb edx, edx
or eax, edx
The second pair of highlighted lines can be replaced with the
following instruction sequence:
sbb edx, edx
and eax, edx
Clang 14.0.0: 41 instructions in 129 bytes.
# Compilation provided by Compiler Explorer at https://godbolt.org/ sat_unsigned_add(unsigned int, unsigned int): mov ecx, [esp+4] add ecx, [esp+8] mov eax, -1 cmovae eax, ecx ret sat_unsigned_mul(unsigned int, unsigned int): mov eax, [esp+4] mul dword ptr [esp+8] mov ecx, -1 cmovo eax, ecx ret sat_unsigned_sub(unsigned int, unsigned int): mov eax, [esp+4] xor ecx, ecx sub eax, [esp+8] cmovb eax, ecx ret sat_signed_add(int, int): mov eax, [esp+4] mov ecx, [esp+8] lea edx, [eax+ecx] sar edx, 31 xor edx, -2147483648 add eax, ecx cmovo eax, edx ret sat_signed_mul(int, int): mov eax, [esp+4] mov ecx, [esp+8] mov edx, ecx xor edx, eax shr edx, 31 add edx, 2147483647 imul eax, ecx cmovo eax, edx ret sat_signed_sub(int, int): mov eax, [esp+4] mov edx, [esp+8] xor ecx, ecx cmp eax, edx setns cl add ecx, 2147483647 sub eax, edx cmovo eax, ecx ret
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
unsigned long long sat_unsigned_add_64(unsigned long long augend, unsigned long long addend)
{
    // 64-bit saturating unsigned addition: all-ones (ULLONG_MAX) on
    // carry-out, implemented with the compiler's overflow intrinsic.
    unsigned long long sum;
    if (__builtin_add_overflow(augend, addend, &sum))
        return ~0ULL;
    return sum;
}
unsigned long long sat_unsigned_div_64(unsigned long long dividend, unsigned long long divisor)
{
    // 64-bit unsigned division needs no saturation: the quotient can
    // never exceed the dividend, so overflow is impossible.
    const unsigned long long quotient = dividend / divisor;
    return quotient;
}
unsigned long long sat_unsigned_mod_64(unsigned long long dividend, unsigned long long divisor)
{
    // 64-bit unsigned remainder needs no saturation: the remainder is
    // always smaller than the divisor, so overflow is impossible.
    const unsigned long long remainder = dividend % divisor;
    return remainder;
}
unsigned long long sat_unsigned_mul_64(unsigned long long multiplicand, unsigned long long multiplier)
{
    // 64-bit saturating unsigned multiplication: all-ones (ULLONG_MAX) on
    // overflow, implemented with the compiler's overflow intrinsic.
    unsigned long long product;
    if (__builtin_mul_overflow(multiplicand, multiplier, &product))
        return ~0ULL;
    return product;
}
unsigned long long sat_unsigned_sub_64(unsigned long long minuend, unsigned long long subtrahend)
{
    // 64-bit saturating unsigned subtraction: 0 on borrow, implemented
    // with the compiler's overflow intrinsic.
    unsigned long long difference;
    if (__builtin_sub_overflow(minuend, subtrahend, &difference))
        return 0ULL;
    return difference;
}
long long sat_signed_add_64(long long augend, long long addend)
{
    // 64-bit saturating signed addition.  The intrinsic stores the
    // wrapped sum even on overflow; the wrapped sum then has the wrong
    // sign, so LLONG_MIN ^ (sum >> 63) flips it into the proper clamp:
    // LLONG_MAX after positive overflow, LLONG_MIN after negative.
    long long sum;
    if (__builtin_add_overflow(augend, addend, &sum))
        return ~9223372036854775807LL ^ (sum >> 63);
    return sum;
}
long long sat_signed_div_64(long long dividend, long long divisor)
{
    // 64-bit saturating signed division.  The only unrepresentable
    // quotient is -2**63 / -1 = +2**63 (on x86 this raises a division
    // exception rather than wrapping).
    //
    // BUG FIX: the overflow case must saturate UPWARD to LLONG_MAX
    // (9223372036854775807), not to ~9223372036854775807LL == LLONG_MIN
    // as before — the exact quotient +2**63 exceeds the positive bound,
    // exactly as the 32-bit sat_signed_div returns INT_MAX for
    // INT_MIN / -1.
    return (divisor == -1LL) & (dividend == ~9223372036854775807LL)
        ? 9223372036854775807LL : dividend / divisor;
}
long long sat_signed_mod_64(long long dividend, long long divisor)
{
    // 64-bit saturating signed remainder.  Only -2**63 % -1 raises a
    // division exception on x86 (its implied quotient +2**63 overflows);
    // the mathematical remainder of that case is 0.
    if (divisor == -1LL && dividend == ~9223372036854775807LL)
        return 0LL;
    return dividend % divisor;
}
long long sat_signed_mul_64(long long multiplicand, long long multiplier)
{
    // 64-bit saturating signed multiplication via the compiler's overflow
    // intrinsic.  The clamp follows the sign of the exact product:
    // LLONG_MAX when the operand signs agree, LLONG_MIN when they differ.
    long long product;
    if (__builtin_mul_overflow(multiplicand, multiplier, &product))
        return 9223372036854775807LL ^ ((multiplicand ^ multiplier) >> 63);
    return product;
}
long long sat_signed_sub_64(long long minuend, long long subtrahend)
{
    // 64-bit saturating signed subtraction.  The intrinsic stores the
    // wrapped difference even on overflow; the wrapped value then has
    // the wrong sign, so LLONG_MIN ^ (difference >> 63) flips it into
    // the proper clamp: LLONG_MAX after positive overflow, LLONG_MIN
    // after negative.
    long long difference;
    if (__builtin_sub_overflow(minuend, subtrahend, &difference))
        return ~9223372036854775807LL ^ (difference >> 63);
    return difference;
}
GCC 11.3:
# Compilation provided by Compiler Explorer at https://godbolt.org/ sat_unsigned_add_64(unsigned long long, unsigned long long): push ebx mov ecx, [esp+16] mov ebx, [esp+20] add ecx, [esp+8] adc ebx, [esp+12] jc .L6 mov edx, ebx mov eax, ecx pop ebx ret .L6: mov eax, -1 mov edx, -1 pop ebx ret sat_unsigned_div_64(unsigned long long, unsigned long long): sub esp, 12 push [esp+28] push [esp+28] push [esp+28] push [esp+28] call __udivdi3 add esp, 28 ret sat_unsigned_mod_64(unsigned long long, unsigned long long): sub esp, 12 push [esp+28] push [esp+28] push [esp+28] push [esp+28] call __umoddi3 add esp, 28 ret sat_unsigned_mul_64(unsigned long long, unsigned long long): push ebp push edi push esi push ebx sub esp, 20 mov edi, [esp+40] mov ecx, [esp+44] mov dword ptr [esp], 0 mov ebp, [esp+48] mov ebx, [esp+52] mov [esp+4], edi test ecx, ecx jne .L16 test ebx, ebx jne .L17 mov eax, edi mul ebp mov ebx, eax .L14: cmp byte ptr [esp], 0 mov ecx, -1 mov eax, ebx cmovne eax, ecx cmovne edx, ecx add esp, 20 pop ebx pop esi pop edi pop ebp ret .L16: test ebx, ebx jne .L20 mov esi, ecx mov edi, ebp .L18: mov eax, [esp+4] mul ebp mov [esp+8], eax mov eax, edi mov [esp+12], edx mul esi mov edi, [esp+12] mov esi, edi xor edi, edi add esi, eax adc edi, edx test edi, edi jne .L20 mov edx, esi mov esi, [esp+8] xor edi, edi or edx, edi mov ebx, esi jmp .L14 .L17: mov esi, ebx jmp .L18 .L20: mov esi, [esp+4] imul ecx, ebp mov dword ptr [esp], 1 imul ebx, esi mov eax, esi mul ebp add ecx, ebx lea edx, [edx+ecx] mov ebx, eax jmp .L14 sat_unsigned_sub_64(unsigned long long, unsigned long long): push ebx mov eax, [esp+8] mov edx, [esp+12] mov ecx, eax sub ecx, [esp+16] mov ebx, edx sbb ebx, [esp+20] cmp eax, ecx mov eax, edx sbb eax, ebx jc .L29 mov edx, ebx mov eax, ecx pop ebx ret .L29: xor eax, eax xor edx, edx pop ebx ret sat_signed_add_64(long long, long long): mov eax, [esp+12] mov edx, [esp+16] add eax, [esp+4] adc edx, [esp+8] jno .L31 mov eax, edx sar eax, 31 lea edx, 
[eax-2147483648] .L31: ret sat_signed_div_64(long long, long long): push edi push esi push ebx mov eax, [esp+24] mov edx, [esp+28] mov ecx, [esp+16] mov esi, eax mov ebx, [esp+20] and esi, edx cmp esi, -1 jne .L42 mov esi, ecx lea edi, [ebx-2147483648] or esi, edi je .L41 .L42: push edx push eax push ebx push ecx call __divdi3 add esp, 16 .L38: pop ebx pop esi pop edi ret .L41: xor eax, eax mov edx, -2147483648 jmp .L38 sat_signed_mod_64(long long, long long): push edi push esi push ebx sub esp, 16 mov ecx, [esp+40] mov ebx, [esp+44] mov esi, [esp+32] mov edi, [esp+36] mov eax, ecx and eax, ebx mov [esp+8], esi mov [esp+12], edi cmp eax, -1 jne .L51 mov eax, edi mov edx, esi xor edi, edi xor esi, esi add eax, -2147483648 or edx, eax jne .L51 add esp, 16 mov eax, esi mov edx, edi pop ebx pop esi pop edi ret .L51: push ebx push ecx push [esp+20] push [esp+20] call __moddi3 add esp, 16 add esp, 16 mov esi, eax mov edi, edx mov eax, esi mov edx, edi pop ebx pop esi pop edi ret sat_signed_mul_64(long long, long long): push ebp push edi push esi push ebx sub esp, 36 mov ebx, [esp+56] mov esi, [esp+64] mov dword ptr [esp], 0 mov edi, [esp+68] mov ecx, [esp+60] mov edx, ebx mov eax, esi sar edx, 31 mov [esp+4], edi sar eax, 31 cmp edx, ecx jne .L59 cmp eax, edi jne .L60 mov eax, ebx imul esi mov ebp, edx .L57: mov edx, [esp] test edx, edx jne .L67 add esp, 36 mov edx, ebp pop ebx pop esi pop edi pop ebp ret .L67: xor ecx, [esp+4] add esp, 36 mov edx, ecx pop ebx pop esi sar edx, 31 pop edi pop ebp mov eax, edx xor edx, 2147483647 not eax ret .L59: cmp eax, [esp+4] jne .L62 mov [esp+8], ebx mov edi, ecx mov ebp, esi mov [esp+12], ecx .L61: mov eax, ebx mul esi mov [esp+24], eax mov eax, edi mov [esp+28], edx mul ebp mov [esp+16], eax mov [esp+20], edx test edi, edi jns .L63 xor eax, eax sub [esp+16], eax sbb [esp+20], ebp .L63: test ebp, ebp jns .L64 mov edi, [esp+8] mov ebp, [esp+12] sub [esp+16], edi sbb [esp+20], ebp .L64: mov edx, [esp+28] mov eax, edx xor edx, edx add 
eax, [esp+16] adc edx, [esp+20] mov edi, eax mov [esp+8], eax sar edi, 31 mov [esp+12], edx cmp edi, edx jne .L65 mov ebp, [esp+8] mov ebx, [esp+24] xor esi, esi mov eax, ebx or ebp, esi jmp .L57 .L60: mov [esp+8], esi mov ebp, ebx mov [esp+12], edi jmp .L61 .L62: mov ebp, ecx mov eax, edi imul eax, ebx imul ebp, esi add ebp, eax mov eax, ebx mul esi add ebp, edx lea edx, [ecx+1] cmp edx, 1 ja .L58 lea edx, [edi+1] cmp edx, 1 ja .L58 cmp ecx, edi jne .L66 xor esi, esi cmp esi, eax sbb esi, ebp jl .L57 .L58: mov dword ptr [esp], 1 jmp .L57 .L66: test ebp, ebp js .L57 jmp .L58 .L65: mov eax, [esp+4] mov ebp, ecx imul ebp, esi imul eax, ebx add ebp, eax mov eax, ebx mul esi add ebp, edx jmp .L58 sat_signed_sub_64(long long, long long): mov eax, [esp+4] mov edx, [esp+8] sub eax, [esp+12] sbb edx, [esp+16] jno .L70 mov eax, edx sar eax, 31 lea edx, [eax-2147483648] .L70: ret326 instructions in 866 bytes, miserably
optimised!
Clang 14.0.0:
# Compilation provided by Compiler Explorer at https://godbolt.org/ sat_unsigned_add_64(unsigned long long, unsigned long long): mov eax, [esp+4] mov edx, [esp+8] add eax, [esp+12] adc edx, [esp+16] mov ecx, -1 cmovb edx, ecx cmovb eax, ecx ret sat_unsigned_div_64(unsigned long long, unsigned long long): sub esp, 12 push [esp+28] push [esp+28] push [esp+28] push [esp+28] call __udivdi3 add esp, 28 ret sat_unsigned_mod_64(unsigned long long, unsigned long long): sub esp, 12 push [esp+28] push [esp+28] push [esp+28] push [esp+28] call __umoddi3 add esp, 28 ret sat_unsigned_mul_64(unsigned long long, unsigned long long): push ebp push ebx push edi push esi mov ebp, [esp+20] mov eax, [esp+24] mov edi, [esp+32] test edi, edi setne dl test eax, eax setne cl and cl, dl mul dword ptr [esp+28] mov esi, eax seto bl mov eax, edi mul ebp seto ch or ch, bl or ch, cl add esi, eax mov eax, ebp mul dword ptr [esp+28] add edx, esi setb cl or cl, ch mov ecx, -1 cmovne eax, ecx cmovne edx, ecx pop esi pop edi pop ebx pop ebp ret sat_unsigned_sub_64(unsigned long long, unsigned long long): mov eax, [esp+4] xor ecx, ecx sub eax, [esp+12] mov edx, [esp+8] sbb edx, [esp+16] cmovb edx, ecx cmovb eax, ecx ret sat_signed_add_64(long long, long long): push ebx push edi push esi mov esi, [esp+16] mov ecx, [esp+20] mov edi, -1 mov eax, -1 add eax, 1 mov edx, 2147483647 adc edx, 0 add esi, [esp+24] mov ebx, 2147483647 adc ecx, [esp+28] cmovs edx, ebx cmovs eax, edi seto bl test bl, bl cmove eax, esi test bl, bl cmove edx, ecx pop esi pop edi pop ebx ret sat_signed_div_64(long long, long long): push edi push esi push eax mov esi, [esp+28] mov edx, [esp+24] mov ecx, [esp+20] mov eax, [esp+16] mov edi, ecx xor edi, -2147483648 or edi, eax jne .LBB6_3 mov edi, edx and edi, esi cmp edi, -1 jne .LBB6_3 mov edx, -2147483648 xor eax, eax add esp, 4 pop esi pop edi ret .LBB6_3: push esi push edx push ecx push eax call __divdi3 add esp, 16 add esp, 4 pop esi pop edi ret sat_signed_mod_64(long long, long 
long): push ebp push ebx push edi push esi sub esp, 12 mov ebx, [esp+44] mov edi, [esp+40] mov esi, [esp+36] mov ecx, [esp+32] mov eax, esi xor eax, -2147483648 or eax, ecx jne .LBB7_2 xor eax, eax mov ebp, edi and ebp, ebx mov edx, 0 cmp ebp, -1 je .LBB7_3 .LBB7_2: push ebx push edi push esi push ecx call __moddi3 add esp, 16 .LBB7_3: add esp, 12 pop esi pop edi pop ebx pop ebp ret sat_signed_mul_64(long long, long long): push ebp push ebx push edi push esi sub esp, 12 mov eax, [esp+40] mov ebp, [esp+44] mov esi, [esp+36] sar esi, 31 mov ecx, eax mul esi mov [esp+4], eax # 4-byte Spill mov edi, edx imul ecx, esi add edi, ecx imul esi, ebp mov ebx, ebp sar ebx, 31 mov eax, ebx mul dword ptr [esp+32] add esi, edi mov ecx, ebx mov edi, [esp+36] imul ecx, edi add edx, ecx mov ecx, [esp+32] imul ebx, ecx add ebx, edx add eax, [esp+4] # 4-byte Folded Reload mov [esp+8], eax # 4-byte Spill adc ebx, esi mov eax, ecx mov ecx, [esp+40] mul ecx mov esi, edx mov [esp+4], eax # 4-byte Spill mov eax, edi mul ecx mov edi, edx add esi, eax adc edi, 0 mov eax, [esp+32] mul ebp mov ecx, edx add esi, eax adc ecx, edi setb byte ptr [esp+3] # 1-byte Folded Spill mov eax, [esp+36] mul ebp add eax, ecx movzx ecx, byte ptr [esp+3] # 1-byte Folded Reload adc edx, ecx add eax, [esp+8] # 4-byte Folded Reload adc edx, ebx mov edi, esi sar edi, 31 xor edx, edi xor edi, eax mov eax, -1 add eax, 1 mov ecx, 2147483647 adc ecx, 0 xor ebp, [esp+36] mov ebx, 2147483647 cmovs ecx, ebx mov ebx, -1 cmovs eax, ebx or edi, edx cmove eax, [esp+4] # 4-byte Folded Reload cmove ecx, esi mov edx, ecx add esp, 12 pop esi pop edi pop ebx pop ebp ret sat_signed_sub_64(long long, long long): push ebx push edi push esi mov esi, [esp+16] mov ecx, [esp+20] mov edi, -1 mov eax, -1 add eax, 1 mov edx, 2147483647 adc edx, 0 sub esi, [esp+24] mov ebx, 2147483647 sbb ecx, [esp+28] cmovs edx, ebx cmovs eax, edi seto bl test bl, bl cmove eax, esi test bl, bl cmove edx, ecx pop esi pop edi pop ebx ret253 instructions in 
676 bytes, also poorly
optimised; especially notice the highlighted parts to load the constants -2**63 = 2**63 = 0x8000000000000000 and 2**63 - 1 = 0x7FFFFFFFFFFFFFFF in a rather weird way!
Use the X.509 certificate to send S/MIME encrypted mail.
Note: email in weird format and without a proper sender name is likely to be discarded!
I dislike
HTML (and even
weirder formats too) in email, I prefer to receive plain text.
I also expect to see your full (real) name as sender, not your
nickname.
I abhor top posts and expect inline quotes in replies.
as is, without any warranty, neither express nor implied.
cookies in the web browser.
The web service is operated and provided by
Telekom Deutschland GmbH. The web service provider stores a session cookie
in the web
browser and records every visit of this web site with the following
data in an access log on their server(s):