Question
Is there a reason that 8 byte std::array comparisons seem to be producing different assembly for char vs. std::byte?
I noticed that 8-byte `std::array` comparisons seem to produce assembly different from `std::bit_cast`ing. GCC seems to do what I expect for a `char` array, but clang generates an extra `mov` instruction (spilling the by-value `array<>` arg from an 8-byte register to the red zone, but still comparing the register arg with the memory pointed to by the other arg).
In the `std::byte` case we get 8 separate single-byte `cmp` instructions vs. a single efficient qword compare for `array<char>`. I'm curious whether there is a reason for this difference.
#include <array>
#include <bit>
#include <cstdint>
// Compares two 8-byte std::byte arrays with std::array's operator==.
// Produces completely different asm than the other 2 functions: in the
// listings below both compilers compare the arrays one byte at a time.
// NOTE(review): the listings below label this function with both arrays passed
// by value (and read both from registers), whereas `p` is a const reference
// here -- confirm which signature actually generated them.
// NOTE(review): std::byte is declared in <cstddef>, which is not included
// directly above -- presumably it arrives transitively; confirm.
bool compare1(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
return p == r;
}
// Same operator== comparison as compare1, but with char elements.
// Seems to be similar to bit_casting (a single 8-byte compare in the listings
// below), but clang generates 1 more instruction: a store of the by-value
// argument to [rsp - 8].
bool compare2(const std::array<char, 8> &p, std::array<char, 8> r)
{
return p == r;
}
// Compares the two arrays by viewing each 8-byte object as a single 64-bit
// integer via std::bit_cast and comparing those integers.
// Same assembly if you use char instead of byte.
// The integer type is spelled std::uint64_t because <cstdint> only guarantees
// the std::-qualified name; the global-namespace uint64_t is optional.
bool compare3(const std::array<std::byte, 8> &p, std::array<std::byte, 8> r)
{
    return std::bit_cast<std::uint64_t>(p) == std::bit_cast<std::uint64_t>(r);
}
clang asm:
# clang output for compare1: byte-by-byte equality test of the two 8-byte
# values held in rdi and rsi (no memory operand is read anywhere), with the
# boolean result returned in al. After every compare, al is set from the
# result and a mismatch exits early through .LBB0_8 with al already 0.
compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>): # @compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>)
# Byte 0: the low bytes of rdi and rsi. sete leaves the flags untouched, so
# the jne that follows still tests the cmp.
cmp dil, sil
sete al
jne .LBB0_8
# Byte 1: shift a copy of each value right by 8 so the byte lands in al / cl.
mov eax, edi
shr eax, 8
mov ecx, esi
shr ecx, 8
cmp al, cl
sete al
jne .LBB0_8
# Byte 2 (shift by 16).
mov eax, edi
shr eax, 16
mov ecx, esi
shr ecx, 16
cmp al, cl
sete al
jne .LBB0_8
# Byte 3 (shift by 24); last byte reachable through the 32-bit registers.
mov eax, edi
shr eax, 24
mov ecx, esi
shr ecx, 24
cmp al, cl
sete al
jne .LBB0_8
# Byte 4 (shift by 32); from here on the full 64-bit registers are copied.
mov rax, rdi
shr rax, 32
mov rcx, rsi
shr rcx, 32
cmp al, cl
sete al
jne .LBB0_8
# Byte 5 (shift by 40).
mov rax, rdi
shr rax, 40
mov rcx, rsi
shr rcx, 40
cmp al, cl
sete al
jne .LBB0_8
# Byte 6 (shift by 48).
mov rax, rdi
shr rax, 48
mov rcx, rsi
shr rcx, 48
cmp al, cl
sete al
jne .LBB0_8
# Byte 7: xor the two values and keep only the top byte; the shift sets ZF
# exactly when that byte of the xor is zero, i.e. when the top bytes match.
xor rdi, rsi
shr rdi, 56
sete al
# Common exit: al holds the result of the last comparison performed.
.LBB0_8:
ret
# clang output for compare2: one 8-byte compare of the memory at [rdi] against
# the value in rsi, with the equality result returned in al.
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>): # @compare2(std::array<char, 8ul> const&, std::array<char, 8ul>)
# Store rsi just below the stack pointer. Nothing in this listing reads the
# slot back; the compare below uses rsi directly. This is the extra
# instruction the question asks about.
mov qword ptr [rsp - 8], rsi
cmp qword ptr [rdi], rsi
sete al
ret
# clang output for compare3: one 8-byte compare of the memory at [rdi] against
# the value in rsi, with the equality result returned in al. No store to the
# stack is emitted here.
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>): # @compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>)
cmp qword ptr [rdi], rsi
sete al
ret
gcc asm:
# GCC output for compare1: byte-by-byte equality test of the two 8-byte values
# held in rdi and rsi (no memory operand is read anywhere). Any mismatch jumps
# to .L9, which returns 0; only the final byte's result is produced with sete.
compare1(std::array<std::byte, 8ul>, std::array<std::byte, 8ul>):
# Keep a copy of each value: rdx = first value, rax = second value. rsi is
# reused as scratch further down, so the second value lives in rax from here.
mov rdx, rdi
mov rax, rsi
# Byte 0: low bytes of the two values.
cmp sil, dil
jne .L9
# Byte 1: read through the high-byte registers ah (second) and dh (first).
movzx ecx, ah
cmp dh, cl
jne .L9
# Byte 2: shift copies right by 16 so the byte lands in sil / cl.
mov rsi, rdi
mov rcx, rax
shr rsi, 16
shr rcx, 16
cmp sil, cl
jne .L9
# Byte 3 (shift by 24).
mov rsi, rdi
mov rcx, rax
shr rsi, 24
shr rcx, 24
cmp sil, cl
jne .L9
# Byte 4 (shift by 32).
mov rsi, rdi
mov rcx, rax
shr rsi, 32
shr rcx, 32
cmp sil, cl
jne .L9
# Byte 5 (shift by 40).
mov rsi, rdi
mov rcx, rax
shr rsi, 40
shr rcx, 40
cmp sil, cl
jne .L9
# Byte 6 (shift by 48).
mov rsi, rdi
mov rcx, rax
shr rsi, 48
shr rcx, 48
cmp sil, cl
jne .L9
# Byte 7: shift the saved copies down by 56 and compare; al = 1 when equal.
shr rdx, 56
shr rax, 56
cmp dl, al
sete al
ret
# Mismatch exit: return 0.
.L9:
xor eax, eax
ret
# GCC output for compare2: one 8-byte compare of the memory at [rdi] against
# the value in rsi, with the equality result returned in al.
compare2(std::array<char, 8ul> const&, std::array<char, 8ul>):
cmp QWORD PTR [rdi], rsi
sete al
ret
# GCC output for compare3: one 8-byte compare of the memory at [rdi] against
# the value in rsi, with the equality result returned in al. The instructions
# are the same as GCC's compare2 listing.
compare3(std::array<std::byte, 8ul> const&, std::array<std::byte, 8ul>):
cmp QWORD PTR [rdi], rsi
sete al
ret