Question
Twice as slow SIMD performance without extra copy
I've been optimizing some code and stumbled across a peculiar case. Here are the two assembly listings:
; FAST
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3] ;this function only writes 3 components
movaps xmm0,xmmword ptr [rsp+50h]
lea rbx,[rbx+0Ch]
mulps xmm0,xmm6
movlps qword ptr [rbx-0Ch],xmm0
movaps xmmword ptr [rsp+50h],xmm0
extractps eax,xmm0,2
mov dword ptr [rbx-4],eax
; SLOW
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3] ;this function only writes 3 components
movaps xmm0,xmmword ptr [rsp+50h]
lea rbx,[rbx+0Ch]
mulps xmm0,xmm6
movlps qword ptr [rbx-0Ch],xmm0
extractps eax,xmm0,2
mov dword ptr [rbx-4],eax
Both versions are executed 10000 times in a tight loop (the loop code, identical in both cases, is omitted). As you can see, the listings are exactly the same, except for one extra movaps xmmword ptr [rsp+50h],xmm0
instruction in the fast version.
Effectively it's a no-op, because [rsp+50h] is overwritten on the next iteration anyway:
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3]
What's interesting is that the slow version runs twice as slow as the fast one, even though it is the version missing the extra, seemingly useless instruction.
Can someone explain why?
The C++ code (compiled with the MSVC v140 toolset under VS 2022):
#include <immintrin.h>
#include <cmath>    // powf (previously relied on transitive inclusion)
#include <cstdlib>  // rand
#include <cstring>  // memcpy
// Fills vec3[0..2]: reinterprets three consecutive rand() results as raw
// float bit patterns, then replaces each component with its cube root.
// noinline (plus the type-erased call in TestFunc) keeps this an opaque
// indirect call, matching the `call qword ptr [...]` in the listings.
__declspec(noinline) void random_get_float3(float* vec3) {
    for (int i = 0; i < 3; i++) {
        int v = rand();
        // memcpy instead of the original *(float*)&v: the pointer cast
        // violates strict aliasing (undefined behavior in C and C++);
        // memcpy compiles to the identical single 4-byte store, so the
        // repro's codegen is unaffected.
        memcpy(&vec3[i], &v, sizeof(float));
        vec3[i] = powf(vec3[i], 1.0f / 3.0f);
    }
}
// Type-erased pointer to random_get_float3. TestFunc calls through this so
// the optimizer cannot see the callee and must emit an indirect call (the
// `call qword ptr [...]` in the listings), preventing inlining.
void* randomGetFuncPtr = &random_get_float3;
// Not aligned by 16: a 12-byte, 3-component vector, so it cannot be stored
// with a single aligned 16-byte SSE store — TestFunc writes it as an 8-byte
// movlps plus a separate 4-byte scalar store.
struct Vector3 {
float x, y, z;
};
// Length-prefixed heap array of Vector3. The m_Items naming suggests this
// mirrors an il2cpp-style array layout — TODO confirm against the original
// codebase.
struct Vector3Array {
size_t length;
Vector3* m_Items;
};
// One-shot guard: TestFunc allocates randomPositions on its first call only.
static bool inited = false;
// Per-component scale applied to every random vector (loaded into xmm6-ish
// register once, outside the hot loop).
Vector3 scaledRandomPosExtern = Vector3{ 0.5f, 0.5f, 0.5f };
// Destination array; filled sequentially by TestFunc.
Vector3Array randomPositions;
// Experiment switch: defined = SLOW variant (separate v3, no write-back);
// undefined = FAST variant (result written back into v1's stack slot — the
// extra movaps in the listing).
// NOTE(review): identifiers containing "__" are reserved for the
// implementation; a name like SLOW_VARIANT would be safer.
#define __SLOW // comment to enable fast version.
// Trip count kept in a mutable global, presumably so the compiler cannot
// constant-fold the loop bound — TODO confirm intent.
int numObjectsExtern = 10000;
// Repro harness: fills randomPositions with numObjects scaled random vectors.
// The loop body is exactly the code shown in the question's assembly
// listings; __SLOW selects whether the multiply result is kept in a fresh v3
// (SLOW listing) or written back into v1 (FAST listing, which makes the
// compiler re-store v1 to its stack slot at rsp+50h — the extra movaps).
// Do not restructure this function: the exact statement order is what
// produces the two codegen variants under comparison.
void TestFunc()
{
int numObjects = numObjectsExtern;
// Lazy one-time allocation of the 10000-element output array (never freed;
// acceptable for a benchmark repro).
if (!inited) {
randomPositions = {
10000,
new Vector3[10000]
};
inited = true;
}
// Call through a type-erased pointer so the callee stays opaque (indirect
// call, no inlining).
// NOTE(review): the pointer is cast to a signature taking __m128*__restrict,
// but random_get_float3 actually takes float* and writes only 3 of the 4
// lanes — v1's top lane stays uninitialized. Presumably intentional for the
// repro; confirm.
typedef void (*Random_get_float3_fptr) (__m128* __restrict);
Random_get_float3_fptr _il2cpp_icall_func = (Random_get_float3_fptr)randomGetFuncPtr;
Vector3 scaledRandomPos = scaledRandomPosExtern;
// Scale vector in an SSE register (w lane zeroed); hoisted out of the loop.
__m128 scaledRandomPosVec = _mm_setr_ps(scaledRandomPos.x, scaledRandomPos.y, scaledRandomPos.z, 0.0f);
Vector3Array* outputArray = &randomPositions;
// Raw 32-bit view of the (unaligned, 12-byte-stride) Vector3 array.
// NOTE(review): assumes numObjects <= randomPositions.length; no bounds check.
int* items = (int*)&outputArray->m_Items[0];
for (int i = 0; i < numObjects; i++) {
__m128 v1;
// Callee fills v1's low 3 floats via its stack slot (rsp+50h in the
// listings); lane 3 is garbage.
_il2cpp_icall_func(&v1);
#ifdef __SLOW
// SLOW: product lands in a separate v3, so v1's stack slot is never
// re-stored — no movaps write-back in the generated code.
__m128 v3;
v3 = _mm_mul_ps(v1, scaledRandomPosVec);
#define RESVEC v3
#else
// FAST: product assigned back to v1, whose address escaped above, so the
// compiler spills xmm0 back to rsp+50h — the "useless" extra movaps.
v1 = _mm_mul_ps(v1, scaledRandomPosVec);
#define RESVEC v1
#endif
// Store x,y as one 8-byte movlps, then z via extractps + 4-byte mov,
// matching the tail of both listings. _mm_extract_ps returns the lane's
// raw bits as int, which is why items is an int*.
_mm_storel_pi((__m64*)(items), RESVEC);
items[2] = _mm_extract_ps(RESVEC, 2);
items += 3;
}
}
Reproducible on
CPU:
AMD Ryzen 7 3700x Windows 10 19045.3930
Other Ryzen CPUs
Not reproducible on Intel CPUs.