Question
Twice as slow SIMD performance without extra copy
I've been optimizing some code and stumbled across a peculiar case. Here are the two assembly listings:
; FAST
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3] ;this function only writes 3 components
movaps xmm0,xmmword ptr [rsp+50h]
lea rbx,[rbx+0Ch]
mulps xmm0,xmm6
movlps qword ptr [rbx-0Ch],xmm0
movaps xmmword ptr [rsp+50h],xmm0
extractps eax,xmm0,2
mov dword ptr [rbx-4],eax
; SLOW
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3] ;this function only writes 3 components
movaps xmm0,xmmword ptr [rsp+50h]
lea rbx,[rbx+0Ch]
mulps xmm0,xmm6
movlps qword ptr [rbx-0Ch],xmm0
extractps eax,xmm0,2
mov dword ptr [rbx-4],eax
Both versions are executed 10000 times in a tight loop (the loop code, identical in both cases, is omitted). As you can see, the listings are exactly the same, except for one extra movaps xmmword ptr [rsp+50h],xmm0
instruction in the fast version.
Effectively it's a no-op, because [rsp+50h] is overwritten on the next iteration anyway:
lea rcx,[rsp+50h]
call qword ptr [Random_get_float3]
What's interesting is that the slow version runs twice as slow as the fast one, even though it is the version missing the extra, seemingly useless instruction.
Can someone explain why?
The C++ code (compiled with the MSVC v140 toolset under VS 2022):
#include <immintrin.h>
#include <cmath>    // powf (previously relied on transitive inclusion)
#include <cstdlib>  // rand
#include <cstring>  // memcpy
// Fills vec3[0..2]: reinterprets three consecutive rand() results as raw
// float bit patterns, then replaces each component with its cube root.
// noinline (plus the type-erased call in TestFunc) keeps this an opaque
// indirect call, matching the `call qword ptr [...]` in the listings.
__declspec(noinline) void random_get_float3(float* vec3) {
    for (int i = 0; i < 3; i++) {
        int v = rand();
        // memcpy instead of the original *(float*)&v: the pointer cast
        // violates strict aliasing (undefined behavior in C and C++);
        // memcpy compiles to the identical single 4-byte store, so the
        // repro's codegen is unaffected.
        memcpy(&vec3[i], &v, sizeof(float));
        vec3[i] = powf(vec3[i], 1.0f / 3.0f);
    }
}
// Type-erased pointer to random_get_float3. TestFunc calls through this so
// the optimizer cannot see the callee and must emit an indirect call (the
// `call qword ptr [...]` in the listings), preventing inlining.
void* randomGetFuncPtr = &random_get_float3;
// Not aligned by 16: a 12-byte, 3-component vector, so it cannot be stored
// with a single aligned 16-byte SSE store — TestFunc writes it as an 8-byte
// movlps plus a separate 4-byte scalar store.
struct Vector3 {
float x, y, z;
};
// Length-prefixed heap array of Vector3. The m_Items naming suggests this
// mirrors an il2cpp-style array layout — TODO confirm against the original
// codebase.
struct Vector3Array {
size_t length;
Vector3* m_Items;
};
// One-shot guard: TestFunc allocates randomPositions on its first call only.
static bool inited = false;
// Per-component scale applied to every random vector (loaded into xmm6-ish
// register once, outside the hot loop).
Vector3 scaledRandomPosExtern = Vector3{ 0.5f, 0.5f, 0.5f };
// Destination array; filled sequentially by TestFunc.
Vector3Array randomPositions;
// Experiment switch: defined = SLOW variant (separate v3, no write-back);
// undefined = FAST variant (result written back into v1's stack slot — the
// extra movaps in the listing).
// NOTE(review): identifiers containing "__" are reserved for the
// implementation; a name like SLOW_VARIANT would be safer.
#define __SLOW // comment to enable fast version.
// Trip count kept in a mutable global, presumably so the compiler cannot
// constant-fold the loop bound — TODO confirm intent.
int numObjectsExtern = 10000;
// Repro harness: fills randomPositions with numObjects scaled random vectors.
// The loop body is exactly the code shown in the question's assembly
// listings; __SLOW selects whether the multiply result is kept in a fresh v3
// (SLOW listing) or written back into v1 (FAST listing, which makes the
// compiler re-store v1 to its stack slot at rsp+50h — the extra movaps).
// Do not restructure this function: the exact statement order is what
// produces the two codegen variants under comparison.
void TestFunc()
{
int numObjects = numObjectsExtern;
// Lazy one-time allocation of the 10000-element output array (never freed;
// acceptable for a benchmark repro).
if (!inited) {
randomPositions = {
10000,
new Vector3[10000]
};
inited = true;
}
// Call through a type-erased pointer so the callee stays opaque (indirect
// call, no inlining).
// NOTE(review): the pointer is cast to a signature taking __m128*__restrict,
// but random_get_float3 actually takes float* and writes only 3 of the 4
// lanes — v1's top lane stays uninitialized. Presumably intentional for the
// repro; confirm.
typedef void (*Random_get_float3_fptr) (__m128* __restrict);
Random_get_float3_fptr _il2cpp_icall_func = (Random_get_float3_fptr)randomGetFuncPtr;
Vector3 scaledRandomPos = scaledRandomPosExtern;
// Scale vector in an SSE register (w lane zeroed); hoisted out of the loop.
__m128 scaledRandomPosVec = _mm_setr_ps(scaledRandomPos.x, scaledRandomPos.y, scaledRandomPos.z, 0.0f);
Vector3Array* outputArray = &randomPositions;
// Raw 32-bit view of the (unaligned, 12-byte-stride) Vector3 array.
// NOTE(review): assumes numObjects <= randomPositions.length; no bounds check.
int* items = (int*)&outputArray->m_Items[0];
for (int i = 0; i < numObjects; i++) {
__m128 v1;
// Callee fills v1's low 3 floats via its stack slot (rsp+50h in the
// listings); lane 3 is garbage.
_il2cpp_icall_func(&v1);
#ifdef __SLOW
// SLOW: product lands in a separate v3, so v1's stack slot is never
// re-stored — no movaps write-back in the generated code.
__m128 v3;
v3 = _mm_mul_ps(v1, scaledRandomPosVec);
#define RESVEC v3
#else
// FAST: product assigned back to v1, whose address escaped above, so the
// compiler spills xmm0 back to rsp+50h — the "useless" extra movaps.
v1 = _mm_mul_ps(v1, scaledRandomPosVec);
#define RESVEC v1
#endif
// Store x,y as one 8-byte movlps, then z via extractps + 4-byte mov,
// matching the tail of both listings. _mm_extract_ps returns the lane's
// raw bits as int, which is why items is an int*.
_mm_storel_pi((__m64*)(items), RESVEC);
items[2] = _mm_extract_ps(RESVEC, 2);
items += 3;
}
}
Reproducible on
CPU:
AMD Ryzen 7 3700x Windows 10 19045.3930
Other Ryzen CPUs
Not reproducible on Intel CPUs.