saxpy_SSE.c (337B)
#include <stddef.h>
#include <x86intrin.h>

/**
 * Single-precision "a*x plus y" using SSE: y[i] = x[i] * a + y[i].
 *
 * Processes four floats per iteration with 128-bit SSE operations and
 * finishes any remaining 0-3 elements one at a time, so n may be any
 * value (including 0) and neither pointer needs to be 16-byte aligned.
 *
 * @param x  input vector, at least n readable floats
 * @param y  input/output vector, at least n floats; updated in place
 * @param a  scalar multiplier applied to every element of x
 * @param n  number of elements to process
 */
void saxpy_SSE(float *x, float *y, float a, size_t n)
{
    const __m128 a4 = _mm_set1_ps(a);

    /* Largest multiple of 4 not exceeding n; computed by subtraction so it
     * cannot overflow the way an `i + 4 <= n` loop test could. */
    const size_t vec_end = n - (n % 4);
    size_t i = 0;

    for (; i < vec_end; i += 4) {
        /* Unaligned load/store: the aligned forms fault on addresses that
         * are not 16-byte aligned, which callers cannot be assumed to
         * guarantee. */
        const __m128 xv = _mm_loadu_ps(x + i);
        const __m128 yv = _mm_loadu_ps(y + i);
        _mm_storeu_ps(y + i, _mm_add_ps(_mm_mul_ps(xv, a4), yv));
    }

    /* Scalar tail. The single-lane intrinsics keep the multiply and add as
     * two separately rounded operations, exactly like the vector path. */
    for (; i < n; i++) {
        const __m128 xs = _mm_load_ss(x + i);
        const __m128 ys = _mm_load_ss(y + i);
        _mm_store_ss(y + i, _mm_add_ss(_mm_mul_ss(xs, a4), ys));
    }
}