I am facing a similar problem. Since everybody says that SSE is so marvelous,
we are trying to put some SSE code into our render engine to speed it up.
But look at the results of the code below (the box is a P4 Xeon @ 1.8 GHz with HT):
annwn:~/sse> ss-g
Proc std:
5020 kticks
Proc std inline:
4320 kticks
Proc sse:
4290 kticks
Proc sse inline:
3890 kticks
So what? Only around 500 kticks gained by switching to SSE? As the Computer
Architecture people at school say, it is something called 'spill code' (did I
write that correctly?). In short: too much SSE but too few registers, so Intel IA-32
turns into crap when you need some indexes: it runs out of registers and has to
copy to and from the stack.
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#if defined(__INTEL_COMPILER)
#include <xmmintrin.h>
#endif
/* Number of times main() repeats each pair of passes over the arrays. */
#define LOOPS 1000
/* Number of point elements allocated for each of the arrays in main(). */
#define SZ 100000
#if defined(__GNUC__) && defined(__SSE__)
/*
 * Type of the temporaries that the GCC code paths of mulp_sse() and
 * mulpi_sse() pass between the __builtin_ia32_* builtins.
 * NOTE(review): V4SF is presumably GCC's "vector of four single floats"
 * machine mode; newer GCC releases may deprecate or reject the mode
 * attribute used this way -- confirm against the compiler actually in use.
 */
typedef void __ve_reg __attribute__((__mode__(V4SF)));
#endif
/*
 * A point holding four float components in one contiguous array.
 * Usable both as `struct point` and through the `point` typedef.
 */
typedef struct point {
    float v[4];    /* the four components, indexed 0..3 */
} point;
/*
 * Scalar reference version: multiplies the four components of *a and *b
 * pairwise and stores each product in the matching component of *r.
 *
 * a, b: input points (either may also be the same object as the other).
 * r:    output point, all four components are overwritten.
 */
void mulp_std(const point* a,const point* b,point* r)
{
    const float *lhs = a->v;
    const float *rhs = b->v;
    float *out = r->v;
    int k = 0;

    while (k < 4) {
        out[k] = lhs[k] * rhs[k];
        k++;
    }
}
/*
 * Inline-candidate twin of mulp_std(): multiplies the four components of
 * *a and *b pairwise and stores the products in *r.
 *
 * Declared `static inline` rather than plain `inline`: under C99/C11 rules a
 * file-scope plain `inline` definition does not provide an external
 * definition, so any call the compiler chooses not to inline (for example
 * when optimization is disabled) would fail at link time. `static` gives the
 * function internal linkage, which works under both gnu89 and C99 semantics.
 *
 * a, b: input points.
 * r:    output point, all four components are overwritten.
 */
static inline void mulpi_std(const point* a,const point* b,point* r)
{
    int i;

    for (i = 0; i < 4; i++) {
        r->v[i] = a->v[i] * b->v[i];
    }
}
/*
 * SSE version of mulp_std(): multiplies the four components of *a and *b
 * with a single packed multiply and stores the result in *r.
 *
 * Exactly one implementation is selected at compile time:
 *   - Intel compiler: <xmmintrin.h> intrinsics. Tested first because a
 *     compiler that also defines __GNUC__ and __SSE__ would otherwise see
 *     xmm0..xmm2 declared twice with different types.
 *   - GCC with SSE enabled: the __builtin_ia32_* builtins.
 *   - anything else: a scalar loop, so that *r is always written instead of
 *     the function silently doing nothing.
 *
 * The unaligned load/store forms are used because point does not request
 * any alignment beyond that of float.
 *
 * a, b: input points.
 * r:    output point, all four components are overwritten.
 */
void mulp_sse(const point* a,const point* b,point* r)
{
#if defined(__INTEL_COMPILER)
    __m128 xmm0, xmm1, xmm2;

    xmm0 = _mm_loadu_ps(a->v);
    xmm1 = _mm_loadu_ps(b->v);
    xmm2 = _mm_mul_ps(xmm0, xmm1);
    _mm_storeu_ps(r->v, xmm2);
#elif defined(__GNUC__) && defined(__SSE__)
    __ve_reg xmm0, xmm1, xmm2;

    /* The casts drop const because the builtin is documented as taking float *. */
    xmm0 = __builtin_ia32_loadups((float*)a->v);
    xmm1 = __builtin_ia32_loadups((float*)b->v);
    xmm2 = __builtin_ia32_mulps(xmm0, xmm1);
    __builtin_ia32_storeups(r->v, xmm2);
#else
    /* No SSE support compiled in: fall back to the scalar computation. */
    int i;

    for (i = 0; i < 4; i++) {
        r->v[i] = a->v[i] * b->v[i];
    }
#endif
}
/*
 * Inline-candidate twin of mulp_sse(): multiplies the four components of
 * *a and *b with a single packed multiply and stores the result in *r.
 *
 * Declared `static inline` rather than plain `inline`: under C99/C11 rules a
 * file-scope plain `inline` definition provides no external definition, so a
 * call that is not inlined would fail at link time.
 *
 * Exactly one implementation is selected at compile time:
 *   - Intel compiler: <xmmintrin.h> intrinsics. Tested first because a
 *     compiler that also defines __GNUC__ and __SSE__ would otherwise see
 *     xmm0..xmm2 declared twice with different types.
 *   - GCC with SSE enabled: the __builtin_ia32_* builtins.
 *   - anything else: a scalar loop, so that *r is always written instead of
 *     the function silently doing nothing.
 *
 * a, b: input points.
 * r:    output point, all four components are overwritten.
 */
static inline void mulpi_sse(const point* a,const point* b,point* r)
{
#if defined(__INTEL_COMPILER)
    __m128 xmm0, xmm1, xmm2;

    xmm0 = _mm_loadu_ps(a->v);
    xmm1 = _mm_loadu_ps(b->v);
    xmm2 = _mm_mul_ps(xmm0, xmm1);
    _mm_storeu_ps(r->v, xmm2);
#elif defined(__GNUC__) && defined(__SSE__)
    __ve_reg xmm0, xmm1, xmm2;

    /* The casts drop const because the builtin is documented as taking float *. */
    xmm0 = __builtin_ia32_loadups((float*)a->v);
    xmm1 = __builtin_ia32_loadups((float*)b->v);
    xmm2 = __builtin_ia32_mulps(xmm0, xmm1);
    __builtin_ia32_storeups(r->v, xmm2);
#else
    /* No SSE support compiled in: fall back to the scalar computation. */
    int i;

    for (i = 0; i < 4; i++) {
        r->v[i] = a->v[i] * b->v[i];
    }
#endif
}
/*
 * Fill n points with finite, normal float values in [base, base + 0.875].
 * The benchmark must not multiply whatever malloc() happened to return:
 * indeterminate bit patterns may be NaNs or denormals, which makes the
 * timings depend on heap garbage rather than on the code under test.
 */
static void fill_points(point *pts, size_t n, float base)
{
    size_t j;
    int k;

    for (j = 0; j < n; j++) {
        for (k = 0; k < 4; k++) {
            pts[j].v[k] = base + (float)((j + (size_t)k) % 8) * 0.125f;
        }
    }
}

/*
 * Print the difference between two clock() readings in thousands of ticks.
 * The readings are kept as unsigned long so the subtraction stays well
 * defined even if the tick counter wrapped between them; the %lu conversion
 * matches that type.
 */
static void print_kticks(unsigned long t0, unsigned long t1)
{
    printf("%10lu kticks\n", (t1 - t0) / 1000);
}

/*
 * Benchmark driver: times LOOPS repetitions of two passes over SZ points
 * (c = a * b, then a = b * b) for each of the four multiply routines and
 * prints the elapsed clock() ticks, in thousands, for each one.
 *
 * The four timing loops are deliberately written out with direct calls so
 * that the inline variants can actually be inlined at the call site.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the arrays cannot be allocated.
 */
int main(int argc, char** argv)
{
    point *a;
    point *b;
    point *c;
    int i,j;
    unsigned long t0,t1;
    int status = EXIT_FAILURE;

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    a = malloc(SZ * sizeof *a);
    b = malloc(SZ * sizeof *b);
    c = malloc(SZ * sizeof *c);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "out of memory\n");
        goto out;
    }

    /* Inputs in [1.0, 1.875]: products stay small, finite and normal. */
    fill_points(a, SZ, 1.0f);
    fill_points(b, SZ, 1.0f);
    fill_points(c, SZ, 0.0f);

    printf("Proc std:\n");
    t0 = (unsigned long)clock();
    for (i = 0; i < LOOPS; i++) {
        for (j = 0; j < SZ; j++) {
            mulp_std(&a[j], &b[j], &c[j]);
        }
        for (j = 0; j < SZ; j++) {
            mulp_std(&b[j], &b[j], &a[j]);
        }
    }
    t1 = (unsigned long)clock();
    print_kticks(t0, t1);

    printf("Proc std inline:\n");
    t0 = (unsigned long)clock();
    for (i = 0; i < LOOPS; i++) {
        for (j = 0; j < SZ; j++) {
            mulpi_std(&a[j], &b[j], &c[j]);
        }
        for (j = 0; j < SZ; j++) {
            mulpi_std(&b[j], &b[j], &a[j]);
        }
    }
    t1 = (unsigned long)clock();
    print_kticks(t0, t1);

    printf("Proc sse:\n");
    t0 = (unsigned long)clock();
    for (i = 0; i < LOOPS; i++) {
        for (j = 0; j < SZ; j++) {
            mulp_sse(&a[j], &b[j], &c[j]);
        }
        for (j = 0; j < SZ; j++) {
            mulp_sse(&b[j], &b[j], &a[j]);
        }
    }
    t1 = (unsigned long)clock();
    print_kticks(t0, t1);

    printf("Proc sse inline:\n");
    t0 = (unsigned long)clock();
    for (i = 0; i < LOOPS; i++) {
        for (j = 0; j < SZ; j++) {
            mulpi_sse(&a[j], &b[j], &c[j]);
        }
        for (j = 0; j < SZ; j++) {
            mulpi_sse(&b[j], &b[j], &a[j]);
        }
    }
    t1 = (unsigned long)clock();
    print_kticks(t0, t1);

    status = EXIT_SUCCESS;

out:
    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(c);
    free(b);
    free(a);
    return status;
}
-- J.A. Magallon <jamagallon@able.es> \ Software is like sex: werewolf.able.es \ It's better when it's free Mandrake Linux release 9.1 (Cooker) for i586 Linux 2.4.21-pre4-jam1 (gcc 3.2.1 (Mandrake Linux 9.1 3.2.1-5mdk)) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/