This is not for a single matrix(3,3) x vector(3) product, but for many of them (one per pixel):
void PF::ICCTransform::apply(float* redin, float* greenin, float* bluein, float* redout, float* greenout, float* blueout, int n)
{
if( is_rgb2rgb ) {
/* std::cout<<"ICCTransform::apply(): in="<<(void*)in<<" out="<<(void*)out<<std::endl;
size_t addr = (size_t)in;
float faddr = (float)addr;
printf(" %f / 16 = %f\n",faddr,faddr/16);*/
// float* in2 = in; float* out2 = out;
// for(int i = 0; i < n; i++) {
// out2[0] = rgb2rgb[0][0]*in2[0] + rgb2rgb[0][1]*in2[1] + rgb2rgb[0][2]*in2[2];
// out2[1] = rgb2rgb[1][0]*in2[0] + rgb2rgb[1][1]*in2[1] + rgb2rgb[1][2]*in2[2];
// out2[2] = rgb2rgb[2][0]*in2[0] + rgb2rgb[2][1]*in2[1] + rgb2rgb[2][2]*in2[2];
// in2 += 3; out2 += 3;
// }
#ifdef __SSE2__
__m128 rgb2rgbv[3][3];
for(int i = 0; i < 3; i++) {
for(int j = 0; j < 3; j++) {
rgb2rgbv[i][j] = _mm_set1_ps(rgb2rgb[i][j]);
}
}
#endif
int i = 0;
#ifdef __SSE2__
for(; i < n - 3; i += 4) {
__m128 redv = _mm_loadu_ps(&redin[i]);
__m128 greenv = _mm_loadu_ps(&greenin[i]);
__m128 bluev = _mm_loadu_ps(&bluein[i]);
_mm_storeu_ps(&redout[i], rgb2rgbv[0][0]*redv + rgb2rgbv[0][1]*greenv + rgb2rgbv[0][2]*bluev);
_mm_storeu_ps(&greenout[i], rgb2rgbv[1][0]*redv + rgb2rgbv[1][1]*greenv + rgb2rgbv[1][2]*bluev);
_mm_storeu_ps(&blueout[i], rgb2rgbv[2][0]*redv + rgb2rgbv[2][1]*greenv + rgb2rgbv[2][2]*bluev);
}
#endif // __SSE2__
for(; i < n; i++) { // remaining pixels if n % 4 != 0
redout[i] = rgb2rgb[0][0]*redin[i] + rgb2rgb[0][1]*greenin[i] + rgb2rgb[0][2]*bluein[i];
greenout[i] = rgb2rgb[1][0]*redin[i] + rgb2rgb[1][1]*greenin[i] + rgb2rgb[1][2]*bluein[i];
blueout[i] = rgb2rgb[2][0]*redin[i] + rgb2rgb[2][1]*greenin[i] + rgb2rgb[2][2]*bluein[i];
}
return;
//std::cout<<"out(1): "<<out[0]<<","<<out[1]<<","<<out[2]<<std::endl;
}