PhotoFlow optimizations and benchmarks

This is not meant for a single matrix(3,3) x vector(3) product, but for many of them at once:

/**
 * Applies the 3x3 matrix `rgb2rgb` to `n` pixels stored as three separate
 * planes (one array per channel), i.e. many matrix(3,3) x vector(3) products
 * in one call.
 *
 * For every index i in [0, n):
 *   redout[i]   = rgb2rgb[0][0]*redin[i] + rgb2rgb[0][1]*greenin[i] + rgb2rgb[0][2]*bluein[i]
 *   greenout[i] = rgb2rgb[1][0]*redin[i] + rgb2rgb[1][1]*greenin[i] + rgb2rgb[1][2]*bluein[i]
 *   blueout[i]  = rgb2rgb[2][0]*redin[i] + rgb2rgb[2][1]*greenin[i] + rgb2rgb[2][2]*bluein[i]
 *
 * When __SSE2__ is defined, pixels are processed four at a time with
 * unaligned loads/stores (no alignment requirement on the buffers); the
 * remaining n % 4 pixels go through the scalar loop.
 *
 * All three input values of a pixel (or group of four pixels) are read
 * before any output is stored, so an output plane may be the very same
 * buffer as an input plane. Partially overlapping buffers are not handled.
 *
 * @param redin, greenin, bluein     input planes, at least n floats each
 * @param redout, greenout, blueout  output planes, at least n floats each
 * @param n                          number of pixels to convert
 */
void PF::ICCTransform::apply(float* redin, float* greenin, float* bluein, float* redout, float* greenout, float* blueout, int n)
{
  if( is_rgb2rgb ) {
    // Output planes indexed by matrix row, so each row is written by the same code.
    float* const out[3] = { redout, greenout, blueout };
    int i = 0;
#ifdef __SSE2__
    // Broadcast each matrix coefficient to all four lanes once, outside the pixel loop.
    __m128 rgb2rgbv[3][3];
    for(int row = 0; row < 3; row++) {
        for(int col = 0; col < 3; col++) {
            rgb2rgbv[row][col] = _mm_set1_ps(rgb2rgb[row][col]);
        }
    }
    for(; i < n - 3; i += 4) {
        const __m128 redv = _mm_loadu_ps(&redin[i]);
        const __m128 greenv = _mm_loadu_ps(&greenin[i]);
        const __m128 bluev = _mm_loadu_ps(&bluein[i]);
        for(int row = 0; row < 3; row++) {
            // Explicit intrinsics instead of operators on __m128: the operator
            // form is a GNU vector extension and is rejected by other compilers.
            // Summation order matches the scalar loop: (m0*r + m1*g) + m2*b.
            const __m128 sumv = _mm_add_ps(
                _mm_add_ps(_mm_mul_ps(rgb2rgbv[row][0], redv),
                           _mm_mul_ps(rgb2rgbv[row][1], greenv)),
                _mm_mul_ps(rgb2rgbv[row][2], bluev));
            _mm_storeu_ps(out[row] + i, sumv);
        }
    }
#endif // __SSE2__
    for(; i < n; i++) { // remaining pixels if n % 4 != 0 (or all pixels without SSE2)
        // Read the whole input pixel first so that in-place operation
        // (output plane == input plane) does not see already-converted values.
        const float r = redin[i];
        const float g = greenin[i];
        const float b = bluein[i];
        for(int row = 0; row < 3; row++) {
            out[row][i] = rgb2rgb[row][0]*r + rgb2rgb[row][1]*g + rgb2rgb[row][2]*b;
        }
    }

    return;
  }
  // NOTE(review): only the matrix (is_rgb2rgb) path is part of this snippet;
  // nothing is written to the output planes when is_rgb2rgb is false.
}
1 Like