Impala Documentation

00001 #ifndef Impala_Core_Vector_Chi2Distance_h
00002 #define Impala_Core_Vector_Chi2Distance_h
00003 
00004 #include "Core/Vector/VectorTem.h"
00005 #ifdef SSE_USED
00006 #include <xmmintrin.h>
00007 #include <stdint.h>
00008 #ifndef POINTER_ALIGNED
00009 #define POINTER_ALIGNED(x) (!((( intptr_t)x) & 0xF))
00010 #endif
00011 #endif // SSE_USED
00012 
00013 namespace Impala
00014 {
00015 namespace Core
00016 {
00017 namespace Vector
00018 {
00019 
00020 
00021 inline double
00022 Chi2Distance(const Vector::VectorTem<double>& v1,
00023              const Vector::VectorTem<double>& v2)
00024 {
00025     int length = v1.Size();
00026     double accumulator = 0;
00027     for (int i=0 ; i<length ; i++)
00028     {
00029         double sum = v1[i] + v2[i];
00030         if (sum)
00031             accumulator += (v1[i] - v2[i]) * (v1[i] - v2[i]) / sum;
00032     }
00033     // make it between 0..1, instead of 0..2 (this statement is only true if
00034     // the input vectors sum to one)
00035     return accumulator / 2.0;
00036 }
00037 
00038 
00039 #ifdef SSE_USED
00040 
00041 // assumption: vectorSize is a multiple of 4 (or at least the memory is big
00042 // enough/zero-padded), pointers are 16-byte aligned
00043 inline void
00044 DoSlabChi2DistanceSSE(float* C, const float* A, const float* B,
00045                       unsigned int slabSizeA, unsigned int slabSizeB,
00046                       unsigned int vectorSize)
00047 {
00048     const unsigned int alignFactor = 4;
00049     const int SSELength = IntAlignUp(vectorSize, alignFactor) / alignFactor;
00050     const __m128 zero = _mm_setzero_ps();
00051     const __m128 two = _mm_set1_ps(2.0f);
00052     #pragma omp parallel for
00053     for (unsigned int a = 0; a < slabSizeA; a++)
00054     {
00055         const __m128* baseA = (__m128*) (A + (a * vectorSize));
00056         for (unsigned int b = 0; b < slabSizeB; b++)
00057         {
00058             const __m128* baseB = (__m128*) (B + (b * vectorSize));
00059             __m128 sum = _mm_setzero_ps();
00060             for (unsigned int k = 0; k < SSELength; k++)
00061             {
00062                 __m128 As = baseA[k];
00063                 __m128 Bs = baseB[k];
00064                 __m128 tmp = _mm_sub_ps(As, Bs);
00065                 // sum += (As - Bs) * (As - Bs) / (As + Bs);
00066                 tmp =_mm_div_ps(_mm_mul_ps(tmp, tmp), _mm_add_ps(As, Bs));
00067                 // fix the NaN for 0/0
00068                 tmp = _mm_max_ps(tmp, zero);
00069                 sum = _mm_add_ps(sum, tmp);
00070             }
00071             sum = _mm_div_ps(sum, two);
00072             // add the components of sum together
00073             // shuffle = [1 0 3 2]
00074             // sum     = [3+1 2+0 1+3 0+2]
00075             // shuffle = [2+0 3+1 0+2 1+3]
00076             // res     = [3+1+2+0 3+1+2+0 1+3+0+2 0+2+1+3]
00077             __m128 shuffle;
00078             shuffle = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2));
00079             sum     = _mm_add_ps(sum, shuffle) ;
00080             shuffle = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
00081             __m128 result  = _mm_add_ps(sum, shuffle) ;
00082             C[b * slabSizeA + a] = _mm_cvtss_f32(result);
00083         }
00084     }
00085 }
00086 
00087 // assumption: vectorSize is a multiple of 4 (or at least the memory is big
00088 // enough/zero-padded), pointers are 16-byte aligned
00089 inline void
00090 DoSlabChi2DistanceSSE(double* C, const double* A, const double* B,
00091                       unsigned int slabSizeA, unsigned int slabSizeB,
00092                       unsigned int vectorSize)
00093 {
00094     const unsigned int alignFactor = 2;
00095     const int SSELength = IntAlignUp(vectorSize, alignFactor) / alignFactor;
00096     const __m128d zero = _mm_setzero_pd();
00097     const __m128d two = _mm_set1_pd(2.0);
00098     #pragma omp parallel for
00099     for (unsigned int a = 0; a < slabSizeA; a++)
00100     {
00101         const __m128d* baseA = (__m128d*) (A + (a * vectorSize));
00102         for (unsigned int b = 0; b < slabSizeB; b++)
00103         {
00104             const __m128d* baseB = (__m128d*) (B + (b * vectorSize));
00105             __m128d sum = _mm_setzero_pd();
00106             for (unsigned int k = 0; k < SSELength; k++)
00107             {
00108                 __m128d As = baseA[k];
00109                 __m128d Bs = baseB[k];
00110                 __m128d tmp = _mm_sub_pd(As, Bs);
00111                 // sum += (As - Bs) * (As - Bs) / (As + Bs);
00112                 tmp =_mm_div_pd(_mm_mul_pd(tmp, tmp), _mm_add_pd(As, Bs));
00113                 // fix the NaN for 0/0
00114                 tmp = _mm_max_pd(tmp, zero);
00115                 sum = _mm_add_pd(sum, tmp);
00116             }
00117             sum = _mm_div_pd(sum, two);
00118             // add the components of sum together
00119             __m128d shuffle = _mm_shuffle_pd(sum, sum, _MM_SHUFFLE2(0, 1));
00120             __m128d result  = _mm_add_pd(sum, shuffle) ;
00121             C[b * slabSizeA + a] = _mm_cvtsd_f64(result);
00122         }
00123     }
00124 }
00125 
00126 #endif // SSE_USED
00127 
00128 
00138 template <class FType>
00139 void
00140 DoSlabChi2DistanceStd(FType* C, const FType* A, const FType* B,
00141                       unsigned int slabSizeA, unsigned int slabSizeB,
00142                       unsigned int vectorSize)
00143 {
00144     #pragma omp parallel for
00145     for (unsigned int a = 0; a < slabSizeA; a++)
00146     {
00147         for (unsigned int b = 0; b < slabSizeB; b++)
00148         {
00149             const FType* baseA = A + (a * vectorSize);
00150             const FType* baseB = B + (b * vectorSize);
00151             FType sum = 0;
00152             for (unsigned int k = 0; k < vectorSize; k++)
00153             {
00154                 FType As = baseA[k];
00155                 FType Bs = baseB[k];
00156                 if(As == 0)
00157                 {
00158                     sum += Bs;
00159                 }
00160                 else
00161                 {
00162                     sum += (As - Bs) * (As - Bs) / (As + Bs);
00163                 }
00164             }
00165             C[b * slabSizeA + a] = (FType)(sum / 2);
00166         }
00167     }
00168 }
00169 
00170 template <class FType>
00171 inline void
00172 DoSlabChi2Distance(FType* C, const FType* A, const FType* B,
00173                    unsigned int slabSizeA, unsigned int slabSizeB,
00174                    unsigned int vectorSize)
00175 {
00176 #ifdef SSE_USED
00177     return DoSlabChi2DistanceSSE(C, A, B, slabSizeA, slabSizeB, vectorSize);
00178 #endif
00179     DoSlabChi2DistanceStd(C, A, B, slabSizeA, slabSizeB, vectorSize);
00180 }
00181 
00182 } // namespace Vector
00183 } // namespace Core
00184 } // namespace Impala
00185 
00186 #endif
Chi2Distance.h