00001 #ifndef Impala_Core_Vector_Chi2Distance_h
00002 #define Impala_Core_Vector_Chi2Distance_h
00003
00004 #include "Core/Vector/VectorTem.h"
00005 #ifdef SSE_USED
00006 #include <xmmintrin.h>
00007 #include <stdint.h>
00008 #ifndef POINTER_ALIGNED
00009 #define POINTER_ALIGNED(x) (!((( intptr_t)x) & 0xF))
00010 #endif
00011 #endif // SSE_USED
00012
00013 namespace Impala
00014 {
00015 namespace Core
00016 {
00017 namespace Vector
00018 {
00019
00020
00021 inline double
00022 Chi2Distance(const Vector::VectorTem<double>& v1,
00023 const Vector::VectorTem<double>& v2)
00024 {
00025 int length = v1.Size();
00026 double accumulator = 0;
00027 for (int i=0 ; i<length ; i++)
00028 {
00029 double sum = v1[i] + v2[i];
00030 if (sum)
00031 accumulator += (v1[i] - v2[i]) * (v1[i] - v2[i]) / sum;
00032 }
00033
00034
00035 return accumulator / 2.0;
00036 }
00037
00038
00039 #ifdef SSE_USED
00040
00041
00042
00043 inline void
00044 DoSlabChi2DistanceSSE(float* C, const float* A, const float* B,
00045 unsigned int slabSizeA, unsigned int slabSizeB,
00046 unsigned int vectorSize)
00047 {
00048 const unsigned int alignFactor = 4;
00049 const int SSELength = IntAlignUp(vectorSize, alignFactor) / alignFactor;
00050 const __m128 zero = _mm_setzero_ps();
00051 const __m128 two = _mm_set1_ps(2.0f);
00052 #pragma omp parallel for
00053 for (unsigned int a = 0; a < slabSizeA; a++)
00054 {
00055 const __m128* baseA = (__m128*) (A + (a * vectorSize));
00056 for (unsigned int b = 0; b < slabSizeB; b++)
00057 {
00058 const __m128* baseB = (__m128*) (B + (b * vectorSize));
00059 __m128 sum = _mm_setzero_ps();
00060 for (unsigned int k = 0; k < SSELength; k++)
00061 {
00062 __m128 As = baseA[k];
00063 __m128 Bs = baseB[k];
00064 __m128 tmp = _mm_sub_ps(As, Bs);
00065
00066 tmp =_mm_div_ps(_mm_mul_ps(tmp, tmp), _mm_add_ps(As, Bs));
00067
00068 tmp = _mm_max_ps(tmp, zero);
00069 sum = _mm_add_ps(sum, tmp);
00070 }
00071 sum = _mm_div_ps(sum, two);
00072
00073
00074
00075
00076
00077 __m128 shuffle;
00078 shuffle = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2));
00079 sum = _mm_add_ps(sum, shuffle) ;
00080 shuffle = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
00081 __m128 result = _mm_add_ps(sum, shuffle) ;
00082 C[b * slabSizeA + a] = _mm_cvtss_f32(result);
00083 }
00084 }
00085 }
00086
00087
00088
00089 inline void
00090 DoSlabChi2DistanceSSE(double* C, const double* A, const double* B,
00091 unsigned int slabSizeA, unsigned int slabSizeB,
00092 unsigned int vectorSize)
00093 {
00094 const unsigned int alignFactor = 2;
00095 const int SSELength = IntAlignUp(vectorSize, alignFactor) / alignFactor;
00096 const __m128d zero = _mm_setzero_pd();
00097 const __m128d two = _mm_set1_pd(2.0);
00098 #pragma omp parallel for
00099 for (unsigned int a = 0; a < slabSizeA; a++)
00100 {
00101 const __m128d* baseA = (__m128d*) (A + (a * vectorSize));
00102 for (unsigned int b = 0; b < slabSizeB; b++)
00103 {
00104 const __m128d* baseB = (__m128d*) (B + (b * vectorSize));
00105 __m128d sum = _mm_setzero_pd();
00106 for (unsigned int k = 0; k < SSELength; k++)
00107 {
00108 __m128d As = baseA[k];
00109 __m128d Bs = baseB[k];
00110 __m128d tmp = _mm_sub_pd(As, Bs);
00111
00112 tmp =_mm_div_pd(_mm_mul_pd(tmp, tmp), _mm_add_pd(As, Bs));
00113
00114 tmp = _mm_max_pd(tmp, zero);
00115 sum = _mm_add_pd(sum, tmp);
00116 }
00117 sum = _mm_div_pd(sum, two);
00118
00119 __m128d shuffle = _mm_shuffle_pd(sum, sum, _MM_SHUFFLE2(0, 1));
00120 __m128d result = _mm_add_pd(sum, shuffle) ;
00121 C[b * slabSizeA + a] = _mm_cvtsd_f64(result);
00122 }
00123 }
00124 }
00125
00126 #endif // SSE_USED
00127
00128
00138 template <class FType>
00139 void
00140 DoSlabChi2DistanceStd(FType* C, const FType* A, const FType* B,
00141 unsigned int slabSizeA, unsigned int slabSizeB,
00142 unsigned int vectorSize)
00143 {
00144 #pragma omp parallel for
00145 for (unsigned int a = 0; a < slabSizeA; a++)
00146 {
00147 for (unsigned int b = 0; b < slabSizeB; b++)
00148 {
00149 const FType* baseA = A + (a * vectorSize);
00150 const FType* baseB = B + (b * vectorSize);
00151 FType sum = 0;
00152 for (unsigned int k = 0; k < vectorSize; k++)
00153 {
00154 FType As = baseA[k];
00155 FType Bs = baseB[k];
00156 if(As == 0)
00157 {
00158 sum += Bs;
00159 }
00160 else
00161 {
00162 sum += (As - Bs) * (As - Bs) / (As + Bs);
00163 }
00164 }
00165 C[b * slabSizeA + a] = (FType)(sum / 2);
00166 }
00167 }
00168 }
00169
00170 template <class FType>
00171 inline void
00172 DoSlabChi2Distance(FType* C, const FType* A, const FType* B,
00173 unsigned int slabSizeA, unsigned int slabSizeB,
00174 unsigned int vectorSize)
00175 {
00176 #ifdef SSE_USED
00177 return DoSlabChi2DistanceSSE(C, A, B, slabSizeA, slabSizeB, vectorSize);
00178 #endif
00179 DoSlabChi2DistanceStd(C, A, B, slabSizeA, slabSizeB, vectorSize);
00180 }
00181
00182 }
00183 }
00184 }
00185
00186 #endif