Impala Documentation

00001 #ifndef Impala_Core_Vector_ReduceSum_h
00002 #define Impala_Core_Vector_ReduceSum_h
00003 
00004 #include "Core/Vector/VectorTem.h"
00005 #ifdef SSE_USED
00006 #include <xmmintrin.h>
00007 #include <stdint.h>
00008 #ifndef POINTER_ALIGNED
00009 #define POINTER_ALIGNED(x) (!((( intptr_t)x) & 0xF))
00010 #endif
00011 #endif // SSE_USED
00012 
00013 namespace Impala
00014 {
00015 namespace Core
00016 {
00017 namespace Vector
00018 {
00019 
00020 
00021 #ifdef SSE_USED
00022 
00023 // assumption: A has size IntAlignUp(n,4) and the pointer is 16-byte aligned
00024 float
00025 ReduceSumSSE(const float* A, unsigned int n)
00026 {
00027     if(!POINTER_ALIGNED(A)) return -1.0f;
00028     __m128 result = _mm_setzero_ps();
00029     __m128 *pA = (__m128*) A;
00030     const int SSELength = IntAlignUp(n, 4) / 4;
00031     for(unsigned int i = 0; i < SSELength; i++)
00032     {
00033         result = _mm_add_ps(result, pA[i]);
00034     }
00035 
00036     // shuffle = [1 0 3 2]
00037     // sum     = [3+1 2+0 1+3 0+2]
00038     // shuffle = [2+0 3+1 0+2 1+3]
00039     // res     = [3+1+2+0 3+1+2+0 1+3+0+2 0+2+1+3]
00040     __m128 shuffle, sum;
00041     shuffle = _mm_shuffle_ps(result, result, _MM_SHUFFLE(1, 0, 3, 2));
00042     sum     = _mm_add_ps(result, shuffle) ;
00043     shuffle = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
00044     result  = _mm_add_ps(sum, shuffle) ;
00045 
00046     return _mm_cvtss_f32(result);
00047 }
00048 
00049 
00050 // assumption: A has size IntAlignUp(n,2) and the pointer is 16-byte aligned
00051 double
00052 ReduceSumSSE(const double* A, unsigned int n)
00053 {
00054     if(!POINTER_ALIGNED(A)) return -1.0;
00055     __m128d result = _mm_setzero_pd();
00056     __m128d *pA = (__m128d*) A;
00057     const int SSELength = IntAlignUp(n, 2) / 2;
00058     for(unsigned int i = 0; i < SSELength; i++)
00059     {
00060         result = _mm_add_pd(result, pA[i]);
00061     }
00062 
00063     __m128d shuffle, sum;
00064     shuffle = _mm_shuffle_pd(result, result, _MM_SHUFFLE2(0, 1));
00065     result  = _mm_add_pd(result, shuffle) ;
00066 
00067     return _mm_cvtsd_f64(result);
00068 }
00069 
00070 #endif // SSE_USED
00071 
00072 
/// Portable (non-SIMD) reduction: adds the first n elements of A, in index
/// order, into a double accumulator and returns it.
///
/// @param A  input array; only elements [0, n) are read
/// @param n  number of elements to add
/// @return   the accumulated sum (0.0 when n is 0)
template <class FType>
inline double
ReduceSumStd(const FType* A, unsigned int n)
{
    double total = 0.0;
    const FType* const stop = A + n;
    for (const FType* cursor = A; cursor != stop; ++cursor)
    {
        total += *cursor;
    }
    return total;
}
00084 
/// Sum the first n elements of A, choosing the implementation at compile
/// time.
///
/// With SSE_USED defined the call is forwarded to ReduceSumSSE, which this
/// header overloads for float and double only and which carries its own
/// alignment/padding preconditions. Otherwise the portable ReduceSumStd
/// loop is used.
///
/// The non-SSE branch previously called ReduceSum itself and returned
/// nothing, i.e. it recursed without end; it now returns the ReduceSumStd
/// result.
///
/// @param A  input array
/// @param n  number of elements
/// @return   the sum, widened to double
template <class FType>
inline double
ReduceSum(const FType* A, unsigned int n)
{
#ifdef SSE_USED
    return ReduceSumSSE(A, n);
#else
    return ReduceSumStd(A, n);
#endif
}
00094 
00095 } // namespace Vector
00096 } // namespace Core
00097 } // namespace Impala
00098 
00099 #endif
ReduceSum.h