00001 #ifndef Impala_Core_Vector_ReduceSum_h
00002 #define Impala_Core_Vector_ReduceSum_h
00003
00004 #include "Core/Vector/VectorTem.h"
00005 #ifdef SSE_USED
#include <emmintrin.h>
#include <stdint.h>
#include <xmmintrin.h>
00008 #ifndef POINTER_ALIGNED
00009 #define POINTER_ALIGNED(x) (!((( intptr_t)x) & 0xF))
00010 #endif
00011 #endif // SSE_USED
00012
00013 namespace Impala
00014 {
00015 namespace Core
00016 {
00017 namespace Vector
00018 {
00019
00020
00021 #ifdef SSE_USED
00022
00023
/// Sums the first n floats of A using SSE packed additions.
///
/// Complete groups of four elements are accumulated lane-wise with unaligned
/// loads, so A does not need to be 16-byte aligned. The remaining n % 4
/// elements are added one by one, which means no memory beyond A[n - 1] is
/// ever read.
///
/// Declared inline because the definition lives in a header; without it every
/// translation unit including this file would emit its own external definition.
///
/// @param A  pointer to at least n readable floats (not dereferenced if n is 0)
/// @param n  number of elements to sum
/// @return   the sum of A[0] .. A[n - 1]; 0 when n is 0. Because partial sums
///           are kept per lane, rounding can differ from a strictly sequential
///           summation.
inline float
ReduceSumSSE(const float* A, unsigned int n)
{
    // Number of elements covered by complete 4-float vectors.
    const unsigned int vectorEnd = n & ~3u;

    __m128 lanes = _mm_setzero_ps();
    for (unsigned int i = 0; i < vectorEnd; i += 4)
    {
        lanes = _mm_add_ps(lanes, _mm_loadu_ps(A + i));
    }

    // Horizontal reduction: first add the upper pair onto the lower pair,
    // then add the two neighbouring lanes, leaving the total in lane 0.
    __m128 swapped = _mm_shuffle_ps(lanes, lanes, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128 pairs = _mm_add_ps(lanes, swapped);
    swapped = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(2, 3, 0, 1));
    float total = _mm_cvtss_f32(_mm_add_ps(pairs, swapped));

    // Scalar tail for the elements that do not fill a whole vector.
    for (unsigned int i = vectorEnd; i < n; ++i)
    {
        total += A[i];
    }
    return total;
}
00048
00049
00050
/// Sums the first n doubles of A using SSE2 packed additions.
///
/// Complete pairs of elements are accumulated lane-wise with unaligned loads,
/// so A does not need to be 16-byte aligned. If n is odd, the last element is
/// added separately, which means no memory beyond A[n - 1] is ever read.
///
/// Declared inline because the definition lives in a header; without it every
/// translation unit including this file would emit its own external definition.
///
/// @param A  pointer to at least n readable doubles (not dereferenced if n is 0)
/// @param n  number of elements to sum
/// @return   the sum of A[0] .. A[n - 1]; 0 when n is 0. Because partial sums
///           are kept per lane, rounding can differ from a strictly sequential
///           summation.
inline double
ReduceSumSSE(const double* A, unsigned int n)
{
    // Number of elements covered by complete 2-double vectors.
    const unsigned int vectorEnd = n & ~1u;

    __m128d lanes = _mm_setzero_pd();
    for (unsigned int i = 0; i < vectorEnd; i += 2)
    {
        lanes = _mm_add_pd(lanes, _mm_loadu_pd(A + i));
    }

    // Horizontal reduction: swap the two lanes and add, total ends up in lane 0.
    const __m128d swapped = _mm_shuffle_pd(lanes, lanes, _MM_SHUFFLE2(0, 1));
    double total = _mm_cvtsd_f64(_mm_add_pd(lanes, swapped));

    // At most one element is left over when n is odd.
    if (vectorEnd < n)
    {
        total += A[vectorEnd];
    }
    return total;
}
00069
00070 #endif // SSE_USED
00071
00072
/// Scalar reference summation: adds A[0] .. A[n - 1] front to back into a
/// double accumulator.
///
/// @param A  pointer to the first of n elements
/// @param n  number of elements to sum
/// @return   the accumulated sum as a double; 0.0 when n is 0
template <class FType>
inline double
ReduceSumStd(const FType* A, unsigned int n)
{
    double total = 0.0;
    const FType* const stop = A + n;
    for (const FType* cursor = A; cursor != stop; ++cursor)
    {
        total += *cursor;
    }
    return total;
}
00084
/// Sums the first n elements of A, choosing the implementation at compile time.
///
/// When SSE_USED is defined the call is forwarded to ReduceSumSSE, for which
/// this file provides float and double overloads. Otherwise the scalar
/// ReduceSumStd is used. (The previous non-SSE path called ReduceSum itself and
/// never returned a value.)
///
/// @param A  pointer to the first of n elements
/// @param n  number of elements to sum
/// @return   the sum converted to double
template <class FType>
inline double
ReduceSum(const FType* A, unsigned int n)
{
#ifdef SSE_USED
    return ReduceSumSSE(A, n);
#else
    return ReduceSumStd(A, n);
#endif
}
00094
00095 }
00096 }
00097 }
00098
00099 #endif