diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 06917dc..496d950 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -42,11 +42,17 @@ CCL_NAMESPACE_BEGIN
 /* Precalculated data for the ray->tri intersection. */
 typedef struct IsectPrecalc {
+#ifndef __KERNEL_AVX__
 	/* Maximal dimension kz, and orthogonal dimensions. */
 	int kx, ky, kz;
 	/* Shear constants. */
 	float Sx, Sy, Sz;
+#else
+	/* Same for the vectorized intersector. */
+	ssei k;
+	ssef S;
+#endif
 } IsectPrecalc;
 /* Workaround for CUDA toolkit 6.5.16. */
@@ -78,14 +84,23 @@ void triangle_intersect_precalc(float3 dir,
 	/* Calculate the shear constants. */
 	float inf_dir_z = 1.0f / IDX(dir, kz);
-	isect_precalc->Sx = IDX(dir, kx) * inf_dir_z;
-	isect_precalc->Sy = IDX(dir, ky) * inf_dir_z;
-	isect_precalc->Sz = inf_dir_z;
+	float Sx = IDX(dir, kx) * inf_dir_z;
+	float Sy = IDX(dir, ky) * inf_dir_z;
+	float Sz = inf_dir_z;
+
+	/* Store the dimensions and the shear constants. */
+#ifndef __KERNEL_AVX__
+	isect_precalc->Sx = Sx;
+	isect_precalc->Sy = Sy;
+	isect_precalc->Sz = Sz;
-	/* Store the dimensions. */
 	isect_precalc->kx = kx;
 	isect_precalc->ky = ky;
 	isect_precalc->kz = kz;
+#else
+	isect_precalc->k = ssei(kx, ky, kz, 0);
+	isect_precalc->S = ssef(Sx, Sy, Sz, 0);
+#endif
 }
 /* TODO(sergey): Make it general utility function. */
@@ -103,6 +118,65 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
                                           int object,
                                           int triAddr)
 {
+#ifdef __KERNEL_AVX__
+	/* Calculate vertices relative to ray origin. */
+	const sse3f tri(kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 0),
+	                kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 1),
+	                kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 2));
+
+	const ssef vP = load4f(P);
+
+	const ssef A = tri.x - vP;
+	const ssef B = tri.y - vP;
+	const ssef C = tri.z - vP;
+
+	const ssei vk = isect_precalc->k;
+	const ssef A_k = shuffle(A, vk);
+	const ssef B_k = shuffle(B, vk);
+	const ssef C_k = shuffle(C, vk);
+
+	/* Perform shear and scale of vertices. */
+	const avxf ABC_kxy(shuffle<0, 1, 0, 1>(A_k, B_k), C_k);    /* Pack A_kx, A_ky, B_kx, B_ky, C_kx, C_ky, _, _. */
+	const avxf Sxy(shuffle<0, 1, 0, 1>(isect_precalc->S));     /* Pack Sx, Sy, Sx, Sy, Sx, Sy, _, _. */
+	const avxf ABC_kz(shuffle<2>(A_k, B_k), shuffle<2>(C_k));  /* Pack A_kz, A_kz, B_kz, B_kz, C_kz, C_kz, _, _. */
+	const avxf ABC_xy = nmadd(Sxy, ABC_kz, ABC_kxy);           /* Pack Ax, Ay, Bx, By, Cx, Cy. */
+
+	/* Calculate scaled barycentric coordinates. */
+	/* Pack Cy, Cx, Ay, Ax, By, Bx, _, _. */
+#ifdef __KERNEL_AVX2__
+	const avxf CAB_yx = shuffle<5, 4, 1, 0, 3, 2, 3, 2>(ABC_xy);
+#else
+	const avxf CAB_yx = shuffle<0, 2>(shuffle<1, 0, 1, 0>(shuffle<1, 0>(ABC_xy), ABC_xy), shuffle<3, 2, 3, 2>(ABC_xy));
+#endif
+
+	/* Get packed result in V, W, _, _, U, _, _, _. */
+	const avxf VWU = hsub(ABC_xy * CAB_yx);
+	const ssef VWU1 = extract<0>(VWU), VWU2 = extract<1>(VWU);
+	const ssef vU = shuffle<0>(VWU2);
+	const ssef vV = shuffle<0>(VWU1);
+	const ssef vW = shuffle<1>(VWU1);
+
+	if(movemask((vU ^ vV) | (vU ^ vW))) {
+		return false;
+	}
+
+	/* Calculate determinant. */
+	float det = extract<0>(vU + vV + vW);
+	if(UNLIKELY(det == 0.0f)) {
+		return false;
+	}
+
+	const ssef UVW = shuffle<0, 2, 0, 2>(shuffle<0, 1, 0, 1>(vU, vV), vW);
+	const float U = extract<0>(vU), V = extract<0>(vV);
+
+	/* Calculate scaled z-coordinates of vertices and use them to calculate
+	 * the hit distance.
+	 */
+	const ssef ABC_k_z = shuffle<0, 2, 2, 2>(shuffle<2>(A_k, B_k), C_k);
+	const float T = extract<2>(isect_precalc->S) * dot3(UVW, ABC_k_z);
+
+	int sign_mask = (__float_as_int(U) & 0x80000000);
+#else
 	const int kx = isect_precalc->kx;
 	const int ky = isect_precalc->ky;
 	const int kz = isect_precalc->kz;
@@ -154,6 +228,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	 * the hit distance.
 	 */
 	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+#endif
 
 	/* Perform "near clipping". */
 	const float abs_T = xor_signmast(T, sign_mask);
@@ -190,9 +265,15 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	 * it's quite tricky.
 	 */
 	if(UNLIKELY(abs_det > 100000.0f && t < 1e-3f)) {
+#ifdef __KERNEL_AVX__
+		const ssef Ng = cross(A - B, C - A);
+		const float pleucker_den = dot(Ng, load4f(dir));
+		const float pleucker_T = dot(A, Ng);
+#else
 		const float3 Ng = cross(A - B, C - A);
 		const float pleucker_den = dot(Ng, dir);
 		const float pleucker_T = dot(A, Ng);
+#endif
 		if(UNLIKELY(pleucker_T * pleucker_den < 0.0f)) {
 			return false;
 		}
@@ -229,6 +310,65 @@ ccl_device_inline void triangle_intersect_subsurface(
         uint *lcg_state,
         int max_hits)
 {
+#ifdef __KERNEL_AVX__
+	/* Calculate vertices relative to ray origin. */
+	const sse3f tri(kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 0),
+	                kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 1),
+	                kernel_tex_fetch_ssef(__tri_woop, triAddr*TRI_NODE_SIZE + 2));
+
+	const ssef vP = load4f(P);
+
+	const ssef A = tri.x - vP;
+	const ssef B = tri.y - vP;
+	const ssef C = tri.z - vP;
+
+	const ssei vk = isect_precalc->k;
+	const ssef A_k = shuffle(A, vk);
+	const ssef B_k = shuffle(B, vk);
+	const ssef C_k = shuffle(C, vk);
+
+	/* Perform shear and scale of vertices. */
+	const avxf ABC_kxy(shuffle<0, 1, 0, 1>(A_k, B_k), C_k);    /* Pack A_kx, A_ky, B_kx, B_ky, C_kx, C_ky, _, _. */
+	const avxf Sxy(shuffle<0, 1, 0, 1>(isect_precalc->S));     /* Pack Sx, Sy, Sx, Sy, Sx, Sy, _, _. */
+	const avxf ABC_kz(shuffle<2>(A_k, B_k), shuffle<2>(C_k));  /* Pack A_kz, A_kz, B_kz, B_kz, C_kz, C_kz, _, _. */
+	const avxf ABC_xy = nmadd(Sxy, ABC_kz, ABC_kxy);           /* Pack Ax, Ay, Bx, By, Cx, Cy. */
+
+	/* Calculate scaled barycentric coordinates. */
+	/* Pack Cy, Cx, Ay, Ax, By, Bx, _, _. */
+#ifdef __KERNEL_AVX2__
+	const avxf CAB_yx = shuffle<5, 4, 1, 0, 3, 2, 3, 2>(ABC_xy);
+#else
+	const avxf CAB_yx = shuffle<0, 2>(shuffle<1, 0, 1, 0>(shuffle<1, 0>(ABC_xy), ABC_xy), shuffle<3, 2, 3, 2>(ABC_xy));
+#endif
+
+	/* Get packed result in V, W, _, _, U, _, _, _. */
+	const avxf VWU = hsub(ABC_xy * CAB_yx);
+	const ssef VWU1 = extract<0>(VWU), VWU2 = extract<1>(VWU);
+	const ssef vU = shuffle<0>(VWU2);
+	const ssef vV = shuffle<0>(VWU1);
+	const ssef vW = shuffle<1>(VWU1);
+
+	if(movemask((vU ^ vV) | (vU ^ vW))) {
+		return;
+	}
+
+	/* Calculate determinant. */
+	float det = extract<0>(vU + vV + vW);
+	if(UNLIKELY(det == 0.0f)) {
+		return;
+	}
+
+	const ssef UVW = shuffle<0, 2, 0, 2>(shuffle<0, 1, 0, 1>(vU, vV), vW);
+	const float U = extract<0>(vU), V = extract<0>(vV);
+
+	/* Calculate scaled z-coordinates of vertices and use them to calculate
+	 * the hit distance.
+	 */
+	const ssef ABC_k_z = shuffle<0, 2, 2, 2>(shuffle<2>(A_k, B_k), C_k);
+	const float T = extract<2>(isect_precalc->S) * dot3(UVW, ABC_k_z);
+
+	int sign_mask = (__float_as_int(U) & 0x80000000);
+#else
 	const int kx = isect_precalc->kx;
 	const int ky = isect_precalc->ky;
 	const int kz = isect_precalc->kz;
@@ -283,6 +423,7 @@ ccl_device_inline void triangle_intersect_subsurface(
 	const float Bz = Sz * B_kz;
 	const float Cz = Sz * C_kz;
 	const float T = U * Az + V * Bz + W * Cz;
+#endif
 
 	if ((xor_signmast(T, sign_mask) < 0.0f) ||
 	    (xor_signmast(T, sign_mask) > tmax * xor_signmast(det, sign_mask)))
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 0acb9e9..e0a3a63 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -37,6 +37,8 @@ set(SRC_HEADERS
 	util_aligned_malloc.h
 	util_args.h
 	util_atomic.h
+	util_avxb.h
+	util_avxf.h
 	util_boundbox.h
 	util_cache.h
 	util_debug.h
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
new file mode 100644
index 0000000..eb80e08
--- /dev/null
+++ b/intern/cycles/util/util_avxb.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ * Modifications Copyright 2015, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+
+/*! 8-wide AVX bool type. */
+struct avxb
+{
+	typedef avxb Mask;  // mask type for us
+	enum { size = 8 };  // number of SIMD elements
+	union {             // data
+		__m256 m256;
+		struct { __m128 l,h; };
+		int32_t v[8];
+	};
+
+	////////////////////////////////////////////////////////////////////////////////
+	/// Constructors, Assignment & Cast Operators
+	////////////////////////////////////////////////////////////////////////////////
+
+	__forceinline avxb () {}
+	__forceinline avxb ( const avxb& a ) { m256 = a.m256; }
+	__forceinline avxb& operator=( const avxb& a ) { m256 = a.m256; return *this; }
+
+	__forceinline avxb( const __m256 a ) : m256(a) {}
+	__forceinline operator const __m256&( void ) const { return m256; }
+	__forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
+	__forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
+
+	__forceinline avxb ( const sseb& a ) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+	__forceinline avxb ( const sseb& a, const sseb& b) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+	__forceinline avxb ( const __m128 a, const __m128 b) : l(a), h(b) {}
+
+	__forceinline avxb ( bool a ) : m256(avxb(sseb(a), sseb(a))) {}
+	__forceinline avxb ( bool a, bool b) : m256(avxb(sseb(a), sseb(b))) {}
+	__forceinline avxb ( bool a, bool b, bool c, bool d) : m256(avxb(sseb(a,b), sseb(c,d))) {}
+	__forceinline avxb ( bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h ) : m256(avxb(sseb(a,b,c,d), sseb(e,f,g,h))) {}
+
+	////////////////////////////////////////////////////////////////////////////////
+	/// Constants
+	////////////////////////////////////////////////////////////////////////////////
+
+	__forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
+	__forceinline avxb( TrueTy  ) : m256(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {}
+
+	////////////////////////////////////////////////////////////////////////////////
+	/// Array Access
+	////////////////////////////////////////////////////////////////////////////////
+
+	__forceinline bool     operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
+	__forceinline int32_t& operator []( const size_t i )       { assert(i < 8); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
+__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
+__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+
+__forceinline avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
+__forceinline avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
+__forceinline avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_xor_ps(_mm256_xor_ps(a,b),avxb(True)); }
+
+__forceinline const avxb select( const avxb& mask, const avxb& t, const avxb& f ) {
+	return _mm256_blendv_ps(f, t, mask);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a.m256, b.m256); }
+__forceinline avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a.m256, b.m256); }
+
+template<size_t i> __forceinline const avxb shuffle( const avxb& a ) {
+	return _mm256_permute_ps(a, _MM_SHUFFLE(i, i, i, i));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxb shuffle( const avxb& a ) {
+	return _mm256_permute2f128_ps(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxb shuffle( const avxb& a, const avxb& b) {
+	return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxb shuffle( const avxb& a ) {
+	return _mm256_permute_ps(a, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxb shuffle( const avxb& a, const avxb& b ) {
+	return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const avxb shuffle<0, 0, 2, 2>( const avxb& b ) { return _mm256_moveldup_ps(b); }
+template<> __forceinline const avxb shuffle<1, 1, 3, 3>( const avxb& b ) { return _mm256_movehdup_ps(b); }
+template<> __forceinline const avxb shuffle<0, 1, 0, 1>( const avxb& b ) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(b))); }
+
+template<size_t i> __forceinline const avxb insert (const avxb& a, const sseb& b) { return _mm256_insertf128_ps (a,b,i); }
+template<size_t i> __forceinline const sseb extract(const avxb& a               ) { return _mm256_extractf128_ps(a  ,i); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return !_mm256_testz_ps(a,a); }
+__forceinline bool all       ( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool none      ( const avxb& a ) { return _mm256_testz_ps(a,a) != 0; }
+__forceinline bool any       ( const avxb& a ) { return !_mm256_testz_ps(a,a); }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+	printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
new file mode 100644
index 0000000..94e5eca
--- /dev/null
+++ b/intern/cycles/util/util_avxf.h
@@ -0,0 +1,314 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ * Modifications Copyright 2015, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+
+/*! 8-wide AVX float type. */
+struct avxf
+{
+	typedef avxb Mask;                   // mask type for us
+	enum { size = 8 };                   // number of SIMD elements
+	union { __m256 m256; float v[8]; };  // data
+
+	////////////////////////////////////////////////////////////////////////////////
+	/// Constructors, Assignment & Cast Operators
+	////////////////////////////////////////////////////////////////////////////////
+
+	__forceinline avxf ( ) {}
+	__forceinline avxf ( const avxf& other ) { m256 = other.m256; }
+	__forceinline avxf& operator=( const avxf& other ) { m256 = other.m256; return *this; }
+
+	__forceinline avxf( const __m256 a ) : m256(a) {}
+	__forceinline operator const __m256&( void ) const { return m256; }
+	__forceinline operator       __m256&( void )       { return m256; }
+
+	__forceinline explicit avxf( const ssef& a ) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
+	__forceinline avxf( const ssef& a, const ssef& b ) : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+
+	static __forceinline avxf load( const void* const ptr ) { return *(__m256*)ptr; }
+
+	__forceinline explicit avxf( const char* const a ) : m256(_mm256_loadu_ps((const float*)a)) {}
+	__forceinline avxf( const float& a ) : m256(_mm256_broadcast_ss(&a)) {}
+	__forceinline avxf( float a, float b) : m256(_mm256_set_ps(b, a, b, a, b, a, b, a)) {}
+	__forceinline avxf( float a, float b, float c, float d ) : m256(_mm256_set_ps(d, c, b, a, d, c, b, a)) {}
+	__forceinline avxf( float a, float b, float c, float d, float e, float f, float g, float h ) : m256(_mm256_set_ps(h, g, f, e, d, c, b, a)) {}
+
+	__forceinline explicit avxf( const __m256i a ) : m256(_mm256_cvtepi32_ps(a)) {}
+
+	////////////////////////////////////////////////////////////////////////////////
+	/// Array Access
+	////////////////////////////////////////////////////////////////////////////////
+
+	__forceinline const float& operator []( const size_t i ) const { assert(i < 8); return v[i]; }
+	__forceinline       float& operator []( const size_t i )       { assert(i < 8); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +( const avxf& a ) { return a; }
+__forceinline const avxf operator -( const avxf& a ) {
+	const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+	return _mm256_xor_ps(a.m256, mask);
+}
+__forceinline const avxf abs ( const avxf& a ) {
+	const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
+	return _mm256_and_ps(a.m256, mask);
+}
+__forceinline const avxf sign    ( const avxf& a ) { return _mm256_blendv_ps(avxf(1.0f), -avxf(1.0f), _mm256_cmp_ps(a, avxf(0.0f), _CMP_NGE_UQ)); }
+__forceinline const avxf signmsk ( const avxf& a ) { return _mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); }
+
+/* Hardware estimate refined with one Newton-Raphson iteration. */
+__forceinline const avxf rcp  ( const avxf& a ) {
+	const avxf r = _mm256_rcp_ps(a.m256);
+	return _mm256_sub_ps(_mm256_add_ps(r, r), _mm256_mul_ps(_mm256_mul_ps(r, r), a));
+}
+__forceinline const avxf sqr  ( const avxf& a ) { return _mm256_mul_ps(a,a); }
+__forceinline const avxf sqrt ( const avxf& a ) { return _mm256_sqrt_ps(a.m256); }
+/* Hardware estimate refined with one Newton-Raphson iteration. */
+__forceinline const avxf rsqrt( const avxf& a ) {
+	const avxf r = _mm256_rsqrt_ps(a.m256);
+	return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +( const avxf& a, const avxf& b ) { return _mm256_add_ps(a.m256, b.m256); }
+__forceinline const avxf operator +( const avxf& a, const float b ) { return a + avxf(b); }
+__forceinline const avxf operator +( const float a, const avxf& b ) { return avxf(a) + b; }
+
+__forceinline const avxf operator -( const avxf& a, const avxf& b ) { return _mm256_sub_ps(a.m256, b.m256); }
+__forceinline const avxf operator -( const avxf& a, const float b ) { return a - avxf(b); }
+__forceinline const avxf operator -( const float a, const avxf& b ) { return avxf(a) - b; }
+
+__forceinline const avxf operator *( const avxf& a, const avxf& b ) { return _mm256_mul_ps(a.m256, b.m256); }
+__forceinline const avxf operator *( const avxf& a, const float b ) { return a * avxf(b); }
+__forceinline const avxf operator *( const float a, const avxf& b ) { return avxf(a) * b; }
+
+/* Division goes through the refined reciprocal estimate. */
+__forceinline const avxf operator /( const avxf& a, const avxf& b ) { return a * rcp(b); }
+//__forceinline const avxf operator /( const avxf& a, const float b ) { return a * rcp(b); }
+__forceinline const avxf operator /( const float a, const avxf& b ) { return a * rcp(b); }
+
+__forceinline const avxf operator^( const avxf& a, const avxf& b ) { return _mm256_xor_ps(a.m256,b.m256); }
+
+__forceinline const avxf min( const avxf& a, const avxf& b ) { return _mm256_min_ps(a.m256, b.m256); }
+__forceinline const avxf min( const avxf& a, const float b ) { return _mm256_min_ps(a.m256, avxf(b)); }
+__forceinline const avxf min( const float a, const avxf& b ) { return _mm256_min_ps(avxf(a), b.m256); }
+
+__forceinline const avxf max( const avxf& a, const avxf& b ) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf max( const avxf& a, const float b ) { return _mm256_max_ps(a.m256, avxf(b)); }
+__forceinline const avxf max( const float a, const avxf& b ) { return _mm256_max_ps(avxf(a), b.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxf madd  ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fmadd_ps(a,b,c); }
+__forceinline const avxf msub  ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fmsub_ps(a,b,c); }
+__forceinline const avxf nmadd ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fnmadd_ps(a,b,c); }
+__forceinline const avxf nmsub ( const avxf& a, const avxf& b, const avxf& c) { return _mm256_fnmsub_ps(a,b,c); }
+#else
+__forceinline const avxf madd  ( const avxf& a, const avxf& b, const avxf& c) { return a*b+c; }
+__forceinline const avxf msub  ( const avxf& a, const avxf& b, const avxf& c) { return a*b-c; }
+__forceinline const avxf nmadd ( const avxf& a, const avxf& b, const avxf& c) { return c-a*b; }
+__forceinline const avxf nmsub ( const avxf& a, const avxf& b, const avxf& c) { return -a*b-c; }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf& operator +=( avxf& a, const avxf& b ) { return a = a + b; }
+__forceinline avxf& operator +=( avxf& a, const float b ) { return a = a + b; }
+
+__forceinline avxf& operator -=( avxf& a, const avxf& b ) { return a = a - b; }
+__forceinline avxf& operator -=( avxf& a, const float b ) { return a = a - b; }
+
+__forceinline avxf& operator *=( avxf& a, const avxf& b ) { return a = a * b; }
+__forceinline avxf& operator *=( avxf& a, const float b ) { return a = a * b; }
+
+__forceinline avxf& operator /=( avxf& a, const avxf& b ) { return a = a / b; }
+__forceinline avxf& operator /=( avxf& a, const float b ) { return a = a / b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator ==( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_EQ_UQ ); }
+__forceinline const avxb operator ==( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_EQ_UQ ); }
+__forceinline const avxb operator ==( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_EQ_UQ ); }
+
+__forceinline const avxb operator !=( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NEQ_UQ); }
+__forceinline const avxb operator !=( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NEQ_UQ); }
+__forceinline const avxb operator !=( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NEQ_UQ); }
+
+__forceinline const avxb operator < ( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NGE_UQ ); }
+__forceinline const avxb operator < ( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NGE_UQ ); }
+__forceinline const avxb operator < ( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NGE_UQ ); }
+
+__forceinline const avxb operator >=( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NLT_UQ); }
+__forceinline const avxb operator >=( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NLT_UQ); }
+__forceinline const avxb operator >=( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NLT_UQ); }
+
+__forceinline const avxb operator > ( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NLE_UQ); }
+__forceinline const avxb operator > ( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NLE_UQ); }
+__forceinline const avxb operator > ( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NLE_UQ); }
+
+__forceinline const avxb operator <=( const avxf& a, const avxf& b ) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_NGT_UQ ); }
+__forceinline const avxb operator <=( const avxf& a, const float b ) { return _mm256_cmp_ps(a.m256, avxf(b), _CMP_NGT_UQ ); }
+__forceinline const avxb operator <=( const float a, const avxf& b ) { return _mm256_cmp_ps(avxf(a), b.m256, _CMP_NGT_UQ ); }
+
+__forceinline const avxf select( const avxb& mask, const avxf& t, const avxf& f ) {
+	return _mm256_blendv_ps(f, t, mask);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Rounding Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf round_even( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+__forceinline const avxf round_down( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+__forceinline const avxf round_up  ( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+__forceinline const avxf round_zero( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+__forceinline const avxf floor     ( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+__forceinline const avxf ceil      ( const avxf& a ) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf unpacklo( const avxf& a, const avxf& b ) { return _mm256_unpacklo_ps(a.m256, b.m256); }
+__forceinline avxf unpackhi( const avxf& a, const avxf& b ) { return _mm256_unpackhi_ps(a.m256, b.m256); }
+
+template<size_t i> __forceinline const avxf shuffle( const avxf& a ) {
+	return _mm256_permute_ps(a, _MM_SHUFFLE(i, i, i, i));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxf shuffle( const avxf& a ) {
+	return _mm256_permute2f128_ps(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxf shuffle( const avxf& a, const avxf& b) {
+	return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle( const avxf& a ) {
+	return _mm256_permute_ps(a, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle( const avxf& a, const avxf& b ) {
+	return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const avxf shuffle<0, 0, 2, 2>( const avxf& a ) { return _mm256_moveldup_ps(a); }
+template<> __forceinline const avxf shuffle<1, 1, 3, 3>( const avxf& a ) { return _mm256_movehdup_ps(a); }
+
+#ifdef __KERNEL_AVX2__
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+__forceinline const avxf shuffle( const avxf& a ) {
+	return _mm256_permutevar8x32_ps(a, _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7));
+}
+#endif
+
+__forceinline const avxf broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
+
+template<size_t i> __forceinline const avxf insert (const avxf& a, const ssef& b) { return _mm256_insertf128_ps (a,b,i); }
+template<size_t i> __forceinline const ssef extract   (const avxf& a            ) { return _mm256_extractf128_ps(a  ,i); }
+template<>         __forceinline const ssef extract<0>(const avxf& a            ) { return _mm256_castps256_ps128(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Transpose
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline void transpose4(const avxf& r0, const avxf& r1, const avxf& r2, const avxf& r3, avxf& c0, avxf& c1, avxf& c2, avxf& c3)
+{
+	avxf l02 = unpacklo(r0,r2);
+	avxf h02 = unpackhi(r0,r2);
+	avxf l13 = unpacklo(r1,r3);
+	avxf h13 = unpackhi(r1,r3);
+	c0 = unpacklo(l02,l13);
+	c1 = unpackhi(l02,l13);
+	c2 = unpacklo(h02,h13);
+	c3 = unpackhi(h02,h13);
+}
+
+__forceinline void transpose(const avxf& r0, const avxf& r1, const avxf& r2, const avxf& r3, const avxf& r4, const avxf& r5, const avxf& r6, const avxf& r7,
+                             avxf& c0, avxf& c1, avxf& c2, avxf& c3, avxf& c4, avxf& c5, avxf& c6, avxf& c7)
+{
+	avxf h0,h1,h2,h3; transpose4(r0,r1,r2,r3,h0,h1,h2,h3);
+	avxf h4,h5,h6,h7; transpose4(r4,r5,r6,r7,h4,h5,h6,h7);
+	c0 = shuffle<0,2>(h0,h4);
+	c1 = shuffle<0,2>(h1,h5);
+	c2 = shuffle<0,2>(h2,h6);
+	c3 = shuffle<0,2>(h3,h7);
+	c4 = shuffle<1,3>(h0,h4);
+	c5 = shuffle<1,3>(h1,h5);
+	c6 = shuffle<1,3>(h2,h6);
+	c7 = shuffle<1,3>(h3,h7);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf vreduce_min2(const avxf& v) { return min(v,shuffle<1,0,3,2>(v)); }
+__forceinline const avxf vreduce_min4(const avxf& v) { avxf v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+__forceinline const avxf vreduce_min (const avxf& v) { avxf v1 = vreduce_min4(v); return min(v1,shuffle<1,0>(v1)); }
+
+__forceinline const avxf vreduce_max2(const avxf& v) { return max(v,shuffle<1,0,3,2>(v)); }
+__forceinline const avxf vreduce_max4(const avxf& v) { avxf v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+__forceinline const avxf vreduce_max (const avxf& v) { avxf v1 = vreduce_max4(v); return max(v1,shuffle<1,0>(v1)); }
+
+__forceinline const avxf vreduce_add2(const avxf& v) { return v + shuffle<1,0,3,2>(v); }
+__forceinline const avxf vreduce_add4(const avxf& v) { avxf v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+__forceinline const avxf vreduce_add (const avxf& v) { avxf v1 = vreduce_add4(v); return v1 + shuffle<1,0>(v1); }
+
+__forceinline const avxf hsub(const avxf& v) { return _mm256_hsub_ps(v, v); }
+
+__forceinline float reduce_min(const avxf& v) { return _mm_cvtss_f32(extract<0>(vreduce_min(v))); }
+__forceinline float reduce_max(const avxf& v) { return _mm_cvtss_f32(extract<0>(vreduce_max(v))); }
+__forceinline float reduce_add(const avxf& v) { return _mm_cvtss_f32(extract<0>(vreduce_add(v))); }
+
+__forceinline size_t select_min(const avxf& v) { return __bsf(movemask(v == vreduce_min(v))); }
+__forceinline size_t select_max(const avxf& v) { return __bsf(movemask(v == vreduce_max(v))); }
+
+__forceinline size_t select_min(const avxb& valid, const avxf& v) { const avxf a = select(valid,v,avxf(pos_inf)); return __bsf(movemask(valid & (a == vreduce_min(a)))); }
+__forceinline size_t select_max(const avxb& valid, const avxf& v) { const avxf a = select(valid,v,avxf(neg_inf)); return __bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxf(const char *label, const avxf &a)
+{
+	printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n", label,
+	       (double)a[0], (double)a[1], (double)a[2], (double)a[3],
+	       (double)a[4], (double)a[5], (double)a[6], (double)a[7]);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 625f26c..665c55f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -434,6 +434,8 @@ CCL_NAMESPACE_END
 #include "util_sseb.h"
 #include "util_ssei.h"
 #include "util_ssef.h"
+#include "util_avxb.h"
+#include "util_avxf.h"
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
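
Note on the packed barycentric setup (reviewer sketch, not part of the patch): after the shear, ABC_xy * CAB_yx forms the six products (Ax*Cy, Ay*Cx, Bx*Ay, By*Ax, Cx*By, Cy*Bx, _, _) in a single multiply, and one hsub() collapses them into (V, W, _, _, U, _, _, _). A scalar equivalent of that step, with the hypothetical helper name edge_signs_match:

#include <cmath>

/* Scalar reference for the AVX path above:
 *   U = Cx*By - Cy*Bx,  V = Ax*Cy - Ay*Cx,  W = Bx*Ay - By*Ax
 * are the three scaled barycentric coordinates of the sheared triangle. */
static inline bool edge_signs_match(float Ax, float Ay, float Bx, float By,
                                    float Cx, float Cy)
{
	const float U = Cx * By - Cy * Bx;
	const float V = Ax * Cy - Ay * Cx;
	const float W = Bx * Ay - By * Ax;
	/* The ray misses when U, V and W disagree in sign; the vectorized code
	 * tests exactly this with movemask((vU ^ vV) | (vU ^ vW)). */
	return (std::signbit(U) == std::signbit(V)) &&
	       (std::signbit(U) == std::signbit(W));
}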
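
The new avxf/avxb operators compose in the usual Embree style. A hypothetical usage sketch (nearest_hit is illustrative only, not part of the patch), assuming an AVX-enabled translation unit that includes util_simd.h:

#include "util_simd.h"  /* now also pulls in util_avxb.h and util_avxf.h */

CCL_NAMESPACE_BEGIN

/* Return the lane index of the smallest distance in (0, tmax), or -1. */
ccl_device_inline int nearest_hit(const avxf& t, float tmax)
{
	const avxb valid = (t > 0.0f) & (t < tmax);
	if(none(valid))
		return -1;
	return (int)select_min(valid, t);  /* masked reduction from util_avxf.h */
}

CCL_NAMESPACE_END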