/* Copyright 2012 Dietrich Epp */ #pragma once #include "fresample.h" #define LFR_UNREACHABLE (void) 0 #if defined(__clang__) # if __has_builtin(__builtin_unreachable) # undef LFR_UNREACHABLE # define LFR_UNREACHABLE __builtin_unreachable() # endif #elif defined(__GNUC__) # if (__GNUC__ >= 4 && __GNU_MINOR__ >= 5) || __GNUC__ > 4 # undef LFR_UNREACHABLE # define LFR_UNREACHABLE __builtin_unreachable() # endif #endif #define INLINE static inline /* Constants used by dithering algorithm. We usa a simple linear congruential generator to generate a uniform signal for dithering, taking the high order bits: x_{n+1} = (A * x_n + C) mod 2^32 The derived constants, AN/CN, are used for stepping the LCG forward by N steps. AI is the inverse of A. */ constexpr auto LCG_A = 1103515245u; constexpr auto LCG_A2 = 3265436265u; constexpr auto LCG_A4 = 3993403153u; constexpr auto LCG_C = 12345u; constexpr auto LCG_C2 = 3554416254u; constexpr auto LCG_C4 = 3596950572u; constexpr auto LCG_AI = 4005161829u; constexpr auto LCG_CI = 4235699843u; /* ==================== Utility functions ==================== */ #if defined(LFR_SSE2) && defined(LFR_CPU_X86) #include /* Store 16-bit words [i0,i1) in the given location. */ INLINE void lfr_storepartial_epi16(__m128i *dest, __m128i x, int i0, int i1) { union { unsigned short h[8]; __m128i x; } u; u.x = x; for (int i = i0; i < i1; ++i) ((unsigned short *) dest)[i] = u.h[i]; } /* Advance four linear congruential generators. The four generators should use the same A and C constants. The 32-bit multiply we want requires SSE 4.1. We construct it out of two 32 to 64 bit multiply operations. */ INLINE __m128i lfr_rand_epu32(__m128i x, __m128i a, __m128i c) { return _mm_add_epi32( _mm_unpacklo_epi32( _mm_shuffle_epi32( _mm_mul_epu32(x, a), _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32( _mm_mul_epu32(_mm_srli_si128(x, 4), a), _MM_SHUFFLE(0, 0, 2, 0))), c); } #endif #if defined(LFR_ALTIVEC) && defined(LFR_CPU_PPC) #if !defined(__APPLE_ALTIVEC__) #include #endif /* Advance four linear congruential generators. The four generators should use the same A and C constants. The 32-bit multiply we want does not exist. We construct it out of 16-bit multiply operations. */ INLINE vector unsigned int lfr_vecrand(vector unsigned int x, vector unsigned int a, vector unsigned int c) { vector unsigned int s = vec_splat_u32(-16); return vec_add( vec_add( vec_mulo( (vector unsigned short) x, (vector unsigned short) a), c), vec_sl( vec_msum( (vector unsigned short) x, (vector unsigned short) vec_rl(a, s), vec_splat_u32(0)), s)); } #endif