1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
/* Copyright 2012 Dietrich Epp <depp@zdome.net> */
#pragma once
#include "fresample.h"
/*
  LFR_UNREACHABLE marks a point that control flow can never reach.
  It expands to __builtin_unreachable() on compilers known to provide
  it (Clang when __has_builtin reports it, GCC 4.5 and later) and to a
  harmless no-op expression everywhere else.
*/
#define LFR_UNREACHABLE ((void) 0)
#if defined(__clang__)
# if __has_builtin(__builtin_unreachable)
#  undef LFR_UNREACHABLE
#  define LFR_UNREACHABLE __builtin_unreachable()
# endif
#elif defined(__GNUC__)
/* The minor version macro is __GNUC_MINOR__; a misspelled name would
   silently evaluate to 0 here and disable the builtin on GCC 4.x. */
# if (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) || __GNUC__ > 4
#  undef LFR_UNREACHABLE
#  define LFR_UNREACHABLE __builtin_unreachable()
# endif
#endif
/* Qualifier for the helper functions defined in this header: internal
   linkage (static) plus an inline hint. */
#define INLINE static inline
/*
Constants used by the dithering algorithm. We use a simple linear
congruential generator to generate a uniform signal for dithering,
taking the high order bits:
x_{n+1} = (A * x_n + C) mod 2^32
The derived constants, AN/CN, are used for stepping the LCG forward
by N steps. AI is the inverse of A modulo 2^32; AI/CI step the LCG
backward by one step.
*/
/* Multipliers: A for one step, A2 = A^2 and A4 = A^4 (mod 2^32) for
   two and four steps at once. */
constexpr auto LCG_A = 1103515245u;
constexpr auto LCG_A2 = 3265436265u;
constexpr auto LCG_A4 = 3993403153u;
/* Increments matching the multipliers above: C for one step,
   C2 = C * (A + 1) (mod 2^32) for two steps, and C4 for four steps
   (presumably C2 * (A2 + 1) mod 2^32, by the same construction). */
constexpr auto LCG_C = 12345u;
constexpr auto LCG_C2 = 3554416254u;
constexpr auto LCG_C4 = 3596950572u;
/* Reverse step: AI is the multiplicative inverse of A modulo 2^32 and
   CI = -(C * AI) (mod 2^32), so that x_n = AI * x_{n+1} + CI. */
constexpr auto LCG_AI = 4005161829u;
constexpr auto LCG_CI = 4235699843u;
/* ====================
Utility functions
==================== */
#if defined(LFR_SSE2) && defined(LFR_CPU_X86)
#include <emmintrin.h>
/*
  Partial store of a vector of eight 16-bit words.

  Copies the words of x whose index lies in the half-open range
  [i0,i1) to the same word positions at dest.  Words of the
  destination outside that range are not written.  No range checking
  is done on i0 or i1.
*/
INLINE void lfr_storepartial_epi16(__m128i *dest, __m128i x, int i0, int i1)
{
    /* View the vector as eight separate 16-bit words. */
    union {
        __m128i vec;
        unsigned short word[8];
    } view;
    unsigned short *out = (unsigned short *) dest;
    int k = i0;

    view.vec = x;
    while (k < i1) {
        out[k] = view.word[k];
        ++k;
    }
}
/*
  Step four 32-bit linear congruential generators at once, producing
  x * a + c (mod 2^32) in each 32-bit lane.

  The generators are expected to share the same A and C constants.
  The multiply depends on this: lanes 1 and 3 of x are multiplied by
  lanes 0 and 2 of a.

  A lane-wise 32-bit multiply requires SSE 4.1, so the products are
  built from two 32 to 64 bit multiply operations instead.
*/
INLINE __m128i
lfr_rand_epu32(__m128i x, __m128i a, __m128i c)
{
    /* 64-bit products of lanes 0 and 2 of x. */
    __m128i even = _mm_mul_epu32(x, a);
    /* Shift x down by one lane so lanes 1 and 3 get multiplied. */
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4), a);
    /* Gather the low 32 bits of both products into lanes 0 and 1. */
    __m128i even_lo = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
    __m128i odd_lo = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));
    /* Interleave to restore lane order 0, 1, 2, 3. */
    __m128i product = _mm_unpacklo_epi32(even_lo, odd_lo);

    return _mm_add_epi32(product, c);
}
#endif
#if defined(LFR_ALTIVEC) && defined(LFR_CPU_PPC)
#if !defined(__APPLE_ALTIVEC__)
#include <altivec.h>
#endif
/*
  Step four 32-bit linear congruential generators at once, producing
  x * a + c in each 32-bit element.  The generators should use the
  same A and C constants.

  There is no 32-bit vector multiply available here, so the product
  is assembled from 16-bit multiplies: one multiply of matching
  halfwords, plus the sum of the two cross products shifted left by
  sixteen bits.
*/
INLINE vector unsigned int
lfr_vecrand(vector unsigned int x, vector unsigned int a,
            vector unsigned int c)
{
    /* Used as both rotate and shift count.  NOTE(review): presumably
       -16 is used because the splat immediate cannot encode 16 and
       only the low five bits of the count matter -- confirm against
       the AltiVec manual. */
    vector unsigned int count = vec_splat_u32(-16);
    vector unsigned short x16 = (vector unsigned short) x;
    vector unsigned short a16 = (vector unsigned short) a;
    /* Rotating a by the count swaps the halfwords of each element. */
    vector unsigned short a16_swapped =
        (vector unsigned short) vec_rl(a, count);

    /* Product of one halfword of x with the matching halfword of a. */
    vector unsigned int straight = vec_mulo(x16, a16);
    /* Sum of the two cross products (x halfword times the opposite
       halfword of a), accumulated onto zero. */
    vector unsigned int cross = vec_msum(x16, a16_swapped, vec_splat_u32(0));

    return vec_add(vec_add(straight, c), vec_sl(cross, count));
}
#endif
|