/* lib/defs.h */

/* Copyright 2012 Dietrich Epp <depp@zdome.net> */
#pragma once

#include "fresample.h"

/*
  LFR_UNREACHABLE marks a point that control flow never reaches.

  It expands to __builtin_unreachable() when the compiler is Clang and
  reports the builtin, or when the compiler is GCC 4.5 or later.  On
  every other compiler it expands to a no-op expression.
*/
#define LFR_UNREACHABLE (void) 0

#if defined(__clang__)
# if __has_builtin(__builtin_unreachable)
#  undef LFR_UNREACHABLE
#  define LFR_UNREACHABLE __builtin_unreachable()
# endif
#elif defined(__GNUC__)
/* The minor version macro is __GNUC_MINOR__; a misspelled name would
   evaluate to 0 inside #if and hide the builtin from GCC 4.5 - 4.9. */
# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
#  undef LFR_UNREACHABLE
#  define LFR_UNREACHABLE __builtin_unreachable()
# endif
#endif

/* Specifiers applied to the helper functions defined in this header. */
#define INLINE static inline

/*
  Constants used by the dithering algorithm.  We use a simple linear
  congruential generator to generate a uniform signal for dithering,
  taking the high order bits:

  x_{n+1} = (A * x_n + C) mod 2^32

  The derived constants, AN/CN, are used for stepping the LCG forward
  by N steps.  AI is the inverse of A.
*/

/* Multipliers: A for one step, A2 = A^2 and A4 = A^4 (mod 2^32) for
   stepping two and four steps at once. */
constexpr auto LCG_A  = 1103515245u;
constexpr auto LCG_A2 = 3265436265u;
constexpr auto LCG_A4 = 3993403153u;

/* Increments paired with the multipliers above:
   C2 = C * (A + 1) and C4 = C2 * (A2 + 1), both mod 2^32. */
constexpr auto LCG_C  =      12345u;
constexpr auto LCG_C2 = 3554416254u;
constexpr auto LCG_C4 = 3596950572u;

/* Constants for stepping one step backward:
   x_n = (AI * x_{n+1} + CI) mod 2^32, where A * AI = 1 (mod 2^32)
   and CI = -(AI * C) mod 2^32. */
constexpr auto LCG_AI = 4005161829u;
constexpr auto LCG_CI = 4235699843u;

/* ====================
   Utility functions
   ==================== */

#if defined(LFR_SSE2) && defined(LFR_CPU_X86)
#include <emmintrin.h>

/*
  Write the 16-bit lanes with index in [i0,i1) from x to the same lane
  positions at dest, treating dest as an array of unsigned short.
  Lanes outside that range are not written.  No range check is done on
  i0 or i1.
*/
INLINE void lfr_storepartial_epi16(__m128i *dest, __m128i x, int i0, int i1)
{
    union {
        __m128i vec;
        unsigned short lane[8];
    } src;
    unsigned short *out = (unsigned short *) dest;
    int idx = i0;

    src.vec = x;
    while (idx < i1) {
        out[idx] = src.lane[idx];
        ++idx;
    }
}

/*
  Step four 32-bit linear congruential generators at once: each lane
  of the result is x * a + c, truncated to 32 bits.  All four lanes of
  a (and of c) should hold the same constant, because lanes 1 and 3 of
  x are multiplied by lanes 0 and 2 of a.

  A direct 32-bit lane multiply requires SSE 4.1, so the products are
  built from two 32 to 64 bit multiply operations instead.
*/
INLINE __m128i
lfr_rand_epu32(__m128i x, __m128i a, __m128i c)
{
    /* 64-bit products of lanes 0 and 2 of x. */
    __m128i even = _mm_mul_epu32(x, a);
    /* Shift lanes 1 and 3 of x down to positions 0 and 2 and multiply. */
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4), a);

    /* Gather the low 32 bits of both products into lanes 0 and 1. */
    even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
    odd = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));

    /* Interleave back to lane order 0, 1, 2, 3, then add the increment. */
    return _mm_add_epi32(_mm_unpacklo_epi32(even, odd), c);
}

#endif

#if defined(LFR_ALTIVEC) && defined(LFR_CPU_PPC)
#if !defined(__APPLE_ALTIVEC__)
#include <altivec.h>
#endif

/*
  Step four linear congruential generators at once, combining x, a and
  c as x * a + c in each 32-bit element.  The four generators should
  use the same A and C constants.

  There is no 32-bit vector multiply, so the product is assembled from
  16-bit multiply operations: one direct 16x16 product per element plus
  the sum of the two cross products shifted up by half an element.
*/
INLINE vector unsigned int
lfr_vecrand(vector unsigned int x, vector unsigned int a,
			vector unsigned int c)
{
	/* NOTE(review): vec_splat_u32 takes a 5-bit literal; -16 is
	   presumably used because its low five bits encode a count of
	   16 for vec_rl and vec_sl -- confirm against the AltiVec PIM. */
	vector unsigned int count = vec_splat_u32(-16);
	vector unsigned short xhalves = (vector unsigned short) x;

	/* One 16x16 -> 32 bit product per element. */
	vector unsigned int direct =
		vec_mulo(xhalves, (vector unsigned short) a);

	/* Rotating a pairs each half of x with the opposite half of a;
	   vec_msum adds the two resulting products per element. */
	vector unsigned int cross = vec_msum(
		xhalves,
		(vector unsigned short) vec_rl(a, count),
		vec_splat_u32(0));

	return vec_add(vec_add(direct, c), vec_sl(cross, count));
}

#endif