aboutsummaryrefslogtreecommitdiff
path: root/libopna/opnassg-sinc-sse2.c
blob: ee75515d1feb9d9d474dd7ac53e00a8e66de110b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#include "libopna/opnassg.h"
#include <emmintrin.h>

void opna_ssg_sinc_calc_sse2(unsigned resampler_index, const int16_t *inbuf, int32_t *outbuf) {
  inbuf += (resampler_index >> 1) * 4;
  const int16_t *sinctable = opna_ssg_sinctable;
  if (!(resampler_index & 1u)) sinctable += OPNA_SSG_SINCTABLELEN;
  __m128i outacc[3];
  for (int c = 0; c < 3; c++) {
    outacc[c] = _mm_setzero_si128();
  }
  for (int j = 0; j < 16; j++) {
    // 8 samples per loop
    __m128i in4[4];
    for (int i = 0; i < 4; i++) {
      in4[i] = _mm_loadu_si128((__m128i *)&inbuf[(j*4+i)*8]);
    }
    // in4: x 2 1 0 x 2 1 0
    for (int i = 0; i < 4; i++) {
      in4[i] = _mm_shuffle_epi32(in4[i], 0xd8);
    }
    // in4: x 2 x 2 1 0 1 0
    for (int i = 0; i < 4; i++) {
      in4[i] = _mm_shufflehi_epi16(in4[i], 0xd8);
    }
    for (int i = 0; i < 4; i++) {
      in4[i] = _mm_shufflelo_epi16(in4[i], 0xd8);
    }
    // in4: x x 2 2 1 1 0 0
    __m128i inl[2], inh[2];
    for (int i = 0; i < 2; i++) {
      inl[i] = _mm_unpacklo_epi32(in4[i*2+0], in4[i*2+1]);
      inh[i] = _mm_unpackhi_epi32(in4[i*2+0], in4[i*2+1]);
    }
    // inl: 1 1 1 1 0 0 0 0
    // inh: x x x x 2 2 2 2
    __m128i in[3];
    in[0] = _mm_unpacklo_epi64(inl[0], inl[1]);
    in[1] = _mm_unpackhi_epi64(inl[0], inl[1]);
    in[2] = _mm_unpacklo_epi64(inh[0], inh[1]);
    __m128i sinc = _mm_loadu_si128((__m128i *)&sinctable[j*8]);
    __m128i out[3];
    for (int c = 0; c < 3; c++) {
      out[c] = _mm_madd_epi16(in[c], sinc);
    }
    for (int c = 0; c < 3; c++) {
      outacc[c] = _mm_add_epi32(outacc[c], out[c]);
    }
  }
  for (int c = 0; c < 3; c++) {
    outacc[c] = _mm_add_epi32(outacc[c], _mm_shuffle_epi32(outacc[c], 0x4e));
    outacc[c] = _mm_add_epi32(outacc[c], _mm_shuffle_epi32(outacc[c], 0xb1));
    outbuf[c] = ((uint32_t)_mm_extract_epi16(outacc[c], 0)) | (((uint32_t)_mm_extract_epi16(outacc[c], 1))<<16);
  }
}