diff options
author | Takamichi Horikawa <takamichiho@gmail.com> | 2017-03-30 23:31:39 +0000 |
---|---|---|
committer | Takamichi Horikawa <takamichiho@gmail.com> | 2017-03-30 23:31:39 +0000 |
commit | f47eba7d7d4c6a1d9501e027b63bbab04bb7d417 (patch) | |
tree | c6d8ebf4f84b76e9d32d4998d28a69d36fddd4a9 /libopna | |
parent | a86bb48b9f3acb081afa92e5efc50d0192c4f68c (diff) |
add NEON optimization
Diffstat (limited to 'libopna')
-rw-r--r-- | libopna/opna.c | 5 | ||||
-rw-r--r-- | libopna/opnassg-sinc-c.c | 13 | ||||
-rw-r--r-- | libopna/opnassg-sinc-neon.s | 118 | ||||
-rw-r--r-- | libopna/opnassg.c | 81 | ||||
-rw-r--r-- | libopna/opnassg.h | 13 |
5 files changed, 210 insertions, 20 deletions
diff --git a/libopna/opna.c b/libopna/opna.c index 1b0aa6c..567c913 100644 --- a/libopna/opna.c +++ b/libopna/opna.c @@ -37,8 +37,9 @@ void opna_mix_oscillo(struct opna *opna, int16_t *buf, unsigned samples, struct } } unsigned offset = OSCILLO_SAMPLE_COUNT - samples; - opna_fm_mix(&opna->fm, buf, samples, &oscillo[0], offset); - opna_ssg_mix_55466(&opna->ssg, &opna->resampler, buf, samples, &oscillo[6], offset); + opna_fm_mix(&opna->fm, buf, samples, oscillo ? &oscillo[0] : 0, offset); + opna_ssg_mix_55466(&opna->ssg, &opna->resampler, buf, samples, + oscillo ? &oscillo[6] : 0, offset); opna_drum_mix(&opna->drum, buf, samples); opna_adpcm_mix(&opna->adpcm, buf, samples); } diff --git a/libopna/opnassg-sinc-c.c b/libopna/opnassg-sinc-c.c new file mode 100644 index 0000000..bf93039 --- /dev/null +++ b/libopna/opnassg-sinc-c.c @@ -0,0 +1,13 @@ +#include "libopna/opnassg.h" + +void opna_ssg_sinc_calc_c(unsigned resampler_index, const int16_t *inbuf, int32_t *outbuf) { + for (int c = 0; c < 3; c++) { + int32_t chsample = 0; + for (int j = 0; j < OPNA_SSG_SINCTABLELEN; j++) { + unsigned sincindex = j; + if (!(resampler_index&1)) sincindex += OPNA_SSG_SINCTABLELEN; + chsample += inbuf[(((resampler_index)>>1)+j)*3+c] * opna_ssg_sinctable[sincindex]; + } + outbuf[c] = chsample; + } +} diff --git a/libopna/opnassg-sinc-neon.s b/libopna/opnassg-sinc-neon.s new file mode 100644 index 0000000..19dc3c6 --- /dev/null +++ b/libopna/opnassg-sinc-neon.s @@ -0,0 +1,118 @@ +@ neon register map: +@ 0, 3, 6, 9, 12, 15 ssg1 +@ 1, 4, 7, 10, 13, 16 ssg2 +@ 2, 5, 8, 11, 14, 17 ssg3 +@ 18, 19, 20, 21, 22, 23 sinc +@ 24-25 (q12): ssg1 out +@ 26-27 (q13): ssg2 out +@ 28-29 (q14): ssg3 out + +.global opna_ssg_sinc_calc_neon +@ r0: resampler_index +@ r1: const int16_t *inbuf +@ r2: int32_t *outbuf + +opna_ssg_sinc_calc_neon: + push {r4-r10,lr} +@ sinc table to r3 + movw r3, #:lower16:opna_ssg_sinctable + movt r3, #:upper16:opna_ssg_sinctable + tst r0, #1 + addeq r3, #256 + +@ add offset to ssg input buffer address + bic r0, #1 + add r0, r0, lsl #1 + add r1, r0 + +@ initialize output register + vmov.i64 q12, #0 + vmov.i64 q13, #0 + vmov.i64 q14, #0 + +@ sinc sample length + mov r0, #128 + +.loop: +@ + subs r0, #24 + blo .end + +@ load SSG channel data + vld3.16 {d0-d2}, [r1]! + vld3.16 {d3-d5}, [r1]! + vld3.16 {d6-d8}, [r1]! + vld3.16 {d9-d11}, [r1]! + vld3.16 {d12-d14}, [r1]! + vld3.16 {d15-d17}, [r1]! + +@ load sinc data + vld1.16 {d18-d21}, [r3]! + vld1.16 {d22-d23}, [r3]! + +@ multiply and accumulate + vmlal.s16 q12, d0, d18 + vmlal.s16 q13, d1, d18 + vmlal.s16 q14, d2, d18 + vmlal.s16 q12, d3, d19 + vmlal.s16 q13, d4, d19 + vmlal.s16 q14, d5, d19 + vmlal.s16 q12, d6, d20 + vmlal.s16 q13, d7, d20 + vmlal.s16 q14, d8, d20 + vmlal.s16 q12, d9, d21 + vmlal.s16 q13, d10, d21 + vmlal.s16 q14, d11, d21 + vmlal.s16 q12, d12, d22 + vmlal.s16 q13, d13, d22 + vmlal.s16 q14, d14, d22 + vmlal.s16 q12, d15, d23 + vmlal.s16 q13, d16, d23 + vmlal.s16 q14, d17, d23 + b .loop + +.end: +@ 8 samples left + vld3.16 {d0-d2}, [r1]! + vld3.16 {d3-d5}, [r1] + vld1.16 {d18-d19}, [r3] + + vmlal.s16 q12, d0, d18 + vmlal.s16 q13, d1, d18 + vmlal.s16 q14, d2, d18 + vmlal.s16 q12, d3, d19 + vmlal.s16 q13, d4, d19 + vmlal.s16 q14, d5, d19 + +@ extract data from result SIMD registers + + vmov.32 r0, d24[0] + vmov.32 r1, d24[1] + vmov.32 r3, d25[0] + vmov.32 r12, d25[1] + + vmov.32 r14, d26[0] + vmov.32 r4, d26[1] + vmov.32 r5, d27[0] + vmov.32 r6, d27[1] + + vmov.32 r7, d28[0] + vmov.32 r8, d28[1] + vmov.32 r9, d29[0] + vmov.32 r10, d29[1] + + add r0, r1 + add r3, r12 + + add r14, r4 + add r5, r6 + + add r7, r8 + add r9, r10 + + add r4, r0, r3 + add r5, r14 + add r6, r7, r9 + + stmia r2, {r4-r6} + pop {r4-r10,pc} diff --git a/libopna/opnassg.c b/libopna/opnassg.c index ec03437..4a12f76 100644 --- a/libopna/opnassg.c +++ b/libopna/opnassg.c @@ -1,5 +1,6 @@ #include "opnassg.h" #include "oscillo/oscillo.h" +#include <string.h> /* static const float voltable[32] = { 0.0f, 0.0f, 0x1.ae89f9p-8f, 0x1.000000p-7f, @@ -27,9 +28,6 @@ static const int16_t voltable[32] = { 6494, 7723, 9185, 10922 }; -#define SINCTABLEBIT 7 -#define SINCTABLELEN (1<<SINCTABLEBIT) - // GNU Octave // Fc = 7987200 // Ff = Fc/144 @@ -39,7 +37,8 @@ static const int16_t voltable[32] = { // B = 128 * O / 2 // FILTER=sinc(linspace(-127.5,127.5,256)*2/9/2).*rotdim(kaiser(256,B)) // FILTERI=round(FILTER(1:128).*32768) -static const int16_t sinctable[SINCTABLELEN] = { +#if 0 +const int16_t opna_ssg_sinctable[OPNA_SSG_SINCTABLELEN*2] = { 1, 0, -1, -2, -3, -5, -6, -6, -6, -5, -2, 2, 7, 11, 16, 19, 20, 18, 13, 5, -5, -17, -29, -38, @@ -56,8 +55,61 @@ static const int16_t sinctable[SINCTABLELEN] = { 3306, 3714, 3690, 3185, 2206, 815, -868, -2673, -4391, -5798, -6670, -6809, -6067, -4359, -1681, 1886, 6178, 10957, 15928, 20765, 25133, 28724, 31275, 32600, + 32600, 31275, 28724, 25133, 20765, 15928, 10957, 6178, + 1886, -1681, -4359, -6067, -6809, -6670, -5798, -4391, + -2673, -868, 815, 2206, 3185, 3690, 3714, 3306, + 2557, 1585, 523, -498, -1365, -1994, -2333, -2369, + -2125, -1655, -1032, -343, 328, 902, 1322, 1552, + 1580, 1421, 1108, 692, 230, -220, -607, -889, + -1043, -1062, -954, -744, -464, -154, 147, 405, + 593, 694, 705, 632, 491, 306, 101, -96, + -264, -385, -450, -455, -406, -315, -195, -64, + 61, 166, 241, 280, 282, 251, 193, 119, + 39, -37, -100, -144, -166, -166, -146, -112, + -68, -22, 21, 56, 80, 91, 90, 79, + 60, 36, 12, -11, -29, -40, -45, -44, + -38, -29, -17, -5, 5, 13, 18, 20, + 19, 16, 11, 7, 2, -2, -5, -6, + -6, -6, -5, -3, -2, -1, 0, 1, +}; +#endif +const int16_t opna_ssg_sinctable[OPNA_SSG_SINCTABLELEN*2] = { + 1, -1, -3, -6, -6, -2, 7, 16, + 20, 13, -5, -29, -44, -40, -11, 36, + 79, 91, 56, -22, -112, -166, -144, -37, + 119, 251, 280, 166, -64, -315, -455, -385, + -96, 306, 632, 694, 405, -154, -744, -1062, + -889, -220, 692, 1421, 1552, 902, -343, -1655, + -2369, -1994, -498, 1585, 3306, 3690, 2206, -868, + -4391, -6670, -6067, -1681, 6178, 15928, 25133, 31275, + 32600, 28724, 20765, 10957, 1886, -4359, -6809, -5798, + -2673, 815, 3185, 3714, 2557, 523, -1365, -2333, + -2125, -1032, 328, 1322, 1580, 1108, 230, -607, + -1043, -954, -464, 147, 593, 705, 491, 101, + -264, -450, -406, -195, 61, 241, 282, 193, + 39, -100, -166, -146, -68, 21, 80, 90, + 60, 12, -29, -45, -38, -17, 5, 18, + 19, 11, 2, -5, -6, -5, -2, 0, + 0, -2, -5, -6, -5, 2, 11, 19, + 18, 5, -17, -38, -45, -29, 12, 60, + 90, 80, 21, -68, -146, -166, -100, 39, + 193, 282, 241, 61, -195, -406, -450, -264, + 101, 491, 705, 593, 147, -464, -954, -1043, + -607, 230, 1108, 1580, 1322, 328, -1032, -2125, + -2333, -1365, 523, 2557, 3714, 3185, 815, -2673, + -5798, -6809, -4359, 1886, 10957, 20765, 28724, 32600, + 31275, 25133, 15928, 6178, -1681, -6067, -6670, -4391, + -868, 2206, 3690, 3306, 1585, -498, -1994, -2369, + -1655, -343, 902, 1552, 1421, 692, -220, -889, + -1062, -744, -154, 405, 694, 632, 306, -96, + -385, -455, -315, -64, 166, 280, 251, 119, + -37, -144, -166, -112, -22, 56, 91, 79, + 36, -11, -40, -44, -29, -5, 13, 20, + 16, 7, -2, -6, -6, -3, -1, 1, }; +opna_ssg_sinc_calc_func_type opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_c; + void opna_ssg_reset(struct opna_ssg *ssg) { for (int i = 0; i < 3; i++) { ssg->ch[i].tone_counter = 0; @@ -78,7 +130,7 @@ void opna_ssg_reset(struct opna_ssg *ssg) { } void opna_ssg_resampler_reset(struct opna_ssg_resampler *resampler) { - for (int i = 0; i < SINCTABLELEN; i++) { + for (int i = 0; i < OPNA_SSG_SINCTABLELEN; i++) { resampler->buf[i] = 0; } resampler->index = 0; @@ -215,7 +267,7 @@ void opna_ssg_generate_raw(struct opna_ssg *ssg, int16_t *buf, int samples) { } } -#define BUFINDEX(n) ((((resampler->index)>>1)+n)&(SINCTABLELEN-1)) +#define BUFINDEX(n) ((((resampler->index)>>1)+n)&(OPNA_SSG_SINCTABLELEN-1)) void opna_ssg_mix_55466( struct opna_ssg *ssg, struct opna_ssg_resampler *resampler, @@ -246,18 +298,13 @@ void opna_ssg_mix_55466( resampler->index += 9; } int32_t sample = 0; + resampler->index &= (1u<<(OPNA_SSG_SINCTABLEBIT+1))-1; + memcpy(resampler->buf + OPNA_SSG_SINCTABLELEN*3, resampler->buf, OPNA_SSG_SINCTABLELEN*3*sizeof(*resampler->buf)); + int32_t outbuf[3]; + opna_ssg_sinc_calc_func(resampler->index, resampler->buf, outbuf); for (int ch = 0; ch < 3; ch++) { - int32_t chsample = 0; - for (int j = 0; j < SINCTABLELEN; j++) { - unsigned sincindex = j*2; - if (!(resampler->index&1)) sincindex++; - bool sincsign = sincindex & (1<<(SINCTABLEBIT)); - unsigned sincmask = ((1<<(SINCTABLEBIT))-1); - sincindex = (sincindex & sincmask) ^ (sincsign ? sincmask : 0); - chsample += (resampler->buf[BUFINDEX(j)*3+ch] * sinctable[sincindex])>>2; - } - if (oscillo) oscillo[ch].buf[offset+i] = chsample >> 13; - if (!(ssg->mask & (1<<ch))) sample += chsample; + if (oscillo) oscillo[ch].buf[offset+i] = outbuf[ch] >> 15; + if (!(ssg->mask & (1<<ch))) sample += outbuf[ch] >> 2; } sample >>= 16; sample *= 13000; diff --git a/libopna/opnassg.h b/libopna/opnassg.h index 0321163..231db4d 100644 --- a/libopna/opnassg.h +++ b/libopna/opnassg.h @@ -8,6 +8,9 @@ extern "C" { #endif +#define OPNA_SSG_SINCTABLEBIT 7 +#define OPNA_SSG_SINCTABLELEN (1<<OPNA_SSG_SINCTABLEBIT) + struct opna_ssg_ch { uint16_t tone_counter; bool out; @@ -30,7 +33,7 @@ struct opna_ssg { }; struct opna_ssg_resampler { - int16_t buf[(1<<7)*3]; + int16_t buf[OPNA_SSG_SINCTABLELEN*3 * 2]; unsigned index; }; @@ -58,6 +61,14 @@ unsigned opna_ssg_readreg(const struct opna_ssg *ssg, unsigned reg); int opna_ssg_channel_level(const struct opna_ssg *ssg, int ch); unsigned opna_ssg_tone_period(const struct opna_ssg *ssg, int ch); +typedef void (*opna_ssg_sinc_calc_func_type)(unsigned resampler_index, + const int16_t *inbuf, int32_t *outbuf); +extern opna_ssg_sinc_calc_func_type opna_ssg_sinc_calc_func; +void opna_ssg_sinc_calc_c(unsigned resampler_index, + const int16_t *inbuf, int32_t *outbuf); + +extern const int16_t opna_ssg_sinctable[OPNA_SSG_SINCTABLELEN*2]; + #ifdef __cplusplus } #endif |