From 24225349831278c23c6dfc4515e071f4b27b2c41 Mon Sep 17 00:00:00 2001 From: Takamichi Horikawa Date: Sat, 1 Apr 2017 14:53:59 +0900 Subject: add sse2/ssse3 simd optimization --- fmdsp/fmdsp-vramlookup-ssse3.c | 48 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 fmdsp/fmdsp-vramlookup-ssse3.c (limited to 'fmdsp/fmdsp-vramlookup-ssse3.c') diff --git a/fmdsp/fmdsp-vramlookup-ssse3.c b/fmdsp/fmdsp-vramlookup-ssse3.c new file mode 100644 index 0000000..30e7311 --- /dev/null +++ b/fmdsp/fmdsp-vramlookup-ssse3.c @@ -0,0 +1,48 @@ +#include "fmdsp/fmdsp.h" +#include + +void fmdsp_vramlookup_ssse3(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) { + __m128i z = _mm_setzero_si128(); + __m128i p[3]; + { + union { + __m128i xmm; + uint8_t u8[16]; + } pi[3]; + for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) { + for (int c = 0; c < 3; c++) { + pi[c].u8[i] = palette[i*3+c]; + } + } + for (int c = 0; c < 3; c++) { + p[c] = _mm_load_si128(&pi[c].xmm); + } + } + + for (int y = 0; y < PC98_H; y++) { + for (int x = 0; x < 40; x++) { + // 16 pixels + __m128i v = _mm_loadu_si128((__m128i *)&vram[y*PC98_W+x*16]); + + __m128i r = _mm_shuffle_epi8(p[0], v); + __m128i g = _mm_shuffle_epi8(p[1], v); + __m128i b = _mm_shuffle_epi8(p[2], v); + + __m128i gb[2], zr[2]; + gb[0] = _mm_unpacklo_epi8(b, g); + gb[1] = _mm_unpackhi_epi8(b, g); + zr[0] = _mm_unpacklo_epi8(r, z); + zr[1] = _mm_unpackhi_epi8(r, z); + + __m128i o[4]; + o[0] = _mm_unpacklo_epi16(gb[0], zr[0]); + o[1] = _mm_unpackhi_epi16(gb[0], zr[0]); + o[2] = _mm_unpacklo_epi16(gb[1], zr[1]); + o[3] = _mm_unpackhi_epi16(gb[1], zr[1]); + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)&vram32[(x*4+i)*16], o[i]); + } + } + vram32 += stride; + } +} -- cgit v1.2.3