about | summary | refs | log | tree | commit | diff
path: root/fmdsp
diff options
context:
space:
mode:
author: Takamichi Horikawa <takamichiho@gmail.com> 2017-04-01 14:53:59 +0900
committer: Takamichi Horikawa <takamichiho@gmail.com> 2017-04-01 14:53:59 +0900
commit: 24225349831278c23c6dfc4515e071f4b27b2c41 (patch)
tree: 35a853f7f35a53560a5b1bcfd5eda3213990b872 /fmdsp
parent: 5460067b61f86843a0435ebb06a6ebb8223c3dca (diff)
add sse2/ssse3 simd optimization
Diffstat (limited to 'fmdsp')
-rw-r--r-- fmdsp/fmdsp-vramlookup-c.c 13
-rw-r--r-- fmdsp/fmdsp-vramlookup-ssse3.c 48
-rw-r--r-- fmdsp/fmdsp.h 3
3 files changed, 59 insertions, 5 deletions
diff --git a/fmdsp/fmdsp-vramlookup-c.c b/fmdsp/fmdsp-vramlookup-c.c
index f900c8d..3d06f71 100644
--- a/fmdsp/fmdsp-vramlookup-c.c
+++ b/fmdsp/fmdsp-vramlookup-c.c
@@ -1,14 +1,17 @@
#include "fmdsp/fmdsp.h"
void fmdsp_vramlookup_c(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) {
+ uint32_t palette32[FMDSP_PALETTE_COLORS]; // fmdsp_vramlookup_c maps each palette index in vram to a 32-bit pixel in vram32; this table holds every palette entry pre-packed so the pixel loop needs a single lookup
+ for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) { // expand the palette once per call instead of once per pixel
+ uint8_t r = palette[i*3+0]; // palette is read as 3 bytes per entry, in r, g, b order
+ uint8_t g = palette[i*3+1];
+ uint8_t b = palette[i*3+2];
+ palette32[i] = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b); // packed as 0x00RRGGBB
+ }
for (int y = 0; y < PC98_H; y++) {
for (int x = 0; x < PC98_W; x++) {
- uint8_t r = palette[vram[y*PC98_W+x]*3+0];
- uint8_t g = palette[vram[y*PC98_W+x]*3+1];
- uint8_t b = palette[vram[y*PC98_W+x]*3+2];
- uint32_t data = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b);
uint32_t *row = (uint32_t *)(vram32 + y*stride);
- row[x] = data;
+ row[x] = palette32[vram[y*PC98_W+x]]; // NOTE(review): the vram byte is used as an index without a range check against FMDSP_PALETTE_COLORS — confirm callers guarantee it
}
}
}
diff --git a/fmdsp/fmdsp-vramlookup-ssse3.c b/fmdsp/fmdsp-vramlookup-ssse3.c
new file mode 100644
index 0000000..30e7311
--- /dev/null
+++ b/fmdsp/fmdsp-vramlookup-ssse3.c
@@ -0,0 +1,48 @@
+#include "fmdsp/fmdsp.h"
+#include <tmmintrin.h>
+
+void fmdsp_vramlookup_ssse3(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) { // SSSE3 palette lookup: turns one-byte palette indices in vram into 4-byte pixels in vram32, 16 pixels per step
+ __m128i z = _mm_setzero_si128(); // all-zero vector; interleaved with red below so the top byte of every output pixel is 0
+ __m128i p[3]; // per-channel lookup tables: byte i of p[c] is palette[i*3+c]
+ {
+ union { // lets the tables be filled byte-wise and then loaded as whole vectors
+ __m128i xmm;
+ uint8_t u8[16]; // NOTE(review): holds 16 entries only; assumes FMDSP_PALETTE_COLORS (defined outside this diff) is <= 16 — confirm
+ } pi[3]; // declared without an initializer: lanes the fill loop does not write stay indeterminate
+ for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) {
+ for (int c = 0; c < 3; c++) {
+ pi[c].u8[i] = palette[i*3+c]; // de-interleave the 3-byte palette entries into one table per channel
+ }
+ }
+ for (int c = 0; c < 3; c++) {
+ p[c] = _mm_load_si128(&pi[c].xmm); // aligned load; the union inherits the alignment of its __m128i member
+ }
+ }
+
+ for (int y = 0; y < PC98_H; y++) {
+ for (int x = 0; x < 40; x++) { // 40 steps of 16 pixels = 640 pixels per row; NOTE(review): presumably PC98_W/16 — confirm PC98_W is 640
+ // Process 16 pixels per iteration: 16 one-byte palette indices in, 64 bytes of pixels out.
+ __m128i v = _mm_loadu_si128((__m128i *)&vram[y*PC98_W+x*16]); // unaligned load of 16 indices from source row y
+
+ __m128i r = _mm_shuffle_epi8(p[0], v); // byte shuffle used as a table lookup: each index byte in v selects a byte of the channel table
+ __m128i g = _mm_shuffle_epi8(p[1], v);
+ __m128i b = _mm_shuffle_epi8(p[2], v);
+
+ __m128i gb[2], zr[2];
+ gb[0] = _mm_unpacklo_epi8(b, g); // interleave bytes: b0 g0 b1 g1 ... for pixels 0-7
+ gb[1] = _mm_unpackhi_epi8(b, g); // same for pixels 8-15
+ zr[0] = _mm_unpacklo_epi8(r, z); // r0 0 r1 0 ... for pixels 0-7
+ zr[1] = _mm_unpackhi_epi8(r, z); // same for pixels 8-15
+
+ __m128i o[4];
+ o[0] = _mm_unpacklo_epi16(gb[0], zr[0]); // interleave 16-bit pairs: bytes b g r 0 per pixel, the same value the C version builds as (r<<16)|(g<<8)|b
+ o[1] = _mm_unpackhi_epi16(gb[0], zr[0]);
+ o[2] = _mm_unpacklo_epi16(gb[1], zr[1]);
+ o[3] = _mm_unpackhi_epi16(gb[1], zr[1]);
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)&vram32[(x*4+i)*16], o[i]); // four unaligned 16-byte stores = 64 output bytes at byte offset x*64 of the current row
+ }
+ }
+ vram32 += stride; // vram32 is a byte pointer, so this moves to the next output row by stride bytes
+ }
+}
diff --git a/fmdsp/fmdsp.h b/fmdsp/fmdsp.h
index a7e4aab..a8c3edc 100644
--- a/fmdsp/fmdsp.h
+++ b/fmdsp/fmdsp.h
@@ -59,6 +59,9 @@ void fmdsp_vramlookup_c(uint8_t *vram32,
const uint8_t *vram,
const uint8_t *palette,
int stride);
+
+void fmdsp_vramlookup_neon(uint8_t *, const uint8_t *, const uint8_t *, int); // NOTE(review): no definition is visible in this diff; parameters presumably mirror fmdsp_vramlookup_c (vram32, vram, palette, stride) — confirm
+void fmdsp_vramlookup_ssse3(uint8_t *, const uint8_t *, const uint8_t *, int); // SSSE3 variant; arguments in order are vram32, vram, palette, stride (names omitted here, unlike the fmdsp_vramlookup_c prototype)
#ifdef __cplusplus
}
#endif