diff options
Diffstat (limited to 'fmdsp')
| -rw-r--r-- | fmdsp/fmdsp-vramlookup-c.c | 13 | ||||
| -rw-r--r-- | fmdsp/fmdsp-vramlookup-ssse3.c | 48 | ||||
| -rw-r--r-- | fmdsp/fmdsp.h | 3 | 
3 files changed, 59 insertions, 5 deletions
diff --git a/fmdsp/fmdsp-vramlookup-c.c b/fmdsp/fmdsp-vramlookup-c.c
index f900c8d..3d06f71 100644
--- a/fmdsp/fmdsp-vramlookup-c.c
+++ b/fmdsp/fmdsp-vramlookup-c.c
@@ -1,14 +1,17 @@
 #include "fmdsp/fmdsp.h"
 
 void fmdsp_vramlookup_c(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) {
+  uint32_t palette32[FMDSP_PALETTE_COLORS];
+  for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) {
+    uint8_t r = palette[i*3+0];
+    uint8_t g = palette[i*3+1];
+    uint8_t b = palette[i*3+2];
+    palette32[i] = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b);
+  }
   for (int y = 0; y < PC98_H; y++) {
     for (int x = 0; x < PC98_W; x++) {
-      uint8_t r = palette[vram[y*PC98_W+x]*3+0];
-      uint8_t g = palette[vram[y*PC98_W+x]*3+1];
-      uint8_t b = palette[vram[y*PC98_W+x]*3+2];
-      uint32_t data = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b);
       uint32_t *row = (uint32_t *)(vram32 + y*stride);
-      row[x] = data;
+      row[x] = palette32[vram[y*PC98_W+x]];
     }
   }
 }
diff --git a/fmdsp/fmdsp-vramlookup-ssse3.c b/fmdsp/fmdsp-vramlookup-ssse3.c
new file mode 100644
index 0000000..30e7311
--- /dev/null
+++ b/fmdsp/fmdsp-vramlookup-ssse3.c
@@ -0,0 +1,48 @@
+#include "fmdsp/fmdsp.h"
+#include <tmmintrin.h>
+
+void fmdsp_vramlookup_ssse3(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) {
+  __m128i z = _mm_setzero_si128();
+  __m128i p[3];
+  {
+    union {
+      __m128i xmm;
+      uint8_t u8[16];
+    } pi[3];
+    for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) {
+      for (int c = 0; c < 3; c++) {
+        pi[c].u8[i] = palette[i*3+c];
+      }
+    }
+    for (int c = 0; c < 3; c++) {
+      p[c] = _mm_load_si128(&pi[c].xmm);
+    }
+  }
+
+  for (int y = 0; y < PC98_H; y++) {
+    for (int x = 0; x < 40; x++) {
+      // 16 pixels
+      __m128i v = _mm_loadu_si128((__m128i *)&vram[y*PC98_W+x*16]);
+
+      __m128i r = _mm_shuffle_epi8(p[0], v);
+      __m128i g = _mm_shuffle_epi8(p[1], v);
+      __m128i b = _mm_shuffle_epi8(p[2], v);
+
+      __m128i gb[2], zr[2];
+      gb[0] = _mm_unpacklo_epi8(b, g);
+      gb[1] = _mm_unpackhi_epi8(b, g);
+      zr[0] = _mm_unpacklo_epi8(r, z);
+      zr[1] = _mm_unpackhi_epi8(r, z);
+      
+      __m128i o[4];
+      o[0] = _mm_unpacklo_epi16(gb[0], zr[0]);
+      o[1] = _mm_unpackhi_epi16(gb[0], zr[0]);
+      o[2] = _mm_unpacklo_epi16(gb[1], zr[1]);
+      o[3] = _mm_unpackhi_epi16(gb[1], zr[1]);
+      for (int i = 0; i < 4; i++) {
+        _mm_storeu_si128((__m128i *)&vram32[(x*4+i)*16], o[i]);
+      }
+    }
+    vram32 += stride;
+  }
+}
diff --git a/fmdsp/fmdsp.h b/fmdsp/fmdsp.h
index a7e4aab..a8c3edc 100644
--- a/fmdsp/fmdsp.h
+++ b/fmdsp/fmdsp.h
@@ -59,6 +59,9 @@ void fmdsp_vramlookup_c(uint8_t *vram32,
                         const uint8_t *vram,
                         const uint8_t *palette,
                         int stride);
+
+void fmdsp_vramlookup_neon(uint8_t *, const uint8_t *, const uint8_t *, int);
+void fmdsp_vramlookup_ssse3(uint8_t *, const uint8_t *, const uint8_t *, int);
 #ifdef __cplusplus
 }
 #endif
