From f47eba7d7d4c6a1d9501e027b63bbab04bb7d417 Mon Sep 17 00:00:00 2001
From: Takamichi Horikawa
Date: Thu, 30 Mar 2017 23:31:39 +0000
Subject: add NEON optimization

---
 fmdsp/fmdsp-vramlookup-c.c    | 14 ++++++++
 fmdsp/fmdsp-vramlookup-neon.s | 83 +++++++++++++++++++++++++++++++++++++++++++
 fmdsp/fmdsp.c                 | 14 +++-----
 fmdsp/fmdsp.h                 | 10 ++++++
 4 files changed, 111 insertions(+), 10 deletions(-)
 create mode 100644 fmdsp/fmdsp-vramlookup-c.c
 create mode 100644 fmdsp/fmdsp-vramlookup-neon.s

(limited to 'fmdsp')

diff --git a/fmdsp/fmdsp-vramlookup-c.c b/fmdsp/fmdsp-vramlookup-c.c
new file mode 100644
index 0000000..f900c8d
--- /dev/null
+++ b/fmdsp/fmdsp-vramlookup-c.c
@@ -0,0 +1,14 @@
+#include "fmdsp/fmdsp.h"
+
+// Reference palette lookup: vram holds PC98_W*PC98_H palette indices, palette is
+// r,g,b byte triplets; writes 0x00RRGGBB pixels to vram32 rows stride bytes apart.
+void fmdsp_vramlookup_c(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) {
+  for (int y = 0; y < PC98_H; y++) {
+    uint32_t *row = (uint32_t *)(vram32 + y*stride);
+    const uint8_t *src = vram + y*PC98_W;
+    for (int x = 0; x < PC98_W; x++) {
+      const uint8_t *rgb = palette + src[x]*3;
+      row[x] = (((uint32_t)rgb[0])<<16) | (((uint32_t)rgb[1])<<8) | ((uint32_t)rgb[2]);
+    }
+  }
+}
diff --git a/fmdsp/fmdsp-vramlookup-neon.s b/fmdsp/fmdsp-vramlookup-neon.s
new file mode 100644
index 0000000..3cfb957
--- /dev/null
+++ b/fmdsp/fmdsp-vramlookup-neon.s
@@ -0,0 +1,83 @@
+@ neon register map (each d register holds one byte for 8 pixels)
+@  0,  3,  6,  9, 12, 15, 18, 21 b
+@  1,  4,  7, 10, 13, 16, 19, 22 g
+@  2,  5,  8, 11, 14, 17, 20, 23 r
+
+@ 16, 17, 18, 19, 20, 21, 22, 23 vram (indices in, overwritten by lookup results)
+
+@ 26, 27 r palette
+@ 28, 29 g palette
+@ 30, 31 b palette
+
+.global fmdsp_vramlookup_neon
+
+@ r0: uint8_t *vram32
+@       4 bytes aligned
+@       b, g, r, x (x is not cleared: it is the byte from the last register of each vst4 list)
+@ r1: const uint8_t *vram
+@ r2: const uint8_t *palette
+@       r0, g0, b0, r1, g1, b1, ... (16 entries are loaded)
+@ r3: int stride
+fmdsp_vramlookup_neon:
+  push {lr}
+  vpush {d8-d15} @ callee-saved under the AAPCS, clobbered by the lookup below
+@ load palette: de-interleave entries 0-7, then entries 8-15
+  vld3.8 {d26, d28, d30}, [r2]!
+  vld3.8 {d27, d29, d31}, [r2]
+
+  mov r14, #400 @ rows left
+.loopcol:
+  mov r2, r0
+  mov r12, #10 @ 64-pixel chunks left in this row (10 * 64 = 640)
+.looprow:
+@ r2: output address within the row, r1: input address
+
+@ load vram
+  vld1.8 {d16-d19}, [r1]!
+  vld1.8 {d20-d23}, [r1]!
+
+@ lookup (vtbl yields 0 for an index >= 16)
+  vtbl.8 d0, {d30-d31}, d16
+  vtbl.8 d1, {d28-d29}, d16
+  vtbl.8 d2, {d26-d27}, d16
+  vtbl.8 d3, {d30-d31}, d17
+  vtbl.8 d4, {d28-d29}, d17
+  vtbl.8 d5, {d26-d27}, d17
+  vtbl.8 d6, {d30-d31}, d18
+  vtbl.8 d7, {d28-d29}, d18
+  vtbl.8 d8, {d26-d27}, d18
+  vtbl.8 d9, {d30-d31}, d19
+  vtbl.8 d10, {d28-d29}, d19
+  vtbl.8 d11, {d26-d27}, d19
+  vtbl.8 d12, {d30-d31}, d20
+  vtbl.8 d13, {d28-d29}, d20
+  vtbl.8 d14, {d26-d27}, d20
+  vtbl.8 d15, {d30-d31}, d21
+  vtbl.8 d16, {d28-d29}, d21
+  vtbl.8 d17, {d26-d27}, d21
+  vtbl.8 d18, {d30-d31}, d22
+  vtbl.8 d19, {d28-d29}, d22
+  vtbl.8 d20, {d26-d27}, d22
+  vtbl.8 d21, {d30-d31}, d23
+  vtbl.8 d22, {d28-d29}, d23
+  vtbl.8 d23, {d26-d27}, d23
+
+@ store vram32
+  vst4.8 {d0-d3}, [r2]!
+  vst4.8 {d3-d6}, [r2]!
+  vst4.8 {d6-d9}, [r2]!
+  vst4.8 {d9-d12}, [r2]!
+  vst4.8 {d12-d15}, [r2]!
+  vst4.8 {d15-d18}, [r2]!
+  vst4.8 {d18-d21}, [r2]!
+  vst4.8 {d21-d24}, [r2]!
+
+  subs r12, #1
+  bne .looprow
+
+  add r0, r3
+  subs r14, #1
+  bne .loopcol
+
+  vpop {d8-d15}
+  pop {pc}
diff --git a/fmdsp/fmdsp.c b/fmdsp/fmdsp.c
index 3d082af..2f708dd 100644
--- a/fmdsp/fmdsp.c
+++ b/fmdsp/fmdsp.c
@@ -5,6 +5,8 @@
 #include
 #include "libopna/opna.h"
 
+fmdsp_vramlookup_type fmdsp_vramlookup_func = fmdsp_vramlookup_c;
+
 static void vramblit(uint8_t *vram, int x, int y,
                      const uint8_t *data, int w, int h) {
   for (int yi = 0; yi < h; yi++) {
@@ -727,17 +729,9 @@ void fmdsp_update(struct fmdsp *fmdsp,
   }
   fmdsp_palette_fade(fmdsp);
 }
+
 void fmdsp_vrampalette(struct fmdsp *fmdsp, const uint8_t *vram, uint8_t *vram32, int stride) {
-  for (int y = 0; y < PC98_H; y++) {
-    for (int x = 0; x < PC98_W; x++) {
-      uint8_t r = fmdsp->palette[vram[y*PC98_W+x]*3+0];
-      uint8_t g = fmdsp->palette[vram[y*PC98_W+x]*3+1];
-      uint8_t b = fmdsp->palette[vram[y*PC98_W+x]*3+2];
-      uint32_t data = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b);
-      uint32_t *row = (uint32_t *)(vram32 + y*stride);
-      row[x] = data;
-    }
-  }
+  fmdsp_vramlookup_func(vram32, vram, fmdsp->palette, stride);
 }
 
 void fmdsp_dispstyle_set(struct fmdsp *fmdsp, enum FMDSP_DISPSTYLE style) {
diff --git a/fmdsp/fmdsp.h b/fmdsp/fmdsp.h
index cce2310..a7e4aab 100644
--- a/fmdsp/fmdsp.h
+++ b/fmdsp/fmdsp.h
@@ -49,6 +49,16 @@ void fmdsp_vrampalette(struct fmdsp *fmdsp, const uint8_t *vram, uint8_t *vram32
 void fmdsp_font_from_fontrom(uint8_t *font, const uint8_t *fontrom);
 void fmdsp_palette_set(struct fmdsp *fmdsp, int p);
 void fmdsp_dispstyle_set(struct fmdsp *fmdsp, enum FMDSP_DISPSTYLE style);
+
+typedef void (*fmdsp_vramlookup_type)(uint8_t *vram32,
+                                      const uint8_t *vram,
+                                      const uint8_t *palette,
+                                      int stride);
+extern fmdsp_vramlookup_type fmdsp_vramlookup_func;
+void fmdsp_vramlookup_c(uint8_t *vram32,
+                        const uint8_t *vram,
+                        const uint8_t *palette,
+                        int stride);
 #ifdef __cplusplus
 }
 #endif
-- 
cgit v1.2.3