diff options
author | Takamichi Horikawa <takamichiho@gmail.com> | 2017-04-01 14:53:59 +0900 |
---|---|---|
committer | Takamichi Horikawa <takamichiho@gmail.com> | 2017-04-01 14:53:59 +0900 |
commit | 24225349831278c23c6dfc4515e071f4b27b2c41 (patch) | |
tree | 35a853f7f35a53560a5b1bcfd5eda3213990b872 | |
parent | 5460067b61f86843a0435ebb06a6ebb8223c3dca (diff) |
add sse2/ssse3 simd optimization
-rw-r--r-- | fmdsp/fmdsp-vramlookup-c.c | 13 | ||||
-rw-r--r-- | fmdsp/fmdsp-vramlookup-ssse3.c | 48 | ||||
-rw-r--r-- | fmdsp/fmdsp.h | 3 | ||||
-rw-r--r-- | gtk/Makefile.am | 21 | ||||
-rw-r--r-- | gtk/configure.ac | 8 | ||||
-rw-r--r-- | gtk/main.c | 32 | ||||
-rw-r--r-- | libopna/opnassg.h | 2 | ||||
-rw-r--r-- | win32/amd64/Makefile | 10 | ||||
-rw-r--r-- | win32/fmplayer.mak | 4 | ||||
-rw-r--r-- | win32/main.c | 79 | ||||
-rw-r--r-- | win32/x86/Makefile | 6 |
11 files changed, 181 insertions, 45 deletions
diff --git a/fmdsp/fmdsp-vramlookup-c.c b/fmdsp/fmdsp-vramlookup-c.c index f900c8d..3d06f71 100644 --- a/fmdsp/fmdsp-vramlookup-c.c +++ b/fmdsp/fmdsp-vramlookup-c.c @@ -1,14 +1,17 @@ #include "fmdsp/fmdsp.h" void fmdsp_vramlookup_c(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) { + uint32_t palette32[FMDSP_PALETTE_COLORS]; + for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) { + uint8_t r = palette[i*3+0]; + uint8_t g = palette[i*3+1]; + uint8_t b = palette[i*3+2]; + palette32[i] = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b); + } for (int y = 0; y < PC98_H; y++) { for (int x = 0; x < PC98_W; x++) { - uint8_t r = palette[vram[y*PC98_W+x]*3+0]; - uint8_t g = palette[vram[y*PC98_W+x]*3+1]; - uint8_t b = palette[vram[y*PC98_W+x]*3+2]; - uint32_t data = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b); uint32_t *row = (uint32_t *)(vram32 + y*stride); - row[x] = data; + row[x] = palette32[vram[y*PC98_W+x]]; } } } diff --git a/fmdsp/fmdsp-vramlookup-ssse3.c b/fmdsp/fmdsp-vramlookup-ssse3.c new file mode 100644 index 0000000..30e7311 --- /dev/null +++ b/fmdsp/fmdsp-vramlookup-ssse3.c @@ -0,0 +1,48 @@ +#include "fmdsp/fmdsp.h" +#include <tmmintrin.h> + +void fmdsp_vramlookup_ssse3(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) { + __m128i z = _mm_setzero_si128(); + __m128i p[3]; + { + union { + __m128i xmm; + uint8_t u8[16]; + } pi[3]; + for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) { + for (int c = 0; c < 3; c++) { + pi[c].u8[i] = palette[i*3+c]; + } + } + for (int c = 0; c < 3; c++) { + p[c] = _mm_load_si128(&pi[c].xmm); + } + } + + for (int y = 0; y < PC98_H; y++) { + for (int x = 0; x < 40; x++) { + // 16 pixels + __m128i v = _mm_loadu_si128((__m128i *)&vram[y*PC98_W+x*16]); + + __m128i r = _mm_shuffle_epi8(p[0], v); + __m128i g = _mm_shuffle_epi8(p[1], v); + __m128i b = _mm_shuffle_epi8(p[2], v); + + __m128i gb[2], zr[2]; + gb[0] = _mm_unpacklo_epi8(b, g); + gb[1] = _mm_unpackhi_epi8(b, g); + zr[0] = _mm_unpacklo_epi8(r, z); + zr[1] = _mm_unpackhi_epi8(r, z); + + __m128i o[4]; + o[0] = _mm_unpacklo_epi16(gb[0], zr[0]); + o[1] = _mm_unpackhi_epi16(gb[0], zr[0]); + o[2] = _mm_unpacklo_epi16(gb[1], zr[1]); + o[3] = _mm_unpackhi_epi16(gb[1], zr[1]); + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)&vram32[(x*4+i)*16], o[i]); + } + } + vram32 += stride; + } +} diff --git a/fmdsp/fmdsp.h b/fmdsp/fmdsp.h index a7e4aab..a8c3edc 100644 --- a/fmdsp/fmdsp.h +++ b/fmdsp/fmdsp.h @@ -59,6 +59,9 @@ void fmdsp_vramlookup_c(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride); + +void fmdsp_vramlookup_neon(uint8_t *, const uint8_t *, const uint8_t *, int); +void fmdsp_vramlookup_ssse3(uint8_t *, const uint8_t *, const uint8_t *, int); #ifdef __cplusplus } #endif diff --git a/gtk/Makefile.am b/gtk/Makefile.am index 0a9a95b..5d75f84 100644 --- a/gtk/Makefile.am +++ b/gtk/Makefile.am @@ -18,12 +18,29 @@ FMDSP_SRC=../fmdsp/fmdsp.c \ ../fmdsp/font_rom.c \ ../fmdsp/font_fmdsp_small.c +#fmplayer_CFLAGS=$(CFLAGS) +#CFLAGS= +fmplayer_CPPFLAGS=-Wall -Wextra -pedantic \ + -I.. \ + $(GTK3_CFLAGS) $(PORTAUDIO_CFLAGS) +fmplayer_LDADD=$(GTK3_LIBS) $(PORTAUDIO_LIBS) + if ENABLE_NEON LIBOPNA_SRC+=../libopna/opnassg-sinc-neon.s FMDSP_SRC+=../fmdsp/fmdsp-vramlookup-neon.s fmplayer_CCASFLAGS=-march=armv8-a -mfpu=crypto-neon-fp-armv8 endif +if ENABLE_SSE +noinst_LIBRARIES=libsse.a +fmplayer_LDADD+=libsse.a +libsse_a_SOURCES=../libopna/opnassg-sinc-sse2.c \ + ../fmdsp/fmdsp-vramlookup-ssse3.c +libsse_a_CPPFLAGS=$(fmplayer_CPPFLAGS) +#no way to add -O3?? (always overridden by CFLAGS) +libsse_a_CFLAGS=-mssse3 +endif + fmplayer_SOURCES=main.c \ toneview.c \ oscilloview.c \ @@ -34,7 +51,3 @@ fmplayer_SOURCES=main.c \ $(FMDRIVER_SRC) \ $(FMDSP_SRC) -fmplayer_CPPFLAGS=-Wall -Wextra -pedantic \ - -I.. \ - $(GTK3_CFLAGS) $(PORTAUDIO_CFLAGS) -fmplayer_LDADD=$(GTK3_LIBS) $(PORTAUDIO_LIBS) diff --git a/gtk/configure.ac b/gtk/configure.ac index 2727888..f49bd74 100644 --- a/gtk/configure.ac +++ b/gtk/configure.ac @@ -2,6 +2,8 @@ AC_INIT([fmplayer], [0.1.0]) AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) AM_SILENT_RULES([yes]) AC_PROG_CC_C99 +AC_PROG_RANLIB +AM_PROG_AR AM_PROG_AS dnl AM_PATH_SDL2([2.0.5]) @@ -14,5 +16,11 @@ AS_IF([test "x$enable_neon" = "xyes"], [ AC_DEFINE([ENABLE_NEON]) ]) +AC_CHECK_HEADER([emmintrin.h], [emmintrin_found=yes]) +AM_CONDITIONAL([ENABLE_SSE], [test "x$emmintrin_found" = "xyes"]) +AS_IF([test "x$emmintrin_found" = "xyes"], [ + AC_DEFINE([ENABLE_SSE]) +]) + AC_CONFIG_FILES([Makefile]) AC_OUTPUT @@ -55,7 +55,9 @@ static struct { const char *current_uri; bool oscillo_should_update; struct oscillodata oscillodata_audiothread[LIBOPNA_OSCILLO_TRACK_COUNT]; -} g; +} g = { + .oscillo_should_update = true +}; static void quit(void) { if (g.pastream) { @@ -67,18 +69,26 @@ static void quit(void) { } static void on_destroy(GtkWidget *w, gpointer ptr) { + (void)w; + (void)ptr; quit(); } static void on_menu_quit(GtkMenuItem *menuitem, gpointer ptr) { + (void)menuitem; + (void)ptr; quit(); } static void on_tone_view(GtkMenuItem *menuitem, gpointer ptr) { + (void)menuitem; + (void)ptr; show_toneview(); } static void on_oscillo_view(GtkMenuItem *menuitem, gpointer ptr) { + (void)menuitem; + (void)ptr; show_oscilloview(); } @@ -95,6 +105,9 @@ static int pastream_cb(const void *inptr, void *outptr, unsigned long frames, const PaStreamCallbackTimeInfo *timeinfo, PaStreamCallbackFlags statusFlags, void *userdata) { + (void)inptr; + (void)timeinfo; + (void)statusFlags; struct opna_timer *timer = (struct opna_timer *)userdata; int16_t *buf = (int16_t *)outptr; memset(outptr, 0, sizeof(int16_t)*frames*2); @@ -133,7 +146,8 @@ static void opna_writereg_libopna(struct fmdriver_work *work, unsigned addr, uns } static unsigned opna_readreg_libopna(struct fmdriver_work *work, unsigned addr) { - struct opna_timer *timer = (struct opna_timer *)work->opna; + (void)work; + //struct opna_timer *timer = (struct opna_timer *)work->opna; return opna_readreg(&g.opna, addr); } @@ -280,7 +294,7 @@ static bool openfile(const char *uri) { g.pa_paused = false; { const char *turi = strdup(uri); - free(g.current_uri); + free((void *)g.current_uri); g.current_uri = turi; } return true; @@ -290,6 +304,7 @@ err: } static void on_file_activated(GtkFileChooser *chooser, gpointer ptr) { + (void)ptr; gchar *filename = gtk_file_chooser_get_uri(chooser); if (filename) { openfile(filename); @@ -326,6 +341,8 @@ static GtkWidget *create_menubar() { static gboolean draw_cb(GtkWidget *w, cairo_t *cr, gpointer p) { + (void)w; + (void)p; fmdsp_update(&g.fmdsp, &g.work, &g.opna, g.vram); fmdsp_vrampalette(&g.fmdsp, g.vram, g.vram32, g.vram32_stride); cairo_surface_t *s = cairo_image_surface_create_for_data( @@ -341,6 +358,7 @@ static gboolean draw_cb(GtkWidget *w, static gboolean tick_cb(GtkWidget *w, GdkFrameClock *frame_clock, gpointer p) { + (void)w; (void)frame_clock; gtk_widget_queue_draw(GTK_WIDGET(p)); return G_SOURCE_CONTINUE; @@ -472,6 +490,7 @@ static void drag_data_recv_cb( gint x, gint y, GtkSelectionData *data, guint info, guint time, gpointer ptr) { + (void)w; (void)x; (void)y; (void)info; @@ -484,14 +503,15 @@ static void drag_data_recv_cb( gtk_drag_finish(ctx, TRUE, FALSE, time); } -void opna_ssg_sinc_calc_neon(unsigned, const int16_t *, int32_t *); -void fmdsp_vramlookup_neon(uint8_t *, const uint8_t *, const uint8_t *, int); - int main(int argc, char **argv) { #ifdef ENABLE_NEON opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_neon; fmdsp_vramlookup_func = fmdsp_vramlookup_neon; #endif +#ifdef ENABLE_SSE + if (__builtin_cpu_supports("sse2")) opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_sse2; + if (__builtin_cpu_supports("ssse3")) fmdsp_vramlookup_func = fmdsp_vramlookup_ssse3; +#endif load_fontrom(); gtk_init(&argc, &argv); GtkWidget *w = gtk_window_new(GTK_WINDOW_TOPLEVEL); diff --git a/libopna/opnassg.h b/libopna/opnassg.h index 223d542..aadf53e 100644 --- a/libopna/opnassg.h +++ b/libopna/opnassg.h @@ -66,6 +66,8 @@ typedef void (*opna_ssg_sinc_calc_func_type)(unsigned resampler_index, extern opna_ssg_sinc_calc_func_type opna_ssg_sinc_calc_func; void opna_ssg_sinc_calc_c(unsigned resampler_index, const int16_t *inbuf, int32_t *outbuf); +void opna_ssg_sinc_calc_neon(unsigned, const int16_t *, int32_t *); +void opna_ssg_sinc_calc_sse2(unsigned, const int16_t *, int32_t *); extern const int16_t opna_ssg_sinctable[OPNA_SSG_SINCTABLELEN*2]; diff --git a/win32/amd64/Makefile b/win32/amd64/Makefile index 177ef90..22ef073 100644 --- a/win32/amd64/Makefile +++ b/win32/amd64/Makefile @@ -9,14 +9,16 @@ vpath %.rc .. include ../fmplayer.mak OBJS=$(addsuffix .o,$(OBJBASE) $(RESBASE)) +OBJS+=$(addsuffix .sse.o,$(SSEOBJBASE)) ARCH=x86_64 PREFIX=$(ARCH)-w64-mingw32- CC=$(PREFIX)gcc WINDRES=$(PREFIX)windres STRIP=$(PREFIX)strip -CFLAGS=-std=c99 -O2 -Wall -Wextra -Werror -pedantic -Wno-unused-parameter -Wno-missing-field-initializers -I../.. \ +CFLAGS=-std=c99 -O2 -Wall -Werror -Wextra -pedantic -Wno-unused-parameter -Wno-missing-field-initializers -I../.. \ $(addprefix -D,$(DEFINES)) -LIBS=-s -mwindows -municode \ +SSECFLAGS=-mssse3 -O3 +LIBS=-mwindows -municode \ $(addprefix -l,$(LIBBASE)) $(TARGET): $(OBJS) @@ -28,6 +30,10 @@ $(TARGET): $(OBJS) @echo " CC $@" @$(CC) $(CFLAGS) -c $< -o $@ +%.sse.o: %.c + @echo " CC $@" + @$(CC) $(CFLAGS) $(SSECFLAGS) -c $< -o $@ + %.o: %.rc $(ICON) @echo " WINDRES $@" @$(WINDRES) -o $@ -i $< diff --git a/win32/fmplayer.mak b/win32/fmplayer.mak index 1f68f4d..180de1c 100644 --- a/win32/fmplayer.mak +++ b/win32/fmplayer.mak @@ -15,12 +15,16 @@ LIBOPNA_OBJS=opna \ opnatimer \ opnafm \ opnassg \ + opnassg-sinc-c \ opnadrum \ opnaadpcm FMDSP_OBJS=fmdsp \ + fmdsp-vramlookup-c \ font_rom \ font_fmdsp_small TONEDATA_OBJS=tonedata +SSEOBJBASE=opnassg-sinc-sse2 \ + fmdsp-vramlookup-ssse3 OBJBASE=main \ toneview \ oscilloview \ diff --git a/win32/main.c b/win32/main.c index ab038ae..8dc6bb8 100644 --- a/win32/main.c +++ b/win32/main.c @@ -52,6 +52,7 @@ static struct { struct fmplayer_file *fmfile; struct fmdsp fmdsp; uint8_t vram[PC98_W*PC98_H]; + //uint8_t *vram; struct fmdsp_font font; uint8_t fontrom[FONT_ROM_FILESIZE]; bool font_loaded; @@ -66,6 +67,8 @@ static struct { bool fmdsp_2x; struct oscillodata oscillodata_audiothread[LIBOPNA_OSCILLO_TRACK_COUNT]; UINT mmtimer; + HBITMAP bitmap_vram; + uint8_t *vram32; } g; HWND g_currentdlg; @@ -445,6 +448,20 @@ static void CALLBACK mmtimer_cb(UINT timerid, UINT msg, static bool on_create(HWND hwnd, CREATESTRUCT *cs) { (void)cs; + struct bitmap_info_fmdsp { + BITMAPINFOHEADER head; + RGBQUAD colors[FMDSP_PALETTE_COLORS]; + } bmi = {0}; + bmi.head.biSize = sizeof(bmi.head); + bmi.head.biWidth = PC98_W; + bmi.head.biHeight = -PC98_H; + bmi.head.biPlanes = 1; + bmi.head.biBitCount = 32; + bmi.head.biCompression = BI_RGB; + //bmi.head.biClrUsed = FMDSP_PALETTE_COLORS; + g.bitmap_vram = CreateDIBSection( + 0, (BITMAPINFO *)&bmi, DIB_RGB_COLORS, (void **)&g.vram32, 0, 0 + ); HWND button = CreateWindowEx( 0, L"BUTTON", @@ -563,40 +580,26 @@ static void on_destroy(HWND hwnd) { static void on_paint(HWND hwnd) { fmdsp_update(&g.fmdsp, &g.work, &g.opna, g.vram); + fmdsp_vrampalette(&g.fmdsp, g.vram, g.vram32, PC98_W*4); PAINTSTRUCT ps; - static BITMAPINFO *bi = 0; - if (!bi) { - bi = HeapAlloc(g.heap, HEAP_ZERO_MEMORY, - sizeof(BITMAPINFOHEADER) + sizeof(RGBQUAD)*FMDSP_PALETTE_COLORS); - if (!bi) return; - bi->bmiHeader.biSize = sizeof(bi->bmiHeader); - bi->bmiHeader.biWidth = PC98_W; - bi->bmiHeader.biHeight = -PC98_H; - bi->bmiHeader.biPlanes = 1; - bi->bmiHeader.biBitCount = 8; - bi->bmiHeader.biCompression = BI_RGB; - bi->bmiHeader.biClrUsed = FMDSP_PALETTE_COLORS; - } - for (int p = 0; p < FMDSP_PALETTE_COLORS; p++) { - bi->bmiColors[p].rgbRed = g.fmdsp.palette[p*3+0]; - bi->bmiColors[p].rgbGreen = g.fmdsp.palette[p*3+1]; - bi->bmiColors[p].rgbBlue = g.fmdsp.palette[p*3+2]; - } HDC dc = BeginPaint(hwnd, &ps); HDC mdc = CreateCompatibleDC(dc); - HBITMAP bitmap = CreateDIBitmap( - dc, - &bi->bmiHeader, CBM_INIT, - g.vram, - bi, DIB_RGB_COLORS); - SelectObject(mdc, bitmap); + SelectObject(mdc, g.bitmap_vram); + /* + RGBQUAD palette[FMDSP_PALETTE_COLORS]; + for (int p = 0; p < FMDSP_PALETTE_COLORS; p++) { + palette[p].rgbRed = g.fmdsp.palette[p*3+0]; + palette[p].rgbGreen = g.fmdsp.palette[p*3+1]; + palette[p].rgbBlue = g.fmdsp.palette[p*3+2]; + } + SetDIBColorTable(mdc, 0, FMDSP_PALETTE_COLORS, palette); + */ if (g.fmdsp_2x) { StretchBlt(dc, 0, 80, 1280, 800, mdc, 0, 0, 640, 400, SRCCOPY); } else { BitBlt(dc, 0, 80, 640, 400, mdc, 0, 0, SRCCOPY); } DeleteDC(mdc); - DeleteObject(bitmap); EndPaint(hwnd, &ps); } @@ -665,8 +668,20 @@ static LRESULT CALLBACK wndproc( HANDLE_MSG(hwnd, WM_SYSKEYUP, on_syskey); HANDLE_MSG(hwnd, WM_ACTIVATE, on_activate); case WM_USER: - InvalidateRect(hwnd, 0, FALSE); - return 0; + { + RECT r = { + .left = 0, + .top = 80, + .right = 640, + .bottom = 480, + }; + if (g.fmdsp_2x) { + r.right = 1280; + r.bottom = 880; + } + InvalidateRect(hwnd, &r, FALSE); + return 0; + } } return DefWindowProc(hwnd, msg, wParam, lParam); } @@ -688,6 +703,9 @@ int CALLBACK wWinMain(HINSTANCE hinst, HINSTANCE hpinst, (void)hpinst; (void)cmdline_; + if (__builtin_cpu_supports("sse2")) opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_sse2; + if (__builtin_cpu_supports("ssse3")) fmdsp_vramlookup_func = fmdsp_vramlookup_ssse3; + const wchar_t *argfile = 0; { wchar_t *cmdline = GetCommandLine(); @@ -725,9 +743,14 @@ int CALLBACK wWinMain(HINSTANCE hinst, HINSTANCE hpinst, wr.top = 0; wr.bottom = 480; AdjustWindowRectEx(&wr, style, 0, exStyle); +#ifdef _WIN64 +#define WIN64STR "(amd64)" +#else +#define WIN64STR "" +#endif g.mainwnd = CreateWindowEx( exStyle, - (wchar_t*)((uintptr_t)wcatom), L"FMPlayer/Win32 v" FMPLAYER_VERSION_STR, + (wchar_t*)((uintptr_t)wcatom), L"FMPlayer/Win32 " WIN64STR " v" FMPLAYER_VERSION_STR, style, CW_USEDEFAULT, CW_USEDEFAULT, wr.right-wr.left, wr.bottom-wr.top, diff --git a/win32/x86/Makefile b/win32/x86/Makefile index b26f3e1..6146d28 100644 --- a/win32/x86/Makefile +++ b/win32/x86/Makefile @@ -9,6 +9,7 @@ vpath %.rc .. include ../fmplayer.mak OBJS=$(addsuffix .o,$(OBJBASE) $(RESBASE)) +OBJS+=$(addsuffix .sse.o,$(SSEOBJBASE)) ARCH=i686 PREFIX=$(ARCH)-w64-mingw32- CC=$(PREFIX)gcc @@ -17,6 +18,7 @@ STRIP=$(PREFIX)strip CFLAGS=-std=c99 -O2 -Wall -Wextra -Werror -pedantic -I../.. \ $(addprefix -D,$(DEFINES)) \ -march=i586 -Wno-unused-parameter -Wno-missing-field-initializers +SSECFLAGS=-mssse3 -O3 LIBS=-s -mwindows -municode \ $(addprefix -l,$(LIBBASE)) @@ -29,6 +31,10 @@ $(TARGET): $(OBJS) @echo " CC $@" @$(CC) $(CFLAGS) -c $< -o $@ +%.sse.o: %.c + @echo " CC $@" + @$(CC) $(CFLAGS) $(SSECFLAGS) -c $< -o $@ + %.o: %.rc $(ICON) @echo " WINDRES $@" @$(WINDRES) -o $@ -i $< |