about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Takamichi Horikawa <takamichiho@gmail.com> 2017-04-01 14:53:59 +0900
committer Takamichi Horikawa <takamichiho@gmail.com> 2017-04-01 14:53:59 +0900
commit 24225349831278c23c6dfc4515e071f4b27b2c41 (patch)
tree 35a853f7f35a53560a5b1bcfd5eda3213990b872
parent 5460067b61f86843a0435ebb06a6ebb8223c3dca (diff)
add sse2/ssse3 simd optimization
-rw-r--r-- fmdsp/fmdsp-vramlookup-c.c | 13
-rw-r--r-- fmdsp/fmdsp-vramlookup-ssse3.c | 48
-rw-r--r-- fmdsp/fmdsp.h | 3
-rw-r--r-- gtk/Makefile.am | 21
-rw-r--r-- gtk/configure.ac | 8
-rw-r--r-- gtk/main.c | 32
-rw-r--r-- libopna/opnassg.h | 2
-rw-r--r-- win32/amd64/Makefile | 10
-rw-r--r-- win32/fmplayer.mak | 4
-rw-r--r-- win32/main.c | 79
-rw-r--r-- win32/x86/Makefile | 6
11 files changed, 181 insertions, 45 deletions
diff --git a/fmdsp/fmdsp-vramlookup-c.c b/fmdsp/fmdsp-vramlookup-c.c
index f900c8d..3d06f71 100644
--- a/fmdsp/fmdsp-vramlookup-c.c
+++ b/fmdsp/fmdsp-vramlookup-c.c
@@ -1,14 +1,17 @@
#include "fmdsp/fmdsp.h"
void fmdsp_vramlookup_c(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) {
+ uint32_t palette32[FMDSP_PALETTE_COLORS];
+ for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) {
+ uint8_t r = palette[i*3+0];
+ uint8_t g = palette[i*3+1];
+ uint8_t b = palette[i*3+2];
+ palette32[i] = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b);
+ }
for (int y = 0; y < PC98_H; y++) {
for (int x = 0; x < PC98_W; x++) {
- uint8_t r = palette[vram[y*PC98_W+x]*3+0];
- uint8_t g = palette[vram[y*PC98_W+x]*3+1];
- uint8_t b = palette[vram[y*PC98_W+x]*3+2];
- uint32_t data = (((uint32_t)r)<<16) | (((uint32_t)g)<<8) | ((uint32_t)b);
uint32_t *row = (uint32_t *)(vram32 + y*stride);
- row[x] = data;
+ row[x] = palette32[vram[y*PC98_W+x]];
}
}
}
diff --git a/fmdsp/fmdsp-vramlookup-ssse3.c b/fmdsp/fmdsp-vramlookup-ssse3.c
new file mode 100644
index 0000000..30e7311
--- /dev/null
+++ b/fmdsp/fmdsp-vramlookup-ssse3.c
@@ -0,0 +1,48 @@
+#include "fmdsp/fmdsp.h"
+#include <tmmintrin.h>
+
+void fmdsp_vramlookup_ssse3(uint8_t *vram32, const uint8_t *vram, const uint8_t *palette, int stride) {
+ __m128i z = _mm_setzero_si128();
+ __m128i p[3];
+ {
+ union {
+ __m128i xmm;
+ uint8_t u8[16];
+ } pi[3];
+ for (int i = 0; i < FMDSP_PALETTE_COLORS; i++) {
+ for (int c = 0; c < 3; c++) {
+ pi[c].u8[i] = palette[i*3+c];
+ }
+ }
+ for (int c = 0; c < 3; c++) {
+ p[c] = _mm_load_si128(&pi[c].xmm);
+ }
+ }
+
+ for (int y = 0; y < PC98_H; y++) {
+ for (int x = 0; x < 40; x++) {
+ // 16 pixels
+ __m128i v = _mm_loadu_si128((__m128i *)&vram[y*PC98_W+x*16]);
+
+ __m128i r = _mm_shuffle_epi8(p[0], v);
+ __m128i g = _mm_shuffle_epi8(p[1], v);
+ __m128i b = _mm_shuffle_epi8(p[2], v);
+
+ __m128i gb[2], zr[2];
+ gb[0] = _mm_unpacklo_epi8(b, g);
+ gb[1] = _mm_unpackhi_epi8(b, g);
+ zr[0] = _mm_unpacklo_epi8(r, z);
+ zr[1] = _mm_unpackhi_epi8(r, z);
+
+ __m128i o[4];
+ o[0] = _mm_unpacklo_epi16(gb[0], zr[0]);
+ o[1] = _mm_unpackhi_epi16(gb[0], zr[0]);
+ o[2] = _mm_unpacklo_epi16(gb[1], zr[1]);
+ o[3] = _mm_unpackhi_epi16(gb[1], zr[1]);
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)&vram32[(x*4+i)*16], o[i]);
+ }
+ }
+ vram32 += stride;
+ }
+}
diff --git a/fmdsp/fmdsp.h b/fmdsp/fmdsp.h
index a7e4aab..a8c3edc 100644
--- a/fmdsp/fmdsp.h
+++ b/fmdsp/fmdsp.h
@@ -59,6 +59,9 @@ void fmdsp_vramlookup_c(uint8_t *vram32,
const uint8_t *vram,
const uint8_t *palette,
int stride);
+
+void fmdsp_vramlookup_neon(uint8_t *, const uint8_t *, const uint8_t *, int);
+void fmdsp_vramlookup_ssse3(uint8_t *, const uint8_t *, const uint8_t *, int);
#ifdef __cplusplus
}
#endif
diff --git a/gtk/Makefile.am b/gtk/Makefile.am
index 0a9a95b..5d75f84 100644
--- a/gtk/Makefile.am
+++ b/gtk/Makefile.am
@@ -18,12 +18,29 @@ FMDSP_SRC=../fmdsp/fmdsp.c \
../fmdsp/font_rom.c \
../fmdsp/font_fmdsp_small.c
+#fmplayer_CFLAGS=$(CFLAGS)
+#CFLAGS=
+fmplayer_CPPFLAGS=-Wall -Wextra -pedantic \
+ -I.. \
+ $(GTK3_CFLAGS) $(PORTAUDIO_CFLAGS)
+fmplayer_LDADD=$(GTK3_LIBS) $(PORTAUDIO_LIBS)
+
if ENABLE_NEON
LIBOPNA_SRC+=../libopna/opnassg-sinc-neon.s
FMDSP_SRC+=../fmdsp/fmdsp-vramlookup-neon.s
fmplayer_CCASFLAGS=-march=armv8-a -mfpu=crypto-neon-fp-armv8
endif
+if ENABLE_SSE
+noinst_LIBRARIES=libsse.a
+fmplayer_LDADD+=libsse.a
+libsse_a_SOURCES=../libopna/opnassg-sinc-sse2.c \
+ ../fmdsp/fmdsp-vramlookup-ssse3.c
+libsse_a_CPPFLAGS=$(fmplayer_CPPFLAGS)
+#no way to add -O3?? (always overridden by CFLAGS)
+libsse_a_CFLAGS=-mssse3
+endif
+
fmplayer_SOURCES=main.c \
toneview.c \
oscilloview.c \
@@ -34,7 +51,3 @@ fmplayer_SOURCES=main.c \
$(FMDRIVER_SRC) \
$(FMDSP_SRC)
-fmplayer_CPPFLAGS=-Wall -Wextra -pedantic \
- -I.. \
- $(GTK3_CFLAGS) $(PORTAUDIO_CFLAGS)
-fmplayer_LDADD=$(GTK3_LIBS) $(PORTAUDIO_LIBS)
diff --git a/gtk/configure.ac b/gtk/configure.ac
index 2727888..f49bd74 100644
--- a/gtk/configure.ac
+++ b/gtk/configure.ac
@@ -2,6 +2,8 @@ AC_INIT([fmplayer], [0.1.0])
AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects])
AM_SILENT_RULES([yes])
AC_PROG_CC_C99
+AC_PROG_RANLIB
+AM_PROG_AR
AM_PROG_AS
dnl AM_PATH_SDL2([2.0.5])
@@ -14,5 +16,11 @@ AS_IF([test "x$enable_neon" = "xyes"], [
AC_DEFINE([ENABLE_NEON])
])
+AC_CHECK_HEADER([emmintrin.h], [emmintrin_found=yes])
+AM_CONDITIONAL([ENABLE_SSE], [test "x$emmintrin_found" = "xyes"])
+AS_IF([test "x$emmintrin_found" = "xyes"], [
+ AC_DEFINE([ENABLE_SSE])
+])
+
AC_CONFIG_FILES([Makefile])
AC_OUTPUT
diff --git a/gtk/main.c b/gtk/main.c
index fc72d62..855fbf4 100644
--- a/gtk/main.c
+++ b/gtk/main.c
@@ -55,7 +55,9 @@ static struct {
const char *current_uri;
bool oscillo_should_update;
struct oscillodata oscillodata_audiothread[LIBOPNA_OSCILLO_TRACK_COUNT];
-} g;
+} g = {
+ .oscillo_should_update = true
+};
static void quit(void) {
if (g.pastream) {
@@ -67,18 +69,26 @@ static void quit(void) {
}
static void on_destroy(GtkWidget *w, gpointer ptr) {
+ (void)w;
+ (void)ptr;
quit();
}
static void on_menu_quit(GtkMenuItem *menuitem, gpointer ptr) {
+ (void)menuitem;
+ (void)ptr;
quit();
}
static void on_tone_view(GtkMenuItem *menuitem, gpointer ptr) {
+ (void)menuitem;
+ (void)ptr;
show_toneview();
}
static void on_oscillo_view(GtkMenuItem *menuitem, gpointer ptr) {
+ (void)menuitem;
+ (void)ptr;
show_oscilloview();
}
@@ -95,6 +105,9 @@ static int pastream_cb(const void *inptr, void *outptr, unsigned long frames,
const PaStreamCallbackTimeInfo *timeinfo,
PaStreamCallbackFlags statusFlags,
void *userdata) {
+ (void)inptr;
+ (void)timeinfo;
+ (void)statusFlags;
struct opna_timer *timer = (struct opna_timer *)userdata;
int16_t *buf = (int16_t *)outptr;
memset(outptr, 0, sizeof(int16_t)*frames*2);
@@ -133,7 +146,8 @@ static void opna_writereg_libopna(struct fmdriver_work *work, unsigned addr, uns
}
static unsigned opna_readreg_libopna(struct fmdriver_work *work, unsigned addr) {
- struct opna_timer *timer = (struct opna_timer *)work->opna;
+ (void)work;
+ //struct opna_timer *timer = (struct opna_timer *)work->opna;
return opna_readreg(&g.opna, addr);
}
@@ -280,7 +294,7 @@ static bool openfile(const char *uri) {
g.pa_paused = false;
{
const char *turi = strdup(uri);
- free(g.current_uri);
+ free((void *)g.current_uri);
g.current_uri = turi;
}
return true;
@@ -290,6 +304,7 @@ err:
}
static void on_file_activated(GtkFileChooser *chooser, gpointer ptr) {
+ (void)ptr;
gchar *filename = gtk_file_chooser_get_uri(chooser);
if (filename) {
openfile(filename);
@@ -326,6 +341,8 @@ static GtkWidget *create_menubar() {
static gboolean draw_cb(GtkWidget *w,
cairo_t *cr,
gpointer p) {
+ (void)w;
+ (void)p;
fmdsp_update(&g.fmdsp, &g.work, &g.opna, g.vram);
fmdsp_vrampalette(&g.fmdsp, g.vram, g.vram32, g.vram32_stride);
cairo_surface_t *s = cairo_image_surface_create_for_data(
@@ -341,6 +358,7 @@ static gboolean draw_cb(GtkWidget *w,
static gboolean tick_cb(GtkWidget *w,
GdkFrameClock *frame_clock,
gpointer p) {
+ (void)w;
(void)frame_clock;
gtk_widget_queue_draw(GTK_WIDGET(p));
return G_SOURCE_CONTINUE;
@@ -472,6 +490,7 @@ static void drag_data_recv_cb(
gint x, gint y,
GtkSelectionData *data,
guint info, guint time, gpointer ptr) {
+ (void)w;
(void)x;
(void)y;
(void)info;
@@ -484,14 +503,15 @@ static void drag_data_recv_cb(
gtk_drag_finish(ctx, TRUE, FALSE, time);
}
-void opna_ssg_sinc_calc_neon(unsigned, const int16_t *, int32_t *);
-void fmdsp_vramlookup_neon(uint8_t *, const uint8_t *, const uint8_t *, int);
-
int main(int argc, char **argv) {
#ifdef ENABLE_NEON
opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_neon;
fmdsp_vramlookup_func = fmdsp_vramlookup_neon;
#endif
+#ifdef ENABLE_SSE
+ if (__builtin_cpu_supports("sse2")) opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_sse2;
+ if (__builtin_cpu_supports("ssse3")) fmdsp_vramlookup_func = fmdsp_vramlookup_ssse3;
+#endif
load_fontrom();
gtk_init(&argc, &argv);
GtkWidget *w = gtk_window_new(GTK_WINDOW_TOPLEVEL);
diff --git a/libopna/opnassg.h b/libopna/opnassg.h
index 223d542..aadf53e 100644
--- a/libopna/opnassg.h
+++ b/libopna/opnassg.h
@@ -66,6 +66,8 @@ typedef void (*opna_ssg_sinc_calc_func_type)(unsigned resampler_index,
extern opna_ssg_sinc_calc_func_type opna_ssg_sinc_calc_func;
void opna_ssg_sinc_calc_c(unsigned resampler_index,
const int16_t *inbuf, int32_t *outbuf);
+void opna_ssg_sinc_calc_neon(unsigned, const int16_t *, int32_t *);
+void opna_ssg_sinc_calc_sse2(unsigned, const int16_t *, int32_t *);
extern const int16_t opna_ssg_sinctable[OPNA_SSG_SINCTABLELEN*2];
diff --git a/win32/amd64/Makefile b/win32/amd64/Makefile
index 177ef90..22ef073 100644
--- a/win32/amd64/Makefile
+++ b/win32/amd64/Makefile
@@ -9,14 +9,16 @@ vpath %.rc ..
include ../fmplayer.mak
OBJS=$(addsuffix .o,$(OBJBASE) $(RESBASE))
+OBJS+=$(addsuffix .sse.o,$(SSEOBJBASE))
ARCH=x86_64
PREFIX=$(ARCH)-w64-mingw32-
CC=$(PREFIX)gcc
WINDRES=$(PREFIX)windres
STRIP=$(PREFIX)strip
-CFLAGS=-std=c99 -O2 -Wall -Wextra -Werror -pedantic -Wno-unused-parameter -Wno-missing-field-initializers -I../.. \
+CFLAGS=-std=c99 -O2 -Wall -Werror -Wextra -pedantic -Wno-unused-parameter -Wno-missing-field-initializers -I../.. \
$(addprefix -D,$(DEFINES))
-LIBS=-s -mwindows -municode \
+SSECFLAGS=-mssse3 -O3
+LIBS=-mwindows -municode \
$(addprefix -l,$(LIBBASE))
$(TARGET): $(OBJS)
@@ -28,6 +30,10 @@ $(TARGET): $(OBJS)
@echo " CC $@"
@$(CC) $(CFLAGS) -c $< -o $@
+%.sse.o: %.c
+ @echo " CC $@"
+ @$(CC) $(CFLAGS) $(SSECFLAGS) -c $< -o $@
+
%.o: %.rc $(ICON)
@echo " WINDRES $@"
@$(WINDRES) -o $@ -i $<
diff --git a/win32/fmplayer.mak b/win32/fmplayer.mak
index 1f68f4d..180de1c 100644
--- a/win32/fmplayer.mak
+++ b/win32/fmplayer.mak
@@ -15,12 +15,16 @@ LIBOPNA_OBJS=opna \
opnatimer \
opnafm \
opnassg \
+ opnassg-sinc-c \
opnadrum \
opnaadpcm
FMDSP_OBJS=fmdsp \
+ fmdsp-vramlookup-c \
font_rom \
font_fmdsp_small
TONEDATA_OBJS=tonedata
+SSEOBJBASE=opnassg-sinc-sse2 \
+ fmdsp-vramlookup-ssse3
OBJBASE=main \
toneview \
oscilloview \
diff --git a/win32/main.c b/win32/main.c
index ab038ae..8dc6bb8 100644
--- a/win32/main.c
+++ b/win32/main.c
@@ -52,6 +52,7 @@ static struct {
struct fmplayer_file *fmfile;
struct fmdsp fmdsp;
uint8_t vram[PC98_W*PC98_H];
+ //uint8_t *vram;
struct fmdsp_font font;
uint8_t fontrom[FONT_ROM_FILESIZE];
bool font_loaded;
@@ -66,6 +67,8 @@ static struct {
bool fmdsp_2x;
struct oscillodata oscillodata_audiothread[LIBOPNA_OSCILLO_TRACK_COUNT];
UINT mmtimer;
+ HBITMAP bitmap_vram;
+ uint8_t *vram32;
} g;
HWND g_currentdlg;
@@ -445,6 +448,20 @@ static void CALLBACK mmtimer_cb(UINT timerid, UINT msg,
static bool on_create(HWND hwnd, CREATESTRUCT *cs) {
(void)cs;
+ struct bitmap_info_fmdsp {
+ BITMAPINFOHEADER head;
+ RGBQUAD colors[FMDSP_PALETTE_COLORS];
+ } bmi = {0};
+ bmi.head.biSize = sizeof(bmi.head);
+ bmi.head.biWidth = PC98_W;
+ bmi.head.biHeight = -PC98_H;
+ bmi.head.biPlanes = 1;
+ bmi.head.biBitCount = 32;
+ bmi.head.biCompression = BI_RGB;
+ //bmi.head.biClrUsed = FMDSP_PALETTE_COLORS;
+ g.bitmap_vram = CreateDIBSection(
+ 0, (BITMAPINFO *)&bmi, DIB_RGB_COLORS, (void **)&g.vram32, 0, 0
+ );
HWND button = CreateWindowEx(
0,
L"BUTTON",
@@ -563,40 +580,26 @@ static void on_destroy(HWND hwnd) {
static void on_paint(HWND hwnd) {
fmdsp_update(&g.fmdsp, &g.work, &g.opna, g.vram);
+ fmdsp_vrampalette(&g.fmdsp, g.vram, g.vram32, PC98_W*4);
PAINTSTRUCT ps;
- static BITMAPINFO *bi = 0;
- if (!bi) {
- bi = HeapAlloc(g.heap, HEAP_ZERO_MEMORY,
- sizeof(BITMAPINFOHEADER) + sizeof(RGBQUAD)*FMDSP_PALETTE_COLORS);
- if (!bi) return;
- bi->bmiHeader.biSize = sizeof(bi->bmiHeader);
- bi->bmiHeader.biWidth = PC98_W;
- bi->bmiHeader.biHeight = -PC98_H;
- bi->bmiHeader.biPlanes = 1;
- bi->bmiHeader.biBitCount = 8;
- bi->bmiHeader.biCompression = BI_RGB;
- bi->bmiHeader.biClrUsed = FMDSP_PALETTE_COLORS;
- }
- for (int p = 0; p < FMDSP_PALETTE_COLORS; p++) {
- bi->bmiColors[p].rgbRed = g.fmdsp.palette[p*3+0];
- bi->bmiColors[p].rgbGreen = g.fmdsp.palette[p*3+1];
- bi->bmiColors[p].rgbBlue = g.fmdsp.palette[p*3+2];
- }
HDC dc = BeginPaint(hwnd, &ps);
HDC mdc = CreateCompatibleDC(dc);
- HBITMAP bitmap = CreateDIBitmap(
- dc,
- &bi->bmiHeader, CBM_INIT,
- g.vram,
- bi, DIB_RGB_COLORS);
- SelectObject(mdc, bitmap);
+ SelectObject(mdc, g.bitmap_vram);
+ /*
+ RGBQUAD palette[FMDSP_PALETTE_COLORS];
+ for (int p = 0; p < FMDSP_PALETTE_COLORS; p++) {
+ palette[p].rgbRed = g.fmdsp.palette[p*3+0];
+ palette[p].rgbGreen = g.fmdsp.palette[p*3+1];
+ palette[p].rgbBlue = g.fmdsp.palette[p*3+2];
+ }
+ SetDIBColorTable(mdc, 0, FMDSP_PALETTE_COLORS, palette);
+ */
if (g.fmdsp_2x) {
StretchBlt(dc, 0, 80, 1280, 800, mdc, 0, 0, 640, 400, SRCCOPY);
} else {
BitBlt(dc, 0, 80, 640, 400, mdc, 0, 0, SRCCOPY);
}
DeleteDC(mdc);
- DeleteObject(bitmap);
EndPaint(hwnd, &ps);
}
@@ -665,8 +668,20 @@ static LRESULT CALLBACK wndproc(
HANDLE_MSG(hwnd, WM_SYSKEYUP, on_syskey);
HANDLE_MSG(hwnd, WM_ACTIVATE, on_activate);
case WM_USER:
- InvalidateRect(hwnd, 0, FALSE);
- return 0;
+ {
+ RECT r = {
+ .left = 0,
+ .top = 80,
+ .right = 640,
+ .bottom = 480,
+ };
+ if (g.fmdsp_2x) {
+ r.right = 1280;
+ r.bottom = 880;
+ }
+ InvalidateRect(hwnd, &r, FALSE);
+ return 0;
+ }
}
return DefWindowProc(hwnd, msg, wParam, lParam);
}
@@ -688,6 +703,9 @@ int CALLBACK wWinMain(HINSTANCE hinst, HINSTANCE hpinst,
(void)hpinst;
(void)cmdline_;
+ if (__builtin_cpu_supports("sse2")) opna_ssg_sinc_calc_func = opna_ssg_sinc_calc_sse2;
+ if (__builtin_cpu_supports("ssse3")) fmdsp_vramlookup_func = fmdsp_vramlookup_ssse3;
+
const wchar_t *argfile = 0;
{
wchar_t *cmdline = GetCommandLine();
@@ -725,9 +743,14 @@ int CALLBACK wWinMain(HINSTANCE hinst, HINSTANCE hpinst,
wr.top = 0;
wr.bottom = 480;
AdjustWindowRectEx(&wr, style, 0, exStyle);
+#ifdef _WIN64
+#define WIN64STR "(amd64)"
+#else
+#define WIN64STR ""
+#endif
g.mainwnd = CreateWindowEx(
exStyle,
- (wchar_t*)((uintptr_t)wcatom), L"FMPlayer/Win32 v" FMPLAYER_VERSION_STR,
+ (wchar_t*)((uintptr_t)wcatom), L"FMPlayer/Win32 " WIN64STR " v" FMPLAYER_VERSION_STR,
style,
CW_USEDEFAULT, CW_USEDEFAULT,
wr.right-wr.left, wr.bottom-wr.top,
diff --git a/win32/x86/Makefile b/win32/x86/Makefile
index b26f3e1..6146d28 100644
--- a/win32/x86/Makefile
+++ b/win32/x86/Makefile
@@ -9,6 +9,7 @@ vpath %.rc ..
include ../fmplayer.mak
OBJS=$(addsuffix .o,$(OBJBASE) $(RESBASE))
+OBJS+=$(addsuffix .sse.o,$(SSEOBJBASE))
ARCH=i686
PREFIX=$(ARCH)-w64-mingw32-
CC=$(PREFIX)gcc
@@ -17,6 +18,7 @@ STRIP=$(PREFIX)strip
CFLAGS=-std=c99 -O2 -Wall -Wextra -Werror -pedantic -I../.. \
$(addprefix -D,$(DEFINES)) \
-march=i586 -Wno-unused-parameter -Wno-missing-field-initializers
+SSECFLAGS=-mssse3 -O3
LIBS=-s -mwindows -municode \
$(addprefix -l,$(LIBBASE))
@@ -29,6 +31,10 @@ $(TARGET): $(OBJS)
@echo " CC $@"
@$(CC) $(CFLAGS) -c $< -o $@
+%.sse.o: %.c
+ @echo " CC $@"
+ @$(CC) $(CFLAGS) $(SSECFLAGS) -c $< -o $@
+
%.o: %.rc $(ICON)
@echo " WINDRES $@"
@$(WINDRES) -o $@ -i $<