From: Jim Kukunas
Date: Thu, 21 Jun 2018 20:47:32 +0000 (+0000)
Subject: Adds SSE2 optimized slide_hash.
X-Git-Tag: 1.9.9-b1~444
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=11f2e8f33788f7e3619e6cee6f7634295f102da9;p=thirdparty%2Fzlib-ng.git

Adds SSE2 optimized slide_hash.

Edit: Removed glue code in deflate.c, since we want to implement this differently in zlib-ng.
---

diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 2e473c62..95ad3682 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -16,7 +16,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
+all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_sse.o slide_sse.lo
 
 x86.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -48,6 +48,12 @@ crc_folding.o:
 crc_folding.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
 
+slide_sse.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+
+slide_sse.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
diff --git a/arch/x86/slide_sse.c b/arch/x86/slide_sse.c
new file mode 100644
index 00000000..342fd562
--- /dev/null
+++ b/arch/x86/slide_sse.c
@@ -0,0 +1,61 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven
+ *   Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "deflate.h"
+
+#ifdef USE_SSE_SLIDE
+#include <emmintrin.h>
+
+/*
+ * Subtract w_size from every entry of s->head (and of s->prev unless FASTEST
+ * is defined), eight 16-bit lanes at a time.  The subtraction is unsigned
+ * and saturating, so entries smaller than w_size become 0.
+ *
+ * Both loops are do/while loops that step an unsigned counter down by 8, so
+ * hash_size and w_size must be non-zero multiples of 8.
+ * NOTE(review): assumes Posf is a 16-bit type -- confirm against deflate.h.
+ */
+void slide_hash_sse(deflate_state *s)
+{
+    unsigned n;
+    Posf *p;
+    uInt wsize = s->w_size;
+    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+
+#ifndef FASTEST
+    n = wsize;
+    p = &s->prev[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+#endif
+}
+
+#endif
+
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 9baf1e43..4ad62ebf 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -36,7 +36,7 @@ SUFFIX =
 
 OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \
        deflate_medium.obj \
-       functable.obj infback.obj inflate.obj inftrees.obj inffast.obj trees.obj uncompr.obj zutil.obj \
+       functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
        x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj
 !if "$(ZLIB_COMPAT)" != ""
 WITH_GZFILEOP = yes
@@ -126,6 +126,7 @@ infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/
 inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
 inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
 inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+slide_sse.obj: $(SRCDIR)/arch/x86/slide_sse.c $(SRCDIR)/deflate.h
 trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
 zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h