From: Anoop Saldanha Date: Mon, 25 Mar 2013 09:38:45 +0000 (+0530) Subject: Version 1 of AC Cuda. X-Git-Tag: suricata-2.0beta1~103 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=17c763f8554db6d438eb68610249126bdf6d2066;p=thirdparty%2Fsuricata.git Version 1 of AC Cuda. --- diff --git a/src/Makefile.am b/src/Makefile.am index 680388b6ed..905cd2fc6d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -257,6 +257,7 @@ util-cpu.c util-cpu.h \ util-crypt.c util-crypt.h \ util-cuda.c util-cuda.h \ util-cuda-buffer.c util-cuda-buffer.h \ +util-cuda-handlers.c util-cuda-handlers.h \ util-daemon.c util-daemon.h \ util-debug.c util-debug.h \ util-debug-filters.c util-debug-filters.h \ @@ -325,7 +326,7 @@ win32-misc.c win32-misc.h \ win32-service.c win32-service.h \ win32-syslog.h -EXTRA_DIST = util-mpm-b2g-cuda-kernel.cu ptxdump.py +EXTRA_DIST = util-mpm-ac-cuda-kernel.cu ptxdump.py # set the include path found by configure INCLUDES= $(all_includes) @@ -345,7 +346,7 @@ BUILT_SOURCES = cuda-ptxdump.h suricata_SOURCES += cuda-ptxdump.h suricata_CUDA_KERNELS = \ -util-mpm-b2g-cuda-kernel.cu +util-mpm-ac-cuda-kernel.cu NVCCFLAGS=-O2 @@ -355,7 +356,8 @@ SUFFIXES = \ .ptx_sm_12 \ .ptx_sm_13 \ .ptx_sm_20 \ -.ptx_sm_21 +.ptx_sm_21 \ +.ptx_sm_30 PTXS = $(suricata_CUDA_KERNELS:.cu=.ptx_sm_10) PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_11) @@ -363,6 +365,7 @@ PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_12) PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_13) PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20) PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21) +PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_30) .cu.ptx_sm_10: $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_10 -ptx $< @@ -382,6 +385,9 @@ PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21) .cu.ptx_sm_21: $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $< +.cu.ptx_sm_30: + $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_30 -ptx $< + cuda-ptxdump.h: $(PTXS) $(PYTHON) ptxdump.py cuda-ptxdump $(PTXS) diff --git a/src/decode.h b/src/decode.h index 59d34d9338..69eb7b0819 100644 --- a/src/decode.h +++ b/src/decode.h @@ -28,9 +28,12 @@ #define COUNTERS #include "suricata-common.h" - #include "threadvars.h" +#ifdef __SC_CUDA_SUPPORT__ +#include "util-cuda-buffer.h" +#endif /* __SC_CUDA_SUPPORT__ */ + typedef enum { CHECKSUM_VALIDATION_DISABLE, CHECKSUM_VALIDATION_ENABLE, @@ -487,6 +490,14 @@ typedef struct Packet_ #ifdef PROFILING PktProfiling profile; #endif +#ifdef __SC_CUDA_SUPPORT__ + uint8_t cuda_mpm_enabled; + uint8_t cuda_done; + uint16_t cuda_gpu_matches; + SCMutex cuda_mutex; + SCCondT cuda_cond; + uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1]; +#endif } Packet; #define DEFAULT_PACKET_SIZE (1500 + ETHERNET_HEADER_LEN) @@ -570,6 +581,24 @@ typedef struct DecodeThreadVars_ uint16_t counter_defrag_ipv6_reassembled; uint16_t counter_defrag_ipv6_timeouts; uint16_t counter_defrag_max_hit; + +#ifdef __SC_CUDA_SUPPORT__ + /* cb - CudaBuffer */ + CudaBufferData *cuda_ac_cb; + + MpmCtx *mpm_proto_other_ctx; + + MpmCtx *mpm_proto_tcp_ctx_ts; + MpmCtx *mpm_proto_udp_ctx_ts; + + MpmCtx *mpm_proto_tcp_ctx_tc; + MpmCtx *mpm_proto_udp_ctx_tc; + + uint16_t data_buffer_size_max_limit; + uint16_t data_buffer_size_min_limit; + + uint8_t mpm_is_cuda; +#endif } DecodeThreadVars; /** @@ -586,12 +615,27 @@ typedef struct DecodeThreadVars_ /** * \brief Initialize a packet structure for use. 
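 *
 *  In the CUDA build the macro below also zeroes the whole packet and
 *  initializes the cuda_mutex/cuda_cond pair on which the detect thread
 *  waits until the cuda dispatcher sets cuda_done.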
*/ -#define PACKET_INITIALIZE(p) { \ +#ifdef __SC_CUDA_SUPPORT__ +#include "util-cuda-handlers.h" +#include "util-mpm.h" + +#define PACKET_INITIALIZE(p) do { \ + memset((p), 0x00, SIZE_OF_PACKET); \ + SCMutexInit(&(p)->tunnel_mutex, NULL); \ + PACKET_RESET_CHECKSUMS((p)); \ + (p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \ + (p)->livedev = NULL; \ + SCMutexInit(&(p)->cuda_mutex, NULL); \ + SCCondInit(&(p)->cuda_cond, NULL); \ + } while (0) +#else +#define PACKET_INITIALIZE(p) { \ SCMutexInit(&(p)->tunnel_mutex, NULL); \ PACKET_RESET_CHECKSUMS((p)); \ (p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \ (p)->livedev = NULL; \ } +#endif /** * \brief Recycle a packet structure for reuse. diff --git a/src/detect-engine-mpm.c b/src/detect-engine-mpm.c index 87dc7cf1f3..a059483002 100644 --- a/src/detect-engine-mpm.c +++ b/src/detect-engine-mpm.c @@ -53,6 +53,9 @@ #include "util-debug.h" #include "util-print.h" #include "util-memcmp.h" +#ifdef __SC_CUDA_SUPPORT__ +#include "util-mpm-ac.h" +#endif /** \todo make it possible to use multiple pattern matcher algorithms next to eachother. */ @@ -221,11 +224,23 @@ uint32_t PacketPatternSearch(DetectEngineThreadCtx *det_ctx, Packet *p) if (mpm_ctx == NULL) SCReturnInt(0); +#ifdef __SC_CUDA_SUPPORT__ + if (p->cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) { + ret = SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq); + } else { + ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, + &det_ctx->mtc, + &det_ctx->pmq, + p->payload, + p->payload_len); + } +#else ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, &det_ctx->mtc, &det_ctx->pmq, p->payload, p->payload_len); +#endif SCReturnInt(ret); } diff --git a/src/detect-engine.c b/src/detect-engine.c index 1b6f02bd47..c5f6d931d3 100644 --- a/src/detect-engine.c +++ b/src/detect-engine.c @@ -845,7 +845,11 @@ static uint8_t DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) { /* for now, since we still haven't implemented any intelligence into * understanding the patterns and distributing mpm_ctx across sgh */ if (de_ctx->mpm_matcher == MPM_AC || de_ctx->mpm_matcher == MPM_AC_GFBS || +#ifdef __SC_CUDA_SUPPORT__ + de_ctx->mpm_matcher == MPM_AC_BS || de_ctx->mpm_matcher == MPM_AC_CUDA) { +#else de_ctx->mpm_matcher == MPM_AC_BS) { +#endif de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE; } else { de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL; @@ -854,6 +858,15 @@ static uint8_t DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) { if (strcmp(sgh_mpm_context, "single") == 0) { de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE; } else if (strcmp(sgh_mpm_context, "full") == 0) { +#ifdef __SC_CUDA_SUPPORT__ + if (de_ctx->mpm_matcher == MPM_AC_CUDA) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You can't use " + "the cuda version of our mpm ac, i.e. \"ac-cuda\" " + "along with \"full\" \"sgh-mpm-context\". 
" + "Allowed values are \"single\" and \"auto\"."); + exit(EXIT_FAILURE); + } +#endif de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL; } else { SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You have supplied an " diff --git a/src/detect.c b/src/detect.c index 8dc611e7c5..e629b20e8d 100644 --- a/src/detect.c +++ b/src/detect.c @@ -190,6 +190,7 @@ #include "util-optimize.h" #include "util-vector.h" #include "util-path.h" +#include "util-mpm-ac.h" #include "runmodes.h" @@ -1085,6 +1086,12 @@ static inline void DetectMpmPrefilter(DetectEngineCtx *de_ctx, } if (p->payload_len > 0 && (!(p->flags & PKT_NOPAYLOAD_INSPECTION))) { + if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) { + *sms_runflags |= SMS_USED_PM; + PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM); + PacketPatternSearchWithStreamCtx(det_ctx, p); + PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM); + } if (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_PACKET) { /* run the multi packet matcher against the payload of the packet */ SCLogDebug("search: (%p, maxlen %" PRIu32 ", sgh->sig_cnt %" PRIu32 ")", @@ -1096,12 +1103,6 @@ static inline void DetectMpmPrefilter(DetectEngineCtx *de_ctx, *sms_runflags |= SMS_USED_PM; } - if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) { - *sms_runflags |= SMS_USED_PM; - PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM); - PacketPatternSearchWithStreamCtx(det_ctx, p); - PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM); - } } } @@ -4387,6 +4388,255 @@ int SigAddressPrepareStage5(DetectEngineCtx *de_ctx) { return 0; } +#ifdef __SC_CUDA_SUPPORT__ + +static void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx) +{ + MpmCtx *mpm_ctx = NULL; + + int ac_16_tables = 0; + int ac_32_tables = 0; + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if 
(ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + 
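+    /* (Each of these near-identical blocks checks one mpm context and
+     * counts whether it needs a 16 bit or a 32 bit state table. A
+     * hypothetical helper, sketched here only to document the pattern
+     * and not part of this patch, would collapse the repetition:
+     *
+     *   static void SCACCudaCountTable(MpmCtx *mpm_ctx,
+     *                                  int *ac_16_tables, int *ac_32_tables)
+     *   {
+     *       if (mpm_ctx == NULL || mpm_ctx->mpm_type != MPM_AC_CUDA)
+     *           return;
+     *       SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
+     *       if (ctx->state_count < 32767)
+     *           (*ac_16_tables)++;
+     *       else
+     *           (*ac_32_tables)++;
+     *   }
+     * ) */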
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + if (ac_16_tables > 0 && ac_32_tables > 0) + SCACConstructBoth16and32StateTables(); + + + SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables); + SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables); + +} +#endif + /** * \brief Convert the signature list into the runtime match structure. * @@ -4441,6 +4691,26 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) if (de_ctx->sgh_mpm_context == ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE) { MpmCtx *mpm_ctx = NULL; + +#ifdef __SC_CUDA_SUPPORT__ + if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) { + /* setting it to default. 
You've gotta remove it once you fix the state table thing */ + SCACConstructBoth16and32StateTables(); + + MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); + CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); + if (cuda_context == 0) { + SCLogError(SC_ERR_FATAL, "cuda context is NULL."); + exit(EXIT_FAILURE); + } + int r = SCCudaCtxPushCurrent(cuda_context); + if (r < 0) { + SCLogError(SC_ERR_FATAL, "context push failed."); + exit(EXIT_FAILURE); + } + } +#endif + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0); if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) { mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); @@ -4562,7 +4832,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); } //printf("hsmd- %d\n", mpm_ctx->pattern_cnt); - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1); if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) { mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); @@ -4574,7 +4843,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); } //printf("hscd- %d\n", mpm_ctx->pattern_cnt); - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1); if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) { mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); @@ -4586,7 +4854,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); } //printf("huad- %d\n", mpm_ctx->pattern_cnt); - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1); if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) { mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); @@ -4616,6 +4883,21 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx); } //printf("hrhhd- %d\n", mpm_ctx->pattern_cnt); + +#ifdef __SC_CUDA_SUPPORT__ + if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) { + int r = SCCudaCtxPopCurrent(NULL); + if (r < 0) { + SCLogError(SC_ERR_FATAL, "cuda context pop failure."); + exit(EXIT_FAILURE); + } + } + + /* too late to call this either ways. Should be called post ac goto. + * \todo Support this. 
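+     * (DetermineCudaStateTableSize() below walks every mpm context and
+     * counts how many need 16 bit vs 32 bit state tables; ideally it would
+     * run before SCACPrepareStateTable() builds the tables, hence the
+     * note above.)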
*/ + DetermineCudaStateTableSize(de_ctx); +#endif + } // SigAddressPrepareStage5(de_ctx); diff --git a/src/source-pcap-file.c b/src/source-pcap-file.c index 98b9d0414b..543c936b0e 100644 --- a/src/source-pcap-file.c +++ b/src/source-pcap-file.c @@ -43,6 +43,19 @@ #include "util-profiling.h" #include "runmode-unix-socket.h" +#ifdef __SC_CUDA_SUPPORT__ + +#include "util-cuda.h" +#include "util-cuda-buffer.h" +#include "util-mpm-ac.h" +#include "util-cuda-handlers.h" +#include "detect-engine.h" +#include "detect-engine-mpm.h" + +static DetectEngineCtx *cuda_de_ctx = NULL; + +#endif /* __SC_CUDA_SUPPORT__ */ + extern uint8_t suricata_ctl_flags; extern int max_pending_packets; @@ -111,6 +124,15 @@ void TmModuleDecodePcapFileRegister (void) { tmm_modules[TMM_DECODEPCAPFILE].flags = TM_FLAG_DECODE_TM; } +#ifdef __SC_CUDA_SUPPORT__ +void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx) +{ + cuda_de_ctx = de_ctx; + + return; +} +#endif + void PcapFileCallbackLoop(char *user, struct pcap_pkthdr *h, u_char *pkt) { SCEnter(); @@ -320,6 +342,90 @@ TmEcode ReceivePcapFileThreadDeinit(ThreadVars *tv, void *data) { SCReturnInt(TM_ECODE_OK); } +#ifdef __SC_CUDA_SUPPORT__ + +static inline void DecodePcapFileBufferPacket(DecodeThreadVars *dtv, Packet *p) +{ + if (p->cuda_mpm_enabled) { + while (!p->cuda_done) { + SCMutexLock(&p->cuda_mutex); + if (p->cuda_done) { + SCMutexUnlock(&p->cuda_mutex); + break; + } else { + SCCondWait(&p->cuda_cond, &p->cuda_mutex); + SCMutexUnlock(&p->cuda_mutex); + } + } + } + p->cuda_done = 0; + + if (p->payload_len == 0 || + (p->flags & (PKT_NOPAYLOAD_INSPECTION & PKT_NOPACKET_INSPECTION)) || + (p->flags & PKT_ALLOC) || + (dtv->data_buffer_size_min_limit != 0 && p->payload_len < dtv->data_buffer_size_min_limit) || + (p->payload_len > dtv->data_buffer_size_max_limit && dtv->data_buffer_size_max_limit != 0) ) { + p->cuda_mpm_enabled = 0; + return; + } + + MpmCtx *mpm_ctx = NULL; + if (p->proto == IPPROTO_TCP) { + if (p->flowflags & FLOW_PKT_TOSERVER) + mpm_ctx = dtv->mpm_proto_tcp_ctx_ts; + else + mpm_ctx = dtv->mpm_proto_tcp_ctx_tc; + } else if (p->proto == IPPROTO_UDP) { + if (p->flowflags & FLOW_PKT_TOSERVER) + mpm_ctx = dtv->mpm_proto_udp_ctx_ts; + else + mpm_ctx = dtv->mpm_proto_udp_ctx_tc; + } else { + mpm_ctx = dtv->mpm_proto_other_ctx; + } + if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) { + p->cuda_mpm_enabled = 0; + return; + } + +#if __WORDSIZE==64 + CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb, + p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr), + (void *)p); + if (slice == NULL) { + SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report " + "this to dev."); + p->cuda_mpm_enabled = 0; + return; + } + *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len; + *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; + memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); +#else + CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb, + p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr), + (void *)p); + if (slice == NULL) { + SCLogError(SC_ERR_FATAL, "Error retrieving slice. 
Please report " + "this to dev."); + p->cuda_mpm_enabled = 0; + return; + } + *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len; + *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; + memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); +#endif + p->cuda_mpm_enabled = 1; + SC_ATOMIC_SET(slice->done, 1); + + SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n", + p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda); + + return; +} + +#endif /* __SC_CUDA_SUPPORT__ */ + double prev_signaled_ts = 0; TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *postpq) @@ -351,11 +457,49 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P TimeSet(&p->ts); /* call the decoder */ + pcap_g.Decoder(tv, dtv, p, GET_PKT_DATA(p), GET_PKT_LEN(p), pq); +#ifdef DEBUG + BUG_ON(p->pkt_src != PKT_SRC_WIRE && p->pkt_src != PKT_SRC_FFR_V2); +#endif + +#ifdef __SC_CUDA_SUPPORT__ + if (dtv->mpm_is_cuda) + DecodePcapFileBufferPacket(dtv, p); +#endif + SCReturnInt(TM_ECODE_OK); } +#ifdef __SC_CUDA_SUPPORT__ + +static int DecodePcapFileThreadInitCuda(DecodeThreadVars *dtv) +{ + if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) + return 0; + + MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); + if (conf == NULL) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); + return -1; + } + + dtv->mpm_is_cuda = 1; + dtv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); + dtv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit; + dtv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit; + dtv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0); + dtv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1); + dtv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0); + dtv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1); + dtv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0); + + return 0; +} + +#endif /* __SC_CUDA_SUPPORT__ */ + TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data) { SCEnter(); @@ -367,6 +511,11 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data) DecodeRegisterPerfCounters(dtv, tv); +#ifdef __SC_CUDA_SUPPORT__ + if (DecodePcapFileThreadInitCuda(dtv) < 0) + SCReturnInt(TM_ECODE_FAILED); +#endif + *data = (void *)dtv; SCReturnInt(TM_ECODE_OK); diff --git a/src/source-pcap-file.h b/src/source-pcap-file.h index 67bd261b60..7646a25343 100644 --- a/src/source-pcap-file.h +++ b/src/source-pcap-file.h @@ -26,6 +26,9 @@ void TmModuleReceivePcapFileRegister (void); void TmModuleDecodePcapFileRegister (void); +#ifdef __SC_CUDA_SUPPORT__ +void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx); +#endif #endif /* __SOURCE_PCAP_FILE_H__ */ diff --git a/src/suricata.c b/src/suricata.c index 217c03690b..09420db5d3 100644 --- a/src/suricata.c +++ b/src/suricata.c @@ -188,6 +188,10 @@ #include "util-memcmp.h" #include "util-proto-name.h" #include "util-spm-bm.h" 
+#ifdef __SC_CUDA_SUPPORT__ +#include "util-cuda-buffer.h" +#include "util-mpm-ac.h" +#endif /* * we put this here, because we only use it here in main. @@ -1302,6 +1306,7 @@ int main(int argc, char **argv) SCCudaListCards(); exit(EXIT_SUCCESS); } + CudaBufferInit(); #endif if (!CheckValidDaemonModes(daemon, run_mode)) { @@ -1313,9 +1318,6 @@ int main(int argc, char **argv) TimeInit(); SupportFastPatternForSigMatchTypes(); - /* load the pattern matchers */ - MpmTableSetup(); - if (run_mode != RUNMODE_UNITTEST && !list_keywords && !list_app_layer_protocols) { @@ -1363,6 +1365,9 @@ int main(int argc, char **argv) } } + /* load the pattern matchers */ + MpmTableSetup(); + AppLayerDetectProtoThreadInit(); if (list_app_layer_protocols) { AppLayerListSupportedProtocols(); @@ -1703,6 +1708,9 @@ int main(int argc, char **argv) DetectProtoTests(); DetectPortTests(); SCAtomicRegisterTests(); +#ifdef __SC_CUDA_SUPPORT__ + CudaBufferRegisterUnittests(); +#endif if (list_unittests) { UtListTests(regex_arg); } @@ -1819,6 +1827,10 @@ int main(int argc, char **argv) "context failed."); exit(EXIT_FAILURE); } +#ifdef __SC_CUDA_SUPPORT__ + if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) + DecodePcapFileSetCudaDeCtx(de_ctx); +#endif /* __SC_CUDA_SUPPORT__ */ SCClassConfLoadClassficationConfigFile(de_ctx); SCRConfLoadReferenceConfigFile(de_ctx); @@ -1961,6 +1973,11 @@ int main(int argc, char **argv) SCPerfSpawnThreads(); } +#ifdef __SC_CUDA_SUPPORT__ + if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) + SCACCudaStartDispatcher(); +#endif + /* Check if the alloted queues have at least 1 reader and writer */ TmValidateQueueState(); @@ -2118,5 +2135,11 @@ int main(int argc, char **argv) SC_ATOMIC_DESTROY(engine_stage); +#ifdef __SC_CUDA_SUPPORT__ + if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) + MpmCudaBufferDeSetup(); + CudaHandlerFreeProfiles(); +#endif + exit(engine_retval); } diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c new file mode 100644 index 0000000000..2206e1be04 --- /dev/null +++ b/src/util-cuda-handlers.c @@ -0,0 +1,357 @@ +/* Copyright (C) 2007-2012 Open Information Security Foundation + * + * You can copy, redistribute or modify this Program under the terms of + * the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +/** + * \file + * + * \author Anoop Saldanha + */ + +/* compile in, only if we have a CUDA enabled device on the machine, with the + * toolkit and the driver installed */ +#ifdef __SC_CUDA_SUPPORT__ + +#include "suricata-common.h" + +#include "util-error.h" +#include "util-debug.h" +#include "conf.h" +#include "util-cuda.h" +#include "util-cuda-handlers.h" + +/* file only exists if cuda is enabled */ +#include "cuda-ptxdump.h" + +/************************conf file profile section**********************/ + +typedef struct CudaHandlerConfProfile_ { + char *name; + void *ctx; + void (*Free)(void *); + + struct CudaHandlerConfProfile_ *next; +} CudaHandlerConfProfile; + +static CudaHandlerConfProfile *conf_profiles = NULL; +/* protects above var */ +static SCMutex mutex = PTHREAD_MUTEX_INITIALIZER; + +void CudaHandlerAddCudaProfileFromConf(const char *name, + void *(*Callback)(ConfNode *node), + void (*Free)(void *)) +{ + /* we don't do data validation */ + SCMutexLock(&mutex); + + CudaHandlerConfProfile *tmp_cp = conf_profiles; + while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0) + tmp_cp = tmp_cp->next; + + if (tmp_cp != NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENT, "We already have a cuda conf " + "profile by the name \"%s\" registered.", name); + exit(EXIT_FAILURE); + } + + char tmp[200]; + int r = snprintf(tmp, sizeof(tmp), "%s%s", "cuda.", name); + if (r < 0) { + SCLogError(SC_ERR_FATAL, "snprintf failure."); + exit(EXIT_FAILURE); + } else if (r > (int)sizeof(tmp)) { + SCLogError(SC_ERR_FATAL, "buffer not big enough to write param."); + exit(EXIT_FAILURE); + } + void *ctx = Callback(ConfGetNode(tmp)); + if (ctx == NULL) { + SCMutexUnlock(&mutex); + return; + } + + CudaHandlerConfProfile *new_cp = SCMalloc(sizeof(CudaHandlerConfProfile)); + if (new_cp == NULL) + exit(EXIT_FAILURE); + memset(new_cp, 0, sizeof(CudaHandlerConfProfile)); + new_cp->name = SCStrdup(name); + if (new_cp->name == NULL) + exit(EXIT_FAILURE); + new_cp->ctx = ctx; + new_cp->Free = Free; + + if (conf_profiles == NULL) { + conf_profiles = new_cp; + } else { + new_cp->next = conf_profiles; + conf_profiles = new_cp; + } + + SCMutexUnlock(&mutex); + return; +} + +void *CudaHandlerGetCudaProfile(const char *name) +{ + SCMutexLock(&mutex); + + CudaHandlerConfProfile *tmp_cp = conf_profiles; + while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0) + tmp_cp = tmp_cp->next; + + if (tmp_cp == NULL) { + SCMutexUnlock(&mutex); + return NULL; + } + + SCMutexUnlock(&mutex); + return tmp_cp->ctx; +} + +void CudaHandlerFreeProfiles(void) +{ + SCMutexLock(&mutex); + + CudaHandlerConfProfile *tmp = conf_profiles; + while (tmp != NULL) { + CudaHandlerConfProfile *curr = tmp; + tmp = tmp->next; + SCFree(curr->name); + if (curr->Free != NULL) + curr->Free(curr->ctx); + SCFree(curr); + } + + SCMutexUnlock(&mutex); + return; +} + +/*******************cuda context related data section*******************/ + +/* we use a concept where every device on the gpu has only 1 context. If + * a section in the engine wants to use a device and tries to open a context + * on it, we first check if a context is already created for the device and if + * so we return it. 
If not we create a new one and record the entry. */
+
+static CUcontext *cuda_contexts = NULL;
+static int no_of_cuda_contexts = 0;
+
+typedef struct CudaHandlerModuleData_ {
+    char *name;
+    void *data;
+
+    struct CudaHandlerModuleData_ *next;
+} CudaHandlerModuleData;
+
+typedef struct CudaHandlerModule_ {
+    char *name;
+
+    /* the context used by this module */
+    CUcontext context;
+    /* the device on which the above context was created */
+    int device_id;
+    CudaHandlerModuleData *module_data;
+
+    struct CudaHandlerModule_ *next;
+} CudaHandlerModule;
+
+static CudaHandlerModule *cudahl_modules = NULL;
+
+CUcontext CudaHandlerModuleGetContext(const char *name, int device_id)
+{
+    SCMutexLock(&mutex);
+
+    CudaHandlerModule *module = cudahl_modules;
+    while (module != NULL && strcasecmp(module->name, name) != 0)
+        module = module->next;
+    if (module != NULL) {
+        if (module->device_id != device_id) {
+            SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module already "
+                       "registered, but the new device_id is different "
+                       "from the already registered device_id.");
+            exit(EXIT_FAILURE);
+        }
+        SCMutexUnlock(&mutex);
+        return module->context;
+    }
+
+    CudaHandlerModule *new_module = SCMalloc(sizeof(CudaHandlerModule));
+    if (new_module == NULL)
+        exit(EXIT_FAILURE);
+    memset(new_module, 0, sizeof(CudaHandlerModule));
+    new_module->device_id = device_id;
+    new_module->name = SCStrdup(name);
+    if (new_module->name == NULL)
+        exit(EXIT_FAILURE);
+    if (cudahl_modules == NULL) {
+        cudahl_modules = new_module;
+    } else {
+        new_module->next = cudahl_modules;
+        cudahl_modules = new_module;
+    }
+
+    if (no_of_cuda_contexts <= device_id) {
+        cuda_contexts = SCRealloc(cuda_contexts, sizeof(CUcontext) * (device_id + 1));
+        if (cuda_contexts == NULL)
+            exit(EXIT_FAILURE);
+        memset(cuda_contexts + no_of_cuda_contexts, 0,
+               sizeof(CUcontext) * ((device_id + 1) - no_of_cuda_contexts));
+        no_of_cuda_contexts = device_id + 1;
+    }
+
+    if (cuda_contexts[device_id] == 0) {
+        SCCudaDevices *devices = SCCudaGetDeviceList();
+        if (SCCudaCtxCreate(&cuda_contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+                            devices->devices[device_id]->device) == -1) {
+            SCLogDebug("ctxcreate failure.");
+            exit(EXIT_FAILURE);
+        }
+    }
+    new_module->context = cuda_contexts[device_id];
+
+    SCMutexUnlock(&mutex);
+    return cuda_contexts[device_id];
+}
+
+void CudaHandlerModuleStoreData(const char *module_name,
+                                const char *data_name, void *data_ptr)
+{
+    SCMutexLock(&mutex);
+
+    CudaHandlerModule *module = cudahl_modules;
+    while (module != NULL && strcasecmp(module->name, module_name) != 0)
+        module = module->next;
+    if (module == NULL) {
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to store data "
+                   "\"%s\" for module \"%s\" that hasn't been registered "
+                   "yet.", data_name, module_name);
+        exit(EXIT_FAILURE);
+    }
+
+    CudaHandlerModuleData *data = module->module_data;
+    while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
+        data = data->next;
+    }
+    if (data != NULL) {
+        SCLogWarning(SC_ERR_CUDA_HANDLER_ERROR, "Data \"%s\" already "
+                     "registered for this module \"%s\".", data_name,
+                     module_name);
+        SCMutexUnlock(&mutex);
+        goto end;
+    }
+
+    CudaHandlerModuleData *new_data = SCMalloc(sizeof(CudaHandlerModuleData));
+    if (new_data == NULL)
+        exit(EXIT_FAILURE);
+    memset(new_data, 0, sizeof(CudaHandlerModuleData));
+    new_data->name = SCStrdup(data_name);
+    if (new_data->name == NULL)
+        exit(EXIT_FAILURE);
+    new_data->data = data_ptr;
+
+    if (module->module_data == NULL) {
+        module->module_data = new_data;
+    } else {
+        new_data->next = module->module_data;
+        module->module_data = new_data;
+    }
+
+    SCMutexUnlock(&mutex);
+
+ end:
+    return;
+}
+
+void *CudaHandlerModuleGetData(const char *module_name, const char *data_name)
+{
+    SCMutexLock(&mutex);
+
+    CudaHandlerModule *module = cudahl_modules;
+    while (module != NULL && strcasecmp(module->name, module_name) != 0)
+        module = module->next;
+    if (module == NULL) {
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
+                   "\"%s\" from module \"%s\" that hasn't been registered "
+                   "yet.", data_name, module_name);
+        SCMutexUnlock(&mutex);
+        return NULL;
+    }
+
+    CudaHandlerModuleData *data = module->module_data;
+    while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
+        data = data->next;
+    }
+    if (data == NULL) {
+        SCLogInfo("Data \"%s\" is not registered for module \"%s\". "
+                  "Returning NULL.", data_name, module_name);
+        SCMutexUnlock(&mutex);
+        return NULL;
+    }
+
+    SCMutexUnlock(&mutex);
+    return data->data;
+}
+
+int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image)
+{
+#define CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE 15
+
+    int i = 0;
+
+    /* select the ptx image based on the compute capability supported by all
+     * devices (i.e. the lowest) */
+    char *image = SCMalloc(strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
+    if (unlikely(image == NULL)) {
+        exit(EXIT_FAILURE);
+    }
+    memset(image, 0x00, strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
+
+    int major = INT_MAX;
+    int minor = INT_MAX;
+    SCCudaDevices *devices = SCCudaGetDeviceList();
+    for (i = 0; i < devices->count; i++) {
+        if (devices->devices[i]->major_rev < major) {
+            major = devices->devices[i]->major_rev;
+            minor = devices->devices[i]->minor_rev;
+        }
+        if (devices->devices[i]->major_rev == major &&
+            devices->devices[i]->minor_rev < minor) {
+            minor = devices->devices[i]->minor_rev;
+        }
+    }
+    snprintf(image,
+             strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE,
+             "%s_sm_%u%u",
+             ptx_image, major, minor);
+
+    /* load the cuda module for the selected arch and return the module
+     * reference back to the calling function through the argument */
+    SCLogDebug("Loading kernel module: %s\n", image);
+    if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1)
+        goto error;
+    SCFree(image);
+
+    return 0;
+ error:
+    SCFree(image);
+    return -1;
+
+#undef CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE
+}
+
+
+#endif /* __SC_CUDA_SUPPORT__ */
diff --git a/src/util-cuda-handlers.h b/src/util-cuda-handlers.h
new file mode 100644
index 0000000000..eee227df60
--- /dev/null
+++ b/src/util-cuda-handlers.h
@@ -0,0 +1,50 @@
+/* Copyright (C) 2007-2012 Open Information Security Foundation
+ *
+ * You can copy, redistribute or modify this Program under the terms of
+ * the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +/** + * \file + * + * \author Anoop Saldanha + */ + +#ifndef __UTIL_CUDA_HANDLERS__H__ +#define __UTIL_CUDA_HANDLERS__H__ + +#include "conf.h" +#include "util-cuda.h" + +/************************conf file profile section**********************/ + +void CudaHandlerAddCudaProfileFromConf(const char *name, + void *(*Callback)(ConfNode *node), + void (*Free)(void *)); +void *CudaHandlerGetCudaProfile(const char *name); +void CudaHandlerFreeProfiles(void); + +/*******************cuda context related data section*******************/ + +#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_HOST 0 +#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_DEVICE 1 +#define CUDA_HANDLER_MODULE_DATA_TYPE_CUDA_BUFFER 2 + +CUcontext CudaHandlerModuleGetContext(const char *module_name, int device_id); +void CudaHandlerModuleStoreData(const char *module_name, + const char *data_name, void *data_ptr); +void *CudaHandlerModuleGetData(const char *module_name, const char *data_name); +int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image); + +#endif /* __UTIL_CUDA_HANDLERS__H__ */ diff --git a/src/util-cuda.c b/src/util-cuda.c index 80eb4788d4..a7044c85c8 100644 --- a/src/util-cuda.c +++ b/src/util-cuda.c @@ -2346,7 +2346,7 @@ int SCCudaMemAllocHost(void **pp, size_t byte_size) } result = cuMemAllocHost(pp, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_HOST) == -1) goto error; return 0; diff --git a/src/util-error.c b/src/util-error.c index af93e3ffdf..5ebcf0f407 100644 --- a/src/util-error.c +++ b/src/util-error.c @@ -161,7 +161,7 @@ const char * SCErrorToString(SCError err) CASE_CODE (SC_ERR_CUDA_HANDLER_ERROR); CASE_CODE (SC_ERR_TM_THREADS_ERROR); CASE_CODE (SC_ERR_TM_MODULES_ERROR); - CASE_CODE (SC_ERR_B2G_CUDA_ERROR); + CASE_CODE (SC_ERR_AC_CUDA_ERROR); CASE_CODE (SC_ERR_INVALID_YAML_CONF_ENTRY); CASE_CODE (SC_ERR_TMQ_ALREADY_REGISTERED); CASE_CODE (SC_ERR_CONFLICTING_RULE_KEYWORDS); diff --git a/src/util-error.h b/src/util-error.h index 9897d28f8f..6500751c9d 100644 --- a/src/util-error.h +++ b/src/util-error.h @@ -165,7 +165,7 @@ typedef enum { SC_ERR_CUDA_HANDLER_ERROR, SC_ERR_TM_THREADS_ERROR, SC_ERR_TM_MODULES_ERROR, - SC_ERR_B2G_CUDA_ERROR, + SC_ERR_AC_CUDA_ERROR, SC_ERR_INVALID_YAML_CONF_ENTRY, SC_ERR_TMQ_ALREADY_REGISTERED, SC_ERR_CONFLICTING_RULE_KEYWORDS, diff --git a/src/util-mpm-ac-cuda-kernel.cu b/src/util-mpm-ac-cuda-kernel.cu new file mode 100644 index 0000000000..d7cc125bf2 --- /dev/null +++ b/src/util-mpm-ac-cuda-kernel.cu @@ -0,0 +1,96 @@ +/* Copyright (C) 2007-2012 Open Information Security Foundation + * + * You can copy, redistribute or modify this Program under the terms of + * the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/** + * \file + * + * \author Anoop Saldanha + * + * The Cuda kernel for MPM AC. + * + * \todo - This is a basic version of the kernel. + * - Support 16 bit state tables. + * - Texture memory. + * - Multiple threads per blocks of threads. 
Make use of + * shared memory/texture memory. + */ + +extern "C" +__global__ void SCACCudaSearch64(unsigned char *d_buffer, + unsigned int d_buffer_start_offset, + unsigned int *o_buffer, + unsigned int *results_buffer, + unsigned int nop, + unsigned char *tolower) +{ + unsigned int u = 0; + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= nop) + return; + + unsigned int buflen = *((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset))); + unsigned int (*state_table_u32)[256] = + (unsigned int (*)[256])*((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8)); + unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 16); + + unsigned int state = 0; + unsigned int matches = 0; + unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1); + for (u = 0; u < buflen; u++) { + state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]]; + if (state & 0xFF000000) { + results[matches++] = u; + results[matches++] = state & 0x00FFFFFF; + } + } + + *(results - 1) = matches; + return; +} + +extern "C" +__global__ void SCACCudaSearch32(unsigned char *d_buffer, + unsigned int d_buffer_start_offset, + unsigned int *o_buffer, + unsigned int *results_buffer, + unsigned int nop, + unsigned char *tolower) +{ + unsigned int u = 0; + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= nop) + return; + + unsigned int buflen = *((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset))); + unsigned int (*state_table_u32)[256] = + (unsigned int (*)[256])*((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 4)); + unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8); + + unsigned int state = 0; + unsigned int matches = 0; + unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1); + for (u = 0; u < buflen; u++) { + state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]]; + if (state & 0xFF000000) { + results[matches++] = u; + results[matches++] = state & 0x00FFFFFF; + } + } + + *(results - 1) = matches; + return; +} diff --git a/src/util-mpm-ac.c b/src/util-mpm-ac.c index bde8e914b2..78d6b50faf 100644 --- a/src/util-mpm-ac.c +++ b/src/util-mpm-ac.c @@ -49,12 +49,22 @@ #include "suricata.h" #include "detect.h" -#include "util-mpm-ac.h" #include "conf.h" #include "util-debug.h" #include "util-unittest.h" #include "util-memcmp.h" +#include "util-mpm-ac.h" + +#ifdef __SC_CUDA_SUPPORT__ + +#include "util-mpm.h" +#include "tm-threads.h" +#include "util-mpm.h" +#include "detect-engine-mpm.h" +#include "util-cuda.h" +#include "util-cuda-handlers.h" +#endif /* __SC_CUDA_SUPPORT__ */ void SCACInitCtx(MpmCtx *, int); void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *, uint32_t); @@ -78,6 +88,8 @@ void SCACRegisterTests(void); #define STATE_QUEUE_CONTAINER_SIZE 65536 +static int construct_both_16_and_32_state_tables = 0; + /** * \brief Helper structure used by AC during state table creation */ @@ -87,31 +99,6 @@ typedef struct StateQueue_ { int bot; } StateQueue; -/** - * \brief Register the aho-corasick mpm. - */ -void MpmACRegister(void) -{ - mpm_table[MPM_AC].name = "ac"; - /* don't need this. isn't that awesome? 
no more chopping and blah blah */ - mpm_table[MPM_AC].max_pattern_length = 0; - - mpm_table[MPM_AC].InitCtx = SCACInitCtx; - mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx; - mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx; - mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx; - mpm_table[MPM_AC].AddPattern = SCACAddPatternCS; - mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI; - mpm_table[MPM_AC].Prepare = SCACPreparePatterns; - mpm_table[MPM_AC].Search = SCACSearch; - mpm_table[MPM_AC].Cleanup = NULL; - mpm_table[MPM_AC].PrintCtx = SCACPrintInfo; - mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats; - mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests; - - return; -} - /** * \internal * \brief Initialize the AC context with user specified conf parameters. We @@ -759,7 +746,7 @@ static inline void SCACCreateDeltaTable(MpmCtx *mpm_ctx) int ascii_code = 0; int32_t r_state = 0; - if (ctx->state_count < 32767) { + if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) { ctx->state_table_u16 = SCMalloc(ctx->state_count * sizeof(SC_AC_STATE_TYPE_U16) * 256); if (ctx->state_table_u16 == NULL) { @@ -797,7 +784,9 @@ static inline void SCACCreateDeltaTable(MpmCtx *mpm_ctx) } } } - } else { + } + + if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) { /* create space for the state table. We could have used the existing goto * table, but since we have it set to hold 32 bit state values, we will create * a new state table here of type SC_AC_STATE_TYPE(current set to uint16_t) */ @@ -850,7 +839,7 @@ static inline void SCACClubOutputStatePresenceWithDeltaTable(MpmCtx *mpm_ctx) uint32_t state = 0; uint32_t temp_state = 0; - if (ctx->state_count < 32767) { + if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) { for (state = 0; state < ctx->state_count; state++) { for (ascii_code = 0; ascii_code < 256; ascii_code++) { temp_state = ctx->state_table_u16[state & 0x7FFF][ascii_code]; @@ -858,7 +847,9 @@ static inline void SCACClubOutputStatePresenceWithDeltaTable(MpmCtx *mpm_ctx) ctx->state_table_u16[state & 0x7FFF][ascii_code] |= (1 << 15); } } - } else { + } + + if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) { for (state = 0; state < ctx->state_count; state++) { for (ascii_code = 0; ascii_code < 256; ascii_code++) { temp_state = ctx->state_table_u32[state & 0x00FFFFFF][ascii_code]; @@ -1032,6 +1023,25 @@ int SCACPreparePatterns(MpmCtx *mpm_ctx) /* prepare the state table required by AC */ SCACPrepareStateTable(mpm_ctx); +#ifdef __SC_CUDA_SUPPORT__ + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + int r = SCCudaMemAlloc(&ctx->state_table_u32_cuda, + ctx->state_count * sizeof(unsigned int) * 256); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); + exit(EXIT_FAILURE); + } + + r = SCCudaMemcpyHtoD(ctx->state_table_u32_cuda, + ctx->state_table_u32, + ctx->state_count * sizeof(unsigned int) * 256); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); + exit(EXIT_FAILURE); + } + } +#endif + /* free all the stored patterns. 
Should save us a good 100-200 mbs */ for (i = 0; i < mpm_ctx->pattern_cnt; i++) { if (ctx->parray[i] != NULL) { @@ -1166,7 +1176,8 @@ void SCACDestroyCtx(MpmCtx *mpm_ctx) mpm_ctx->memory_cnt++; mpm_ctx->memory_size -= (ctx->state_count * sizeof(SC_AC_STATE_TYPE_U16) * 256); - } else if (ctx->state_table_u32 != NULL) { + } + if (ctx->state_table_u32 != NULL) { SCFree(ctx->state_table_u32); ctx->state_table_u32 = NULL; @@ -1394,6 +1405,533 @@ void SCACPrintInfo(MpmCtx *mpm_ctx) return; } +/****************************Cuda side of things****************************/ + +#ifdef __SC_CUDA_SUPPORT__ +/* \todos + * - Use texture memory - Can we fit all the arrays into a 3d texture. + * Texture memory definitely offers slightly better performance even + * on gpus that offer cache for global memory. + * - Packetpool - modify to support > 65k max pending packets. We are + * hitting packetpool limit currently even with 65k packets. + * - Use streams. We have tried overlapping parsing results from the + * previous call with invoking the next call. + * - Offer higher priority to decode threads. + * - Modify pcap file mode to support reading from multiple pcap files + * and hence we will have multiple receive threads. + * - Split state table into many small pieces and have multiple threads + * run each small state table on the same payload. + * - Used a config peference of l1 over shared memory with no noticeable + * perf increase. Explore it in detail over cards/architectures. + * - Constant memory performance sucked. Explore it in detail. + * - Currently all our state tables are small. Implement 16 bit state + * tables on priority. + * - Introduce profiling. + * - Retrieve sgh before buffer packet. + * - Buffer smsgs too. + */ + +void SCACConstructBoth16and32StateTables(void) +{ + construct_both_16_and_32_state_tables = 1; + + return; +} + +/* \todo Reduce offset buffer size. Probably a 100,000 entry would be sufficient. 
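+ *
+ * Layout reminder (the same on both sides of the transfer): for every
+ * buffered packet DecodePcapFileBufferPacket() writes into its CudaBuffer
+ * slice
+ *
+ *   [payload_len (4 or 8 bytes)][state table CUdeviceptr][payload bytes]
+ *
+ * and for packet i the kernel writes its results at
+ * results_buffer + 2 * (o_buffer[i] - d_buffer_start_offset):
+ *
+ *   [n][offset 1][state 1][offset 2][state 2]...
+ *
+ * where n counts the uint32_t entries that follow, i.e. two per match.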
*/
+static void *SCACCudaDispatcher(void *arg)
+{
+#define BLOCK_SIZE 32
+
+    int r = 0;
+    ThreadVars *tv = (ThreadVars *)arg;
+    MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+    uint32_t sleep_interval_ms = conf->batching_timeout;
+
+    SCLogInfo("AC Cuda Mpm Dispatcher using a timeout of "
+              "\"%"PRIu32"\" micro-seconds", sleep_interval_ms);
+
+    CudaBufferData *cb_data =
+        CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME,
+                                 MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
+
+    CUcontext cuda_context =
+        CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
+    if (cuda_context == 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "context is NULL.");
+        exit(EXIT_FAILURE);
+    }
+    r = SCCudaCtxPushCurrent(cuda_context);
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "context push failed.");
+        exit(EXIT_FAILURE);
+    }
+    CUmodule cuda_module = 0;
+    if (CudaHandlerGetCudaModule(&cuda_module, "util-mpm-ac-cuda-kernel") < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda module.");
+        exit(EXIT_FAILURE);
+    }
+    CUfunction kernel = 0;
+#if __WORDSIZE==64
+    if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch64") == -1) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
+        exit(EXIT_FAILURE);
+    }
+#else
+    if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch32") == -1) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
+        exit(EXIT_FAILURE);
+    }
+#endif
+
+    /* build the host-side lowercase lookup table. The loop counter must be
+     * wider than uint8_t: "uint8_t c; c < 255" would leave entry 255
+     * uninitialized, while "c <= 255" would never terminate. */
+    uint8_t g_u8_lowercasetable[256];
+    for (int c = 0; c < 256; c++)
+        g_u8_lowercasetable[c] = tolower(c);
+    CUdeviceptr cuda_g_u8_lowercasetable_d = 0;
+    CUdeviceptr cuda_packets_buffer_d = 0;
+    CUdeviceptr cuda_offset_buffer_d = 0;
+    CUdeviceptr cuda_results_buffer_d = 0;
+    uint32_t *cuda_results_buffer_h = NULL;
+    r = SCCudaMemAlloc(&cuda_g_u8_lowercasetable_d, sizeof(g_u8_lowercasetable));
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+        exit(EXIT_FAILURE);
+    }
+    r = SCCudaMemcpyHtoD(cuda_g_u8_lowercasetable_d, g_u8_lowercasetable, sizeof(g_u8_lowercasetable));
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
+        exit(EXIT_FAILURE);
+    }
+    r = SCCudaMemAlloc(&cuda_packets_buffer_d, conf->gpu_transfer_size);
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+        exit(EXIT_FAILURE);
+    }
+    r = SCCudaMemAlloc(&cuda_offset_buffer_d, conf->gpu_transfer_size * 4);
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+        exit(EXIT_FAILURE);
+    }
+    r = SCCudaMemAlloc(&cuda_results_buffer_d, conf->gpu_transfer_size * 8);
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+        exit(EXIT_FAILURE);
+    }
+    r = SCCudaMemAllocHost((void **)&cuda_results_buffer_h, conf->gpu_transfer_size * 8);
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+        exit(EXIT_FAILURE);
+    }
+
+    CudaBufferCulledInfo cb_culled_info;
+    memset(&cb_culled_info, 0, sizeof(cb_culled_info));
+
+    TmThreadsSetFlag(tv, THV_INIT_DONE);
+    while (1) {
+        if (TmThreadsCheckFlag(tv, THV_KILL))
+            break;
+
+        usleep(sleep_interval_ms);
+
+        /**************** 1 SEND ****************/
+        CudaBufferCullCompletedSlices(cb_data, &cb_culled_info);
+        if (cb_culled_info.no_of_items == 0)
+            continue;
+#if 0
+        SCLogInfo("1 - cb_culled_info.no_of_items-%"PRIu32" "
+                  "cb_culled_info.buffer_len - %"PRIu32" "
+                  "cb_culled_info.average size - %f "
+                  "cb_culled_info.d_buffer_start_offset - %"PRIu32" "
+                  "cb_culled_info.op_buffer_start_offset - %"PRIu32" "
+                  "cb_data.no_of_items - %"PRIu32" "
+
"cb_data.d_buffer_read - %"PRIu32" " + "cb_data.d_buffer_write - %"PRIu32" " + "cb_data.op_buffer_read - %"PRIu32" " + "cb_data.op_buffer_write - %"PRIu32"\n", + cb_culled_info.no_of_items, + cb_culled_info.d_buffer_len, + cb_culled_info.d_buffer_len / (float)cb_culled_info.no_of_items, + cb_culled_info.d_buffer_start_offset, + cb_culled_info.op_buffer_start_offset, + cb_data->no_of_items, + cb_data->d_buffer_read, + cb_data->d_buffer_write, + cb_data->op_buffer_read, + cb_data->op_buffer_write); +#endif + r = SCCudaMemcpyHtoD(cuda_packets_buffer_d, (cb_data->d_buffer + cb_culled_info.d_buffer_start_offset), cb_culled_info.d_buffer_len); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); + exit(EXIT_FAILURE); + } + r = SCCudaMemcpyHtoD(cuda_offset_buffer_d, (cb_data->o_buffer + cb_culled_info.op_buffer_start_offset), sizeof(uint32_t) * cb_culled_info.no_of_items); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); + exit(EXIT_FAILURE); + } + void *args[] = { &cuda_packets_buffer_d, + &cb_culled_info.d_buffer_start_offset, + &cuda_offset_buffer_d, + &cuda_results_buffer_d, + &cb_culled_info.no_of_items, + &cuda_g_u8_lowercasetable_d }; + r = SCCudaLaunchKernel(kernel, + (cb_culled_info.no_of_items / BLOCK_SIZE) + 1, 1, 1, + BLOCK_SIZE, 1, 1, + 0, 0, + args, NULL); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaLaunchKernel failure."); + exit(EXIT_FAILURE); + } + r = SCCudaMemcpyDtoH(cuda_results_buffer_h, cuda_results_buffer_d, sizeof(uint32_t) * (cb_culled_info.d_buffer_len * 2)); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyDtoH failure."); + exit(EXIT_FAILURE); + } + + + + /**************** 1 SYNCHRO ****************/ + r = SCCudaCtxSynchronize(); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaCtxSynchronize failure."); + exit(EXIT_FAILURE); + } + + /************* 1 Parse Results ************/ + uint32_t i_op_start_offset = cb_culled_info.op_buffer_start_offset; + uint32_t no_of_items = cb_culled_info.no_of_items; + uint32_t *o_buffer = cb_data->o_buffer; + uint32_t d_buffer_start_offset = cb_culled_info.d_buffer_start_offset; + for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) { + Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset]; + + p->cuda_gpu_matches = + cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)]; + if (p->cuda_gpu_matches != 0) { + memcpy(p->cuda_results, + cuda_results_buffer_h + + ((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2), + (cuda_results_buffer_h[((o_buffer[i_op_start_offset] - + d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4); + } + + SCMutexLock(&p->cuda_mutex); + p->cuda_done = 1; + SCMutexUnlock(&p->cuda_mutex); + SCCondSignal(&p->cuda_cond); + } + if (no_of_items != 0) + CudaBufferReportCulledConsumption(cb_data, &cb_culled_info); + } /* while (1) */ + + r = SCCudaModuleUnload(cuda_module); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error unloading cuda module."); + exit(EXIT_FAILURE); + } + r = SCCudaMemFree(cuda_packets_buffer_d); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); + exit(EXIT_FAILURE); + } + r = SCCudaMemFree(cuda_offset_buffer_d); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); + exit(EXIT_FAILURE); + } + r = SCCudaMemFree(cuda_results_buffer_d); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); + exit(EXIT_FAILURE); + } + r = 
SCCudaMemFreeHost(cuda_results_buffer_h); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); + exit(EXIT_FAILURE); + } + + TmThreadsSetFlag(tv, THV_RUNNING_DONE); + TmThreadWaitForFlag(tv, THV_DEINIT); + TmThreadsSetFlag(tv, THV_CLOSED); + + return NULL; + +#undef BLOCK_SIZE +} + +uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx, + PatternMatcherQueue *pmq) +{ + uint32_t u = 0; + + while (!p->cuda_done) { + SCMutexLock(&p->cuda_mutex); + if (p->cuda_done) { + SCMutexUnlock(&p->cuda_mutex); + break; + } else { + SCCondWait(&p->cuda_cond, &p->cuda_mutex); + SCMutexUnlock(&p->cuda_mutex); + } + } /* while */ + p->cuda_done = 0; + p->cuda_mpm_enabled = 0; + + uint32_t cuda_matches = p->cuda_gpu_matches; + if (cuda_matches == 0) + return 0; + + uint32_t matches = 0; + uint32_t *results = p->cuda_results + 1; + uint8_t *buf = p->payload; + SCACCtx *ctx = mpm_ctx->ctx; + SCACOutputTable *output_table = ctx->output_table; + SCACPatternList *pid_pat_list = ctx->pid_pat_list; + + for (u = 0; u < cuda_matches; u += 2) { + uint32_t offset = results[u]; + uint32_t state = results[u + 1]; + /* we should technically be doing state & 0x00FFFFFF, but we don't + * since the cuda kernel does that for us */ + uint32_t no_of_entries = output_table[state].no_of_entries; + /* we should technically be doing state & 0x00FFFFFF, but we don't + * since the cuda kernel does that for us */ + uint32_t *pids = output_table[state].pids; + uint32_t k; + /* note that this is not a verbatim copy from SCACSearch(). We + * don't copy the pattern id into the pattern_id_array. That's + * the only change */ + for (k = 0; k < no_of_entries; k++) { + if (pids[k] & 0xFFFF0000) { + if (SCMemcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs, + buf + offset - pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1, + pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) { + /* inside loop */ + if (pid_pat_list[pids[k] & 0x0000FFFF].case_state != 3) { + continue; + } + } + if (pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] & (1 << ((pids[k] & 0x0000FFFF) % 8))) { + ; + } else { + pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] |= (1 << ((pids[k] & 0x0000FFFF) % 8)); + } + matches++; + } else { + if (pmq->pattern_id_bitarray[pids[k] / 8] & (1 << (pids[k] % 8))) { + ; + } else { + pmq->pattern_id_bitarray[pids[k] / 8] |= (1 << (pids[k] % 8)); + } + matches++; + } + } + } + + return matches; +} + +void SCACCudaStartDispatcher(void) +{ + /* create the threads */ + ThreadVars *tv = TmThreadCreate("Cuda_Mpm_AC_Dispatcher", + NULL, NULL, + NULL, NULL, + "custom", SCACCudaDispatcher, 0); + if (tv == NULL) { + SCLogError(SC_ERR_THREAD_CREATE, "Error creating a thread for " + "ac cuda dispatcher. Killing engine."); + exit(EXIT_FAILURE); + } + if (TmThreadSpawn(tv) != 0) { + SCLogError(SC_ERR_THREAD_SPAWN, "Failed to spawn thread for " + "ac cuda dispatcher. 
Killing engine."); + exit(EXIT_FAILURE); + } + + return; +} + +int MpmCudaBufferSetup(void) +{ + int r = 0; + MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); + if (conf == NULL) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); + return -1; + } + + CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); + if (cuda_context == 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context."); + return -1; + } + r = SCCudaCtxPushCurrent(cuda_context); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context."); + return -1; + } + + uint8_t *d_buffer = NULL; + uint32_t *o_buffer = NULL; + void **p_buffer = NULL; + + r = SCCudaMemAllocHost((void *)&d_buffer, conf->cb_buffer_size); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure."); + return -1; + } + SCLogInfo("Allocated a cuda d_buffer - %"PRIu32" bytes", conf->cb_buffer_size); + r = SCCudaMemAllocHost((void *)&o_buffer, sizeof(uint32_t) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failue."); + return -1; + } + r = SCCudaMemAllocHost((void *)&p_buffer, sizeof(void *) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure."); + return -1; + } + + r = SCCudaCtxPopCurrent(NULL); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure."); + return -1; + } + + CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, conf->cb_buffer_size, o_buffer, p_buffer, UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); + if (cb == NULL) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error registering new cb instance."); + return -1; + } + CudaHandlerModuleStoreData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME, cb); + + return 0; +} + +int MpmCudaBufferDeSetup(void) +{ + int r = 0; + MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); + if (conf == NULL) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); + return -1; + } + + CudaBufferData *cb_data = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); + BUG_ON(cb_data == NULL); + + CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); + if (cuda_context == 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context."); + return -1; + } + r = SCCudaCtxPushCurrent(cuda_context); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context."); + return -1; + } + + r = SCCudaMemFreeHost(cb_data->d_buffer); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); + return -1; + } + r = SCCudaMemFreeHost(cb_data->o_buffer); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); + return -1; + } + r = SCCudaMemFreeHost(cb_data->p_buffer); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); + return -1; + } + + r = SCCudaCtxPopCurrent(NULL); + if (r < 0) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure."); + return -1; + } + + CudaBufferDeRegister(cb_data); + + return 0; +} + +#endif /* __SC_CUDA_SUPPORT */ + +/************************** Mpm Registration ***************************/ + +/** + * \brief Register the aho-corasick mpm. + */ +void MpmACRegister(void) +{ + mpm_table[MPM_AC].name = "ac"; + /* don't need this. isn't that awesome? 
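[MpmCudaBufferSetup() above allocates all three ring-buffer arrays with SCCudaMemAllocHost() rather than plain malloc: page-locked memory is what lets the dispatcher's HtoD copy of packet data run at DMA speed. A hedged sketch of the underlying driver-API calls the SCCuda wrappers are assumed to forward to, with error handling trimmed:

#include <cuda.h>
#include <stddef.h>
#include <stdint.h>

/* allocate a page-locked buffer the gpu can DMA from directly */
static int alloc_pinned_buffer(uint8_t **buf, size_t size)
{
    if (cuMemAllocHost((void **)buf, size) != CUDA_SUCCESS)
        return -1;
    return 0;
}

/* matching release for the buffer above */
static void free_pinned_buffer(uint8_t *buf)
{
    (void)cuMemFreeHost(buf);
}
]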
+
+/************************** Mpm Registration ***************************/
+
+/**
+ * \brief Register the aho-corasick mpm.
+ */
+void MpmACRegister(void)
+{
+    mpm_table[MPM_AC].name = "ac";
+    /* don't need this. isn't that awesome? no more chopping and blah blah */
+    mpm_table[MPM_AC].max_pattern_length = 0;
+
+    mpm_table[MPM_AC].InitCtx = SCACInitCtx;
+    mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
+    mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
+    mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
+    mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
+    mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
+    mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
+    mpm_table[MPM_AC].Search = SCACSearch;
+    mpm_table[MPM_AC].Cleanup = NULL;
+    mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
+    mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
+    mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
+
+    return;
+}
+
+#ifdef __SC_CUDA_SUPPORT__
+
+/**
+ * \brief Register the aho-corasick cuda mpm.
+ */
+void MpmACCudaRegister(void)
+{
+    mpm_table[MPM_AC_CUDA].name = "ac-cuda";
+    /* don't need this. isn't that awesome? no more chopping and blah blah */
+    mpm_table[MPM_AC_CUDA].max_pattern_length = 0;
+
+    mpm_table[MPM_AC_CUDA].InitCtx = SCACInitCtx;
+    mpm_table[MPM_AC_CUDA].InitThreadCtx = SCACInitThreadCtx;
+    mpm_table[MPM_AC_CUDA].DestroyCtx = SCACDestroyCtx;
+    mpm_table[MPM_AC_CUDA].DestroyThreadCtx = SCACDestroyThreadCtx;
+    mpm_table[MPM_AC_CUDA].AddPattern = SCACAddPatternCS;
+    mpm_table[MPM_AC_CUDA].AddPatternNocase = SCACAddPatternCI;
+    mpm_table[MPM_AC_CUDA].Prepare = SCACPreparePatterns;
+    mpm_table[MPM_AC_CUDA].Search = SCACSearch;
+    mpm_table[MPM_AC_CUDA].Cleanup = NULL;
+    mpm_table[MPM_AC_CUDA].PrintCtx = SCACPrintInfo;
+    mpm_table[MPM_AC_CUDA].PrintThreadCtx = SCACPrintSearchStats;
+    mpm_table[MPM_AC_CUDA].RegisterUnittests = SCACRegisterTests;
+
+    if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
+        MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+        if (conf == NULL) {
+            SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+            exit(EXIT_FAILURE);
+        }
+
+        if (MpmCudaBufferSetup() < 0) {
+            SCLogError(SC_ERR_AC_CUDA_ERROR, "Error setting up env for ac cuda.");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    return;
+}
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
 /*************************************Unittests********************************/
 #ifdef UNITTESTS
diff --git a/src/util-mpm-ac.h b/src/util-mpm-ac.h
index 965dc05eb9..a9df2368cf 100644
--- a/src/util-mpm-ac.h
+++ b/src/util-mpm-ac.h
@@ -22,9 +22,16 @@
  *
  */
+#ifndef __UTIL_MPM_AC__H__
+#define __UTIL_MPM_AC__H__
+
 #define SC_AC_STATE_TYPE_U16 uint16_t
 #define SC_AC_STATE_TYPE_U32 uint32_t
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda.h"
+#endif /* __SC_CUDA_SUPPORT__ */
+
 typedef struct SCACPattern_ {
     /* length of the pattern */
     uint16_t len;
@@ -79,6 +86,11 @@ typedef struct SCACCtx_ {
     /* the size of each state */
     uint16_t single_state_size;
     uint16_t max_pat_id;
+
+#ifdef __SC_CUDA_SUPPORT__
+    CUdeviceptr state_table_u16_cuda;
+    CUdeviceptr state_table_u32_cuda;
+#endif /* __SC_CUDA_SUPPORT__ */
 } SCACCtx;

 typedef struct SCACThreadCtx_ {
@@ -89,3 +101,24 @@
 } SCACThreadCtx;

 void MpmACRegister(void);
+
+
+#ifdef __SC_CUDA_SUPPORT__
+
+#define MPM_AC_CUDA_MODULE_NAME "ac_cuda"
+#define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb"
+
+
+void MpmACCudaRegister(void);
+void SCACConstructBoth16and32StateTables(void);
+int MpmCudaBufferSetup(void);
+int MpmCudaBufferDeSetup(void);
+void SCACCudaStartDispatcher(void);
+void SCACCudaKillDispatcher(void);
+uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
+                                         PatternMatcherQueue *pmq);
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
+
+#endif /* __UTIL_MPM_AC__H__ */
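[The header above is the whole public surface of the ac-cuda module. The call order implied by these declarations and by the registration code earlier in this patch is roughly the following; this is an inferred sketch, not a verbatim call site from the engine:

/* prototypes as declared in util-mpm-ac.h */
void MpmACCudaRegister(void);
void SCACCudaStartDispatcher(void);
void SCACCudaKillDispatcher(void);
int MpmCudaBufferDeSetup(void);

static void ac_cuda_lifecycle_sketch(void)
{
    /* fills mpm_table[MPM_AC_CUDA]; if ac-cuda is the default matcher,
     * this also allocates the pinned CudaBuffer arrays */
    MpmACCudaRegister();
    /* spawns the "Cuda_Mpm_AC_Dispatcher" thread that batches packet
     * payloads to the gpu */
    SCACCudaStartDispatcher();

    /* ... packets flow; detect threads block in
     * SCACCudaPacketResultsProcessing() until the dispatcher signals ... */

    /* shutdown, mirroring the setup order */
    SCACCudaKillDispatcher();
    MpmCudaBufferDeSetup();
}
]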
diff --git a/src/util-mpm-b2g-cuda-kernel.cu b/src/util-mpm-b2g-cuda-kernel.cu
deleted file mode 100644
index 8fd1c5582b..0000000000
--- a/src/util-mpm-b2g-cuda-kernel.cu
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (C) 2007-2010 Open Information Security Foundation
- *
- * You can copy, redistribute or modify this Program under the terms of
- * the GNU General Public License version 2 as published by the Free
- * Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-/**
- * \file
- *
- * \author Anoop Saldanha
- *
- * The Cuda kernel for MPM B2G.
- *
- * \todo This is a basic version of the kernel. Modify it to support multiple
- *       blocks of threads.  Make use of shared memory/texture memory.
- */
-
-#define B2G_CUDA_Q 2
-#define CUDA_THREADS 4000
-#define B2G_CUDA_HASHSHIFT 4
-#define B2G_CUDA_TYPE unsigned int
-#define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
-#define u8_tolower(c) g_u8_lowercasetable[(c)]
-
-typedef struct SCCudaPBPacketDataForGPU_ {
-    /* holds the value B2gCtx->m */
-    unsigned int m;
-    /* holds B2gCtx->B2g */
-    unsigned int table;
-    /* holds the length of the payload */
-    unsigned int payload_len;
-    /* holds the payload */
-    unsigned char payload;
-} SCCudaPBPacketDataForGPU;
-
-extern "C"
-__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
-                                   unsigned char *packets_buffer,
-                                   unsigned int *packets_offset_buffer,
-                                   unsigned int *packets_payload_offset_buffer,
-                                   unsigned int nop,
-                                   unsigned char *g_u8_lowercasetable)
-{
-    unsigned int tid = blockIdx.x * 32 + threadIdx.x;
-    /* if the thread id is greater than the no of packets sent in the packets
-     * buffer, terminate the thread */
-    //if (tid <= nop)
-    if (tid >= nop)
-        return;
-
-    SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
-    unsigned int m = packet->m;
-    unsigned char *buf = &packet->payload;
-    unsigned int buflen = packet->payload_len;
-    unsigned int *B2G = (unsigned int *)packet->table;
-    unsigned int pos = m - B2G_CUDA_Q + 1;
-    B2G_CUDA_TYPE d;
-    unsigned short h;
-    unsigned int first;
-    unsigned int j = 0;
-
-    unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
-    //unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
-    //unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
-    unsigned short *offsets = matches_count + 1;
-    // temporarily hold the results here, before we shift it to matches_count
-    // before returning
-    unsigned short matches = 0;
-
-    while (pos <= (buflen - B2G_CUDA_Q + 1)) {
-        h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
-        d = B2G[h];
-
-        if (d != 0) {
-            j = pos;
-            first = pos - (m - B2G_CUDA_Q + 1);
-
-            do {
-                j = j - 1;
-                if (d >= (1 << (m - 1))) {
-                    if (j > first) {
-                        pos = j;
-                    } else {
-                        offsets[matches++] = j;
-                    }
-                }
-
-                if (j == 0)
-                    break;
-
-                h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
-                d = (d << 1) & B2G[h];
-            } while (d != 0);
-        }
-        pos = pos + m - B2G_CUDA_Q + 1;
-    }
-
-    matches_count[0] = matches;
-
-    return;
-}
diff --git a/src/util-mpm.c b/src/util-mpm.c
index a8307ee8be..41ba875c67 100644
--- a/src/util-mpm.c
+++ b/src/util-mpm.c
@@ -45,6 +45,9 @@
 #include "conf-yaml-loader.h"
 #include "queue.h"
 #include "util-unittest.h"
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda-handlers.h"
+#endif

 /**
  * \brief Register a new Mpm Context.
@@ -265,6 +268,118 @@ void MpmFactoryDeRegisterAllMpmCtxProfiles(DetectEngineCtx *de_ctx)
     return;
 }

+#ifdef __SC_CUDA_SUPPORT__
+
+static void MpmCudaConfFree(void *conf)
+{
+    SCFree(conf);
+    return;
+}
+
+static void *MpmCudaConfParse(ConfNode *node)
+{
+    const char *value;
+
+    MpmCudaConf *conf = SCMalloc(sizeof(MpmCudaConf));
+    if (conf == NULL)
+        exit(EXIT_FAILURE);
+    memset(conf, 0, sizeof(MpmCudaConf));
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "data-buffer-size-min-limit");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT;
+    } else if (ParseSizeStringU16(value, &conf->data_buffer_size_min_limit) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "data-buffer-size-min-limit - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "data-buffer-size-max-limit");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT;
+    } else if (ParseSizeStringU16(value, &conf->data_buffer_size_max_limit) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "data-buffer-size-max-limit - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "cudabuffer-buffer-size");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT;
+    } else if (ParseSizeStringU32(value, &conf->cb_buffer_size) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "cudabuffer-buffer-size - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "gpu-transfer-size");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE;
+    } else if (ParseSizeStringU32(value, &conf->gpu_transfer_size) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "gpu-transfer-size - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "batching-timeout");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT;
+    } else if ((conf->batching_timeout = atoi(value)) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "batching-timeout - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "device-id");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT;
+    } else if ((conf->device_id = atoi(value)) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "device-id - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    if (node != NULL)
+        value = ConfNodeLookupChildValue(node, "cuda-streams");
+    else
+        value = NULL;
+    if (value == NULL) {
+        /* default */
+        conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT;
+    } else if ((conf->cuda_streams = atoi(value)) < 0) {
+        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+                   "cuda-streams - \"%s\"", node->name, value);
+        exit(EXIT_FAILURE);
+    }
+
+    return conf;
+}
+#endif
+
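[MpmCudaConfParse() above repeats the same lookup-with-default shape seven times. Purely as an illustration of a possible tightening (this helper is not part of the patch), the plain-integer cases could be folded into one function, with which the batching-timeout, device-id and cuda-streams blocks reduce to single assignments:

/* hypothetical helper: read an integer child value, falling back to a
 * default when the node or the key is absent; reject negative values
 * the same way the blocks above do */
static int ConfNodeChildValueIntWithDefault(ConfNode *node, const char *key,
                                            int def_value)
{
    const char *value = NULL;
    if (node != NULL)
        value = ConfNodeLookupChildValue(node, key);
    if (value == NULL)
        return def_value;

    int v = atoi(value);
    if (v < 0) {
        SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
                   "%s - \"%s\"", node->name, key, value);
        exit(EXIT_FAILURE);
    }
    return v;
}
]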
+ "device-id - \"%s\"", node->name, value); + exit(EXIT_FAILURE); + } + + if (node != NULL) + value = ConfNodeLookupChildValue(node, "cuda-streams"); + else + value = NULL; + if (value == NULL) { + /* default */ + conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT; + } else if ((conf->cuda_streams = atoi(value)) < 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." + "cuda-streams - \"%s\"", node->name, value); + exit(EXIT_FAILURE); + } + + return conf; +} +#endif + /** * \brief Setup a pmq * @@ -447,6 +562,10 @@ void MpmTableSetup(void) { MpmB2gcRegister(); MpmB2gmRegister(); MpmACRegister(); +#ifdef __SC_CUDA_SUPPORT__ + CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree); + MpmACCudaRegister(); +#endif /* __SC_CUDA_SUPPORT__ */ MpmACBSRegister(); MpmACGfbsRegister(); } diff --git a/src/util-mpm.h b/src/util-mpm.h index 1de44f2283..6b3c266760 100644 --- a/src/util-mpm.h +++ b/src/util-mpm.h @@ -65,6 +65,9 @@ enum { /* aho-corasick */ MPM_AC, +#ifdef __SC_CUDA_SUPPORT__ + MPM_AC_CUDA, +#endif /* aho-corasick-goto-failure state based */ MPM_AC_GFBS, MPM_AC_BS, @@ -81,6 +84,7 @@ typedef struct MpmThreadCtx_ { uint32_t memory_cnt; uint32_t memory_size; + } MpmThreadCtx; /** \brief helper structure for the pattern matcher engine. The Pattern Matcher @@ -176,6 +180,40 @@ typedef struct MpmTableElmt_ { MpmTableElmt mpm_table[MPM_TABLE_SIZE]; +/* macros decides if cuda is enabled for the platform or not */ +#ifdef __SC_CUDA_SUPPORT__ + +/* the min size limit of a payload(or any other data) to be buffered */ +#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT 0 +/* the max size limit of a payload(or any other data) to be buffered */ +#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT 1500 +/* Default value for data buffer used by cuda mpm engine for CudaBuffer reg */ +#define UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT 500 * 1024 * 1024 +/* Default value for the max data chunk that would be sent to gpu */ +#define UTIL_MPM_CUDA_GPU_TRANSFER_SIZE 50 * 1024 * 1024 +/* Default value for offset/pointer buffer to be used by cuda mpm + * engine for CudaBuffer reg */ +#define UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT 500000 +#define UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT 2000 +#define UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT 2 +#define UTIL_MPM_CUDA_DEVICE_ID_DEFAULT 0 + +/** + * \brief Cuda configuration for "mpm" profile. We can further extend this + * to have conf for specific mpms. For now its common for all mpms. + */ +typedef struct MpmCudaConf_ { + uint16_t data_buffer_size_min_limit; + uint16_t data_buffer_size_max_limit; + uint32_t cb_buffer_size; + uint32_t gpu_transfer_size; + int batching_timeout; + int device_id; + int cuda_streams; +} MpmCudaConf; + +#endif /* __SC_CUDA_SUPPORT__ */ + struct DetectEngineCtx_; int32_t MpmFactoryRegisterMpmCtxProfile(struct DetectEngineCtx_ *, const char *, uint8_t); diff --git a/suricata.yaml.in b/suricata.yaml.in index 88f521f4a6..4faae83722 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -404,34 +404,33 @@ threading: # Cuda configuration. cuda: # The "mpm" profile. On not specifying any of these parameters, the engine's - # internal default values are used, which are same as the ones specified here. - - mpm: - # Threshold limit for no of packets buffered to the GPU. Once we hit this - # limit, we pass the buffer to the gpu. - packet-buffer-limit: 2400 - # The maximum length for a packet that we would buffer to the gpu. - # Anything over this is MPM'ed on the CPU. 
diff --git a/suricata.yaml.in b/suricata.yaml.in
index 88f521f4a6..4faae83722 100644
--- a/suricata.yaml.in
+++ b/suricata.yaml.in
@@ -404,34 +404,33 @@ threading:

 # Cuda configuration.
 cuda:
   # The "mpm" profile.  On not specifying any of these parameters, the engine's
-  # internal default values are used, which are same as the ones specified here.
-
-  - mpm:
-    # Threshold limit for no of packets buffered to the GPU.  Once we hit this
-    # limit, we pass the buffer to the gpu.
-    packet-buffer-limit: 2400
-    # The maximum length for a packet that we would buffer to the gpu.
-    # Anything over this is MPM'ed on the CPU.  All entries > 0 are valid.
-    # Can be specified in kb, mb, gb.  Just a number indicates it's in bytes.
-    packet-size-limit: 1500
-    # No of packet buffers we initialize.  All entries > 0 are valid.
-    packet-buffers: 10
-    # The timeout limit for batching of packets in secs.  If we don't fill the
-    # buffer within this timeout limit, we pass the currently filled buffer to the gpu.
-    # All entries > 0 are valid.
-    batching-timeout: 1
-    # Specifies whether to use page-locked memory whereever possible.  Accepted values
-    # are "enabled" and "disabled".
-    page-locked: enabled
-    # The device to use for the mpm.  Currently we don't support load balancing
-    # on multiple gpus.  In case you have multiple devices on your system, you
-    # can specify the device to use, using this conf.  By default we hold 0, to
-    # specify the first device cuda sees.  To find out device-id associated with
-    # the card(s) on the system run "suricata --list-cuda-cards".
-    device-id: 0
-    # No of Cuda streams used for asynchronous processing. All values > 0 are valid.
-    # For this option you need a device with Compute Capability > 1.0 and
-    # page-locked enabled to have any effect.
-    cuda-streams: 2
+  # internal default values are used, which are the same as the ones specified
+  # in the default conf file.
+  mpm:
+    # The minimum length required to buffer data to the gpu.
+    # Anything below this is MPM'ed on the CPU.
+    # Can be specified in kb, mb, gb.  Just a number indicates it's in bytes.
+    # A value of 0 indicates there's no limit.
+    data-buffer-size-min-limit: 0
+    # The maximum length for data that we would buffer to the gpu.
+    # Anything over this is MPM'ed on the CPU.
+    # Can be specified in kb, mb, gb.  Just a number indicates it's in bytes.
+    data-buffer-size-max-limit: 1500
+    # The ring buffer size used by the CudaBuffer API to buffer data.
+    cudabuffer-buffer-size: 500mb
+    # The max chunk size that can be sent to the gpu in a single go.
+    gpu-transfer-size: 50mb
+    # The timeout limit for batching of packets in microseconds.
+    batching-timeout: 2000
+    # The device to use for the mpm.  Currently we don't support load balancing
+    # on multiple gpus.  In case you have multiple devices on your system, you
+    # can specify the device to use, using this conf.  By default we hold 0, to
+    # specify the first device cuda sees.  To find out the device-id associated
+    # with the card(s) on the system run "suricata --list-cuda-cards".
+    device-id: 0
+    # No of Cuda streams used for asynchronous processing.  All values > 0 are
+    # valid.  For this option you need a device with Compute Capability > 1.0.
+    cuda-streams: 2

 # Select the multi pattern algorithm you want to run for scan/search the
 # in the engine. The supported algorithms are b2g, b2gc, b2gm, b3g, wumanber,