From: Anoop Saldanha Date: Thu, 20 Jun 2013 17:56:23 +0000 (+0530) Subject: Minor cosmetic changes to the cuda code. X-Git-Tag: suricata-2.0beta1~100 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=602c91ed4199d6422d5702d66cbb729d77f279d2;p=thirdparty%2Fsuricata.git Minor cosmetic changes to the cuda code. Moved a couple of functions to more cuda relevant files; Re-structured some data types. --- diff --git a/src/Makefile.am b/src/Makefile.am index 905cd2fc6d..d47afe0490 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -258,6 +258,7 @@ util-crypt.c util-crypt.h \ util-cuda.c util-cuda.h \ util-cuda-buffer.c util-cuda-buffer.h \ util-cuda-handlers.c util-cuda-handlers.h \ +util-cuda-vars.c util-cuda-vars.h \ util-daemon.c util-daemon.h \ util-debug.c util-debug.h \ util-debug-filters.c util-debug-filters.h \ diff --git a/src/decode.h b/src/decode.h index 69eb7b0819..687c871920 100644 --- a/src/decode.h +++ b/src/decode.h @@ -32,6 +32,7 @@ #ifdef __SC_CUDA_SUPPORT__ #include "util-cuda-buffer.h" +#include "util-cuda-vars.h" #endif /* __SC_CUDA_SUPPORT__ */ typedef enum { @@ -491,12 +492,7 @@ typedef struct Packet_ PktProfiling profile; #endif #ifdef __SC_CUDA_SUPPORT__ - uint8_t cuda_mpm_enabled; - uint8_t cuda_done; - uint16_t cuda_gpu_matches; - SCMutex cuda_mutex; - SCCondT cuda_cond; - uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1]; + CudaPacketVars cuda_pkt_vars; #endif } Packet; @@ -583,21 +579,7 @@ typedef struct DecodeThreadVars_ uint16_t counter_defrag_max_hit; #ifdef __SC_CUDA_SUPPORT__ - /* cb - CudaBuffer */ - CudaBufferData *cuda_ac_cb; - - MpmCtx *mpm_proto_other_ctx; - - MpmCtx *mpm_proto_tcp_ctx_ts; - MpmCtx *mpm_proto_udp_ctx_ts; - - MpmCtx *mpm_proto_tcp_ctx_tc; - MpmCtx *mpm_proto_udp_ctx_tc; - - uint16_t data_buffer_size_max_limit; - uint16_t data_buffer_size_min_limit; - - uint8_t mpm_is_cuda; + CudaThreadVars cuda_vars; #endif } DecodeThreadVars; @@ -625,8 +607,8 @@ typedef struct 
DecodeThreadVars_ PACKET_RESET_CHECKSUMS((p)); \ (p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \ (p)->livedev = NULL; \ - SCMutexInit(&(p)->cuda_mutex, NULL); \ - SCCondInit(&(p)->cuda_cond, NULL); \ + SCMutexInit(&(p)->cuda_pkt_vars.cuda_mutex, NULL); \ + SCCondInit(&(p)->cuda_pkt_vars.cuda_cond, NULL); \ } while (0) #else #define PACKET_INITIALIZE(p) { \ diff --git a/src/detect-engine-mpm.c b/src/detect-engine-mpm.c index a059483002..62e405e9c7 100644 --- a/src/detect-engine-mpm.c +++ b/src/detect-engine-mpm.c @@ -225,7 +225,7 @@ uint32_t PacketPatternSearch(DetectEngineThreadCtx *det_ctx, Packet *p) SCReturnInt(0); #ifdef __SC_CUDA_SUPPORT__ - if (p->cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) { + if (p->cuda_pkt_vars.cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) { ret = SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq); } else { ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, diff --git a/src/detect.c b/src/detect.c index e629b20e8d..596ad3827c 100644 --- a/src/detect.c +++ b/src/detect.c @@ -4388,255 +4388,6 @@ int SigAddressPrepareStage5(DetectEngineCtx *de_ctx) { return 0; } -#ifdef __SC_CUDA_SUPPORT__ - -static void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx) -{ - MpmCtx *mpm_ctx = NULL; - - int ac_16_tables = 0; - int ac_32_tables = 0; - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if 
(ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx 
*)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - 
if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if 
(ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - if (ac_16_tables > 0 && ac_32_tables > 0) - SCACConstructBoth16and32StateTables(); - - - SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables); - SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables); - -} -#endif - /** * \brief Convert the signature list into the runtime match structure. * diff --git a/src/runmodes.c b/src/runmodes.c index 560c159100..5109051213 100644 --- a/src/runmodes.c +++ b/src/runmodes.c @@ -301,6 +301,15 @@ void RunModeDispatch(int runmode, const char *custom_mode, DetectEngineCtx *de_c } } +#ifdef __SC_CUDA_SUPPORT__ + if (PatternMatchDefaultMatcher() == MPM_AC_CUDA && + strcasecmp(custom_mode, "autofp") != 0) { + SCLogError(SC_ERR_RUNMODE, "When using a cuda mpm, the only runmode we " + "support is autofp."); + exit(EXIT_FAILURE); + } +#endif + RunMode *mode = RunModeGetCustomMode(runmode, custom_mode); if (mode == NULL) { SCLogError(SC_ERR_RUNMODE, "The custom type \"%s\" doesn't exist " diff --git a/src/source-pcap-file.c b/src/source-pcap-file.c index 543c936b0e..ae356caf25 100644 --- a/src/source-pcap-file.c +++ b/src/source-pcap-file.c @@ -51,8 +51,7 @@ #include "util-cuda-handlers.h" #include "detect-engine.h" #include "detect-engine-mpm.h" - -static DetectEngineCtx *cuda_de_ctx = NULL; +#include "util-cuda-vars.h" #endif /* __SC_CUDA_SUPPORT__ */ @@ -124,15 +123,6 @@ void TmModuleDecodePcapFileRegister (void) { tmm_modules[TMM_DECODEPCAPFILE].flags = TM_FLAG_DECODE_TM; } -#ifdef __SC_CUDA_SUPPORT__ -void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx) -{ - cuda_de_ctx = de_ctx; - - return; -} -#endif - void PcapFileCallbackLoop(char *user, struct pcap_pkthdr *h, u_char *pkt) { SCEnter(); @@ -342,90 +332,6 @@ TmEcode ReceivePcapFileThreadDeinit(ThreadVars *tv, void *data) { SCReturnInt(TM_ECODE_OK); } -#ifdef __SC_CUDA_SUPPORT__ - -static inline void DecodePcapFileBufferPacket(DecodeThreadVars *dtv, Packet *p) -{ - if 
(p->cuda_mpm_enabled) { - while (!p->cuda_done) { - SCMutexLock(&p->cuda_mutex); - if (p->cuda_done) { - SCMutexUnlock(&p->cuda_mutex); - break; - } else { - SCCondWait(&p->cuda_cond, &p->cuda_mutex); - SCMutexUnlock(&p->cuda_mutex); - } - } - } - p->cuda_done = 0; - - if (p->payload_len == 0 || - (p->flags & (PKT_NOPAYLOAD_INSPECTION & PKT_NOPACKET_INSPECTION)) || - (p->flags & PKT_ALLOC) || - (dtv->data_buffer_size_min_limit != 0 && p->payload_len < dtv->data_buffer_size_min_limit) || - (p->payload_len > dtv->data_buffer_size_max_limit && dtv->data_buffer_size_max_limit != 0) ) { - p->cuda_mpm_enabled = 0; - return; - } - - MpmCtx *mpm_ctx = NULL; - if (p->proto == IPPROTO_TCP) { - if (p->flowflags & FLOW_PKT_TOSERVER) - mpm_ctx = dtv->mpm_proto_tcp_ctx_ts; - else - mpm_ctx = dtv->mpm_proto_tcp_ctx_tc; - } else if (p->proto == IPPROTO_UDP) { - if (p->flowflags & FLOW_PKT_TOSERVER) - mpm_ctx = dtv->mpm_proto_udp_ctx_ts; - else - mpm_ctx = dtv->mpm_proto_udp_ctx_tc; - } else { - mpm_ctx = dtv->mpm_proto_other_ctx; - } - if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) { - p->cuda_mpm_enabled = 0; - return; - } - -#if __WORDSIZE==64 - CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb, - p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr), - (void *)p); - if (slice == NULL) { - SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report " - "this to dev."); - p->cuda_mpm_enabled = 0; - return; - } - *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len; - *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; - memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); -#else - CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb, - p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr), - (void *)p); - if (slice == NULL) { - SCLogError(SC_ERR_FATAL, "Error retrieving slice. 
Please report " - "this to dev."); - p->cuda_mpm_enabled = 0; - return; - } - *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len; - *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; - memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); -#endif - p->cuda_mpm_enabled = 1; - SC_ATOMIC_SET(slice->done, 1); - - SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n", - p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda); - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ - double prev_signaled_ts = 0; TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *postpq) @@ -457,7 +363,6 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P TimeSet(&p->ts); /* call the decoder */ - pcap_g.Decoder(tv, dtv, p, GET_PKT_DATA(p), GET_PKT_LEN(p), pq); #ifdef DEBUG @@ -465,41 +370,13 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P #endif #ifdef __SC_CUDA_SUPPORT__ - if (dtv->mpm_is_cuda) - DecodePcapFileBufferPacket(dtv, p); + if (dtv->cuda_vars.mpm_is_cuda) + CudaBufferPacket(&dtv->cuda_vars, p); #endif SCReturnInt(TM_ECODE_OK); } -#ifdef __SC_CUDA_SUPPORT__ - -static int DecodePcapFileThreadInitCuda(DecodeThreadVars *dtv) -{ - if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) - return 0; - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - dtv->mpm_is_cuda = 1; - dtv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - dtv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit; - dtv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit; - dtv->mpm_proto_tcp_ctx_ts = 
MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0); - dtv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1); - dtv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0); - dtv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1); - dtv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0); - - return 0; -} - -#endif /* __SC_CUDA_SUPPORT__ */ - TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data) { SCEnter(); @@ -512,7 +389,7 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data) DecodeRegisterPerfCounters(dtv, tv); #ifdef __SC_CUDA_SUPPORT__ - if (DecodePcapFileThreadInitCuda(dtv) < 0) + if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) SCReturnInt(TM_ECODE_FAILED); #endif diff --git a/src/source-pcap-file.h b/src/source-pcap-file.h index 7646a25343..67bd261b60 100644 --- a/src/source-pcap-file.h +++ b/src/source-pcap-file.h @@ -26,9 +26,6 @@ void TmModuleReceivePcapFileRegister (void); void TmModuleDecodePcapFileRegister (void); -#ifdef __SC_CUDA_SUPPORT__ -void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx); -#endif #endif /* __SOURCE_PCAP_FILE_H__ */ diff --git a/src/suricata.c b/src/suricata.c index 09420db5d3..3f58191703 100644 --- a/src/suricata.c +++ b/src/suricata.c @@ -1829,7 +1829,7 @@ int main(int argc, char **argv) } #ifdef __SC_CUDA_SUPPORT__ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - DecodePcapFileSetCudaDeCtx(de_ctx); + CudaVarsSetDeCtx(de_ctx); #endif /* __SC_CUDA_SUPPORT__ */ SCClassConfLoadClassficationConfigFile(de_ctx); diff --git a/src/util-cuda-buffer.c b/src/util-cuda-buffer.c index 8286761c00..a5401f6a68 100644 --- a/src/util-cuda-buffer.c +++ b/src/util-cuda-buffer.c @@ -272,7 +272,7 @@ 
CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void if (cb_data->d_buffer_write < cb_data->d_buffer_read) { if (cb_data->d_buffer_write + len >= cb_data->d_buffer_read) { - SCLogInfo("d_buffer full"); + SCLogDebug("d_buffer full"); SCMutexUnlock(&cb_data->m); SCMutexLock(&slice_pool_mutex); @@ -282,7 +282,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void } } else { if (cb_data->d_buffer_write + len > cb_data->d_buffer_len) { - SCLogInfo("d_buffer limit hit - buffer_len - %"PRIu32, + SCLogDebug("d_buffer limit hit - buffer_len - %"PRIu32, cb_data->d_buffer_len); SCMutexUnlock(&cb_data->m); @@ -295,7 +295,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void if (cb_data->op_buffer_write < cb_data->op_buffer_read) { if (cb_data->op_buffer_write + 1 >= cb_data->op_buffer_read) { - SCLogInfo("op_buffer full"); + SCLogDebug("op_buffer full"); SCMutexUnlock(&cb_data->m); SCMutexLock(&slice_pool_mutex); @@ -305,7 +305,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void } } else { if (cb_data->op_buffer_write + 1 > cb_data->op_buffer_len) { - SCLogInfo("op_buffer limit hit - buffer_len - %"PRIu32, + SCLogDebug("op_buffer limit hit - buffer_len - %"PRIu32, cb_data->op_buffer_len); SCMutexUnlock(&cb_data->m); @@ -866,7 +866,7 @@ int CudaBufferTest02(void) int CudaBufferTest03(void) { - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; + CudaBufferSlice *slice, *slice_temp; int result = 0; uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); @@ -886,9 +886,9 @@ int CudaBufferTest03(void) goto end; } - slice1 = CudaBufferGetSlice(data, 16, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - slice3 = CudaBufferGetSlice(data, 24, NULL); + slice = CudaBufferGetSlice(data, 16, NULL); + slice = CudaBufferGetSlice(data, 16, NULL); + slice = CudaBufferGetSlice(data, 24, NULL); /* culling */ CudaBufferCulledInfo culled_info; diff --git 
a/src/util-cuda-vars.c b/src/util-cuda-vars.c new file mode 100644 index 0000000000..624c09f851 --- /dev/null +++ b/src/util-cuda-vars.c @@ -0,0 +1,74 @@ +/* Copyright (C) 2007-2010 Open Information Security Foundation + * + * You can copy, redistribute or modify this Program under the terms of + * the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/** + * \file + * + * \author Anoop Saldanha + */ + +#ifdef __SC_CUDA_SUPPORT__ + +#include "suricata.h" +#include "util-mpm.h" +#include "util-cuda-handlers.h" +#include "util-cuda-vars.h" +#include "detect-engine-mpm.h" +#include "util-debug.h" +#include "util-mpm-ac.h" + +static DetectEngineCtx *cuda_de_ctx = NULL; + +void CudaVarsSetDeCtx(DetectEngineCtx *de_ctx) +{ + if (cuda_de_ctx != NULL) { + SCLogError(SC_ERR_FATAL, "CudaVarsSetDeCtx() called more than once. 
" + "This function should be called only once during the " + "lifetime of the engine."); + exit(EXIT_FAILURE); + } + + cuda_de_ctx = de_ctx; + + return; +} + +int CudaThreadVarsInit(CudaThreadVars *ctv) +{ + if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) + return 0; + + MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); + if (conf == NULL) { + SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); + return -1; + } + + ctv->mpm_is_cuda = 1; + ctv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); + ctv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit; + ctv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit; + ctv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0); + ctv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1); + ctv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0); + ctv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1); + ctv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0); + + return 0; +} + +#endif diff --git a/src/util-cuda-vars.h b/src/util-cuda-vars.h new file mode 100644 index 0000000000..9c24a915ba --- /dev/null +++ b/src/util-cuda-vars.h @@ -0,0 +1,65 @@ +/* Copyright (C) 2007-2010 Open Information Security Foundation + * + * You can copy, redistribute or modify this Program under the terms of + * the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/** + * \file + * + * \author Anoop Saldanha + */ + +#ifdef __SC_CUDA_SUPPORT__ + +#ifndef __UTIL_CUDA_VARS__H__ +#define __UTIL_CUDA_VARS__H__ + +#include "util-cuda-buffer.h" +#include "util-mpm.h" +#include "threads.h" + +typedef struct CudaThreadVars_ { + /* cb - CudaBuffer */ + CudaBufferData *cuda_ac_cb; + + MpmCtx *mpm_proto_other_ctx; + + MpmCtx *mpm_proto_tcp_ctx_ts; + MpmCtx *mpm_proto_udp_ctx_ts; + + MpmCtx *mpm_proto_tcp_ctx_tc; + MpmCtx *mpm_proto_udp_ctx_tc; + + uint16_t data_buffer_size_max_limit; + uint16_t data_buffer_size_min_limit; + + uint8_t mpm_is_cuda; +} CudaThreadVars; + +typedef struct CudaPacketVars_ { + uint8_t cuda_mpm_enabled; + uint8_t cuda_done; + uint16_t cuda_gpu_matches; + SCMutex cuda_mutex; + SCCondT cuda_cond; + uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1]; +} CudaPacketVars; + +void CudaVarsSetDeCtx(struct DetectEngineCtx_ *de_ctx); +int CudaThreadVarsInit(CudaThreadVars *ctv); + +#endif /* __UTIL_CUDA_VARS__H__ */ + +#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-mpm-ac.c b/src/util-mpm-ac.c index 7e9d542ba3..a8562a5236 100644 --- a/src/util-mpm-ac.c +++ b/src/util-mpm-ac.c @@ -1408,6 +1408,257 @@ void SCACPrintInfo(MpmCtx *mpm_ctx) /****************************Cuda side of things****************************/ #ifdef __SC_CUDA_SUPPORT__ + +/* \todo Technically it's generic to all mpms, but since we use ac only, the + * code internally directly references ac and hence it has found its + * home in this file, instead of util-mpm.c + */ +void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx) +{ + MpmCtx *mpm_ctx = NULL; + + int ac_16_tables = 0; + int ac_32_tables = 0; + + mpm_ctx = 
MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + 
ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + 
else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + 
ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1); + if (mpm_ctx->mpm_type == MPM_AC_CUDA) { + SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; + if (ctx->state_count < 32767) + ac_16_tables++; + else + ac_32_tables++; + } + + if (ac_16_tables > 0 && ac_32_tables > 0) + SCACConstructBoth16and32StateTables(); + + + SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables); + SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables); + +} + /* \todos * - Use texture memory - Can we fit all the arrays into a 3d texture. 
* Texture memory definitely offers slightly better performance even @@ -1607,20 +1858,20 @@ static void *SCACCudaDispatcher(void *arg) for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) { Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset]; - p->cuda_gpu_matches = + p->cuda_pkt_vars.cuda_gpu_matches = cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)]; - if (p->cuda_gpu_matches != 0) { - memcpy(p->cuda_results, + if (p->cuda_pkt_vars.cuda_gpu_matches != 0) { + memcpy(p->cuda_pkt_vars.cuda_results, cuda_results_buffer_h + ((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2), (cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4); } - SCMutexLock(&p->cuda_mutex); - p->cuda_done = 1; - SCMutexUnlock(&p->cuda_mutex); - SCCondSignal(&p->cuda_cond); + SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); + p->cuda_pkt_vars.cuda_done = 1; + SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); + SCCondSignal(&p->cuda_pkt_vars.cuda_cond); } if (no_of_items != 0) CudaBufferReportCulledConsumption(cb_data, &cb_culled_info); @@ -1666,25 +1917,25 @@ uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx, { uint32_t u = 0; - while (!p->cuda_done) { - SCMutexLock(&p->cuda_mutex); - if (p->cuda_done) { - SCMutexUnlock(&p->cuda_mutex); + while (!p->cuda_pkt_vars.cuda_done) { + SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); + if (p->cuda_pkt_vars.cuda_done) { + SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); break; } else { - SCCondWait(&p->cuda_cond, &p->cuda_mutex); - SCMutexUnlock(&p->cuda_mutex); + SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex); + SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); } } /* while */ - p->cuda_done = 0; - p->cuda_mpm_enabled = 0; + p->cuda_pkt_vars.cuda_done = 0; + p->cuda_pkt_vars.cuda_mpm_enabled = 0; - uint32_t cuda_matches = p->cuda_gpu_matches; + uint32_t cuda_matches = p->cuda_pkt_vars.cuda_gpu_matches; if (cuda_matches == 0) 
return 0; uint32_t matches = 0; - uint32_t *results = p->cuda_results + 1; + uint32_t *results = p->cuda_pkt_vars.cuda_results + 1; uint8_t *buf = p->payload; SCACCtx *ctx = mpm_ctx->ctx; SCACOutputTable *output_table = ctx->output_table; diff --git a/src/util-mpm-ac.h b/src/util-mpm-ac.h index a9df2368cf..d398f07aa3 100644 --- a/src/util-mpm-ac.h +++ b/src/util-mpm-ac.h @@ -30,6 +30,8 @@ #ifdef __SC_CUDA_SUPPORT__ #include "util-cuda.h" +#include "util-cuda-vars.h" +#include "decode.h" #endif /* __SC_CUDA_SUPPORT__ */ typedef struct SCACPattern_ { @@ -108,6 +110,107 @@ void MpmACRegister(void); #define MPM_AC_CUDA_MODULE_NAME "ac_cuda" #define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb"
+/**
+ * \brief Buffer a packet's payload on the cuda ac buffer for gpu matching.
+ *
+ *        Waits first for any gpu run still outstanding on this packet.
+ *        Packets that don't qualify (empty payload, inspection disabled,
+ *        PKT_ALLOC set, payload length outside the configured limits, or no
+ *        patterns in the selected mpm ctx) get cuda_mpm_enabled reset to 0
+ *        and are not buffered.
+ *
+ *        Slice layout: payload length (uint64_t or uint32_t, depending on
+ *        __WORDSIZE), the state table's CUdeviceptr, then the payload.
+ *
+ * \param ctv Thread's cuda vars - cuda buffer, mpm ctxs and size limits.
+ * \param p   Packet to buffer.  p->cuda_pkt_vars is updated.
+ */
+static inline void CudaBufferPacket(CudaThreadVars *ctv, Packet *p)
+{
+    /* a previous gpu run on this packet hasn't been reported as done yet */
+    if (p->cuda_pkt_vars.cuda_mpm_enabled) {
+        while (!p->cuda_pkt_vars.cuda_done) {
+            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+            if (p->cuda_pkt_vars.cuda_done) {
+                SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+                break;
+            } else {
+                SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex);
+                SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+            }
+        }
+    }
+    p->cuda_pkt_vars.cuda_done = 0;
+
+    /* The two inspection flags have to be OR'ed into the mask.  AND'ing two
+     * distinct flag bits gives an empty mask, in which case this check
+     * could never fire. */
+    if (p->payload_len == 0 ||
+        (p->flags & (PKT_NOPAYLOAD_INSPECTION | PKT_NOPACKET_INSPECTION)) ||
+        (p->flags & PKT_ALLOC) ||
+        (ctv->data_buffer_size_min_limit != 0 && p->payload_len < ctv->data_buffer_size_min_limit) ||
+        (p->payload_len > ctv->data_buffer_size_max_limit && ctv->data_buffer_size_max_limit != 0) ) {
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+
+    /* pick the mpm ctx based on the protocol and the flow direction */
+    MpmCtx *mpm_ctx = NULL;
+    if (p->proto == IPPROTO_TCP) {
+        if (p->flowflags & FLOW_PKT_TOSERVER)
+            mpm_ctx = ctv->mpm_proto_tcp_ctx_ts;
+        else
+            mpm_ctx = ctv->mpm_proto_tcp_ctx_tc;
+    } else if (p->proto == IPPROTO_UDP) {
+        if (p->flowflags & FLOW_PKT_TOSERVER)
+            mpm_ctx = ctv->mpm_proto_udp_ctx_ts;
+        else
+            mpm_ctx = ctv->mpm_proto_udp_ctx_tc;
+    } else {
+        mpm_ctx = ctv->mpm_proto_other_ctx;
+    }
+    if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) {
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+
+    /* slice = length header + state table device pointer + payload */
+#if __WORDSIZE==64
+    CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb,
+                                                p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr),
+                                                (void *)p);
+    if (slice == NULL) {
+        SCLogError(SC_ERR_FATAL, "Error retrieving slice.  Please report "
+                   "this to dev.");
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+    *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
+    *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
+    memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
+#else
+    CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb,
+                                                p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr),
+                                                (void *)p);
+    if (slice == NULL) {
+        SCLogError(SC_ERR_FATAL, "Error retrieving slice.  Please report "
+                   "this to dev.");
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+    *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
+    *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
+    memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
+#endif
+    /* the slice is fully written at this point - flag it as done */
+    p->cuda_pkt_vars.cuda_mpm_enabled = 1;
+    SC_ATOMIC_SET(slice->done, 1);
+
+    SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n",
+               p, p->payload_len, (uint64_t)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda);
+
+    return;
+}
 void MpmACCudaRegister(void); void SCACConstructBoth16and32StateTables(void); @@ -117,6 +220,7 @@ void SCACCudaStartDispatcher(void); void SCACCudaKillDispatcher(void); uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx, PatternMatcherQueue *pmq); +void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx); #endif /* __SC_CUDA_SUPPORT__ */ diff --git
a/suricata.yaml.in b/suricata.yaml.in index 4faae83722..891e7e4c6f 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -10,9 +10,9 @@ # conservative 1024. A higher number will make sure CPU's/CPU cores will be # more easily kept busy, but may negatively impact caching. # -# If you are using the CUDA pattern matcher (b2g_cuda below), different rules -# apply. In that case try something like 4000 or more. This is because the CUDA -# pattern matcher scans many packets in parallel. +# If you are using the CUDA pattern matcher (mpm-algo: ac-cuda), different rules +# apply. In that case try something like 60000 or more. This is because the CUDA +# pattern matcher buffers and scans as many packets as possible in parallel. #max-pending-packets: 1024 # Runmode the engine should use. Please check --list-runmodes to get the available