util-crypt.c util-crypt.h \
util-cuda.c util-cuda.h \
util-cuda-buffer.c util-cuda-buffer.h \
+util-cuda-handlers.c util-cuda-handlers.h \
util-daemon.c util-daemon.h \
util-debug.c util-debug.h \
util-debug-filters.c util-debug-filters.h \
win32-service.c win32-service.h \
win32-syslog.h
-EXTRA_DIST = util-mpm-b2g-cuda-kernel.cu ptxdump.py
+EXTRA_DIST = util-mpm-ac-cuda-kernel.cu ptxdump.py
# set the include path found by configure
INCLUDES= $(all_includes)
suricata_SOURCES += cuda-ptxdump.h
suricata_CUDA_KERNELS = \
-util-mpm-b2g-cuda-kernel.cu
+util-mpm-ac-cuda-kernel.cu
NVCCFLAGS=-O2
.ptx_sm_12 \
.ptx_sm_13 \
.ptx_sm_20 \
-.ptx_sm_21
+.ptx_sm_21 \
+.ptx_sm_30
PTXS = $(suricata_CUDA_KERNELS:.cu=.ptx_sm_10)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_11)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_13)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21)
+PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_30)
.cu.ptx_sm_10:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_10 -ptx $<
.cu.ptx_sm_21:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $<
+.cu.ptx_sm_30:
+ $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_30 -ptx $<
+
cuda-ptxdump.h: $(PTXS)
$(PYTHON) ptxdump.py cuda-ptxdump $(PTXS)
#define COUNTERS
#include "suricata-common.h"
-
#include "threadvars.h"
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda-buffer.h"
+#endif /* __SC_CUDA_SUPPORT__ */
+
typedef enum {
CHECKSUM_VALIDATION_DISABLE,
CHECKSUM_VALIDATION_ENABLE,
#ifdef PROFILING
PktProfiling profile;
#endif
+#ifdef __SC_CUDA_SUPPORT__
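+    /* cuda mpm state. The dispatcher thread sets cuda_done and signals
+     * cuda_cond once the gpu results for this packet have been copied into
+     * cuda_results, whose first entry holds the count of the (offset, state)
+     * result values that follow */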
+ uint8_t cuda_mpm_enabled;
+ uint8_t cuda_done;
+ uint16_t cuda_gpu_matches;
+ SCMutex cuda_mutex;
+ SCCondT cuda_cond;
+ uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1];
+#endif
} Packet;
#define DEFAULT_PACKET_SIZE (1500 + ETHERNET_HEADER_LEN)
uint16_t counter_defrag_ipv6_reassembled;
uint16_t counter_defrag_ipv6_timeouts;
uint16_t counter_defrag_max_hit;
+
+#ifdef __SC_CUDA_SUPPORT__
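+    /* decode-thread view of the shared cuda mpm state: the buffer payloads
+     * are staged in, the mpm contexts the state table is picked from, and
+     * the conf limits on the payload sizes sent to the gpu */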
+ /* cb - CudaBuffer */
+ CudaBufferData *cuda_ac_cb;
+
+ MpmCtx *mpm_proto_other_ctx;
+
+ MpmCtx *mpm_proto_tcp_ctx_ts;
+ MpmCtx *mpm_proto_udp_ctx_ts;
+
+ MpmCtx *mpm_proto_tcp_ctx_tc;
+ MpmCtx *mpm_proto_udp_ctx_tc;
+
+ uint16_t data_buffer_size_max_limit;
+ uint16_t data_buffer_size_min_limit;
+
+ uint8_t mpm_is_cuda;
+#endif
} DecodeThreadVars;
/**
/**
* \brief Initialize a packet structure for use.
*/
-#define PACKET_INITIALIZE(p) { \
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda-handlers.h"
+#include "util-mpm.h"
+
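+/* the cuda build additionally zeroes the packet and initializes the
+ * mutex/cond pair used to synchronize with the mpm dispatcher thread */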
+#define PACKET_INITIALIZE(p) do { \
+ memset((p), 0x00, SIZE_OF_PACKET); \
+ SCMutexInit(&(p)->tunnel_mutex, NULL); \
+ PACKET_RESET_CHECKSUMS((p)); \
+ (p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \
+ (p)->livedev = NULL; \
+ SCMutexInit(&(p)->cuda_mutex, NULL); \
+ SCCondInit(&(p)->cuda_cond, NULL); \
+ } while (0)
+#else
+#define PACKET_INITIALIZE(p) { \
SCMutexInit(&(p)->tunnel_mutex, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
(p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \
(p)->livedev = NULL; \
}
+#endif
/**
* \brief Recycle a packet structure for reuse.
#include "util-debug.h"
#include "util-print.h"
#include "util-memcmp.h"
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-mpm-ac.h"
+#endif
/** \todo make it possible to use multiple pattern matcher algorithms next to
    each other. */
if (mpm_ctx == NULL)
SCReturnInt(0);
+#ifdef __SC_CUDA_SUPPORT__
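+    /* wire packets staged on the gpu pick their results up from the cuda
+     * dispatcher; everything else falls back to the regular cpu search */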
+ if (p->cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) {
+ ret = SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq);
+ } else {
+ ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
+ &det_ctx->mtc,
+ &det_ctx->pmq,
+ p->payload,
+ p->payload_len);
+ }
+#else
ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
+#endif
SCReturnInt(ret);
}
/* for now, since we still haven't implemented any intelligence into
* understanding the patterns and distributing mpm_ctx across sgh */
if (de_ctx->mpm_matcher == MPM_AC || de_ctx->mpm_matcher == MPM_AC_GFBS ||
+#ifdef __SC_CUDA_SUPPORT__
+ de_ctx->mpm_matcher == MPM_AC_BS || de_ctx->mpm_matcher == MPM_AC_CUDA) {
+#else
de_ctx->mpm_matcher == MPM_AC_BS) {
+#endif
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE;
} else {
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL;
if (strcmp(sgh_mpm_context, "single") == 0) {
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE;
} else if (strcmp(sgh_mpm_context, "full") == 0) {
+#ifdef __SC_CUDA_SUPPORT__
+ if (de_ctx->mpm_matcher == MPM_AC_CUDA) {
+            SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You can't use "
+                       "the cuda version of our mpm ac, i.e. \"ac-cuda\", "
+                       "along with the \"full\" \"sgh-mpm-context\" option. "
+                       "Allowed values are \"single\" and \"auto\".");
+ exit(EXIT_FAILURE);
+ }
+#endif
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL;
} else {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You have supplied an "
#include "util-optimize.h"
#include "util-vector.h"
#include "util-path.h"
+#include "util-mpm-ac.h"
#include "runmodes.h"
}
if (p->payload_len > 0 && (!(p->flags & PKT_NOPAYLOAD_INSPECTION))) {
+ if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) {
+ *sms_runflags |= SMS_USED_PM;
+ PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM);
+ PacketPatternSearchWithStreamCtx(det_ctx, p);
+ PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM);
+ }
if (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_PACKET) {
/* run the multi packet matcher against the payload of the packet */
SCLogDebug("search: (%p, maxlen %" PRIu32 ", sgh->sig_cnt %" PRIu32 ")",
*sms_runflags |= SMS_USED_PM;
}
- if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) {
- *sms_runflags |= SMS_USED_PM;
- PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM);
- PacketPatternSearchWithStreamCtx(det_ctx, p);
- PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM);
- }
}
}
return 0;
}
+#ifdef __SC_CUDA_SUPPORT__
+
+/* classify one mpm context's state table as 16 or 32 bit, using the same
+ * state count cutoff SCACPrepareStateTable() uses */
+static void DetermineCudaStateTableSizeForCtx(MpmCtx *mpm_ctx,
+                                              int *ac_16_tables,
+                                              int *ac_32_tables)
+{
+    if (mpm_ctx->mpm_type != MPM_AC_CUDA)
+        return;
+
+    SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
+    if (ctx->state_count < 32767)
+        (*ac_16_tables)++;
+    else
+        (*ac_32_tables)++;
+}
+
+static void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)
+{
+    int ac_16_tables = 0;
+    int ac_32_tables = 0;
+
+    /* every context profile below is checked for both directions (0 and 1),
+     * except "proto other" which only has the one direction-less context */
+    int32_t contexts[] = {
+        de_ctx->sgh_mpm_context_proto_tcp_packet,
+        de_ctx->sgh_mpm_context_proto_udp_packet,
+        de_ctx->sgh_mpm_context_uri,
+        de_ctx->sgh_mpm_context_hcbd,
+        de_ctx->sgh_mpm_context_hhd,
+        de_ctx->sgh_mpm_context_hrhd,
+        de_ctx->sgh_mpm_context_hmd,
+        de_ctx->sgh_mpm_context_hcd,
+        de_ctx->sgh_mpm_context_hrud,
+        de_ctx->sgh_mpm_context_stream,
+        de_ctx->sgh_mpm_context_hsmd,
+        de_ctx->sgh_mpm_context_hscd,
+        de_ctx->sgh_mpm_context_huad,
+    };
+    size_t i;
+    for (i = 0; i < sizeof(contexts) / sizeof(contexts[0]); i++) {
+        DetermineCudaStateTableSizeForCtx(
+            MpmFactoryGetMpmCtxForProfile(de_ctx, contexts[i], 0),
+            &ac_16_tables, &ac_32_tables);
+        DetermineCudaStateTableSizeForCtx(
+            MpmFactoryGetMpmCtxForProfile(de_ctx, contexts[i], 1),
+            &ac_16_tables, &ac_32_tables);
+    }
+    DetermineCudaStateTableSizeForCtx(
+        MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0),
+        &ac_16_tables, &ac_32_tables);
+
+    if (ac_16_tables > 0 && ac_32_tables > 0)
+        SCACConstructBoth16and32StateTables();
+
+    SCLogDebug("Total mpm ac 16 bit state tables - %d", ac_16_tables);
+    SCLogDebug("Total mpm ac 32 bit state tables - %d", ac_32_tables);
+}
+#endif
+
/**
* \brief Convert the signature list into the runtime match structure.
*
if (de_ctx->sgh_mpm_context == ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE) {
MpmCtx *mpm_ctx = NULL;
+
+#ifdef __SC_CUDA_SUPPORT__
+ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
+        /* defaulting to building both for now. Remove this once the state
+         * table sizing (see DetermineCudaStateTableSize()) is fixed */
+ SCACConstructBoth16and32StateTables();
+
+ MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+ CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
+ if (cuda_context == 0) {
+ SCLogError(SC_ERR_FATAL, "cuda context is NULL.");
+ exit(EXIT_FAILURE);
+ }
+ int r = SCCudaCtxPushCurrent(cuda_context);
+ if (r < 0) {
+ SCLogError(SC_ERR_FATAL, "context push failed.");
+ exit(EXIT_FAILURE);
+ }
+ }
+#endif
+
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hsmd- %d\n", mpm_ctx->pattern_cnt);
-
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hscd- %d\n", mpm_ctx->pattern_cnt);
-
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("huad- %d\n", mpm_ctx->pattern_cnt);
-
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hrhhd- %d\n", mpm_ctx->pattern_cnt);
+
+#ifdef __SC_CUDA_SUPPORT__
+ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
+ int r = SCCudaCtxPopCurrent(NULL);
+ if (r < 0) {
+ SCLogError(SC_ERR_FATAL, "cuda context pop failure.");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+    /* too late to call this either way. Should be called post ac goto
+     * table construction.
+     * \todo Support this. */
+ DetermineCudaStateTableSize(de_ctx);
+#endif
+
}
// SigAddressPrepareStage5(de_ctx);
#include "util-profiling.h"
#include "runmode-unix-socket.h"
+#ifdef __SC_CUDA_SUPPORT__
+
+#include "util-cuda.h"
+#include "util-cuda-buffer.h"
+#include "util-mpm-ac.h"
+#include "util-cuda-handlers.h"
+#include "detect-engine.h"
+#include "detect-engine-mpm.h"
+
+static DetectEngineCtx *cuda_de_ctx = NULL;
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
extern uint8_t suricata_ctl_flags;
extern int max_pending_packets;
tmm_modules[TMM_DECODEPCAPFILE].flags = TM_FLAG_DECODE_TM;
}
+#ifdef __SC_CUDA_SUPPORT__
+void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx)
+{
+ cuda_de_ctx = de_ctx;
+
+ return;
+}
+#endif
+
void PcapFileCallbackLoop(char *user, struct pcap_pkthdr *h, u_char *pkt) {
SCEnter();
SCReturnInt(TM_ECODE_OK);
}
+#ifdef __SC_CUDA_SUPPORT__
+
+static inline void DecodePcapFileBufferPacket(DecodeThreadVars *dtv, Packet *p)
+{
+ if (p->cuda_mpm_enabled) {
+ while (!p->cuda_done) {
+ SCMutexLock(&p->cuda_mutex);
+ if (p->cuda_done) {
+ SCMutexUnlock(&p->cuda_mutex);
+ break;
+ } else {
+ SCCondWait(&p->cuda_cond, &p->cuda_mutex);
+ SCMutexUnlock(&p->cuda_mutex);
+ }
+ }
+ }
+ p->cuda_done = 0;
+
+ if (p->payload_len == 0 ||
+        (p->flags & (PKT_NOPAYLOAD_INSPECTION | PKT_NOPACKET_INSPECTION)) ||
+ (p->flags & PKT_ALLOC) ||
+ (dtv->data_buffer_size_min_limit != 0 && p->payload_len < dtv->data_buffer_size_min_limit) ||
+ (p->payload_len > dtv->data_buffer_size_max_limit && dtv->data_buffer_size_max_limit != 0) ) {
+ p->cuda_mpm_enabled = 0;
+ return;
+ }
+
+ MpmCtx *mpm_ctx = NULL;
+ if (p->proto == IPPROTO_TCP) {
+ if (p->flowflags & FLOW_PKT_TOSERVER)
+ mpm_ctx = dtv->mpm_proto_tcp_ctx_ts;
+ else
+ mpm_ctx = dtv->mpm_proto_tcp_ctx_tc;
+ } else if (p->proto == IPPROTO_UDP) {
+ if (p->flowflags & FLOW_PKT_TOSERVER)
+ mpm_ctx = dtv->mpm_proto_udp_ctx_ts;
+ else
+ mpm_ctx = dtv->mpm_proto_udp_ctx_tc;
+ } else {
+ mpm_ctx = dtv->mpm_proto_other_ctx;
+ }
+ if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) {
+ p->cuda_mpm_enabled = 0;
+ return;
+ }
+
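+    /* stage the payload in the CudaBuffer. Each slice is laid out as
+     * [payload_len][state table device pointer][payload bytes], with the
+     * header fields sized to the host word size so the matching kernel
+     * (SCACCudaSearch64/SCACCudaSearch32) can walk the buffer */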
+#if __WORDSIZE==64
+ CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
+ p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr),
+ (void *)p);
+ if (slice == NULL) {
+ SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report "
+ "this to dev.");
+ p->cuda_mpm_enabled = 0;
+ return;
+ }
+ *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
+ *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
+ memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
+#else
+ CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
+ p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr),
+ (void *)p);
+ if (slice == NULL) {
+ SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report "
+ "this to dev.");
+ p->cuda_mpm_enabled = 0;
+ return;
+ }
+ *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
+ *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
+ memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
+#endif
+ p->cuda_mpm_enabled = 1;
+ SC_ATOMIC_SET(slice->done, 1);
+
+ SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n",
+ p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda);
+
+ return;
+}
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
double prev_signaled_ts = 0;
TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *postpq)
TimeSet(&p->ts);
/* call the decoder */
+
pcap_g.Decoder(tv, dtv, p, GET_PKT_DATA(p), GET_PKT_LEN(p), pq);
+#ifdef DEBUG
+ BUG_ON(p->pkt_src != PKT_SRC_WIRE && p->pkt_src != PKT_SRC_FFR_V2);
+#endif
+
+#ifdef __SC_CUDA_SUPPORT__
+ if (dtv->mpm_is_cuda)
+ DecodePcapFileBufferPacket(dtv, p);
+#endif
+
SCReturnInt(TM_ECODE_OK);
}
+#ifdef __SC_CUDA_SUPPORT__
+
+static int DecodePcapFileThreadInitCuda(DecodeThreadVars *dtv)
+{
+ if (PatternMatchDefaultMatcher() != MPM_AC_CUDA)
+ return 0;
+
+ MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+ if (conf == NULL) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+ return -1;
+ }
+
+ dtv->mpm_is_cuda = 1;
+ dtv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
+ dtv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit;
+ dtv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit;
+ dtv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
+ dtv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
+ dtv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0);
+ dtv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1);
+ dtv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0);
+
+ return 0;
+}
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
{
SCEnter();
DecodeRegisterPerfCounters(dtv, tv);
+#ifdef __SC_CUDA_SUPPORT__
+ if (DecodePcapFileThreadInitCuda(dtv) < 0)
+ SCReturnInt(TM_ECODE_FAILED);
+#endif
+
*data = (void *)dtv;
SCReturnInt(TM_ECODE_OK);
void TmModuleReceivePcapFileRegister (void);
void TmModuleDecodePcapFileRegister (void);
+#ifdef __SC_CUDA_SUPPORT__
+void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx);
+#endif
#endif /* __SOURCE_PCAP_FILE_H__ */
#include "util-memcmp.h"
#include "util-proto-name.h"
#include "util-spm-bm.h"
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda-buffer.h"
+#include "util-mpm-ac.h"
+#endif
/*
* we put this here, because we only use it here in main.
SCCudaListCards();
exit(EXIT_SUCCESS);
}
+ CudaBufferInit();
#endif
if (!CheckValidDaemonModes(daemon, run_mode)) {
TimeInit();
SupportFastPatternForSigMatchTypes();
- /* load the pattern matchers */
- MpmTableSetup();
-
if (run_mode != RUNMODE_UNITTEST &&
!list_keywords &&
!list_app_layer_protocols) {
}
}
+ /* load the pattern matchers */
+ MpmTableSetup();
+
AppLayerDetectProtoThreadInit();
if (list_app_layer_protocols) {
AppLayerListSupportedProtocols();
DetectProtoTests();
DetectPortTests();
SCAtomicRegisterTests();
+#ifdef __SC_CUDA_SUPPORT__
+ CudaBufferRegisterUnittests();
+#endif
if (list_unittests) {
UtListTests(regex_arg);
}
"context failed.");
exit(EXIT_FAILURE);
}
+#ifdef __SC_CUDA_SUPPORT__
+ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
+ DecodePcapFileSetCudaDeCtx(de_ctx);
+#endif /* __SC_CUDA_SUPPORT__ */
SCClassConfLoadClassficationConfigFile(de_ctx);
SCRConfLoadReferenceConfigFile(de_ctx);
SCPerfSpawnThreads();
}
+#ifdef __SC_CUDA_SUPPORT__
+ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
+ SCACCudaStartDispatcher();
+#endif
+
/* Check if the alloted queues have at least 1 reader and writer */
TmValidateQueueState();
SC_ATOMIC_DESTROY(engine_stage);
+#ifdef __SC_CUDA_SUPPORT__
+ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
+ MpmCudaBufferDeSetup();
+ CudaHandlerFreeProfiles();
+#endif
+
exit(engine_retval);
}
--- /dev/null
+/* Copyright (C) 2007-2012 Open Information Security Foundation
+ *
+ * You can copy, redistribute or modify this Program under the terms of
+ * the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/**
+ * \file
+ *
+ * \author Anoop Saldanha <anoopsaldanha@gmail.com>
+ */
+
+/* compile in, only if we have a CUDA enabled device on the machine, with the
+ * toolkit and the driver installed */
+#ifdef __SC_CUDA_SUPPORT__
+
+#include "suricata-common.h"
+
+#include "util-error.h"
+#include "util-debug.h"
+#include "conf.h"
+#include "util-cuda.h"
+#include "util-cuda-handlers.h"
+
+/* file only exists if cuda is enabled */
+#include "cuda-ptxdump.h"
+
+/************************conf file profile section**********************/
+
+typedef struct CudaHandlerConfProfile_ {
+ char *name;
+ void *ctx;
+ void (*Free)(void *);
+
+ struct CudaHandlerConfProfile_ *next;
+} CudaHandlerConfProfile;
+
+static CudaHandlerConfProfile *conf_profiles = NULL;
+/* protects above var */
+static SCMutex mutex = PTHREAD_MUTEX_INITIALIZER;
+
+void CudaHandlerAddCudaProfileFromConf(const char *name,
+ void *(*Callback)(ConfNode *node),
+ void (*Free)(void *))
+{
+ /* we don't do data validation */
+ SCMutexLock(&mutex);
+
+ CudaHandlerConfProfile *tmp_cp = conf_profiles;
+ while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0)
+ tmp_cp = tmp_cp->next;
+
+ if (tmp_cp != NULL) {
+ SCLogError(SC_ERR_INVALID_ARGUMENT, "We already have a cuda conf "
+ "profile by the name \"%s\" registered.", name);
+ exit(EXIT_FAILURE);
+ }
+
+ char tmp[200];
+ int r = snprintf(tmp, sizeof(tmp), "%s%s", "cuda.", name);
+ if (r < 0) {
+ SCLogError(SC_ERR_FATAL, "snprintf failure.");
+ exit(EXIT_FAILURE);
+    } else if (r >= (int)sizeof(tmp)) {
+ SCLogError(SC_ERR_FATAL, "buffer not big enough to write param.");
+ exit(EXIT_FAILURE);
+ }
+ void *ctx = Callback(ConfGetNode(tmp));
+ if (ctx == NULL) {
+ SCMutexUnlock(&mutex);
+ return;
+ }
+
+ CudaHandlerConfProfile *new_cp = SCMalloc(sizeof(CudaHandlerConfProfile));
+ if (new_cp == NULL)
+ exit(EXIT_FAILURE);
+ memset(new_cp, 0, sizeof(CudaHandlerConfProfile));
+ new_cp->name = SCStrdup(name);
+ if (new_cp->name == NULL)
+ exit(EXIT_FAILURE);
+ new_cp->ctx = ctx;
+ new_cp->Free = Free;
+
+ if (conf_profiles == NULL) {
+ conf_profiles = new_cp;
+ } else {
+ new_cp->next = conf_profiles;
+ conf_profiles = new_cp;
+ }
+
+ SCMutexUnlock(&mutex);
+ return;
+}
+
+void *CudaHandlerGetCudaProfile(const char *name)
+{
+ SCMutexLock(&mutex);
+
+ CudaHandlerConfProfile *tmp_cp = conf_profiles;
+ while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0)
+ tmp_cp = tmp_cp->next;
+
+ if (tmp_cp == NULL) {
+ SCMutexUnlock(&mutex);
+ return NULL;
+ }
+
+ SCMutexUnlock(&mutex);
+ return tmp_cp->ctx;
+}
+
+void CudaHandlerFreeProfiles(void)
+{
+ SCMutexLock(&mutex);
+
+ CudaHandlerConfProfile *tmp = conf_profiles;
+ while (tmp != NULL) {
+ CudaHandlerConfProfile *curr = tmp;
+ tmp = tmp->next;
+ SCFree(curr->name);
+ if (curr->Free != NULL)
+ curr->Free(curr->ctx);
+ SCFree(curr);
+ }
+
+ SCMutexUnlock(&mutex);
+ return;
+}
+
+/*******************cuda context related data section*******************/
+
+/* every gpu device gets exactly one context. When a section of the engine
+ * wants to use a device, we first check whether a context has already been
+ * created for that device and return it if so; otherwise we create a new
+ * one and record it in the table */
+
+static CUcontext *cuda_contexts = NULL;
+static int no_of_cuda_contexts = 0;
+
+typedef struct CudaHandlerModuleData_ {
+ char *name;
+ void *data;
+
+ struct CudaHandlerModuleData_ *next;
+} CudaHandlerModuleData;
+
+typedef struct CudaHandlerModule_ {
+ char *name;
+
+ /* the context used by this module */
+ CUcontext context;
+ /* the device on which the above context was created */
+ int device_id;
+ CudaHandlerModuleData *module_data;
+
+ struct CudaHandlerModule_ *next;
+} CudaHandlerModule;
+
+static CudaHandlerModule *cudahl_modules = NULL;
+
+CUcontext CudaHandlerModuleGetContext(const char *name, int device_id)
+{
+ SCMutexLock(&mutex);
+
+ CudaHandlerModule *module = cudahl_modules;
+ while (module != NULL && strcasecmp(module->name, name) != 0)
+ module = module->next;
+ if (module != NULL) {
+ if (module->device_id != device_id) {
+ SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module already "
+ "registered, but the new device_id is different "
+ "from the already registered device_id.");
+ exit(EXIT_FAILURE);
+ }
+ SCMutexUnlock(&mutex);
+ return module->context;
+ }
+
+ CudaHandlerModule *new_module = SCMalloc(sizeof(CudaHandlerModule));
+ if (new_module == NULL)
+ exit(EXIT_FAILURE);
+ memset(new_module, 0, sizeof(CudaHandlerModule));
+ new_module->device_id = device_id;
+ new_module->name = SCStrdup(name);
+ if (new_module->name == NULL)
+ exit(EXIT_FAILURE);
+ if (cudahl_modules == NULL) {
+ cudahl_modules = new_module;
+ } else {
+ new_module->next = cudahl_modules;
+ cudahl_modules = new_module;
+ }
+
+ if (no_of_cuda_contexts <= device_id) {
+ cuda_contexts = SCRealloc(cuda_contexts, sizeof(CUcontext) * (device_id + 1));
+ if (cuda_contexts == NULL)
+ exit(EXIT_FAILURE);
+ memset(cuda_contexts + no_of_cuda_contexts, 0,
+ sizeof(CUcontext) * ((device_id + 1) - no_of_cuda_contexts));
+ no_of_cuda_contexts = device_id + 1;
+ }
+
+ if (cuda_contexts[device_id] == 0) {
+ SCCudaDevices *devices = SCCudaGetDeviceList();
+ if (SCCudaCtxCreate(&cuda_contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+ devices->devices[device_id]->device) == -1) {
+ SCLogDebug("ctxcreate failure.");
+ exit(EXIT_FAILURE);
+ }
+ }
+ new_module->context = cuda_contexts[device_id];
+
+ SCMutexUnlock(&mutex);
+ return cuda_contexts[device_id];
+}
+
+void CudaHandlerModuleStoreData(const char *module_name,
+ const char *data_name, void *data_ptr)
+{
+ SCMutexLock(&mutex);
+
+ CudaHandlerModule *module = cudahl_modules;
+ while (module != NULL && strcasecmp(module->name, module_name) != 0)
+ module = module->next;
+ if (module == NULL) {
+ SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
+ "\"%s\" from module \"%s\" that hasn't been registered "
+ "yet.", module_name, data_name);
+ exit(EXIT_FAILURE);
+ }
+
+ CudaHandlerModuleData *data = module->module_data;
+ while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
+ data = data->next;
+ }
+ if (data != NULL) {
+ SCLogWarning(SC_ERR_CUDA_HANDLER_ERROR, "Data \"%s\" already "
+ "registered for this module \"%s\".", data_name,
+ module_name);
+ SCMutexUnlock(&mutex);
+ goto end;
+ }
+
+ CudaHandlerModuleData *new_data = SCMalloc(sizeof(CudaHandlerModuleData));
+ if (new_data == NULL)
+ exit(EXIT_FAILURE);
+ memset(new_data, 0, sizeof(CudaHandlerModuleData));
+ new_data->name = SCStrdup(data_name);
+ if (new_data->name == NULL)
+ exit(EXIT_FAILURE);
+ new_data->data = data_ptr;
+
+ if (module->module_data == NULL) {
+ module->module_data = new_data;
+ } else {
+ new_data->next = module->module_data;
+ module->module_data = new_data;
+ }
+
+ SCMutexUnlock(&mutex);
+
+ end:
+ return;
+}
+
+void *CudaHandlerModuleGetData(const char *module_name, const char *data_name)
+{
+ SCMutexLock(&mutex);
+
+ CudaHandlerModule *module = cudahl_modules;
+ while (module != NULL && strcasecmp(module->name, module_name) != 0)
+ module = module->next;
+ if (module == NULL) {
+ SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
+ "\"%s\" from module \"%s\" that hasn't been registered "
+ "yet.", module_name, data_name);
+ SCMutexUnlock(&mutex);
+ return NULL;
+ }
+
+ CudaHandlerModuleData *data = module->module_data;
+ while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
+ data = data->next;
+ }
+    if (data == NULL) {
+        SCLogInfo("No data \"%s\" registered for module \"%s\".",
+                  data_name, module_name);
+ SCMutexUnlock(&mutex);
+ return NULL;
+ }
+
+ SCMutexUnlock(&mutex);
+ return data->data;
+}
+
+int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image)
+{
+#define CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE 15
+
+ int i = 0;
+
+ /* select the ptx image based on the compute capability supported by all
+ * devices (i.e. the lowest) */
+ char *image = SCMalloc(strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
+ if (unlikely(image == NULL)) {
+ exit(EXIT_FAILURE);
+ }
+ memset(image, 0x00, strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
+
+ int major = INT_MAX;
+ int minor = INT_MAX;
+ SCCudaDevices *devices = SCCudaGetDeviceList();
+ for (i = 0; i < devices->count; i++){
+ if (devices->devices[i]->major_rev < major){
+ major = devices->devices[i]->major_rev;
+ minor = devices->devices[i]->minor_rev;
+ }
+ if (devices->devices[i]->major_rev == major &&
+ devices->devices[i]->minor_rev < minor){
+ minor = devices->devices[i]->minor_rev;
+ }
+ }
+ snprintf(image,
+ strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE,
+ "%s_sm_%u%u",
+ ptx_image, major, minor);
+
+    /* create a cuda module from the selected ptx image and return the
+     * module reference back to the calling function through the argument */
+    SCLogDebug("Loading kernel module: %s", image);
+ if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1)
+ goto error;
+ SCFree(image);
+
+ return 0;
+ error:
+ SCFree(image);
+ return -1;
+
+#undef CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE
+}
+
+
+#endif /* __SC_CUDA_SUPPORT__ */
--- /dev/null
+/* Copyright (C) 2007-2012 Open Information Security Foundation
+ *
+ * You can copy, redistribute or modify this Program under the terms of
+ * the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/**
+ * \file
+ *
+ * \author Anoop Saldanha <anoopsaldanha@gmail.com>
+ */
+
+#ifndef __UTIL_CUDA_HANDLERS__H__
+#define __UTIL_CUDA_HANDLERS__H__
+
+#include "conf.h"
+#include "util-cuda.h"
+
+/************************conf file profile section**********************/
+
+void CudaHandlerAddCudaProfileFromConf(const char *name,
+ void *(*Callback)(ConfNode *node),
+ void (*Free)(void *));
+void *CudaHandlerGetCudaProfile(const char *name);
+void CudaHandlerFreeProfiles(void);
+
+/*******************cuda context related data section*******************/
+
+#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_HOST 0
+#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_DEVICE 1
+#define CUDA_HANDLER_MODULE_DATA_TYPE_CUDA_BUFFER 2
+
+CUcontext CudaHandlerModuleGetContext(const char *module_name, int device_id);
+void CudaHandlerModuleStoreData(const char *module_name,
+ const char *data_name, void *data_ptr);
+void *CudaHandlerModuleGetData(const char *module_name, const char *data_name);
+int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image);
+
+#endif /* __UTIL_CUDA_HANDLERS__H__ */
}
result = cuMemAllocHost(pp, byte_size);
- if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1)
+ if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_HOST) == -1)
goto error;
return 0;
CASE_CODE (SC_ERR_CUDA_HANDLER_ERROR);
CASE_CODE (SC_ERR_TM_THREADS_ERROR);
CASE_CODE (SC_ERR_TM_MODULES_ERROR);
- CASE_CODE (SC_ERR_B2G_CUDA_ERROR);
+ CASE_CODE (SC_ERR_AC_CUDA_ERROR);
CASE_CODE (SC_ERR_INVALID_YAML_CONF_ENTRY);
CASE_CODE (SC_ERR_TMQ_ALREADY_REGISTERED);
CASE_CODE (SC_ERR_CONFLICTING_RULE_KEYWORDS);
SC_ERR_CUDA_HANDLER_ERROR,
SC_ERR_TM_THREADS_ERROR,
SC_ERR_TM_MODULES_ERROR,
- SC_ERR_B2G_CUDA_ERROR,
+ SC_ERR_AC_CUDA_ERROR,
SC_ERR_INVALID_YAML_CONF_ENTRY,
SC_ERR_TMQ_ALREADY_REGISTERED,
SC_ERR_CONFLICTING_RULE_KEYWORDS,
--- /dev/null
+/* Copyright (C) 2007-2012 Open Information Security Foundation
+ *
+ * You can copy, redistribute or modify this Program under the terms of
+ * the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/**
+ * \file
+ *
+ * \author Anoop Saldanha <anoopsaldanha@gmail.com>
+ *
+ * The Cuda kernel for MPM AC.
+ *
+ * \todo - This is a basic version of the kernel.
+ * - Support 16 bit state tables.
+ * - Texture memory.
+ * - Multiple threads per blocks of threads. Make use of
+ * shared memory/texture memory.
+ */
+
+extern "C"
+__global__ void SCACCudaSearch64(unsigned char *d_buffer,
+ unsigned int d_buffer_start_offset,
+ unsigned int *o_buffer,
+ unsigned int *results_buffer,
+ unsigned int nop,
+ unsigned char *tolower)
+{
+ unsigned int u = 0;
+ unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ if (tid >= nop)
+ return;
+
+ unsigned int buflen = *((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset)));
+ unsigned int (*state_table_u32)[256] =
+ (unsigned int (*)[256])*((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8));
+ unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 16);
+
+ unsigned int state = 0;
+ unsigned int matches = 0;
+ unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1);
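+    /* a state's top byte flags a match, its low 24 bits hold the next
+     * state; matches are written as (offset, state) pairs, with the entry
+     * count stored in the slot just before the pair list */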
+ for (u = 0; u < buflen; u++) {
+ state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]];
+ if (state & 0xFF000000) {
+ results[matches++] = u;
+ results[matches++] = state & 0x00FFFFFF;
+ }
+ }
+
+ *(results - 1) = matches;
+ return;
+}
+
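+/* 32-bit variant: identical to SCACCudaSearch64 except the per-payload
+ * header stores the length and state table pointer in 4-byte fields */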
+extern "C"
+__global__ void SCACCudaSearch32(unsigned char *d_buffer,
+ unsigned int d_buffer_start_offset,
+ unsigned int *o_buffer,
+ unsigned int *results_buffer,
+ unsigned int nop,
+ unsigned char *tolower)
+{
+ unsigned int u = 0;
+ unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ if (tid >= nop)
+ return;
+
+ unsigned int buflen = *((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset)));
+ unsigned int (*state_table_u32)[256] =
+ (unsigned int (*)[256])*((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 4));
+ unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8);
+
+ unsigned int state = 0;
+ unsigned int matches = 0;
+ unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1);
+ for (u = 0; u < buflen; u++) {
+ state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]];
+ if (state & 0xFF000000) {
+ results[matches++] = u;
+ results[matches++] = state & 0x00FFFFFF;
+ }
+ }
+
+ *(results - 1) = matches;
+ return;
+}
#include "suricata.h"
#include "detect.h"
-#include "util-mpm-ac.h"
#include "conf.h"
#include "util-debug.h"
#include "util-unittest.h"
#include "util-memcmp.h"
+#include "util-mpm-ac.h"
+
+#ifdef __SC_CUDA_SUPPORT__
+
+#include "util-mpm.h"
+#include "tm-threads.h"
+#include "util-mpm.h"
+#include "detect-engine-mpm.h"
+#include "util-cuda.h"
+#include "util-cuda-handlers.h"
+#endif /* __SC_CUDA_SUPPORT__ */
void SCACInitCtx(MpmCtx *, int);
void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *, uint32_t);
#define STATE_QUEUE_CONTAINER_SIZE 65536
+static int construct_both_16_and_32_state_tables = 0;
+
/**
* \brief Helper structure used by AC during state table creation
*/
int bot;
} StateQueue;
-/**
- * \brief Register the aho-corasick mpm.
- */
-void MpmACRegister(void)
-{
- mpm_table[MPM_AC].name = "ac";
- /* don't need this. isn't that awesome? no more chopping and blah blah */
- mpm_table[MPM_AC].max_pattern_length = 0;
-
- mpm_table[MPM_AC].InitCtx = SCACInitCtx;
- mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
- mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
- mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
- mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
- mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
- mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
- mpm_table[MPM_AC].Search = SCACSearch;
- mpm_table[MPM_AC].Cleanup = NULL;
- mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
- mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
- mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
-
- return;
-}
-
/**
* \internal
* \brief Initialize the AC context with user specified conf parameters. We
int ascii_code = 0;
int32_t r_state = 0;
- if (ctx->state_count < 32767) {
+ if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
ctx->state_table_u16 = SCMalloc(ctx->state_count *
sizeof(SC_AC_STATE_TYPE_U16) * 256);
if (ctx->state_table_u16 == NULL) {
}
}
}
- } else {
+ }
+
+ if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
/* create space for the state table. We could have used the existing goto
* table, but since we have it set to hold 32 bit state values, we will create
* a new state table here of type SC_AC_STATE_TYPE(current set to uint16_t) */
uint32_t state = 0;
uint32_t temp_state = 0;
- if (ctx->state_count < 32767) {
+ if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
for (state = 0; state < ctx->state_count; state++) {
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
temp_state = ctx->state_table_u16[state & 0x7FFF][ascii_code];
ctx->state_table_u16[state & 0x7FFF][ascii_code] |= (1 << 15);
}
}
- } else {
+ }
+
+ if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
for (state = 0; state < ctx->state_count; state++) {
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
temp_state = ctx->state_table_u32[state & 0x00FFFFFF][ascii_code];
/* prepare the state table required by AC */
SCACPrepareStateTable(mpm_ctx);
+#ifdef __SC_CUDA_SUPPORT__
+ if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
+ int r = SCCudaMemAlloc(&ctx->state_table_u32_cuda,
+ ctx->state_count * sizeof(unsigned int) * 256);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+ exit(EXIT_FAILURE);
+ }
+
+ r = SCCudaMemcpyHtoD(ctx->state_table_u32_cuda,
+ ctx->state_table_u32,
+ ctx->state_count * sizeof(unsigned int) * 256);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
+ exit(EXIT_FAILURE);
+ }
+ }
+#endif
+
/* free all the stored patterns. Should save us a good 100-200 mbs */
for (i = 0; i < mpm_ctx->pattern_cnt; i++) {
if (ctx->parray[i] != NULL) {
mpm_ctx->memory_cnt++;
mpm_ctx->memory_size -= (ctx->state_count *
sizeof(SC_AC_STATE_TYPE_U16) * 256);
- } else if (ctx->state_table_u32 != NULL) {
+ }
+ if (ctx->state_table_u32 != NULL) {
SCFree(ctx->state_table_u32);
ctx->state_table_u32 = NULL;
return;
}
+/****************************Cuda side of things****************************/
+
+#ifdef __SC_CUDA_SUPPORT__
+/* \todos
+ * - Use texture memory - Can we fit all the arrays into a 3d texture.
+ * Texture memory definitely offers slightly better performance even
+ * on gpus that offer cache for global memory.
+ * - Packetpool - modify to support > 65k max pending packets. We are
+ * hitting packetpool limit currently even with 65k packets.
+ * - Use streams. We have tried overlapping parsing results from the
+ * previous call with invoking the next call.
+ * - Offer higher priority to decode threads.
+ * - Modify pcap file mode to support reading from multiple pcap files
+ * and hence we will have multiple receive threads.
+ * - Split state table into many small pieces and have multiple threads
+ * run each small state table on the same payload.
+ * - Tried a config preference of L1 cache over shared memory with no
+ *   noticeable perf increase. Explore it in detail over cards/architectures.
+ * - Constant memory performance sucked. Explore it in detail.
+ * - Currently all our state tables are small. Implement 16 bit state
+ * tables on priority.
+ * - Introduce profiling.
+ * - Retrieve sgh before buffer packet.
+ * - Buffer smsgs too.
+ */
+
+void SCACConstructBoth16and32StateTables(void)
+{
+ construct_both_16_and_32_state_tables = 1;
+
+ return;
+}
+
+/* \todo Reduce offset buffer size. A 100,000-entry buffer would probably be sufficient. */
+static void *SCACCudaDispatcher(void *arg)
+{
+#define BLOCK_SIZE 32
+
+ int r = 0;
+ ThreadVars *tv = (ThreadVars *)arg;
+ MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+    uint32_t sleep_interval_us = conf->batching_timeout;
+
+    SCLogInfo("AC Cuda Mpm Dispatcher using a timeout of "
+              "\"%"PRIu32"\" microseconds", sleep_interval_us);
+
+ CudaBufferData *cb_data =
+ CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME,
+ MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
+
+ CUcontext cuda_context =
+ CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
+ if (cuda_context == 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "context is NULL.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaCtxPushCurrent(cuda_context);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "context push failed.");
+ exit(EXIT_FAILURE);
+ }
+ CUmodule cuda_module = 0;
+ if (CudaHandlerGetCudaModule(&cuda_module, "util-mpm-ac-cuda-kernel") < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda module.");
+ exit(EXIT_FAILURE);
+ }
+ CUfunction kernel = 0;
+#if __WORDSIZE==64
+ if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch64") == -1) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
+ exit(EXIT_FAILURE);
+ }
+#else
+ if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch32") == -1) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
+ exit(EXIT_FAILURE);
+ }
+#endif
+
+ uint8_t g_u8_lowercasetable[256];
+    for (int c = 0; c < 256; c++)
+        g_u8_lowercasetable[c] = tolower(c);
+ CUdeviceptr cuda_g_u8_lowercasetable_d = 0;
+ CUdeviceptr cuda_packets_buffer_d = 0;
+ CUdeviceptr cuda_offset_buffer_d = 0;
+ CUdeviceptr cuda_results_buffer_d = 0;
+ uint32_t *cuda_results_buffer_h = NULL;
+ r = SCCudaMemAlloc(&cuda_g_u8_lowercasetable_d, sizeof(g_u8_lowercasetable));
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemcpyHtoD(cuda_g_u8_lowercasetable_d, g_u8_lowercasetable, sizeof(g_u8_lowercasetable));
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemAlloc(&cuda_packets_buffer_d, conf->gpu_transfer_size);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemAlloc(&cuda_offset_buffer_d, conf->gpu_transfer_size * 4);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemAlloc(&cuda_results_buffer_d, conf->gpu_transfer_size * 8);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemAllocHost((void **)&cuda_results_buffer_h, conf->gpu_transfer_size * 8);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
+ exit(EXIT_FAILURE);
+ }
+
+ CudaBufferCulledInfo cb_culled_info;
+ memset(&cb_culled_info, 0, sizeof(cb_culled_info));
+
+ TmThreadsSetFlag(tv, THV_INIT_DONE);
+ while (1) {
+ if (TmThreadsCheckFlag(tv, THV_KILL))
+ break;
+
+        usleep(sleep_interval_us);
+
+ /**************** 1 SEND ****************/
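+        /* cull the slices the decode threads have marked done and copy the
+         * staged payloads plus their offsets to the device in one shot */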
+ CudaBufferCullCompletedSlices(cb_data, &cb_culled_info);
+ if (cb_culled_info.no_of_items == 0)
+ continue;
+#if 0
+ SCLogInfo("1 - cb_culled_info.no_of_items-%"PRIu32" "
+ "cb_culled_info.buffer_len - %"PRIu32" "
+ "cb_culled_info.average size - %f "
+ "cb_culled_info.d_buffer_start_offset - %"PRIu32" "
+ "cb_culled_info.op_buffer_start_offset - %"PRIu32" "
+ "cb_data.no_of_items - %"PRIu32" "
+ "cb_data.d_buffer_read - %"PRIu32" "
+ "cb_data.d_buffer_write - %"PRIu32" "
+ "cb_data.op_buffer_read - %"PRIu32" "
+ "cb_data.op_buffer_write - %"PRIu32"\n",
+ cb_culled_info.no_of_items,
+ cb_culled_info.d_buffer_len,
+ cb_culled_info.d_buffer_len / (float)cb_culled_info.no_of_items,
+ cb_culled_info.d_buffer_start_offset,
+ cb_culled_info.op_buffer_start_offset,
+ cb_data->no_of_items,
+ cb_data->d_buffer_read,
+ cb_data->d_buffer_write,
+ cb_data->op_buffer_read,
+ cb_data->op_buffer_write);
+#endif
+ r = SCCudaMemcpyHtoD(cuda_packets_buffer_d, (cb_data->d_buffer + cb_culled_info.d_buffer_start_offset), cb_culled_info.d_buffer_len);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemcpyHtoD(cuda_offset_buffer_d, (cb_data->o_buffer + cb_culled_info.op_buffer_start_offset), sizeof(uint32_t) * cb_culled_info.no_of_items);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
+ exit(EXIT_FAILURE);
+ }
+ void *args[] = { &cuda_packets_buffer_d,
+ &cb_culled_info.d_buffer_start_offset,
+ &cuda_offset_buffer_d,
+ &cuda_results_buffer_d,
+ &cb_culled_info.no_of_items,
+ &cuda_g_u8_lowercasetable_d };
+ r = SCCudaLaunchKernel(kernel,
+ (cb_culled_info.no_of_items / BLOCK_SIZE) + 1, 1, 1,
+ BLOCK_SIZE, 1, 1,
+ 0, 0,
+ args, NULL);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaLaunchKernel failure.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemcpyDtoH(cuda_results_buffer_h, cuda_results_buffer_d, sizeof(uint32_t) * (cb_culled_info.d_buffer_len * 2));
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyDtoH failure.");
+ exit(EXIT_FAILURE);
+ }
+
+ /**************** 1 SYNCHRO ****************/
+ r = SCCudaCtxSynchronize();
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaCtxSynchronize failure.");
+ exit(EXIT_FAILURE);
+ }
+
+ /************* 1 Parse Results ************/
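+        /* hand each buffered packet its match count and (offset, state)
+         * pairs, then wake the thread waiting on its condition variable */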
+ uint32_t i_op_start_offset = cb_culled_info.op_buffer_start_offset;
+ uint32_t no_of_items = cb_culled_info.no_of_items;
+ uint32_t *o_buffer = cb_data->o_buffer;
+ uint32_t d_buffer_start_offset = cb_culled_info.d_buffer_start_offset;
+ for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) {
+ Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset];
+
+ p->cuda_gpu_matches =
+ cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)];
+ if (p->cuda_gpu_matches != 0) {
+ memcpy(p->cuda_results,
+ cuda_results_buffer_h +
+ ((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2),
+ (cuda_results_buffer_h[((o_buffer[i_op_start_offset] -
+ d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4);
+ }
+
+ SCMutexLock(&p->cuda_mutex);
+ p->cuda_done = 1;
+ SCMutexUnlock(&p->cuda_mutex);
+ SCCondSignal(&p->cuda_cond);
+ }
+ if (no_of_items != 0)
+ CudaBufferReportCulledConsumption(cb_data, &cb_culled_info);
+ } /* while (1) */
+
+ r = SCCudaModuleUnload(cuda_module);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error unloading cuda module.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemFree(cuda_packets_buffer_d);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemFree(cuda_offset_buffer_d);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemFree(cuda_results_buffer_d);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
+ exit(EXIT_FAILURE);
+ }
+ r = SCCudaMemFreeHost(cuda_results_buffer_h);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
+ exit(EXIT_FAILURE);
+ }
+
+ TmThreadsSetFlag(tv, THV_RUNNING_DONE);
+ TmThreadWaitForFlag(tv, THV_DEINIT);
+ TmThreadsSetFlag(tv, THV_CLOSED);
+
+ return NULL;
+
+#undef BLOCK_SIZE
+}
+
+uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
+ PatternMatcherQueue *pmq)
+{
+ uint32_t u = 0;
+
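+    /* block until the dispatcher thread has copied this packet's gpu
+     * results in and flagged it done */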
+ while (!p->cuda_done) {
+ SCMutexLock(&p->cuda_mutex);
+ if (p->cuda_done) {
+ SCMutexUnlock(&p->cuda_mutex);
+ break;
+ } else {
+ SCCondWait(&p->cuda_cond, &p->cuda_mutex);
+ SCMutexUnlock(&p->cuda_mutex);
+ }
+ } /* while */
+ p->cuda_done = 0;
+ p->cuda_mpm_enabled = 0;
+
+ uint32_t cuda_matches = p->cuda_gpu_matches;
+ if (cuda_matches == 0)
+ return 0;
+
+ uint32_t matches = 0;
+ uint32_t *results = p->cuda_results + 1;
+ uint8_t *buf = p->payload;
+ SCACCtx *ctx = mpm_ctx->ctx;
+ SCACOutputTable *output_table = ctx->output_table;
+ SCACPatternList *pid_pat_list = ctx->pid_pat_list;
+
+ for (u = 0; u < cuda_matches; u += 2) {
+ uint32_t offset = results[u];
+ uint32_t state = results[u + 1];
+ /* we should technically be doing state & 0x00FFFFFF, but we don't
+ * since the cuda kernel does that for us */
+ uint32_t no_of_entries = output_table[state].no_of_entries;
+ uint32_t *pids = output_table[state].pids;
+ uint32_t k;
+ /* note that this is not a verbatim copy from SCACSearch(). We
+ * don't copy the pattern id into the pattern_id_array. That's
+ * the only change */
+ for (k = 0; k < no_of_entries; k++) {
+ if (pids[k] & 0xFFFF0000) {
+ if (SCMemcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
+ buf + offset - pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
+ pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
+ /* inside loop */
+ if (pid_pat_list[pids[k] & 0x0000FFFF].case_state != 3) {
+ continue;
+ }
+ }
+                if (!(pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] &
+                      (1 << ((pids[k] & 0x0000FFFF) % 8)))) {
+                    pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] |=
+                        (1 << ((pids[k] & 0x0000FFFF) % 8));
+                }
+ matches++;
+ } else {
+                if (!(pmq->pattern_id_bitarray[pids[k] / 8] & (1 << (pids[k] % 8)))) {
+                    pmq->pattern_id_bitarray[pids[k] / 8] |= (1 << (pids[k] % 8));
+                }
+ matches++;
+ }
+ }
+ }
+
+ return matches;
+}
+
+void SCACCudaStartDispatcher(void)
+{
+ /* create the threads */
+ ThreadVars *tv = TmThreadCreate("Cuda_Mpm_AC_Dispatcher",
+ NULL, NULL,
+ NULL, NULL,
+ "custom", SCACCudaDispatcher, 0);
+ if (tv == NULL) {
+ SCLogError(SC_ERR_THREAD_CREATE, "Error creating a thread for "
+ "ac cuda dispatcher. Killing engine.");
+ exit(EXIT_FAILURE);
+ }
+ if (TmThreadSpawn(tv) != 0) {
+ SCLogError(SC_ERR_THREAD_SPAWN, "Failed to spawn thread for "
+ "ac cuda dispatcher. Killing engine.");
+ exit(EXIT_FAILURE);
+ }
+
+ return;
+}
+
+int MpmCudaBufferSetup(void)
+{
+ int r = 0;
+ MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+ if (conf == NULL) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+ return -1;
+ }
+
+ CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
+ if (cuda_context == 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context.");
+ return -1;
+ }
+ r = SCCudaCtxPushCurrent(cuda_context);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context.");
+ return -1;
+ }
+
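+    /* page-locked host buffers shared with the dispatcher: staged payload
+     * bytes (d_buffer), per-payload offsets (o_buffer) and the owning
+     * Packet pointers (p_buffer) */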
+ uint8_t *d_buffer = NULL;
+ uint32_t *o_buffer = NULL;
+ void **p_buffer = NULL;
+
+    r = SCCudaMemAllocHost((void **)&d_buffer, conf->cb_buffer_size);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
+ return -1;
+ }
+ SCLogInfo("Allocated a cuda d_buffer - %"PRIu32" bytes", conf->cb_buffer_size);
+    r = SCCudaMemAllocHost((void **)&o_buffer, sizeof(uint32_t) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
+    if (r < 0) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
+ return -1;
+ }
+    r = SCCudaMemAllocHost((void **)&p_buffer, sizeof(void *) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
+ return -1;
+ }
+
+ r = SCCudaCtxPopCurrent(NULL);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure.");
+ return -1;
+ }
+
+ CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, conf->cb_buffer_size, o_buffer, p_buffer, UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
+ if (cb == NULL) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error registering new cb instance.");
+ return -1;
+ }
+ CudaHandlerModuleStoreData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME, cb);
+
+ return 0;
+}
+
+int MpmCudaBufferDeSetup(void)
+{
+ int r = 0;
+ MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+ if (conf == NULL) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+ return -1;
+ }
+
+ CudaBufferData *cb_data = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
+ BUG_ON(cb_data == NULL);
+
+ CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
+ if (cuda_context == 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context.");
+ return -1;
+ }
+ r = SCCudaCtxPushCurrent(cuda_context);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context.");
+ return -1;
+ }
+
+ r = SCCudaMemFreeHost(cb_data->d_buffer);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
+ return -1;
+ }
+ r = SCCudaMemFreeHost(cb_data->o_buffer);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
+ return -1;
+ }
+ r = SCCudaMemFreeHost(cb_data->p_buffer);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
+ return -1;
+ }
+
+ r = SCCudaCtxPopCurrent(NULL);
+ if (r < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda context pop failure.");
+ return -1;
+ }
+
+ CudaBufferDeRegister(cb_data);
+
+ return 0;
+}
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
+/************************** Mpm Registration ***************************/
+
+/**
+ * \brief Register the aho-corasick mpm.
+ */
+void MpmACRegister(void)
+{
+ mpm_table[MPM_AC].name = "ac";
+ /* AC doesn't impose a max pattern length, so no pattern chopping is needed */
+ mpm_table[MPM_AC].max_pattern_length = 0;
+
+ mpm_table[MPM_AC].InitCtx = SCACInitCtx;
+ mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
+ mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
+ mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
+ mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
+ mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
+ mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
+ mpm_table[MPM_AC].Search = SCACSearch;
+ mpm_table[MPM_AC].Cleanup = NULL;
+ mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
+ mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
+ mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
+
+ return;
+}
+
+#ifdef __SC_CUDA_SUPPORT__
+
+/**
+ * \brief Register the aho-corasick cuda mpm.
+ */
+void MpmACCudaRegister(void)
+{
+ mpm_table[MPM_AC_CUDA].name = "ac-cuda";
+ /* AC doesn't impose a max pattern length, so no pattern chopping is needed */
+ mpm_table[MPM_AC_CUDA].max_pattern_length = 0;
+
+ mpm_table[MPM_AC_CUDA].InitCtx = SCACInitCtx;
+ mpm_table[MPM_AC_CUDA].InitThreadCtx = SCACInitThreadCtx;
+ mpm_table[MPM_AC_CUDA].DestroyCtx = SCACDestroyCtx;
+ mpm_table[MPM_AC_CUDA].DestroyThreadCtx = SCACDestroyThreadCtx;
+ mpm_table[MPM_AC_CUDA].AddPattern = SCACAddPatternCS;
+ mpm_table[MPM_AC_CUDA].AddPatternNocase = SCACAddPatternCI;
+ mpm_table[MPM_AC_CUDA].Prepare = SCACPreparePatterns;
+ mpm_table[MPM_AC_CUDA].Search = SCACSearch;
+ mpm_table[MPM_AC_CUDA].Cleanup = NULL;
+ mpm_table[MPM_AC_CUDA].PrintCtx = SCACPrintInfo;
+ mpm_table[MPM_AC_CUDA].PrintThreadCtx = SCACPrintSearchStats;
+ mpm_table[MPM_AC_CUDA].RegisterUnittests = SCACRegisterTests;
+
+ if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
+ MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+ if (conf == NULL) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+ exit(EXIT_FAILURE);
+ }
+
+ if (MpmCudaBufferSetup() < 0) {
+ SCLogError(SC_ERR_AC_CUDA_ERROR, "Error setting up env for ac cuda. Killing engine.");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return;
+}
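+
+/* Usage note (hedged): with cuda support compiled in, ac-cuda is selected
+ * like any other matcher via the mpm-algo option in suricata.yaml, e.g.
+ * "mpm-algo: ac-cuda", which is what PatternMatchDefaultMatcher() above
+ * reports. */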
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
/*************************************Unittests********************************/
#ifdef UNITTESTS
*
*/
+#ifndef __UTIL_MPM_AC__H__
+#define __UTIL_MPM_AC__H__
+
#define SC_AC_STATE_TYPE_U16 uint16_t
#define SC_AC_STATE_TYPE_U32 uint32_t
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda.h"
+#endif /* __SC_CUDA_SUPPORT__ */
+
typedef struct SCACPattern_ {
/* length of the pattern */
uint16_t len;
/* the size of each state */
uint16_t single_state_size;
uint16_t max_pat_id;
+
+#ifdef __SC_CUDA_SUPPORT__
+ CUdeviceptr state_table_u16_cuda;
+ CUdeviceptr state_table_u32_cuda;
+#endif /* __SC_CUDA_SUPPORT__ */
} SCACCtx;
typedef struct SCACThreadCtx_ {
} SCACThreadCtx;
void MpmACRegister(void);
+
+
+#ifdef __SC_CUDA_SUPPORT__
+
+#define MPM_AC_CUDA_MODULE_NAME "ac_cuda"
+#define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb"
+
+
+void MpmACCudaRegister(void);
+void SCACConstructBoth16and32StateTables(void);
+int MpmCudaBufferSetup(void);
+int MpmCudaBufferDeSetup(void);
+void SCACCudaStartDispatcher(void);
+void SCACCudaKillDispatcher(void);
+uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
+ PatternMatcherQueue *pmq);
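+
+/* A rough lifecycle sketch for these entry points, inferred from the names
+ * and from the code in util-mpm-ac.c (a hedged sketch, not normative):
+ *
+ *   MpmCudaBufferSetup();          allocate/register the CudaBuffer instance
+ *   SCACCudaStartDispatcher();     spawn the gpu dispatcher thread
+ *   per packet:
+ *       SCACCudaPacketResultsProcessing(p, mpm_ctx, pmq);
+ *   SCACCudaKillDispatcher();      stop the dispatcher thread
+ *   MpmCudaBufferDeSetup();        free the buffers, deregister the instance
+ */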
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
+
+#endif /* __UTIL_MPM_AC__H__ */
+++ /dev/null
-/* Copyright (C) 2007-2010 Open Information Security Foundation
- *
- * You can copy, redistribute or modify this Program under the terms of
- * the GNU General Public License version 2 as published by the Free
- * Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-/**
- * \file
- *
- * \author Anoop Saldanha <anoopsaldanha@gmail.com>
- *
- * The Cuda kernel for MPM B2G.
- *
- * \todo This is a basic version of the kernel. Modify it to support multiple
- * blocks of threads. Make use of shared memory/texture memory.
- */
-
-#define B2G_CUDA_Q 2
-#define CUDA_THREADS 4000
-#define B2G_CUDA_HASHSHIFT 4
-#define B2G_CUDA_TYPE unsigned int
-#define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
-#define u8_tolower(c) g_u8_lowercasetable[(c)]
-
-typedef struct SCCudaPBPacketDataForGPU_ {
- /* holds the value B2gCtx->m */
- unsigned int m;
- /* holds B2gCtx->B2g */
- unsigned int table;
- /* holds the length of the payload */
- unsigned int payload_len;
- /* holds the payload */
- unsigned char payload;
-} SCCudaPBPacketDataForGPU;
-
-extern "C"
-__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
- unsigned char *packets_buffer,
- unsigned int *packets_offset_buffer,
- unsigned int *packets_payload_offset_buffer,
- unsigned int nop,
- unsigned char *g_u8_lowercasetable)
- {
- unsigned int tid = blockIdx.x * 32 + threadIdx.x;
- /* if the thread id is greater than the no of packets sent in the packets
- * buffer, terminate the thread */
- //if (tid <= nop)
- if (tid >= nop)
- return;
-
- SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
- unsigned int m = packet->m;
- unsigned char *buf = &packet->payload;
- unsigned int buflen = packet->payload_len;
- unsigned int *B2G = (unsigned int *)packet->table;
- unsigned int pos = m - B2G_CUDA_Q + 1;
- B2G_CUDA_TYPE d;
- unsigned short h;
- unsigned int first;
- unsigned int j = 0;
-
- unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
- //unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
- //unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
- unsigned short *offsets = matches_count + 1;
- // temporarily hold the results here, before we shift it to matches_count
- // before returning
- unsigned short matches = 0;
-
- while (pos <= (buflen - B2G_CUDA_Q + 1)) {
- h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
- d = B2G[h];
-
- if (d != 0) {
- j = pos;
- first = pos - (m - B2G_CUDA_Q + 1);
-
- do {
- j = j - 1;
- if (d >= (1 << (m - 1))) {
- if (j > first) {
- pos = j;
- } else {
- offsets[matches++] = j;
- }
- }
-
- if (j == 0)
- break;
-
- h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
- d = (d << 1) & B2G[h];
- } while (d != 0);
- }
- pos = pos + m - B2G_CUDA_Q + 1;
- }
-
- matches_count[0] = matches;
-
- return;
-}
#include "conf-yaml-loader.h"
#include "queue.h"
#include "util-unittest.h"
+#ifdef __SC_CUDA_SUPPORT__
+#include "util-cuda-handlers.h"
+#endif
/**
* \brief Register a new Mpm Context.
return;
}
+#ifdef __SC_CUDA_SUPPORT__
+
+static void MpmCudaConfFree(void *conf)
+{
+ SCFree(conf);
+ return;
+}
+
+static void *MpmCudaConfParse(ConfNode *node)
+{
+ const char *value;
+
+ MpmCudaConf *conf = SCMalloc(sizeof(MpmCudaConf));
+ if (conf == NULL)
+ exit(EXIT_FAILURE);
+ memset(conf, 0, sizeof(MpmCudaConf));
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "data-buffer-size-min-limit");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT;
+ } else if (ParseSizeStringU16(value, &conf->data_buffer_size_min_limit) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "data-buffer-size-min-limit - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "data-buffer-size-max-limit");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT;
+ } else if (ParseSizeStringU16(value, &conf->data_buffer_size_max_limit) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "data-buffer-size-max-limit - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "cudabuffer-buffer-size");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT;
+ } else if (ParseSizeStringU32(value, &conf->cb_buffer_size) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "cb-buffer-size - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "gpu-transfer-size");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE;
+ } else if (ParseSizeStringU32(value, &conf->gpu_transfer_size) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "gpu-transfer-size - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "batching-timeout");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT;
+ } else if ((conf->batching_timeout = atoi(value)) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "batching-timeout - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "device-id");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT;
+ } else if ((conf->device_id = atoi(value)) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "device-id - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ if (node != NULL)
+ value = ConfNodeLookupChildValue(node, "cuda-streams");
+ else
+ value = NULL;
+ if (value == NULL) {
+ /* default */
+ conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT;
+ } else if ((conf->cuda_streams = atoi(value)) < 0) {
+ SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
+ "cuda-streams - \"%s\"", node->name, value);
+ exit(EXIT_FAILURE);
+ }
+
+ return conf;
+}
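+
+/* For reference, a minimal sketch of the yaml block this parser consumes;
+ * the keys are exactly the ones looked up above and the values shown are
+ * the engine defaults:
+ *
+ *   cuda:
+ *     mpm:
+ *       data-buffer-size-min-limit: 0
+ *       data-buffer-size-max-limit: 1500
+ *       cudabuffer-buffer-size: 500mb
+ *       gpu-transfer-size: 50mb
+ *       batching-timeout: 2000
+ *       device-id: 0
+ *       cuda-streams: 2
+ */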
+#endif
+
/**
* \brief Setup a pmq
*
MpmB2gcRegister();
MpmB2gmRegister();
MpmACRegister();
+#ifdef __SC_CUDA_SUPPORT__
+ CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree);
+ MpmACCudaRegister();
+#endif /* __SC_CUDA_SUPPORT__ */
MpmACBSRegister();
MpmACGfbsRegister();
}
/* aho-corasick */
MPM_AC,
+#ifdef __SC_CUDA_SUPPORT__
+ MPM_AC_CUDA,
+#endif
/* aho-corasick-goto-failure state based */
MPM_AC_GFBS,
MPM_AC_BS,
uint32_t memory_cnt;
uint32_t memory_size;
} MpmThreadCtx;
/** \brief helper structure for the pattern matcher engine. The Pattern Matcher
MpmTableElmt mpm_table[MPM_TABLE_SIZE];
+/* this macro decides if cuda is enabled for the platform or not */
+#ifdef __SC_CUDA_SUPPORT__
+
+/* the min size limit of a payload (or any other data) to be buffered */
+#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT 0
+/* the max size limit of a payload (or any other data) to be buffered */
+#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT 1500
+/* Default size of the data buffer used by the cuda mpm engine for
+ * CudaBuffer registration */
+#define UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT (500 * 1024 * 1024)
+/* Default value for the max data chunk that would be sent to the gpu */
+#define UTIL_MPM_CUDA_GPU_TRANSFER_SIZE (50 * 1024 * 1024)
+/* Default no of items in the offset/pointer buffers used by the cuda mpm
+ * engine for CudaBuffer registration */
+#define UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT 500000
+/* Default batching timeout, in microseconds */
+#define UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT 2000
+/* Default no of cuda streams and default device id */
+#define UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT 2
+#define UTIL_MPM_CUDA_DEVICE_ID_DEFAULT 0
+
+/**
+ * \brief Cuda configuration for "mpm" profile. We can further extend this
+ * to have confs for specific mpms. For now it's common to all mpms.
+ */
+typedef struct MpmCudaConf_ {
+ uint16_t data_buffer_size_min_limit;
+ uint16_t data_buffer_size_max_limit;
+ uint32_t cb_buffer_size;
+ uint32_t gpu_transfer_size;
+ int batching_timeout;
+ int device_id;
+ int cuda_streams;
+} MpmCudaConf;
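+
+/* A minimal, hedged usage sketch, mirroring the lookups in util-mpm-ac.c:
+ *
+ *   MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+ *   if (conf == NULL) {
+ *       SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+ *       return -1;
+ *   }
+ *   // conf->device_id, conf->cb_buffer_size etc. are now ready to use
+ */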
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
struct DetectEngineCtx_;
int32_t MpmFactoryRegisterMpmCtxProfile(struct DetectEngineCtx_ *, const char *, uint8_t);
# Cuda configuration.
cuda:
# The "mpm" profile. On not specifying any of these parameters, the engine's
- # internal default values are used, which are same as the ones specified here.
- - mpm:
- # Threshold limit for no of packets buffered to the GPU. Once we hit this
- # limit, we pass the buffer to the gpu.
- packet-buffer-limit: 2400
- # The maximum length for a packet that we would buffer to the gpu.
- # Anything over this is MPM'ed on the CPU. All entries > 0 are valid.
- # Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
- packet-size-limit: 1500
- # No of packet buffers we initialize. All entries > 0 are valid.
- packet-buffers: 10
- # The timeout limit for batching of packets in secs. If we don't fill the
- # buffer within this timeout limit, we pass the currently filled buffer to the gpu.
- # All entries > 0 are valid.
- batching-timeout: 1
- # Specifies whether to use page-locked memory whereever possible. Accepted values
- # are "enabled" and "disabled".
- page-locked: enabled
- # The device to use for the mpm. Currently we don't support load balancing
- # on multiple gpus. In case you have multiple devices on your system, you
- # can specify the device to use, using this conf. By default we hold 0, to
- # specify the first device cuda sees. To find out device-id associated with
- # the card(s) on the system run "suricata --list-cuda-cards".
- device-id: 0
- # No of Cuda streams used for asynchronous processing. All values > 0 are valid.
- # For this option you need a device with Compute Capability > 1.0 and
- # page-locked enabled to have any effect.
- cuda-streams: 2
+ # internal default values are used, which are the same as the ones specified
+ # in the default conf file.
+ mpm:
+ # The minimum length required to buffer data to the gpu.
+ # Anything below this is MPM'ed on the CPU.
+ # Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
+ # A value of 0 indicates there's no limit.
+ data-buffer-size-min-limit: 0
+ # The maximum length for data that we would buffer to the gpu.
+ # Anything over this is MPM'ed on the CPU.
+ # Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
+ data-buffer-size-max-limit: 1500
+ # The ring buffer size used by the CudaBuffer API to buffer data.
+ cudabuffer-buffer-size: 500mb
+ # The max chunk size that can be sent to the gpu in a single go.
+ gpu-transfer-size: 50mb
+ # The timeout limit for batching of packets in microseconds.
+ batching-timeout: 2000
+ # The device to use for the mpm. Currently we don't support load balancing
+ # on multiple gpus. In case you have multiple devices on your system, you
+ # can specify the device to use with this option. The default is 0, i.e.
+ # the first device cuda sees. To find out the device-id associated with
+ # the card(s) on the system, run "suricata --list-cuda-cards".
+ device-id: 0
+ # No of Cuda streams used for asynchronous processing. All values > 0 are valid.
+ # For this option you need a device with Compute Capability > 1.0.
+ cuda-streams: 2
# Select the multi pattern algorithm you want to use for scan/search in
# the engine. The supported algorithms are b2g, b2gc, b2gm, b3g, wumanber,