From b334b8a6e9bebe3286c0dbc8c8aa20e62ba90868 Mon Sep 17 00:00:00 2001
From: Anoop Saldanha <anoopsaldanha@gmail.com>
Date: Fri, 25 Jul 2014 13:20:28 +0530
Subject: [PATCH] CUDA: Update the inspection engine to inform the cuda module
 that it doesn't need the gpu results and to release the packet for the next
 run.

Previously the inspection engine wouldn't inform the cuda module if it
didn't need the results.  As a consequence, when the packet was next taken
for re-use while still being processed by the cuda module, the engine
would have to wait until the cuda module freed the packet.

This commit updates the inspection engine to tell the cuda module to
release the packet in the aforementioned case.
---
 src/detect.c      |  4 ++++
 src/util-mpm-ac.c | 20 +++++++++++++++++++-
 src/util-mpm-ac.h |  2 ++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/detect.c b/src/detect.c
index e8950a9707..f4f0d54e3d 100644
--- a/src/detect.c
+++ b/src/detect.c
@@ -1572,6 +1572,10 @@ next:
     PACKET_PROFILING_DETECT_END(p, PROF_DETECT_RULES);
 
 end:
+#ifdef __SC_CUDA_SUPPORT__
+    CudaReleasePacket(p);
+#endif
+
     /* see if we need to increment the inspect_id and reset the de_state */
     if (has_state && AppLayerParserProtocolSupportsTxs(p->proto, alproto)) {
         PACKET_PROFILING_DETECT_START(p, PROF_DETECT_STATEFUL);
diff --git a/src/util-mpm-ac.c b/src/util-mpm-ac.c
index 6271f2fb70..f8e2199a39 100644
--- a/src/util-mpm-ac.c
+++ b/src/util-mpm-ac.c
@@ -1691,6 +1691,18 @@ void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)
 
 }
 
+/* Tells the cuda module that p's gpu results are not needed.  Both flags are
+ * reset under cuda_mutex, the lock SCACCudaDispatcher() tests them under. */
+void CudaReleasePacket(Packet *p)
+{
+    if (p->cuda_pkt_vars.cuda_mpm_enabled == 1) {
+        SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        p->cuda_pkt_vars.cuda_done = 0;
+        SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+    }
+}
+
 /* \todos
  * - Use texture memory - Can we fit all the arrays into a 3d texture.
  *   Texture memory definitely offers slightly better performance even
@@ -1890,6 +1902,13 @@ static void *SCACCudaDispatcher(void *arg)
         for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) {
             Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset];
 
+            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+            if (p->cuda_pkt_vars.cuda_mpm_enabled == 0) {
+                p->cuda_pkt_vars.cuda_done = 0;
+                SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+                continue;
+            }
+
             p->cuda_pkt_vars.cuda_gpu_matches =
                 cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)];
             if (p->cuda_pkt_vars.cuda_gpu_matches != 0) {
@@ -1900,7 +1919,6 @@ static void *SCACCudaDispatcher(void *arg)
                                                 d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4);
             }
 
-            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
             p->cuda_pkt_vars.cuda_done = 1;
             SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
             SCCondSignal(&p->cuda_pkt_vars.cuda_cond);
diff --git a/src/util-mpm-ac.h b/src/util-mpm-ac.h
index 7b6b83ecf2..760fb70495 100644
--- a/src/util-mpm-ac.h
+++ b/src/util-mpm-ac.h
@@ -205,6 +205,8 @@ uint32_t  SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
                                           PatternMatcherQueue *pmq);
 void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx);
 
+void CudaReleasePacket(Packet *p);
+
 #endif /* __SC_CUDA_SUPPORT__ */
 
 
-- 
2.47.2