From: Victor Julien
Date: Sat, 30 Dec 2017 13:55:26 +0000 (+0100)
Subject: cuda: remove
X-Git-Tag: suricata-4.1.0-beta1~377
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F3128%2Fhead;p=thirdparty%2Fsuricata.git

cuda: remove

Remove CUDA support as it has been broken for a long time.

Ticket #2382.
---

diff --git a/configure.ac b/configure.ac
index 805afdeddc..6536e70af8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1347,80 +1347,6 @@ fi
-  # enable CUDA output
-    AC_ARG_ENABLE(cuda,
-           AS_HELP_STRING([--enable-cuda], [Enable experimental CUDA pattern matching]),,[enable_cuda=no])
-    AS_IF([test "x$enable_cuda" = "xyes"], [
-        AC_ARG_WITH(cuda_includes,
-                [  --with-cuda-includes=DIR  cuda include directory],
-                [with_cuda_includes="$withval"],[with_cuda_includes=no])
-        AC_ARG_WITH(cuda_libraries,
-                [  --with-cuda-libraries=DIR  cuda library directory],
-                [with_cuda_libraries="$withval"],[with_cuda_libraries="no"])
-        AC_ARG_WITH(cuda_nvcc,
-                [  --with-cuda-nvcc=DIR  cuda nvcc compiler directory],
-                [with_cuda_nvcc="$withval"],[with_cuda_nvcc=no])
-
-        AC_DEFINE([__SC_CUDA_SUPPORT__],[1],(CUDA support enabled))
-
-        if test "$with_cuda_includes" != "no"; then
-            CPPFLAGS="${CPPFLAGS} -I${with_cuda_includes}"
-        else
-            CPPFLAGS="${CPPFLAGS} -I/usr/local/cuda/include"
-        fi
-
-        if test "$with_cuda_libraries" != "no"; then
-            LDFLAGS="${LDFLAGS} -L${with_cuda_libraries}"
-        fi
-
-        if test "$with_cuda_nvcc" != "no"; then
-            NVCC_DIR="${with_cuda_nvcc}"
-        else
-            NVCC_DIR="/usr/local/cuda/bin"
-        fi
-
-        AC_CHECK_HEADER(cuda.h,,[AC_ERROR(cuda.h not found ...)])
-
-        LIBCUDA=""
-        AC_CHECK_LIB(cuda, cuArray3DCreate,, LIBCUDA="no")
-        if test "$LIBCUDA" = "no"; then
-            echo
-            echo " ERROR! libcuda library not found"
-            echo
-            exit 1
-        fi
-
-        AC_PATH_PROG([NVCC], [nvcc], no, [$PATH:$NVCC_DIR])
-        if test "x$NVCC" = "xno"; then
-            echo
-            echo " ERROR! CUDA nvcc compiler not found: use --with-cuda-nvcc=DIR"
-            echo
-            exit 1
-        fi
-
-        AC_MSG_CHECKING(for nvcc version)
-        NVCCVER=`$NVCC --version | grep "release" | sed 's/.*release \(@<:@0-9@:>@\)\.\(@<:@0-9@:>@\).*/\1\2/'`
-        AC_MSG_RESULT($NVCCVER)
-        if test "$NVCCVER" -lt 31; then
-            echo
-            echo " Warning! Your CUDA nvcc version might be outdated."
-            echo " If compilation fails try the latest CUDA toolkit from"
-            echo " www.nvidia.com/object/cuda_develop.html"
-            echo
-        fi
-
-        AM_PATH_PYTHON(,, no)
-        if test "x$PYTHON" = "xno"; then
-            echo
-            echo " ERROR! Compiling CUDA kernels requires python."
-            echo
-            exit 1
-        fi
-    ])
-    AM_CONDITIONAL([BUILD_CUDA], [test "x$enable_cuda" = "xyes"])
-    AM_CONDITIONAL([__SC_CUDA_SUPPORT__], [test "x$enable_cuda" = "xyes"])
-
   # Check for libcap-ng
   case $host in
   *-*-linux*)
@@ -2220,7 +2146,6 @@ SURICATA_BUILD_CONF="Suricata Configuration:
   libgeoip:                  ${enable_geoip}
   Non-bundled htp:           ${enable_non_bundled_htp}
   Old barnyard2 support:     ${enable_old_barnyard2}
-  CUDA enabled:              ${enable_cuda}
   Hyperscan support:         ${enable_hyperscan}
   Libnet support:            ${enable_libnet}
diff --git a/doc/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt b/doc/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt
deleted file mode 100644
index 18ea5d1027..0000000000
--- a/doc/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt
+++ /dev/null
@@ -1,149 +0,0 @@
-Autogenerated on 2012-11-29
-from
-  https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6
-
-
-Installation with CUDA and PFRING on Scientific Linux 6
-
-For setup and install you need to be root:
-mkdir /root/src
-cd /root/src
-
-Pre installation requirements
-
-Install the following packages, to make sure you have everything needed for the
-installation:
-
-    yum install mpfr-2.4.1-6.el6.x86_64 cpp-4.4.4-13.el6.x86_64 ppl-0.10.2-11.el6.x86_64 \
-    cloog-ppl-0.15.7-1.2.el6.x86_64 gcc-4.4.4-13.el6.x86_64 kernel-devel-2.6.32-131.2.1.el6.x86_64 \
-    pcre-devel-7.8-3.1.el6.x86_64 libpcap-devel-1.0.0-6.20091201git117cb5.el6.x86_64 \
-    yum-plugin-priorities-1.1.26-11.el6.noarch yum-conf-sl6x-1-1.noarch libyaml-0.1.3-1.el6.rf.x86_64 \
-    libyaml-devel-0.1.3-1.el6.rf.x86_64 libnet-1.1.2.1-2.2.el6.rf.x86_64 flex-2.5.35-8.el6.x86_64 \
-    bison-2.4.1-5.el6.x86_64 gcc-c++-4.4.4-13.el6.x86_64
-
-
-CUDA
-
-Download and install NVIDIA CUDA drivers:
-
-    wget http://us.download.nvidia.com/XFree86/Linux-x86_64/270.41.19/NVIDIA-Linux-x86_64-270.41.19.run
-    chmod +x NVIDIA-Linux-x86_64-270.41.19.run
-    ./NVIDIA-Linux-x86_64-270.41.19.run
-
-You also need to download and install the CUDA toolkit for RHEL6:
-
-    wget http://developer.download.nvidia.com/compute/cuda/4_0/toolkit/cudatoolkit_4.0.17_linux_64_rhel6.0.run
-    chmod +x cudatoolkit_4.0.17_linux_64_rhel6.0.run
-    ./cudatoolkit_4.0.17_linux_64_rhel6.0.run
-
-Make sure the kernel modules are loaded:
-
-    /sbin/modprobe -r nouveau && /sbin/modprobe nvidia
-
-To ensure the proper NVIDIA CUDA modules get loaded on reboot, add the above
-line to your /etc/rc.local file.
-
-PF_RING
-
-Go to your download directory and get the latest PF_RING:
-
-    svn export https://svn.ntop.org/svn/ntop/trunk/PF_RING/ pfring-svn-latest
-
-Compile and install
-Next, enter the following commands for configuration and installation:
-
-    cd pfring-svn-latest/kernel
-    make && sudo make install
-    cd ../userland/lib
-    ./configure --prefix=/usr/local/pfring && make && sudo make install
-    cd ../libpcap-1.1.1-ring
-    ./configure --prefix=/usr/local/pfring && make && sudo make install
-    cd ../tcpdump-4.1.1
-    ./configure --prefix=/usr/local/pfring && make && sudo make install
-
-Load the pf_ring kernel module:
-
-    /sbin/modprobe pf_ring
-
-To ensure the pf_ring module gets loaded on reboot, add the above line to your
-/etc/rc.local file.
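If you prefer to keep the two boot-time module commands from this guide in one place, the /etc/rc.local additions could look roughly like the sketch below. This is only an illustrative combination of the commands already given above; adjust it to your own setup:

    # load the NVIDIA driver in place of nouveau, then the PF_RING module
    /sbin/modprobe -r nouveau && /sbin/modprobe nvidia
    /sbin/modprobe pf_ring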
- -Suricata - -Download and install Suricata: - - wget http://www.openinfosecfoundation.org/download/suricata-1.1beta2.tar.gz - -And unpack it: - - tar -xvzf suricata-1.1beta2.tar.gz - -Change to the unpacked directory: - - cd suricata-1.1beta2 - -Now compile and install Suricata with PF_RING and CUDA support: - - ./configure --enable-gccprotect --enable-profiling --enable-cuda --with-cuda- - includes=/usr/local/cuda/include \ - --with-cuda-libraries=/usr/local/cuda/lib64 --enable-pfring --with-libpfring- - libraries=/usr/local/lib \ - --with-libpfring-includes=/usr/local/include --with-libpcap-libraries=/usr/ - local/lib --with-libpcap-includes=/usr/local/include - make - make install - -Continue with the Basic_Setup -Next, you need to edit max-pending-packets in your /etc/suricata/suricata.yaml. -If you don't have one, download a generic one to get started: - - cd /etc/suricata - wget https://rules.emergingthreatspro.com/open-nogpl/suricata/suricata- - open.yaml - -Edit your suricata-open.yaml file accordingly. -The number of packets allowed to be processed simultaneously can be whatever -you want but it is recommended that it be 4000 or more. -For example: - - max-pending-packets: 12288 - -Next make sure the following line is present in the multi pattern algorithm -section: - - mpm-algo: b2g_cuda - - -Rules - -Read the information in Rule_Management_with_Oinkmaster -Add rules to suricata: - - cd /etc/suricata - wget https://rules.emergingthreatspro.com/open-nogpl/suricata/ - emerging.rules.tar.gz - tar -xvzf emerging.rules.tar.gz - -Make sure your .yaml file includes the /etc/suricata/rules/emerging-*.rules -files (they may need to be uncommented). -Run Suricata as followed: - - cd /etc/suricata - /usr/local/bin/suricata -c /etc/suricata/suricata.yaml\ - --pfring-int=eth0 --pfring-cluster-id=99 --pfring-cluster-type=cluster_flow - - - touch /var/lock/subsys/local - - -References - -PF_RING -http://www.ntop.org/products/pf_ring/ diff --git a/doc/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt b/doc/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt deleted file mode 100644 index 1d1cd22000..0000000000 --- a/doc/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt +++ /dev/null @@ -1,280 +0,0 @@ -Autogenerated on 2012-01-11 -from - https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104 - - -Installation with CUDA and PF RING on Ubuntu server 11.04 - -THIS WOULD NOT WORK ON A VIRTUAL MACHINE! -This guide is written using: -Ubuntu Server 11.04 -Linux ubuntu 2.6.38-8-generic x86_64 GNU/Linux - -Pre installation requirements - - - apt-get update - apt-get upgrade - -To get the CUDA toolkit, enter: - - http://developer.nvidia.com/cuda-toolkit-40 - -Pick up the correct NVIDIA drivers for your card and system - - http://www.nvidia.com/Download/index.aspx?lang=en-us - -Go to your download directory -chmod the 2 *.run files that you just downloaded. 
-For example: - - chmod 655 cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - chmod 655 NVIDIA-Linux-x86_64-280.13.run - - - sudo apt-get -y install libpcre3 libpcre3-dbg libpcre3-dev \ - build-essential autoconf automake libtool libpcap-dev libnet1-dev \ - libyaml-0-2 libyaml-dev zlib1g zlib1g-dev libcap-ng-dev libcap-ng0 \ - make flex bison git - -Run the cuda toolkit installation package: - - sudo ./cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - -Close all windows and as you are logged in press: - - Ctr+Alt+F1 - -Log in with your credentials - - sudo -i - -And enter your password -Stop the x server: - - /etc/init.d/gdm stop - -Uninstall xserver video drivers: - - apt-get remove --purge xserver-xorg-video-nouveau - -Go to the directory where you downloaded nvidia/cuda drivers. -Run the NVIDIA*******.run: - - ./NVIDIA********.run - -Ok and yes your way out. -At some point it will ask you to make a special configuration file to disable a -"nouveau" -driver that the system is currently using - say yes! -Reboot: - - shutdown -r now - -After reboot log in as you would normally do through the GUI -Log in as you would normally. -Go to shell: - - Ctrl+Alt+F1 - -Type in your credentials and pass - - sudo -i - -Stop the xserver again: - - /etc/init.d/gdm stop - -Run the NVIDIA driver again. -This time it would finish and be successful.... -Reboot: - - shutdown -r now - -After start you would notice that the display has much better resolution - it -is a good thing. -Log in as you would normally. -Because the 11.04 Ubuntu comes with gcc version 4.5 by default, you need to -install gcc 4.4 since you must use 4.4 for the cuda compilation: - - apt-get install gcc-4.4 gcc-4.4-base g++-4.4 - -Then we switch and make ubuntu use the gcc 4.4 by default: - - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.5 40 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.5 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.4 - -Make sure that this is the case: - - sudo update-alternatives --config gcc - -"" - - update-alternatives --config gcc (as root) - -There are 2 choices for the alternative gcc (providing /usr/bin/gcc). - - - Selection Path Priority Status - ------------------------------------------------------------ - * 0 /usr/bin/gcc-4.4 60 auto mode - 1 /usr/bin/gcc-4.4 60 manual mode - 2 /usr/bin/gcc-4.5 40 manual mode - - Press enter to keep the current choice[*], or type selection number (as - root) - "" - - -PF_RING installation. 
- -Install pre-requisites: - - cd /opt - apt-get install subversion gobjc++-4.4-multilib gobjc++-4.4 - -Get the latest PF_RING: - - svn --force export https://svn.ntop.org/svn/ntop/trunk/PF_RING/ PF_RING - -Install PF_RING: - - cd /kernel - make && make install - sudo insmod ./pf_ring.ko - cd ../userland - make && make install - cd /lib - ./configure && make && make install - cd ../libpcap - ./configure && make && make install - cd ../examples - echo "options pf_ring transparent_mode=0 min_num_slots=32768 - enable_tx_capture=0" > /etc/modprobe.d/pf_ring.conf - -Check info: - - cat /proc/net/pf_ring/info - "" - cd ../kernel - cat /proc/net/pf_ring/info - PF_RING Version : 4.7.3 ($Revision: exported$) - Ring slots : 4096 - Slot version : 13 - Capture TX : Yes [RX+TX] - IP Defragment : No - Socket Mode : Standard - Transparent mode : Yes (mode 0) - Total rings : 0 - Total plugins : 0 - - "" - -Check functionality: - - ./pfcount -i eth0 - -You should see something even if you have no traffic at the moment: -"" -cd /opt/PF_RING/userland/examples -./pfcount -i eth0 -Using PF_RING v.4.7.3 -Capturing from eth0 [88:AE:1D:56:90:FA] - - 1. Device RX channels: 1 - 2. Polling threads: 1 ========================= - Absolute Stats: [0 pkts rcvd][0 pkts dropped] - Total Pkts=0/Dropped=0.0 % - 0 pkts - 0 bytes ========================= - -========================= -Absolute Stats: [0 pkts rcvd][0 pkts dropped] -Total Pkts=0/Dropped=0.0 % -0 pkts - 0 bytes [0.00 pkt/sec - 0.00 Mbit/sec] ========================= -Actual Stats: 0 pkts [1'000.32 ms][0.00 pkt/sec] ========================= -^CLeaving... ========================= -Absolute Stats: [0 pkts rcvd][0 pkts dropped] -Total Pkts=0/Dropped=0.0 % -0 pkts - 0 bytes [0.00 pkt/sec - 0.00 Mbit/sec] ========================= -Actual Stats: 0 pkts [629.37 ms][0.00 pkt/sec] ========================= - - cd /opt/PF_RING/userland/examples - -"" - -Suricata - -Go to directory of your choice and get Suricata: - - git clone git://phalanx.openinfosecfoundation.org/oisf.git - cd oisf/ - -Configure: - - ./autogen.sh - ./configure --enable-gccprotect --enable-profiling --enable-cuda --with-cuda- - includes=/usr/local/cuda/include \ - --with-cuda-libraries=/usr/local/cuda/lib64 --enable-pfring - -You should get at the end: -"" - - Suricata Configuration: - NFQueue support: no - IPFW support: no - PF_RING support: yes - Prelude support: no - Unit tests enabled: no - Debug output enabled: no - Debug validation enabled: no - CUDA enabled: yes - DAG enabled: no - Profiling enabled: yes - GCC Protect enabled: yes - GCC march native enabled: yes - GCC Profile enabled: no - Unified native time: no - Non-bundled htp: no - PCRE sljit: no - - -"" -Install: - - make && make install - ldconfig - -Verify: - - suricata --build-info - - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:622) (main) -- This is - Suricata version 1.1beta2 (rev b3f7e6a) - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:507) (SCPrintBuildInfo) - - - Features: PCAP_SET_BUFF LIBPCAP_VERSION_MAJOR=1 CUDA PF_RING LIBCAP_NG - LIBNET1.1 HAVE_HTP_URI_NORMALIZE_HOOK - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:521) (SCPrintBuildInfo) - - - 64-bits, Little-endian architecture - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:523) (SCPrintBuildInfo) - - - GCC version 4.4.5, C version 199901 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:529) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:532) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 - [1840] 13/8/2011 -- 
14:26:39 - (suricata.c:535) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:538) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:541) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:545) (SCPrintBuildInfo) - - - compiled with -fstack-protector - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:551) (SCPrintBuildInfo) - - - compiled with _FORTIFY_SOURCE=2 - -Run Suricata: - - suricata -c /etc/suricata/suricata.yaml\ - --pfring-int=eth0 --pfring-cluster-id=99 --pfring-cluster-type=cluster_flow - diff --git a/doc/Installation_with_CUDA_on_Scientific_Linux_6.txt b/doc/Installation_with_CUDA_on_Scientific_Linux_6.txt deleted file mode 100644 index 604ee8bcfc..0000000000 --- a/doc/Installation_with_CUDA_on_Scientific_Linux_6.txt +++ /dev/null @@ -1,95 +0,0 @@ -Autogenerated on 2012-11-29 -from - https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_on_Scientific_Linux_6 - - -Installation with CUDA on Scientific Linux 6 - -Hardware used: HP Proliant G7, 16 cores, 30 GB RAM, NVIDIA CUDA Quadro 4000 -graphics card -For setup you need to be root. Enter the following: - - mkdir /root/src - cd /root/src - - -Pre installation requirements - -Run the following command to ensure that you have everything you need for the -installation: - - yum install mpfr-2.4.1-6.el6.x86_64 cpp-4.4.4-13.el6.x86_64 ppl-0.10.2- - 11.el6.x86_64 \ - cloog-ppl-0.15.7-1.2.el6.x86_64 gcc-4.4.4-13.el6.x86_64 kernel-devel-2.6.32- - 131.2.1.el6.x86_64 \ - pcre-devel-7.8-3.1.el6.x86_64 libpcap-devel-1.0.0- - 6.20091201git117cb5.el6.x86_64 \ - yum-plugin-priorities-1.1.26-11.el6.noarch yum-conf-sl6x-1-1.noarch libyaml- - 0.1.3-1.el6.rf.x86_64 \ - libyaml-devel-0.1.3-1.el6.rf.x86_64 libnet-1.1.2.1-2.2.el6.rf.x86_64 flex- - 2.5.35-8.el6.x86_64 \ - bison-2.4.1-5.el6.x86_64 gcc-c++-4.4.4-13.el6.x86_64 - - -CUDA - -Download and install NVIDIA CUDA drivers: - - wget http://us.download.nvidia.com/XFree86/Linux-x86_64/270.41.19/NVIDIA- - Linux-x86_64-270.41.19.run - chmod +x NVIDIA-Linux-x86_64-270.41.19.run - ./NVIDIA-Linux-x86_64-270.41.19.run - -You also need to download and install the CUDA toolkit for RHEL6 : - - wget http://developer.download.nvidia.com/compute/cuda/4_0/toolkit/ - cudatoolkit_4.0.17_linux_64_rhel6.0.run - chmod +x cudatoolkit_4.0.17_linux_64_rhel6.0.run - ./cudatoolkit_4.0.17_linux_64_rhel6.0.run - -Make sure the kernel modules are loaded: - - /sbin/modprobe -r nouveau && /sbin/modprobe nvidia - -To ensure the proper NVIDIA CUDA modules get loaded on reboot, add the above -line to your /etc/rc.local file. 
- -Suricata - -Download and install Suricata: - - wget http://www.openinfosecfoundation.org/download/suricata-1.1beta2.tar.gz - -And unpack it: - - tar -xvzf suricata-1.1beta2.tar.gz - -Change to the unpacked directory: - - cd suricata-1.1beta2 - -Compile and install the engine with CUDA support: - - ./configure --enable-gccprotect --enable-profiling --enable-cuda \ - --with-cuda-includes=/usr/local/cuda/include --with-cuda-libraries=/usr/ - local/cuda/lib64/ - make - make install - - -Rules - -Read the information in Rule_Management_with_Oinkmaster -Add rules to suricata: - - cd /etc/suricata - wget https://rules.emergingthreatspro.com/open-nogpl/suricata/ - emerging.rules.tar.gz - tar -xvzf emerging.rules.tar.gz - -Make sure your .yaml file includes the /etc/suricata/rules/emerging-*.rules -files (they may need to be uncommented). -Run Suricata as followed: - - cd /etc/suricata - /usr/local/bin/suricata -c /etc/suricata/suricata.yaml -i eth0 - diff --git a/doc/Installation_with_CUDA_on_Ubuntu_server_1104.txt b/doc/Installation_with_CUDA_on_Ubuntu_server_1104.txt deleted file mode 100644 index 9c6c82fafa..0000000000 --- a/doc/Installation_with_CUDA_on_Ubuntu_server_1104.txt +++ /dev/null @@ -1,183 +0,0 @@ -Autogenerated on 2012-11-29 -from - https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_on_Ubuntu_server_1104 - - -Installation with CUDA on Ubuntu server 11.04 - -THIS WOULD NOT WORK ON A VIRTUAL MACHINE! -This guide is written using: -Ubuntu Server 11.04 -Linux ubuntu 2.6.38-8-generic x86_64 GNU/Linux - -Pre installation requirements - - - apt-get update - apt-get upgrade - -Get the CUDA toolkit - - http://developer.nvidia.com/cuda-toolkit-40 - -Pick up the correct NVIDIA drivers for your card and system - - http://www.nvidia.com/Download/index.aspx?lang=en-us - -Go to your download directory -and chmod the 2 *.run files that you just downloaded. -Example: - - chmod 655 cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - chmod 655 NVIDIA-Linux-x86_64-280.13.run - - - sudo apt-get -y install libpcre3 libpcre3-dbg libpcre3-dev \ - build-essential autoconf automake libtool libpcap-dev libnet1-dev \ - libyaml-0-2 libyaml-dev zlib1g zlib1g-dev libcap-ng-dev libcap-ng0 \ - make flex bison git - -Run the cuda toolkit installation package: - - sudo ./cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - -Close all windows and as you are logged in press: - - Ctr+Alt+F1 - -Log in with your credentials - - sudo -i - -And enter your password -Stop the x server: - - /etc/init.d/gdm stop - -Uninstall xserver video drivers: - - apt-get remove --purge xserver-xorg-video-nouveau - -Go to the directory where you downloaded nvidia/cuda drivers. - - Run the NVIDIA*******.run: - ./NVIDIA********.run - -Ok and yes your way out. -At some point it will ask you to make a special configuration file to disable a -"nouveau" -driver that the system is currently using and prevents the NVIDIA drivers to be -installed - say yes! -Reboot: - - shutdown -r now - -After reboot log in as you would normally through the GUI -Log in as you would normally. -Go to shell: - - Ctrl+Alt+F1 - -Type in your credentials and pass - - sudo -i - -Stop the xserver again: - - /etc/init.d/gdm stop - -Run the NVIDIA driver again. -This time it would finish and be successful.... -Reboot: - - shutdown -r now - -After start you would notice that the display has much better resolution - it -is a good thing. -Log in as you would normally. 
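Before moving on to the compiler setup below, it can help to confirm that the driver and the toolkit are actually usable. The check below is only a suggestion and assumes the toolkit was installed under its default /usr/local/cuda prefix; adjust the path if you picked a different location:

    # the nvidia kernel module should be loaded after the reboot
    lsmod | grep nvidia
    # the toolkit's nvcc compiler should report its release (4.0 for this toolkit)
    /usr/local/cuda/bin/nvcc --version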
-Because the 11.04 Ubuntu comes with gcc version 4.5 by default we need to -install gcc 4.4 since we must use 4.4 for the cuda compilation: - - apt-get install gcc-4.4 gcc-4.4-base g++-4.4 - -Then we switch and make ubuntu use the gcc 4.4 by default: - - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.5 40 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.5 - udo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.4 - -We make sure that this is the case: - - sudo update-alternatives --config gcc - -"" - - update-alternatives --config gcc (as root) - - -There are 2 choices for the alternative gcc (providing /usr/bin/gcc). - -* 0 /usr/bin/gcc-4.4 60 auto mode - 1 /usr/bin/gcc-4.4 60 manual mode - 2 /usr/bin/gcc-4.5 40 manual mode - - - Selection Path Priority Status - ------------------------------------------------------------ - -Press enter to keep the current choice[*], or type selection number: -"" - -Suricata - -Enter the following in your download directory: - - git clone git://phalanx.openinfosecfoundation.org/oisf.git - cd oisf/ - ./autogen.sh - ./configure --enable-gccprotect --enable-profiling --enable-cuda \ - --with-cuda-includes=/usr/local/cuda/include --with-cuda-libraries=/usr/ - local/cuda/lib64/ - -After that you should get the following result: -"" - - Suricata Configuration: - NFQueue support: no - IPFW support: no - PF_RING support: no - Prelude support: no - Unit tests enabled: no - Debug output enabled: no - Debug validation enabled: no - CUDA enabled: yes - DAG enabled: no - Profiling enabled: yes - GCC Protect enabled: yes - GCC march native enabled: yes - GCC Profile enabled: no - Unified native time: no - Non-bundled htp: no - PCRE sljit: no - "" - - - make && make install - ldconfig - -Proceed with Basic_Setup -After you start suricata , you should see cuda - - example : - "" - suricata -c suricata.yaml -i eth0 - [12406] 13/8/2011 -- 10:14:39 - (suricata.c:622) (main) -- This is - Suricata version 1.1beta2 (rev b3f7e6a) - [12406] 13/8/2011 -- 10:14:39 - (util-cpu.c:171) (UtilCpuPrintSummary) - -- CPUs/cores online: 8 - [12406] 13/8/2011 -- 10:14:39 - (util-cuda.c:4504) - (SCCudaPrintBasicDeviceInfo) -- GPU Device 1: GeForce 310M, 2 - Multiprocessors, 1468MHz, CUDA Compute Capability 1.2................... - ........................ 
-    ""
-
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 4589a9fe10..1e64e4c11d 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -18,12 +18,8 @@ Fedora_Core.txt \
 FreeBSD_8.txt \
 HTP_library_installation.txt \
 Installation_from_GIT_with_PF_RING_on_Ubuntu_server_1104.txt \
-Installation_with_CUDA_on_Ubuntu_server_1104.txt \
-Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt \
 Installation_with_PF_RING.txt \
-Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt \
 Installation_from_GIT_with_PCRE-JIT.txt \
-Installation_with_CUDA_on_Scientific_Linux_6.txt \
 Mac_OS_X_106x.txt \
 OpenBSD_Installation_from_GIT.txt \
 Setting_up_IPSinline_for_Linux.txt \
diff --git a/doc/userguide/configuration/suricata-yaml.rst b/doc/userguide/configuration/suricata-yaml.rst
index c7ecabb682..e79c3576ca 100644
--- a/doc/userguide/configuration/suricata-yaml.rst
+++ b/doc/userguide/configuration/suricata-yaml.rst
@@ -787,69 +787,6 @@ To let Suricata make these decisions set default to 'auto':

     default: auto

-CUDA (Compute United Device Architecture)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Suricata utilizes CUDA for offloading CPU intensive tasks to the
-(NVIDIA) GPU (graphics processing unit). Suricata supports an
-experimental multi-pattern-matcher using CUDA. Only if you have
-compiled Suricata with CUDA (by entering --enable-cuda in the
-configure stage) you can make use of these features. There are
-several options for CUDA. The option 'packet_buffer_limit' designates
-how many packets will be send to the GPU at the same time. Suricata
-sends packets in 'batches', meaning it sends multiple packets at
-once. As soon as Suricata has collected the amount of packets set in
-the 'packet_buffer_limit' option, it sends them to the GPU. The
-default amount of packets is 2400.
-
-The option 'packet_size_limit' makes sure that packets with payloads
-bigger than a certain amount of bytes will not be send to the
-GPU. Other packets will be send to the GPU. The default setting is
-1500 bytes.
-
-The option 'packet_buffers' designates the amount of buffers that will
-be filled with packets and will be processed. Buffers contain the
-batches of packets. During the time these filled buffers are being
-processed, new buffers will be filled.
-
-The option 'batching_timeout' can have all values higher than 0. If a
-buffers is not fully filled after a period of time (set in this option
-'batching_timeout'), the buffer will be send to the GPU anyway.
-
-The option 'page_locked' designates whether the page locked memory
-will or will not be used. The advantage of page locked memory is that
-it can not be swapped out to disk. You would not want your computer to
-use your hard disk for Suricata, because it lowers the performance a
-lot. In this option you can set whether you still want this for CUDA
-or not.
-
-The option 'device_id' is an option within CUDA to determine which GPU
-should be turned to account.(If there is only one GPU present at your
-computer, there is no benefit making use of the 'device-id' option.)
-To detect the id of your GPU's, enter the following in your command
-line:
-
-::
-
-  suricata --list-cuda-cards
-
-With the option 'cuda_streams' you can determine how many cuda-streams
-should be used for asynchronous processing. All values > 0 are
-valid. For this option you need a device with Compute Capability > 1.0
-and page_locked enabled to have any effect.
- -:: - - cuda: - -mpm: - packet_buffer_limit: 2400 - packet_size_limit: 1500 - packet_buffers: 10 - batching_timeout: 1 - page_locked: enabled - device_id: 0 - cuda_streams: 2 - Pattern matcher settings ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/Makefile.am b/src/Makefile.am index eab7b1c51c..cbbde7973a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -51,7 +51,6 @@ app-layer-tls-handshake.c app-layer-tls-handshake.h \ conf.c conf.h \ conf-yaml-loader.c conf-yaml-loader.h \ counters.c counters.h \ -data-queue.c data-queue.h \ decode.c decode.h \ decode-afl.c \ decode-erspan.c decode-erspan.h \ @@ -384,10 +383,6 @@ util-conf.c util-conf.h \ util-coredump-config.c util-coredump-config.h \ util-cpu.c util-cpu.h \ util-crypt.c util-crypt.h \ -util-cuda.c util-cuda.h \ -util-cuda-buffer.c util-cuda-buffer.h \ -util-cuda-handlers.c util-cuda-handlers.h \ -util-cuda-vars.c util-cuda-vars.h \ util-daemon.c util-daemon.h \ util-debug.c util-debug.h \ util-debug-filters.c util-debug-filters.h \ @@ -482,7 +477,7 @@ win32-misc.c win32-misc.h \ win32-service.c win32-service.h \ win32-syslog.h -EXTRA_DIST = util-mpm-ac-cuda-kernel.cu ptxdump.py tests +EXTRA_DIST = tests # set the include path found by configure AM_CPPFLAGS = $(all_includes) @@ -495,58 +490,6 @@ if HAVE_RUST suricata_DEPENDENCIES = $(RUST_SURICATA_LIB) endif -# Rules to build CUDA ptx modules -if BUILD_CUDA -BUILT_SOURCES = cuda-ptxdump.h - -suricata_CUDA_KERNELS = \ -util-mpm-ac-cuda-kernel.cu - -NVCCFLAGS=-O2 - -PTXS = $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_30) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_32) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_35) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_37) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_50) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_52) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_53) - -.cu.ptx_sm_20: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_20 -ptx $< - -.cu.ptx_sm_21: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $< - -.cu.ptx_sm_30: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_30 -ptx $< - -.cu.ptx_sm_32: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_32 -ptx $< - -.cu.ptx_sm_35: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_35 -ptx $< - -.cu.ptx_sm_37: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_37 -ptx $< - -.cu.ptx_sm_50: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_50 -ptx $< - -.cu.ptx_sm_52: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_52 -ptx $< - -.cu.ptx_sm_53: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_53 -ptx $< - -cuda-ptxdump.h: $(PTXS) - $(PYTHON) ptxdump.py cuda-ptxdump $(PTXS) - -CLEANFILES = $(PTXS) cuda-ptxdump.h -endif - # default CFLAGS AM_CFLAGS = ${OPTIMIZATION_CFLAGS} ${GCC_CFLAGS} ${CLANG_CFLAGS} \ ${SECCFLAGS} ${PCAP_CFLAGS} -DLOCAL_STATE_DIR=\"$(localstatedir)\" \ diff --git a/src/app-layer-detect-proto.c b/src/app-layer-detect-proto.c index 7b0f66ff67..92d8014c9b 100644 --- a/src/app-layer-detect-proto.c +++ b/src/app-layer-detect-proto.c @@ -60,7 +60,6 @@ #include "conf.h" #include "util-memcmp.h" #include "util-spm.h" -#include "util-cuda.h" #include "util-debug.h" #include "runmodes.h" @@ -1591,12 +1590,6 @@ int AppLayerProtoDetectSetup(void) uint16_t spm_matcher = SinglePatternMatchDefaultMatcher(); uint16_t mpm_matcher = PatternMatchDefaultMatcher(); -#ifdef __SC_CUDA_SUPPORT__ - /* CUDA won't work here, so fall back to AC */ - if (mpm_matcher == MPM_AC_CUDA) - mpm_matcher = mpm_default_matcher; -#endif - alpd_ctx.spm_global_thread_ctx = SpmInitGlobalThreadCtx(spm_matcher); if 
(alpd_ctx.spm_global_thread_ctx == NULL) { SCLogError(SC_ERR_FATAL, "Unable to alloc SpmGlobalThreadCtx."); diff --git a/src/data-queue.c b/src/data-queue.c deleted file mode 100644 index a3afd4acc9..0000000000 --- a/src/data-queue.c +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (c) 2009, 2010 Open Information Security Foundation. - * - * \author Anoop Saldanha - */ - -#include "suricata-common.h" -#include "data-queue.h" -#include "threads.h" - -/** - * \brief Enqueues data on the queue. - * - * \param q Pointer to the data queue. - * \param data Pointer to the data to be queued. It should be a pointer to a - * structure instance that implements the template structure - * struct SCDQGenericQData_ defined in data-queue.h. - */ -void SCDQDataEnqueue(SCDQDataQueue *q, SCDQGenericQData *data) -{ - /* we already have some data in queue */ - if (q->top != NULL) { - data->next = q->top; - q->top->prev = data; - q->top = data; - - /* the queue is empty */ - } else { - q->top = data; - q->bot = data; - } - - q->len++; - -#ifdef DBG_PERF - if (q->len > q->dbg_maxlen) - q->dbg_maxlen = q->len; -#endif /* DBG_PERF */ - - return; -} - -/** - * \brief Dequeues and returns an entry from the queue. - * - * \param q Pointer to the data queue. - * \param retval Pointer to the data that has been enqueued. The instance - * returned is/should be a pointer to a structure instance that - * implements the template structure struct SCDQGenericQData_ - * defined in data-queue.h. - */ -SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *q) -{ - SCDQGenericQData *data = NULL; - - /* if the queue is empty there are is no data left and we return NULL */ - if (q->len == 0) { - return NULL; - } - - /* If we are going to get the last packet, set len to 0 - * before doing anything else (to make the threads to follow - * the SCondWait as soon as possible) */ - q->len--; - - /* pull the bottom packet from the queue */ - data = q->bot; - -#ifdef OS_DARWIN - /* Weird issue in OS_DARWIN - * Sometimes it looks that two thread arrive here at the same time - * so the bot ptr is NULL */ - if (data == NULL) { - printf("No data to dequeue!\n"); - return NULL; - } -#endif /* OS_DARWIN */ - - /* more data in queue */ - if (q->bot->prev != NULL) { - q->bot = q->bot->prev; - q->bot->next = NULL; - /* just the one we remove, so now empty */ - } else { - q->top = NULL; - q->bot = NULL; - } - - data->next = NULL; - data->prev = NULL; - - return data; -} diff --git a/src/data-queue.h b/src/data-queue.h deleted file mode 100644 index f1f6bb3839..0000000000 --- a/src/data-queue.h +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Copyright (c) 2009, 2010 Open Information Security Foundation. - * - * \author Anoop Saldanha - * - * \file Generic queues. Any instance that wants to get itself on the generic - * queue, would have to implement the template struct SCDQGenericQData_ - * defined below. - */ - -#ifndef __DATA_QUEUE_H__ -#define __DATA_QUEUE_H__ - -#include "threads.h" - -/** - * \brief Generic template for any data structure that wants to be on the - * queue. Any other data structure that wants to be on the queue - * needs to use this template and define its own members from - * onwards. - */ -typedef struct SCDQGenericQData_ { - /* this is needed when we want to supply a list of data items */ - struct SCDQGenericQData_ *next; - struct SCDQGenericQData_ *prev; - /* if we want to consider this pointer as the head of a list, this var - * holds the no of elements in the list. Else it holds a . 
*/ - //uint16_t len; - /* in case this data instance is the head of a list, we can refer the - * bottomost instance directly using this var */ - //struct SCDQGenericaQData *bot; - - - /* any other data structure that wants to be on the queue can implement - * its own memebers from here on, in its structure definition. Just note - * that the first 2 members should always be next and prev in the same - * order */ - // -} SCDQGenericQData; - -/** - * \brief The data queue to hold instances that implement the template - * SCDQGenericQData. - */ -typedef struct SCDQDataQueue_ { - /* holds the item at the top of the queue */ - SCDQGenericQData *top; - /* holds the item at the bottom of the queue */ - SCDQGenericQData *bot; - /* no of items currently in the queue */ - uint16_t len; -#ifdef DBG_PERF - uint16_t dbg_maxlen; -#endif /* DBG_PERF */ - - SCMutex mutex_q; - SCCondT cond_q; - -} __attribute__((aligned(CLS))) SCDQDataQueue; - -void SCDQDataEnqueue(SCDQDataQueue *, SCDQGenericQData *); -SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *); - -#endif /* __DATA_QUEUE_H__ */ diff --git a/src/decode.c b/src/decode.c index 30dd8cee3a..e9e2f38be6 100644 --- a/src/decode.c +++ b/src/decode.c @@ -119,11 +119,6 @@ void PacketDecodeFinalize(ThreadVars *tv, DecodeThreadVars *dtv, Packet *p) } } } -#ifdef __SC_CUDA_SUPPORT__ - if (dtv->cuda_vars.mpm_is_cuda) - CudaBufferPacket(&dtv->cuda_vars, p); -#endif - } /** diff --git a/src/decode.h b/src/decode.h index 76f71eca50..0f0c6020c0 100644 --- a/src/decode.h +++ b/src/decode.h @@ -32,11 +32,6 @@ #include "decode-events.h" #include "flow-worker.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-buffer.h" -#include "util-cuda-vars.h" -#endif /* __SC_CUDA_SUPPORT__ */ - #ifdef HAVE_NAPATECH #include "util-napatech.h" #endif /* HAVE_NAPATECH */ @@ -596,9 +591,6 @@ typedef struct Packet_ #ifdef PROFILING PktProfiling *profile; #endif -#ifdef __SC_CUDA_SUPPORT__ - CudaPacketVars cuda_pkt_vars; -#endif #ifdef HAVE_NAPATECH NapatechPacketVars ntpv; #endif @@ -692,9 +684,6 @@ typedef struct DecodeThreadVars_ * flow recycle during lookups */ void *output_flow_thread_data; -#ifdef __SC_CUDA_SUPPORT__ - CudaThreadVars cuda_vars; -#endif } DecodeThreadVars; typedef struct CaptureStats_ { @@ -734,25 +723,11 @@ void CaptureStatsSetup(ThreadVars *tv, CaptureStats *s); /** * \brief Initialize a packet structure for use. */ -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-handlers.h" -#include "util-mpm.h" - -#define PACKET_INITIALIZE(p) do { \ - memset((p), 0x00, SIZE_OF_PACKET); \ - SCMutexInit(&(p)->tunnel_mutex, NULL); \ - PACKET_RESET_CHECKSUMS((p)); \ - (p)->livedev = NULL; \ - SCMutexInit(&(p)->cuda_pkt_vars.cuda_mutex, NULL); \ - SCCondInit(&(p)->cuda_pkt_vars.cuda_cond, NULL); \ - } while (0) -#else #define PACKET_INITIALIZE(p) { \ SCMutexInit(&(p)->tunnel_mutex, NULL); \ PACKET_RESET_CHECKSUMS((p)); \ (p)->livedev = NULL; \ } -#endif #define PACKET_RELEASE_REFS(p) do { \ FlowDeReference(&((p)->flow)); \ diff --git a/src/detect-engine-build.c b/src/detect-engine-build.c index d3e7bc40aa..31e0545f6f 100644 --- a/src/detect-engine-build.c +++ b/src/detect-engine-build.c @@ -1998,39 +1998,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) exit(EXIT_FAILURE); } -#ifdef __SC_CUDA_SUPPORT__ - if (de_ctx->sgh_mpm_context == ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE) { - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) { - /* setting it to default. 
You've gotta remove it once you fix the state table thing */ - SCACConstructBoth16and32StateTables(); - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_FATAL, "cuda context is NULL."); - exit(EXIT_FAILURE); - } - int r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_FATAL, "context push failed."); - exit(EXIT_FAILURE); - } - } - - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) { - int r = SCCudaCtxPopCurrent(NULL); - if (r < 0) { - SCLogError(SC_ERR_FATAL, "cuda context pop failure."); - exit(EXIT_FAILURE); - } - } - - /* too late to call this either ways. Should be called post ac goto. - * \todo Support this. */ - DetermineCudaStateTableSize(de_ctx); - } -#endif - int r = DetectMpmPrepareBuiltinMpms(de_ctx); r |= DetectMpmPrepareAppMpms(de_ctx); if (r != 0) { diff --git a/src/detect-engine-payload.c b/src/detect-engine-payload.c index a4e8f62eb3..313c897544 100644 --- a/src/detect-engine-payload.c +++ b/src/detect-engine-payload.c @@ -117,19 +117,9 @@ static void PrefilterPktPayload(DetectEngineThreadCtx *det_ctx, if (p->payload_len < mpm_ctx->minlen) SCReturn; -#ifdef __SC_CUDA_SUPPORT__ - if (p->cuda_pkt_vars.cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) { - (void)SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq); - } else { - (void)mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, - &det_ctx->mtc, &det_ctx->pmq, - p->payload, p->payload_len); - } -#else (void)mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, &det_ctx->mtc, &det_ctx->pmq, p->payload, p->payload_len); -#endif } int PrefilterPktPayloadRegister(SigGroupHead *sgh, MpmCtx *mpm_ctx) diff --git a/src/detect-engine-register.c b/src/detect-engine-register.c index 776ebe6fd7..788745a334 100644 --- a/src/detect-engine-register.c +++ b/src/detect-engine-register.c @@ -219,7 +219,6 @@ #include "util-unittest-helper.h" #include "util-debug.h" #include "util-hashlist.h" -#include "util-cuda.h" #include "util-privs.h" #include "util-profiling.h" #include "util-validate.h" diff --git a/src/detect-engine.c b/src/detect-engine.c index dad0c8e85a..652329ea7d 100644 --- a/src/detect-engine.c +++ b/src/detect-engine.c @@ -1263,11 +1263,7 @@ static int DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) #ifdef BUILD_HYPERSCAN de_ctx->mpm_matcher == MPM_HS || #endif -#ifdef __SC_CUDA_SUPPORT__ - de_ctx->mpm_matcher == MPM_AC_BS || de_ctx->mpm_matcher == MPM_AC_CUDA) { -#else de_ctx->mpm_matcher == MPM_AC_BS) { -#endif de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE; } else { de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL; @@ -1276,15 +1272,6 @@ static int DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) if (strcmp(sgh_mpm_context, "single") == 0) { de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE; } else if (strcmp(sgh_mpm_context, "full") == 0) { -#ifdef __SC_CUDA_SUPPORT__ - if (de_ctx->mpm_matcher == MPM_AC_CUDA) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You can't use " - "the cuda version of our mpm ac, i.e. \"ac-cuda\" " - "along with \"full\" \"sgh-mpm-context\". 
" - "Allowed values are \"single\" and \"auto\"."); - exit(EXIT_FAILURE); - } -#endif de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL; } else { SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You have supplied an " diff --git a/src/detect.c b/src/detect.c index e11ab0ad14..5a86e04c9a 100644 --- a/src/detect.c +++ b/src/detect.c @@ -902,10 +902,6 @@ next: PACKET_PROFILING_DETECT_END(p, PROF_DETECT_RULES); end: -#ifdef __SC_CUDA_SUPPORT__ - CudaReleasePacket(p); -#endif - /* see if we need to increment the inspect_id and reset the de_state */ if (has_state && AppLayerParserProtocolSupportsTxs(p->proto, alproto)) { PACKET_PROFILING_DETECT_START(p, PROF_DETECT_STATEFUL_UPDATE); diff --git a/src/ptxdump.py b/src/ptxdump.py deleted file mode 100644 index 097e517334..0000000000 --- a/src/ptxdump.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -from string import * -import os, getopt, sys, platform - -header = '''/* Auto-generated by ptxdump.py DO NOT EDIT -* -* This file contains the ptx code of the Cuda kernels. -* A kernel is identified by its name and the compute capability (e.g. _sm_10). -*/ -''' - -def FormatCharHex(d): - s = hex(ord(d)) - if len(s) == 3: - s = "0x0" + s[2] - return s - -def CleanFileName(f): - v = f.replace("-","_") - v = v.replace(".ptx","") - return v - -if not(len(sys.argv[1:]) >= 2): - print("Usage: ptx2c.py ") - print("Description: creates a header file containing the ptx files as character array" + os.linesep) - sys.exit(0) - -out_h = sys.argv[1] + ".h" -out = open(out_h, 'w') - -out.writelines(header) -out.writelines("#ifdef __SC_CUDA_SUPPORT__\n") -out.writelines("#ifndef __ptxdump_h__\n") -out.writelines("#define __ptxdump_h__\n\n") - -# write char arrays -for file in sys.argv[2:]: - in_ptx = open(file, 'r') - source = in_ptx.read() - source_len = len(source) - - varname = CleanFileName(file) - - out.writelines("const unsigned char " + varname + "[" + str(source_len+1) + "] = {\n") - newlinecnt = 0 - for i in range(0, source_len): - out.write(FormatCharHex(source[i]) + ", ") - newlinecnt += 1 - if newlinecnt == 16: - newlinecnt = 0 - out.write("\n") - out.write("0x00\n};\n\n") - - print(sys.argv[0] + ": CUmodule " + varname + " packed successfully") - -# write retrieval function -out.writelines("const unsigned char* SCCudaPtxDumpGetModule(const char* module){\n"); -for file in sys.argv[2:]: - out.writelines('\tif (!strcmp(module, "' + file.replace(".ptx","")+'"))\n') - out.writelines("\t\treturn " + CleanFileName(file)+";\n") -out.writelines('\tSCLogError(SC_ERR_FATAL, "Error in SCCudaPtxDumpGetModule, module %s not found. 
Exiting...",module);\n') -out.writelines("\texit(EXIT_FAILURE);\n") -out.writelines("};\n") - -out.writelines("#endif /* __ptxdump_h__ */\n") -out.writelines("#endif /* __SC_CUDA_SUPPORT__ */\n") - -print(sys.argv[0] + ": " + out_h + " written successfully") - -in_ptx.close() -out.close() diff --git a/src/runmode-unittests.c b/src/runmode-unittests.c index f89a9439ed..848e88d8d8 100644 --- a/src/runmode-unittests.c +++ b/src/runmode-unittests.c @@ -185,9 +185,6 @@ static void RegisterUnittests(void) SCClassConfRegisterTests(); SCThresholdConfRegisterTests(); SCRConfRegisterTests(); -#ifdef __SC_CUDA_SUPPORT__ - SCCudaRegisterTests(); -#endif PayloadRegisterTests(); DcePayloadRegisterTests(); UriRegisterTests(); @@ -218,9 +215,6 @@ static void RegisterUnittests(void) DetectPortTests(); SCAtomicRegisterTests(); MemrchrRegisterTests(); -#ifdef __SC_CUDA_SUPPORT__ - CudaBufferRegisterUnittests(); -#endif AppLayerUnittestsRegister(); MimeDecRegisterTests(); StreamingBufferRegisterTests(); @@ -251,9 +245,6 @@ void RunUnittests(int list_unittests, const char *regex_arg) default_packet_size = DEFAULT_PACKET_SIZE; /* load the pattern matchers */ MpmTableSetup(); -#ifdef __SC_CUDA_SUPPORT__ - MpmCudaEnvironmentSetup(); -#endif SpmTableSetup(); AppLayerSetup(); @@ -312,11 +303,6 @@ void RunUnittests(int list_unittests, const char *regex_arg) UtCleanup(); #ifdef BUILD_HYPERSCAN MpmHSGlobalCleanup(); -#endif -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - MpmCudaBufferDeSetup(); - CudaHandlerFreeProfiles(); #endif if (failed) { exit(EXIT_FAILURE); diff --git a/src/runmodes.c b/src/runmodes.c index 139a5a35d2..79e5f4d7e0 100644 --- a/src/runmodes.c +++ b/src/runmodes.c @@ -53,11 +53,6 @@ #include "flow-manager.h" #include "counters.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#endif - int debuglog_enabled = 0; /* Runmode Global Thread Names */ @@ -349,15 +344,6 @@ void RunModeDispatch(int runmode, const char *custom_mode) } } -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA && - strcasecmp(custom_mode, "autofp") != 0) { - SCLogError(SC_ERR_RUNMODE, "When using a cuda mpm, the only runmode we " - "support is autofp."); - exit(EXIT_FAILURE); - } -#endif - RunMode *mode = RunModeGetCustomMode(runmode, custom_mode); if (mode == NULL) { SCLogError(SC_ERR_RUNMODE, "The custom type \"%s\" doesn't exist " @@ -386,11 +372,6 @@ void RunModeDispatch(int runmode, const char *custom_mode) if (local_custom_mode != NULL) SCFree(local_custom_mode); -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - SCACCudaStartDispatcher(); -#endif - /* Check if the alloted queues have at least 1 reader and writer */ TmValidateQueueState(); diff --git a/src/runmodes.h b/src/runmodes.h index da0091e411..0329c638b2 100644 --- a/src/runmodes.h +++ b/src/runmodes.h @@ -43,7 +43,6 @@ enum RunModes { RUNMODE_USER_MAX, /* Last standard running mode */ RUNMODE_LIST_KEYWORDS, RUNMODE_LIST_APP_LAYERS, - RUNMODE_LIST_CUDA_CARDS, RUNMODE_LIST_RUNMODES, RUNMODE_PRINT_VERSION, RUNMODE_PRINT_BUILDINFO, diff --git a/src/source-af-packet.c b/src/source-af-packet.c index 3c1d000eb7..11a872ae4f 100644 --- a/src/source-af-packet.c +++ b/src/source-af-packet.c @@ -56,18 +56,6 @@ #include "source-af-packet.h" #include "runmodes.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include 
"detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - #ifdef HAVE_AF_PACKET #if HAVE_SYS_IOCTL_H @@ -2351,11 +2339,6 @@ TmEcode DecodeAFPThreadInit(ThreadVars *tv, const void *initdata, void **data) *data = (void *)dtv; -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - SCReturnInt(TM_ECODE_OK); } diff --git a/src/source-netmap.c b/src/source-netmap.c index cf0b443183..9521e7bede 100644 --- a/src/source-netmap.c +++ b/src/source-netmap.c @@ -54,18 +54,6 @@ #include "source-netmap.h" #include "runmodes.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - #ifdef HAVE_NETMAP #if HAVE_SYS_IOCTL_H @@ -990,11 +978,6 @@ static TmEcode DecodeNetmapThreadInit(ThreadVars *tv, const void *initdata, void *data = (void *)dtv; -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - SCReturnInt(TM_ECODE_OK); } diff --git a/src/source-pcap-file-helper.c b/src/source-pcap-file-helper.c index 40b753f52e..6a5c0bffed 100644 --- a/src/source-pcap-file-helper.c +++ b/src/source-pcap-file-helper.c @@ -27,18 +27,6 @@ #include "util-checksum.h" #include "util-profiling.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - extern int max_pending_packets; extern PcapFileGlobalVars pcap_g; diff --git a/src/source-pcap-file.c b/src/source-pcap-file.c index 1c408829ec..30c5483bde 100644 --- a/src/source-pcap-file.c +++ b/src/source-pcap-file.c @@ -30,18 +30,6 @@ #include "flow-manager.h" #include "util-checksum.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - extern int max_pending_packets; PcapFileGlobalVars pcap_g; @@ -424,11 +412,6 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, const void *initdata, void **da DecodeRegisterPerfCounters(dtv, tv); -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - *data = (void *)dtv; SCReturnInt(TM_ECODE_OK); diff --git a/src/source-pcap.c b/src/source-pcap.c index 84e9456735..ba84988879 100644 --- a/src/source-pcap.c +++ b/src/source-pcap.c @@ -42,18 +42,6 @@ #include "util-ioctl.h" #include "tmqh-packetpool.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - #define PCAP_STATE_DOWN 0 #define PCAP_STATE_UP 1 @@ -602,11 +590,6 @@ TmEcode DecodePcapThreadInit(ThreadVars *tv, const void *initdata, void **data) DecodeRegisterPerfCounters(dtv, tv); -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - *data = (void *)dtv; SCReturnInt(TM_ECODE_OK); diff --git a/src/suricata.c b/src/suricata.c index a9b7fc49a3..120b5fc2a4 100644 
--- a/src/suricata.c +++ b/src/suricata.c @@ -149,7 +149,6 @@ #include "runmodes.h" #include "runmode-unittests.h" -#include "util-cuda.h" #include "util-decode-asn1.h" #include "util-debug.h" #include "util-error.h" @@ -163,10 +162,6 @@ #include "tmqh-packetpool.h" #include "util-proto-name.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#endif #include "util-mpm-hs.h" #include "util-storage.h" #include "host-storage.h" @@ -321,30 +316,9 @@ uint8_t print_mem_flag = 1; #endif #endif -static void CreateLowercaseTable(void) -{ - /* create table for O(1) lowercase conversion lookup. It was removed, but - * we still need it for cuda. So resintalling it back into the codebase */ - int c = 0; - memset(g_u8_lowercasetable, 0x00, sizeof(g_u8_lowercasetable)); - for ( ; c < 256; c++) { - if (c >= 'A' && c <= 'Z') - g_u8_lowercasetable[c] = (c + ('a' - 'A')); - else - g_u8_lowercasetable[c] = c; - } -} - void GlobalsInitPreConfig(void) { -#ifdef __SC_CUDA_SUPPORT__ - /* Init the CUDA environment */ - SCCudaInitCudaEnvironment(); - CudaBufferInit(); -#endif - memset(trans_q, 0, sizeof(trans_q)); - memset(data_queues, 0, sizeof(data_queues)); /* Initialize the trans_q mutex */ int blah; @@ -352,9 +326,6 @@ void GlobalsInitPreConfig(void) for(blah=0;blah<256;blah++) { r |= SCMutexInit(&trans_q[blah].mutex_q, NULL); r |= SCCondInit(&trans_q[blah].cond_q, NULL); - - r |= SCMutexInit(&data_queues[blah].mutex_q, NULL); - r |= SCCondInit(&data_queues[blah].cond_q, NULL); } if (r != 0) { @@ -362,8 +333,6 @@ void GlobalsInitPreConfig(void) exit(EXIT_FAILURE); } - CreateLowercaseTable(); - TimeInit(); SupportFastPatternForSigMatchTypes(); } @@ -424,11 +393,6 @@ static void GlobalsDestroy(SCInstance *suri) MpmHSGlobalCleanup(); #endif -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - MpmCudaBufferDeSetup(); - CudaHandlerFreeProfiles(); -#endif ConfDeInit(); #ifdef HAVE_LUAJIT LuajitFreeStatesPool(); @@ -637,9 +601,6 @@ static void PrintUsage(const char *progname) #endif /* UNITTESTS */ printf("\t--list-app-layer-protos : list supported app layer protocols\n"); printf("\t--list-keywords[=all|csv|] : list keywords implemented by the engine\n"); -#ifdef __SC_CUDA_SUPPORT__ - printf("\t--list-cuda-cards : list cuda supported cards\n"); -#endif printf("\t--list-runmodes : list supported runmodes\n"); printf("\t--runmode : specific runmode modification the engine should run. 
The argument\n" "\t supplied should be the id for the runmode obtained by running\n" @@ -728,9 +689,6 @@ static void PrintBuildInfo(void) #ifdef HAVE_PCAP_SET_BUFF strlcat(features, "PCAP_SET_BUFF ", sizeof(features)); #endif -#ifdef __SC_CUDA_SUPPORT__ - strlcat(features, "CUDA ", sizeof(features)); -#endif #ifdef HAVE_PFRING strlcat(features, "PF_RING ", sizeof(features)); #endif @@ -1469,7 +1427,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) int dump_config = 0; int list_app_layer_protocols = 0; int list_unittests = 0; - int list_cuda_cards = 0; int list_runmodes = 0; int list_keywords = 0; int build_info = 0; @@ -1549,7 +1506,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) {"unittest-filter", required_argument, 0, 'U'}, {"list-app-layer-protos", 0, &list_app_layer_protocols, 1}, {"list-unittests", 0, &list_unittests, 1}, - {"list-cuda-cards", 0, &list_cuda_cards, 1}, {"list-runmodes", 0, &list_runmodes, 1}, {"list-keywords", optional_argument, &list_keywords, 1}, {"runmode", required_argument, NULL, 0}, @@ -1719,12 +1675,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) #else fprintf(stderr, "ERROR: Unit tests not enabled. Make sure to pass --enable-unittests to configure when building.\n"); return TM_ECODE_FAILED; -#endif /* UNITTESTS */ - } else if(strcmp((long_opts[option_index]).name, "list-cuda-cards") == 0) { -#ifndef __SC_CUDA_SUPPORT__ - fprintf(stderr, "ERROR: Cuda not enabled. Make sure to pass " - "--enable-cuda to configure when building.\n"); - return TM_ECODE_FAILED; #endif /* UNITTESTS */ } else if (strcmp((long_opts[option_index]).name, "list-runmodes") == 0) { suri->run_mode = RUNMODE_LIST_RUNMODES; @@ -2107,8 +2057,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) if (list_app_layer_protocols) suri->run_mode = RUNMODE_LIST_APP_LAYERS; - if (list_cuda_cards) - suri->run_mode = RUNMODE_LIST_CUDA_CARDS; if (list_keywords) suri->run_mode = RUNMODE_LIST_KEYWORDS; if (list_unittests) @@ -2343,10 +2291,6 @@ static int StartInternalRunMode(SCInstance *suri, int argc, char **argv) case RUNMODE_PRINT_USAGE: PrintUsage(argv[0]); return TM_ECODE_DONE; -#ifdef __SC_CUDA_SUPPORT__ - case RUNMODE_LIST_CUDA_CARDS: - return ListCudaCards(); -#endif case RUNMODE_LIST_RUNMODES: RunModeListRunmodes(); return TM_ECODE_DONE; @@ -2554,11 +2498,6 @@ static void PostConfLoadedDetectSetup(SCInstance *suri) exit(EXIT_FAILURE); } -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - CudaVarsSetDeCtx(de_ctx); -#endif /* __SC_CUDA_SUPPORT__ */ - if (!de_ctx->minimal) { if (LoadSignatures(de_ctx, suri) != TM_ECODE_OK) exit(EXIT_FAILURE); @@ -2593,9 +2532,6 @@ static int PostConfLoadedSetup(SCInstance *suri) /* load the pattern matchers */ MpmTableSetup(); -#ifdef __SC_CUDA_SUPPORT__ - MpmCudaEnvironmentSetup(); -#endif SpmTableSetup(); int disable_offloading; diff --git a/src/suricata.h b/src/suricata.h index 65b1df4638..4a5ffdafc8 100644 --- a/src/suricata.h +++ b/src/suricata.h @@ -66,7 +66,6 @@ #include "suricata-common.h" #include "packet-queue.h" -#include "data-queue.h" /* the name of our binary */ #define PROG_NAME "Suricata" @@ -131,8 +130,6 @@ enum { */ PacketQueue trans_q[256]; -SCDQDataQueue data_queues[256]; - typedef struct SCInstance_ { enum RunModes run_mode; @@ -173,16 +170,6 @@ void GlobalsInitPreConfig(void); extern volatile uint8_t suricata_ctl_flags; extern int g_disable_randomness; -/* uppercase to lowercase conversion lookup table */ 
-uint8_t g_u8_lowercasetable[256]; - -/* marco to do the actual lookup */ -//#define u8_tolower(c) g_u8_lowercasetable[(c)] -// these 2 are slower: -//#define u8_tolower(c) ((c) >= 'A' && (c) <= 'Z') ? g_u8_lowercasetable[(c)] : (c) -//#define u8_tolower(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) + ('a' - 'A')) : (c)) - -/* this is faster than the table lookup */ #include #define u8_tolower(c) tolower((uint8_t)(c)) diff --git a/src/tm-queues.c b/src/tm-queues.c index bb8c045476..f4102d8f21 100644 --- a/src/tm-queues.c +++ b/src/tm-queues.c @@ -44,8 +44,6 @@ Tmq *TmqCreateQueue(const char *name) goto error; q->id = tmq_id++; - /* for cuda purposes */ - q->q_type = 0; SCLogDebug("created queue \'%s\', %p", name, q); return q; diff --git a/src/tm-queues.h b/src/tm-queues.h index 32e1e5203c..502ef2cd56 100644 --- a/src/tm-queues.h +++ b/src/tm-queues.h @@ -29,8 +29,6 @@ typedef struct Tmq_ { uint16_t id; uint16_t reader_cnt; uint16_t writer_cnt; - /* 0 for packet-queue and 1 for data-queue */ - uint8_t q_type; } Tmq; Tmq* TmqCreateQueue(const char *name); diff --git a/src/tm-threads.c b/src/tm-threads.c index b71cc99fde..25fef956a8 100644 --- a/src/tm-threads.c +++ b/src/tm-threads.c @@ -1487,10 +1487,7 @@ static int TmThreadKillThread(ThreadVars *tv) } if (tv->inq != NULL) { for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { - if (tv->inq->q_type == 0) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - else - SCCondSignal(&data_queues[tv->inq->id].cond_q); + SCCondSignal(&trans_q[tv->inq->id].cond_q); } SCLogDebug("signalled tv->inq->id %" PRIu32 "", tv->inq->id); } @@ -1641,10 +1638,7 @@ again: if (tv->inq != NULL) { int i; for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { - if (tv->inq->q_type == 0) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - else - SCCondSignal(&data_queues[tv->inq->id].cond_q); + SCCondSignal(&trans_q[tv->inq->id].cond_q); } SCLogDebug("signalled tv->inq->id %" PRIu32 "", tv->inq->id); } @@ -1723,10 +1717,7 @@ again: if (tv->inq != NULL) { int i; for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { - if (tv->inq->q_type == 0) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - else - SCCondSignal(&data_queues[tv->inq->id].cond_q); + SCCondSignal(&trans_q[tv->inq->id].cond_q); } SCLogDebug("signalled tv->inq->id %" PRIu32 "", tv->inq->id); } diff --git a/src/tmqh-simple.c b/src/tmqh-simple.c index 3228aaf1b5..255406476d 100644 --- a/src/tmqh-simple.c +++ b/src/tmqh-simple.c @@ -92,65 +92,3 @@ void TmqhOutputSimple(ThreadVars *t, Packet *p) SCMutexUnlock(&q->mutex_q); } -/*******************************Generic-Q-Handlers*****************************/ - -/** - * \brief Public version of TmqhInputSimple from the tmqh-simple queue - * handler, except that it is a generic version that is directly - * tied to a "SCDQDataQueue" instance(sent as an arg). - * - * Retrieves a data_instance from the queue. If the queue is empty, it - * waits on the queue, till a data_instance is enqueued into the queue - * by some other module. - * - * All references to "data_instance" means a reference to a data structure - * instance that implements the template "struct SCDQGenericQData_". - * - * \param q The SCDQDataQueue instance to wait on. - * - * \retval p The returned packet from the queue. - * \retval data The returned data_instance from the queue. - */ -SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *q) -{ - SCMutexLock(&q->mutex_q); - if (q->len == 0) { - /* if we have no packets in queue, wait... 
*/ - SCCondWait(&q->cond_q, &q->mutex_q); - } - - if (q->len > 0) { - SCDQGenericQData *data = SCDQDataDequeue(q); - SCMutexUnlock(&q->mutex_q); - return data; - } else { - /* return NULL if we have no data in the queue. Should only happen - * on signals. */ - SCMutexUnlock(&q->mutex_q); - return NULL; - } -} - -/** - * \brief Public version of TmqhOutputSimple from the tmqh-simple queue - * handler, except that it is a generic version that is directly - * tied to a SCDQDataQueue instance(sent as an arg). - * - * Pumps out a data_instance into the queue. If the queue is empty, it - * waits on the queue, till a data_instance is enqueued into the queue. - * - * All references to "data_instance" means a reference to a data structure - * instance that implements the template "struct SCDQGenericQData_". - * - * \param q The SCDQDataQueue instance to pump the data into. - * \param data The data instance to be enqueued. - */ -void TmqhOutputSimpleOnQ(SCDQDataQueue *q, SCDQGenericQData *data) -{ - SCMutexLock(&q->mutex_q); - SCDQDataEnqueue(q, data); - SCCondSignal(&q->cond_q); - SCMutexUnlock(&q->mutex_q); - - return; -} diff --git a/src/tmqh-simple.h b/src/tmqh-simple.h index 1d4417b4e8..d80de50852 100644 --- a/src/tmqh-simple.h +++ b/src/tmqh-simple.h @@ -24,11 +24,6 @@ #ifndef __TMQH_SIMPLE_H__ #define __TMQH_SIMPLE_H__ -#include "data-queue.h" - -SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *); -void TmqhOutputSimpleOnQ(SCDQDataQueue *, SCDQGenericQData *); - void TmqhSimpleRegister (void); #endif /* __TMQH_SIMPLE_H__ */ diff --git a/src/util-cuda-buffer.c b/src/util-cuda-buffer.c deleted file mode 100644 index 54ae272a52..0000000000 --- a/src/util-cuda-buffer.c +++ /dev/null @@ -1,1358 +0,0 @@ -/* Copyright (C) 2007-2012 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - * - * API has be introduced to allow buffering of data by multiple writers - * asynronously. The current version only allows sequential reads. - * - * The API works by first registering a couple of buffers, which would - * be sliced and allocated for use by the API to potential writers. - * - * The registration API requires 3 buffers to be registered. The data - * buffer(d_buffer), into which the API buffers data, the pointer buffer - * (p_buffer), which would hold the pointer var instance corresponding to - * its entry in the d_buffer, and the offset buffer(o_buffer), which - * holds an offset entry for the data corresponding to the pointer buffer - * entry. - * - * A writer wishing to write data would be required to obtain a slice - * using CudaBufferGetSlice. Once data has been written to the slice, - * it can report back saying the slice has been written to by setting - * a flag in the slice - SC_ATOMIC_SET(slice->done, 1). 
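As a concrete illustration of the write/read cycle described here and in the paragraphs that follow, a minimal sketch reconstructed from this doc comment and from CudaBufferTest06 further down; the 64-entry buffers, the "abc" payload and the helper name are illustrative, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE is the transfer-size constant the unit tests use, and the snippet only builds against a pre-removal tree with CUDA support enabled:

    #include "suricata-common.h"
    #include "util-cuda-buffer.h"

    static void CudaBufferRoundTripSketch(void)
    {
        /* CudaBufferInit() is assumed to have run once at startup already */
        uint8_t d_buffer[64];     /* data buffer, length a multiple of 8 */
        uint32_t o_buffer[64];    /* offset buffer */
        void *p_buffer[64];       /* pointer buffer */

        CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, sizeof(d_buffer),
                                                   o_buffer, p_buffer, 64);
        if (cb == NULL)
            return;

        /* writer: reserve a slice, fill it, then flag it as done */
        CudaBufferSlice *slice = CudaBufferGetSlice(cb, 3, NULL);
        if (slice != NULL) {
            memcpy(slice->buffer + slice->start_offset, "abc", 3);
            SC_ATOMIC_SET(slice->done, 1);
        }

        /* reader: cull completed slices, consume them, then report back */
        CudaBufferCulledInfo culled;
        memset(&culled, 0, sizeof(culled));
        CudaBufferCullCompletedSlices(cb, &culled, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE);
        /* the data to consume is culled.d_buffer_len bytes of d_buffer
         * starting at culled.d_buffer_start_offset (culled.no_of_items items) */
        CudaBufferReportCulledConsumption(cb, &culled);

        CudaBufferDeRegister(cb);
    }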
- * - * A reader wishing to retrieve the data written by writers, will do - * so using the API call - CudaBufferCullCompletedSlices(). Once data - * has been consumed, the reader would report back using - * CudaBufferReportCulledConsumption() so that resources can be freed - * to be reallocated to other writers. - */ - - -#include "suricata-common.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "suricata.h" - -#include "util-atomic.h" -#include "util-pool.h" -#include "util-misc.h" -#include "util-error.h" -#include "util-debug.h" -#include "util-unittest.h" -#include "util-cuda-buffer.h" - -/* rotation limit for the buffers. This basically decides at what position - * inside alloced buffer should the API rotate and start using the buffer - * from the start - The right value's from 0.1-1.0. Do note that the - * rotation decision is taken when the culling process takes place. - * Have a look at - CudaBufferCullCompletedSlices */ -#define CUDA_BUFFER_BUFFER_ROTATION_LIMIT 0.75 - -/* The max buffer size that be registered to CudaBufferRegisterNew */ -#define CUDA_BUFFER_BUFFER_LIMIT (1 * 1024 * 1024 * 1024) - -/* 100,000 * 5 = 500,000 */ -#define CUDA_BUFFER_ITEM_LIMIT (100000 * 5) - -/* a million slices to be prealloced = 100,000 * 10 */ -#define CUDA_BUFFER_SLICE_POOL_PREALLOC (100000 * 10) - -/* we store all our slices here */ -static Pool *slice_pool = NULL; -/* mutex for the above slice pool */ -static SCMutex slice_pool_mutex; - -/** - * \brief Used by a consumer to report back(and thus have it freed), - * once it has consumed data returned in the CudaBufferCulledInfo - * instance(obtained from the call to CudaBufferCullCompletedSlices). - */ -void CudaBufferReportCulledConsumption(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info) -{ - SCMutexLock(&cb_data->m); - - if (culled_info->d_buffer_reset) { - cb_data->d_buffer_read = 0; - } else { - if (culled_info->no_of_items != 0) { - cb_data->d_buffer_read = culled_info->d_buffer_start_offset + - culled_info->d_buffer_len; - } - } - - if (culled_info->op_buffer_reset) { - cb_data->op_buffer_read = 0; - } else { - if (culled_info->no_of_items != 0) { - cb_data->op_buffer_read += culled_info->no_of_items; - } - } - - SCMutexUnlock(&cb_data->m); -} - -/** - * \brief Remove slices that are done. "Done" as in worker threads are done - * writing data to it. - * - * \param cb_data Pointer to the CudaBufferData instance. 
- */ -void CudaBufferCullCompletedSlices(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info, - uint32_t size_limit) -{ - culled_info->no_of_items = 0; - culled_info->d_buffer_reset = 0; - culled_info->op_buffer_reset = 0; - - SCMutexLock(&cb_data->m); - - int buffer_reset = 0; - uint32_t d_buffer_write_temp = 0; - uint32_t op_buffer_write_temp = 0; - - if ((cb_data->d_buffer_write >= - (cb_data->d_buffer_len * CUDA_BUFFER_BUFFER_ROTATION_LIMIT)) && - (cb_data->d_buffer_read != 0)) - { - SCLogDebug("d_buffer reset"); - d_buffer_write_temp = cb_data->d_buffer_write; - cb_data->d_buffer_write = 0; - buffer_reset = 1; - culled_info->d_buffer_reset = 1; - } - - /* reset op_buffer */ - if ((cb_data->op_buffer_write >= - (cb_data->op_buffer_len * CUDA_BUFFER_BUFFER_ROTATION_LIMIT)) && - (cb_data->op_buffer_read != 0)) - { - SCLogDebug("op_buffer reset"); - op_buffer_write_temp = cb_data->op_buffer_write; - cb_data->op_buffer_write = 0; - buffer_reset = 1; - culled_info->op_buffer_reset = 1; - } - - CudaBufferSlice *slice_temp = cb_data->slice_head; - CudaBufferSlice *max_culled_slice = NULL; - uint32_t curr_size = 0; - - while (slice_temp != NULL) { - if (!SC_ATOMIC_GET(slice_temp->done)) { - SCLogDebug("CudaBuffer waiting on an item to finish"); - if (buffer_reset) { - while (!SC_ATOMIC_GET(slice_temp->done)) - usleep(1); - } else { - break; - } - } - - if (curr_size + (slice_temp->end_offset - slice_temp->start_offset + 1) > size_limit) { - if (buffer_reset) { - cb_data->op_buffer_write = op_buffer_write_temp; - cb_data->d_buffer_write = d_buffer_write_temp; - culled_info->d_buffer_reset = 0; - culled_info->op_buffer_reset = 0; - } - break; - } - - max_culled_slice = slice_temp; - curr_size += (slice_temp->end_offset - slice_temp->start_offset + 1); - - slice_temp = slice_temp->next; - } - - CudaBufferSlice *slice_head = cb_data->slice_head; - - if (max_culled_slice != NULL) { - cb_data->slice_head = max_culled_slice->next; - if (max_culled_slice->next == NULL) { - cb_data->slice_tail = NULL; - } - max_culled_slice->next = NULL; - } else { - SCMutexUnlock(&cb_data->m); - return; - } - - culled_info->d_buffer_start_offset = slice_head->start_offset; - culled_info->d_buffer_len = (max_culled_slice->end_offset - - slice_head->start_offset + 1); - culled_info->op_buffer_start_offset = cb_data->op_buffer_read; - SCMutexUnlock(&cb_data->m); - - /* push out the used slices to the the slice_pool */ - SCMutexLock(&slice_pool_mutex); - slice_temp = slice_head; - while (slice_temp != max_culled_slice) { - CudaBufferSlice *tmp = slice_temp->next; - - PoolReturn(slice_pool, slice_temp); - culled_info->no_of_items++; - - slice_temp = tmp; - } - PoolReturn(slice_pool, slice_temp); - culled_info->no_of_items++; - SCMutexUnlock(&slice_pool_mutex); - - return; -} - -/** - * \internal - * \brief Adds a slice to the CudaBufferData slice list. - * - * We expect the CudaBufferData instance to be locked. - * - * \param cb_data Pointer to the CudaBufferdata instance. - * \param slice Pointer to the slice to be pushed. - */ -static inline void CudaBufferAppendSlice(CudaBufferData *cb_data, CudaBufferSlice *slice) -{ - slice->next = NULL; - - if (cb_data->slice_head == NULL) { - cb_data->slice_head = slice; - cb_data->slice_tail = slice; - } else { - cb_data->slice_tail->next = slice; - cb_data->slice_tail = slice; - } - - return; -} - -/** - * \brief Gets a new buffer slice for a consumer to write to. - * - * All slices returned are aligned to the next 8 byte boundary. 
- * - * \param cb_data Pointer to the CudaBufferdata instance. - * \param len Length of the slice required. - * \param p Pointer to the var corresponding to the data to store. - * - * \retval slice Pointer to the slice if successful; NULL if unsuccessful. - */ -CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void *p) -{ -#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) - - SCMutexLock(&slice_pool_mutex); - CudaBufferSlice *slice = PoolGet(slice_pool); - SCMutexUnlock(&slice_pool_mutex); - if (slice == NULL) { - return NULL; - } - - SCMutexLock(&cb_data->m); - - if (cb_data->d_buffer_write < cb_data->d_buffer_read) { - if (cb_data->d_buffer_write + len >= cb_data->d_buffer_read) { - SCLogDebug("d_buffer full"); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } else { - if (cb_data->d_buffer_write + len > cb_data->d_buffer_len) { - SCLogDebug("d_buffer limit hit - buffer_len - %"PRIu32, - cb_data->d_buffer_len); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } - - if (cb_data->op_buffer_write < cb_data->op_buffer_read) { - if (cb_data->op_buffer_write + 1 >= cb_data->op_buffer_read) { - SCLogDebug("op_buffer full"); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } else { - if (cb_data->op_buffer_write + 1 > cb_data->op_buffer_len) { - SCLogDebug("op_buffer limit hit - buffer_len - %"PRIu32, - cb_data->op_buffer_len); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } - - slice->start_offset = cb_data->d_buffer_write; - cb_data->d_buffer_write = slice->start_offset + len; - ALIGN_UP(cb_data->d_buffer_write, 8); - slice->end_offset = cb_data->d_buffer_write - 1; - slice->buffer = cb_data->d_buffer; - SC_ATOMIC_SET(slice->done, 0); - - CudaBufferAppendSlice(cb_data, slice); - cb_data->no_of_items++; - - cb_data->o_buffer[cb_data->op_buffer_write] = slice->start_offset; - cb_data->p_buffer[cb_data->op_buffer_write] = p; - cb_data->op_buffer_write++; - - SCMutexUnlock(&cb_data->m); - - return slice; -} - -void CudaBufferDeRegister(CudaBufferData *cb_data) -{ - CudaBufferSlice *slice_temp = cb_data->slice_head; - SCMutexLock(&slice_pool_mutex); - while (slice_temp != NULL) { - CudaBufferSlice *slice_temp_next = slice_temp->next; - PoolReturn(slice_pool, slice_temp); - slice_temp = slice_temp_next; - } - SCMutexUnlock(&slice_pool_mutex); - - SCMutexDestroy(&cb_data->m); - SCFree(cb_data); - - return; -} - -/** - * \brief Registers a new buffer to be handled by the CudaBuffer API. - * - * More on what this API does can be understood from the API - * docs at the start of this file. - * - * \param d_buffer The data buffer to work with. - * \param d_buffer_len Length of d_buffer. - * \param o_buffer The offset buffer. - * \param p_buffer The pointer buffer. - * \param op_buffer_no_of_items Length of o_buffer and p_buffer. Please - * note that both o_buffer and p_buffer - * should be of the same length. - * \param len Length of the buffer to be assigned. 
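The effect of the ALIGN_UP step in CudaBufferGetSlice() above is what the offsets in the unit tests below reflect: with an empty buffer, a 3-byte slice gets start_offset 0 and end_offset 7, and the next slice starts at offset 8, exactly the "one"/"two" layout CudaBufferTest06 checks. A small worked sketch of the arithmetic, with illustrative values:

    /* same macro as in CudaBufferGetSlice() above */
    #define ALIGN_UP(offset, alignment) \
        (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    uint32_t d_buffer_write = 0;

    /* first slice, len 3: start_offset = 0 */
    d_buffer_write += 3;           /* 3  */
    ALIGN_UP(d_buffer_write, 8);   /* 8, so end_offset = 8 - 1 = 7 */

    /* second slice, len 3: start_offset = 8 */
    d_buffer_write += 3;           /* 11 */
    ALIGN_UP(d_buffer_write, 8);   /* 16, so end_offset = 15 */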
- */ -CudaBufferData *CudaBufferRegisterNew(uint8_t *d_buffer, uint32_t d_buffer_len, - uint32_t *o_buffer, void **p_buffer, - uint32_t op_buffer_no_of_items) -{ - if (d_buffer_len > CUDA_BUFFER_BUFFER_LIMIT) { - SCLogError(SC_ERR_CUDA_BUFFER_ERROR, "Buffer max limit exceeded. We " - "accept a max limit of %u bytes", CUDA_BUFFER_BUFFER_LIMIT); - return NULL; - } - - if ((d_buffer_len % 8) != 0) { - SCLogError(SC_ERR_CUDA_BUFFER_ERROR, "Please specify a buffer length which " - "is a multiple of 8"); - return NULL; - } - - CudaBufferData *new = SCMalloc(sizeof(CudaBufferData)); - if (unlikely(new == NULL)) { - return NULL; - } - memset(new, 0, sizeof(CudaBufferData)); - - /* payload/data buffer and set its size */ - new->d_buffer = d_buffer; - new->d_buffer_len = d_buffer_len; - - /* offset buffer and set its size */ - new->o_buffer = o_buffer; - new->p_buffer = p_buffer; - /* common to the above 2 malloc'ed buffers */ - new->op_buffer_len = op_buffer_no_of_items; - - /* used to lock this new instance when it's used */ - SCMutexInit(&new->m, NULL); - - return new; -} - -static void *CudaBufferSlicePoolAlloc(void *null) -{ - void *ptr = SCMalloc(sizeof(CudaBufferSlice)); - if (unlikely(ptr == NULL)) - return NULL; - memset(ptr, 0, sizeof(CudaBufferSlice)); - - SC_ATOMIC_INIT(((CudaBufferSlice *)ptr)->done); - - return ptr; -} - -static int CudaBufferSlicePoolInit(void *data, void *init_data) -{ - SC_ATOMIC_INIT(((CudaBufferSlice *)data)->done); - - return 1; -} - -/* disabled to reflect the changes made in PoolInit */ -#if 0 -static void CudaBufferSlicePoolFree(void *data) -{ - SC_ATOMIC_DESTROY(((CudaBufferSlice *)data)->done); - SCFree(data); - - return; -} -#endif - -static void CudaBufferSlicePoolCleanup(void *data) -{ - SC_ATOMIC_DESTROY(((CudaBufferSlice *)data)->done); - - return; -} - -/** - * \brief Init the API. To be called only once at startup time. 
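Two usage constraints follow from the checks above: the registered data buffer must be no larger than CUDA_BUFFER_BUFFER_LIMIT and its length must be a multiple of 8, and because slices are drawn from the global slice_pool that CudaBufferInit() creates, the one-time init has to run before the first slice is requested. A minimal ordering sketch, with caller-allocated buffers and sizes that are purely illustrative:

    /* once, at engine startup, before any CudaBufferGetSlice() call */
    CudaBufferInit();

    /* d_buffer_len: a multiple of 8, at most CUDA_BUFFER_BUFFER_LIMIT;
     * d_buffer, o_buffer and p_buffer are allocated by the caller */
    CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, 4096,
                                               o_buffer, p_buffer, 512);
    if (cb == NULL) {
        /* rejected length or allocation failure */
    }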
- */ -void CudaBufferInit(void) -{ - SCMutexInit(&slice_pool_mutex, NULL); - - slice_pool = PoolInit(CUDA_BUFFER_SLICE_POOL_PREALLOC, - CUDA_BUFFER_SLICE_POOL_PREALLOC, - sizeof(CudaBufferSlice), - CudaBufferSlicePoolAlloc, - CudaBufferSlicePoolInit, - NULL, - CudaBufferSlicePoolCleanup, - NULL); - if (slice_pool == NULL) { - SCLogError(SC_ERR_POOL_INIT, "CudaBuffer slice_pool is not initialized"); - exit(EXIT_FAILURE); - } - - return; -} - -/****************************Unittests***************************/ - -#ifdef UNITTESTS - -int CudaBufferTest01(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice4, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - /* new slice */ - slice1 = CudaBufferGetSlice(data, 8, NULL); - if (slice1->start_offset != 0 || slice1->end_offset != 7 || - SC_ATOMIC_GET(slice1->done) != 0) { - printf("failure 1\n"); - goto end; - } - if (data->d_buffer_write != 8 || data->d_buffer_read != 0 || - data->op_buffer_write != 1 || data->op_buffer_read != 0 || - data->no_of_items != 1) { - printf("failure 2\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 3\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 4\n"); - goto end; - } - - /* new slice */ - slice2 = CudaBufferGetSlice(data, 16, NULL); - if (slice2->start_offset != 8 || slice2->end_offset != 23 || - SC_ATOMIC_GET(slice2->done) != 0) { - printf("failure 5\n"); - goto end; - } - if (data->d_buffer_write != 24 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 6\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 7\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 8\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 9\n"); - goto end; - } - - /* new slice */ - slice3 = CudaBufferGetSlice(data, 36, NULL); - if (slice3->start_offset != 24 || slice3->end_offset != 63 || - SC_ATOMIC_GET(slice3->done) != 0) { - printf("failure 10\n"); - goto end; - } - if (data->d_buffer_write != 64 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 11\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 12\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 13\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 63 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - 
printf("failure 14\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 15\n"); - goto end; - } - - slice4 = CudaBufferGetSlice(data, 10, NULL); - if (slice4 != NULL) { - printf("failure 16\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 17\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest02(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice1 = CudaBufferGetSlice(data, 8, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - if (data->d_buffer_write != 24 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 1\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 2\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 3\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 4\n"); - goto end; - } - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 5\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 6\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 7\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 8\n"); - goto end; - } - - SC_ATOMIC_SET(slice2->done, 1); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 9\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 10\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 1) { - printf("failure 11\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 12\n"); - goto end; - } - - SC_ATOMIC_SET(slice1->done, 1); - - CudaBufferCullCompletedSlices(data, 
&culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 2) { - printf("failure 13\n"); - goto end; - } - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 14\n"); - goto end; - } - if (culled_info.d_buffer_start_offset != 0 || - culled_info.d_buffer_len != 24 || - culled_info.op_buffer_start_offset != 0 || - culled_info.d_buffer_reset != 0 || culled_info.op_buffer_reset != 0) { - printf("failure 15\n"); - goto end; - } - if (data->d_buffer_write != 24 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 16\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 24 || data->d_buffer_read != 24 || - data->op_buffer_write != 2 || data->op_buffer_read != 2 || - data->no_of_items != 2) { - printf("failure 17\n"); - goto end; - } - - /* new slice */ - slice3 = CudaBufferGetSlice(data, 8, NULL); - if (slice3->start_offset != 24 || slice3->end_offset != 31 || - SC_ATOMIC_GET(slice3->done) != 0) { - printf("failure 18\n"); - goto end; - } - if (data->d_buffer_write != 32 || data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 19\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 20\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 21\n"); - goto end; - } - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 22\n"); - goto end; - } - if (data->d_buffer_write != 32 || data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 23\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 24\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 25\n"); - goto end; - } - - /* set done flag */ - SC_ATOMIC_SET(slice3->done, 1); - if (slice3->start_offset != 24 || slice3->end_offset != 31 || - SC_ATOMIC_GET(slice3->done) != 1) { - printf("failure 26\n"); - goto end; - } - if (data->d_buffer_write != 32 || data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 27\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 1) { - printf("failure 28\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 29\n"); - goto end; - } - - /* culling */ - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 1) { - printf("failure 30\n"); - goto end; - } - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 31\n"); - goto end; - } - if (culled_info.d_buffer_start_offset != 24 || - culled_info.d_buffer_len != 8 || - culled_info.op_buffer_start_offset != 2 || - culled_info.d_buffer_reset != 0 || culled_info.op_buffer_reset != 0) { - printf("failure 32\n"); - goto end; - } - if (data->d_buffer_write != 32 || 
data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 33\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 32 || data->d_buffer_read != 32 || - data->op_buffer_write != 3 || data->op_buffer_read != 3 || - data->no_of_items != 3) { - printf("failure 34\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 35\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest03(void) -{ - CudaBufferSlice *slice, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice = CudaBufferGetSlice(data, 16, NULL); - BUG_ON(slice == NULL); - slice = CudaBufferGetSlice(data, 16, NULL); - BUG_ON(slice == NULL); - slice = CudaBufferGetSlice(data, 24, NULL); - BUG_ON(slice == NULL); - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 1\n"); - goto end; - } - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 2\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 15 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 3\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 16 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 4\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 32 || slice_temp->end_offset != 55 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 5\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 6\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 7\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest04(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer 
== NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice1 = CudaBufferGetSlice(data, 16, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - slice3 = CudaBufferGetSlice(data, 24, NULL); - - SC_ATOMIC_SET(slice1->done, 1); - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 1\n"); - goto end; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 1) { - printf("failure 2\n"); - goto end; - } - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 3\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 56 || data->d_buffer_read != 16 || - data->op_buffer_write != 3 || data->op_buffer_read != 1 || - data->no_of_items != 3) { - printf("failure 4\n"); - goto end; - } - - SC_ATOMIC_SET(slice2->done, 1); - SC_ATOMIC_SET(slice3->done, 1); - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 2) { - printf("failure 5\n"); - goto end; - } - if (data->d_buffer_write != 0 || data->d_buffer_read != 16 || - data->op_buffer_write != 3 || data->op_buffer_read != 1 || - data->no_of_items != 3) { - printf("failure 6\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 0 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 3 || - data->no_of_items != 3) { - printf("failure 7\n"); - goto end; - } - - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 8\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 9\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest05(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice1 = CudaBufferGetSlice(data, 16, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - slice3 = CudaBufferGetSlice(data, 24, NULL); - - 
SC_ATOMIC_SET(slice1->done, 1); - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - CudaBufferReportCulledConsumption(data, &culled_info); - - SC_ATOMIC_SET(slice2->done, 1); - SC_ATOMIC_SET(slice3->done, 1); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - CudaBufferReportCulledConsumption(data, &culled_info); - slice1 = CudaBufferGetSlice(data, 16, NULL); - if (slice1 == NULL) { - printf("failure 1\n"); - goto end; - } - slice2 = CudaBufferGetSlice(data, 16, NULL); - if (slice2 == NULL) { - printf("failure 2\n"); - goto end; - } - slice3 = CudaBufferGetSlice(data, 24, NULL); - if (slice2 == NULL) { - printf("failure 3\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 4\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest06(void) -{ - CudaBufferSlice *slice, *slice_temp; - int result = 0; - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "one", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "two", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (data->d_buffer_write != 16 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 1\n"); - goto end; - } - - slice = CudaBufferGetSlice(data, 5, NULL); - memcpy(slice->buffer + slice->start_offset, - "three", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 4, NULL); - memcpy(slice->buffer + slice->start_offset, - "four", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 4, NULL); - memcpy(slice->buffer + slice->start_offset, - "five", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (data->d_buffer_write != 40 || data->d_buffer_read != 0 || - data->op_buffer_write != 5 || data->op_buffer_read != 0 || - data->no_of_items != 5) { - printf("failure 2\n"); - goto end; - } - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "six", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 5, NULL); - memcpy(slice->buffer + slice->start_offset, - "seven", slice->end_offset - 
slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (memcmp(data->d_buffer, "one", 3) != 0 || - memcmp(data->d_buffer + 8, "two", 3) != 0 || - memcmp(data->d_buffer + 16, "three", 5) != 0 || - memcmp(data->d_buffer + 24, "four", 4) != 0 || - memcmp(data->d_buffer + 32, "five", 4) != 0 || - memcmp(data->d_buffer + 40, "six", 3) != 0 || - memcmp(data->d_buffer + 48, "seven", 5) != 0) { - printf("failure 3\n"); - goto end; - } - - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 7 || data->op_buffer_read != 0 || - data->no_of_items != 7) { - printf("failure 4\n"); - goto end; - } - - /* culling */ - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 7 || data->op_buffer_read != 0 || - data->no_of_items != 7) { - printf("failure 5\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 56 || data->d_buffer_read != 56 || - data->op_buffer_write != 7 || data->op_buffer_read != 7 || - data->no_of_items != 7) { - printf("failure 6\n"); - goto end; - } - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->d_buffer_write != 0 || data->d_buffer_read != 56 || - data->op_buffer_write != 7 || data->op_buffer_read != 7 || - data->no_of_items != 7) { - printf("failure 7\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - - if (data->d_buffer_write != 0 || data->d_buffer_read != 0 || - data->op_buffer_write != 7 || data->op_buffer_read != 7 || - data->no_of_items != 7) { - printf("failure 8\n"); - goto end; - } - - slice = CudaBufferGetSlice(data, 5, NULL); - memcpy(slice->buffer + slice->start_offset, - "eight", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 4, NULL); - memcpy(slice->buffer + slice->start_offset, - "nine", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "ten", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 6, NULL); - memcpy(slice->buffer + slice->start_offset, - "eleven", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 6, NULL); - memcpy(slice->buffer + slice->start_offset, - "twelve", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (data->d_buffer_write != 40 || data->d_buffer_read != 0 || - data->op_buffer_write != 12 || data->op_buffer_read != 7 || - data->no_of_items != 12) { - printf("failure 9\n"); - goto end; - } - - if (memcmp(data->d_buffer, "eight", 5) != 0 || - memcmp(data->d_buffer + 8, "nine", 4) != 0 || - memcmp(data->d_buffer + 16, "ten", 3) != 0 || - memcmp(data->d_buffer + 24, "eleven", 6) != 0 || - memcmp(data->d_buffer + 32, "twelve", 6) != 0) { - printf("failure 10\n"); - goto end; - } - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->d_buffer_write != 40 || data->d_buffer_read != 0 || - data->op_buffer_write != 12 || data->op_buffer_read != 7 || - data->no_of_items != 12) { - printf("failure 11\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - - if (data->d_buffer_write != 40 || data->d_buffer_read != 40 || - 
data->op_buffer_write != 12 || data->op_buffer_read != 12 || - data->no_of_items != 12) { - printf("failure 12\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 13\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -#endif /* #ifdef UNITTESTS */ - -void CudaBufferRegisterUnittests(void) -{ -#ifdef UNITTESTS - UtRegisterTest("CudaBufferTest01", CudaBufferTest01); - UtRegisterTest("CudaBufferTest02", CudaBufferTest02); - UtRegisterTest("CudaBufferTest03", CudaBufferTest03); - UtRegisterTest("CudaBufferTest04", CudaBufferTest04); - UtRegisterTest("CudaBufferTest05", CudaBufferTest05); - UtRegisterTest("CudaBufferTest06", CudaBufferTest06); -#endif - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda-buffer.h b/src/util-cuda-buffer.h deleted file mode 100644 index ab494e6755..0000000000 --- a/src/util-cuda-buffer.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (C) 2007-2013 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file API to allow buffering of data. - * - * Introduced with cuda as the primary objective. Allows multiple - * threads to simultaneously access a single buffer and write to it. - * - * Current version allows only serial reads from the buffer. - * When the need arises, the API will be updated to allow multiple - * non-sequential reads. - * - * \author Anoop Saldanha - */ - -#ifdef __SC_CUDA_SUPPORT__ - -#ifndef __UTIL_CUDA_BUFFER_H__ -#define __UTIL_CUDA_BUFFER_H__ - -#include "util-atomic.h" - -/** - * \brief Used by consumers to retrieve the data buffered. - */ -typedef struct CudaBufferCulledInfo_ { - uint32_t no_of_items; - - uint32_t d_buffer_start_offset; - uint32_t d_buffer_len; - - /* we use no_of_items to determine the no of items here */ - uint32_t op_buffer_start_offset; - - uint8_t d_buffer_reset; - uint8_t op_buffer_reset; -} CudaBufferCulledInfo; - -/** - * /brief A slice which contains details on where to buffer data by a - * writer. - */ -typedef struct CudaBufferSlice_ { - uint32_t start_offset; - uint32_t end_offset; - uint8_t *buffer; - SC_ATOMIC_DECLARE(uint8_t, done); - - struct CudaBufferSlice_ *next; -} CudaBufferSlice; - -typedef struct CudaBufferData_ { - /* the data buffer */ - uint8_t *d_buffer; - uint32_t d_buffer_len; - uint32_t d_buffer_write; - uint32_t d_buffer_read; - - /* debug only. 
Can be removed */ - uint32_t no_of_items; - - /* these 2 buffers below - o_buffer and p_buffer should be - * used/updated in tandem - * p_buffer is the ptr buffer that points to a data instance that - * represents it's corresponding data stored in d_buffer. - * o_buffer is the corresponding entry to the one in p_buffer, which - * holds the offset to the corresponding entry in d_buffer. */ - uint32_t *o_buffer; - void **p_buffer; - uint32_t op_buffer_len; - uint32_t op_buffer_write; - uint32_t op_buffer_read; - - /* slice lists used by writers */ - CudaBufferSlice *slice_head; - CudaBufferSlice *slice_tail; - - /* mutex used by the entire struct */ - SCMutex m; -} CudaBufferData; - -void CudaBufferReportCulledConsumption(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info); -void CudaBufferCullCompletedSlices(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info, uint32_t size_limit); -CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *data, uint32_t len, void *p); -void CudaBufferDeRegister(CudaBufferData *cb_data); -CudaBufferData *CudaBufferRegisterNew(uint8_t *d_buffer, uint32_t d_buffer_len, - uint32_t *o_buffer, void **p_buffer, - uint32_t op_buffer_no_of_items); -void CudaBufferInit(void); -void CudaBufferRegisterUnittests(void); - -#endif /* __UTIL_CUDA_BUFFER_H__ */ - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c deleted file mode 100644 index 198c6f73b9..0000000000 --- a/src/util-cuda-handlers.c +++ /dev/null @@ -1,364 +0,0 @@ -/* Copyright (C) 2007-2013 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -/* compile in, only if we have a CUDA enabled device on the machine, with the - * toolkit and the driver installed */ - -#include "suricata-common.h" - -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-error.h" -#include "util-debug.h" -#include "conf.h" -#include "util-cuda.h" -#include "util-cuda-handlers.h" - -/* file only exists if cuda is enabled */ -#include "cuda-ptxdump.h" - -/************************conf file profile section**********************/ - -typedef struct CudaHandlerConfProfile_ { - char *name; - void *ctx; - void (*Free)(void *); - - struct CudaHandlerConfProfile_ *next; -} CudaHandlerConfProfile; - -static CudaHandlerConfProfile *conf_profiles = NULL; -/* protects above var */ -static SCMutex mutex = SCMUTEX_INITIALIZER; - -void CudaHandlerAddCudaProfileFromConf(const char *name, - void *(*Callback)(ConfNode *node), - void (*Free)(void *)) -{ - /* we don't do data validation */ - SCMutexLock(&mutex); - - CudaHandlerConfProfile *tmp_cp = conf_profiles; - while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0) - tmp_cp = tmp_cp->next; - - if (tmp_cp != NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENT, "We already have a cuda conf " - "profile by the name \"%s\" registered.", name); - exit(EXIT_FAILURE); - } - - char tmp[200]; - int r = snprintf(tmp, sizeof(tmp), "%s%s", "cuda.", name); - if (r < 0) { - SCLogError(SC_ERR_FATAL, "snprintf failure."); - exit(EXIT_FAILURE); - } else if (r > (int)sizeof(tmp)) { - SCLogError(SC_ERR_FATAL, "buffer not big enough to write param."); - exit(EXIT_FAILURE); - } - void *ctx = Callback(ConfGetNode(tmp)); - if (ctx == NULL) { - SCMutexUnlock(&mutex); - return; - } - - CudaHandlerConfProfile *new_cp = SCMalloc(sizeof(CudaHandlerConfProfile)); - if (unlikely(new_cp == NULL)) - exit(EXIT_FAILURE); - memset(new_cp, 0, sizeof(CudaHandlerConfProfile)); - new_cp->name = SCStrdup(name); - if (new_cp->name == NULL) - exit(EXIT_FAILURE); - new_cp->ctx = ctx; - new_cp->Free = Free; - - if (conf_profiles == NULL) { - conf_profiles = new_cp; - } else { - new_cp->next = conf_profiles; - conf_profiles = new_cp; - } - - SCMutexUnlock(&mutex); - return; -} - -void *CudaHandlerGetCudaProfile(const char *name) -{ - SCMutexLock(&mutex); - - CudaHandlerConfProfile *tmp_cp = conf_profiles; - while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0) - tmp_cp = tmp_cp->next; - - if (tmp_cp == NULL) { - SCMutexUnlock(&mutex); - return NULL; - } - - SCMutexUnlock(&mutex); - return tmp_cp->ctx; -} - -void CudaHandlerFreeProfiles(void) -{ - SCMutexLock(&mutex); - - CudaHandlerConfProfile *tmp = conf_profiles; - while (tmp != NULL) { - CudaHandlerConfProfile *curr = tmp; - tmp = tmp->next; - SCFree(curr->name); - if (curr->Free != NULL) - curr->Free(curr->ctx); - SCFree(curr); - } - - SCMutexUnlock(&mutex); - return; -} - -/*******************cuda context related data section*******************/ - -/* we use a concept where every device on the gpu has only 1 context. If - * a section in the engine wants to use a device and tries to open a context - * on it, we first check if a context is already created for the device and if - * so we return it. 
If not we create a new one and update with the entry */ - -static CUcontext *cuda_contexts = NULL; -static int no_of_cuda_contexts = 0; - -typedef struct CudaHandlerModuleData_ { - char *name; - void *data; - - struct CudaHandlerModuleData_ *next; -} CudaHandlerModuleData; - -typedef struct CudaHandlerModule_ { - char *name; - - /* the context used by this module */ - CUcontext context; - /* the device on which the above context was created */ - int device_id; - CudaHandlerModuleData *module_data; - - struct CudaHandlerModule_ *next; -} CudaHandlerModule; - -static CudaHandlerModule *cudahl_modules = NULL; - -CUcontext CudaHandlerModuleGetContext(const char *name, int device_id) -{ - void *ptmp; - SCMutexLock(&mutex); - - CudaHandlerModule *module = cudahl_modules; - while (module != NULL && strcasecmp(module->name, name) != 0) - module = module->next; - if (module != NULL) { - if (module->device_id != device_id) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module already " - "registered, but the new device_id is different " - "from the already registered device_id."); - exit(EXIT_FAILURE); - } - SCMutexUnlock(&mutex); - return module->context; - } - - CudaHandlerModule *new_module = SCMalloc(sizeof(CudaHandlerModule)); - if (unlikely(new_module == NULL)) - exit(EXIT_FAILURE); - memset(new_module, 0, sizeof(CudaHandlerModule)); - new_module->device_id = device_id; - new_module->name = SCStrdup(name); - if (new_module->name == NULL) - exit(EXIT_FAILURE); - if (cudahl_modules == NULL) { - cudahl_modules = new_module; - } else { - new_module->next = cudahl_modules; - cudahl_modules = new_module; - } - - if (no_of_cuda_contexts <= device_id) { - ptmp = SCRealloc(cuda_contexts, sizeof(CUcontext) * (device_id + 1)); - if (unlikely(ptmp == NULL)) { - SCFree(cuda_contexts); - cuda_contexts = NULL; - exit(EXIT_FAILURE); - } - cuda_contexts = ptmp; - - memset(cuda_contexts + no_of_cuda_contexts, 0, - sizeof(CUcontext) * ((device_id + 1) - no_of_cuda_contexts)); - no_of_cuda_contexts = device_id + 1; - } - - if (cuda_contexts[device_id] == 0) { - SCCudaDevices *devices = SCCudaGetDeviceList(); - if (SCCudaCtxCreate(&cuda_contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, - devices->devices[device_id]->device) == -1) { - SCLogDebug("ctxcreate failure."); - exit(EXIT_FAILURE); - } - } - new_module->context = cuda_contexts[device_id]; - - SCMutexUnlock(&mutex); - return cuda_contexts[device_id]; -} - -void CudaHandlerModuleStoreData(const char *module_name, - const char *data_name, void *data_ptr) -{ - SCMutexLock(&mutex); - - CudaHandlerModule *module = cudahl_modules; - while (module != NULL && strcasecmp(module->name, module_name) != 0) - module = module->next; - if (module == NULL) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data " - "\"%s\" from module \"%s\" that hasn't been registered " - "yet.", module_name, data_name); - exit(EXIT_FAILURE); - } - - CudaHandlerModuleData *data = module->module_data; - while (data != NULL && (strcasecmp(data_name, data->name) != 0)) { - data = data->next; - } - if (data != NULL) { - SCLogWarning(SC_ERR_CUDA_HANDLER_ERROR, "Data \"%s\" already " - "registered for this module \"%s\".", data_name, - module_name); - SCMutexUnlock(&mutex); - goto end; - } - - CudaHandlerModuleData *new_data = SCMalloc(sizeof(CudaHandlerModuleData)); - if (unlikely(new_data == NULL)) - exit(EXIT_FAILURE); - memset(new_data, 0, sizeof(CudaHandlerModuleData)); - new_data->name = SCStrdup(data_name); - if (new_data->name == NULL) - exit(EXIT_FAILURE); - new_data->data = 
data_ptr; - - if (module->module_data == NULL) { - module->module_data = new_data; - } else { - new_data->next = module->module_data; - module->module_data = new_data; - } - - SCMutexUnlock(&mutex); - - end: - return; -} - -void *CudaHandlerModuleGetData(const char *module_name, const char *data_name) -{ - SCMutexLock(&mutex); - - CudaHandlerModule *module = cudahl_modules; - while (module != NULL && strcasecmp(module->name, module_name) != 0) - module = module->next; - if (module == NULL) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data " - "\"%s\" from module \"%s\" that hasn't been registered " - "yet.", module_name, data_name); - SCMutexUnlock(&mutex); - return NULL; - } - - CudaHandlerModuleData *data = module->module_data; - while (data != NULL && (strcasecmp(data_name, data->name) != 0)) { - data = data->next; - } - if (data == NULL) { - SCLogInfo("Data \"%s\" already registered for this module \"%s\". " - "Returning it.", data_name, module_name); - SCMutexUnlock(&mutex); - return NULL; - } - - SCMutexUnlock(&mutex); - return data->data; -} - -int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image) -{ -#define CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE 15 - - int i = 0; - - /* select the ptx image based on the compute capability supported by all - * devices (i.e. the lowest) */ - char *image = SCMalloc(strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE); - if (unlikely(image == NULL)) { - exit(EXIT_FAILURE); - } - memset(image, 0x00, strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE); - - int major = INT_MAX; - int minor = INT_MAX; - SCCudaDevices *devices = SCCudaGetDeviceList(); - for (i = 0; i < devices->count; i++){ - if (devices->devices[i]->major_rev < major){ - major = devices->devices[i]->major_rev; - minor = devices->devices[i]->minor_rev; - } - if (devices->devices[i]->major_rev == major && - devices->devices[i]->minor_rev < minor){ - minor = devices->devices[i]->minor_rev; - } - } - snprintf(image, - strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE, - "%s_sm_%u%u", - ptx_image, major, minor); - - /* we don't have a cuda module associated with this module. Create a - * cuda module, update the module with this cuda module reference and - * then return the module refernce back to the calling function using - * the argument */ - SCLogDebug("Loading kernel module: %s\n",image); - if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1) - goto error; - SCFree(image); - - return 0; - error: - SCFree(image); - return -1; - -#undef CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE -} - - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda-handlers.h b/src/util-cuda-handlers.h deleted file mode 100644 index eee227df60..0000000000 --- a/src/util-cuda-handlers.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (C) 2007-2012 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#ifndef __UTIL_CUDA_HANDLERS__H__ -#define __UTIL_CUDA_HANDLERS__H__ - -#include "conf.h" -#include "util-cuda.h" - -/************************conf file profile section**********************/ - -void CudaHandlerAddCudaProfileFromConf(const char *name, - void *(*Callback)(ConfNode *node), - void (*Free)(void *)); -void *CudaHandlerGetCudaProfile(const char *name); -void CudaHandlerFreeProfiles(void); - -/*******************cuda context related data section*******************/ - -#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_HOST 0 -#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_DEVICE 1 -#define CUDA_HANDLER_MODULE_DATA_TYPE_CUDA_BUFFER 2 - -CUcontext CudaHandlerModuleGetContext(const char *module_name, int device_id); -void CudaHandlerModuleStoreData(const char *module_name, - const char *data_name, void *data_ptr); -void *CudaHandlerModuleGetData(const char *module_name, const char *data_name); -int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image); - -#endif /* __UTIL_CUDA_HANDLERS__H__ */ diff --git a/src/util-cuda-vars.c b/src/util-cuda-vars.c deleted file mode 100644 index 596be85826..0000000000 --- a/src/util-cuda-vars.c +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#include "suricata-common.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "suricata.h" -#include "util-mpm.h" -#include "util-cuda-handlers.h" -#include "util-cuda-vars.h" -#include "detect-engine-mpm.h" -#include "util-debug.h" -#include "util-mpm-ac.h" - -static DetectEngineCtx *cuda_de_ctx = NULL; - -void CudaVarsSetDeCtx(DetectEngineCtx *de_ctx) -{ - if (cuda_de_ctx != NULL) { - SCLogError(SC_ERR_FATAL, "CudaVarsSetDeCtx() called more than once. 
" - "This function should be called only once during the " - "lifetime of the engine."); - exit(EXIT_FAILURE); - } - - cuda_de_ctx = de_ctx; - - return; -} - -int CudaThreadVarsInit(CudaThreadVars *ctv) -{ - if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) - return 0; - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - ctv->mpm_is_cuda = 1; - ctv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - ctv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit; - ctv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit; - ctv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0); - ctv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1); - ctv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0); - ctv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1); - ctv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0); - - return 0; -} - -#endif diff --git a/src/util-cuda-vars.h b/src/util-cuda-vars.h deleted file mode 100644 index 9c24a915ba..0000000000 --- a/src/util-cuda-vars.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#ifdef __SC_CUDA_SUPPORT__ - -#ifndef __UTIL_CUDA_VARS__H__ -#define __UTIL_CUDA_VARS__H__ - -#include "util-cuda-buffer.h" -#include "util-mpm.h" -#include "threads.h" - -typedef struct CudaThreadVars_ { - /* cb - CudaBuffer */ - CudaBufferData *cuda_ac_cb; - - MpmCtx *mpm_proto_other_ctx; - - MpmCtx *mpm_proto_tcp_ctx_ts; - MpmCtx *mpm_proto_udp_ctx_ts; - - MpmCtx *mpm_proto_tcp_ctx_tc; - MpmCtx *mpm_proto_udp_ctx_tc; - - uint16_t data_buffer_size_max_limit; - uint16_t data_buffer_size_min_limit; - - uint8_t mpm_is_cuda; -} CudaThreadVars; - -typedef struct CudaPacketVars_ { - uint8_t cuda_mpm_enabled; - uint8_t cuda_done; - uint16_t cuda_gpu_matches; - SCMutex cuda_mutex; - SCCondT cuda_cond; - uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1]; -} CudaPacketVars; - -void CudaVarsSetDeCtx(struct DetectEngineCtx_ *de_ctx); -int CudaThreadVarsInit(CudaThreadVars *ctv); - -#endif /* __UTIL_CUDA_VARS__H__ */ - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda.c b/src/util-cuda.c deleted file mode 100644 index 288631176b..0000000000 --- a/src/util-cuda.c +++ /dev/null @@ -1,5455 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
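The CudaPacketVars fields just above (cuda_mutex, cuda_cond, cuda_done, cuda_gpu_matches, cuda_results) describe a standard condition-variable handoff: the packet thread parks until the GPU batching side publishes its match results. A generic sketch of that handoff, using plain pthreads instead of the SCMutex/SCCondT wrappers, purely to document the pattern being deleted (struct and function names are illustrative):

    #include <pthread.h>
    #include <stdint.h>

    /* Generic result handoff, analogous to CudaPacketVars. */
    struct gpu_result {
        pthread_mutex_t mutex;
        pthread_cond_t  cond;
        uint8_t         done;       /* set by the producer when results are in */
        uint16_t        n_matches;
    };

    /* consumer: what the packet path effectively did before reading results */
    static uint16_t wait_for_gpu(struct gpu_result *r)
    {
        pthread_mutex_lock(&r->mutex);
        while (!r->done)
            pthread_cond_wait(&r->cond, &r->mutex);
        uint16_t n = r->n_matches;
        pthread_mutex_unlock(&r->mutex);
        return n;
    }

    /* producer: the GPU batching side publishing its results */
    static void publish_gpu(struct gpu_result *r, uint16_t n_matches)
    {
        pthread_mutex_lock(&r->mutex);
        r->n_matches = n_matches;
        r->done = 1;
        pthread_cond_signal(&r->cond);
        pthread_mutex_unlock(&r->mutex);
    }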
- */ - -/** - * \file - * - * \author Anoop Saldanha - * - * NVIDIA CUDA utility functions - last referenced Cuda Toolkit 4.2 - */ - -/* compile in, only if we have a CUDA enabled device on the machine, with the - * toolkit and the driver installed */ - -#include "suricata-common.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include -#include "util-cuda.h" -#include "util-error.h" -#include "util-debug.h" -#include "util-unittest.h" - -#define CASE_CODE(E) case E: return #E - -typedef enum SCCudaAPIS_ { - /* init api */ - SC_CUDA_CU_INIT, - - /* version management api */ - SC_CUDA_CU_DRIVER_GET_VERSION, - - /* device management api */ - SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY, - SC_CUDA_CU_DEVICE_GET, - SC_CUDA_CU_DEVICE_GET_ATTRIBUTE, - SC_CUDA_CU_DEVICE_GET_COUNT, - SC_CUDA_CU_DEVICE_GET_NAME, - SC_CUDA_CU_DEVICE_GET_PROPERTIES, - SC_CUDA_CU_DEVICE_TOTAL_MEM, - - /* context management api */ - SC_CUDA_CU_CTX_CREATE, - SC_CUDA_CU_CTX_DESTROY, - SC_CUDA_CU_CTX_GET_API_VERSION, - SC_CUDA_CU_CTX_GET_CACHE_CONFIG, - SC_CUDA_CU_CTX_GET_CURRENT, - SC_CUDA_CU_CTX_GET_DEVICE, - SC_CUDA_CU_CTX_GET_LIMIT, - SC_CUDA_CU_CTX_POP_CURRENT, - SC_CUDA_CU_CTX_PUSH_CURRENT, - SC_CUDA_CU_CTX_SET_CACHE_CONFIG, - SC_CUDA_CU_CTX_SET_CURRENT, - SC_CUDA_CU_CTX_SET_LIMIT, - SC_CUDA_CU_CTX_SYNCHRONIZE, - SC_CUDA_CU_CTX_ATTACH, - SC_CUDA_CU_CTX_DETACH, - - /* module management api */ - SC_CUDA_CU_MODULE_GET_FUNCTION, - SC_CUDA_CU_MODULE_GET_GLOBAL, - SC_CUDA_CU_MODULE_GET_SURF_REF, - SC_CUDA_CU_MODULE_GET_TEX_REF, - SC_CUDA_CU_MODULE_LOAD, - SC_CUDA_CU_MODULE_LOAD_DATA, - SC_CUDA_CU_MODULE_LOAD_DATA_EX, - SC_CUDA_CU_MODULE_LOAD_FAT_BINARY, - SC_CUDA_CU_MODULE_UNLOAD, - - /* memory management api */ - SC_CUDA_CU_ARRAY_3D_CREATE, - SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR, - SC_CUDA_CU_ARRAY_CREATE, - SC_CUDA_CU_ARRAY_DESTROY, - SC_CUDA_CU_ARRAY_GET_DESCRIPTOR, - SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID, - SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID, - SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE, - SC_CUDA_CU_IPC_GET_EVENT_HANDLE, - SC_CUDA_CU_IPC_GET_MEM_HANDLE, - SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE, - SC_CUDA_CU_IPC_OPEN_MEM_HANDLE, - SC_CUDA_CU_MEM_ALLOC, - SC_CUDA_CU_MEM_ALLOC_HOST, - SC_CUDA_CU_MEM_ALLOC_PITCH, - SC_CUDA_CU_MEMCPY, - SC_CUDA_CU_MEMCPY_2D, - SC_CUDA_CU_MEMCPY_2D_ASYNC, - SC_CUDA_CU_MEMCPY_2D_UNALIGNED, - SC_CUDA_CU_MEMCPY_3D, - SC_CUDA_CU_MEMCPY_3D_ASYNC, - SC_CUDA_CU_MEMCPY_3D_PEER, - SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC, - SC_CUDA_CU_MEMCPY_ASYNC, - SC_CUDA_CU_MEMCPY_A_TO_A, - SC_CUDA_CU_MEMCPY_A_TO_D, - SC_CUDA_CU_MEMCPY_A_TO_H, - SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC, - SC_CUDA_CU_MEMCPY_D_TO_A, - SC_CUDA_CU_MEMCPY_D_TO_D, - SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC, - SC_CUDA_CU_MEMCPY_D_TO_H, - SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC, - SC_CUDA_CU_MEMCPY_H_TO_A, - SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC, - SC_CUDA_CU_MEMCPY_H_TO_D, - SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC, - SC_CUDA_CU_MEMCPY_PEER, - SC_CUDA_CU_MEMCPY_PEER_ASYNC, - SC_CUDA_CU_MEM_FREE, - SC_CUDA_CU_MEM_FREE_HOST, - SC_CUDA_CU_MEM_GET_ADDRESS_RANGE, - SC_CUDA_CU_MEM_GET_INFO, - SC_CUDA_CU_MEM_HOST_ALLOC, - SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER, - SC_CUDA_CU_MEM_HOST_GET_FLAGS, - SC_CUDA_CU_MEM_HOST_REGISTER, - SC_CUDA_CU_MEM_HOST_UNREGISTER, - SC_CUDA_CU_MEMSET_D16, - SC_CUDA_CU_MEMSET_D16_ASYNC, - SC_CUDA_CU_MEMSET_D2_D16, - SC_CUDA_CU_MEMSET_D2_D16_ASYNC, - SC_CUDA_CU_MEMSET_D2_D32, - SC_CUDA_CU_MEMSET_D2_D32_ASYNC, - SC_CUDA_CU_MEMSET_D2_D8, - SC_CUDA_CU_MEMSET_D2_D8_ASYNC, - SC_CUDA_CU_MEMSET_D32, - SC_CUDA_CU_MEMSET_D32_ASYNC, - SC_CUDA_CU_MEMSET_D8, - SC_CUDA_CU_MEMSET_D8_ASYNC, - - /* unified addresssing */ - 
SC_CUDA_CU_POINTER_GET_ATTRIBUTE, - - /* stream management api */ - SC_CUDA_CU_STREAM_CREATE, - SC_CUDA_CU_STREAM_DESTROY, - SC_CUDA_CU_STREAM_QUERY, - SC_CUDA_CU_STREAM_SYNCHRONIZE, - SC_CUDA_CU_STREAM_WAIT_EVENT, - - /* event management api */ - SC_CUDA_CU_EVENT_CREATE, - SC_CUDA_CU_EVENT_DESTROY, - SC_CUDA_CU_EVENT_ELAPSED_TIME, - SC_CUDA_CU_EVENT_QUERY, - SC_CUDA_CU_EVENT_RECORD, - SC_CUDA_CU_EVENT_SYNCHRONIZE, - - /* execution control api */ - SC_CUDA_CU_FUNC_GET_ATTRIBUTE, - SC_CUDA_CU_FUNC_SET_CACHE_CONFIG, - SC_CUDA_CU_LAUNCH_KERNEL, - SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE, - SC_CUDA_CU_FUNC_SET_SHARED_SIZE, - SC_CUDA_CU_LAUNCH, - SC_CUDA_CU_LAUNCH_GRID, - SC_CUDA_CU_LAUNCH_GRID_ASYNC, - SC_CUDA_CU_PARAM_SETF, - SC_CUDA_CU_PARAM_SETI, - SC_CUDA_CU_PARAM_SET_SIZE, - SC_CUDA_CU_PARAM_SET_TEX_REF, - SC_CUDA_CU_PARAM_SETV, - - /* texture reference api */ - SC_CUDA_CU_TEX_REF_CREATE, - SC_CUDA_CU_TEX_REF_DESTROY, - SC_CUDA_CU_TEX_REF_GET_ADDRESS, - SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE, - SC_CUDA_CU_TEX_REF_GET_ARRAY, - SC_CUDA_CU_TEX_REF_GET_FILTER_MODE, - SC_CUDA_CU_TEX_REF_GET_FLAGS, - SC_CUDA_CU_TEX_REF_GET_FORMAT, - SC_CUDA_CU_TEX_REF_SET_ADDRESS, - SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D, - SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE, - SC_CUDA_CU_TEX_REF_SET_ARRAY, - SC_CUDA_CU_TEX_REF_SET_FILTER_MODE, - SC_CUDA_CU_TEX_REF_SET_FLAGS, - SC_CUDA_CU_TEX_REF_SET_FORMAT, -} SCCudaAPIS; - -SCEnumCharMap sc_cuda_api_names_string_map[] = { - /* init api */ - { "cuInit", SC_CUDA_CU_INIT }, - - /* version management api */ - { "cuDriverGetVersion", SC_CUDA_CU_DRIVER_GET_VERSION }, - - /* device management api */ - { "cuDeviceComputeCapability", SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY }, - { "cuDeviceGet", SC_CUDA_CU_DEVICE_GET }, - { "cuDeviceGetAttribute", SC_CUDA_CU_DEVICE_GET_ATTRIBUTE }, - { "cuDeviceGetCount", SC_CUDA_CU_DEVICE_GET_COUNT }, - { "cuDeviceGetName", SC_CUDA_CU_DEVICE_GET_NAME }, - { "cuDeviceGetProperties", SC_CUDA_CU_DEVICE_GET_PROPERTIES }, - { "cuDeviceTotalMem", SC_CUDA_CU_DEVICE_TOTAL_MEM }, - - /* context management api */ - { "cuCtxCreate", SC_CUDA_CU_CTX_CREATE }, - { "cuCtxDestroy", SC_CUDA_CU_CTX_DESTROY }, - { "cuCtxGetApiVersion", SC_CUDA_CU_CTX_GET_API_VERSION }, - { "cuCtxGetCacheConfig", SC_CUDA_CU_CTX_GET_CACHE_CONFIG }, - { "cuCtxGetCurrent", SC_CUDA_CU_CTX_GET_CURRENT }, - { "cuCtxGetDevice", SC_CUDA_CU_CTX_GET_DEVICE }, - { "cuCtxGetLimit", SC_CUDA_CU_CTX_GET_LIMIT }, - { "cuCtxPopCurrent", SC_CUDA_CU_CTX_POP_CURRENT }, - { "cuCtxPushCurrent", SC_CUDA_CU_CTX_PUSH_CURRENT }, - { "cuCtxSetCacheConfig", SC_CUDA_CU_CTX_SET_CACHE_CONFIG }, - { "cuCtxSetCurrent", SC_CUDA_CU_CTX_SET_CURRENT }, - { "cuCtxSetLimit", SC_CUDA_CU_CTX_SET_LIMIT }, - { "cuCtxSynchronize", SC_CUDA_CU_CTX_SYNCHRONIZE }, - { "cuCtxAttach", SC_CUDA_CU_CTX_ATTACH }, - { "cuCtxDetach", SC_CUDA_CU_CTX_DETACH }, - - /* module management api */ - { "cuModuleGetFunction", SC_CUDA_CU_MODULE_GET_FUNCTION }, - { "cuModuleGetGlobal", SC_CUDA_CU_MODULE_GET_GLOBAL }, - { "cuModuleGetSurfRef", SC_CUDA_CU_MODULE_GET_SURF_REF }, - { "cuModuleGetTexRef", SC_CUDA_CU_MODULE_GET_TEX_REF }, - { "cuModuleLoad", SC_CUDA_CU_MODULE_LOAD }, - { "cuModuleLoadData", SC_CUDA_CU_MODULE_LOAD_DATA }, - { "cuModuleLoadDataEx", SC_CUDA_CU_MODULE_LOAD_DATA_EX }, - { "cuModuleLoadFatBinary", SC_CUDA_CU_MODULE_LOAD_FAT_BINARY }, - { "cuModuleUnload", SC_CUDA_CU_MODULE_UNLOAD }, - - /* memory management api */ - { "cuArray3DCreate", SC_CUDA_CU_ARRAY_3D_CREATE }, - { "cuArray3DGetDescriptor", SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR }, - { 
"cuArrayCreate", SC_CUDA_CU_ARRAY_CREATE }, - { "cuArrayDestroy", SC_CUDA_CU_ARRAY_DESTROY }, - { "cuArrayGetDescriptor", SC_CUDA_CU_ARRAY_GET_DESCRIPTOR }, - { "cuDeviceGetByPCIBusId", SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID }, - { "cuDeviceGetPCIBusId", SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID }, - { "cuIpcCloseMemHandle", SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE }, - { "cuIpcGetEventHandle", SC_CUDA_CU_IPC_GET_MEM_HANDLE }, - { "cuIpcGetMemHandle", SC_CUDA_CU_IPC_GET_MEM_HANDLE }, - { "cuIpcOpenEventHandle", SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE }, - { "cuIpcOpenMemHandle", SC_CUDA_CU_IPC_OPEN_MEM_HANDLE }, - { "cuMemAlloc", SC_CUDA_CU_MEM_ALLOC }, - { "cuMemAllocHost", SC_CUDA_CU_MEM_ALLOC_HOST }, - { "cuMemAllocPitch", SC_CUDA_CU_MEM_ALLOC_PITCH }, - { "cuMemcpy", SC_CUDA_CU_MEMCPY }, - { "cuMemcpy2D", SC_CUDA_CU_MEMCPY_2D }, - { "cuMemcpy2DAsync", SC_CUDA_CU_MEMCPY_2D_ASYNC }, - { "cuMemcpy2DUnaligned", SC_CUDA_CU_MEMCPY_2D_UNALIGNED }, - { "cuMemcpy3D", SC_CUDA_CU_MEMCPY_3D }, - { "cuMemcpy3DAsync", SC_CUDA_CU_MEMCPY_3D_ASYNC }, - { "cuMemcpy3DPeer", SC_CUDA_CU_MEMCPY_3D_PEER }, - { "cuMemcpy3DPeerAsync", SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC }, - { "cuMemcpyAsync", SC_CUDA_CU_MEMCPY_ASYNC }, - { "cuMemcpyAtoA", SC_CUDA_CU_MEMCPY_A_TO_A }, - { "cuMemcpyAtoD", SC_CUDA_CU_MEMCPY_A_TO_D }, - { "cuMemcpyAtoH", SC_CUDA_CU_MEMCPY_A_TO_H }, - { "cuMemcpyAtoHAsync", SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC }, - { "cuMemcpyDtoA", SC_CUDA_CU_MEMCPY_D_TO_A }, - { "cuMemcpyDtoD", SC_CUDA_CU_MEMCPY_D_TO_D }, - { "cuMemcpyDtoDAsync", SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC }, - { "cuMemcpyDtoH", SC_CUDA_CU_MEMCPY_D_TO_H }, - { "cuMemcpyDtoHAsync", SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC }, - { "cuMemcpyHtoA", SC_CUDA_CU_MEMCPY_H_TO_A }, - { "cuMemcpyHtoAAsync", SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC }, - { "cuMemcpyHtoD", SC_CUDA_CU_MEMCPY_H_TO_D }, - { "cuMemcpyHtoDAsync", SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC }, - { "cuMemcpyPeer", SC_CUDA_CU_MEMCPY_PEER }, - { "cuMemcpyPeerAsync", SC_CUDA_CU_MEMCPY_PEER_ASYNC }, - { "cuMemFree", SC_CUDA_CU_MEM_FREE }, - { "cuMemFreeHost", SC_CUDA_CU_MEM_FREE_HOST }, - { "cuMemGetAddressRange", SC_CUDA_CU_MEM_GET_ADDRESS_RANGE }, - { "cuMemGetInfo", SC_CUDA_CU_MEM_GET_INFO }, - { "cuMemHostAlloc", SC_CUDA_CU_MEM_HOST_ALLOC }, - { "cuMemHostGetDevicePointer", SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER }, - { "cuMemHostGetFlags", SC_CUDA_CU_MEM_HOST_GET_FLAGS }, - { "cuMemHostRegister", SC_CUDA_CU_MEM_HOST_REGISTER }, - { "cuMemHostUnregister", SC_CUDA_CU_MEM_HOST_UNREGISTER }, - { "cuMemsetD16", SC_CUDA_CU_MEMSET_D16 }, - { "cuMemsetD16Async", SC_CUDA_CU_MEMSET_D16_ASYNC }, - { "cuMemsetD2D16", SC_CUDA_CU_MEMSET_D2_D16 }, - { "cuMemsetD2D16Async", SC_CUDA_CU_MEMSET_D2_D16_ASYNC }, - { "cuMemsetD2D32", SC_CUDA_CU_MEMSET_D2_D32 }, - { "cuMemsetD2D32Async", SC_CUDA_CU_MEMSET_D2_D32_ASYNC }, - { "cuMemsetD2D8", SC_CUDA_CU_MEMSET_D2_D8 }, - { "cuMemsetD2D8Async", SC_CUDA_CU_MEMSET_D2_D8_ASYNC }, - { "cuMemsetD32", SC_CUDA_CU_MEMSET_D32 }, - { "cuMemsetD32Async", SC_CUDA_CU_MEMSET_D32_ASYNC }, - { "cuMemsetD8", SC_CUDA_CU_MEMSET_D8 }, - { "cuMemsetD8Async", SC_CUDA_CU_MEMSET_D8_ASYNC }, - - /* unified addressing */ - { "cuPointerGetAttribute", SC_CUDA_CU_POINTER_GET_ATTRIBUTE }, - - /* stream management api */ - { "cuStreamCreate", SC_CUDA_CU_STREAM_CREATE }, - { "cuStreamDestroy", SC_CUDA_CU_STREAM_DESTROY }, - { "cuStreamQuery", SC_CUDA_CU_STREAM_QUERY }, - { "cuStreamSynchronize", SC_CUDA_CU_STREAM_SYNCHRONIZE }, - { "cuStreamWaitEvent", SC_CUDA_CU_STREAM_WAIT_EVENT }, - - /* event management api */ - { "cuEventCreate", 
SC_CUDA_CU_EVENT_CREATE }, - { "cuEventDestroy", SC_CUDA_CU_EVENT_DESTROY }, - { "cuEventElapseTime", SC_CUDA_CU_EVENT_ELAPSED_TIME }, - { "cuEventQuery", SC_CUDA_CU_EVENT_QUERY }, - { "cuEventRecord", SC_CUDA_CU_EVENT_RECORD }, - { "cuEventSynchronize", SC_CUDA_CU_EVENT_SYNCHRONIZE }, - - /* execution control api */ - { "cuFuncGetAttribute", SC_CUDA_CU_FUNC_GET_ATTRIBUTE }, - { "cuFuncSetCacheConfig", SC_CUDA_CU_FUNC_SET_CACHE_CONFIG }, - { "cuLaunchKernel", SC_CUDA_CU_LAUNCH_KERNEL }, - { "cuFuncSetBlockShape", SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE }, - { "cuFuncSetSharedSize", SC_CUDA_CU_FUNC_SET_SHARED_SIZE }, - { "cuLaunch", SC_CUDA_CU_LAUNCH }, - { "cuLaunchGrid", SC_CUDA_CU_LAUNCH_GRID }, - { "cuLaunchGridAsync", SC_CUDA_CU_LAUNCH_GRID_ASYNC }, - { "cuParamSetf", SC_CUDA_CU_PARAM_SETF }, - { "cuParamSeti", SC_CUDA_CU_PARAM_SETI }, - { "cuParamSetSize", SC_CUDA_CU_PARAM_SET_SIZE }, - { "cuSetTexRef", SC_CUDA_CU_PARAM_SET_TEX_REF }, - { "cuSetv", SC_CUDA_CU_PARAM_SETV }, - - /* texture reference api */ - { "cuTexRefCreate", SC_CUDA_CU_TEX_REF_CREATE}, - { "cuTexRefDestroy", SC_CUDA_CU_TEX_REF_DESTROY}, - { "cuTexRefGetAddress", SC_CUDA_CU_TEX_REF_GET_ADDRESS}, - { "cuTexRefGetAddressMode", SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE}, - { "cuTexRefGetArray", SC_CUDA_CU_TEX_REF_GET_ARRAY}, - { "cuTexRefGetFilterMode", SC_CUDA_CU_TEX_REF_GET_FILTER_MODE}, - { "cuTexRefGetFlags", SC_CUDA_CU_TEX_REF_GET_FLAGS}, - { "cuTexRefGetFormat", SC_CUDA_CU_TEX_REF_GET_FORMAT}, - { "cuTexRefSetAddress", SC_CUDA_CU_TEX_REF_SET_ADDRESS}, - { "cuTexRefSetAddress2D", SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D}, - { "cuTexRefSetAddressMode", SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE}, - { "cuTexRefSetArray", SC_CUDA_CU_TEX_REF_SET_ARRAY}, - { "cuTexRefSetFilterMode", SC_CUDA_CU_TEX_REF_SET_FILTER_MODE}, - { "cuTexRefSetFlags", SC_CUDA_CU_TEX_REF_SET_FLAGS}, - { "cuTexRefSetFormat", SC_CUDA_CU_TEX_REF_SET_FORMAT}, - - { NULL, -1 }, -}; - -static SCCudaDevices *devices = NULL; - -/*****************************Error_Handling_API*******************************/ - -/** - * \internal - * \brief Maps the error enums from SCCudaAPIS to strings using the preprocessor - * #ENUM_VALUE. This is mainly needed for logging purposes to log the - * error codes. - * - * \param err The error_code for which the string has to be returned. - * - * \retval The string equivalent of the error code. 
- */ -static const char *SCCudaGetErrorCodeInString(int err) -{ - switch (err) { - CASE_CODE(CUDA_SUCCESS); - CASE_CODE(CUDA_ERROR_INVALID_VALUE); - CASE_CODE(CUDA_ERROR_OUT_OF_MEMORY); - CASE_CODE(CUDA_ERROR_NOT_INITIALIZED); - CASE_CODE(CUDA_ERROR_DEINITIALIZED); - CASE_CODE(CUDA_ERROR_PROFILER_DISABLED); - CASE_CODE(CUDA_ERROR_PROFILER_NOT_INITIALIZED); - CASE_CODE(CUDA_ERROR_PROFILER_ALREADY_STARTED); - CASE_CODE(CUDA_ERROR_PROFILER_ALREADY_STOPPED); - CASE_CODE(CUDA_ERROR_NO_DEVICE); - CASE_CODE(CUDA_ERROR_INVALID_DEVICE); - CASE_CODE(CUDA_ERROR_INVALID_IMAGE); - CASE_CODE(CUDA_ERROR_INVALID_CONTEXT); - /* deprecated error code as of 3.2 */ - CASE_CODE(CUDA_ERROR_CONTEXT_ALREADY_CURRENT); - CASE_CODE(CUDA_ERROR_MAP_FAILED); - CASE_CODE(CUDA_ERROR_UNMAP_FAILED); - CASE_CODE(CUDA_ERROR_ARRAY_IS_MAPPED); - CASE_CODE(CUDA_ERROR_ALREADY_MAPPED); - CASE_CODE(CUDA_ERROR_NO_BINARY_FOR_GPU); - CASE_CODE(CUDA_ERROR_ALREADY_ACQUIRED); - CASE_CODE(CUDA_ERROR_NOT_MAPPED); - CASE_CODE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY); - CASE_CODE(CUDA_ERROR_NOT_MAPPED_AS_POINTER); - CASE_CODE(CUDA_ERROR_ECC_UNCORRECTABLE); - CASE_CODE(CUDA_ERROR_UNSUPPORTED_LIMIT); - CASE_CODE(CUDA_ERROR_CONTEXT_ALREADY_IN_USE); - CASE_CODE(CUDA_ERROR_INVALID_SOURCE); - CASE_CODE(CUDA_ERROR_FILE_NOT_FOUND); - CASE_CODE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND); - CASE_CODE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED); - CASE_CODE(CUDA_ERROR_OPERATING_SYSTEM); - CASE_CODE(CUDA_ERROR_INVALID_HANDLE); - CASE_CODE(CUDA_ERROR_NOT_FOUND); - CASE_CODE(CUDA_ERROR_NOT_READY); - CASE_CODE(CUDA_ERROR_LAUNCH_FAILED); - CASE_CODE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); - CASE_CODE(CUDA_ERROR_LAUNCH_TIMEOUT); - CASE_CODE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING); - CASE_CODE(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED); - CASE_CODE(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED); - CASE_CODE(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE); - CASE_CODE(CUDA_ERROR_CONTEXT_IS_DESTROYED); - CASE_CODE(CUDA_ERROR_ASSERT); - CASE_CODE(CUDA_ERROR_TOO_MANY_PEERS); - CASE_CODE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED); - CASE_CODE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED); - CASE_CODE(CUDA_ERROR_UNKNOWN); - default: - return "CUDA_UNKNOWN_ERROR_CODE"; - } -} - -/** - * \internal - * \brief A generic function that handles the return values from the CUDA driver - * API. - * - * \param result The result from the CUDA driver API call. - * \param api_type An enum value SCCudaAPIS corresponing to the API for which the - * result was returned. The enum is needed to map the api type to - * a string for logging purposes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -static int SCCudaHandleRetValue(CUresult result, SCCudaAPIS api_type) -{ - if (result == CUDA_SUCCESS) { - SCLogDebug("%s executed successfully", - SCMapEnumValueToName(api_type, sc_cuda_api_names_string_map)); - return 0; - } else { - SCLogError(SC_ERR_CUDA_ERROR, "%s failed. Returned errocode - %s", - SCMapEnumValueToName(api_type, sc_cuda_api_names_string_map), - SCCudaGetErrorCodeInString(result)); - return -1; - } -} - -/*****************************Cuda_Initialization_API**************************/ - -/** - * \internal - * \brief Inits the cuda driver API. - * - * \param flags Currently should be 0. - * - * \retval 0 On success. - * \retval -1 On failure. 
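Two small notes on the error handling block above. First, SCCudaGetErrorCodeInString() relies on the CASE_CODE stringification macro defined earlier in this file; the idiom is worth spelling out once on a toy enum, since the real switch is buried in a wall of CUDA error codes:

    #include <stdio.h>

    /* Same stringification idiom as SCCudaGetErrorCodeInString(), on a toy
     * enum so it compiles without the CUDA headers. */
    enum toy_status { TOY_OK, TOY_EAGAIN, TOY_EFATAL };

    #define CASE_CODE(E) case E: return #E

    static const char *toy_status_name(enum toy_status s)
    {
        switch (s) {
            CASE_CODE(TOY_OK);
            CASE_CODE(TOY_EAGAIN);
            CASE_CODE(TOY_EFATAL);
            default: return "TOY_UNKNOWN";
        }
    }

    int main(void)
    {
        printf("%s\n", toy_status_name(TOY_EAGAIN));    /* prints TOY_EAGAIN */
        return 0;
    }

Second, later driver releases (CUDA 6.0 and newer, if memory serves) added cuGetErrorName()/cuGetErrorString(), which is one reason a hand-maintained table like this one aged poorly against the Toolkit 4.2 baseline noted at the top of the file.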
- */ -int SCCudaInit(unsigned int flags) -{ - CUresult result = cuInit(flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_INIT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Version_Management_API***************************/ - -/** - * \brief Returns in *driver_version the version number of the installed CUDA - * driver. This function automatically returns CUDA_ERROR_INVALID_VALUE - * if the driver_version argument is NULL. - * - * \param driver_version Returns the CUDA driver version. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDriverGetVersion(int *driver_version) -{ - CUresult result = 0; - - if (driver_version == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "driver_version NULL"); - goto error; - } - - result = cuDriverGetVersion(driver_version); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DRIVER_GET_VERSION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Device_Management_API****************************/ - -/** - * \internal - * \brief Returns the major and the minor revision numbers that define the - * compute capability for the device that is sent as the argument. - * - * \param major Pointer to an integer, that will be updated with the major revision. - * \param minor Pointer to an integer, that will be updated with the minor revision. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev) -{ - CUresult result = 0; - - if (major == NULL || minor == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "major is NULL or minor is NULL"); - goto error; - } - - result = cuDeviceComputeCapability(major, minor, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns a device handle given an ordinal in the range - * [0, cuDeviceGetCount() - 1]. - * - * \param device Pointer to a CUDevice instance that will be updated with the - * device handle. - * \param ordinal An index in the range [0, cuDeviceGetCount() - 1]. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGet(CUdevice *device, int ordinal) -{ - CUresult result = 0; - - if (device == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "device NULL"); - goto error; - } - - result = cuDeviceGet(device, ordinal); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the various attributes for the device that is sent as the arg. 
- * - * The supported attributes are: - * - * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads - * per block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared mem-ory available to a thread block in bytes; this amount - * is shared by all thread blocks simultaneously resident on a - * multiprocessor; - * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device - * for __constant_-_ variables in a CUDA C kernel in bytes; - * CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; - * CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the - * memory copy functions that involve memory regions allocated - * through cuMemAllocPitch(); - * CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers avail-able to a thread block; this number is shared by - * all thread blocks simultaneously resident on a multiprocessor; - * CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Peak clock frequency in kilohertz; - * CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture - * base addresses aligned to textureAlign bytes do not need an offset - * applied to texture fetches; - * CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not; - * CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device; - * CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not; - * CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not; - * CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not; - * CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently - * in. Available modes are as follows: - * - CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted - * and can have multiple CUDA contexts present at a single time. - * - CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have - * only one CUDA con-text present on it at a time. - * - CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is - * prohibited from creating new CUDA contexts. - * - * \param pi Pointer to an interger instance that will be updated with the - * attribute value. - * \param attrib Device attribute to query. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, - CUdevice dev) -{ - CUresult result = 0; - - if (pi == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "prop is NULL"); - goto error; - } - - result = cuDeviceGetAttribute(pi, attrib, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_ATTRIBUTE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Gets the total no of devices with compute capability greater than or - * equal to 1.0 that are available for execution. 
- * - * \param count Pointer to an integer that will be updated with the device count. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetCount(int *count) -{ - CUresult result = 0; - - if (count == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "count NULL"); - goto error; - } - - result = cuDeviceGetCount(count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_COUNT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the device name, given the device handle. - * - * \param name Pointer to a char buffer which will be updated with the device name. - * \param len Length of the above buffer. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetName(char *name, int len, CUdevice dev) -{ - CUresult result = 0; - - if (name == NULL || len == 0) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "name is NULL or len is 0"); - goto error; - } - - result = cuDeviceGetName(name, len, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_NAME) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the properties of the device. The CUdevprop structure is - * defined as - * - * typedef struct CUdevprop_st { - * int maxThreadsPerBlock; - * int maxThreadsDim[3]; - * int maxGridSize[3]; - * int sharedMemPerBlock; - * int totalConstantMemory; - * int SIMDWidth; - * int memPitch; - * int regsPerBlock; - * int clockRate; - * int textureAlign - * } CUdevprop; - * - * \param prop Pointer to a CUdevprop instance that holds the device properties. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev) -{ - CUresult result = 0; - - if (prop == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "prop is NULL"); - goto error; - } - - result = cuDeviceGetProperties(prop, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PROPERTIES) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the total amount of memory availabe on the device which - * is sent as the argument. - * - * \param bytes Pointer to an unsigned int instance, that will be updated with - * total memory for the device. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceTotalMem(size_t *bytes, CUdevice dev) -{ - CUresult result = 0; - - if (bytes == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "bytes is NULL"); - goto error; - } - - result = cuDeviceTotalMem(bytes, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_TOTAL_MEM) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Creates and returns a new instance of SCCudaDevice. - * - * \retval device Pointer to the new instance of SCCudaDevice. - */ -static SCCudaDevice *SCCudaAllocSCCudaDevice(void) -{ - SCCudaDevice *device = SCMalloc(sizeof(SCCudaDevice)); - if (unlikely(device == NULL)) - return NULL; - memset(device, 0 , sizeof(SCCudaDevice)); - - return device; -} - -/** - * \internal - * \brief Frees an instance of SCCudaDevice. - * - * \param device Pointer to the an instance of SCCudaDevice to be freed. 
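Taken together, the wrappers defined so far (SCCudaInit(), SCCudaDeviceGetCount(), SCCudaDeviceGet(), SCCudaDeviceGetName(), SCCudaDeviceTotalMem()) were enough for a minimal device inventory. A sketch of such a probe, assuming the now-removed util-cuda.h is still on the include path; it mainly shows how the 0/-1 return convention composes:

    #include <stdio.h>
    #include "util-cuda.h"    /* removed by this commit; sketch assumes the old tree */

    static int ProbeCudaDevices(void)
    {
        int count = 0;

        if (SCCudaInit(0) == -1 || SCCudaDeviceGetCount(&count) == -1)
            return -1;

        for (int i = 0; i < count; i++) {
            CUdevice dev;
            char name[SC_CUDA_DEVICE_NAME_MAX_LEN];
            size_t bytes = 0;

            if (SCCudaDeviceGet(&dev, i) == -1 ||
                SCCudaDeviceGetName(name, SC_CUDA_DEVICE_NAME_MAX_LEN, dev) == -1 ||
                SCCudaDeviceTotalMem(&bytes, dev) == -1)
                return -1;

            printf("device %d: %s (%zu bytes of global memory)\n", i, name, bytes);
        }
        return 0;
    }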
- */ -static void SCCudaDeAllocSCCudaDevice(SCCudaDevice *device) -{ - SCFree(device); - - return; -} - -/** - * \internal - * \brief Creates and returns a new instance of SCCudaDevices. - * - * \retval devices Pointer to the new instance of SCCudaDevices. - */ -static SCCudaDevices *SCCudaAllocSCCudaDevices(void) -{ - SCCudaDevices *devices = SCMalloc(sizeof(SCCudaDevices)); - if (unlikely(devices == NULL)) - return NULL; - memset(devices, 0 , sizeof(SCCudaDevices)); - - return devices; -} - -/** - * \internal - * \brief Frees an instance of SCCudaDevices. - * - * \param device Pointer to the an instance of SCCudaDevices to be freed. - */ -static void SCCudaDeAllocSCCudaDevices(SCCudaDevices *devices) -{ - int i = 0; - - if (devices == NULL) - return; - - if (devices->devices != NULL) { - for (i = 0; i < devices->count; i++) - SCCudaDeAllocSCCudaDevice(devices->devices[i]); - - SCFree(devices->devices); - } - - SCFree(devices); - - return; -} - -/** - * \brief Retrieves all the devices and all the information corresponding to - * the devices on the CUDA device available on this system and returns - * a SCCudaDevices instances which holds all this information. - * - * \retval devices Pointer to a SCCudaDevices instance that holds information - * for all the CUDA devices on the system. - */ -static SCCudaDevices *SCCudaGetDevices(void) -{ - SCCudaDevices *devices = SCCudaAllocSCCudaDevices(); - int i = 0; - - if (SCCudaDeviceGetCount(&devices->count) == -1) - goto error; - - devices->devices = SCMalloc(devices->count * sizeof(SCCudaDevice *)); - if (devices->devices == NULL) - goto error; - - /* update the device properties */ - for (i = 0; i < devices->count; i++) { - devices->devices[i] = SCCudaAllocSCCudaDevice(); - - if (SCCudaDeviceGet(&devices->devices[i]->device, i) == -1) - goto error; - - if (SCCudaDeviceComputeCapability(&devices->devices[i]->major_rev, - &devices->devices[i]->minor_rev, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetName(devices->devices[i]->name, - SC_CUDA_DEVICE_NAME_MAX_LEN, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceTotalMem(&devices->devices[i]->bytes, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetProperties(&devices->devices[i]->prop, - devices->devices[i]->device) == -1) { - goto error; - } - - /* retrieve the attributes */ - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_threads_per_block, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_x, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_y, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_z, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_x, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_y, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_z, - 
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_shared_memory_per_block, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_total_constant_memory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_warp_size, - CU_DEVICE_ATTRIBUTE_WARP_SIZE, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_pitch, - CU_DEVICE_ATTRIBUTE_MAX_PITCH, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_registers_per_block, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_clock_rate, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_texture_alignment, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_gpu_overlap, - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_multiprocessor_count, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_kernel_exec_timeout, - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_integrated, - CU_DEVICE_ATTRIBUTE_INTEGRATED, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_can_map_host_memory, - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_compute_mode, - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, - devices->devices[i]->device) == -1) { - goto error; - } - } - -#ifdef DEBUG - SCCudaPrintDeviceList(devices); -#endif - - return devices; - - error: - SCCudaDeAllocSCCudaDevices(devices); - return NULL; -} - -/** - * \brief Prints the information for all the devices for this CUDA platform, - * supplied inside the argument. - * - * \param devices Pointer to a SCCudaDevices instance that holds information on - * the devices. - */ -void SCCudaPrintDeviceList(SCCudaDevices *devices) -{ - int i = 0; - - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. 
" - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return; - } - - SCLogDebug("Printing device info for this CUDA context"); - SCLogDebug("No of devices: %d", devices->count); - - for (i = 0; i < devices->count; i++) { - SCLogDebug("Device ID: %d", devices->devices[i]->device); - SCLogDebug("Device Name: %s", devices->devices[i]->name); - SCLogDebug("Device Major Revision: %d", devices->devices[i]->major_rev); - SCLogDebug("Device Minor Revision: %d", devices->devices[i]->minor_rev); - - /* Cudevprop */ - SCLogDebug("Device Max Threads Per Block: %d", - devices->devices[i]->prop.maxThreadsPerBlock); - SCLogDebug("Device Max Threads Dim: [%d, %d, %d]", - devices->devices[i]->prop.maxThreadsDim[0], - devices->devices[i]->prop.maxThreadsDim[1], - devices->devices[i]->prop.maxThreadsDim[2]); - SCLogDebug("Device Max Grid Size: [%d, %d, %d]", - devices->devices[i]->prop.maxGridSize[0], - devices->devices[i]->prop.maxGridSize[1], - devices->devices[i]->prop.maxGridSize[2]); - SCLogDebug("Device Shared Memory Per Block: %d", - devices->devices[i]->prop.sharedMemPerBlock); - SCLogDebug("Device Total Constant Memory: %d", - devices->devices[i]->prop.totalConstantMemory); - SCLogDebug("Device SIMD Width(Warp Size): %d", - devices->devices[i]->prop.SIMDWidth); - SCLogDebug("Device Maximum Mem Pitch: %d", devices->devices[i]->prop.memPitch); - SCLogDebug("Device Total Registers Available Per Block: %d", - devices->devices[i]->prop.regsPerBlock); - SCLogDebug("Device Clock Frequency: %d", devices->devices[i]->prop.clockRate); - SCLogDebug("Device Texture Alignment Requirement: %d", - devices->devices[i]->prop.textureAlign); - - - /* device attributes */ - SCLogDebug("Device Max Threads Per Block: %d", - devices->devices[i]->attr_max_threads_per_block); - SCLogDebug("Device Max Block Dim X: %d", - devices->devices[i]->attr_max_block_dim_x); - SCLogDebug("Device Max Block Dim Y: %d", - devices->devices[i]->attr_max_block_dim_y); - SCLogDebug("Device Max Block Dim Z: %d", - devices->devices[i]->attr_max_block_dim_z); - SCLogDebug("Device Max Grid Dim X: %d", - devices->devices[i]->attr_max_grid_dim_x); - SCLogDebug("Device Max Grid Dim Y: %d", - devices->devices[i]->attr_max_grid_dim_y); - SCLogDebug("Device Max Grid Dim Z: %d", - devices->devices[i]->attr_max_grid_dim_z); - SCLogDebug("Device Max Shared Memory Per Block: %d", - devices->devices[i]->attr_max_shared_memory_per_block); - SCLogDebug("Device Total Constant Memory: %d", - devices->devices[i]->attr_total_constant_memory); - SCLogDebug("Device Warp Size: %d", devices->devices[i]->attr_warp_size); - SCLogDebug("Device Max Pitch: %d", devices->devices[i]->attr_max_pitch); - SCLogDebug("Device Max Registers Per Block: %d", - devices->devices[i]->attr_max_registers_per_block); - SCLogDebug("Device Clock Rate: %d", devices->devices[i]->attr_clock_rate); - SCLogDebug("Device Texture Alignement: %d", - devices->devices[i]->attr_texture_alignment); - SCLogDebug("Device GPU Overlap: %s", - (devices->devices[i]->attr_gpu_overlap == 1) ? "Yes": "No"); - SCLogDebug("Device Multiprocessor Count: %d", - devices->devices[i]->attr_multiprocessor_count); - SCLogDebug("Device Kernel Exec Timeout: %s", - (devices->devices[i]->attr_kernel_exec_timeout) ? "Yes": "No"); - SCLogDebug("Device Integrated With Memory Subsystem: %s", - (devices->devices[i]->attr_integrated) ? 
"Yes": "No"); - SCLogDebug("Device Can Map Host Memory: %s", - (devices->devices[i]->attr_can_map_host_memory) ? "Yes": "No"); - if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_DEFAULT) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_DEFAULT"); - else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_EXCLUSIVE) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_EXCLUSIVE"); - else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_PROHIBITED) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_PROHIBITED"); - } - - return; -} - -/** - * \brief Prints some basic information for the default device(the first devie) - * we will be using on this cuda platform for use by our engine. This - * function is basically to be used to print some minimal information to - * the user at engine startup. - * - * \param devices Pointer to a SCCudaDevices instance that holds information on - * the devices. - */ -void SCCudaPrintBasicDeviceInfo(SCCudaDevices *devices) -{ - int i = 0; - - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return; - } - - for (i = 0; i < devices->count; i++) { - SCLogInfo("GPU Device %d: %s, %d Multiprocessors, %dMHz, CUDA Compute " - "Capability %d.%d", i + 1, - devices->devices[i]->name, - devices->devices[i]->attr_multiprocessor_count, - devices->devices[i]->attr_clock_rate/1000, - devices->devices[i]->major_rev, - devices->devices[i]->minor_rev); - } - - return; -} - -/** - * \brief Gets the device list, for the CUDA platform environment initialized by - * the engine. - * - * \retval devices Pointer to the CUDA device list on success; NULL on failure. - */ -SCCudaDevices *SCCudaGetDeviceList(void) -{ - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return NULL; - } - - return devices; -} - -/*****************************Context_Management_API***************************/ - -/** - * \brief Creates a new CUDA context and associates it with the calling thread. - * The flags parameter is described below. The context is created with - * a usage count of 1 and the caller of cuCtxCreate() must call - * cuCtxDestroy() or cuCtxDetach() when done using the context. If a - * context is already current to the thread, it is supplanted by the - * newly created context and may be restored by a subsequent call to - * cuCtxPopCurrent(). The two LSBs of the flags parameter can be used - * to control how the OS thread, which owns the CUDA context at the - * time of an API call, interacts with the OS scheduler when waiting for - * results from the GPU. - * - * - CU_CTX_SCHED_AUTO: The default value if the flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in - * the process C and the number of logical processors in the system - * P. If C > P, then CUDA will yield to other OS threads when - * waiting for the GPU, otherwise CUDA will not yield while waiting - * for results and actively spin on the processor. - * - CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can de-crease latency when waiting for - * the GPU, but may lower the performance of CPU threads if they are - * performing work in parallel with the CUDA thread. 
- * - CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting - * for results from the GPU. This can increase latency when waiting - * for the GPU, but can increase the performance of CPU threads - * performing work in parallel with the GPU. - * - CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory - * that is accessible to the GPU. - * - * Note to Linux users: - * Context creation will fail with CUDA_ERROR_UNKNOWN if the compute mode - * of the device is CU_COMPUTEMODE_PROHIBITED. Similarly, context creation - * will also fail with CUDA_ERROR_UNKNOWN if the compute mode for the - * device is set to CU_COMPUTEMODE_EXCLUSIVE and there is already an - * active context on the device. The function cuDeviceGetAttribute() can - * be used with CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute - * mode of the device. The nvidia-smi tool can be used to set the compute - * mode for devices. Documentation for nvidia-smi can be obtained by - * passing a -h option to it. - * - * \param pctx Returned context handle of the current context. - * \param flags Context creation flags. - * \param dev Device to create context on. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) -{ - CUresult result = 0; - - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pctx NULL"); - goto error; - } - - result = cuCtxCreate(pctx, flags, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the CUDA context specified by ctx. If the context usage count - * is not equal to 1, or the context is current to any CPU thread other - * than the current one, this function fails. Floating contexts (detached - * from a CPU thread via cuCtxPopCurrent()) may be destroyed by this - * function. - * - * \param ctx Context to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxDestroy(CUcontext ctx) -{ - CUresult result = 0; - - result = cuCtxDestroy(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetApiVersion(CUcontext ctx, unsigned int *version) -{ - CUresult result = 0; - - if (version == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "version NULL"); - goto error; - } - - result = cuCtxGetApiVersion(ctx, version); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_API_VERSION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetCacheConfig(CUfunc_cache *pconfig) -{ - CUresult result = 0; - - if (pconfig == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pconfig NULL"); - goto error; - } - - result = cuCtxGetCacheConfig(pconfig); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_CACHE_CONFIG) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetCurrent(CUcontext *pctx) -{ - CUresult result = 0; - - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "pctx NULL"); - goto error; - } - - result = cuCtxGetCurrent(pctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *device the ordinal of the current context's device. - * - * \param device Returned device id for the current context. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxGetDevice(CUdevice *device) -{ - CUresult result = 0; - - if (device == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "device NULL"); - goto error; - } - - result = cuCtxGetDevice(device); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_DEVICE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetLimit(size_t *pvalue, CUlimit limit) -{ - CUresult result = 0; - - result = cuCtxGetLimit(pvalue, limit); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_LIMIT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Pops the current CUDA context from the CPU thread. The CUDA context - * must have a usage count of 1. CUDA contexts have a usage count of 1 - * upon creation; the usage count may be incremented with cuCtxAttach() - * and decremented with cuCtxDetach(). - * - * If successful, cuCtxPopCurrent() passes back the new context handle - * in *pctx. The old context may then be made current to a different CPU - * thread by calling cuCtxPushCurrent(). - * - * Floating contexts may be destroyed by calling cuCtxDestroy(). - * - * If a context was current to the CPU thread before cuCtxCreate() or - * cuCtxPushCurrent() was called, this function makes that context - * current to the CPU thread again. - * - * \param pctx Returned new context handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxPopCurrent(CUcontext *pctx) -{ - CUresult result = 0; - - result = cuCtxPopCurrent(pctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_POP_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Pushes the given context ctx onto the CPU thread's stack of current - * contexts. The speci?ed context becomes the CPU thread's current - * context, so all CUDA functions that operate on the current context - * are affected. - * - * The previous current context may be made current again by calling - * cuCtxDestroy() or cuCtxPopCurrent(). - * - * The context must be "floating," i.e. not attached to any thread. - * Contexts are made to float by calling cuCtxPopCurrent(). - * - * \param ctx Floating context to attach. - * - * \retval 0 On success. - * \retval -1 On failure. 
- */ -int SCCudaCtxPushCurrent(CUcontext ctx) -{ - CUresult result = 0; - - result = cuCtxPushCurrent(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_PUSH_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxSetCacheConfig(CUfunc_cache config) -{ - CUresult result = 0; - - result = cuCtxSetCacheConfig(config); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_CACHE_CONFIG) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxSetCurrent(CUcontext ctx) -{ - CUresult result = 0; - - result = cuCtxSetCurrent(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxSetLimit(CUlimit limit, size_t value) -{ - CUresult result = 0; - - result = cuCtxSetLimit(value, limit); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_LIMIT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Blocks until the device has completed all preceding requested tasks. - * cuCtxSynchronize() returns an error if one of the preceding tasks failed. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxSynchronize(void) -{ - CUresult result = 0; - - result = cuCtxSynchronize(); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Increments the usage count of the context and passes back a context - * handle in *pctx that must be passed to cuCtxDetach() when the - * application is done with the context. cuCtxAttach() fails if there is - * no context current to the thread. Currently, the flags parameter must - * be 0. - * - * \param pctx Returned context handle of the current context. - * \param flags Context attach flags (must be 0). - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags) -{ - CUresult result = 0; - - SCLogInfo("Cuda API - %s deprecated", - SCMapEnumValueToName(SC_CUDA_CU_CTX_ATTACH, - sc_cuda_api_names_string_map)); - - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pctx NULL"); - goto error; - } - - result = cuCtxAttach(pctx, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_ATTACH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Decrements the usage count of the context ctx, and destroys the - * context if the usage count goes to 0. The context must be a handle - * that was passed back by cuCtxCreate() or cuCtxAttach(), and must be - * current to the calling thread. - * - * \param ctx Context to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxDetach(CUcontext ctx) -{ - CUresult result = 0; - - SCLogInfo("Cuda API - %s deprecated", - SCMapEnumValueToName(SC_CUDA_CU_CTX_DETACH, - sc_cuda_api_names_string_map)); - - result = cuCtxDetach(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DETACH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Module_Management_API****************************/ - -/** - * \brief Returns in *hfunc the handle of the function of name \"name\" located - * in module hmod. If no function of that name exists, - * cuModuleGetFunction() returns CUDA_ERROR_NOT_FOUND. - * - * \param hfunc Returned function handle. - * \param hmod Module to return function from. - * \param name Name of function to retrieve. - * - * \retval 0 On success. 
- * \retval -1 On failure. - */ -int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (hfunc == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "hfunc is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetFunction(hfunc, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_FUNCTION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *dptr and *bytes the base pointer and size of the global - * name \"name\" located in module hmod. If no variable of that name - * exists, cuModuleGetGlobal() returns CUDA_ERROR_NOT_FOUND. Both - * parameters dptr and bytes are optional. If one of them is NULL, - * it is ignored. - * - * \param dptr Returned global device pointer. - * \param bytes Returned global size in bytes. - * \param hmod Module to return function from. - * \param name Name of global to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, - const char *name) -{ - CUresult result = 0; - - if (name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "name is NULL"); - goto error; - } - - result = cuModuleGetGlobal(dptr, bytes, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_GLOBAL) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaModuleGetSurfRef(CUsurfref *p_surf_ref, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (p_surf_ref == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_surf_ref is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetSurfRef(p_surf_ref, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_SURF_REF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *p_tex_ref the handle of the texture reference of name - * \"name\" in the module hmod. If no texture reference of that name - * exists, cuModuleGetTexRef() returns CUDA_ERROR_NOT_FOUND. This texture - * reference handle should not be destroyed, since it will be destroyed - * when the module is unloaded. - * - * \param p_tex_ref Returned global device pointer. - * \param hmod Module to retrieve texture reference from. - * \param name Name of the texture reference to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (p_tex_ref == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_tex_ref is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetTexRef(p_tex_ref, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_TEX_REF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a filename fname and loads the corresponding module \"module\" - * into the current context. The CUDA driver API does not attempt to - * lazily allocate the resources needed by a module; if the memory for - * functions and data (constant and global) needed by the module cannot - * be allocated, cuModuleLoad() fails. The file should be a cubin file - * as output by nvcc or a PTX file, either as output by nvcc or handwrtten. - * - * \param module Returned module. - * \param fname Filename of module to load. 
- * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoad(CUmodule *module, const char *fname) -{ - CUresult result = 0; - - if (module == NULL || fname == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or fname is NULL"); - goto error; - } - - result = cuModuleLoad(module, fname); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a pointer image and loads the corresponding module \"module\" - * into the current context. The pointer may be obtained by mapping a - * cubin or PTX file, passing a cubin or PTX ?le as a NULL-terminated - * text string, or incorporating a cubin object into the executable - * resources and using operating system calls such as Windows - * FindResource() to obtain the pointer. - * - * \param module Returned module. - * \param image Module data to load - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoadData(CUmodule *module, const void *image) -{ - CUresult result = 0; - - if (module == NULL || image == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or image is NULL"); - goto error; - } - - result = cuModuleLoadData(module, image); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a pointer image and loads the corresponding module module into - * the current context. The pointer may be obtained by mapping a cubin or - * PTX file, passing a cubin or PTX file as a NULL-terminated text - * string, or incorporating a cubin object into the executable resources - * and using operating system calls such as Windows FindResource() to - * obtain the pointer. Options are passed as an array via options and any - * corresponding parameters are passed in optionValues. The number of - * total options is supplied via numOptions. Any outputs will be returned - * via optionValues. 
Supported options are: - * - * - CU_JIT_MAX_REGISTERS: input specifies the maximum number of registers - * per thread; - * - CU_JIT_THREADS_PER_BLOCK: input specifies number of threads per block - * to target compilation for; output returns the number of threads - * the compiler actually targeted; - * - CU_JIT_WALL_TIME: output returns the float value of wall clock time, - * in milliseconds, spent compiling the PTX code; - * - CU_JIT_INFO_LOG_BUFFER: input is a pointer to a buffer in which to - * print any informational log messages from PTX assembly; - * - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the - * buffer; output is the number of bytes filled with messages; - * - CU_JIT_ERROR_LOG_BUFFER: input is a pointer to a buffer in which to - * print any error log messages from PTX assembly; - * - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the - * buffer; output is the number of bytes filled with messages; - * - CU_JIT_OPTIMIZATION_LEVEL: input is the level of optimization to apply - * to generated code (0 - 4), with 4 being the default and highest - * level; - * - CU_JIT_TARGET_FROM_CUCONTEXT: causes compilation target to be - * determined based on current attached context (default); - * - CU_JIT_TARGET: input is the compilation target based on supplied - * CUjit_target_enum; possible values are: - * -- CU_TARGET_COMPUTE_10 - * -- CU_TARGET_COMPUTE_11 - * -- CU_TARGET_COMPUTE_12 - * -- CU_TARGET_COMPUTE_13 - * - * \param module Returned module. - * \param image Module data to load. - * \param numOptions Number of options. - * \param options Options for JIT. - * \param optionValues Option values for JIT. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoadDataEx(CUmodule *module, const void *image, - unsigned int num_options, CUjit_option *options, - void **option_values) -{ - CUresult result = 0; - - if (module == NULL || image == NULL || options == NULL || - option_values == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or image is NULL or options is NULL or " - "option_values is NULL"); - goto error; - } - - result = cuModuleLoadDataEx(module, image, num_options, options, option_values); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA_EX) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a pointer fat_cubin and loads the corresponding module \"module\" - * into the current context. The pointer represents a fat binary object, - * which is a collection of different cubin files, all representing the - * same device code, but compiled and optimized for different - * architectures. There is currently no documented API for constructing - * and using fat binary objects by programmers, and therefore this - * function is an internal function in this version of CUDA. More - * information can be found in the nvcc document. - * - * \param module Returned module. - * \param fatCubin Fat binary to load. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin) -{ - CUresult result = 0; - - if (module == NULL || fat_cubin == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "module is NULL or fatCubin is NULL"); - goto error; - } - - result = cuModuleLoadFatBinary(module, fat_cubin); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_FAT_BINARY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Unloads a module hmod from the current context. - * - * \param module Module to unload - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleUnload(CUmodule hmod) -{ - CUresult result = 0; - - result = cuModuleUnload(hmod); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_UNLOAD) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/****************************Memory_Management_API*****************************/ - -/** - * \brief Creates a CUDA array according to the CUDA_ARRAY3D_DESCRIPTOR - * structure pAllocateArray and returns a handle to the new CUDA - * array in *p_handle. The CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * typedef struct { - * unsigned int Width; - * unsigned int Height; - * unsigned int Depth; - * CUarray_format Format; - * unsigned int NumChannels; - * unsigned int Flags; - * } CUDA_ARRAY3D_DESCRIPTOR; - * - * where: - * - * - Width, Height, and Depth are the width, height, and depth of the - * CUDA array (in elements); the CUDA array is one-dimensional if -v * height and depth are 0, two-dimensional if depth is 0, and - * three-dimensional otherwise; - * - Format speci?es the format of the elements; CUarray_format is - * defined as: - * - * typedef enum CUarray_format_enum { - * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - * CU_AD_FORMAT_SIGNED_INT8 = 0x08, - * CU_AD_FORMAT_SIGNED_INT16 = 0x09, - * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - * CU_AD_FORMAT_HALF = 0x10, - * CU_AD_FORMAT_FLOAT = 0x20 - * } CUarray_format; - * - * - NumChannels speci?es the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - Flags provides for future features. For now, it must be set to 0. - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 2048; - * desc.Height = 0; - * desc.Depth = 0; - * - * Description for a 64 x 64 CUDA array of floats: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 64; - * desc.Height = 64; - * desc.Depth = 0; - * - * Description for a width x height x depth CUDA array of 64-bit, - * 4x16-bit float16's: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.FormatFlags = CU_AD_FORMAT_HALF; - * desc.NumChannels = 4; - * desc.Width = width; - * desc.Height = height; - * desc.Depth = depth; - * - * \param p_handle Returned Handle. - * \param p_allocate_array 3D array descriptor. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArray3DCreate(CUarray *p_handle, - const CUDA_ARRAY3D_DESCRIPTOR *p_allocate_array) -{ - CUresult result = 0; - - if (p_handle == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_handle is NULL"); - goto error; - } - - result = cuArray3DCreate(p_handle, p_allocate_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *p_rray_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. 
It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * This function may be called on 1D and 2D arrays, in which case the - * Height and/or Depth members of the descriptor struct will be set to 0. - * - * \param p_array_descriptor Returned 3D array descriptor. - * \param h_array 3D array to get descriptor of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, - CUarray h_array) -{ - CUresult result = 0; - - if (p_array_descriptor == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_array_descriptor is NULL"); - goto error; - } - - result = cuArray3DGetDescriptor(p_array_descriptor, h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Creates a CUDA array according to the CUDA_ARRAY_DESCRIPTOR structure - * p_allocate_array and returns a handle to the new CUDA array in - * p_handle. The CUDA_ARRAY_DESCRIPTOR is defined as: - * - * typedef struct { - * unsigned int Width; - * unsigned int Height; - * CUarray_format Format; - * unsigned int NumChannels; - * } CUDA_ARRAY_DESCRIPTOR; - * - * where: - * - * - Width, and Height are the width, and height of the CUDA array - * (in elements); the CUDA array is one-dimensional if height is 0, - * two-dimensional otherwise; - * - Format speci?es the format of the elements; CUarray_format is - * defined as: - * - * typedef enum CUarray_format_enum { - * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - * CU_AD_FORMAT_SIGNED_INT8 = 0x08, - * CU_AD_FORMAT_SIGNED_INT16 = 0x09, - * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - * CU_AD_FORMAT_HALF = 0x10, - * CU_AD_FORMAT_FLOAT = 0x20 - * } CUarray_format; - * - * - NumChannels specifies the number of packed components per CUDA - * array element; it may be 1, 2, or 4; - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 2048; - * desc.Height = 1; - * - * Description for a 64 x 64 CUDA array of floats: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 64; - * desc.Height = 64; - * - * Description for a width x height CUDA array of 64-bit, 4x16-bit - * float16's: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.FormatFlags = CU_AD_FORMAT_HALF; - * desc.NumChannels = 4; - * desc.Width = width; - * desc.Height = height; - * - * Description for a width x height CUDA array of 16-bit elements, each - * of which is two 8-bit unsigned chars: - * - * CUDA_ARRAY_DESCRIPTOR arrayDesc; - * desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; - * desc.NumChannels = 2; - * desc.Width = width; - * desc.Height = height; - * - * \param p_handle Returned array. - * \param p_allocate_array Array descriptor. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArrayCreate(CUarray *p_handle, - const CUDA_ARRAY_DESCRIPTOR *p_allocate_array) -{ - CUresult result = 0; - - if (p_handle == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "p_handle is NULL"); - goto error; - } - - result = cuArrayCreate(p_handle, p_allocate_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - - -/** - * \brief Destroys the CUDA array h_array. - * - * \param h_array Array to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArrayDestroy(CUarray h_array) -{ - int result = cuArrayDestroy(h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *p_array_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * \param p_array_descriptor Returned array descriptor. - * \param h_array Array to get descriptor of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, - CUarray h_array) -{ - CUresult result = 0; - - if (p_array_descriptor == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_array_descriptor is NULL"); - goto error; - } - - result = cuArrayGetDescriptor(p_array_descriptor, h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_GET_DESCRIPTOR) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaDeviceGetByPCIBusId(CUdevice *dev, char *pci_bus_id) -{ - CUresult result = 0; - - result = cuDeviceGetByPCIBusId(dev, pci_bus_id); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaDeviceGetPCIBusId(char *pci_bus_id, int len, CUdevice dev) -{ - CUresult result = 0; - - result = cuDeviceGetPCIBusId(pci_bus_id, len, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcCloseMemHandle(CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuIpcCloseMemHandle(dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcGetEventHandle(CUipcEventHandle *p_handle, CUevent event) -{ - CUresult result = 0; - - result = cuIpcGetEventHandle(p_handle, event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcGetMemHandle(CUipcMemHandle *p_handle, CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuIpcGetMemHandle(p_handle, dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcOpenEventHandle(CUevent *ph_event, CUipcEventHandle handle) -{ - CUresult result = 0; - - result = cuIpcOpenEventHandle(ph_event, handle); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, - unsigned int flags) -{ - CUresult result = 0; - - result = cuIpcOpenMemHandle(pdptr, handle, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Returns in *p_array_descriptor a descriptor containing information on - * the 
format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * \param p_array_descriptor Returned array descriptor. - * \param h_array Array to get descriptor of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemAlloc(CUdeviceptr *dptr, size_t byte_size) -{ - CUresult result = 0; - - if (dptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "dptr is NULL"); - goto error; - } - - result = cuMemAlloc(dptr, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Allocates bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the vir-tual memory - * ranges allocated with this function and automatically accelerates - * calls to functions such as cuMemcpy(). Since the memory can be - * accessed directly by the device, it can be read or written with - * much higher bandwidth than pageable memory obtained with functions - * such as SCMalloc(). Allocating excessive amounts of memory with - * cuMemAllocHost() may degrade system performance, since it reduces - * the amount of memory available to the system for paging. As a result, - * this function is best used sparingly to allocate staging areas for - * data exchange between host and device. - * - * \param pp Returned host pointer to page-locked memory. - * \param byte_size Requested allocation size in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemAllocHost(void **pp, size_t byte_size) -{ - CUresult result = 0; - - if (pp == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pp is NULL"); - goto error; - } - - result = cuMemAllocHost(pp, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_HOST) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Allocates at least width_in_bytes * height bytes of linear memory on the - * device and returns in *dptr a pointer to the allocated memory. The - * function may pad the allocation to ensure that corresponding pointers in - * any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. ElementSizeBytes - * specifies the size of the largest reads and writes that will be - * performed on the memory range. - * - * element_size_bytes may be 4, 8 or 16 (since coalesced memory - * transactions are not possible on other data sizes). If element_size_bytes - * is smaller than the actual read/write size of a kernel, the kernel will - * run correctly, but possibly at reduced speed. The pitch returned in - * *p_itch by cuMemAllocPitch() is the width in bytes of the allocation. - * The intended usage of pitch is as a separate parameter of the allocation, - * used to compute addresses within the 2D array. Given the row and column - * of an array element of type T, the address is computed as: - * - * T * p_element = (T*)((char*)base_address + row * pitch) + column; - * - * The pitch returned by cuMemAllocPitch() is guaranteed to work with - * cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it - * is recommended that programmers consider performing pitch allocations - * using cuMemAllocPitch(). 
Due to alignment restrictions in the hardware, - * this is especially true if the application will be performing 2D memory - * copies between different regions of device memory (whether linear memory - * or CUDA arrays). - * - * \param dptr Returned device pointer. - * \param p_pitch Returned pitch of allocation in bytes. - * \param width_in_bytes Requested allocation width in bytes. - * \param height Requested allocation width in rows. - * \param element_size_bytes Size of largest reads/writes for range. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemAllocPitch(CUdeviceptr *dptr, size_t *p_pitch, - size_t width_in_bytes, - size_t height, - unsigned int element_size_bytes) -{ - CUresult result = 0; - - if (dptr == NULL || p_pitch == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "dptr is NULL or p_pitch is NULL"); - goto error; - } - - result = cuMemAllocPitch(dptr, p_pitch, width_in_bytes, height, - element_size_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_PITCH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpy(dst, src, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY) == -1) - goto error; - - return 0; - error: - return -1; -} - - -/** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. 
- * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. - * - * \param p_copy Parameters for the memory copy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy2D(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. 
- * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. - * - * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a - * stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param p_copy Parameters for the memory copy. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy2DAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. 
The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. 
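For reference, a minimal sketch of how a host-to-device 2D copy was typically driven through the wrappers above: fill a CUDA_MEMCPY2D descriptor and hand it to SCCudaMemcpy2D(). The helper name CopyFloatImageToDevice is illustrative only, and the sketch assumes a CUDA context is already current and that the pitched destination comes from SCCudaMemAllocPitch().

    #include <string.h>
    #include <cuda.h>

    /* Illustrative sketch only. */
    static int CopyFloatImageToDevice(const float *host_img, size_t cols,
                                      size_t rows, CUdeviceptr *d_img,
                                      size_t *pitch)
    {
        if (SCCudaMemAllocPitch(d_img, pitch, cols * sizeof(float), rows,
                                sizeof(float)) == -1)
            return -1;

        CUDA_MEMCPY2D cp;
        memset(&cp, 0, sizeof(cp));
        cp.srcMemoryType = CU_MEMORYTYPE_HOST;
        cp.srcHost       = host_img;
        cp.srcPitch      = cols * sizeof(float);  /* tightly packed host rows */
        cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        cp.dstDevice     = *d_img;
        cp.dstPitch      = *pitch;                /* pitch from SCCudaMemAllocPitch() */
        cp.WidthInBytes  = cols * sizeof(float);
        cp.Height        = rows;

        if (SCCudaMemcpy2D(&cp) == -1) {
            SCCudaMemFree(*d_img);
            return -1;
        }
        return 0;
    }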
- * - * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a - * stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param p_copy Parameters for the memory copy. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy2DUnaligned(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_UNALIGNED) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 3D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY3D structure is defined as: - * - * typedef struct CUDA_MEMCPY3D_st { - * unsigned int srcXInBytes, srcY, srcZ; - * unsigned int srcLOD; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; // ignored when src is array - * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - * unsigned int dstXInBytes, dstY, dstZ; - * unsigned int dstLOD; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; // ignored when dst is array - * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - * unsigned int WidthInBytes; - * unsigned int Height; - * unsigned int Depth; - * } CUDA_MEMCPY3D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * CUmemorytype_enum is defined as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight - * specify the (host) base address of the source data, the bytes per row, - * and the height of each 2D slice of the 3D array. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and - * srcHeight specify the (device) base address of the source data, the - * bytes per row, and the height of each 2D slice of the 3D array. - * srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle - * of the source data. srcHost, srcDevice, srcPitch and srcHeight are - * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch - * specify the (host) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data, the bytes - * per row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the - * handle of the destination data. dstHost, dstDevice, dstPitch and - * dstHeight are ignored. - * - * - srcXInBytes, srcY and srcZ specify the base address of the source - * data for the copy. 
- * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, dstY and dstZ specify the base address of the destination - * data for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes, Height and Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. Any pitches must be greater - * than or equal to WidthInBytes. - * - * cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be - * set to 0. - * - * \param p_copy Parameters for the memory copy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy3D(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 3D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY3D structure is defined as: - * - * typedef struct CUDA_MEMCPY3D_st { - * unsigned int srcXInBytes, srcY, srcZ; - * unsigned int srcLOD; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; // ignored when src is array - * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - * unsigned int dstXInBytes, dstY, dstZ; - * unsigned int dstLOD; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; // ignored when dst is array - * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - * unsigned int WidthInBytes; - * unsigned int Height; - * unsigned int Depth; - * } CUDA_MEMCPY3D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * CUmemorytype_enum is defined as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight - * specify the (host) base address of the source data, the bytes per row, - * and the height of each 2D slice of the 3D array. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and - * srcHeight specify the (device) base address of the source data, the - * bytes per row, and the height of each 2D slice of the 3D array. - * srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle - * of the source data. 
srcHost, srcDevice, srcPitch and srcHeight are - * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch - * specify the (host) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data, the bytes - * per row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the - * handle of the destination data. dstHost, dstDevice, dstPitch and - * dstHeight are ignored. - * - * - srcXInBytes, srcY and srcZ specify the base address of the source - * data for the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, dstY and dstZ specify the base address of the destination - * data for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes, Height and Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. Any pitches must be greater - * than or equal to WidthInBytes. - * - * cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * cuMemcpy3DAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be - * set to 0. - * - * \param p_copy Parameters for the memory copy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy3DAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *p_copy) -{ - CUresult result = 0; - - result = cuMemcpy3DPeer(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_PEER) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *p_copy, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpy3DPeerAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t byte_count, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyAsync(dst, src, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Copies from one 1D CUDA array to another. dstArray and srcArray - * specify the handles of the destination and source CUDA arrays for the - * copy, respectively. dstIndex and srcIndex specify the destination and - * source indices into the CUDA array. These values are in the range - * [0, Width-1] for the CUDA array; they are not byte offsets. ByteCount - * is the number of bytes to be copied. The size of the elements in the - * CUDA arrays need not be the same format, but the elements must be the - * same size; and count must be evenly divisible by that size. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param src_array Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoA(CUarray dst_array, size_t dst_offset, - CUarray src_array, size_t src_offset, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyAtoA(dst_array, dst_offset, src_array, src_offset, - byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_A) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \param Copies from one 1D CUDA array to device memory. dstDevice specifies the - * base pointer of the destination and must be naturally aligned with the - * CUDA array elements. hSrc and SrcIndex specify the CUDA array handle and - * the index (in array elements) of the array element where the copy is - * to begin. ByteCount speci?es the number of bytes to copy and must be - * evenly divisible by the array element size. - * - * \param dst_device Destination device pointer. - * \param h_src Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray src_array, - size_t src_offset, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyAtoD(dst_device, src_array, src_offset, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \param Copies from one 1D CUDA array to host memory. dstHost specifies the - * base pointer of the destination. srcArray and srcIndex specify the - * CUDA array handle and starting index of the source data. 
ByteCount - * specifies the number of bytes to copy. - * - * \param dst_device Destination device pointer. - * \param h_src Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array, size_t src_offset, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyAtoH(dst_host, src_array, src_offset, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \param Copies from one 1D CUDA array to host memory. dstHost specifies the - * base pointer of the destination. srcArray and srcIndex specify the - * CUDA array handle and starting index of the source data. ByteCount - * specifies the number of bytes to copy. - * - * cuMemcpyAtoHAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero stream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_device Destination device pointer. - * \param src_array Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, - size_t src_offset, size_t byte_count, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyAtoHAsync(dst_host, src_array, src_offset, byte_count, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from device memory to a 1D CUDA array. dstArray and dstIndex - * specify the CUDA array handle and starting index of the destination - * data. srcDevice speci?es the base pointer of the source. ByteCount - * specifies the number of bytes to copy. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoA(CUarray dst_array, size_t dst_offset, - CUdeviceptr src_device, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyDtoA(dst_array, dst_offset, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_A) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from device memory to device memory. dstDevice and srcDevice are - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is asynchronous. - * - * \param dst_device Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. 
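As a minimal sketch of the linear-memory copy wrappers documented here: duplicate an existing device buffer with SCCudaMemcpyDtoD() and read the copy back with the SCCudaMemcpyDtoH() wrapper defined below. The helper name DuplicateAndFetch is illustrative; the sketch assumes a CUDA context is current, and relies on the synchronous DtoH copy on the default stream ordering behind the asynchronous DtoD copy.

    #include <cuda.h>

    /* Illustrative sketch only. */
    static int DuplicateAndFetch(CUdeviceptr d_src, size_t len, void *host_out)
    {
        CUdeviceptr d_copy = 0;

        if (SCCudaMemAlloc(&d_copy, len) == -1)
            return -1;

        /* cuMemcpyDtoD() is asynchronous; the synchronous DtoH copy that
         * follows on the same (default) stream waits for it. */
        if (SCCudaMemcpyDtoD(d_copy, d_src, len) == -1 ||
            SCCudaMemcpyDtoH(host_out, d_copy, len) == -1) {
            SCCudaMemFree(d_copy);
            return -1;
        }

        SCCudaMemFree(d_copy);
        return 0;
    }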
- */ -int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyDtoD(dst_device, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpyDtoDAsync(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyDtoDAsync(dst_device, src_device, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - - -/** - * \brief Copies from device to host memory. dst_host and src_device specify - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is synchronous. - * - * \param dst_host Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyDtoH(dst_host, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from device to host memory. dst_host and src_device specify - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. - * - * cuMemcpyDtoHAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works - * on page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_host Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyDtoHAsync(dst_host, src_device, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index - * specify the CUDA array handle and starting index of the destination - * data. p_src specifies the base address of the source. byte_count - * specifies the number of bytes to copy. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param p_src Source host pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoA(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyHtoA(dst_array, dst_offset, src_host, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index - * specify the CUDA array handle and starting index of the destination - * data. p_src specifies the base address of the source. byte_count - * specfies the number of bytes to copy. 
- * - * cuMemcpyHtoAAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works on - * page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param p_src Source host pointer. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoAAsync(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyHtoAAsync(dst_array, dst_offset, src_host, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to device memory. dst_device and src_host - * are the base addresses of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is synchronous. - * - * \param dst_device Destination device pointer. - * \param src_host Source host pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyHtoD(dst_device, src_host,byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to device memory. dst_device and src_host are - * the base addresses of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. - * - * cuMemcpyHtoDAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works on - * page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * - * \param dst_device Destination device pointer. - * \param src_host Source host pointer. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. 
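The asynchronous host-to-device path described above only pays off with page-locked staging memory, so a typical caller pairs SCCudaMemAllocHost() with SCCudaMemcpyHtoDAsync() and synchronizes before releasing the staging buffer. A minimal sketch follows; the helper name StageAndUpload is illustrative, and the raw cuStreamSynchronize() call stands in for whatever stream wrapper the rest of the file provides.

    #include <string.h>
    #include <cuda.h>

    /* Illustrative sketch only. */
    static int StageAndUpload(CUdeviceptr d_dst, const void *data, size_t len,
                              CUstream stream)
    {
        void *pinned = NULL;

        if (SCCudaMemAllocHost(&pinned, len) == -1)
            return -1;
        memcpy(pinned, data, len);

        if (SCCudaMemcpyHtoDAsync(d_dst, pinned, len, stream) == -1) {
            SCCudaMemFreeHost(pinned);
            return -1;
        }

        /* Wait for the copy before the staging buffer is reused or freed. */
        cuStreamSynchronize(stream);
        SCCudaMemFreeHost(pinned);
        return 0;
    }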
- */ -int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyHtoDAsync(dst_device, src_host, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpyPeer(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyPeer(dst_device, dst_context, src_device, src_context, - byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_PEER) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemcpyPeerAsync(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyPeerAsync(dst_device, dst_context, src_device, src_context, - byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_PEER_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Frees the memory space pointed to by dptr, which must have been - * returned by a previous call to cuMemAlloc() or cuMemAllocPitch(). - * - * \param dptr Pointer to the memory to free. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemFree(CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuMemFree(dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Frees the memory space pointed to by p, which must have been returned - * by a previous call to cuMemAllocHost(). - * - * \param p Pointer to the memory to free. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemFreeHost(void *p) -{ - CUresult result = 0; - - if (p == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p is NULL"); - goto error; - } - - result = cuMemFreeHost(p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE_HOST) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns the base address in *pbase and size in *psize of the allocation - * by cuMemAlloc() or cuMemAllocPitch() that contains the input pointer - * dptr. Both parameters pbase and psize are optional. If one of them is - * NULL, it is ignored. - * - * \param pbase Returned base address. - * \param psize Returned size of device memory allocation. - * \param dptr Device pointer to query - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, - CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuMemGetAddressRange(pbase, psize, dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_ADDRESS_RANGE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *free and *total respectively, the free and total amount - * of memory available for allocation by the CUDA context, in bytes. - * - * \param free Returned free memory in bytes. - * \param total Returned total memory in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemGetInfo(size_t *free, size_t *total) -{ - CUresult result = 0; - - if (free == NULL || total == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "free is NULL || total is NULL"); - goto error; - } - - result = cuMemGetInfo(free, total); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_INFO) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Allocates bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the virtual memory ranges - * allocated with this function and automatically accelerates calls to - * functions such as cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher - * bandwidth than pageable memory obtained with functions such as - * SCMalloc(). Allocating excessive amounts of pinned memory may degrade - * system performance, since it reduces the amount of memory available - * to the system for paging. As a result, this function is best used - * sparingly to allocate staging areas for data exchange between host - * and device. - * - * The Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one - * that performed the allocation. - * - CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA - * address space. The device pointer to the memory may be obtained by - * calling cuMemHostGetDevicePointer(). This feature is available only - * on GPUs with compute capability greater than or equal to 1.1. - * - CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined - * (WC). WC memory can be transferred across the PCI Express bus more - * quickly on some system con?gurations, but cannot be read efficiently - * by most CPUs. WC memory is a good option for buffers that will be - * written by the CPU and read by the GPU via mapped pinned memory or - * host->device transfers. All of these fags are orthogonal to one - * another: a developer may allocate memory that is portable, mapped - * and/or write-combined with no restrictions. - * - * The CUDA context must have been created with the CU_CTX_MAP_HOST flag - * in order for the CU_MEMHOSTALLOC_MAPPED flag to have any effect. - * - * The CU_MEMHOSTALLOC_MAPPED flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is - * deferred to cuMemHostGetDevicePointer() because the memory may be - * mapped into other CUDA contexts via the CU_MEMHOSTALLOC_PORTABLE flag. - * - * The memory allocated by this function must be freed with cuMemFreeHost(). - * - * \param pp Returned host pointer to page-locked memory. - * \param byte_size Requested allocation size in bytes. - * \param flags Flags for allocation request. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostAlloc(pp, byte_size, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_ALLOC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Passes back the device pointer pdptr corresponding to the mapped, - * pinned host buffer p allocated by cuMemHostAlloc. - * - * cuMemHostGetDevicePointer() will fail if the CU_MEMALLOCHOST_DEVICEMAP - * flag was not speci?ed at the time the memory was allocated, or if the - * function is called on a GPU that does not support mapped pinned memory. - * - * Flags provides for future releases. For now, it must be set to 0. 
- * - * \param pdptr Returned device pointer. - * \param p Host pointer. - * \param flags Options(must be 0). - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostGetDevicePointer(pdptr, p, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Passes back the flags p_flags that were specified when allocating the - * pinned host buffer p allocated by cuMemHostAlloc. - * - * cuMemHostGetFlags() will fail if the pointer does not reside in an - * allocation performed by cuMemAllocHost() or cuMemHostAlloc(). - * - * \param p_flags Returned flags word. - * \param p Host pointer. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p) -{ - CUresult result = 0; - - result = cuMemHostGetFlags(p_flags, p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_FLAGS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemHostRegister(void *p, size_t byte_size, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostRegister(p, byte_size, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_REGISTER) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemHostUnregister(void *p) -{ - CUresult result = 0; - - result = cuMemHostUnregister(p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_UNREGISTER) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the memory range of N 16-bit values to the speci?ed value us. - * - * \param dst_device Destination device pointer. - * \param us Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, size_t n) -{ - CUresult result = 0; - - result = cuMemsetD16(dst_device, us, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD16Async(CUdeviceptr dst_device, unsigned short us, - size_t n, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD16Async(dst_device, us, n, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the 2D memory range of Width 16-bit values to the specified - * value us. Height specifies the number of rows to set, and dst_pitch - * specifies the number of bytes between each row. This function - * performs fastest when the pitch is one that has been passed back - * by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param us Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. 
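 *
 * A minimal usage sketch (illustrative 640x480 dimensions; error handling
 * trimmed), assuming a current CUDA context:
 *
 *     CUdeviceptr d_img = 0;
 *     size_t pitch = 0;
 *     if (SCCudaMemAllocPitch(&d_img, &pitch, 640 * sizeof(unsigned short),
 *                             480, sizeof(unsigned short)) == 0) {
 *         /* fill every 16-bit element of the 640x480 region with 0xFFFF */
 *         SCCudaMemsetD2D16(d_img, pitch, 0xFFFF, 640, 480);
 *         SCCudaMemFree(d_img);
 *     }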
- */ -int SCCudaMemsetD2D16(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height) -{ - CUresult result = 0; - - result = cuMemsetD2D16(dst_device, dst_pitch, us, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD2D16Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD2D16Async(dst_device, dst_pitch, us, width, height, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the 2D memory range of Width 32-bit values to the specified value - * ui. Height speci?es the number of rows to set, and dstPitch specifies - * the number of bytes between each row. This function performs fastest - * when the pitch is one that has been passed back by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param ui Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD2D32(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height) -{ - CUresult result = 0; - - result = cuMemsetD2D32(dst_device, dst_pitch, ui, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD2D32Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD2D32Async(dst_device, dst_pitch, ui, width, height, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the 2D memory range of Width 8-bit values to the specified value - * uc. Height speci?es the number of rows to set, and dstPitch specifies - * the number of bytes between each row. This function performs fastest - * when the pitch is one that has been passed back by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param uc Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD2D8(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height) -{ - CUresult result = 0; - - result = cuMemsetD2D8(dst_device, dst_pitch, uc, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD2D8Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD2D8Async(dst_device, dst_pitch, uc, width, height, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the memory range of N 32-bit values to the specified value ui. - * - * \param dst_device Destination device pointer. - * \param ui Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. 
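 *
 * A minimal usage sketch (the element count and fill pattern are arbitrary),
 * assuming a current CUDA context:
 *
 *     CUdeviceptr d_buf = 0;
 *     const size_t n = 1024;
 *     if (SCCudaMemAlloc(&d_buf, n * sizeof(unsigned int)) == 0) {
 *         SCCudaMemsetD32(d_buf, 0xdeadbeef, n);
 *         SCCudaMemFree(d_buf);
 *     }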
- */ -int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, size_t n) -{ - CUresult result = 0; - - result = cuMemsetD32(dst_device, ui, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD32Async(CUdeviceptr dst_device, unsigned int ui, - size_t n, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD32Async(dst_device, ui, n, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the memory range of N 8-bit values to the specified value ui. - * - * \param dst_device Destination device pointer. - * \param uc Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, size_t n) -{ - CUresult result = 0; - - result = cuMemsetD8(dst_device, uc, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD8Async(CUdeviceptr dst_device, unsigned char uc, - size_t n, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD8Async(dst_device, uc, n, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/*****************************Unified_Addressing_API****************************/ - -int SCCudaPointerGetAttribute(void *data, CUpointer_attribute attribute, - CUdeviceptr ptr) -{ - CUresult result = 0; - - result = cuPointerGetAttribute(data, attribute, ptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_POINTER_GET_ATTRIBUTE) == -1) - goto error; - - return 0; - error: - return -1; -} - -/*****************************Stream_Management_API****************************/ - -/** - * \brief Creates a stream and returns a handle in ph_stream. Flags is - * required to be 0. - * - * \param ph_stream Returned newly created stream. - * \param flags Parameters for stream creation(must be 0). - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaStreamCreate(CUstream *ph_stream, unsigned int flags) -{ - CUresult result = 0; - - if (ph_stream == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "phStream is NULL"); - goto error; - } - - result = cuStreamCreate(ph_stream, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the stream specified by h_stream. - * - * \param h_stream Stream to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaStreamDestroy(CUstream h_stream) -{ - CUresult result = 0; - - result = cuStreamDestroy(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns CUDA_SUCCESS if all operations in the stream specifed by - * h_stream have completed, or CUDA_ERROR_NOT_READY if not. - * - * \param h_stream Stream to query status of. - * - * \retval 0 On success. - * \retval -1 On failure. 
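 *
 * A condensed stream sketch (d_buf and host_buf are assumed to exist already,
 * host_buf ideally page-locked via SCCudaMemHostAlloc(); the 4096-byte copy
 * size is arbitrary):
 *
 *     CUstream stream = 0;
 *     if (SCCudaStreamCreate(&stream, 0) == 0) {
 *         SCCudaMemcpyHtoDAsync(d_buf, host_buf, 4096, stream);
 *         /* SCCudaStreamQuery(stream) would poll; synchronizing blocks instead */
 *         SCCudaStreamSynchronize(stream);
 *         SCCudaStreamDestroy(stream);
 *     }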
- */ -int SCCudaStreamQuery(CUstream h_stream) -{ - CUresult result = 0; - - result = cuStreamQuery(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_QUERY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Waits until the device has completed all operations in the stream - * specified by h_stream. - * - * \param h_stream Stream to wait for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaStreamSynchronize(CUstream h_stream) -{ - CUresult result = 0; - - result = cuStreamSynchronize(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaStreamWaitEvent(CUstream h_stream, CUevent h_event, - unsigned int flags) -{ - CUresult result = 0; - - result = cuStreamWaitEvent(h_stream, h_event, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_WAIT_EVENT) == -1) - goto error; - - return 0; - error: - return -1; -} - -/*****************************Event_Management_API*****************************/ - -/** - * \brief Creates an event *ph_event with the flags specified via flags. Valid - * flags include: - * - * CU_EVENT_DEFAULT: Default event creation flag. - * CU_EVENT_BLOCKING_SYNC: Specifies that event should use blocking - * synchronization. - * - * \param ph_event Returns newly created event. - * \param flags Event creation flags. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventCreate(CUevent *ph_event, unsigned int flags) -{ - CUresult result = 0; - - if (ph_event == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ph_event is NULL"); - goto error; - } - - result = cuEventCreate(ph_event, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the event specified by h_event. - * - * \param h_event Event to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventDestroy(CUevent h_event) -{ - CUresult result = 0; - - result = cuEventDestroy(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Computes the elapsed time between two events (in milliseconds with - * a resolution of around 0.5 microseconds). If either event has not - * been recorded yet, this function returns CUDA_ERROR_NOT_READY. If - * either event has been recorded with a non-zero stream, the result - * is undefined. - * - * \param p_milli_seconds Returned elapsed time in milliseconds. - * \param h_start Starting event. - * \param h_end Ending event. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, CUevent h_end) -{ - CUresult result = 0; - - if (p_milli_seconds == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_milli_seconds is NULL"); - goto error; - } - - result = cuEventElapsedTime(p_milli_seconds, h_start, h_end); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_ELAPSED_TIME) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns CUDA_SUCCESS if the event has actually been recorded, or - * CUDA_ERROR_NOT_READY if not. If cuEventRecord() has not been called - * on this event, the function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to query. - * - * \retval 0 On success. - * \retval -1 On failure. 
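 *
 * A condensed timing sketch (what is being timed is only hinted at; the
 * default stream 0 and CU_EVENT_DEFAULT flags are used):
 *
 *     CUevent start, stop;
 *     float msec = 0.0f;
 *     if (SCCudaEventCreate(&start, CU_EVENT_DEFAULT) == 0 &&
 *         SCCudaEventCreate(&stop, CU_EVENT_DEFAULT) == 0) {
 *         SCCudaEventRecord(start, 0);
 *         /* ... queue kernels / copies here ... */
 *         SCCudaEventRecord(stop, 0);
 *         SCCudaEventSynchronize(stop);
 *         SCCudaEventElapsedTime(&msec, start, stop);
 *         SCCudaEventDestroy(start);
 *         SCCudaEventDestroy(stop);
 *     }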
- */ -int SCCudaEventQuery(CUevent h_event) -{ - CUresult result = 0; - - result = cuEventQuery(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_QUERY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Records an event. If stream is non-zero, the event is recorded after - * all preceding operations in the stream have been completed; otherwise, - * it is recorded after all preceding operations in the CUDA context have - * been completed. Since operation is asynchronous, cuEventQuery() and/or - * cuEventSynchronize() must be used to determine when the event has - * actually been recorded. - * - * If cuEventRecord() has previously been called and the event has not - * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to record. - * \param h_stream Stream to record event for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventRecord(CUevent h_event, CUstream h_stream) -{ - CUresult result = 0; - - result = cuEventRecord(h_event, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_RECORD) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Waits until the event has actually been recorded. If cuEventRecord() - * has been called on this event, the function returns - * CUDA_ERROR_INVALID_VALUE. - * - * If cuEventRecord() has previously been called and the event has not - * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to wait for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventSynchronize(CUevent h_event) -{ - CUresult result = 0; - - result = cuEventSynchronize(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/***********************Execution_Control_Management_API***********************/ - -/** - * \brief Returns in *pi the integer value of the attribute attrib on the - * kernel given by hfunc. The supported attributes are: - * - * - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The number of threads - * beyond which a launch of the function would fail. This number - * depends on both the function and the device on which the - * function is currently loaded. - * - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of - * statically-allocated shared memory required by this function. - * This does not include dynamically-allocated shared memory - * requested by the user at runtime. - * - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of - * user-allocated constant memory required by this function. - * - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of thread - * local memory used by this function. - * - CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each - * thread of this function. - * - * \param pi Pointer to an integer which would be updated with the returned - * attribute value. - * \param attrib Attribute requested. - * \param hfunc Function to query attribute of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) -{ - CUresult result = 0; - - if (pi == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "pi is NULL"); - goto error; - } - - result = cuFuncGetAttribute(pi, attrib, hfunc); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_GET_ATTRIBUTE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) -{ - CUresult result = 0; - - result = cuFuncSetCacheConfig(hfunc, config); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_CACHE_CONFIG) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaLaunchKernel(CUfunction f, unsigned int grid_dim_x, - unsigned int grid_dim_y, unsigned int grid_dim_z, - unsigned int block_dim_x, unsigned int block_dim_y, - unsigned int block_dim_z, unsigned int shared_mem_bytes, - CUstream h_stream, void **kernel_params, void **extra) -{ - CUresult result = 0; - - result = cuLaunchKernel(f, grid_dim_x, grid_dim_y, grid_dim_z, - block_dim_x, block_dim_y, block_dim_z, - shared_mem_bytes, h_stream, kernel_params, extra); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_KERNEL) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Specifies the x, y, and z dimensions of the thread blocks that are - * created when the kernel given by hfunc is launched. - * - * \param hfunc Kernel to specify dimensions of. - * \param x X dimension. - * \param y Y dimension. - * \param z Z dimension. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) -{ - CUresult result = 0; - - result = cuFuncSetBlockShape(hfunc, x, y, z); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets through bytes the amount of dynamic shared memory that will be - * available to each thread block when the kernel given by hfunc is - * launched. - * - * \param hfunc Kernel to specify dynamic shared memory for. - * \param bytes Dynamic shared memory size per thread in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) -{ - CUresult result = 0; - - result = cuFuncSetSharedSize(hfunc, bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_SHARED_SIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Invokes the kernel f on a 1 x 1 x 1 grid of blocks. The block contains - * the number of threads specified by a previous call to - * cuFuncSetBlockShape(). - * - * \param f Kernel to launch. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunch(CUfunction f) -{ - CUresult result = 0; - - result = cuLaunch(f); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. - * Each block contains the number of threads specified by a previous call - * to cuFuncSetBlockShape(). - * - * \param f Kernel to launch. - * \param grid_width Width of grid in blocks. - * \param grib_height Height of grid in blocks. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height) -{ - CUresult result = 0; - - result = cuLaunchGrid(f, grid_width, grid_height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. 
- * Each block contains the number of threads specified by a previous call - * to cuFuncSetBlockShape(). cuLaunchGridAsync() can optionally be - * associated to a stream by passing a non-zero hStream argument. - * - * \param f Kernel to launch. - * \param grid_width Width of grid in blocks. - * \param grib_height Height of grid in blocks. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuLaunchGridAsync(f, grid_width, grid_height, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets a foating-point parameter that will be specified the next time - * the kernel corresponding to hfunc will be invoked. offset is a byte - * offset. - * - * \param h_func Kernel to add parameter to. - * \param offset Offset to add parameter to argument list. - * \param value Value of parameter. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetf(CUfunction h_func, int offset, float value) -{ - CUresult result = 0; - - result = cuParamSetf(h_func, offset, value); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets an integer parameter that will be specified the next time - * the kernel corresponding to hfunc will be invoked. offset is a byte - * offset. - * - * \param h_func Kernel to add parameter to. - * \param offset Offset to add parameter to argument list. - * \param value Value of parameter. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value) -{ - CUresult result = 0; - - result = cuParamSeti(h_func, offset, value); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETI) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets through numbytes the total size in bytes needed by the function - * parameters of the kernel corresponding to hfunc. - * - * \param h_func Kernel to set parameter size for. - * \param num_bytes Size of paramter list in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes) -{ - CUresult result = 0; - - result = cuParamSetSize(h_func, num_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_SIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Makes the CUDA array or linear memory bound to the texture reference - * h_tex_ref available to a device program as a texture. In this version - * of CUDA, the texture-reference must be obtained via cuModuleGetTexRef() - * and the tex_unit parameter must be set to CU_PARAM_TR_DEFAULT. - * - * \param h_func Kernel to add texture-reference to. - * \param tex_unit Texture unit (must be CU_PARAM_TR_DEFAULT). - * \param h_tex_ref Texture-reference to add to argument list. - * - * \retval 0 On success. - * \retval -1 On failure. 
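 *
 * A condensed binding sketch (module, kernel and d_data are assumed to come
 * from SCCudaModuleLoadData(), SCCudaModuleGetFunction() and SCCudaMemAlloc();
 * the texture name "tex_buf" and the 4096-byte length are made up):
 *
 *     CUtexref tex_ref;
 *     size_t byte_offset = 0;
 *     SCCudaModuleGetTexRef(&tex_ref, module, "tex_buf");
 *     SCCudaTexRefSetFormat(tex_ref, CU_AD_FORMAT_UNSIGNED_INT8, 1);
 *     SCCudaTexRefSetAddress(&byte_offset, tex_ref, d_data, 4096);
 *     SCCudaParamSetTexRef(kernel, CU_PARAM_TR_DEFAULT, tex_ref);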
- */ -int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref) -{ - CUresult result = 0; - - result = cuParamSetTexRef(h_func, tex_unit, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_TEX_REF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies an arbitrary amount of data (specified in numbytes) from ptr - * into the parameter space of the kernel corresponding to hfunc. - * offset is a byte offset. - * - * \param h_func Kernel to add data to. - * \param offset Offset to add data to argument list. - * \param ptr Pointer to arbitrary data. - * \param num_bytes Size of data to copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, - unsigned int num_bytes) -{ - CUresult result = 0; - - if (ptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ptr is NULL"); - goto error; - } - - result = cuParamSetv(h_func, offset, ptr, num_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETV) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/***********************Texture_Reference_Management_API***********************/ - -/** - * \brief Creates a texture reference and returns its handle in *pTexRef. Once - * created, the application must call cuTexRefSetArray() or cuTexRefSetAddress() - * to associate the reference with allocated memory. Other texture reference - * functions are used to specify the format and interpretation (addressing, - * filtering, etc.) to be used when the memory is read through this texture - * reference. To associate the texture reference with a texture ordinal for - * a given function, the application should call cuParamSetTexRef(). - * - * \param p_tex_ref Returned texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefCreate(CUtexref *p_tex_ref) -{ - CUresult result = 0; - - if (p_tex_ref == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_tex_ref is NULL"); - goto error; - } - - result = cuTexRefCreate(p_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the texture reference specified by hTexRef. - * - * \param h_tex_ref Texture reference to destroy - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefDestroy(CUtexref h_tex_ref) -{ - CUresult result = 0; - - result = cuTexRefDestroy(h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pdptr the base address bound to the texture reference - * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any device memory range. - * - * \param pdptr Returned device address - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (pdptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "pdptr is NULL"); - goto error; - } - - result = cuTexRefGetAddress(pdptr, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pam the addressing mode corresponding to the dimension - * dim of the texture reference hTexRef. Currently, the only valid value - * for dim are 0 and 1. - * - * \param pam Returned addressing mode - * \param h_tex_ref Texture reference - * \param dim Dimension - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, int dim) -{ - CUresult result = 0; - - if (pam == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pam is NULL"); - goto error; - } - - result = cuTexRefGetAddressMode(pam, h_tex_ref, dim); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *phArray the CUDA array bound to the texture reference - * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA array. - * - * \param ph_array Returned array - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (ph_array == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ph_array is NULL"); - goto error; - } - - result = cuTexRefGetArray(ph_array, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ARRAY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pfm the filtering mode of the texture reference hTexRef. - * - * \param pfm Returned filtering mode - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (pfm == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pfm is NULL"); - goto error; - } - - result = cuTexRefGetFilterMode(pfm, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FILTER_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pFlags the flags of the texture reference hTexRef. - * - * \param p_flags Returned flags - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (p_flags == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_flags is NULL"); - goto error; - } - - result = cuTexRefGetFlags(p_flags, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FLAGS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pFormat and *pNumChannels the format and number of - * components of the CUDA array bound to the texture reference hTexRef. - * If pFormat or pNumChannels is NULL, it will be ignored. - * - * \param p_format Returned format - * \param p_num_channels Returned number of components - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. 
- */ -int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, - CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (p_format == NULL || p_num_channels == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_format == NULL || p_num_channels == NULL"); - goto error; - } - - result = cuTexRefGetFormat(p_format, p_num_channels, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FORMAT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Binds a linear address range to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Any memory previously - * bound to hTexRef is unbound. - * - * Since the hardware enforces an alignment requirement on texture - * base addresses, cuTexRefSetAddress() passes back a byte offset in - * *ByteOffset that must be applied to texture fetches in order to read - * from the desired memory. This offset must be divided by the texel - * size and passed to kernels that read from the texture so they can be - * applied to the tex1Dfetch() function. - * - * If the device memory pointer was returned from cuMemAlloc(), the - * offset is guaranteed to be 0 and NULL may be passed as the - * ByteOffset parameter. - * - * \param byte_offset Returned byte offset - * \param h_tex_ref Texture reference to bind - * \param dptr Device pointer to bind - * \param bytes Size of memory to bind in bytes - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetAddress(size_t *byte_offset, CUtexref h_tex_ref, - CUdeviceptr dptr, unsigned int bytes) -{ - CUresult result = 0; - - if (byte_offset == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "byte_offset is NULL"); - goto error; - } - - result = cuTexRefSetAddress(byte_offset, h_tex_ref, dptr, bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Binds a linear address range to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Any memory previously bound - * to hTexRef is unbound. - * - * Using a tex2D() function inside a kernel requires a call to either - * cuTexRefSetArray() to bind the corresponding texture reference to an - * array, or cuTexRefSetAddress2D() to bind the texture reference to - * linear memory. - * - * Function calls to cuTexRefSetFormat() cannot follow calls to - * cuTexRefSetAddress2D() for the same texture reference. - * - * It is required that dptr be aligned to the appropriate hardware- - * specific texture alignment. You can query this value using the device - * attribute CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned dptr - * is supplied, CUDA_ERROR_INVALID_VALUE is returned. - * - * \param h_tex_ref Texture reference to bind - * \param desc Descriptor of CUDA array - * \param dptr Device pointer to bind - * \param pitch Line pitch in bytes - * - * \retval 0 On success. - * \retval -1 On failure. 
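 *
 * The required alignment can be checked up front (dev is assumed to be a
 * CUdevice obtained earlier through SCCudaDeviceGet()):
 *
 *     int tex_align = 0;
 *     if (SCCudaDeviceGetAttribute(&tex_align,
 *                                  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
 *                                  dev) == 0) {
 *         /* dptr passed to this function must be aligned to tex_align bytes */
 *     }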
- */ -int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, const CUDA_ARRAY_DESCRIPTOR *desc, - CUdeviceptr dptr, unsigned int pitch) -{ - CUresult result = 0; - - result = cuTexRefSetAddress2D(h_tex_ref, desc, dptr, pitch); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies the addressing mode am for the given dimension dim of the - * texture reference hTexRef. If dim is zero, the addressing mode is - * applied to the first parameter of the functions used to fetch from - * the texture; if dim is 1, the second, and so on. CUaddress_mode is - * defined as: - * - * typedef enum CUaddress_mode_enum { - * CU_TR_ADDRESS_MODE_WRAP = 0, - * CU_TR_ADDRESS_MODE_CLAMP = 1, - * CU_TR_ADDRESS_MODE_MIRROR = 2, - * } CUaddress_mode; - * - * \param h_tex_ref Texture reference - * \param dim Dimension - * \param am Addressing mode to set - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am) -{ - CUresult result = 0; - - result = cuTexRefSetAddressMode(h_tex_ref, dim, am); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Binds the CUDA array hArray to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Flags must be set to - * CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to hTexRef - * is unbound. - * - * \param h_tex_ref Texture reference to bind - * \param h_array Array to bind - * \param flags Options (must be CU_TRSA_OVERRIDE_FORMAT) - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, unsigned int flags) -{ - CUresult result = 0; - - result = cuTexRefSetArray(h_tex_ref, h_array, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ARRAY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies the filtering mode fm to be used when reading memory through - * the texture reference hTexRef. CUfilter_mode_enum is defined as: - * - * typedef enum CUfilter_mode_enum { - * CU_TR_FILTER_MODE_POINT = 0, - * CU_TR_FILTER_MODE_LINEAR = 1 - * } CUfilter_mode; - * - * \param h_tex_ref Texture reference - * \param fm Filtering mode to set - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm) -{ - CUresult result = 0; - - result = cuTexRefSetFilterMode(h_tex_ref, fm); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FILTER_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies optional flags via Flags to specify the behavior of data - * returned through the texture reference hTexRef. The valid flags are: - * - * * CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in - * the range [0, 1]; - * * CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default - * behavior of having the texture coordinates range from [0, Dim) where - * Dim is the width or height of the CUDA array. Instead, the texture - * coordinates [0, 1.0) reference the entire breadth of the array - * dimension; - * - * \param h_tex_ref Texture reference - * \param flags Optional flags to set - * - * \retval 0 On success. 
- * \retval -1 On failure. - */ -int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags) -{ - CUresult result = 0; - - result = cuTexRefSetFlags(h_tex_ref, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FLAGS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies the format of the data to be read by the texture reference - * hTexRef. fmt and NumPackedComponents are exactly analogous to the - * Format and NumChannels members of the CUDA_ARRAY_DESCRIPTOR structure: - * They specify the format of each component and the number of components - * per array element. - * - * \param h_tex_ref Texture reference - * \param fmt Format to set - * \param num_packed_components Number of components per array element - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, - int num_packed_components) -{ - CUresult result = 0; - - result = cuTexRefSetFormat(h_tex_ref, fmt, num_packed_components); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FORMAT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/**************************Cuda_Env_Initialization_API*************************/ - -/** - * \brief Initialize the CUDA Environment for the engine. - * - * \retval 0 On successfully initializing the CUDA environment for the engine. - * \retval -1 On failure. - */ -int SCCudaInitCudaEnvironment(void) -{ - if (devices != NULL) { - SCLogWarning(SC_ERR_CUDA_ERROR, "CUDA engine already initalized!!!!"); - return 0; - } - - if (SCCudaInit(0) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error initializing CUDA API. SCCudaInit() " - "returned -1"); - goto error; - } - - if ( (devices = SCCudaGetDevices()) == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "Error getting CUDA device list. " - "SCCudaGetDevices() returned NULL"); - goto error; - } - - SCCudaPrintBasicDeviceInfo(devices); - - return 0; - - error: - SCCudaDeAllocSCCudaDevices(devices); - return -1; -} - -/**********************************Cuda_Utility********************************/ - -/** - * \brief List the cuda cards on the system. - * - */ -void SCCudaListCards(void) -{ - int i = 0; - - if (devices == NULL) { - SCLogWarning(SC_ERR_CUDA_ERROR, "CUDA engine not initalized! Please " - "initialize the cuda environment using " - "SCCudaInitCudaEnvironment()."); - return; - } - - printf("CUDA Cards recognized by the suricata CUDA module - \n"); - printf("|-----------------------------------------------------------------------------|\n"); - printf("| %-10s | %-20s | %-10s | %-10s | %-13s |\n", - "Device Id", " Device Name", " Multi-", "Clock Rate", "Cuda Compute"); - printf("| %-10s | %-20s | %-10s | %-10s | %-13s |\n", - "", "", "Processors", " (MHz)", "Capability"); - printf("|-----------------------------------------------------------------------------|\n"); - for (i = 0; i < devices->count; i++) { - printf("| %-10d | %-20s | %-10d | %-10d | %d.%-11d |\n", - i, - devices->devices[i]->name, - devices->devices[i]->attr_multiprocessor_count, - devices->devices[i]->attr_clock_rate/1000, - devices->devices[i]->major_rev, - devices->devices[i]->minor_rev); - } - printf("|-----------------------------------------------------------------------------|\n"); - - return; -} - -int SCCudaIsCudaDeviceIdValid(int cuda_device_id) -{ - if (devices == NULL) { - SCLogWarning(SC_ERR_CUDA_ERROR, "CUDA engine not initalized! 
Please " - "initialize the cuda environment using " - "SCCudaInitCudaEnvironment()."); - return 0; - } - - return (cuda_device_id < devices->count); -} - -/**********************************Unittests***********************************/ - -int SCCudaTest01(void) -{ - SCCudaDevices *devices = SCCudaGetDeviceList(); - - if (devices == NULL) - return 0; - - return (devices->count != 0); -} - -#if defined(__x86_64__) || defined(__ia64__) -/** - * extern "C" __global__ void SCCudaSuricataTest(int *input, int *output) - * { - * output[threadIdx.x] = input[threadIdx.x] * 2; - * } - */ -static const char *sc_cuda_test_kernel_64_bit = - " .version 1.4\n" - " .target sm_10, map_f64_to_f32\n" - " .entry SCCudaSuricataTest (\n" - " .param .u64 __cudaparm_SCCudaSuricataTest_input,\n" - " .param .u64 __cudaparm_SCCudaSuricataTest_output)\n" - "{\n" - " .reg .u32 %r<5>;\n" - " .reg .u64 %rd<8>;\n" - " .loc 15 1 0\n" - " $LBB1_SCCudaSuricataTest:\n" - " .loc 15 3 0\n" - " cvt.u32.u16 %r1, %tid.x;\n" - " cvt.u64.u32 %rd1, %r1;\n" - " mul.lo.u64 %rd2, %rd1, 4;\n" - " ld.param.u64 %rd3, [__cudaparm_SCCudaSuricataTest_input];\n" - " add.u64 %rd4, %rd3, %rd2;\n" - " ld.global.s32 %r2, [%rd4+0];\n" - " mul.lo.s32 %r3, %r2, 2;\n" - " ld.param.u64 %rd5, [__cudaparm_SCCudaSuricataTest_output];\n" - " add.u64 %rd6, %rd5, %rd2;\n" - " st.global.s32 [%rd6+0], %r3;\n" - " .loc 15 4 0\n" - " exit;\n" - " $LDWend_SCCudaSuricataTest:\n" - "} // SCCudaSuricataTest\n" - "\n"; -#else -/** - * extern "C" __global__ void SCCudaSuricataTest(int *input, int *output) - * { - * output[threadIdx.x] = input[threadIdx.x] * 2; - * } - */ -static const char *sc_cuda_test_kernel_32_bit = - " .version 1.4\n" - " .target sm_10, map_f64_to_f32\n" - " .entry SCCudaSuricataTest (\n" - " .param .u32 __cudaparm_SCCudaSuricataTest_input,\n" - " .param .u32 __cudaparm_SCCudaSuricataTest_output)\n" - " {\n" - " .reg .u16 %rh<3>;\n" - " .reg .u32 %r<9>;\n" - " .loc 15 2 0\n" - "$LBB1_SCCudaSuricataTest:\n" - " .loc 15 4 0\n" - " mov.u16 %rh1, %tid.x;\n" - " mul.wide.u16 %r1, %rh1, 4;\n" - " ld.param.u32 %r2, [__cudaparm_SCCudaSuricataTest_input];\n" - " add.u32 %r3, %r2, %r1;\n" - " ld.global.s32 %r4, [%r3+0];\n" - " mul.lo.s32 %r5, %r4, 2;\n" - " ld.param.u32 %r6, [__cudaparm_SCCudaSuricataTest_output];\n" - " add.u32 %r7, %r6, %r1;\n" - " st.global.s32 [%r7+0], %r5;\n" - " .loc 15 5 0\n" - " exit;\n" - "$LDWend_SCCudaSuricataTest:\n" - " } // SCCudaSuricataTest\n" - ""; -#endif - -int SCCudaTest02(void) -{ -#define ALIGN_UP(offset, alignment) do { \ - (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1); \ - } while (0) -#define N 256 - CUcontext context; - CUmodule module; - CUfunction kernel; - CUdeviceptr d_input, d_output; - int h_input[N]; - int h_result[N]; - SCCudaDevices *devices = SCCudaGetDeviceList(); - int result = 0; - int offset = 0; - int i = 0; - - if (devices == NULL) - goto end; - - if (devices->count == 0) - goto end; - - if (SCCudaCtxCreate(&context, 0, devices->devices[0]->device) == -1) - goto end; - -#if defined(__x86_64__) || defined(__ia64__) - if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel_64_bit) == -1) - goto end; -#else - if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel_32_bit) == -1) - goto end; -#endif - - if (SCCudaModuleGetFunction(&kernel, module, "SCCudaSuricataTest") == -1) - goto end; - - for (i = 0; i < N; i++) - h_input[i] = i * 2; - - if (SCCudaMemAlloc(&d_input, N * sizeof(int)) == -1) - goto end; - - if (SCCudaMemcpyHtoD(d_input, h_input, N * sizeof(int)) == -1) - goto 
end; - - if (SCCudaMemAlloc(&d_output, N * sizeof(int)) == -1) - goto end; - - offset = 0; - ALIGN_UP(offset, __alignof(void *)); - if (SCCudaParamSetv(kernel, offset, (void *)&d_input, sizeof(void *)) == -1) - goto end; - offset += sizeof(void *); - - ALIGN_UP(offset, __alignof(void *)); - if (SCCudaParamSetv(kernel, offset, (void *)&d_output, sizeof(void *)) == -1) - goto end; - offset += sizeof(void *); - - if (SCCudaParamSetSize(kernel, offset) == -1) - goto end; - - if (SCCudaFuncSetBlockShape(kernel, N, 1, 1) == -1) - goto end; - - if (SCCudaLaunchGrid(kernel, 1, 1) == -1) - goto end; - - if (SCCudaMemcpyDtoH(h_result, d_output, N * sizeof(int)) == -1) - goto end; - - for (i = 0; i < N; i++) - h_input[i] = i * 4; - - for (i = 0; i < N; i++) { - if (h_result[i] != h_input[i]) - goto end; - } - - if (SCCudaMemFree(d_input) == -1) - goto end; - - if (SCCudaMemFree(d_output) == -1) - goto end; - - if (SCCudaModuleUnload(module) == -1) - goto end; - - if (SCCudaCtxDestroy(context) == -1) - goto end; - - result = 1; - - end: - return result; -} - -void SCCudaRegisterTests(void) -{ -#ifdef UNITTESTS - UtRegisterTest("SCCudaTest01", SCCudaTest01); - UtRegisterTest("SCCudaTest02", SCCudaTest02); -#endif - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda.h b/src/util-cuda.h deleted file mode 100644 index 8e544fd04a..0000000000 --- a/src/util-cuda.h +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#ifndef __UTIL_CUDA__H__ -#define __UTIL_CUDA__H__ - -#ifdef __SC_CUDA_SUPPORT__ - -#include - -#define SC_CUDA_DEFAULT_DEVICE 0 -#define SC_CUDA_DEVICE_NAME_MAX_LEN 128 - -typedef struct SCCudaDevice_ { - /* device id */ - CUdevice device; - - /* device name */ - char name[SC_CUDA_DEVICE_NAME_MAX_LEN]; - - /* device compute capability */ - int major_rev; - int minor_rev; - - /* device properties */ - CUdevprop prop; - - /* device total memory */ - size_t bytes; - - /* device attributes. 
We could have used a fixed int array table to hold - * the attributes, but it is better we specify it exclusively this way, - * since the usage would be less error prone */ - int attr_max_threads_per_block; - int attr_max_block_dim_x; - int attr_max_block_dim_y; - int attr_max_block_dim_z; - int attr_max_grid_dim_x; - int attr_max_grid_dim_y; - int attr_max_grid_dim_z; - int attr_max_shared_memory_per_block; - int attr_total_constant_memory; - int attr_warp_size; - int attr_max_pitch; - int attr_max_registers_per_block; - int attr_clock_rate; - int attr_texture_alignment; - int attr_gpu_overlap; - int attr_multiprocessor_count; - int attr_kernel_exec_timeout; - int attr_integrated; - int attr_can_map_host_memory; - int attr_compute_mode; -} SCCudaDevice; - - -typedef struct SCCudaDevices_ { - int count; - SCCudaDevice **devices; -} SCCudaDevices; - - -/**************************Cuda_Initialization_API**************************/ -int SCCudaInit(unsigned int flags); - -/***************************Version_Management_API***************************/ -int SCCudaDriverGetVersion(int *driver_version); - -/***************************Device_Management_API****************************/ -int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev); -int SCCudaDeviceGet(CUdevice *device, int ordinal); -int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, - CUdevice dev); -int SCCudaDeviceGetCount(int *count); -int SCCudaDeviceGetName(char *name, int len, CUdevice dev); -int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev); -int SCCudaDeviceTotalMem(size_t *bytes, CUdevice dev); - -void SCCudaPrintDeviceList(SCCudaDevices *); -void SCCudaPrintBasicDeviceInfo(SCCudaDevices *); -SCCudaDevices *SCCudaGetDeviceList(void); - -/***************************Context_Management_API***************************/ -int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); -int SCCudaCtxDestroy(CUcontext ctx); -int SCCudaCtxGetApiVersion(CUcontext ctx, unsigned int *version); -int SCCudaCtxGetCacheConfig(CUfunc_cache *pconfig); -int SCCudaCtxGetCurrent(CUcontext *pctx); -int SCCudaCtxGetDevice(CUdevice *device); -int SCCudaCtxGetLimit(size_t *pvalue, CUlimit limit); -int SCCudaCtxPopCurrent(CUcontext *pctx); -int SCCudaCtxPushCurrent(CUcontext ctx); -int SCCudaCtxSetCacheConfig(CUfunc_cache config); -int SCCudaCtxSetCurrent(CUcontext ctx); -int SCCudaCtxSetLimit(CUlimit limit, size_t value); -int SCCudaCtxSynchronize(void); -int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags); -int SCCudaCtxDetach(CUcontext ctx); - -/***************************Module_Management_API****************************/ -int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, - const char *name); -int SCCudaModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, - const char *name); -int SCCudaModuleGetSurfRef(CUsurfref *p_surf_ref, CUmodule hmod, - const char *name); -int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, - const char *name); -int SCCudaModuleLoad(CUmodule *module, const char *fname); -int SCCudaModuleLoadData(CUmodule *module, const void *image); -int SCCudaModuleLoadDataEx(CUmodule *module, const void *image, - unsigned int num_options, CUjit_option *options, - void **option_values); -int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin); -int SCCudaModuleUnload(CUmodule hmod); - -/**************************Memory_Management_API*****************************/ -int SCCudaArray3DCreate(CUarray *p_handle, - const CUDA_ARRAY3D_DESCRIPTOR 
*p_allocate_array); -int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, - CUarray h_array); -int SCCudaArrayCreate(CUarray *p_handle, - const CUDA_ARRAY_DESCRIPTOR *p_allocate_array); -int SCCudaArrayDestroy(CUarray h_array); -int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, - CUarray h_array); -int SCCudaDeviceGetByPCIBusId(CUdevice *dev, char *pci_bus_id); -int SCCudaDeviceGetPCIBusId(char *pci_bus_id, int len, CUdevice dev); -int SCCudaIpcCloseMemHandle(CUdeviceptr dptr); -int SCCudaIpcGetEventHandle(CUipcEventHandle *p_handle, CUevent event); -int SCCudaIpcGetMemHandle(CUipcMemHandle *p_handle, CUdeviceptr dptr); -int SCCudaIpcOpenEventHandle(CUevent *ph_event, CUipcEventHandle handle); -int SCCudaIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, - unsigned int flags); -int SCCudaMemAlloc(CUdeviceptr *dptr, size_t byte_size); -int SCCudaMemAllocHost(void **pp, size_t byte_size); -int SCCudaMemAllocPitch(CUdeviceptr *dptr, size_t *p_pitch, - size_t width_in_bytes, - size_t height, - unsigned int element_size_bytes); -int SCCudaMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t byte_count); -int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy); -int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream); -int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy); -int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy); -int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream); -int SCCudaMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *p_copy); -int SCCudaMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *p_copy, - CUstream h_stream); -int SCCudaMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t byte_count, - CUstream h_stream); -int SCCudaMemcpyAtoA(CUarray dst_array, size_t dst_offset, - CUarray src_array, size_t src_offset, - size_t byte_count); -int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray src_array, - size_t src_offset, size_t byte_count); -int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array, size_t src_offset, - size_t byte_count); -int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, - size_t src_offset, size_t byte_count, - CUstream h_stream); -int SCCudaMemcpyDtoA(CUarray dst_array, size_t dst_offset, - CUdeviceptr src_device, size_t byte_count); -int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count); -int SCCudaMemcpyDtoDAsync(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream); -int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, - size_t byte_count); -int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream); -int SCCudaMemcpyHtoA(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count); -int SCCudaMemcpyHtoAAsync(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count, - CUstream h_stream); -int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, - size_t byte_count); -int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, - size_t byte_count, CUstream h_stream); -int SCCudaMemcpyPeer(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count); -int SCCudaMemcpyPeerAsync(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count, CUstream h_stream); -int SCCudaMemFree(CUdeviceptr dptr); -int SCCudaMemFreeHost(void *p); -int SCCudaMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, - 
CUdeviceptr dptr); -int SCCudaMemGetInfo(size_t *free, size_t *total); -int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags); -int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, - unsigned int flags); -int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p); -int SCCudaMemHostRegister(void *p, size_t byte_size, unsigned int flags); -int SCCudaMemHostUnregister(void *p); -int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, size_t n); -int SCCudaMemsetD16Async(CUdeviceptr dst_device, unsigned short us, - size_t n, CUstream h_stream); -int SCCudaMemsetD2D16(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height); -int SCCudaMemsetD2D16Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height, CUstream h_stream); -int SCCudaMemsetD2D32(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height); -int SCCudaMemsetD2D32Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height, - CUstream h_stream); -int SCCudaMemsetD2D8(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height); -int SCCudaMemsetD2D8Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height, - CUstream h_stream); -int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, size_t n); -int SCCudaMemsetD32Async(CUdeviceptr dst_device, unsigned int ui, - size_t n, CUstream h_stream); -int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, size_t n); -int SCCudaMemsetD8Async(CUdeviceptr dst_device, unsigned char uc, - size_t n, CUstream h_stream); - -/***************************Unified_Addressing_API****************************/ - -int SCCudaPointerGetAttribute(void *data, CUpointer_attribute attribute, - CUdeviceptr ptr); - -/***************************Stream_Management_API****************************/ -int SCCudaStreamCreate(CUstream *ph_stream, unsigned int flags); -int SCCudaStreamDestroy(CUstream h_stream); -int SCCudaStreamQuery(CUstream h_stream); -int SCCudaStreamSynchronize(CUstream h_stream); -int SCCudaStreamWaitEvent(CUstream h_stream, CUevent h_event, - unsigned int flags); - -/***************************Event_Management_API*****************************/ -int SCCudaEventCreate(CUevent *ph_event, unsigned int flags); -int SCCudaEventDestroy(CUevent h_event); -int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, - CUevent h_end); -int SCCudaEventQuery(CUevent h_event); -int SCCudaEventRecord(CUevent h_event, CUstream h_stream); -int SCCudaEventSynchronize(CUevent h_event); - -/***********************Execution_Control_Management_API***********************/ -int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, - CUfunction hfunc); -int SCCudaFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); -int SCCudaLaunchKernel(CUfunction f, unsigned int grid_dim_x, - unsigned int grid_dim_y, unsigned int grid_dim_z, - unsigned int block_dim_x, unsigned int block_dim_y, - unsigned int block_dim_z, unsigned int shared_mem_bytes, - CUstream h_stream, void **kernel_params, void **extra); -int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); -int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); -int SCCudaLaunch(CUfunction f); -int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height); -int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, - CUstream h_stream); -int 
SCCudaParamSetf(CUfunction h_func, int offset, float value); -int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value); -int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes); -int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref); -int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, - unsigned int num_bytes); - -/*********************Texture_Reference_Management_API***********************/ -int SCCudaTexRefCreate(CUtexref *p_tex_ref); -int SCCudaTexRefDestroy(CUtexref h_tex_ref); -int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref); -int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, - int dim); -int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref); -int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref); -int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref); -int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, - CUtexref h_tex_ref); -int SCCudaTexRefSetAddress(size_t *byte_offset, CUtexref h_tex_ref, - CUdeviceptr dptr, unsigned int bytes); -int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, - const CUDA_ARRAY_DESCRIPTOR *desc, - CUdeviceptr dptr, unsigned int pitch); -int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am); -int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, - unsigned int flags); -int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm); -int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags); -int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, - int num_packed_components); - -/************************Cuda_Env_Initialization_API*************************/ -int SCCudaInitCudaEnvironment(void); - -/********************************Cuda_Utility********************************/ -void SCCudaListCards(void); -int SCCudaIsCudaDeviceIdValid(int cuda_device_id); - -/********************************Unittests***********************************/ -void SCCudaRegisterTests(void); - -#endif /* __SC_CUDA_SUPPORT__ */ -#endif /* __UTIL_CUDA_H__ */ diff --git a/src/util-mpm-ac-bs.c b/src/util-mpm-ac-bs.c index 16fb9ba795..ded1ad18fa 100644 --- a/src/util-mpm-ac-bs.c +++ b/src/util-mpm-ac-bs.c @@ -995,8 +995,6 @@ void SCACBSInitThreadCtx(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx) * \brief Initialize the AC context. * * \param mpm_ctx Mpm context. - * \param module_handle Cuda module handle from the cuda handler API. We don't - * have to worry about this here. */ void SCACBSInitCtx(MpmCtx *mpm_ctx) { diff --git a/src/util-mpm-ac-cuda-kernel.cu b/src/util-mpm-ac-cuda-kernel.cu deleted file mode 100644 index d7cc125bf2..0000000000 --- a/src/util-mpm-ac-cuda-kernel.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (C) 2007-2012 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -/** - * \file - * - * \author Anoop Saldanha - * - * The Cuda kernel for MPM AC. - * - * \todo - This is a basic version of the kernel. - * - Support 16 bit state tables. - * - Texture memory. - * - Multiple threads per blocks of threads. Make use of - * shared memory/texture memory. - */ - -extern "C" -__global__ void SCACCudaSearch64(unsigned char *d_buffer, - unsigned int d_buffer_start_offset, - unsigned int *o_buffer, - unsigned int *results_buffer, - unsigned int nop, - unsigned char *tolower) -{ - unsigned int u = 0; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= nop) - return; - - unsigned int buflen = *((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset))); - unsigned int (*state_table_u32)[256] = - (unsigned int (*)[256])*((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8)); - unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 16); - - unsigned int state = 0; - unsigned int matches = 0; - unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1); - for (u = 0; u < buflen; u++) { - state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]]; - if (state & 0xFF000000) { - results[matches++] = u; - results[matches++] = state & 0x00FFFFFF; - } - } - - *(results - 1) = matches; - return; -} - -extern "C" -__global__ void SCACCudaSearch32(unsigned char *d_buffer, - unsigned int d_buffer_start_offset, - unsigned int *o_buffer, - unsigned int *results_buffer, - unsigned int nop, - unsigned char *tolower) -{ - unsigned int u = 0; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= nop) - return; - - unsigned int buflen = *((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset))); - unsigned int (*state_table_u32)[256] = - (unsigned int (*)[256])*((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 4)); - unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8); - - unsigned int state = 0; - unsigned int matches = 0; - unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1); - for (u = 0; u < buflen; u++) { - state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]]; - if (state & 0xFF000000) { - results[matches++] = u; - results[matches++] = state & 0x00FFFFFF; - } - } - - *(results - 1) = matches; - return; -} diff --git a/src/util-mpm-ac.c b/src/util-mpm-ac.c index 0331fd0297..42666f111d 100644 --- a/src/util-mpm-ac.c +++ b/src/util-mpm-ac.c @@ -60,15 +60,6 @@ #include "util-mpm-ac.h" #include "util-memcpy.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-mpm.h" -#include "tm-threads.h" -#include "detect-engine-mpm.h" -#include "util-cuda.h" -#include "util-cuda-handlers.h" -#endif /* __SC_CUDA_SUPPORT__ */ - void SCACInitCtx(MpmCtx *); void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *); void SCACDestroyCtx(MpmCtx *); @@ -851,25 +842,6 @@ int SCACPreparePatterns(MpmCtx *mpm_ctx) /* prepare the state table required by AC */ SCACPrepareStateTable(mpm_ctx); -#ifdef __SC_CUDA_SUPPORT__ - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - int r = SCCudaMemAlloc(&ctx->state_table_u32_cuda, - ctx->state_count * sizeof(unsigned int) * 256); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - - r = SCCudaMemcpyHtoD(ctx->state_table_u32_cuda, - ctx->state_table_u32, - ctx->state_count * sizeof(unsigned int) * 256); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - 
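/* [Editor's sketch, not Suricata code: the surrounding hunk of
 * SCACPreparePatterns() uploads the 32-bit AC state table through the
 * SCCudaMemAlloc()/SCCudaMemcpyHtoD() wrappers declared in util-cuda.h above.
 * Assuming those wrappers are thin shims over the CUDA driver API that map
 * CUresult onto the 0 / -1 convention the callers test, the upload step can
 * be illustrated as below. example_upload_state_table() is a hypothetical
 * name and not part of the removed code. */
#include <cuda.h>
#include <stdio.h>

static int example_upload_state_table(CUdeviceptr *out_dev,
                                       const unsigned int *state_table_u32,
                                       unsigned int state_count)
{
    /* one row of 256 next-state entries per AC state, as in the hunk above */
    size_t bytes = (size_t)state_count * 256 * sizeof(unsigned int);

    CUresult res = cuMemAlloc(out_dev, bytes);
    if (res != CUDA_SUCCESS) {
        fprintf(stderr, "cuMemAlloc failed: %d\n", (int)res);
        return -1;
    }
    res = cuMemcpyHtoD(*out_dev, state_table_u32, bytes);
    if (res != CUDA_SUCCESS) {
        fprintf(stderr, "cuMemcpyHtoD failed: %d\n", (int)res);
        cuMemFree(*out_dev);
        return -1;
    }
    return 0;
}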
exit(EXIT_FAILURE); - } - } -#endif - /* free all the stored patterns. Should save us a good 100-200 mbs */ for (i = 0; i < mpm_ctx->pattern_cnt; i++) { if (ctx->parray[i] != NULL) { @@ -1258,549 +1230,6 @@ void SCACPrintInfo(MpmCtx *mpm_ctx) return; } -/****************************Cuda side of things****************************/ - -#ifdef __SC_CUDA_SUPPORT__ - -/* \todo Technically it's generic to all mpms, but since we use ac only, the - * code internally directly references ac and hence it has found its - * home in this file, instead of util-mpm.c - */ -void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx) -{ - MpmCtx *mpm_ctx = NULL; - - int ac_16_tables = 0; - int ac_32_tables = 0; - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - if (ac_16_tables > 0 && ac_32_tables > 0) - SCACConstructBoth16and32StateTables(); - - SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables); - SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables); - -} - -void CudaReleasePacket(Packet *p) -{ - if (p->cuda_pkt_vars.cuda_mpm_enabled == 1) { - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - p->cuda_pkt_vars.cuda_done = 0; - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - } - - return; -} - -/* \todos - * - Use texture memory - Can we fit all the arrays into a 3d texture. - * Texture memory definitely offers slightly better performance even - * on gpus that offer cache for global memory. - * - Packetpool - modify to support > 65k max pending packets. We are - * hitting packetpool limit currently even with 65k packets. - * - Use streams. We have tried overlapping parsing results from the - * previous call with invoking the next call. - * - Offer higher priority to decode threads. - * - Modify pcap file mode to support reading from multiple pcap files - * and hence we will have multiple receive threads. - * - Split state table into many small pieces and have multiple threads - * run each small state table on the same payload. - * - Used a config peference of l1 over shared memory with no noticeable - * perf increase. Explore it in detail over cards/architectures. - * - Constant memory performance sucked. Explore it in detail. - * - Currently all our state tables are small. 
Implement 16 bit state - * tables on priority. - * - Introduce profiling. - * - Retrieve sgh before buffer packet. - */ - -void SCACConstructBoth16and32StateTables(void) -{ - construct_both_16_and_32_state_tables = 1; - - return; -} - -/* \todo Reduce offset buffer size. Probably a 100,000 entry would be sufficient. */ -static void *SCACCudaDispatcher(void *arg) -{ -#define BLOCK_SIZE 32 - - int r = 0; - ThreadVars *tv = (ThreadVars *)arg; - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - uint32_t sleep_interval_ms = conf->batching_timeout; - - SCLogInfo("AC Cuda Mpm Dispatcher using a timeout of " - "\"%"PRIu32"\" micro-seconds", sleep_interval_ms); - - CudaBufferData *cb_data = - CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, - MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - - CUcontext cuda_context = - CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "context is NULL."); - exit(EXIT_FAILURE); - } - r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "context push failed."); - exit(EXIT_FAILURE); - } - CUmodule cuda_module = 0; - if (CudaHandlerGetCudaModule(&cuda_module, "util-mpm-ac-cuda-kernel") < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda module."); - exit(EXIT_FAILURE); - } - CUfunction kernel = 0; -#if __WORDSIZE==64 - if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch64") == -1) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel"); - exit(EXIT_FAILURE); - } -#else - if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch32") == -1) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel"); - exit(EXIT_FAILURE); - } -#endif - - uint8_t g_u8_lowercasetable[256]; - for (int c = 0; c < 256; c++) - g_u8_lowercasetable[c] = tolower((uint8_t)c); - CUdeviceptr cuda_g_u8_lowercasetable_d = 0; - CUdeviceptr cuda_packets_buffer_d = 0; - CUdeviceptr cuda_offset_buffer_d = 0; - CUdeviceptr cuda_results_buffer_d = 0; - uint32_t *cuda_results_buffer_h = NULL; - r = SCCudaMemAlloc(&cuda_g_u8_lowercasetable_d, sizeof(g_u8_lowercasetable)); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemcpyHtoD(cuda_g_u8_lowercasetable_d, g_u8_lowercasetable, sizeof(g_u8_lowercasetable)); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAlloc(&cuda_packets_buffer_d, conf->gpu_transfer_size); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAlloc(&cuda_offset_buffer_d, conf->gpu_transfer_size * 4); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAlloc(&cuda_results_buffer_d, conf->gpu_transfer_size * 8); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAllocHost((void **)&cuda_results_buffer_h, conf->gpu_transfer_size * 8); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - - CudaBufferCulledInfo cb_culled_info; - memset(&cb_culled_info, 0, sizeof(cb_culled_info)); - - TmThreadsSetFlag(tv, THV_INIT_DONE); - while (1) { - if (TmThreadsCheckFlag(tv, THV_KILL)) - break; - - usleep(sleep_interval_ms); - - /**************** 1 SEND ****************/ - CudaBufferCullCompletedSlices(cb_data, &cb_culled_info, conf->gpu_transfer_size); - if 
(cb_culled_info.no_of_items == 0) - continue; -#if 0 - SCLogInfo("1 - cb_culled_info.no_of_items-%"PRIu32" " - "cb_culled_info.buffer_len - %"PRIu32" " - "cb_culled_info.average size - %f " - "cb_culled_info.d_buffer_start_offset - %"PRIu32" " - "cb_culled_info.op_buffer_start_offset - %"PRIu32" " - "cb_data.no_of_items - %"PRIu32" " - "cb_data.d_buffer_read - %"PRIu32" " - "cb_data.d_buffer_write - %"PRIu32" " - "cb_data.op_buffer_read - %"PRIu32" " - "cb_data.op_buffer_write - %"PRIu32"\n", - cb_culled_info.no_of_items, - cb_culled_info.d_buffer_len, - cb_culled_info.d_buffer_len / (float)cb_culled_info.no_of_items, - cb_culled_info.d_buffer_start_offset, - cb_culled_info.op_buffer_start_offset, - cb_data->no_of_items, - cb_data->d_buffer_read, - cb_data->d_buffer_write, - cb_data->op_buffer_read, - cb_data->op_buffer_write); -#endif - r = SCCudaMemcpyHtoDAsync(cuda_packets_buffer_d, (cb_data->d_buffer + cb_culled_info.d_buffer_start_offset), cb_culled_info.d_buffer_len, 0); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemcpyHtoDAsync(cuda_offset_buffer_d, (cb_data->o_buffer + cb_culled_info.op_buffer_start_offset), sizeof(uint32_t) * cb_culled_info.no_of_items, 0); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - exit(EXIT_FAILURE); - } - void *args[] = { &cuda_packets_buffer_d, - &cb_culled_info.d_buffer_start_offset, - &cuda_offset_buffer_d, - &cuda_results_buffer_d, - &cb_culled_info.no_of_items, - &cuda_g_u8_lowercasetable_d }; - r = SCCudaLaunchKernel(kernel, - (cb_culled_info.no_of_items / BLOCK_SIZE) + 1, 1, 1, - BLOCK_SIZE, 1, 1, - 0, 0, - args, NULL); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaLaunchKernel failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemcpyDtoHAsync(cuda_results_buffer_h, cuda_results_buffer_d, sizeof(uint32_t) * (cb_culled_info.d_buffer_len * 2), 0); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyDtoH failure."); - exit(EXIT_FAILURE); - } - - - - /**************** 1 SYNCHRO ****************/ - r = SCCudaCtxSynchronize(); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaCtxSynchronize failure."); - exit(EXIT_FAILURE); - } - - /************* 1 Parse Results ************/ - uint32_t i_op_start_offset = cb_culled_info.op_buffer_start_offset; - uint32_t no_of_items = cb_culled_info.no_of_items; - uint32_t *o_buffer = cb_data->o_buffer; - uint32_t d_buffer_start_offset = cb_culled_info.d_buffer_start_offset; - for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) { - Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset]; - - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - if (p->cuda_pkt_vars.cuda_mpm_enabled == 0) { - p->cuda_pkt_vars.cuda_done = 0; - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - continue; - } - - p->cuda_pkt_vars.cuda_gpu_matches = - cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)]; - if (p->cuda_pkt_vars.cuda_gpu_matches != 0) { - memcpy(p->cuda_pkt_vars.cuda_results, - cuda_results_buffer_h + - ((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2), - (cuda_results_buffer_h[((o_buffer[i_op_start_offset] - - d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4); - } - - p->cuda_pkt_vars.cuda_done = 1; - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - SCCondSignal(&p->cuda_pkt_vars.cuda_cond); - } - if (no_of_items != 0) - CudaBufferReportCulledConsumption(cb_data, &cb_culled_info); - } /* while (1) */ - - r = SCCudaModuleUnload(cuda_module); - if (r 
< 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error unloading cuda module."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFree(cuda_packets_buffer_d); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFree(cuda_offset_buffer_d); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFree(cuda_results_buffer_d); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFreeHost(cuda_results_buffer_h); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - exit(EXIT_FAILURE); - } - - TmThreadsSetFlag(tv, THV_RUNNING_DONE); - TmThreadWaitForFlag(tv, THV_DEINIT); - TmThreadsSetFlag(tv, THV_CLOSED); - - return NULL; - -#undef BLOCK_SIZE -} - -uint32_t SCACCudaPacketResultsProcessing(Packet *p, const MpmCtx *mpm_ctx, - PrefilterRuleStore *pmq) -{ - uint32_t u = 0; - - while (!p->cuda_pkt_vars.cuda_done) { - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - if (p->cuda_pkt_vars.cuda_done) { - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - break; - } else { - SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex); - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - } - } /* while */ - p->cuda_pkt_vars.cuda_done = 0; - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - - uint32_t cuda_matches = p->cuda_pkt_vars.cuda_gpu_matches; - if (cuda_matches == 0) - return 0; - - uint32_t matches = 0; - uint32_t *results = p->cuda_pkt_vars.cuda_results + 1; - uint8_t *buf = p->payload; - SCACCtx *ctx = mpm_ctx->ctx; - SCACOutputTable *output_table = ctx->output_table; - SCACPatternList *pid_pat_list = ctx->pid_pat_list; - - uint8_t bitarray[ctx->pattern_id_bitarray_size]; - memset(bitarray, 0, ctx->pattern_id_bitarray_size); - - for (u = 0; u < cuda_matches; u += 2) { - uint32_t offset = results[u]; - uint32_t state = results[u + 1]; - /* we should technically be doing state & 0x00FFFFFF, but we don't - * since the cuda kernel does that for us */ - uint32_t no_of_entries = output_table[state].no_of_entries; - /* we should technically be doing state & 0x00FFFFFF, but we don't - * since the cuda kernel does that for us */ - uint32_t *pids = output_table[state].pids; - uint32_t k; - /* note that this is not a verbatim copy from SCACSearch(). We - * don't copy the pattern id into the pattern_id_array. 
That's - * the only change */ - for (k = 0; k < no_of_entries; k++) { - if (pids[k] & AC_CASE_MASK) { - uint32_t lower_pid = pids[k] & 0x0000FFFF; - if (SCMemcmp(pid_pat_list[lower_pid].cs, - buf + offset - pid_pat_list[lower_pid].patlen + 1, - pid_pat_list[lower_pid].patlen) != 0) { - /* inside loop */ - continue; - } - if (bitarray[(lower_pid) / 8] & (1 << ((lower_pid) % 8))) { - ; - } else { - bitarray[(lower_pid) / 8] |= (1 << ((lower_pid) % 8)); - PrefilterAddSids(pmq, pid_pat_list[lower_pid].sids, - pid_pat_list[lower_pid].sids_size); - } - matches++; - } else { - if (bitarray[pids[k] / 8] & (1 << (pids[k] % 8))) { - ; - } else { - bitarray[pids[k] / 8] |= (1 << (pids[k] % 8)); - PrefilterAddSids(pmq, pid_pat_list[pids[k]].sids, - pid_pat_list[pids[k]].sids_size); - } - matches++; - } - } - } - - return matches; -} - -void SCACCudaStartDispatcher(void) -{ - /* create the threads */ - ThreadVars *tv = TmThreadCreate("Cuda_Mpm_AC_Dispatcher", - NULL, NULL, - NULL, NULL, - "custom", SCACCudaDispatcher, 0); - if (tv == NULL) { - SCLogError(SC_ERR_THREAD_CREATE, "Error creating a thread for " - "ac cuda dispatcher. Killing engine."); - exit(EXIT_FAILURE); - } - if (TmThreadSpawn(tv) != 0) { - SCLogError(SC_ERR_THREAD_SPAWN, "Failed to spawn thread for " - "ac cuda dispatcher. Killing engine."); - exit(EXIT_FAILURE); - } - - return; -} - -int MpmCudaBufferSetup(void) -{ - int r = 0; - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context."); - return -1; - } - r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context."); - return -1; - } - - uint8_t *d_buffer = NULL; - uint32_t *o_buffer = NULL; - void **p_buffer = NULL; - - r = SCCudaMemAllocHost((void *)&d_buffer, conf->cb_buffer_size); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure."); - return -1; - } - SCLogInfo("Allocated a cuda d_buffer - %"PRIu32" bytes", conf->cb_buffer_size); - r = SCCudaMemAllocHost((void *)&o_buffer, sizeof(uint32_t) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failue."); - return -1; - } - r = SCCudaMemAllocHost((void *)&p_buffer, sizeof(void *) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure."); - return -1; - } - - r = SCCudaCtxPopCurrent(NULL); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure."); - return -1; - } - - CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, conf->cb_buffer_size, o_buffer, p_buffer, UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); - if (cb == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error registering new cb instance."); - return -1; - } - CudaHandlerModuleStoreData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME, cb); - - return 0; -} - -int MpmCudaBufferDeSetup(void) -{ - int r = 0; - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - CudaBufferData *cb_data = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - BUG_ON(cb_data == NULL); - - 
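/* [Editor's sketch, not Suricata code: MpmCudaBufferSetup() above and the
 * remainder of MpmCudaBufferDeSetup() below push the module's CUDA context,
 * allocate or free page-locked host buffers via the SCCudaMemAllocHost()/
 * SCCudaMemFreeHost() wrappers, then pop the context again. Assuming those
 * wrappers sit directly on the CUDA driver API, the pattern looks roughly
 * like the hypothetical example_pinned_roundtrip() below. */
#include <cuda.h>
#include <stddef.h>

static int example_pinned_roundtrip(CUcontext ctx, size_t bytes)
{
    void *host_buf = NULL;

    if (cuCtxPushCurrent(ctx) != CUDA_SUCCESS)              /* make ctx current */
        return -1;
    if (cuMemAllocHost(&host_buf, bytes) != CUDA_SUCCESS) { /* page-locked alloc */
        cuCtxPopCurrent(NULL);
        return -1;
    }

    /* ... a real setup would hand host_buf to CudaBufferRegisterNew() here ... */

    cuMemFreeHost(host_buf);        /* release the page-locked buffer */
    cuCtxPopCurrent(NULL);          /* leave the context again */
    return 0;
}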
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context."); - return -1; - } - r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context."); - return -1; - } - - r = SCCudaMemFreeHost(cb_data->d_buffer); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - return -1; - } - r = SCCudaMemFreeHost(cb_data->o_buffer); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - return -1; - } - r = SCCudaMemFreeHost(cb_data->p_buffer); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - return -1; - } - - r = SCCudaCtxPopCurrent(NULL); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure."); - return -1; - } - - CudaBufferDeRegister(cb_data); - - return 0; -} - -#endif /* __SC_CUDA_SUPPORT */ /************************** Mpm Registration ***************************/ @@ -1825,31 +1254,6 @@ void MpmACRegister(void) return; } -#ifdef __SC_CUDA_SUPPORT__ - -/** - * \brief Register the aho-corasick cuda mpm. - */ -void MpmACCudaRegister(void) -{ - mpm_table[MPM_AC_CUDA].name = "ac-cuda"; - mpm_table[MPM_AC_CUDA].InitCtx = SCACInitCtx; - mpm_table[MPM_AC_CUDA].InitThreadCtx = SCACInitThreadCtx; - mpm_table[MPM_AC_CUDA].DestroyCtx = SCACDestroyCtx; - mpm_table[MPM_AC_CUDA].DestroyThreadCtx = SCACDestroyThreadCtx; - mpm_table[MPM_AC_CUDA].AddPattern = SCACAddPatternCS; - mpm_table[MPM_AC_CUDA].AddPatternNocase = SCACAddPatternCI; - mpm_table[MPM_AC_CUDA].Prepare = SCACPreparePatterns; - mpm_table[MPM_AC_CUDA].Search = SCACSearch; - mpm_table[MPM_AC_CUDA].PrintCtx = SCACPrintInfo; - mpm_table[MPM_AC_CUDA].PrintThreadCtx = SCACPrintSearchStats; - mpm_table[MPM_AC_CUDA].RegisterUnittests = SCACRegisterTests; - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ - /*************************************Unittests********************************/ #ifdef UNITTESTS diff --git a/src/util-mpm-ac.h b/src/util-mpm-ac.h index 4fa2f9da25..a9dbd6a090 100644 --- a/src/util-mpm-ac.h +++ b/src/util-mpm-ac.h @@ -28,16 +28,6 @@ #define SC_AC_STATE_TYPE_U16 uint16_t #define SC_AC_STATE_TYPE_U32 uint32_t -#ifdef __SC_CUDA_SUPPORT__ -#include "suricata-common.h" -#include "util-cuda.h" -#include "util-cuda-vars.h" -#include "decode.h" -#include "util-cuda-buffer.h" -#include "util-mpm.h" -#include "flow.h" -#endif /* __SC_CUDA_SUPPORT__ */ - typedef struct SCACPatternList_ { uint8_t *cs; uint16_t patlen; @@ -83,10 +73,6 @@ typedef struct SCACCtx_ { uint32_t allocated_state_count; -#ifdef __SC_CUDA_SUPPORT__ - CUdeviceptr state_table_u16_cuda; - CUdeviceptr state_table_u32_cuda; -#endif /* __SC_CUDA_SUPPORT__ */ } SCACCtx; typedef struct SCACThreadCtx_ { @@ -98,105 +84,4 @@ typedef struct SCACThreadCtx_ { void MpmACRegister(void); - -#ifdef __SC_CUDA_SUPPORT__ - -#define MPM_AC_CUDA_MODULE_NAME "ac_cuda" -#define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb" - -static inline void CudaBufferPacket(CudaThreadVars *ctv, Packet *p) -{ - if (p->cuda_pkt_vars.cuda_mpm_enabled) { - while (!p->cuda_pkt_vars.cuda_done) { - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - if (p->cuda_pkt_vars.cuda_done) { - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - break; - } else { - SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex); - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - } - } - } - p->cuda_pkt_vars.cuda_done = 
0; - - if (p->payload_len == 0 || - (p->flags & (PKT_NOPAYLOAD_INSPECTION & PKT_NOPACKET_INSPECTION)) || - (p->flags & PKT_ALLOC) || - (ctv->data_buffer_size_min_limit != 0 && p->payload_len < ctv->data_buffer_size_min_limit) || - (p->payload_len > ctv->data_buffer_size_max_limit && ctv->data_buffer_size_max_limit != 0) ) { - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - - MpmCtx *mpm_ctx = NULL; - if (p->proto == IPPROTO_TCP) { - if (p->flowflags & FLOW_PKT_TOSERVER) - mpm_ctx = ctv->mpm_proto_tcp_ctx_ts; - else - mpm_ctx = ctv->mpm_proto_tcp_ctx_tc; - } else if (p->proto == IPPROTO_UDP) { - if (p->flowflags & FLOW_PKT_TOSERVER) - mpm_ctx = ctv->mpm_proto_udp_ctx_ts; - else - mpm_ctx = ctv->mpm_proto_udp_ctx_tc; - } else { - mpm_ctx = ctv->mpm_proto_other_ctx; - } - if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) { - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - -#if __WORDSIZE==64 - CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb, - p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr), - (void *)p); - if (slice == NULL) { - SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report " - "this to dev."); - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len; - *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; - memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); -#else - CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb, - p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr), - (void *)p); - if (slice == NULL) { - SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report " - "this to dev."); - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len; - *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; - memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); -#endif - p->cuda_pkt_vars.cuda_mpm_enabled = 1; - SC_ATOMIC_SET(slice->done, 1); - - SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n", - p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda); - - return; -} - -void MpmACCudaRegister(void); -void SCACConstructBoth16and32StateTables(void); -int MpmCudaBufferSetup(void); -int MpmCudaBufferDeSetup(void); -void SCACCudaStartDispatcher(void); -void SCACCudaKillDispatcher(void); -uint32_t SCACCudaPacketResultsProcessing(Packet *p, const MpmCtx *mpm_ctx, - PrefilterRuleStore *pmq); -void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx); - -void CudaReleasePacket(Packet *p); - -#endif /* __SC_CUDA_SUPPORT__ */ - - #endif /* __UTIL_MPM_AC__H__ */ diff --git a/src/util-mpm.c b/src/util-mpm.c index b8b68f26dd..0dfe1793ba 100644 --- a/src/util-mpm.c +++ b/src/util-mpm.c @@ -35,16 +35,11 @@ #include "util-hashlist.h" #include "detect-engine.h" -#include "util-cuda.h" #include "util-misc.h" #include "conf.h" #include "conf-yaml-loader.h" #include "queue.h" #include "util-unittest.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-handlers.h" -#include "detect-engine-mpm.h" -#endif #include "util-memcpy.h" #ifdef BUILD_HYPERSCAN #include "hs.h" @@ -258,142 +253,6 @@ void MpmFactoryDeRegisterAllMpmCtxProfiles(DetectEngineCtx *de_ctx) return; } -#ifdef 
__SC_CUDA_SUPPORT__ - -static void MpmCudaConfFree(void *conf) -{ - SCFree(conf); - return; -} - -static void *MpmCudaConfParse(ConfNode *node) -{ - const char *value; - - MpmCudaConf *conf = SCMalloc(sizeof(MpmCudaConf)); - if (unlikely(conf == NULL)) - exit(EXIT_FAILURE); - memset(conf, 0, sizeof(*conf)); - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "data-buffer-size-min-limit"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT; - } else if (ParseSizeStringU16(value, &conf->data_buffer_size_min_limit) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "data-buffer-size-min-limit - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "data-buffer-size-max-limit"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT; - } else if (ParseSizeStringU16(value, &conf->data_buffer_size_max_limit) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "data-buffer-size-max-limit - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "cudabuffer-buffer-size"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT; - } else if (ParseSizeStringU32(value, &conf->cb_buffer_size) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "cb-buffer-size - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "gpu-transfer-size"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE; - } else if (ParseSizeStringU32(value, &conf->gpu_transfer_size) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "gpu-transfer-size - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "batching-timeout"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT; - } else if ((conf->batching_timeout = atoi(value)) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "batching-timeout - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "device-id"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT; - } else if ((conf->device_id = atoi(value)) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "device-id - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "cuda-streams"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT; - } else if ((conf->cuda_streams = atoi(value)) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." 
- "cuda-streams - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - return conf; -} - -void MpmCudaEnvironmentSetup() -{ - if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) - return; - - CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree); - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm " - "profile."); - exit(EXIT_FAILURE); - } - - if (MpmCudaBufferSetup() < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error setting up env for ac " - "cuda"); - exit(EXIT_FAILURE); - } - - return; -} - -#endif - void MpmInitThreadCtx(MpmThreadCtx *mpm_thread_ctx, uint16_t matcher) { mpm_table[matcher].InitThreadCtx(NULL, mpm_thread_ctx); @@ -446,9 +305,6 @@ void MpmTableSetup(void) MpmHSRegister(); #endif /* HAVE_HS_VALID_PLATFORM */ #endif /* BUILD_HYPERSCAN */ -#ifdef __SC_CUDA_SUPPORT__ - MpmACCudaRegister(); -#endif /* __SC_CUDA_SUPPORT__ */ } int MpmAddPatternCS(struct MpmCtx_ *mpm_ctx, uint8_t *pat, uint16_t patlen, diff --git a/src/util-mpm.h b/src/util-mpm.h index 3b7960a86d..d6a1605139 100644 --- a/src/util-mpm.h +++ b/src/util-mpm.h @@ -33,9 +33,6 @@ enum { /* aho-corasick */ MPM_AC, -#ifdef __SC_CUDA_SUPPORT__ - MPM_AC_CUDA, -#endif MPM_AC_BS, MPM_AC_TILE, MPM_HS, @@ -168,42 +165,6 @@ typedef struct MpmTableElmt_ { MpmTableElmt mpm_table[MPM_TABLE_SIZE]; int mpm_default_matcher; -/* macros decides if cuda is enabled for the platform or not */ -#ifdef __SC_CUDA_SUPPORT__ - -/* the min size limit of a payload(or any other data) to be buffered */ -#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT 0 -/* the max size limit of a payload(or any other data) to be buffered */ -#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT 1500 -/* Default value for data buffer used by cuda mpm engine for CudaBuffer reg */ -#define UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT 500 * 1024 * 1024 -/* Default value for the max data chunk that would be sent to gpu */ -#define UTIL_MPM_CUDA_GPU_TRANSFER_SIZE 50 * 1024 * 1024 -/* Default value for offset/pointer buffer to be used by cuda mpm - * engine for CudaBuffer reg */ -#define UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT 500000 -#define UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT 2000 -#define UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT 2 -#define UTIL_MPM_CUDA_DEVICE_ID_DEFAULT 0 - -/** - * \brief Cuda configuration for "mpm" profile. We can further extend this - * to have conf for specific mpms. For now its common for all mpms. 
- */ -typedef struct MpmCudaConf_ { - uint16_t data_buffer_size_min_limit; - uint16_t data_buffer_size_max_limit; - uint32_t cb_buffer_size; - uint32_t gpu_transfer_size; - int batching_timeout; - int device_id; - int cuda_streams; -} MpmCudaConf; - -void MpmCudaEnvironmentSetup(); - -#endif /* __SC_CUDA_SUPPORT__ */ - struct DetectEngineCtx_; int32_t MpmFactoryRegisterMpmCtxProfile(struct DetectEngineCtx_ *, const char *); diff --git a/src/util-running-modes.c b/src/util-running-modes.c index ebc44a3fb6..e152c5a251 100644 --- a/src/util-running-modes.c +++ b/src/util-running-modes.c @@ -25,7 +25,6 @@ #include "app-layer-detect-proto.h" #include "app-layer.h" #include "app-layer-parser.h" -#include "util-cuda.h" #include "util-unittest.h" #include "util-debug.h" #include "conf-yaml-loader.h" @@ -55,11 +54,3 @@ int ListAppLayerProtocols() exit(EXIT_SUCCESS); } -#ifdef __SC_CUDA_SUPPORT__ -int ListCudaCards() -{ - SCCudaInitCudaEnvironment(); - SCCudaListCards(); - exit(EXIT_SUCCESS); -} -#endif diff --git a/src/util-running-modes.h b/src/util-running-modes.h index 510a86f125..f047e02dde 100644 --- a/src/util-running-modes.h +++ b/src/util-running-modes.h @@ -23,11 +23,7 @@ #ifndef __UTIL_RUNNING_MODES_H__ #define __UTIL_RUNNING_MODES_H__ - int ListKeywords(const char *keyword_info); int ListAppLayerProtocols(void); -#ifdef __SC_CUDA_SUPPORT__ -int ListCudaCards(void); -#endif #endif /* __UTIL_RUNNING_MODES_H__ */ diff --git a/suricata.yaml.in b/suricata.yaml.in index eb89c8c428..4944ba1c81 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -967,10 +967,6 @@ host-mode: auto # Number of packets preallocated per thread. The default is 1024. A higher number # will make sure each CPU will be more easily kept busy, but may negatively # impact caching. -# -# If you are using the CUDA pattern matcher (mpm-algo: ac-cuda), different rules -# apply. In that case try something like 60000 or more. This is because the CUDA -# pattern matcher buffers and scans as many packets as possible in parallel. #max-pending-packets: 1024 # Runmode the engine should use. Please check --list-runmodes to get the available @@ -1345,7 +1341,6 @@ detect: # The supported algorithms are: # "ac" - Aho-Corasick, default implementation # "ac-bs" - Aho-Corasick, reduced memory implementation -# "ac-cuda" - Aho-Corasick, CUDA implementation # "ac-ks" - Aho-Corasick, "Ken Steele" variant # "hs" - Hyperscan, available when built with Hyperscan support # @@ -1358,10 +1353,6 @@ detect: # to be set to "single", because of ac's memory requirements, unless the # ruleset is small enough to fit in one's memory, in which case one can # use "full" with "ac". Rest of the mpms can be run in "full" mode. -# -# There is also a CUDA pattern matcher (only available if Suricata was -# compiled with --enable-cuda: b2g_cuda. Make sure to update your -# max-pending-packets setting above as well if you use b2g_cuda. mpm-algo: auto @@ -1724,40 +1715,6 @@ mpipe: size10386: 0 size16384: 0 -## -## Hardware accelaration -## - -# Cuda configuration. -cuda: - # The "mpm" profile. On not specifying any of these parameters, the engine's - # internal default values are used, which are same as the ones specified in - # in the default conf file. - mpm: - # The minimum length required to buffer data to the gpu. - # Anything below this is MPM'ed on the CPU. - # Can be specified in kb, mb, gb. Just a number indicates it's in bytes. - # A value of 0 indicates there's no limit. 
- data-buffer-size-min-limit: 0 - # The maximum length for data that we would buffer to the gpu. - # Anything over this is MPM'ed on the CPU. - # Can be specified in kb, mb, gb. Just a number indicates it's in bytes. - data-buffer-size-max-limit: 1500 - # The ring buffer size used by the CudaBuffer API to buffer data. - cudabuffer-buffer-size: 500mb - # The max chunk size that can be sent to the gpu in a single go. - gpu-transfer-size: 50mb - # The timeout limit for batching of packets in microseconds. - batching-timeout: 2000 - # The device to use for the mpm. Currently we don't support load balancing - # on multiple gpus. In case you have multiple devices on your system, you - # can specify the device to use, using this conf. By default we hold 0, to - # specify the first device cuda sees. To find out device-id associated with - # the card(s) on the system run "suricata --list-cuda-cards". - device-id: 0 - # No of Cuda streams used for asynchronous processing. All values > 0 are valid. - # For this option you need a device with Compute Capability > 1.0. - cuda-streams: 2 ## ## Include other configs
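Editor's note: the size-style options removed above (cudabuffer-buffer-size: 500mb,
gpu-transfer-size: 50mb) accept kb/mb/gb suffixes, with a bare number meaning bytes;
the removed MpmCudaConfParse() fed them through ParseSizeStringU32(). As a rough
illustration only, not Suricata's parser, such a conversion can be sketched as:

    #include <stdint.h>
    #include <stdlib.h>
    #include <strings.h>

    /* Hypothetical helper: convert "500mb"-style strings to a byte count.
     * The exact behaviour of ParseSizeStringU32() may differ. */
    static int example_parse_size(const char *str, uint64_t *bytes)
    {
        char *end = NULL;
        uint64_t val = strtoull(str, &end, 10);
        if (end == str)
            return -1;
        while (*end == ' ')
            end++;
        if (*end == '\0')
            *bytes = val;                                  /* plain number: bytes */
        else if (strncasecmp(end, "kb", 2) == 0)
            *bytes = val * 1024ULL;
        else if (strncasecmp(end, "mb", 2) == 0)
            *bytes = val * 1024ULL * 1024ULL;
        else if (strncasecmp(end, "gb", 2) == 0)
            *bytes = val * 1024ULL * 1024ULL * 1024ULL;
        else
            return -1;
        return 0;
    }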