From: Victor Julien
Date: Sat, 30 Dec 2017 13:55:26 +0000 (+0100)
Subject: cuda: remove
X-Git-Tag: suricata-4.1.0-beta1~377
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F3128%2Fhead;p=thirdparty%2Fsuricata.git

cuda: remove

Remove CUDA support as it has been broken for a long time.

Ticket #2382.
---

diff --git a/configure.ac b/configure.ac
index 805afdeddc..6536e70af8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1347,80 +1347,6 @@ fi
-  # enable CUDA output
-    AC_ARG_ENABLE(cuda,
-           AS_HELP_STRING([--enable-cuda], [Enable experimental CUDA pattern matching]),,[enable_cuda=no])
-    AS_IF([test "x$enable_cuda" = "xyes"], [
-        AC_ARG_WITH(cuda_includes,
-                [  --with-cuda-includes=DIR  cuda include directory],
-                [with_cuda_includes="$withval"],[with_cuda_includes=no])
-        AC_ARG_WITH(cuda_libraries,
-                [  --with-cuda-libraries=DIR  cuda library directory],
-                [with_cuda_libraries="$withval"],[with_cuda_libraries="no"])
-        AC_ARG_WITH(cuda_nvcc,
-                [  --with-cuda-nvcc=DIR  cuda nvcc compiler directory],
-                [with_cuda_nvcc="$withval"],[with_cuda_nvcc=no])
-
-        AC_DEFINE([__SC_CUDA_SUPPORT__],[1],(CUDA support enabled))
-
-        if test "$with_cuda_includes" != "no"; then
-            CPPFLAGS="${CPPFLAGS} -I${with_cuda_includes}"
-        else
-            CPPFLAGS="${CPPFLAGS} -I/usr/local/cuda/include"
-        fi
-
-        if test "$with_cuda_libraries" != "no"; then
-            LDFLAGS="${LDFLAGS} -L${with_cuda_libraries}"
-        fi
-
-        if test "$with_cuda_nvcc" != "no"; then
-            NVCC_DIR="${with_cuda_nvcc}"
-        else
-            NVCC_DIR="/usr/local/cuda/bin"
-        fi
-
-        AC_CHECK_HEADER(cuda.h,,[AC_ERROR(cuda.h not found ...)])
-
-        LIBCUDA=""
-        AC_CHECK_LIB(cuda, cuArray3DCreate,, LIBCUDA="no")
-        if test "$LIBCUDA" = "no"; then
-            echo
-            echo " ERROR! libcuda library not found"
-            echo
-            exit 1
-        fi
-
-        AC_PATH_PROG([NVCC], [nvcc], no, [$PATH:$NVCC_DIR])
-        if test "x$NVCC" = "xno"; then
-            echo
-            echo " ERROR! CUDA nvcc compiler not found: use --with-cuda-nvcc=DIR"
-            echo
-            exit 1
-        fi
-
-        AC_MSG_CHECKING(for nvcc version)
-        NVCCVER=`$NVCC --version | grep "release" | sed 's/.*release \(@<:@0-9@:>@\)\.\(@<:@0-9@:>@\).*/\1\2/'`
-        AC_MSG_RESULT($NVCCVER)
-        if test "$NVCCVER" -lt 31; then
-            echo
-            echo " Warning! Your CUDA nvcc version might be outdated."
-            echo " If compilation fails try the latest CUDA toolkit from"
-            echo " www.nvidia.com/object/cuda_develop.html"
-            echo
-        fi
-
-        AM_PATH_PYTHON(,, no)
-        if test "x$PYTHON" = "xno"; then
-            echo
-            echo " ERROR! Compiling CUDA kernels requires python."
-            echo
-            exit 1
-        fi
-    ])
-    AM_CONDITIONAL([BUILD_CUDA], [test "x$enable_cuda" = "xyes"])
-    AM_CONDITIONAL([__SC_CUDA_SUPPORT__], [test "x$enable_cuda" = "xyes"])
-
   # Check for libcap-ng
   case $host in
   *-*-linux*)
@@ -2220,7 +2146,6 @@ SURICATA_BUILD_CONF="Suricata Configuration:
   libgeoip:                  ${enable_geoip}
   Non-bundled htp:           ${enable_non_bundled_htp}
   Old barnyard2 support:     ${enable_old_barnyard2}
-  CUDA enabled:              ${enable_cuda}
   Hyperscan support:         ${enable_hyperscan}
   Libnet support:            ${enable_libnet}
diff --git a/doc/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt b/doc/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt
deleted file mode 100644
index 18ea5d1027..0000000000
--- a/doc/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt
+++ /dev/null
@@ -1,149 +0,0 @@
-Autogenerated on 2012-11-29
-from
-  https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6
-
-
-Installation with CUDA and PFRING on Scientific Linux 6
-
-For setup and install you need to be root:
-mkdir /root/src
-cd /root/src
-
-Pre installation requirements
-
-Install the following packages, to make sure you have everything needed for the
-installation:
-
-    yum install mpfr-2.4.1-6.el6.x86_64 cpp-4.4.4-13.el6.x86_64 ppl-0.10.2-11.el6.x86_64 \
-    cloog-ppl-0.15.7-1.2.el6.x86_64 gcc-4.4.4-13.el6.x86_64 kernel-devel-2.6.32-131.2.1.el6.x86_64 \
-    pcre-devel-7.8-3.1.el6.x86_64 libpcap-devel-1.0.0-6.20091201git117cb5.el6.x86_64 \
-    yum-plugin-priorities-1.1.26-11.el6.noarch yum-conf-sl6x-1-1.noarch libyaml-0.1.3-1.el6.rf.x86_64 \
-    libyaml-devel-0.1.3-1.el6.rf.x86_64 libnet-1.1.2.1-2.2.el6.rf.x86_64 flex-2.5.35-8.el6.x86_64 \
-    bison-2.4.1-5.el6.x86_64 gcc-c++-4.4.4-13.el6.x86_64
-
-
-CUDA
-
-Download and install NVIDIA CUDA drivers:
-
-    wget http://us.download.nvidia.com/XFree86/Linux-x86_64/270.41.19/NVIDIA-Linux-x86_64-270.41.19.run
-    chmod +x NVIDIA-Linux-x86_64-270.41.19.run
-    ./NVIDIA-Linux-x86_64-270.41.19.run
-
-You also need to download and install the CUDA toolkit for RHEL6:
-
-    wget http://developer.download.nvidia.com/compute/cuda/4_0/toolkit/cudatoolkit_4.0.17_linux_64_rhel6.0.run
-    chmod +x cudatoolkit_4.0.17_linux_64_rhel6.0.run
-    ./cudatoolkit_4.0.17_linux_64_rhel6.0.run
-
-Make sure the kernel modules are loaded:
-
-    /sbin/modprobe -r nouveau && /sbin/modprobe nvidia
-
-To ensure the proper NVIDIA CUDA modules get loaded on reboot, add the above
-line to your /etc/rc.local file.
-
-PF_RING
-
-Go to your download directory and get the latest PF_RING:
-
-    svn export https://svn.ntop.org/svn/ntop/trunk/PF_RING/ pfring-svn-latest
-
-Compile and install
-Next, enter the following commands for configuration and installation:
-
-    cd pfring-svn-latest/kernel
-    make && sudo make install
-    cd ../userland/lib
-    ./configure --prefix=/usr/local/pfring && make && sudo make install
-    cd ../libpcap-1.1.1-ring
-    ./configure --prefix=/usr/local/pfring && make && sudo make install
-    cd ../tcpdump-4.1.1
-    ./configure --prefix=/usr/local/pfring && make && sudo make install
-
-Load the pf_ring kernel module:
-
-    /sbin/modprobe pf_ring
-
-To ensure the pf_ring module gets loaded on reboot, add the above line to your
-/etc/rc.local file.
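If you prefer to keep the two boot-time module commands from this guide in one place, the /etc/rc.local additions could look roughly like the sketch below. This is only an illustrative combination of the commands already given above; adjust it to your own setup:

    # load the NVIDIA driver in place of nouveau, then the PF_RING module
    /sbin/modprobe -r nouveau && /sbin/modprobe nvidia
    /sbin/modprobe pf_ring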
- -Suricata - -Download and install Suricata: - - wget http://www.openinfosecfoundation.org/download/suricata-1.1beta2.tar.gz - -And unpack it: - - tar -xvzf suricata-1.1beta2.tar.gz - -Change to the unpacked directory: - - cd suricata-1.1beta2 - -Now compile and install Suricata with PF_RING and CUDA support: - - ./configure --enable-gccprotect --enable-profiling --enable-cuda --with-cuda- - includes=/usr/local/cuda/include \ - --with-cuda-libraries=/usr/local/cuda/lib64 --enable-pfring --with-libpfring- - libraries=/usr/local/lib \ - --with-libpfring-includes=/usr/local/include --with-libpcap-libraries=/usr/ - local/lib --with-libpcap-includes=/usr/local/include - make - make install - -Continue with the Basic_Setup -Next, you need to edit max-pending-packets in your /etc/suricata/suricata.yaml. -If you don't have one, download a generic one to get started: - - cd /etc/suricata - wget https://rules.emergingthreatspro.com/open-nogpl/suricata/suricata- - open.yaml - -Edit your suricata-open.yaml file accordingly. -The number of packets allowed to be processed simultaneously can be whatever -you want but it is recommended that it be 4000 or more. -For example: - - max-pending-packets: 12288 - -Next make sure the following line is present in the multi pattern algorithm -section: - - mpm-algo: b2g_cuda - - -Rules - -Read the information in Rule_Management_with_Oinkmaster -Add rules to suricata: - - cd /etc/suricata - wget https://rules.emergingthreatspro.com/open-nogpl/suricata/ - emerging.rules.tar.gz - tar -xvzf emerging.rules.tar.gz - -Make sure your .yaml file includes the /etc/suricata/rules/emerging-*.rules -files (they may need to be uncommented). -Run Suricata as followed: - - cd /etc/suricata - /usr/local/bin/suricata -c /etc/suricata/suricata.yaml\ - --pfring-int=eth0 --pfring-cluster-id=99 --pfring-cluster-type=cluster_flow - - - touch /var/lock/subsys/local - - -References - -PF_RING -http://www.ntop.org/products/pf_ring/ diff --git a/doc/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt b/doc/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt deleted file mode 100644 index 1d1cd22000..0000000000 --- a/doc/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt +++ /dev/null @@ -1,280 +0,0 @@ -Autogenerated on 2012-01-11 -from - https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104 - - -Installation with CUDA and PF RING on Ubuntu server 11.04 - -THIS WOULD NOT WORK ON A VIRTUAL MACHINE! -This guide is written using: -Ubuntu Server 11.04 -Linux ubuntu 2.6.38-8-generic x86_64 GNU/Linux - -Pre installation requirements - - - apt-get update - apt-get upgrade - -To get the CUDA toolkit, enter: - - http://developer.nvidia.com/cuda-toolkit-40 - -Pick up the correct NVIDIA drivers for your card and system - - http://www.nvidia.com/Download/index.aspx?lang=en-us - -Go to your download directory -chmod the 2 *.run files that you just downloaded. 
-For example: - - chmod 655 cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - chmod 655 NVIDIA-Linux-x86_64-280.13.run - - - sudo apt-get -y install libpcre3 libpcre3-dbg libpcre3-dev \ - build-essential autoconf automake libtool libpcap-dev libnet1-dev \ - libyaml-0-2 libyaml-dev zlib1g zlib1g-dev libcap-ng-dev libcap-ng0 \ - make flex bison git - -Run the cuda toolkit installation package: - - sudo ./cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - -Close all windows and as you are logged in press: - - Ctr+Alt+F1 - -Log in with your credentials - - sudo -i - -And enter your password -Stop the x server: - - /etc/init.d/gdm stop - -Uninstall xserver video drivers: - - apt-get remove --purge xserver-xorg-video-nouveau - -Go to the directory where you downloaded nvidia/cuda drivers. -Run the NVIDIA*******.run: - - ./NVIDIA********.run - -Ok and yes your way out. -At some point it will ask you to make a special configuration file to disable a -"nouveau" -driver that the system is currently using - say yes! -Reboot: - - shutdown -r now - -After reboot log in as you would normally do through the GUI -Log in as you would normally. -Go to shell: - - Ctrl+Alt+F1 - -Type in your credentials and pass - - sudo -i - -Stop the xserver again: - - /etc/init.d/gdm stop - -Run the NVIDIA driver again. -This time it would finish and be successful.... -Reboot: - - shutdown -r now - -After start you would notice that the display has much better resolution - it -is a good thing. -Log in as you would normally. -Because the 11.04 Ubuntu comes with gcc version 4.5 by default, you need to -install gcc 4.4 since you must use 4.4 for the cuda compilation: - - apt-get install gcc-4.4 gcc-4.4-base g++-4.4 - -Then we switch and make ubuntu use the gcc 4.4 by default: - - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.5 40 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.5 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.4 - -Make sure that this is the case: - - sudo update-alternatives --config gcc - -"" - - update-alternatives --config gcc (as root) - -There are 2 choices for the alternative gcc (providing /usr/bin/gcc). - - - Selection Path Priority Status - ------------------------------------------------------------ - * 0 /usr/bin/gcc-4.4 60 auto mode - 1 /usr/bin/gcc-4.4 60 manual mode - 2 /usr/bin/gcc-4.5 40 manual mode - - Press enter to keep the current choice[*], or type selection number (as - root) - "" - - -PF_RING installation. 
- -Install pre-requisites: - - cd /opt - apt-get install subversion gobjc++-4.4-multilib gobjc++-4.4 - -Get the latest PF_RING: - - svn --force export https://svn.ntop.org/svn/ntop/trunk/PF_RING/ PF_RING - -Install PF_RING: - - cd /kernel - make && make install - sudo insmod ./pf_ring.ko - cd ../userland - make && make install - cd /lib - ./configure && make && make install - cd ../libpcap - ./configure && make && make install - cd ../examples - echo "options pf_ring transparent_mode=0 min_num_slots=32768 - enable_tx_capture=0" > /etc/modprobe.d/pf_ring.conf - -Check info: - - cat /proc/net/pf_ring/info - "" - cd ../kernel - cat /proc/net/pf_ring/info - PF_RING Version : 4.7.3 ($Revision: exported$) - Ring slots : 4096 - Slot version : 13 - Capture TX : Yes [RX+TX] - IP Defragment : No - Socket Mode : Standard - Transparent mode : Yes (mode 0) - Total rings : 0 - Total plugins : 0 - - "" - -Check functionality: - - ./pfcount -i eth0 - -You should see something even if you have no traffic at the moment: -"" -cd /opt/PF_RING/userland/examples -./pfcount -i eth0 -Using PF_RING v.4.7.3 -Capturing from eth0 [88:AE:1D:56:90:FA] - - 1. Device RX channels: 1 - 2. Polling threads: 1 ========================= - Absolute Stats: [0 pkts rcvd][0 pkts dropped] - Total Pkts=0/Dropped=0.0 % - 0 pkts - 0 bytes ========================= - -========================= -Absolute Stats: [0 pkts rcvd][0 pkts dropped] -Total Pkts=0/Dropped=0.0 % -0 pkts - 0 bytes [0.00 pkt/sec - 0.00 Mbit/sec] ========================= -Actual Stats: 0 pkts [1'000.32 ms][0.00 pkt/sec] ========================= -^CLeaving... ========================= -Absolute Stats: [0 pkts rcvd][0 pkts dropped] -Total Pkts=0/Dropped=0.0 % -0 pkts - 0 bytes [0.00 pkt/sec - 0.00 Mbit/sec] ========================= -Actual Stats: 0 pkts [629.37 ms][0.00 pkt/sec] ========================= - - cd /opt/PF_RING/userland/examples - -"" - -Suricata - -Go to directory of your choice and get Suricata: - - git clone git://phalanx.openinfosecfoundation.org/oisf.git - cd oisf/ - -Configure: - - ./autogen.sh - ./configure --enable-gccprotect --enable-profiling --enable-cuda --with-cuda- - includes=/usr/local/cuda/include \ - --with-cuda-libraries=/usr/local/cuda/lib64 --enable-pfring - -You should get at the end: -"" - - Suricata Configuration: - NFQueue support: no - IPFW support: no - PF_RING support: yes - Prelude support: no - Unit tests enabled: no - Debug output enabled: no - Debug validation enabled: no - CUDA enabled: yes - DAG enabled: no - Profiling enabled: yes - GCC Protect enabled: yes - GCC march native enabled: yes - GCC Profile enabled: no - Unified native time: no - Non-bundled htp: no - PCRE sljit: no - - -"" -Install: - - make && make install - ldconfig - -Verify: - - suricata --build-info - - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:622) (main) -- This is - Suricata version 1.1beta2 (rev b3f7e6a) - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:507) (SCPrintBuildInfo) - - - Features: PCAP_SET_BUFF LIBPCAP_VERSION_MAJOR=1 CUDA PF_RING LIBCAP_NG - LIBNET1.1 HAVE_HTP_URI_NORMALIZE_HOOK - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:521) (SCPrintBuildInfo) - - - 64-bits, Little-endian architecture - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:523) (SCPrintBuildInfo) - - - GCC version 4.4.5, C version 199901 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:529) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:532) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 - [1840] 13/8/2011 -- 
14:26:39 - (suricata.c:535) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:538) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:541) (SCPrintBuildInfo) - - - __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:545) (SCPrintBuildInfo) - - - compiled with -fstack-protector - [1840] 13/8/2011 -- 14:26:39 - (suricata.c:551) (SCPrintBuildInfo) - - - compiled with _FORTIFY_SOURCE=2 - -Run Suricata: - - suricata -c /etc/suricata/suricata.yaml\ - --pfring-int=eth0 --pfring-cluster-id=99 --pfring-cluster-type=cluster_flow - diff --git a/doc/Installation_with_CUDA_on_Scientific_Linux_6.txt b/doc/Installation_with_CUDA_on_Scientific_Linux_6.txt deleted file mode 100644 index 604ee8bcfc..0000000000 --- a/doc/Installation_with_CUDA_on_Scientific_Linux_6.txt +++ /dev/null @@ -1,95 +0,0 @@ -Autogenerated on 2012-11-29 -from - https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_on_Scientific_Linux_6 - - -Installation with CUDA on Scientific Linux 6 - -Hardware used: HP Proliant G7, 16 cores, 30 GB RAM, NVIDIA CUDA Quadro 4000 -graphics card -For setup you need to be root. Enter the following: - - mkdir /root/src - cd /root/src - - -Pre installation requirements - -Run the following command to ensure that you have everything you need for the -installation: - - yum install mpfr-2.4.1-6.el6.x86_64 cpp-4.4.4-13.el6.x86_64 ppl-0.10.2- - 11.el6.x86_64 \ - cloog-ppl-0.15.7-1.2.el6.x86_64 gcc-4.4.4-13.el6.x86_64 kernel-devel-2.6.32- - 131.2.1.el6.x86_64 \ - pcre-devel-7.8-3.1.el6.x86_64 libpcap-devel-1.0.0- - 6.20091201git117cb5.el6.x86_64 \ - yum-plugin-priorities-1.1.26-11.el6.noarch yum-conf-sl6x-1-1.noarch libyaml- - 0.1.3-1.el6.rf.x86_64 \ - libyaml-devel-0.1.3-1.el6.rf.x86_64 libnet-1.1.2.1-2.2.el6.rf.x86_64 flex- - 2.5.35-8.el6.x86_64 \ - bison-2.4.1-5.el6.x86_64 gcc-c++-4.4.4-13.el6.x86_64 - - -CUDA - -Download and install NVIDIA CUDA drivers: - - wget http://us.download.nvidia.com/XFree86/Linux-x86_64/270.41.19/NVIDIA- - Linux-x86_64-270.41.19.run - chmod +x NVIDIA-Linux-x86_64-270.41.19.run - ./NVIDIA-Linux-x86_64-270.41.19.run - -You also need to download and install the CUDA toolkit for RHEL6 : - - wget http://developer.download.nvidia.com/compute/cuda/4_0/toolkit/ - cudatoolkit_4.0.17_linux_64_rhel6.0.run - chmod +x cudatoolkit_4.0.17_linux_64_rhel6.0.run - ./cudatoolkit_4.0.17_linux_64_rhel6.0.run - -Make sure the kernel modules are loaded: - - /sbin/modprobe -r nouveau && /sbin/modprobe nvidia - -To ensure the proper NVIDIA CUDA modules get loaded on reboot, add the above -line to your /etc/rc.local file. 
- -Suricata - -Download and install Suricata: - - wget http://www.openinfosecfoundation.org/download/suricata-1.1beta2.tar.gz - -And unpack it: - - tar -xvzf suricata-1.1beta2.tar.gz - -Change to the unpacked directory: - - cd suricata-1.1beta2 - -Compile and install the engine with CUDA support: - - ./configure --enable-gccprotect --enable-profiling --enable-cuda \ - --with-cuda-includes=/usr/local/cuda/include --with-cuda-libraries=/usr/ - local/cuda/lib64/ - make - make install - - -Rules - -Read the information in Rule_Management_with_Oinkmaster -Add rules to suricata: - - cd /etc/suricata - wget https://rules.emergingthreatspro.com/open-nogpl/suricata/ - emerging.rules.tar.gz - tar -xvzf emerging.rules.tar.gz - -Make sure your .yaml file includes the /etc/suricata/rules/emerging-*.rules -files (they may need to be uncommented). -Run Suricata as followed: - - cd /etc/suricata - /usr/local/bin/suricata -c /etc/suricata/suricata.yaml -i eth0 - diff --git a/doc/Installation_with_CUDA_on_Ubuntu_server_1104.txt b/doc/Installation_with_CUDA_on_Ubuntu_server_1104.txt deleted file mode 100644 index 9c6c82fafa..0000000000 --- a/doc/Installation_with_CUDA_on_Ubuntu_server_1104.txt +++ /dev/null @@ -1,183 +0,0 @@ -Autogenerated on 2012-11-29 -from - https://redmine.openinfosecfoundation.org/projects/suricata/wiki/Installation_with_CUDA_on_Ubuntu_server_1104 - - -Installation with CUDA on Ubuntu server 11.04 - -THIS WOULD NOT WORK ON A VIRTUAL MACHINE! -This guide is written using: -Ubuntu Server 11.04 -Linux ubuntu 2.6.38-8-generic x86_64 GNU/Linux - -Pre installation requirements - - - apt-get update - apt-get upgrade - -Get the CUDA toolkit - - http://developer.nvidia.com/cuda-toolkit-40 - -Pick up the correct NVIDIA drivers for your card and system - - http://www.nvidia.com/Download/index.aspx?lang=en-us - -Go to your download directory -and chmod the 2 *.run files that you just downloaded. -Example: - - chmod 655 cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - chmod 655 NVIDIA-Linux-x86_64-280.13.run - - - sudo apt-get -y install libpcre3 libpcre3-dbg libpcre3-dev \ - build-essential autoconf automake libtool libpcap-dev libnet1-dev \ - libyaml-0-2 libyaml-dev zlib1g zlib1g-dev libcap-ng-dev libcap-ng0 \ - make flex bison git - -Run the cuda toolkit installation package: - - sudo ./cudatoolkit_4.0.17_linux_64_ubuntu10.10.run - -Close all windows and as you are logged in press: - - Ctr+Alt+F1 - -Log in with your credentials - - sudo -i - -And enter your password -Stop the x server: - - /etc/init.d/gdm stop - -Uninstall xserver video drivers: - - apt-get remove --purge xserver-xorg-video-nouveau - -Go to the directory where you downloaded nvidia/cuda drivers. - - Run the NVIDIA*******.run: - ./NVIDIA********.run - -Ok and yes your way out. -At some point it will ask you to make a special configuration file to disable a -"nouveau" -driver that the system is currently using and prevents the NVIDIA drivers to be -installed - say yes! -Reboot: - - shutdown -r now - -After reboot log in as you would normally through the GUI -Log in as you would normally. -Go to shell: - - Ctrl+Alt+F1 - -Type in your credentials and pass - - sudo -i - -Stop the xserver again: - - /etc/init.d/gdm stop - -Run the NVIDIA driver again. -This time it would finish and be successful.... -Reboot: - - shutdown -r now - -After start you would notice that the display has much better resolution - it -is a good thing. -Log in as you would normally. 
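Before moving on to the compiler setup below, it can help to confirm that the driver and the toolkit are actually usable. The check below is only a suggestion and assumes the toolkit was installed under its default /usr/local/cuda prefix; adjust the path if you picked a different location:

    # the nvidia kernel module should be loaded after the reboot
    lsmod | grep nvidia
    # the toolkit's nvcc compiler should report its release (4.0 for this toolkit)
    /usr/local/cuda/bin/nvcc --version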
-Because the 11.04 Ubuntu comes with gcc version 4.5 by default we need to -install gcc 4.4 since we must use 4.4 for the cuda compilation: - - apt-get install gcc-4.4 gcc-4.4-base g++-4.4 - -Then we switch and make ubuntu use the gcc 4.4 by default: - - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.5 40 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.5 - udo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 -- - slave /usr/bin/g++ g++ /usr/bin/g++-4.4 - -We make sure that this is the case: - - sudo update-alternatives --config gcc - -"" - - update-alternatives --config gcc (as root) - - -There are 2 choices for the alternative gcc (providing /usr/bin/gcc). - -* 0 /usr/bin/gcc-4.4 60 auto mode - 1 /usr/bin/gcc-4.4 60 manual mode - 2 /usr/bin/gcc-4.5 40 manual mode - - - Selection Path Priority Status - ------------------------------------------------------------ - -Press enter to keep the current choice[*], or type selection number: -"" - -Suricata - -Enter the following in your download directory: - - git clone git://phalanx.openinfosecfoundation.org/oisf.git - cd oisf/ - ./autogen.sh - ./configure --enable-gccprotect --enable-profiling --enable-cuda \ - --with-cuda-includes=/usr/local/cuda/include --with-cuda-libraries=/usr/ - local/cuda/lib64/ - -After that you should get the following result: -"" - - Suricata Configuration: - NFQueue support: no - IPFW support: no - PF_RING support: no - Prelude support: no - Unit tests enabled: no - Debug output enabled: no - Debug validation enabled: no - CUDA enabled: yes - DAG enabled: no - Profiling enabled: yes - GCC Protect enabled: yes - GCC march native enabled: yes - GCC Profile enabled: no - Unified native time: no - Non-bundled htp: no - PCRE sljit: no - "" - - - make && make install - ldconfig - -Proceed with Basic_Setup -After you start suricata , you should see cuda - - example : - "" - suricata -c suricata.yaml -i eth0 - [12406] 13/8/2011 -- 10:14:39 - (suricata.c:622) (main) -- This is - Suricata version 1.1beta2 (rev b3f7e6a) - [12406] 13/8/2011 -- 10:14:39 - (util-cpu.c:171) (UtilCpuPrintSummary) - -- CPUs/cores online: 8 - [12406] 13/8/2011 -- 10:14:39 - (util-cuda.c:4504) - (SCCudaPrintBasicDeviceInfo) -- GPU Device 1: GeForce 310M, 2 - Multiprocessors, 1468MHz, CUDA Compute Capability 1.2................... - ........................ 
-    ""
-
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 4589a9fe10..1e64e4c11d 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -18,12 +18,8 @@ Fedora_Core.txt \
 FreeBSD_8.txt \
 HTP_library_installation.txt \
 Installation_from_GIT_with_PF_RING_on_Ubuntu_server_1104.txt \
-Installation_with_CUDA_on_Ubuntu_server_1104.txt \
-Installation_with_CUDA_and_PFRING_on_Scientific_Linux_6.txt \
 Installation_with_PF_RING.txt \
-Installation_with_CUDA_and_PF_RING_on_Ubuntu_server_1104.txt \
 Installation_from_GIT_with_PCRE-JIT.txt \
-Installation_with_CUDA_on_Scientific_Linux_6.txt \
 Mac_OS_X_106x.txt \
 OpenBSD_Installation_from_GIT.txt \
 Setting_up_IPSinline_for_Linux.txt \
diff --git a/doc/userguide/configuration/suricata-yaml.rst b/doc/userguide/configuration/suricata-yaml.rst
index c7ecabb682..e79c3576ca 100644
--- a/doc/userguide/configuration/suricata-yaml.rst
+++ b/doc/userguide/configuration/suricata-yaml.rst
@@ -787,69 +787,6 @@ To let Suricata make these decisions set default to 'auto':

     default: auto

-CUDA (Compute United Device Architecture)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Suricata utilizes CUDA for offloading CPU intensive tasks to the
-(NVIDIA) GPU (graphics processing unit). Suricata supports an
-experimental multi-pattern-matcher using CUDA. Only if you have
-compiled Suricata with CUDA (by entering --enable-cuda in the
-configure stage) you can make use of these features. There are
-several options for CUDA. The option 'packet_buffer_limit' designates
-how many packets will be send to the GPU at the same time. Suricata
-sends packets in 'batches', meaning it sends multiple packets at
-once. As soon as Suricata has collected the amount of packets set in
-the 'packet_buffer_limit' option, it sends them to the GPU. The
-default amount of packets is 2400.
-
-The option 'packet_size_limit' makes sure that packets with payloads
-bigger than a certain amount of bytes will not be send to the
-GPU. Other packets will be send to the GPU. The default setting is
-1500 bytes.
-
-The option 'packet_buffers' designates the amount of buffers that will
-be filled with packets and will be processed. Buffers contain the
-batches of packets. During the time these filled buffers are being
-processed, new buffers will be filled.
-
-The option 'batching_timeout' can have all values higher than 0. If a
-buffers is not fully filled after a period of time (set in this option
-'batching_timeout'), the buffer will be send to the GPU anyway.
-
-The option 'page_locked' designates whether the page locked memory
-will or will not be used. The advantage of page locked memory is that
-it can not be swapped out to disk. You would not want your computer to
-use your hard disk for Suricata, because it lowers the performance a
-lot. In this option you can set whether you still want this for CUDA
-or not.
-
-The option 'device_id' is an option within CUDA to determine which GPU
-should be turned to account.(If there is only one GPU present at your
-computer, there is no benefit making use of the 'device-id' option.)
-To detect the id of your GPU's, enter the following in your command
-line:
-
-::
-
-  suricata --list-cuda-cards
-
-With the option 'cuda_streams' you can determine how many cuda-streams
-should be used for asynchronous processing. All values > 0 are
-valid. For this option you need a device with Compute Capability > 1.0
-and page_locked enabled to have any effect.
- -:: - - cuda: - -mpm: - packet_buffer_limit: 2400 - packet_size_limit: 1500 - packet_buffers: 10 - batching_timeout: 1 - page_locked: enabled - device_id: 0 - cuda_streams: 2 - Pattern matcher settings ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/Makefile.am b/src/Makefile.am index eab7b1c51c..cbbde7973a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -51,7 +51,6 @@ app-layer-tls-handshake.c app-layer-tls-handshake.h \ conf.c conf.h \ conf-yaml-loader.c conf-yaml-loader.h \ counters.c counters.h \ -data-queue.c data-queue.h \ decode.c decode.h \ decode-afl.c \ decode-erspan.c decode-erspan.h \ @@ -384,10 +383,6 @@ util-conf.c util-conf.h \ util-coredump-config.c util-coredump-config.h \ util-cpu.c util-cpu.h \ util-crypt.c util-crypt.h \ -util-cuda.c util-cuda.h \ -util-cuda-buffer.c util-cuda-buffer.h \ -util-cuda-handlers.c util-cuda-handlers.h \ -util-cuda-vars.c util-cuda-vars.h \ util-daemon.c util-daemon.h \ util-debug.c util-debug.h \ util-debug-filters.c util-debug-filters.h \ @@ -482,7 +477,7 @@ win32-misc.c win32-misc.h \ win32-service.c win32-service.h \ win32-syslog.h -EXTRA_DIST = util-mpm-ac-cuda-kernel.cu ptxdump.py tests +EXTRA_DIST = tests # set the include path found by configure AM_CPPFLAGS = $(all_includes) @@ -495,58 +490,6 @@ if HAVE_RUST suricata_DEPENDENCIES = $(RUST_SURICATA_LIB) endif -# Rules to build CUDA ptx modules -if BUILD_CUDA -BUILT_SOURCES = cuda-ptxdump.h - -suricata_CUDA_KERNELS = \ -util-mpm-ac-cuda-kernel.cu - -NVCCFLAGS=-O2 - -PTXS = $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_30) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_32) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_35) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_37) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_50) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_52) -PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_53) - -.cu.ptx_sm_20: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_20 -ptx $< - -.cu.ptx_sm_21: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $< - -.cu.ptx_sm_30: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_30 -ptx $< - -.cu.ptx_sm_32: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_32 -ptx $< - -.cu.ptx_sm_35: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_35 -ptx $< - -.cu.ptx_sm_37: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_37 -ptx $< - -.cu.ptx_sm_50: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_50 -ptx $< - -.cu.ptx_sm_52: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_52 -ptx $< - -.cu.ptx_sm_53: - $(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_53 -ptx $< - -cuda-ptxdump.h: $(PTXS) - $(PYTHON) ptxdump.py cuda-ptxdump $(PTXS) - -CLEANFILES = $(PTXS) cuda-ptxdump.h -endif - # default CFLAGS AM_CFLAGS = ${OPTIMIZATION_CFLAGS} ${GCC_CFLAGS} ${CLANG_CFLAGS} \ ${SECCFLAGS} ${PCAP_CFLAGS} -DLOCAL_STATE_DIR=\"$(localstatedir)\" \ diff --git a/src/app-layer-detect-proto.c b/src/app-layer-detect-proto.c index 7b0f66ff67..92d8014c9b 100644 --- a/src/app-layer-detect-proto.c +++ b/src/app-layer-detect-proto.c @@ -60,7 +60,6 @@ #include "conf.h" #include "util-memcmp.h" #include "util-spm.h" -#include "util-cuda.h" #include "util-debug.h" #include "runmodes.h" @@ -1591,12 +1590,6 @@ int AppLayerProtoDetectSetup(void) uint16_t spm_matcher = SinglePatternMatchDefaultMatcher(); uint16_t mpm_matcher = PatternMatchDefaultMatcher(); -#ifdef __SC_CUDA_SUPPORT__ - /* CUDA won't work here, so fall back to AC */ - if (mpm_matcher == MPM_AC_CUDA) - mpm_matcher = mpm_default_matcher; -#endif - alpd_ctx.spm_global_thread_ctx = SpmInitGlobalThreadCtx(spm_matcher); if 
(alpd_ctx.spm_global_thread_ctx == NULL) { SCLogError(SC_ERR_FATAL, "Unable to alloc SpmGlobalThreadCtx."); diff --git a/src/data-queue.c b/src/data-queue.c deleted file mode 100644 index a3afd4acc9..0000000000 --- a/src/data-queue.c +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (c) 2009, 2010 Open Information Security Foundation. - * - * \author Anoop Saldanha - */ - -#include "suricata-common.h" -#include "data-queue.h" -#include "threads.h" - -/** - * \brief Enqueues data on the queue. - * - * \param q Pointer to the data queue. - * \param data Pointer to the data to be queued. It should be a pointer to a - * structure instance that implements the template structure - * struct SCDQGenericQData_ defined in data-queue.h. - */ -void SCDQDataEnqueue(SCDQDataQueue *q, SCDQGenericQData *data) -{ - /* we already have some data in queue */ - if (q->top != NULL) { - data->next = q->top; - q->top->prev = data; - q->top = data; - - /* the queue is empty */ - } else { - q->top = data; - q->bot = data; - } - - q->len++; - -#ifdef DBG_PERF - if (q->len > q->dbg_maxlen) - q->dbg_maxlen = q->len; -#endif /* DBG_PERF */ - - return; -} - -/** - * \brief Dequeues and returns an entry from the queue. - * - * \param q Pointer to the data queue. - * \param retval Pointer to the data that has been enqueued. The instance - * returned is/should be a pointer to a structure instance that - * implements the template structure struct SCDQGenericQData_ - * defined in data-queue.h. - */ -SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *q) -{ - SCDQGenericQData *data = NULL; - - /* if the queue is empty there are is no data left and we return NULL */ - if (q->len == 0) { - return NULL; - } - - /* If we are going to get the last packet, set len to 0 - * before doing anything else (to make the threads to follow - * the SCondWait as soon as possible) */ - q->len--; - - /* pull the bottom packet from the queue */ - data = q->bot; - -#ifdef OS_DARWIN - /* Weird issue in OS_DARWIN - * Sometimes it looks that two thread arrive here at the same time - * so the bot ptr is NULL */ - if (data == NULL) { - printf("No data to dequeue!\n"); - return NULL; - } -#endif /* OS_DARWIN */ - - /* more data in queue */ - if (q->bot->prev != NULL) { - q->bot = q->bot->prev; - q->bot->next = NULL; - /* just the one we remove, so now empty */ - } else { - q->top = NULL; - q->bot = NULL; - } - - data->next = NULL; - data->prev = NULL; - - return data; -} diff --git a/src/data-queue.h b/src/data-queue.h deleted file mode 100644 index f1f6bb3839..0000000000 --- a/src/data-queue.h +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Copyright (c) 2009, 2010 Open Information Security Foundation. - * - * \author Anoop Saldanha - * - * \file Generic queues. Any instance that wants to get itself on the generic - * queue, would have to implement the template struct SCDQGenericQData_ - * defined below. - */ - -#ifndef __DATA_QUEUE_H__ -#define __DATA_QUEUE_H__ - -#include "threads.h" - -/** - * \brief Generic template for any data structure that wants to be on the - * queue. Any other data structure that wants to be on the queue - * needs to use this template and define its own members from - * onwards. - */ -typedef struct SCDQGenericQData_ { - /* this is needed when we want to supply a list of data items */ - struct SCDQGenericQData_ *next; - struct SCDQGenericQData_ *prev; - /* if we want to consider this pointer as the head of a list, this var - * holds the no of elements in the list. Else it holds a . 
*/ - //uint16_t len; - /* in case this data instance is the head of a list, we can refer the - * bottomost instance directly using this var */ - //struct SCDQGenericaQData *bot; - - - /* any other data structure that wants to be on the queue can implement - * its own memebers from here on, in its structure definition. Just note - * that the first 2 members should always be next and prev in the same - * order */ - // -} SCDQGenericQData; - -/** - * \brief The data queue to hold instances that implement the template - * SCDQGenericQData. - */ -typedef struct SCDQDataQueue_ { - /* holds the item at the top of the queue */ - SCDQGenericQData *top; - /* holds the item at the bottom of the queue */ - SCDQGenericQData *bot; - /* no of items currently in the queue */ - uint16_t len; -#ifdef DBG_PERF - uint16_t dbg_maxlen; -#endif /* DBG_PERF */ - - SCMutex mutex_q; - SCCondT cond_q; - -} __attribute__((aligned(CLS))) SCDQDataQueue; - -void SCDQDataEnqueue(SCDQDataQueue *, SCDQGenericQData *); -SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *); - -#endif /* __DATA_QUEUE_H__ */ diff --git a/src/decode.c b/src/decode.c index 30dd8cee3a..e9e2f38be6 100644 --- a/src/decode.c +++ b/src/decode.c @@ -119,11 +119,6 @@ void PacketDecodeFinalize(ThreadVars *tv, DecodeThreadVars *dtv, Packet *p) } } } -#ifdef __SC_CUDA_SUPPORT__ - if (dtv->cuda_vars.mpm_is_cuda) - CudaBufferPacket(&dtv->cuda_vars, p); -#endif - } /** diff --git a/src/decode.h b/src/decode.h index 76f71eca50..0f0c6020c0 100644 --- a/src/decode.h +++ b/src/decode.h @@ -32,11 +32,6 @@ #include "decode-events.h" #include "flow-worker.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-buffer.h" -#include "util-cuda-vars.h" -#endif /* __SC_CUDA_SUPPORT__ */ - #ifdef HAVE_NAPATECH #include "util-napatech.h" #endif /* HAVE_NAPATECH */ @@ -596,9 +591,6 @@ typedef struct Packet_ #ifdef PROFILING PktProfiling *profile; #endif -#ifdef __SC_CUDA_SUPPORT__ - CudaPacketVars cuda_pkt_vars; -#endif #ifdef HAVE_NAPATECH NapatechPacketVars ntpv; #endif @@ -692,9 +684,6 @@ typedef struct DecodeThreadVars_ * flow recycle during lookups */ void *output_flow_thread_data; -#ifdef __SC_CUDA_SUPPORT__ - CudaThreadVars cuda_vars; -#endif } DecodeThreadVars; typedef struct CaptureStats_ { @@ -734,25 +723,11 @@ void CaptureStatsSetup(ThreadVars *tv, CaptureStats *s); /** * \brief Initialize a packet structure for use. */ -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-handlers.h" -#include "util-mpm.h" - -#define PACKET_INITIALIZE(p) do { \ - memset((p), 0x00, SIZE_OF_PACKET); \ - SCMutexInit(&(p)->tunnel_mutex, NULL); \ - PACKET_RESET_CHECKSUMS((p)); \ - (p)->livedev = NULL; \ - SCMutexInit(&(p)->cuda_pkt_vars.cuda_mutex, NULL); \ - SCCondInit(&(p)->cuda_pkt_vars.cuda_cond, NULL); \ - } while (0) -#else #define PACKET_INITIALIZE(p) { \ SCMutexInit(&(p)->tunnel_mutex, NULL); \ PACKET_RESET_CHECKSUMS((p)); \ (p)->livedev = NULL; \ } -#endif #define PACKET_RELEASE_REFS(p) do { \ FlowDeReference(&((p)->flow)); \ diff --git a/src/detect-engine-build.c b/src/detect-engine-build.c index d3e7bc40aa..31e0545f6f 100644 --- a/src/detect-engine-build.c +++ b/src/detect-engine-build.c @@ -1998,39 +1998,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) exit(EXIT_FAILURE); } -#ifdef __SC_CUDA_SUPPORT__ - if (de_ctx->sgh_mpm_context == ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE) { - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) { - /* setting it to default. 
You've gotta remove it once you fix the state table thing */ - SCACConstructBoth16and32StateTables(); - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_FATAL, "cuda context is NULL."); - exit(EXIT_FAILURE); - } - int r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_FATAL, "context push failed."); - exit(EXIT_FAILURE); - } - } - - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) { - int r = SCCudaCtxPopCurrent(NULL); - if (r < 0) { - SCLogError(SC_ERR_FATAL, "cuda context pop failure."); - exit(EXIT_FAILURE); - } - } - - /* too late to call this either ways. Should be called post ac goto. - * \todo Support this. */ - DetermineCudaStateTableSize(de_ctx); - } -#endif - int r = DetectMpmPrepareBuiltinMpms(de_ctx); r |= DetectMpmPrepareAppMpms(de_ctx); if (r != 0) { diff --git a/src/detect-engine-payload.c b/src/detect-engine-payload.c index a4e8f62eb3..313c897544 100644 --- a/src/detect-engine-payload.c +++ b/src/detect-engine-payload.c @@ -117,19 +117,9 @@ static void PrefilterPktPayload(DetectEngineThreadCtx *det_ctx, if (p->payload_len < mpm_ctx->minlen) SCReturn; -#ifdef __SC_CUDA_SUPPORT__ - if (p->cuda_pkt_vars.cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) { - (void)SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq); - } else { - (void)mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, - &det_ctx->mtc, &det_ctx->pmq, - p->payload, p->payload_len); - } -#else (void)mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx, &det_ctx->mtc, &det_ctx->pmq, p->payload, p->payload_len); -#endif } int PrefilterPktPayloadRegister(SigGroupHead *sgh, MpmCtx *mpm_ctx) diff --git a/src/detect-engine-register.c b/src/detect-engine-register.c index 776ebe6fd7..788745a334 100644 --- a/src/detect-engine-register.c +++ b/src/detect-engine-register.c @@ -219,7 +219,6 @@ #include "util-unittest-helper.h" #include "util-debug.h" #include "util-hashlist.h" -#include "util-cuda.h" #include "util-privs.h" #include "util-profiling.h" #include "util-validate.h" diff --git a/src/detect-engine.c b/src/detect-engine.c index dad0c8e85a..652329ea7d 100644 --- a/src/detect-engine.c +++ b/src/detect-engine.c @@ -1263,11 +1263,7 @@ static int DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) #ifdef BUILD_HYPERSCAN de_ctx->mpm_matcher == MPM_HS || #endif -#ifdef __SC_CUDA_SUPPORT__ - de_ctx->mpm_matcher == MPM_AC_BS || de_ctx->mpm_matcher == MPM_AC_CUDA) { -#else de_ctx->mpm_matcher == MPM_AC_BS) { -#endif de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE; } else { de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL; @@ -1276,15 +1272,6 @@ static int DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) if (strcmp(sgh_mpm_context, "single") == 0) { de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE; } else if (strcmp(sgh_mpm_context, "full") == 0) { -#ifdef __SC_CUDA_SUPPORT__ - if (de_ctx->mpm_matcher == MPM_AC_CUDA) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You can't use " - "the cuda version of our mpm ac, i.e. \"ac-cuda\" " - "along with \"full\" \"sgh-mpm-context\". 
" - "Allowed values are \"single\" and \"auto\"."); - exit(EXIT_FAILURE); - } -#endif de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL; } else { SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You have supplied an " diff --git a/src/detect.c b/src/detect.c index e11ab0ad14..5a86e04c9a 100644 --- a/src/detect.c +++ b/src/detect.c @@ -902,10 +902,6 @@ next: PACKET_PROFILING_DETECT_END(p, PROF_DETECT_RULES); end: -#ifdef __SC_CUDA_SUPPORT__ - CudaReleasePacket(p); -#endif - /* see if we need to increment the inspect_id and reset the de_state */ if (has_state && AppLayerParserProtocolSupportsTxs(p->proto, alproto)) { PACKET_PROFILING_DETECT_START(p, PROF_DETECT_STATEFUL_UPDATE); diff --git a/src/ptxdump.py b/src/ptxdump.py deleted file mode 100644 index 097e517334..0000000000 --- a/src/ptxdump.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -from string import * -import os, getopt, sys, platform - -header = '''/* Auto-generated by ptxdump.py DO NOT EDIT -* -* This file contains the ptx code of the Cuda kernels. -* A kernel is identified by its name and the compute capability (e.g. _sm_10). -*/ -''' - -def FormatCharHex(d): - s = hex(ord(d)) - if len(s) == 3: - s = "0x0" + s[2] - return s - -def CleanFileName(f): - v = f.replace("-","_") - v = v.replace(".ptx","") - return v - -if not(len(sys.argv[1:]) >= 2): - print("Usage: ptx2c.py ") - print("Description: creates a header file containing the ptx files as character array" + os.linesep) - sys.exit(0) - -out_h = sys.argv[1] + ".h" -out = open(out_h, 'w') - -out.writelines(header) -out.writelines("#ifdef __SC_CUDA_SUPPORT__\n") -out.writelines("#ifndef __ptxdump_h__\n") -out.writelines("#define __ptxdump_h__\n\n") - -# write char arrays -for file in sys.argv[2:]: - in_ptx = open(file, 'r') - source = in_ptx.read() - source_len = len(source) - - varname = CleanFileName(file) - - out.writelines("const unsigned char " + varname + "[" + str(source_len+1) + "] = {\n") - newlinecnt = 0 - for i in range(0, source_len): - out.write(FormatCharHex(source[i]) + ", ") - newlinecnt += 1 - if newlinecnt == 16: - newlinecnt = 0 - out.write("\n") - out.write("0x00\n};\n\n") - - print(sys.argv[0] + ": CUmodule " + varname + " packed successfully") - -# write retrieval function -out.writelines("const unsigned char* SCCudaPtxDumpGetModule(const char* module){\n"); -for file in sys.argv[2:]: - out.writelines('\tif (!strcmp(module, "' + file.replace(".ptx","")+'"))\n') - out.writelines("\t\treturn " + CleanFileName(file)+";\n") -out.writelines('\tSCLogError(SC_ERR_FATAL, "Error in SCCudaPtxDumpGetModule, module %s not found. 
Exiting...",module);\n') -out.writelines("\texit(EXIT_FAILURE);\n") -out.writelines("};\n") - -out.writelines("#endif /* __ptxdump_h__ */\n") -out.writelines("#endif /* __SC_CUDA_SUPPORT__ */\n") - -print(sys.argv[0] + ": " + out_h + " written successfully") - -in_ptx.close() -out.close() diff --git a/src/runmode-unittests.c b/src/runmode-unittests.c index f89a9439ed..848e88d8d8 100644 --- a/src/runmode-unittests.c +++ b/src/runmode-unittests.c @@ -185,9 +185,6 @@ static void RegisterUnittests(void) SCClassConfRegisterTests(); SCThresholdConfRegisterTests(); SCRConfRegisterTests(); -#ifdef __SC_CUDA_SUPPORT__ - SCCudaRegisterTests(); -#endif PayloadRegisterTests(); DcePayloadRegisterTests(); UriRegisterTests(); @@ -218,9 +215,6 @@ static void RegisterUnittests(void) DetectPortTests(); SCAtomicRegisterTests(); MemrchrRegisterTests(); -#ifdef __SC_CUDA_SUPPORT__ - CudaBufferRegisterUnittests(); -#endif AppLayerUnittestsRegister(); MimeDecRegisterTests(); StreamingBufferRegisterTests(); @@ -251,9 +245,6 @@ void RunUnittests(int list_unittests, const char *regex_arg) default_packet_size = DEFAULT_PACKET_SIZE; /* load the pattern matchers */ MpmTableSetup(); -#ifdef __SC_CUDA_SUPPORT__ - MpmCudaEnvironmentSetup(); -#endif SpmTableSetup(); AppLayerSetup(); @@ -312,11 +303,6 @@ void RunUnittests(int list_unittests, const char *regex_arg) UtCleanup(); #ifdef BUILD_HYPERSCAN MpmHSGlobalCleanup(); -#endif -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - MpmCudaBufferDeSetup(); - CudaHandlerFreeProfiles(); #endif if (failed) { exit(EXIT_FAILURE); diff --git a/src/runmodes.c b/src/runmodes.c index 139a5a35d2..79e5f4d7e0 100644 --- a/src/runmodes.c +++ b/src/runmodes.c @@ -53,11 +53,6 @@ #include "flow-manager.h" #include "counters.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#endif - int debuglog_enabled = 0; /* Runmode Global Thread Names */ @@ -349,15 +344,6 @@ void RunModeDispatch(int runmode, const char *custom_mode) } } -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA && - strcasecmp(custom_mode, "autofp") != 0) { - SCLogError(SC_ERR_RUNMODE, "When using a cuda mpm, the only runmode we " - "support is autofp."); - exit(EXIT_FAILURE); - } -#endif - RunMode *mode = RunModeGetCustomMode(runmode, custom_mode); if (mode == NULL) { SCLogError(SC_ERR_RUNMODE, "The custom type \"%s\" doesn't exist " @@ -386,11 +372,6 @@ void RunModeDispatch(int runmode, const char *custom_mode) if (local_custom_mode != NULL) SCFree(local_custom_mode); -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - SCACCudaStartDispatcher(); -#endif - /* Check if the alloted queues have at least 1 reader and writer */ TmValidateQueueState(); diff --git a/src/runmodes.h b/src/runmodes.h index da0091e411..0329c638b2 100644 --- a/src/runmodes.h +++ b/src/runmodes.h @@ -43,7 +43,6 @@ enum RunModes { RUNMODE_USER_MAX, /* Last standard running mode */ RUNMODE_LIST_KEYWORDS, RUNMODE_LIST_APP_LAYERS, - RUNMODE_LIST_CUDA_CARDS, RUNMODE_LIST_RUNMODES, RUNMODE_PRINT_VERSION, RUNMODE_PRINT_BUILDINFO, diff --git a/src/source-af-packet.c b/src/source-af-packet.c index 3c1d000eb7..11a872ae4f 100644 --- a/src/source-af-packet.c +++ b/src/source-af-packet.c @@ -56,18 +56,6 @@ #include "source-af-packet.h" #include "runmodes.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include 
"detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - #ifdef HAVE_AF_PACKET #if HAVE_SYS_IOCTL_H @@ -2351,11 +2339,6 @@ TmEcode DecodeAFPThreadInit(ThreadVars *tv, const void *initdata, void **data) *data = (void *)dtv; -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - SCReturnInt(TM_ECODE_OK); } diff --git a/src/source-netmap.c b/src/source-netmap.c index cf0b443183..9521e7bede 100644 --- a/src/source-netmap.c +++ b/src/source-netmap.c @@ -54,18 +54,6 @@ #include "source-netmap.h" #include "runmodes.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - #ifdef HAVE_NETMAP #if HAVE_SYS_IOCTL_H @@ -990,11 +978,6 @@ static TmEcode DecodeNetmapThreadInit(ThreadVars *tv, const void *initdata, void *data = (void *)dtv; -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - SCReturnInt(TM_ECODE_OK); } diff --git a/src/source-pcap-file-helper.c b/src/source-pcap-file-helper.c index 40b753f52e..6a5c0bffed 100644 --- a/src/source-pcap-file-helper.c +++ b/src/source-pcap-file-helper.c @@ -27,18 +27,6 @@ #include "util-checksum.h" #include "util-profiling.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - extern int max_pending_packets; extern PcapFileGlobalVars pcap_g; diff --git a/src/source-pcap-file.c b/src/source-pcap-file.c index 1c408829ec..30c5483bde 100644 --- a/src/source-pcap-file.c +++ b/src/source-pcap-file.c @@ -30,18 +30,6 @@ #include "flow-manager.h" #include "util-checksum.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - extern int max_pending_packets; PcapFileGlobalVars pcap_g; @@ -424,11 +412,6 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, const void *initdata, void **da DecodeRegisterPerfCounters(dtv, tv); -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - *data = (void *)dtv; SCReturnInt(TM_ECODE_OK); diff --git a/src/source-pcap.c b/src/source-pcap.c index 84e9456735..ba84988879 100644 --- a/src/source-pcap.c +++ b/src/source-pcap.c @@ -42,18 +42,6 @@ #include "util-ioctl.h" #include "tmqh-packetpool.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-cuda.h" -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#include "util-cuda-handlers.h" -#include "detect-engine.h" -#include "detect-engine-mpm.h" -#include "util-cuda-vars.h" - -#endif /* __SC_CUDA_SUPPORT__ */ - #define PCAP_STATE_DOWN 0 #define PCAP_STATE_UP 1 @@ -602,11 +590,6 @@ TmEcode DecodePcapThreadInit(ThreadVars *tv, const void *initdata, void **data) DecodeRegisterPerfCounters(dtv, tv); -#ifdef __SC_CUDA_SUPPORT__ - if (CudaThreadVarsInit(&dtv->cuda_vars) < 0) - SCReturnInt(TM_ECODE_FAILED); -#endif - *data = (void *)dtv; SCReturnInt(TM_ECODE_OK); diff --git a/src/suricata.c b/src/suricata.c index a9b7fc49a3..120b5fc2a4 100644 
--- a/src/suricata.c +++ b/src/suricata.c @@ -149,7 +149,6 @@ #include "runmodes.h" #include "runmode-unittests.h" -#include "util-cuda.h" #include "util-decode-asn1.h" #include "util-debug.h" #include "util-error.h" @@ -163,10 +162,6 @@ #include "tmqh-packetpool.h" #include "util-proto-name.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-buffer.h" -#include "util-mpm-ac.h" -#endif #include "util-mpm-hs.h" #include "util-storage.h" #include "host-storage.h" @@ -321,30 +316,9 @@ uint8_t print_mem_flag = 1; #endif #endif -static void CreateLowercaseTable(void) -{ - /* create table for O(1) lowercase conversion lookup. It was removed, but - * we still need it for cuda. So resintalling it back into the codebase */ - int c = 0; - memset(g_u8_lowercasetable, 0x00, sizeof(g_u8_lowercasetable)); - for ( ; c < 256; c++) { - if (c >= 'A' && c <= 'Z') - g_u8_lowercasetable[c] = (c + ('a' - 'A')); - else - g_u8_lowercasetable[c] = c; - } -} - void GlobalsInitPreConfig(void) { -#ifdef __SC_CUDA_SUPPORT__ - /* Init the CUDA environment */ - SCCudaInitCudaEnvironment(); - CudaBufferInit(); -#endif - memset(trans_q, 0, sizeof(trans_q)); - memset(data_queues, 0, sizeof(data_queues)); /* Initialize the trans_q mutex */ int blah; @@ -352,9 +326,6 @@ void GlobalsInitPreConfig(void) for(blah=0;blah<256;blah++) { r |= SCMutexInit(&trans_q[blah].mutex_q, NULL); r |= SCCondInit(&trans_q[blah].cond_q, NULL); - - r |= SCMutexInit(&data_queues[blah].mutex_q, NULL); - r |= SCCondInit(&data_queues[blah].cond_q, NULL); } if (r != 0) { @@ -362,8 +333,6 @@ void GlobalsInitPreConfig(void) exit(EXIT_FAILURE); } - CreateLowercaseTable(); - TimeInit(); SupportFastPatternForSigMatchTypes(); } @@ -424,11 +393,6 @@ static void GlobalsDestroy(SCInstance *suri) MpmHSGlobalCleanup(); #endif -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - MpmCudaBufferDeSetup(); - CudaHandlerFreeProfiles(); -#endif ConfDeInit(); #ifdef HAVE_LUAJIT LuajitFreeStatesPool(); @@ -637,9 +601,6 @@ static void PrintUsage(const char *progname) #endif /* UNITTESTS */ printf("\t--list-app-layer-protos : list supported app layer protocols\n"); printf("\t--list-keywords[=all|csv|] : list keywords implemented by the engine\n"); -#ifdef __SC_CUDA_SUPPORT__ - printf("\t--list-cuda-cards : list cuda supported cards\n"); -#endif printf("\t--list-runmodes : list supported runmodes\n"); printf("\t--runmode : specific runmode modification the engine should run. 
The argument\n" "\t supplied should be the id for the runmode obtained by running\n" @@ -728,9 +689,6 @@ static void PrintBuildInfo(void) #ifdef HAVE_PCAP_SET_BUFF strlcat(features, "PCAP_SET_BUFF ", sizeof(features)); #endif -#ifdef __SC_CUDA_SUPPORT__ - strlcat(features, "CUDA ", sizeof(features)); -#endif #ifdef HAVE_PFRING strlcat(features, "PF_RING ", sizeof(features)); #endif @@ -1469,7 +1427,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) int dump_config = 0; int list_app_layer_protocols = 0; int list_unittests = 0; - int list_cuda_cards = 0; int list_runmodes = 0; int list_keywords = 0; int build_info = 0; @@ -1549,7 +1506,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) {"unittest-filter", required_argument, 0, 'U'}, {"list-app-layer-protos", 0, &list_app_layer_protocols, 1}, {"list-unittests", 0, &list_unittests, 1}, - {"list-cuda-cards", 0, &list_cuda_cards, 1}, {"list-runmodes", 0, &list_runmodes, 1}, {"list-keywords", optional_argument, &list_keywords, 1}, {"runmode", required_argument, NULL, 0}, @@ -1719,12 +1675,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) #else fprintf(stderr, "ERROR: Unit tests not enabled. Make sure to pass --enable-unittests to configure when building.\n"); return TM_ECODE_FAILED; -#endif /* UNITTESTS */ - } else if(strcmp((long_opts[option_index]).name, "list-cuda-cards") == 0) { -#ifndef __SC_CUDA_SUPPORT__ - fprintf(stderr, "ERROR: Cuda not enabled. Make sure to pass " - "--enable-cuda to configure when building.\n"); - return TM_ECODE_FAILED; #endif /* UNITTESTS */ } else if (strcmp((long_opts[option_index]).name, "list-runmodes") == 0) { suri->run_mode = RUNMODE_LIST_RUNMODES; @@ -2107,8 +2057,6 @@ static TmEcode ParseCommandLine(int argc, char** argv, SCInstance *suri) if (list_app_layer_protocols) suri->run_mode = RUNMODE_LIST_APP_LAYERS; - if (list_cuda_cards) - suri->run_mode = RUNMODE_LIST_CUDA_CARDS; if (list_keywords) suri->run_mode = RUNMODE_LIST_KEYWORDS; if (list_unittests) @@ -2343,10 +2291,6 @@ static int StartInternalRunMode(SCInstance *suri, int argc, char **argv) case RUNMODE_PRINT_USAGE: PrintUsage(argv[0]); return TM_ECODE_DONE; -#ifdef __SC_CUDA_SUPPORT__ - case RUNMODE_LIST_CUDA_CARDS: - return ListCudaCards(); -#endif case RUNMODE_LIST_RUNMODES: RunModeListRunmodes(); return TM_ECODE_DONE; @@ -2554,11 +2498,6 @@ static void PostConfLoadedDetectSetup(SCInstance *suri) exit(EXIT_FAILURE); } -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) - CudaVarsSetDeCtx(de_ctx); -#endif /* __SC_CUDA_SUPPORT__ */ - if (!de_ctx->minimal) { if (LoadSignatures(de_ctx, suri) != TM_ECODE_OK) exit(EXIT_FAILURE); @@ -2593,9 +2532,6 @@ static int PostConfLoadedSetup(SCInstance *suri) /* load the pattern matchers */ MpmTableSetup(); -#ifdef __SC_CUDA_SUPPORT__ - MpmCudaEnvironmentSetup(); -#endif SpmTableSetup(); int disable_offloading; diff --git a/src/suricata.h b/src/suricata.h index 65b1df4638..4a5ffdafc8 100644 --- a/src/suricata.h +++ b/src/suricata.h @@ -66,7 +66,6 @@ #include "suricata-common.h" #include "packet-queue.h" -#include "data-queue.h" /* the name of our binary */ #define PROG_NAME "Suricata" @@ -131,8 +130,6 @@ enum { */ PacketQueue trans_q[256]; -SCDQDataQueue data_queues[256]; - typedef struct SCInstance_ { enum RunModes run_mode; @@ -173,16 +170,6 @@ void GlobalsInitPreConfig(void); extern volatile uint8_t suricata_ctl_flags; extern int g_disable_randomness; -/* uppercase to lowercase conversion lookup table */ 
-uint8_t g_u8_lowercasetable[256]; - -/* marco to do the actual lookup */ -//#define u8_tolower(c) g_u8_lowercasetable[(c)] -// these 2 are slower: -//#define u8_tolower(c) ((c) >= 'A' && (c) <= 'Z') ? g_u8_lowercasetable[(c)] : (c) -//#define u8_tolower(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) + ('a' - 'A')) : (c)) - -/* this is faster than the table lookup */ #include #define u8_tolower(c) tolower((uint8_t)(c)) diff --git a/src/tm-queues.c b/src/tm-queues.c index bb8c045476..f4102d8f21 100644 --- a/src/tm-queues.c +++ b/src/tm-queues.c @@ -44,8 +44,6 @@ Tmq *TmqCreateQueue(const char *name) goto error; q->id = tmq_id++; - /* for cuda purposes */ - q->q_type = 0; SCLogDebug("created queue \'%s\', %p", name, q); return q; diff --git a/src/tm-queues.h b/src/tm-queues.h index 32e1e5203c..502ef2cd56 100644 --- a/src/tm-queues.h +++ b/src/tm-queues.h @@ -29,8 +29,6 @@ typedef struct Tmq_ { uint16_t id; uint16_t reader_cnt; uint16_t writer_cnt; - /* 0 for packet-queue and 1 for data-queue */ - uint8_t q_type; } Tmq; Tmq* TmqCreateQueue(const char *name); diff --git a/src/tm-threads.c b/src/tm-threads.c index b71cc99fde..25fef956a8 100644 --- a/src/tm-threads.c +++ b/src/tm-threads.c @@ -1487,10 +1487,7 @@ static int TmThreadKillThread(ThreadVars *tv) } if (tv->inq != NULL) { for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { - if (tv->inq->q_type == 0) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - else - SCCondSignal(&data_queues[tv->inq->id].cond_q); + SCCondSignal(&trans_q[tv->inq->id].cond_q); } SCLogDebug("signalled tv->inq->id %" PRIu32 "", tv->inq->id); } @@ -1641,10 +1638,7 @@ again: if (tv->inq != NULL) { int i; for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { - if (tv->inq->q_type == 0) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - else - SCCondSignal(&data_queues[tv->inq->id].cond_q); + SCCondSignal(&trans_q[tv->inq->id].cond_q); } SCLogDebug("signalled tv->inq->id %" PRIu32 "", tv->inq->id); } @@ -1723,10 +1717,7 @@ again: if (tv->inq != NULL) { int i; for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { - if (tv->inq->q_type == 0) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - else - SCCondSignal(&data_queues[tv->inq->id].cond_q); + SCCondSignal(&trans_q[tv->inq->id].cond_q); } SCLogDebug("signalled tv->inq->id %" PRIu32 "", tv->inq->id); } diff --git a/src/tmqh-simple.c b/src/tmqh-simple.c index 3228aaf1b5..255406476d 100644 --- a/src/tmqh-simple.c +++ b/src/tmqh-simple.c @@ -92,65 +92,3 @@ void TmqhOutputSimple(ThreadVars *t, Packet *p) SCMutexUnlock(&q->mutex_q); } -/*******************************Generic-Q-Handlers*****************************/ - -/** - * \brief Public version of TmqhInputSimple from the tmqh-simple queue - * handler, except that it is a generic version that is directly - * tied to a "SCDQDataQueue" instance(sent as an arg). - * - * Retrieves a data_instance from the queue. If the queue is empty, it - * waits on the queue, till a data_instance is enqueued into the queue - * by some other module. - * - * All references to "data_instance" means a reference to a data structure - * instance that implements the template "struct SCDQGenericQData_". - * - * \param q The SCDQDataQueue instance to wait on. - * - * \retval p The returned packet from the queue. - * \retval data The returned data_instance from the queue. - */ -SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *q) -{ - SCMutexLock(&q->mutex_q); - if (q->len == 0) { - /* if we have no packets in queue, wait... 
*/ - SCCondWait(&q->cond_q, &q->mutex_q); - } - - if (q->len > 0) { - SCDQGenericQData *data = SCDQDataDequeue(q); - SCMutexUnlock(&q->mutex_q); - return data; - } else { - /* return NULL if we have no data in the queue. Should only happen - * on signals. */ - SCMutexUnlock(&q->mutex_q); - return NULL; - } -} - -/** - * \brief Public version of TmqhOutputSimple from the tmqh-simple queue - * handler, except that it is a generic version that is directly - * tied to a SCDQDataQueue instance(sent as an arg). - * - * Pumps out a data_instance into the queue. If the queue is empty, it - * waits on the queue, till a data_instance is enqueued into the queue. - * - * All references to "data_instance" means a reference to a data structure - * instance that implements the template "struct SCDQGenericQData_". - * - * \param q The SCDQDataQueue instance to pump the data into. - * \param data The data instance to be enqueued. - */ -void TmqhOutputSimpleOnQ(SCDQDataQueue *q, SCDQGenericQData *data) -{ - SCMutexLock(&q->mutex_q); - SCDQDataEnqueue(q, data); - SCCondSignal(&q->cond_q); - SCMutexUnlock(&q->mutex_q); - - return; -} diff --git a/src/tmqh-simple.h b/src/tmqh-simple.h index 1d4417b4e8..d80de50852 100644 --- a/src/tmqh-simple.h +++ b/src/tmqh-simple.h @@ -24,11 +24,6 @@ #ifndef __TMQH_SIMPLE_H__ #define __TMQH_SIMPLE_H__ -#include "data-queue.h" - -SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *); -void TmqhOutputSimpleOnQ(SCDQDataQueue *, SCDQGenericQData *); - void TmqhSimpleRegister (void); #endif /* __TMQH_SIMPLE_H__ */ diff --git a/src/util-cuda-buffer.c b/src/util-cuda-buffer.c deleted file mode 100644 index 54ae272a52..0000000000 --- a/src/util-cuda-buffer.c +++ /dev/null @@ -1,1358 +0,0 @@ -/* Copyright (C) 2007-2012 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - * - * API has be introduced to allow buffering of data by multiple writers - * asynronously. The current version only allows sequential reads. - * - * The API works by first registering a couple of buffers, which would - * be sliced and allocated for use by the API to potential writers. - * - * The registration API requires 3 buffers to be registered. The data - * buffer(d_buffer), into which the API buffers data, the pointer buffer - * (p_buffer), which would hold the pointer var instance corresponding to - * its entry in the d_buffer, and the offset buffer(o_buffer), which - * holds an offset entry for the data corresponding to the pointer buffer - * entry. - * - * A writer wishing to write data would be required to obtain a slice - * using CudaBufferGetSlice. Once data has been written to the slice, - * it can report back saying the slice has been written to by setting - * a flag in the slice - SC_ATOMIC_SET(slice->done, 1). 
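As a concrete illustration of the write/read cycle described here and in the paragraphs that follow, a minimal sketch reconstructed from this doc comment and from CudaBufferTest06 further down; the 64-entry buffers, the "abc" payload and the helper name are illustrative, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE is the transfer-size constant the unit tests use, and the snippet only builds against a pre-removal tree with CUDA support enabled:

    #include "suricata-common.h"
    #include "util-cuda-buffer.h"

    static void CudaBufferRoundTripSketch(void)
    {
        /* CudaBufferInit() is assumed to have run once at startup already */
        uint8_t d_buffer[64];     /* data buffer, length a multiple of 8 */
        uint32_t o_buffer[64];    /* offset buffer */
        void *p_buffer[64];       /* pointer buffer */

        CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, sizeof(d_buffer),
                                                   o_buffer, p_buffer, 64);
        if (cb == NULL)
            return;

        /* writer: reserve a slice, fill it, then flag it as done */
        CudaBufferSlice *slice = CudaBufferGetSlice(cb, 3, NULL);
        if (slice != NULL) {
            memcpy(slice->buffer + slice->start_offset, "abc", 3);
            SC_ATOMIC_SET(slice->done, 1);
        }

        /* reader: cull completed slices, consume them, then report back */
        CudaBufferCulledInfo culled;
        memset(&culled, 0, sizeof(culled));
        CudaBufferCullCompletedSlices(cb, &culled, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE);
        /* the data to consume is culled.d_buffer_len bytes of d_buffer
         * starting at culled.d_buffer_start_offset (culled.no_of_items items) */
        CudaBufferReportCulledConsumption(cb, &culled);

        CudaBufferDeRegister(cb);
    }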
- * - * A reader wishing to retrieve the data written by writers, will do - * so using the API call - CudaBufferCullCompletedSlices(). Once data - * has been consumed, the reader would report back using - * CudaBufferReportCulledConsumption() so that resources can be freed - * to be reallocated to other writers. - */ - - -#include "suricata-common.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "suricata.h" - -#include "util-atomic.h" -#include "util-pool.h" -#include "util-misc.h" -#include "util-error.h" -#include "util-debug.h" -#include "util-unittest.h" -#include "util-cuda-buffer.h" - -/* rotation limit for the buffers. This basically decides at what position - * inside alloced buffer should the API rotate and start using the buffer - * from the start - The right value's from 0.1-1.0. Do note that the - * rotation decision is taken when the culling process takes place. - * Have a look at - CudaBufferCullCompletedSlices */ -#define CUDA_BUFFER_BUFFER_ROTATION_LIMIT 0.75 - -/* The max buffer size that be registered to CudaBufferRegisterNew */ -#define CUDA_BUFFER_BUFFER_LIMIT (1 * 1024 * 1024 * 1024) - -/* 100,000 * 5 = 500,000 */ -#define CUDA_BUFFER_ITEM_LIMIT (100000 * 5) - -/* a million slices to be prealloced = 100,000 * 10 */ -#define CUDA_BUFFER_SLICE_POOL_PREALLOC (100000 * 10) - -/* we store all our slices here */ -static Pool *slice_pool = NULL; -/* mutex for the above slice pool */ -static SCMutex slice_pool_mutex; - -/** - * \brief Used by a consumer to report back(and thus have it freed), - * once it has consumed data returned in the CudaBufferCulledInfo - * instance(obtained from the call to CudaBufferCullCompletedSlices). - */ -void CudaBufferReportCulledConsumption(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info) -{ - SCMutexLock(&cb_data->m); - - if (culled_info->d_buffer_reset) { - cb_data->d_buffer_read = 0; - } else { - if (culled_info->no_of_items != 0) { - cb_data->d_buffer_read = culled_info->d_buffer_start_offset + - culled_info->d_buffer_len; - } - } - - if (culled_info->op_buffer_reset) { - cb_data->op_buffer_read = 0; - } else { - if (culled_info->no_of_items != 0) { - cb_data->op_buffer_read += culled_info->no_of_items; - } - } - - SCMutexUnlock(&cb_data->m); -} - -/** - * \brief Remove slices that are done. "Done" as in worker threads are done - * writing data to it. - * - * \param cb_data Pointer to the CudaBufferData instance. 
- */ -void CudaBufferCullCompletedSlices(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info, - uint32_t size_limit) -{ - culled_info->no_of_items = 0; - culled_info->d_buffer_reset = 0; - culled_info->op_buffer_reset = 0; - - SCMutexLock(&cb_data->m); - - int buffer_reset = 0; - uint32_t d_buffer_write_temp = 0; - uint32_t op_buffer_write_temp = 0; - - if ((cb_data->d_buffer_write >= - (cb_data->d_buffer_len * CUDA_BUFFER_BUFFER_ROTATION_LIMIT)) && - (cb_data->d_buffer_read != 0)) - { - SCLogDebug("d_buffer reset"); - d_buffer_write_temp = cb_data->d_buffer_write; - cb_data->d_buffer_write = 0; - buffer_reset = 1; - culled_info->d_buffer_reset = 1; - } - - /* reset op_buffer */ - if ((cb_data->op_buffer_write >= - (cb_data->op_buffer_len * CUDA_BUFFER_BUFFER_ROTATION_LIMIT)) && - (cb_data->op_buffer_read != 0)) - { - SCLogDebug("op_buffer reset"); - op_buffer_write_temp = cb_data->op_buffer_write; - cb_data->op_buffer_write = 0; - buffer_reset = 1; - culled_info->op_buffer_reset = 1; - } - - CudaBufferSlice *slice_temp = cb_data->slice_head; - CudaBufferSlice *max_culled_slice = NULL; - uint32_t curr_size = 0; - - while (slice_temp != NULL) { - if (!SC_ATOMIC_GET(slice_temp->done)) { - SCLogDebug("CudaBuffer waiting on an item to finish"); - if (buffer_reset) { - while (!SC_ATOMIC_GET(slice_temp->done)) - usleep(1); - } else { - break; - } - } - - if (curr_size + (slice_temp->end_offset - slice_temp->start_offset + 1) > size_limit) { - if (buffer_reset) { - cb_data->op_buffer_write = op_buffer_write_temp; - cb_data->d_buffer_write = d_buffer_write_temp; - culled_info->d_buffer_reset = 0; - culled_info->op_buffer_reset = 0; - } - break; - } - - max_culled_slice = slice_temp; - curr_size += (slice_temp->end_offset - slice_temp->start_offset + 1); - - slice_temp = slice_temp->next; - } - - CudaBufferSlice *slice_head = cb_data->slice_head; - - if (max_culled_slice != NULL) { - cb_data->slice_head = max_culled_slice->next; - if (max_culled_slice->next == NULL) { - cb_data->slice_tail = NULL; - } - max_culled_slice->next = NULL; - } else { - SCMutexUnlock(&cb_data->m); - return; - } - - culled_info->d_buffer_start_offset = slice_head->start_offset; - culled_info->d_buffer_len = (max_culled_slice->end_offset - - slice_head->start_offset + 1); - culled_info->op_buffer_start_offset = cb_data->op_buffer_read; - SCMutexUnlock(&cb_data->m); - - /* push out the used slices to the the slice_pool */ - SCMutexLock(&slice_pool_mutex); - slice_temp = slice_head; - while (slice_temp != max_culled_slice) { - CudaBufferSlice *tmp = slice_temp->next; - - PoolReturn(slice_pool, slice_temp); - culled_info->no_of_items++; - - slice_temp = tmp; - } - PoolReturn(slice_pool, slice_temp); - culled_info->no_of_items++; - SCMutexUnlock(&slice_pool_mutex); - - return; -} - -/** - * \internal - * \brief Adds a slice to the CudaBufferData slice list. - * - * We expect the CudaBufferData instance to be locked. - * - * \param cb_data Pointer to the CudaBufferdata instance. - * \param slice Pointer to the slice to be pushed. - */ -static inline void CudaBufferAppendSlice(CudaBufferData *cb_data, CudaBufferSlice *slice) -{ - slice->next = NULL; - - if (cb_data->slice_head == NULL) { - cb_data->slice_head = slice; - cb_data->slice_tail = slice; - } else { - cb_data->slice_tail->next = slice; - cb_data->slice_tail = slice; - } - - return; -} - -/** - * \brief Gets a new buffer slice for a consumer to write to. - * - * All slices returned are aligned to the next 8 byte boundary. 
- * - * \param cb_data Pointer to the CudaBufferdata instance. - * \param len Length of the slice required. - * \param p Pointer to the var corresponding to the data to store. - * - * \retval slice Pointer to the slice if successful; NULL if unsuccessful. - */ -CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void *p) -{ -#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) - - SCMutexLock(&slice_pool_mutex); - CudaBufferSlice *slice = PoolGet(slice_pool); - SCMutexUnlock(&slice_pool_mutex); - if (slice == NULL) { - return NULL; - } - - SCMutexLock(&cb_data->m); - - if (cb_data->d_buffer_write < cb_data->d_buffer_read) { - if (cb_data->d_buffer_write + len >= cb_data->d_buffer_read) { - SCLogDebug("d_buffer full"); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } else { - if (cb_data->d_buffer_write + len > cb_data->d_buffer_len) { - SCLogDebug("d_buffer limit hit - buffer_len - %"PRIu32, - cb_data->d_buffer_len); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } - - if (cb_data->op_buffer_write < cb_data->op_buffer_read) { - if (cb_data->op_buffer_write + 1 >= cb_data->op_buffer_read) { - SCLogDebug("op_buffer full"); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } else { - if (cb_data->op_buffer_write + 1 > cb_data->op_buffer_len) { - SCLogDebug("op_buffer limit hit - buffer_len - %"PRIu32, - cb_data->op_buffer_len); - SCMutexUnlock(&cb_data->m); - - SCMutexLock(&slice_pool_mutex); - PoolReturn(slice_pool, slice); - SCMutexUnlock(&slice_pool_mutex); - return NULL; - } - } - - slice->start_offset = cb_data->d_buffer_write; - cb_data->d_buffer_write = slice->start_offset + len; - ALIGN_UP(cb_data->d_buffer_write, 8); - slice->end_offset = cb_data->d_buffer_write - 1; - slice->buffer = cb_data->d_buffer; - SC_ATOMIC_SET(slice->done, 0); - - CudaBufferAppendSlice(cb_data, slice); - cb_data->no_of_items++; - - cb_data->o_buffer[cb_data->op_buffer_write] = slice->start_offset; - cb_data->p_buffer[cb_data->op_buffer_write] = p; - cb_data->op_buffer_write++; - - SCMutexUnlock(&cb_data->m); - - return slice; -} - -void CudaBufferDeRegister(CudaBufferData *cb_data) -{ - CudaBufferSlice *slice_temp = cb_data->slice_head; - SCMutexLock(&slice_pool_mutex); - while (slice_temp != NULL) { - CudaBufferSlice *slice_temp_next = slice_temp->next; - PoolReturn(slice_pool, slice_temp); - slice_temp = slice_temp_next; - } - SCMutexUnlock(&slice_pool_mutex); - - SCMutexDestroy(&cb_data->m); - SCFree(cb_data); - - return; -} - -/** - * \brief Registers a new buffer to be handled by the CudaBuffer API. - * - * More on what this API does can be understood from the API - * docs at the start of this file. - * - * \param d_buffer The data buffer to work with. - * \param d_buffer_len Length of d_buffer. - * \param o_buffer The offset buffer. - * \param p_buffer The pointer buffer. - * \param op_buffer_no_of_items Length of o_buffer and p_buffer. Please - * note that both o_buffer and p_buffer - * should be of the same length. - * \param len Length of the buffer to be assigned. 
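The effect of the ALIGN_UP step in CudaBufferGetSlice() above is what the offsets in the unit tests below reflect: with an empty buffer, a 3-byte slice gets start_offset 0 and end_offset 7, and the next slice starts at offset 8, exactly the "one"/"two" layout CudaBufferTest06 checks. A small worked sketch of the arithmetic, with illustrative values:

    /* same macro as in CudaBufferGetSlice() above */
    #define ALIGN_UP(offset, alignment) \
        (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    uint32_t d_buffer_write = 0;

    /* first slice, len 3: start_offset = 0 */
    d_buffer_write += 3;           /* 3  */
    ALIGN_UP(d_buffer_write, 8);   /* 8, so end_offset = 8 - 1 = 7 */

    /* second slice, len 3: start_offset = 8 */
    d_buffer_write += 3;           /* 11 */
    ALIGN_UP(d_buffer_write, 8);   /* 16, so end_offset = 15 */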
- */ -CudaBufferData *CudaBufferRegisterNew(uint8_t *d_buffer, uint32_t d_buffer_len, - uint32_t *o_buffer, void **p_buffer, - uint32_t op_buffer_no_of_items) -{ - if (d_buffer_len > CUDA_BUFFER_BUFFER_LIMIT) { - SCLogError(SC_ERR_CUDA_BUFFER_ERROR, "Buffer max limit exceeded. We " - "accept a max limit of %u bytes", CUDA_BUFFER_BUFFER_LIMIT); - return NULL; - } - - if ((d_buffer_len % 8) != 0) { - SCLogError(SC_ERR_CUDA_BUFFER_ERROR, "Please specify a buffer length which " - "is a multiple of 8"); - return NULL; - } - - CudaBufferData *new = SCMalloc(sizeof(CudaBufferData)); - if (unlikely(new == NULL)) { - return NULL; - } - memset(new, 0, sizeof(CudaBufferData)); - - /* payload/data buffer and set its size */ - new->d_buffer = d_buffer; - new->d_buffer_len = d_buffer_len; - - /* offset buffer and set its size */ - new->o_buffer = o_buffer; - new->p_buffer = p_buffer; - /* common to the above 2 malloc'ed buffers */ - new->op_buffer_len = op_buffer_no_of_items; - - /* used to lock this new instance when it's used */ - SCMutexInit(&new->m, NULL); - - return new; -} - -static void *CudaBufferSlicePoolAlloc(void *null) -{ - void *ptr = SCMalloc(sizeof(CudaBufferSlice)); - if (unlikely(ptr == NULL)) - return NULL; - memset(ptr, 0, sizeof(CudaBufferSlice)); - - SC_ATOMIC_INIT(((CudaBufferSlice *)ptr)->done); - - return ptr; -} - -static int CudaBufferSlicePoolInit(void *data, void *init_data) -{ - SC_ATOMIC_INIT(((CudaBufferSlice *)data)->done); - - return 1; -} - -/* disabled to reflect the changes made in PoolInit */ -#if 0 -static void CudaBufferSlicePoolFree(void *data) -{ - SC_ATOMIC_DESTROY(((CudaBufferSlice *)data)->done); - SCFree(data); - - return; -} -#endif - -static void CudaBufferSlicePoolCleanup(void *data) -{ - SC_ATOMIC_DESTROY(((CudaBufferSlice *)data)->done); - - return; -} - -/** - * \brief Init the API. To be called only once at startup time. 
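Two usage constraints follow from the checks above: the registered data buffer must be no larger than CUDA_BUFFER_BUFFER_LIMIT and its length must be a multiple of 8, and because slices are drawn from the global slice_pool that CudaBufferInit() creates, the one-time init has to run before the first slice is requested. A minimal ordering sketch, with caller-allocated buffers and sizes that are purely illustrative:

    /* once, at engine startup, before any CudaBufferGetSlice() call */
    CudaBufferInit();

    /* d_buffer_len: a multiple of 8, at most CUDA_BUFFER_BUFFER_LIMIT;
     * d_buffer, o_buffer and p_buffer are allocated by the caller */
    CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, 4096,
                                               o_buffer, p_buffer, 512);
    if (cb == NULL) {
        /* rejected length or allocation failure */
    }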
- */ -void CudaBufferInit(void) -{ - SCMutexInit(&slice_pool_mutex, NULL); - - slice_pool = PoolInit(CUDA_BUFFER_SLICE_POOL_PREALLOC, - CUDA_BUFFER_SLICE_POOL_PREALLOC, - sizeof(CudaBufferSlice), - CudaBufferSlicePoolAlloc, - CudaBufferSlicePoolInit, - NULL, - CudaBufferSlicePoolCleanup, - NULL); - if (slice_pool == NULL) { - SCLogError(SC_ERR_POOL_INIT, "CudaBuffer slice_pool is not initialized"); - exit(EXIT_FAILURE); - } - - return; -} - -/****************************Unittests***************************/ - -#ifdef UNITTESTS - -int CudaBufferTest01(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice4, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - /* new slice */ - slice1 = CudaBufferGetSlice(data, 8, NULL); - if (slice1->start_offset != 0 || slice1->end_offset != 7 || - SC_ATOMIC_GET(slice1->done) != 0) { - printf("failure 1\n"); - goto end; - } - if (data->d_buffer_write != 8 || data->d_buffer_read != 0 || - data->op_buffer_write != 1 || data->op_buffer_read != 0 || - data->no_of_items != 1) { - printf("failure 2\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 3\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 4\n"); - goto end; - } - - /* new slice */ - slice2 = CudaBufferGetSlice(data, 16, NULL); - if (slice2->start_offset != 8 || slice2->end_offset != 23 || - SC_ATOMIC_GET(slice2->done) != 0) { - printf("failure 5\n"); - goto end; - } - if (data->d_buffer_write != 24 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 6\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 7\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 8\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 9\n"); - goto end; - } - - /* new slice */ - slice3 = CudaBufferGetSlice(data, 36, NULL); - if (slice3->start_offset != 24 || slice3->end_offset != 63 || - SC_ATOMIC_GET(slice3->done) != 0) { - printf("failure 10\n"); - goto end; - } - if (data->d_buffer_write != 64 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 11\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 12\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 13\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 63 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - 
printf("failure 14\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 15\n"); - goto end; - } - - slice4 = CudaBufferGetSlice(data, 10, NULL); - if (slice4 != NULL) { - printf("failure 16\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 17\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest02(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice1 = CudaBufferGetSlice(data, 8, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - if (data->d_buffer_write != 24 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 1\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 2\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 3\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 4\n"); - goto end; - } - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 5\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 6\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 7\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 8\n"); - goto end; - } - - SC_ATOMIC_SET(slice2->done, 1); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 9\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 7 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 10\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 8 || slice_temp->end_offset != 23 || - SC_ATOMIC_GET(slice_temp->done) != 1) { - printf("failure 11\n"); - goto end; - } - if (slice_temp->next != NULL) { - printf("failure 12\n"); - goto end; - } - - SC_ATOMIC_SET(slice1->done, 1); - - CudaBufferCullCompletedSlices(data, 
&culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 2) { - printf("failure 13\n"); - goto end; - } - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 14\n"); - goto end; - } - if (culled_info.d_buffer_start_offset != 0 || - culled_info.d_buffer_len != 24 || - culled_info.op_buffer_start_offset != 0 || - culled_info.d_buffer_reset != 0 || culled_info.op_buffer_reset != 0) { - printf("failure 15\n"); - goto end; - } - if (data->d_buffer_write != 24 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 16\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 24 || data->d_buffer_read != 24 || - data->op_buffer_write != 2 || data->op_buffer_read != 2 || - data->no_of_items != 2) { - printf("failure 17\n"); - goto end; - } - - /* new slice */ - slice3 = CudaBufferGetSlice(data, 8, NULL); - if (slice3->start_offset != 24 || slice3->end_offset != 31 || - SC_ATOMIC_GET(slice3->done) != 0) { - printf("failure 18\n"); - goto end; - } - if (data->d_buffer_write != 32 || data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 19\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 20\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 21\n"); - goto end; - } - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 22\n"); - goto end; - } - if (data->d_buffer_write != 32 || data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 23\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 24\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 25\n"); - goto end; - } - - /* set done flag */ - SC_ATOMIC_SET(slice3->done, 1); - if (slice3->start_offset != 24 || slice3->end_offset != 31 || - SC_ATOMIC_GET(slice3->done) != 1) { - printf("failure 26\n"); - goto end; - } - if (data->d_buffer_write != 32 || data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 27\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 24 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 1) { - printf("failure 28\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 29\n"); - goto end; - } - - /* culling */ - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 1) { - printf("failure 30\n"); - goto end; - } - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 31\n"); - goto end; - } - if (culled_info.d_buffer_start_offset != 24 || - culled_info.d_buffer_len != 8 || - culled_info.op_buffer_start_offset != 2 || - culled_info.d_buffer_reset != 0 || culled_info.op_buffer_reset != 0) { - printf("failure 32\n"); - goto end; - } - if (data->d_buffer_write != 32 || 
data->d_buffer_read != 24 || - data->op_buffer_write != 3 || data->op_buffer_read != 2 || - data->no_of_items != 3) { - printf("failure 33\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 32 || data->d_buffer_read != 32 || - data->op_buffer_write != 3 || data->op_buffer_read != 3 || - data->no_of_items != 3) { - printf("failure 34\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 35\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest03(void) -{ - CudaBufferSlice *slice, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice = CudaBufferGetSlice(data, 16, NULL); - BUG_ON(slice == NULL); - slice = CudaBufferGetSlice(data, 16, NULL); - BUG_ON(slice == NULL); - slice = CudaBufferGetSlice(data, 24, NULL); - BUG_ON(slice == NULL); - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 0) { - printf("failure 1\n"); - goto end; - } - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 2\n"); - goto end; - } - slice_temp = data->slice_head; - if (slice_temp->start_offset != 0 || slice_temp->end_offset != 15 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 3\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 16 || slice_temp->end_offset != 31 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 4\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp->start_offset != 32 || slice_temp->end_offset != 55 || - SC_ATOMIC_GET(slice_temp->done) != 0) { - printf("failure 5\n"); - goto end; - } - slice_temp = slice_temp->next; - if (slice_temp != NULL) { - printf("failure 6\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 7\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest04(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer 
== NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice1 = CudaBufferGetSlice(data, 16, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - slice3 = CudaBufferGetSlice(data, 24, NULL); - - SC_ATOMIC_SET(slice1->done, 1); - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 1\n"); - goto end; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 1) { - printf("failure 2\n"); - goto end; - } - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 0 || - data->no_of_items != 3) { - printf("failure 3\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 56 || data->d_buffer_read != 16 || - data->op_buffer_write != 3 || data->op_buffer_read != 1 || - data->no_of_items != 3) { - printf("failure 4\n"); - goto end; - } - - SC_ATOMIC_SET(slice2->done, 1); - SC_ATOMIC_SET(slice3->done, 1); - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (culled_info.no_of_items != 2) { - printf("failure 5\n"); - goto end; - } - if (data->d_buffer_write != 0 || data->d_buffer_read != 16 || - data->op_buffer_write != 3 || data->op_buffer_read != 1 || - data->no_of_items != 3) { - printf("failure 6\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 0 || data->d_buffer_read != 0 || - data->op_buffer_write != 3 || data->op_buffer_read != 3 || - data->no_of_items != 3) { - printf("failure 7\n"); - goto end; - } - - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 8\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 9\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest05(void) -{ - CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp; - int result = 0; - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice1 = CudaBufferGetSlice(data, 16, NULL); - slice2 = CudaBufferGetSlice(data, 16, NULL); - slice3 = CudaBufferGetSlice(data, 24, NULL); - - 
SC_ATOMIC_SET(slice1->done, 1); - - /* culling */ - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - CudaBufferReportCulledConsumption(data, &culled_info); - - SC_ATOMIC_SET(slice2->done, 1); - SC_ATOMIC_SET(slice3->done, 1); - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - CudaBufferReportCulledConsumption(data, &culled_info); - slice1 = CudaBufferGetSlice(data, 16, NULL); - if (slice1 == NULL) { - printf("failure 1\n"); - goto end; - } - slice2 = CudaBufferGetSlice(data, 16, NULL); - if (slice2 == NULL) { - printf("failure 2\n"); - goto end; - } - slice3 = CudaBufferGetSlice(data, 24, NULL); - if (slice2 == NULL) { - printf("failure 3\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 4\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -int CudaBufferTest06(void) -{ - CudaBufferSlice *slice, *slice_temp; - int result = 0; - CudaBufferCulledInfo culled_info; - memset(&culled_info, 0, sizeof(CudaBufferCulledInfo)); - - uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64); - uint32_t *o_buffer = SCMalloc(sizeof(uint32_t) * 64); - void **p_buffer = SCMalloc(sizeof(void *) * 64); - if (d_buffer == NULL || o_buffer == NULL || p_buffer == NULL) { - printf("failure 0\n"); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - return 0; - } - - CudaBufferData *data = CudaBufferRegisterNew(d_buffer, 64, - o_buffer, p_buffer, 64); - if (data == NULL) { - goto end; - } - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "one", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "two", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (data->d_buffer_write != 16 || data->d_buffer_read != 0 || - data->op_buffer_write != 2 || data->op_buffer_read != 0 || - data->no_of_items != 2) { - printf("failure 1\n"); - goto end; - } - - slice = CudaBufferGetSlice(data, 5, NULL); - memcpy(slice->buffer + slice->start_offset, - "three", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 4, NULL); - memcpy(slice->buffer + slice->start_offset, - "four", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 4, NULL); - memcpy(slice->buffer + slice->start_offset, - "five", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (data->d_buffer_write != 40 || data->d_buffer_read != 0 || - data->op_buffer_write != 5 || data->op_buffer_read != 0 || - data->no_of_items != 5) { - printf("failure 2\n"); - goto end; - } - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "six", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 5, NULL); - memcpy(slice->buffer + slice->start_offset, - "seven", slice->end_offset - 
slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (memcmp(data->d_buffer, "one", 3) != 0 || - memcmp(data->d_buffer + 8, "two", 3) != 0 || - memcmp(data->d_buffer + 16, "three", 5) != 0 || - memcmp(data->d_buffer + 24, "four", 4) != 0 || - memcmp(data->d_buffer + 32, "five", 4) != 0 || - memcmp(data->d_buffer + 40, "six", 3) != 0 || - memcmp(data->d_buffer + 48, "seven", 5) != 0) { - printf("failure 3\n"); - goto end; - } - - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 7 || data->op_buffer_read != 0 || - data->no_of_items != 7) { - printf("failure 4\n"); - goto end; - } - - /* culling */ - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->d_buffer_write != 56 || data->d_buffer_read != 0 || - data->op_buffer_write != 7 || data->op_buffer_read != 0 || - data->no_of_items != 7) { - printf("failure 5\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - if (data->d_buffer_write != 56 || data->d_buffer_read != 56 || - data->op_buffer_write != 7 || data->op_buffer_read != 7 || - data->no_of_items != 7) { - printf("failure 6\n"); - goto end; - } - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->d_buffer_write != 0 || data->d_buffer_read != 56 || - data->op_buffer_write != 7 || data->op_buffer_read != 7 || - data->no_of_items != 7) { - printf("failure 7\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - - if (data->d_buffer_write != 0 || data->d_buffer_read != 0 || - data->op_buffer_write != 7 || data->op_buffer_read != 7 || - data->no_of_items != 7) { - printf("failure 8\n"); - goto end; - } - - slice = CudaBufferGetSlice(data, 5, NULL); - memcpy(slice->buffer + slice->start_offset, - "eight", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 4, NULL); - memcpy(slice->buffer + slice->start_offset, - "nine", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 3, NULL); - memcpy(slice->buffer + slice->start_offset, - "ten", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 6, NULL); - memcpy(slice->buffer + slice->start_offset, - "eleven", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - slice = CudaBufferGetSlice(data, 6, NULL); - memcpy(slice->buffer + slice->start_offset, - "twelve", slice->end_offset - slice->start_offset + 1); - SC_ATOMIC_SET(slice->done, 1); - - if (data->d_buffer_write != 40 || data->d_buffer_read != 0 || - data->op_buffer_write != 12 || data->op_buffer_read != 7 || - data->no_of_items != 12) { - printf("failure 9\n"); - goto end; - } - - if (memcmp(data->d_buffer, "eight", 5) != 0 || - memcmp(data->d_buffer + 8, "nine", 4) != 0 || - memcmp(data->d_buffer + 16, "ten", 3) != 0 || - memcmp(data->d_buffer + 24, "eleven", 6) != 0 || - memcmp(data->d_buffer + 32, "twelve", 6) != 0) { - printf("failure 10\n"); - goto end; - } - - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->d_buffer_write != 40 || data->d_buffer_read != 0 || - data->op_buffer_write != 12 || data->op_buffer_read != 7 || - data->no_of_items != 12) { - printf("failure 11\n"); - goto end; - } - CudaBufferReportCulledConsumption(data, &culled_info); - - if (data->d_buffer_write != 40 || data->d_buffer_read != 40 || - 
data->op_buffer_write != 12 || data->op_buffer_read != 12 || - data->no_of_items != 12) { - printf("failure 12\n"); - goto end; - } - - result = 1; - end: - slice_temp = data->slice_head; - while (slice_temp != NULL) { - SC_ATOMIC_SET(slice_temp->done, 1); - slice_temp = slice_temp->next; - } - CudaBufferCullCompletedSlices(data, &culled_info, UTIL_MPM_CUDA_GPU_TRANSFER_SIZE); - if (data->slice_head != NULL || data->slice_tail != NULL) { - printf("failure 13\n"); - result = 0; - } - - CudaBufferDeRegister(data); - SCFree(d_buffer); - SCFree(o_buffer); - SCFree(p_buffer); - - return result; -} - -#endif /* #ifdef UNITTESTS */ - -void CudaBufferRegisterUnittests(void) -{ -#ifdef UNITTESTS - UtRegisterTest("CudaBufferTest01", CudaBufferTest01); - UtRegisterTest("CudaBufferTest02", CudaBufferTest02); - UtRegisterTest("CudaBufferTest03", CudaBufferTest03); - UtRegisterTest("CudaBufferTest04", CudaBufferTest04); - UtRegisterTest("CudaBufferTest05", CudaBufferTest05); - UtRegisterTest("CudaBufferTest06", CudaBufferTest06); -#endif - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda-buffer.h b/src/util-cuda-buffer.h deleted file mode 100644 index ab494e6755..0000000000 --- a/src/util-cuda-buffer.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (C) 2007-2013 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file API to allow buffering of data. - * - * Introduced with cuda as the primary objective. Allows multiple - * threads to simultaneously access a single buffer and write to it. - * - * Current version allows only serial reads from the buffer. - * When the need arises, the API will be updated to allow multiple - * non-sequential reads. - * - * \author Anoop Saldanha - */ - -#ifdef __SC_CUDA_SUPPORT__ - -#ifndef __UTIL_CUDA_BUFFER_H__ -#define __UTIL_CUDA_BUFFER_H__ - -#include "util-atomic.h" - -/** - * \brief Used by consumers to retrieve the data buffered. - */ -typedef struct CudaBufferCulledInfo_ { - uint32_t no_of_items; - - uint32_t d_buffer_start_offset; - uint32_t d_buffer_len; - - /* we use no_of_items to determine the no of items here */ - uint32_t op_buffer_start_offset; - - uint8_t d_buffer_reset; - uint8_t op_buffer_reset; -} CudaBufferCulledInfo; - -/** - * /brief A slice which contains details on where to buffer data by a - * writer. - */ -typedef struct CudaBufferSlice_ { - uint32_t start_offset; - uint32_t end_offset; - uint8_t *buffer; - SC_ATOMIC_DECLARE(uint8_t, done); - - struct CudaBufferSlice_ *next; -} CudaBufferSlice; - -typedef struct CudaBufferData_ { - /* the data buffer */ - uint8_t *d_buffer; - uint32_t d_buffer_len; - uint32_t d_buffer_write; - uint32_t d_buffer_read; - - /* debug only. 
Can be removed */ - uint32_t no_of_items; - - /* these 2 buffers below - o_buffer and p_buffer should be - * used/updated in tandem - * p_buffer is the ptr buffer that points to a data instance that - * represents it's corresponding data stored in d_buffer. - * o_buffer is the corresponding entry to the one in p_buffer, which - * holds the offset to the corresponding entry in d_buffer. */ - uint32_t *o_buffer; - void **p_buffer; - uint32_t op_buffer_len; - uint32_t op_buffer_write; - uint32_t op_buffer_read; - - /* slice lists used by writers */ - CudaBufferSlice *slice_head; - CudaBufferSlice *slice_tail; - - /* mutex used by the entire struct */ - SCMutex m; -} CudaBufferData; - -void CudaBufferReportCulledConsumption(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info); -void CudaBufferCullCompletedSlices(CudaBufferData *cb_data, - CudaBufferCulledInfo *culled_info, uint32_t size_limit); -CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *data, uint32_t len, void *p); -void CudaBufferDeRegister(CudaBufferData *cb_data); -CudaBufferData *CudaBufferRegisterNew(uint8_t *d_buffer, uint32_t d_buffer_len, - uint32_t *o_buffer, void **p_buffer, - uint32_t op_buffer_no_of_items); -void CudaBufferInit(void); -void CudaBufferRegisterUnittests(void); - -#endif /* __UTIL_CUDA_BUFFER_H__ */ - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c deleted file mode 100644 index 198c6f73b9..0000000000 --- a/src/util-cuda-handlers.c +++ /dev/null @@ -1,364 +0,0 @@ -/* Copyright (C) 2007-2013 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -/* compile in, only if we have a CUDA enabled device on the machine, with the - * toolkit and the driver installed */ - -#include "suricata-common.h" - -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-error.h" -#include "util-debug.h" -#include "conf.h" -#include "util-cuda.h" -#include "util-cuda-handlers.h" - -/* file only exists if cuda is enabled */ -#include "cuda-ptxdump.h" - -/************************conf file profile section**********************/ - -typedef struct CudaHandlerConfProfile_ { - char *name; - void *ctx; - void (*Free)(void *); - - struct CudaHandlerConfProfile_ *next; -} CudaHandlerConfProfile; - -static CudaHandlerConfProfile *conf_profiles = NULL; -/* protects above var */ -static SCMutex mutex = SCMUTEX_INITIALIZER; - -void CudaHandlerAddCudaProfileFromConf(const char *name, - void *(*Callback)(ConfNode *node), - void (*Free)(void *)) -{ - /* we don't do data validation */ - SCMutexLock(&mutex); - - CudaHandlerConfProfile *tmp_cp = conf_profiles; - while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0) - tmp_cp = tmp_cp->next; - - if (tmp_cp != NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENT, "We already have a cuda conf " - "profile by the name \"%s\" registered.", name); - exit(EXIT_FAILURE); - } - - char tmp[200]; - int r = snprintf(tmp, sizeof(tmp), "%s%s", "cuda.", name); - if (r < 0) { - SCLogError(SC_ERR_FATAL, "snprintf failure."); - exit(EXIT_FAILURE); - } else if (r > (int)sizeof(tmp)) { - SCLogError(SC_ERR_FATAL, "buffer not big enough to write param."); - exit(EXIT_FAILURE); - } - void *ctx = Callback(ConfGetNode(tmp)); - if (ctx == NULL) { - SCMutexUnlock(&mutex); - return; - } - - CudaHandlerConfProfile *new_cp = SCMalloc(sizeof(CudaHandlerConfProfile)); - if (unlikely(new_cp == NULL)) - exit(EXIT_FAILURE); - memset(new_cp, 0, sizeof(CudaHandlerConfProfile)); - new_cp->name = SCStrdup(name); - if (new_cp->name == NULL) - exit(EXIT_FAILURE); - new_cp->ctx = ctx; - new_cp->Free = Free; - - if (conf_profiles == NULL) { - conf_profiles = new_cp; - } else { - new_cp->next = conf_profiles; - conf_profiles = new_cp; - } - - SCMutexUnlock(&mutex); - return; -} - -void *CudaHandlerGetCudaProfile(const char *name) -{ - SCMutexLock(&mutex); - - CudaHandlerConfProfile *tmp_cp = conf_profiles; - while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0) - tmp_cp = tmp_cp->next; - - if (tmp_cp == NULL) { - SCMutexUnlock(&mutex); - return NULL; - } - - SCMutexUnlock(&mutex); - return tmp_cp->ctx; -} - -void CudaHandlerFreeProfiles(void) -{ - SCMutexLock(&mutex); - - CudaHandlerConfProfile *tmp = conf_profiles; - while (tmp != NULL) { - CudaHandlerConfProfile *curr = tmp; - tmp = tmp->next; - SCFree(curr->name); - if (curr->Free != NULL) - curr->Free(curr->ctx); - SCFree(curr); - } - - SCMutexUnlock(&mutex); - return; -} - -/*******************cuda context related data section*******************/ - -/* we use a concept where every device on the gpu has only 1 context. If - * a section in the engine wants to use a device and tries to open a context - * on it, we first check if a context is already created for the device and if - * so we return it. 
If not we create a new one and update with the entry */ - -static CUcontext *cuda_contexts = NULL; -static int no_of_cuda_contexts = 0; - -typedef struct CudaHandlerModuleData_ { - char *name; - void *data; - - struct CudaHandlerModuleData_ *next; -} CudaHandlerModuleData; - -typedef struct CudaHandlerModule_ { - char *name; - - /* the context used by this module */ - CUcontext context; - /* the device on which the above context was created */ - int device_id; - CudaHandlerModuleData *module_data; - - struct CudaHandlerModule_ *next; -} CudaHandlerModule; - -static CudaHandlerModule *cudahl_modules = NULL; - -CUcontext CudaHandlerModuleGetContext(const char *name, int device_id) -{ - void *ptmp; - SCMutexLock(&mutex); - - CudaHandlerModule *module = cudahl_modules; - while (module != NULL && strcasecmp(module->name, name) != 0) - module = module->next; - if (module != NULL) { - if (module->device_id != device_id) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module already " - "registered, but the new device_id is different " - "from the already registered device_id."); - exit(EXIT_FAILURE); - } - SCMutexUnlock(&mutex); - return module->context; - } - - CudaHandlerModule *new_module = SCMalloc(sizeof(CudaHandlerModule)); - if (unlikely(new_module == NULL)) - exit(EXIT_FAILURE); - memset(new_module, 0, sizeof(CudaHandlerModule)); - new_module->device_id = device_id; - new_module->name = SCStrdup(name); - if (new_module->name == NULL) - exit(EXIT_FAILURE); - if (cudahl_modules == NULL) { - cudahl_modules = new_module; - } else { - new_module->next = cudahl_modules; - cudahl_modules = new_module; - } - - if (no_of_cuda_contexts <= device_id) { - ptmp = SCRealloc(cuda_contexts, sizeof(CUcontext) * (device_id + 1)); - if (unlikely(ptmp == NULL)) { - SCFree(cuda_contexts); - cuda_contexts = NULL; - exit(EXIT_FAILURE); - } - cuda_contexts = ptmp; - - memset(cuda_contexts + no_of_cuda_contexts, 0, - sizeof(CUcontext) * ((device_id + 1) - no_of_cuda_contexts)); - no_of_cuda_contexts = device_id + 1; - } - - if (cuda_contexts[device_id] == 0) { - SCCudaDevices *devices = SCCudaGetDeviceList(); - if (SCCudaCtxCreate(&cuda_contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, - devices->devices[device_id]->device) == -1) { - SCLogDebug("ctxcreate failure."); - exit(EXIT_FAILURE); - } - } - new_module->context = cuda_contexts[device_id]; - - SCMutexUnlock(&mutex); - return cuda_contexts[device_id]; -} - -void CudaHandlerModuleStoreData(const char *module_name, - const char *data_name, void *data_ptr) -{ - SCMutexLock(&mutex); - - CudaHandlerModule *module = cudahl_modules; - while (module != NULL && strcasecmp(module->name, module_name) != 0) - module = module->next; - if (module == NULL) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data " - "\"%s\" from module \"%s\" that hasn't been registered " - "yet.", module_name, data_name); - exit(EXIT_FAILURE); - } - - CudaHandlerModuleData *data = module->module_data; - while (data != NULL && (strcasecmp(data_name, data->name) != 0)) { - data = data->next; - } - if (data != NULL) { - SCLogWarning(SC_ERR_CUDA_HANDLER_ERROR, "Data \"%s\" already " - "registered for this module \"%s\".", data_name, - module_name); - SCMutexUnlock(&mutex); - goto end; - } - - CudaHandlerModuleData *new_data = SCMalloc(sizeof(CudaHandlerModuleData)); - if (unlikely(new_data == NULL)) - exit(EXIT_FAILURE); - memset(new_data, 0, sizeof(CudaHandlerModuleData)); - new_data->name = SCStrdup(data_name); - if (new_data->name == NULL) - exit(EXIT_FAILURE); - new_data->data = 
data_ptr; - - if (module->module_data == NULL) { - module->module_data = new_data; - } else { - new_data->next = module->module_data; - module->module_data = new_data; - } - - SCMutexUnlock(&mutex); - - end: - return; -} - -void *CudaHandlerModuleGetData(const char *module_name, const char *data_name) -{ - SCMutexLock(&mutex); - - CudaHandlerModule *module = cudahl_modules; - while (module != NULL && strcasecmp(module->name, module_name) != 0) - module = module->next; - if (module == NULL) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data " - "\"%s\" from module \"%s\" that hasn't been registered " - "yet.", module_name, data_name); - SCMutexUnlock(&mutex); - return NULL; - } - - CudaHandlerModuleData *data = module->module_data; - while (data != NULL && (strcasecmp(data_name, data->name) != 0)) { - data = data->next; - } - if (data == NULL) { - SCLogInfo("Data \"%s\" already registered for this module \"%s\". " - "Returning it.", data_name, module_name); - SCMutexUnlock(&mutex); - return NULL; - } - - SCMutexUnlock(&mutex); - return data->data; -} - -int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image) -{ -#define CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE 15 - - int i = 0; - - /* select the ptx image based on the compute capability supported by all - * devices (i.e. the lowest) */ - char *image = SCMalloc(strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE); - if (unlikely(image == NULL)) { - exit(EXIT_FAILURE); - } - memset(image, 0x00, strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE); - - int major = INT_MAX; - int minor = INT_MAX; - SCCudaDevices *devices = SCCudaGetDeviceList(); - for (i = 0; i < devices->count; i++){ - if (devices->devices[i]->major_rev < major){ - major = devices->devices[i]->major_rev; - minor = devices->devices[i]->minor_rev; - } - if (devices->devices[i]->major_rev == major && - devices->devices[i]->minor_rev < minor){ - minor = devices->devices[i]->minor_rev; - } - } - snprintf(image, - strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE, - "%s_sm_%u%u", - ptx_image, major, minor); - - /* we don't have a cuda module associated with this module. Create a - * cuda module, update the module with this cuda module reference and - * then return the module refernce back to the calling function using - * the argument */ - SCLogDebug("Loading kernel module: %s\n",image); - if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1) - goto error; - SCFree(image); - - return 0; - error: - SCFree(image); - return -1; - -#undef CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE -} - - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda-handlers.h b/src/util-cuda-handlers.h deleted file mode 100644 index eee227df60..0000000000 --- a/src/util-cuda-handlers.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (C) 2007-2012 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#ifndef __UTIL_CUDA_HANDLERS__H__ -#define __UTIL_CUDA_HANDLERS__H__ - -#include "conf.h" -#include "util-cuda.h" - -/************************conf file profile section**********************/ - -void CudaHandlerAddCudaProfileFromConf(const char *name, - void *(*Callback)(ConfNode *node), - void (*Free)(void *)); -void *CudaHandlerGetCudaProfile(const char *name); -void CudaHandlerFreeProfiles(void); - -/*******************cuda context related data section*******************/ - -#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_HOST 0 -#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_DEVICE 1 -#define CUDA_HANDLER_MODULE_DATA_TYPE_CUDA_BUFFER 2 - -CUcontext CudaHandlerModuleGetContext(const char *module_name, int device_id); -void CudaHandlerModuleStoreData(const char *module_name, - const char *data_name, void *data_ptr); -void *CudaHandlerModuleGetData(const char *module_name, const char *data_name); -int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image); - -#endif /* __UTIL_CUDA_HANDLERS__H__ */ diff --git a/src/util-cuda-vars.c b/src/util-cuda-vars.c deleted file mode 100644 index 596be85826..0000000000 --- a/src/util-cuda-vars.c +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#include "suricata-common.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "suricata.h" -#include "util-mpm.h" -#include "util-cuda-handlers.h" -#include "util-cuda-vars.h" -#include "detect-engine-mpm.h" -#include "util-debug.h" -#include "util-mpm-ac.h" - -static DetectEngineCtx *cuda_de_ctx = NULL; - -void CudaVarsSetDeCtx(DetectEngineCtx *de_ctx) -{ - if (cuda_de_ctx != NULL) { - SCLogError(SC_ERR_FATAL, "CudaVarsSetDeCtx() called more than once. 
" - "This function should be called only once during the " - "lifetime of the engine."); - exit(EXIT_FAILURE); - } - - cuda_de_ctx = de_ctx; - - return; -} - -int CudaThreadVarsInit(CudaThreadVars *ctv) -{ - if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) - return 0; - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - ctv->mpm_is_cuda = 1; - ctv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - ctv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit; - ctv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit; - ctv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0); - ctv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1); - ctv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0); - ctv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1); - ctv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0); - - return 0; -} - -#endif diff --git a/src/util-cuda-vars.h b/src/util-cuda-vars.h deleted file mode 100644 index 9c24a915ba..0000000000 --- a/src/util-cuda-vars.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#ifdef __SC_CUDA_SUPPORT__ - -#ifndef __UTIL_CUDA_VARS__H__ -#define __UTIL_CUDA_VARS__H__ - -#include "util-cuda-buffer.h" -#include "util-mpm.h" -#include "threads.h" - -typedef struct CudaThreadVars_ { - /* cb - CudaBuffer */ - CudaBufferData *cuda_ac_cb; - - MpmCtx *mpm_proto_other_ctx; - - MpmCtx *mpm_proto_tcp_ctx_ts; - MpmCtx *mpm_proto_udp_ctx_ts; - - MpmCtx *mpm_proto_tcp_ctx_tc; - MpmCtx *mpm_proto_udp_ctx_tc; - - uint16_t data_buffer_size_max_limit; - uint16_t data_buffer_size_min_limit; - - uint8_t mpm_is_cuda; -} CudaThreadVars; - -typedef struct CudaPacketVars_ { - uint8_t cuda_mpm_enabled; - uint8_t cuda_done; - uint16_t cuda_gpu_matches; - SCMutex cuda_mutex; - SCCondT cuda_cond; - uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1]; -} CudaPacketVars; - -void CudaVarsSetDeCtx(struct DetectEngineCtx_ *de_ctx); -int CudaThreadVarsInit(CudaThreadVars *ctv); - -#endif /* __UTIL_CUDA_VARS__H__ */ - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda.c b/src/util-cuda.c deleted file mode 100644 index 288631176b..0000000000 --- a/src/util-cuda.c +++ /dev/null @@ -1,5455 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
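The CudaPacketVars fields just above (cuda_mutex, cuda_cond, cuda_done, cuda_gpu_matches, cuda_results) describe a standard condition-variable handoff: the packet thread parks until the GPU batching side publishes its match results. A generic sketch of that handoff, using plain pthreads instead of the SCMutex/SCCondT wrappers, purely to document the pattern being deleted (struct and function names are illustrative):

    #include <pthread.h>
    #include <stdint.h>

    /* Generic result handoff, analogous to CudaPacketVars. */
    struct gpu_result {
        pthread_mutex_t mutex;
        pthread_cond_t  cond;
        uint8_t         done;       /* set by the producer when results are in */
        uint16_t        n_matches;
    };

    /* consumer: what the packet path effectively did before reading results */
    static uint16_t wait_for_gpu(struct gpu_result *r)
    {
        pthread_mutex_lock(&r->mutex);
        while (!r->done)
            pthread_cond_wait(&r->cond, &r->mutex);
        uint16_t n = r->n_matches;
        pthread_mutex_unlock(&r->mutex);
        return n;
    }

    /* producer: the GPU batching side publishing its results */
    static void publish_gpu(struct gpu_result *r, uint16_t n_matches)
    {
        pthread_mutex_lock(&r->mutex);
        r->n_matches = n_matches;
        r->done = 1;
        pthread_cond_signal(&r->cond);
        pthread_mutex_unlock(&r->mutex);
    }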
- */ - -/** - * \file - * - * \author Anoop Saldanha - * - * NVIDIA CUDA utility functions - last referenced Cuda Toolkit 4.2 - */ - -/* compile in, only if we have a CUDA enabled device on the machine, with the - * toolkit and the driver installed */ - -#include "suricata-common.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include -#include "util-cuda.h" -#include "util-error.h" -#include "util-debug.h" -#include "util-unittest.h" - -#define CASE_CODE(E) case E: return #E - -typedef enum SCCudaAPIS_ { - /* init api */ - SC_CUDA_CU_INIT, - - /* version management api */ - SC_CUDA_CU_DRIVER_GET_VERSION, - - /* device management api */ - SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY, - SC_CUDA_CU_DEVICE_GET, - SC_CUDA_CU_DEVICE_GET_ATTRIBUTE, - SC_CUDA_CU_DEVICE_GET_COUNT, - SC_CUDA_CU_DEVICE_GET_NAME, - SC_CUDA_CU_DEVICE_GET_PROPERTIES, - SC_CUDA_CU_DEVICE_TOTAL_MEM, - - /* context management api */ - SC_CUDA_CU_CTX_CREATE, - SC_CUDA_CU_CTX_DESTROY, - SC_CUDA_CU_CTX_GET_API_VERSION, - SC_CUDA_CU_CTX_GET_CACHE_CONFIG, - SC_CUDA_CU_CTX_GET_CURRENT, - SC_CUDA_CU_CTX_GET_DEVICE, - SC_CUDA_CU_CTX_GET_LIMIT, - SC_CUDA_CU_CTX_POP_CURRENT, - SC_CUDA_CU_CTX_PUSH_CURRENT, - SC_CUDA_CU_CTX_SET_CACHE_CONFIG, - SC_CUDA_CU_CTX_SET_CURRENT, - SC_CUDA_CU_CTX_SET_LIMIT, - SC_CUDA_CU_CTX_SYNCHRONIZE, - SC_CUDA_CU_CTX_ATTACH, - SC_CUDA_CU_CTX_DETACH, - - /* module management api */ - SC_CUDA_CU_MODULE_GET_FUNCTION, - SC_CUDA_CU_MODULE_GET_GLOBAL, - SC_CUDA_CU_MODULE_GET_SURF_REF, - SC_CUDA_CU_MODULE_GET_TEX_REF, - SC_CUDA_CU_MODULE_LOAD, - SC_CUDA_CU_MODULE_LOAD_DATA, - SC_CUDA_CU_MODULE_LOAD_DATA_EX, - SC_CUDA_CU_MODULE_LOAD_FAT_BINARY, - SC_CUDA_CU_MODULE_UNLOAD, - - /* memory management api */ - SC_CUDA_CU_ARRAY_3D_CREATE, - SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR, - SC_CUDA_CU_ARRAY_CREATE, - SC_CUDA_CU_ARRAY_DESTROY, - SC_CUDA_CU_ARRAY_GET_DESCRIPTOR, - SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID, - SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID, - SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE, - SC_CUDA_CU_IPC_GET_EVENT_HANDLE, - SC_CUDA_CU_IPC_GET_MEM_HANDLE, - SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE, - SC_CUDA_CU_IPC_OPEN_MEM_HANDLE, - SC_CUDA_CU_MEM_ALLOC, - SC_CUDA_CU_MEM_ALLOC_HOST, - SC_CUDA_CU_MEM_ALLOC_PITCH, - SC_CUDA_CU_MEMCPY, - SC_CUDA_CU_MEMCPY_2D, - SC_CUDA_CU_MEMCPY_2D_ASYNC, - SC_CUDA_CU_MEMCPY_2D_UNALIGNED, - SC_CUDA_CU_MEMCPY_3D, - SC_CUDA_CU_MEMCPY_3D_ASYNC, - SC_CUDA_CU_MEMCPY_3D_PEER, - SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC, - SC_CUDA_CU_MEMCPY_ASYNC, - SC_CUDA_CU_MEMCPY_A_TO_A, - SC_CUDA_CU_MEMCPY_A_TO_D, - SC_CUDA_CU_MEMCPY_A_TO_H, - SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC, - SC_CUDA_CU_MEMCPY_D_TO_A, - SC_CUDA_CU_MEMCPY_D_TO_D, - SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC, - SC_CUDA_CU_MEMCPY_D_TO_H, - SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC, - SC_CUDA_CU_MEMCPY_H_TO_A, - SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC, - SC_CUDA_CU_MEMCPY_H_TO_D, - SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC, - SC_CUDA_CU_MEMCPY_PEER, - SC_CUDA_CU_MEMCPY_PEER_ASYNC, - SC_CUDA_CU_MEM_FREE, - SC_CUDA_CU_MEM_FREE_HOST, - SC_CUDA_CU_MEM_GET_ADDRESS_RANGE, - SC_CUDA_CU_MEM_GET_INFO, - SC_CUDA_CU_MEM_HOST_ALLOC, - SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER, - SC_CUDA_CU_MEM_HOST_GET_FLAGS, - SC_CUDA_CU_MEM_HOST_REGISTER, - SC_CUDA_CU_MEM_HOST_UNREGISTER, - SC_CUDA_CU_MEMSET_D16, - SC_CUDA_CU_MEMSET_D16_ASYNC, - SC_CUDA_CU_MEMSET_D2_D16, - SC_CUDA_CU_MEMSET_D2_D16_ASYNC, - SC_CUDA_CU_MEMSET_D2_D32, - SC_CUDA_CU_MEMSET_D2_D32_ASYNC, - SC_CUDA_CU_MEMSET_D2_D8, - SC_CUDA_CU_MEMSET_D2_D8_ASYNC, - SC_CUDA_CU_MEMSET_D32, - SC_CUDA_CU_MEMSET_D32_ASYNC, - SC_CUDA_CU_MEMSET_D8, - SC_CUDA_CU_MEMSET_D8_ASYNC, - - /* unified addresssing */ - 
SC_CUDA_CU_POINTER_GET_ATTRIBUTE, - - /* stream management api */ - SC_CUDA_CU_STREAM_CREATE, - SC_CUDA_CU_STREAM_DESTROY, - SC_CUDA_CU_STREAM_QUERY, - SC_CUDA_CU_STREAM_SYNCHRONIZE, - SC_CUDA_CU_STREAM_WAIT_EVENT, - - /* event management api */ - SC_CUDA_CU_EVENT_CREATE, - SC_CUDA_CU_EVENT_DESTROY, - SC_CUDA_CU_EVENT_ELAPSED_TIME, - SC_CUDA_CU_EVENT_QUERY, - SC_CUDA_CU_EVENT_RECORD, - SC_CUDA_CU_EVENT_SYNCHRONIZE, - - /* execution control api */ - SC_CUDA_CU_FUNC_GET_ATTRIBUTE, - SC_CUDA_CU_FUNC_SET_CACHE_CONFIG, - SC_CUDA_CU_LAUNCH_KERNEL, - SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE, - SC_CUDA_CU_FUNC_SET_SHARED_SIZE, - SC_CUDA_CU_LAUNCH, - SC_CUDA_CU_LAUNCH_GRID, - SC_CUDA_CU_LAUNCH_GRID_ASYNC, - SC_CUDA_CU_PARAM_SETF, - SC_CUDA_CU_PARAM_SETI, - SC_CUDA_CU_PARAM_SET_SIZE, - SC_CUDA_CU_PARAM_SET_TEX_REF, - SC_CUDA_CU_PARAM_SETV, - - /* texture reference api */ - SC_CUDA_CU_TEX_REF_CREATE, - SC_CUDA_CU_TEX_REF_DESTROY, - SC_CUDA_CU_TEX_REF_GET_ADDRESS, - SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE, - SC_CUDA_CU_TEX_REF_GET_ARRAY, - SC_CUDA_CU_TEX_REF_GET_FILTER_MODE, - SC_CUDA_CU_TEX_REF_GET_FLAGS, - SC_CUDA_CU_TEX_REF_GET_FORMAT, - SC_CUDA_CU_TEX_REF_SET_ADDRESS, - SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D, - SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE, - SC_CUDA_CU_TEX_REF_SET_ARRAY, - SC_CUDA_CU_TEX_REF_SET_FILTER_MODE, - SC_CUDA_CU_TEX_REF_SET_FLAGS, - SC_CUDA_CU_TEX_REF_SET_FORMAT, -} SCCudaAPIS; - -SCEnumCharMap sc_cuda_api_names_string_map[] = { - /* init api */ - { "cuInit", SC_CUDA_CU_INIT }, - - /* version management api */ - { "cuDriverGetVersion", SC_CUDA_CU_DRIVER_GET_VERSION }, - - /* device management api */ - { "cuDeviceComputeCapability", SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY }, - { "cuDeviceGet", SC_CUDA_CU_DEVICE_GET }, - { "cuDeviceGetAttribute", SC_CUDA_CU_DEVICE_GET_ATTRIBUTE }, - { "cuDeviceGetCount", SC_CUDA_CU_DEVICE_GET_COUNT }, - { "cuDeviceGetName", SC_CUDA_CU_DEVICE_GET_NAME }, - { "cuDeviceGetProperties", SC_CUDA_CU_DEVICE_GET_PROPERTIES }, - { "cuDeviceTotalMem", SC_CUDA_CU_DEVICE_TOTAL_MEM }, - - /* context management api */ - { "cuCtxCreate", SC_CUDA_CU_CTX_CREATE }, - { "cuCtxDestroy", SC_CUDA_CU_CTX_DESTROY }, - { "cuCtxGetApiVersion", SC_CUDA_CU_CTX_GET_API_VERSION }, - { "cuCtxGetCacheConfig", SC_CUDA_CU_CTX_GET_CACHE_CONFIG }, - { "cuCtxGetCurrent", SC_CUDA_CU_CTX_GET_CURRENT }, - { "cuCtxGetDevice", SC_CUDA_CU_CTX_GET_DEVICE }, - { "cuCtxGetLimit", SC_CUDA_CU_CTX_GET_LIMIT }, - { "cuCtxPopCurrent", SC_CUDA_CU_CTX_POP_CURRENT }, - { "cuCtxPushCurrent", SC_CUDA_CU_CTX_PUSH_CURRENT }, - { "cuCtxSetCacheConfig", SC_CUDA_CU_CTX_SET_CACHE_CONFIG }, - { "cuCtxSetCurrent", SC_CUDA_CU_CTX_SET_CURRENT }, - { "cuCtxSetLimit", SC_CUDA_CU_CTX_SET_LIMIT }, - { "cuCtxSynchronize", SC_CUDA_CU_CTX_SYNCHRONIZE }, - { "cuCtxAttach", SC_CUDA_CU_CTX_ATTACH }, - { "cuCtxDetach", SC_CUDA_CU_CTX_DETACH }, - - /* module management api */ - { "cuModuleGetFunction", SC_CUDA_CU_MODULE_GET_FUNCTION }, - { "cuModuleGetGlobal", SC_CUDA_CU_MODULE_GET_GLOBAL }, - { "cuModuleGetSurfRef", SC_CUDA_CU_MODULE_GET_SURF_REF }, - { "cuModuleGetTexRef", SC_CUDA_CU_MODULE_GET_TEX_REF }, - { "cuModuleLoad", SC_CUDA_CU_MODULE_LOAD }, - { "cuModuleLoadData", SC_CUDA_CU_MODULE_LOAD_DATA }, - { "cuModuleLoadDataEx", SC_CUDA_CU_MODULE_LOAD_DATA_EX }, - { "cuModuleLoadFatBinary", SC_CUDA_CU_MODULE_LOAD_FAT_BINARY }, - { "cuModuleUnload", SC_CUDA_CU_MODULE_UNLOAD }, - - /* memory management api */ - { "cuArray3DCreate", SC_CUDA_CU_ARRAY_3D_CREATE }, - { "cuArray3DGetDescriptor", SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR }, - { 
"cuArrayCreate", SC_CUDA_CU_ARRAY_CREATE }, - { "cuArrayDestroy", SC_CUDA_CU_ARRAY_DESTROY }, - { "cuArrayGetDescriptor", SC_CUDA_CU_ARRAY_GET_DESCRIPTOR }, - { "cuDeviceGetByPCIBusId", SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID }, - { "cuDeviceGetPCIBusId", SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID }, - { "cuIpcCloseMemHandle", SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE }, - { "cuIpcGetEventHandle", SC_CUDA_CU_IPC_GET_MEM_HANDLE }, - { "cuIpcGetMemHandle", SC_CUDA_CU_IPC_GET_MEM_HANDLE }, - { "cuIpcOpenEventHandle", SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE }, - { "cuIpcOpenMemHandle", SC_CUDA_CU_IPC_OPEN_MEM_HANDLE }, - { "cuMemAlloc", SC_CUDA_CU_MEM_ALLOC }, - { "cuMemAllocHost", SC_CUDA_CU_MEM_ALLOC_HOST }, - { "cuMemAllocPitch", SC_CUDA_CU_MEM_ALLOC_PITCH }, - { "cuMemcpy", SC_CUDA_CU_MEMCPY }, - { "cuMemcpy2D", SC_CUDA_CU_MEMCPY_2D }, - { "cuMemcpy2DAsync", SC_CUDA_CU_MEMCPY_2D_ASYNC }, - { "cuMemcpy2DUnaligned", SC_CUDA_CU_MEMCPY_2D_UNALIGNED }, - { "cuMemcpy3D", SC_CUDA_CU_MEMCPY_3D }, - { "cuMemcpy3DAsync", SC_CUDA_CU_MEMCPY_3D_ASYNC }, - { "cuMemcpy3DPeer", SC_CUDA_CU_MEMCPY_3D_PEER }, - { "cuMemcpy3DPeerAsync", SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC }, - { "cuMemcpyAsync", SC_CUDA_CU_MEMCPY_ASYNC }, - { "cuMemcpyAtoA", SC_CUDA_CU_MEMCPY_A_TO_A }, - { "cuMemcpyAtoD", SC_CUDA_CU_MEMCPY_A_TO_D }, - { "cuMemcpyAtoH", SC_CUDA_CU_MEMCPY_A_TO_H }, - { "cuMemcpyAtoHAsync", SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC }, - { "cuMemcpyDtoA", SC_CUDA_CU_MEMCPY_D_TO_A }, - { "cuMemcpyDtoD", SC_CUDA_CU_MEMCPY_D_TO_D }, - { "cuMemcpyDtoDAsync", SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC }, - { "cuMemcpyDtoH", SC_CUDA_CU_MEMCPY_D_TO_H }, - { "cuMemcpyDtoHAsync", SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC }, - { "cuMemcpyHtoA", SC_CUDA_CU_MEMCPY_H_TO_A }, - { "cuMemcpyHtoAAsync", SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC }, - { "cuMemcpyHtoD", SC_CUDA_CU_MEMCPY_H_TO_D }, - { "cuMemcpyHtoDAsync", SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC }, - { "cuMemcpyPeer", SC_CUDA_CU_MEMCPY_PEER }, - { "cuMemcpyPeerAsync", SC_CUDA_CU_MEMCPY_PEER_ASYNC }, - { "cuMemFree", SC_CUDA_CU_MEM_FREE }, - { "cuMemFreeHost", SC_CUDA_CU_MEM_FREE_HOST }, - { "cuMemGetAddressRange", SC_CUDA_CU_MEM_GET_ADDRESS_RANGE }, - { "cuMemGetInfo", SC_CUDA_CU_MEM_GET_INFO }, - { "cuMemHostAlloc", SC_CUDA_CU_MEM_HOST_ALLOC }, - { "cuMemHostGetDevicePointer", SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER }, - { "cuMemHostGetFlags", SC_CUDA_CU_MEM_HOST_GET_FLAGS }, - { "cuMemHostRegister", SC_CUDA_CU_MEM_HOST_REGISTER }, - { "cuMemHostUnregister", SC_CUDA_CU_MEM_HOST_UNREGISTER }, - { "cuMemsetD16", SC_CUDA_CU_MEMSET_D16 }, - { "cuMemsetD16Async", SC_CUDA_CU_MEMSET_D16_ASYNC }, - { "cuMemsetD2D16", SC_CUDA_CU_MEMSET_D2_D16 }, - { "cuMemsetD2D16Async", SC_CUDA_CU_MEMSET_D2_D16_ASYNC }, - { "cuMemsetD2D32", SC_CUDA_CU_MEMSET_D2_D32 }, - { "cuMemsetD2D32Async", SC_CUDA_CU_MEMSET_D2_D32_ASYNC }, - { "cuMemsetD2D8", SC_CUDA_CU_MEMSET_D2_D8 }, - { "cuMemsetD2D8Async", SC_CUDA_CU_MEMSET_D2_D8_ASYNC }, - { "cuMemsetD32", SC_CUDA_CU_MEMSET_D32 }, - { "cuMemsetD32Async", SC_CUDA_CU_MEMSET_D32_ASYNC }, - { "cuMemsetD8", SC_CUDA_CU_MEMSET_D8 }, - { "cuMemsetD8Async", SC_CUDA_CU_MEMSET_D8_ASYNC }, - - /* unified addressing */ - { "cuPointerGetAttribute", SC_CUDA_CU_POINTER_GET_ATTRIBUTE }, - - /* stream management api */ - { "cuStreamCreate", SC_CUDA_CU_STREAM_CREATE }, - { "cuStreamDestroy", SC_CUDA_CU_STREAM_DESTROY }, - { "cuStreamQuery", SC_CUDA_CU_STREAM_QUERY }, - { "cuStreamSynchronize", SC_CUDA_CU_STREAM_SYNCHRONIZE }, - { "cuStreamWaitEvent", SC_CUDA_CU_STREAM_WAIT_EVENT }, - - /* event management api */ - { "cuEventCreate", 
SC_CUDA_CU_EVENT_CREATE }, - { "cuEventDestroy", SC_CUDA_CU_EVENT_DESTROY }, - { "cuEventElapseTime", SC_CUDA_CU_EVENT_ELAPSED_TIME }, - { "cuEventQuery", SC_CUDA_CU_EVENT_QUERY }, - { "cuEventRecord", SC_CUDA_CU_EVENT_RECORD }, - { "cuEventSynchronize", SC_CUDA_CU_EVENT_SYNCHRONIZE }, - - /* execution control api */ - { "cuFuncGetAttribute", SC_CUDA_CU_FUNC_GET_ATTRIBUTE }, - { "cuFuncSetCacheConfig", SC_CUDA_CU_FUNC_SET_CACHE_CONFIG }, - { "cuLaunchKernel", SC_CUDA_CU_LAUNCH_KERNEL }, - { "cuFuncSetBlockShape", SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE }, - { "cuFuncSetSharedSize", SC_CUDA_CU_FUNC_SET_SHARED_SIZE }, - { "cuLaunch", SC_CUDA_CU_LAUNCH }, - { "cuLaunchGrid", SC_CUDA_CU_LAUNCH_GRID }, - { "cuLaunchGridAsync", SC_CUDA_CU_LAUNCH_GRID_ASYNC }, - { "cuParamSetf", SC_CUDA_CU_PARAM_SETF }, - { "cuParamSeti", SC_CUDA_CU_PARAM_SETI }, - { "cuParamSetSize", SC_CUDA_CU_PARAM_SET_SIZE }, - { "cuSetTexRef", SC_CUDA_CU_PARAM_SET_TEX_REF }, - { "cuSetv", SC_CUDA_CU_PARAM_SETV }, - - /* texture reference api */ - { "cuTexRefCreate", SC_CUDA_CU_TEX_REF_CREATE}, - { "cuTexRefDestroy", SC_CUDA_CU_TEX_REF_DESTROY}, - { "cuTexRefGetAddress", SC_CUDA_CU_TEX_REF_GET_ADDRESS}, - { "cuTexRefGetAddressMode", SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE}, - { "cuTexRefGetArray", SC_CUDA_CU_TEX_REF_GET_ARRAY}, - { "cuTexRefGetFilterMode", SC_CUDA_CU_TEX_REF_GET_FILTER_MODE}, - { "cuTexRefGetFlags", SC_CUDA_CU_TEX_REF_GET_FLAGS}, - { "cuTexRefGetFormat", SC_CUDA_CU_TEX_REF_GET_FORMAT}, - { "cuTexRefSetAddress", SC_CUDA_CU_TEX_REF_SET_ADDRESS}, - { "cuTexRefSetAddress2D", SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D}, - { "cuTexRefSetAddressMode", SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE}, - { "cuTexRefSetArray", SC_CUDA_CU_TEX_REF_SET_ARRAY}, - { "cuTexRefSetFilterMode", SC_CUDA_CU_TEX_REF_SET_FILTER_MODE}, - { "cuTexRefSetFlags", SC_CUDA_CU_TEX_REF_SET_FLAGS}, - { "cuTexRefSetFormat", SC_CUDA_CU_TEX_REF_SET_FORMAT}, - - { NULL, -1 }, -}; - -static SCCudaDevices *devices = NULL; - -/*****************************Error_Handling_API*******************************/ - -/** - * \internal - * \brief Maps the error enums from SCCudaAPIS to strings using the preprocessor - * #ENUM_VALUE. This is mainly needed for logging purposes to log the - * error codes. - * - * \param err The error_code for which the string has to be returned. - * - * \retval The string equivalent of the error code. 
- */ -static const char *SCCudaGetErrorCodeInString(int err) -{ - switch (err) { - CASE_CODE(CUDA_SUCCESS); - CASE_CODE(CUDA_ERROR_INVALID_VALUE); - CASE_CODE(CUDA_ERROR_OUT_OF_MEMORY); - CASE_CODE(CUDA_ERROR_NOT_INITIALIZED); - CASE_CODE(CUDA_ERROR_DEINITIALIZED); - CASE_CODE(CUDA_ERROR_PROFILER_DISABLED); - CASE_CODE(CUDA_ERROR_PROFILER_NOT_INITIALIZED); - CASE_CODE(CUDA_ERROR_PROFILER_ALREADY_STARTED); - CASE_CODE(CUDA_ERROR_PROFILER_ALREADY_STOPPED); - CASE_CODE(CUDA_ERROR_NO_DEVICE); - CASE_CODE(CUDA_ERROR_INVALID_DEVICE); - CASE_CODE(CUDA_ERROR_INVALID_IMAGE); - CASE_CODE(CUDA_ERROR_INVALID_CONTEXT); - /* deprecated error code as of 3.2 */ - CASE_CODE(CUDA_ERROR_CONTEXT_ALREADY_CURRENT); - CASE_CODE(CUDA_ERROR_MAP_FAILED); - CASE_CODE(CUDA_ERROR_UNMAP_FAILED); - CASE_CODE(CUDA_ERROR_ARRAY_IS_MAPPED); - CASE_CODE(CUDA_ERROR_ALREADY_MAPPED); - CASE_CODE(CUDA_ERROR_NO_BINARY_FOR_GPU); - CASE_CODE(CUDA_ERROR_ALREADY_ACQUIRED); - CASE_CODE(CUDA_ERROR_NOT_MAPPED); - CASE_CODE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY); - CASE_CODE(CUDA_ERROR_NOT_MAPPED_AS_POINTER); - CASE_CODE(CUDA_ERROR_ECC_UNCORRECTABLE); - CASE_CODE(CUDA_ERROR_UNSUPPORTED_LIMIT); - CASE_CODE(CUDA_ERROR_CONTEXT_ALREADY_IN_USE); - CASE_CODE(CUDA_ERROR_INVALID_SOURCE); - CASE_CODE(CUDA_ERROR_FILE_NOT_FOUND); - CASE_CODE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND); - CASE_CODE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED); - CASE_CODE(CUDA_ERROR_OPERATING_SYSTEM); - CASE_CODE(CUDA_ERROR_INVALID_HANDLE); - CASE_CODE(CUDA_ERROR_NOT_FOUND); - CASE_CODE(CUDA_ERROR_NOT_READY); - CASE_CODE(CUDA_ERROR_LAUNCH_FAILED); - CASE_CODE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); - CASE_CODE(CUDA_ERROR_LAUNCH_TIMEOUT); - CASE_CODE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING); - CASE_CODE(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED); - CASE_CODE(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED); - CASE_CODE(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE); - CASE_CODE(CUDA_ERROR_CONTEXT_IS_DESTROYED); - CASE_CODE(CUDA_ERROR_ASSERT); - CASE_CODE(CUDA_ERROR_TOO_MANY_PEERS); - CASE_CODE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED); - CASE_CODE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED); - CASE_CODE(CUDA_ERROR_UNKNOWN); - default: - return "CUDA_UNKNOWN_ERROR_CODE"; - } -} - -/** - * \internal - * \brief A generic function that handles the return values from the CUDA driver - * API. - * - * \param result The result from the CUDA driver API call. - * \param api_type An enum value SCCudaAPIS corresponing to the API for which the - * result was returned. The enum is needed to map the api type to - * a string for logging purposes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -static int SCCudaHandleRetValue(CUresult result, SCCudaAPIS api_type) -{ - if (result == CUDA_SUCCESS) { - SCLogDebug("%s executed successfully", - SCMapEnumValueToName(api_type, sc_cuda_api_names_string_map)); - return 0; - } else { - SCLogError(SC_ERR_CUDA_ERROR, "%s failed. Returned errocode - %s", - SCMapEnumValueToName(api_type, sc_cuda_api_names_string_map), - SCCudaGetErrorCodeInString(result)); - return -1; - } -} - -/*****************************Cuda_Initialization_API**************************/ - -/** - * \internal - * \brief Inits the cuda driver API. - * - * \param flags Currently should be 0. - * - * \retval 0 On success. - * \retval -1 On failure. 
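Two small notes on the error handling block above. First, SCCudaGetErrorCodeInString() relies on the CASE_CODE stringification macro defined earlier in this file; the idiom is worth spelling out once on a toy enum, since the real switch is buried in a wall of CUDA error codes:

    #include <stdio.h>

    /* Same stringification idiom as SCCudaGetErrorCodeInString(), on a toy
     * enum so it compiles without the CUDA headers. */
    enum toy_status { TOY_OK, TOY_EAGAIN, TOY_EFATAL };

    #define CASE_CODE(E) case E: return #E

    static const char *toy_status_name(enum toy_status s)
    {
        switch (s) {
            CASE_CODE(TOY_OK);
            CASE_CODE(TOY_EAGAIN);
            CASE_CODE(TOY_EFATAL);
            default: return "TOY_UNKNOWN";
        }
    }

    int main(void)
    {
        printf("%s\n", toy_status_name(TOY_EAGAIN));    /* prints TOY_EAGAIN */
        return 0;
    }

Second, later driver releases (CUDA 6.0 and newer, if memory serves) added cuGetErrorName()/cuGetErrorString(), which is one reason a hand-maintained table like this one aged poorly against the Toolkit 4.2 baseline noted at the top of the file.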
- */ -int SCCudaInit(unsigned int flags) -{ - CUresult result = cuInit(flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_INIT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Version_Management_API***************************/ - -/** - * \brief Returns in *driver_version the version number of the installed CUDA - * driver. This function automatically returns CUDA_ERROR_INVALID_VALUE - * if the driver_version argument is NULL. - * - * \param driver_version Returns the CUDA driver version. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDriverGetVersion(int *driver_version) -{ - CUresult result = 0; - - if (driver_version == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "driver_version NULL"); - goto error; - } - - result = cuDriverGetVersion(driver_version); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DRIVER_GET_VERSION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Device_Management_API****************************/ - -/** - * \internal - * \brief Returns the major and the minor revision numbers that define the - * compute capability for the device that is sent as the argument. - * - * \param major Pointer to an integer, that will be updated with the major revision. - * \param minor Pointer to an integer, that will be updated with the minor revision. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev) -{ - CUresult result = 0; - - if (major == NULL || minor == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "major is NULL or minor is NULL"); - goto error; - } - - result = cuDeviceComputeCapability(major, minor, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns a device handle given an ordinal in the range - * [0, cuDeviceGetCount() - 1]. - * - * \param device Pointer to a CUDevice instance that will be updated with the - * device handle. - * \param ordinal An index in the range [0, cuDeviceGetCount() - 1]. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGet(CUdevice *device, int ordinal) -{ - CUresult result = 0; - - if (device == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "device NULL"); - goto error; - } - - result = cuDeviceGet(device, ordinal); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the various attributes for the device that is sent as the arg. 
- * - * The supported attributes are: - * - * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads - * per block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared mem-ory available to a thread block in bytes; this amount - * is shared by all thread blocks simultaneously resident on a - * multiprocessor; - * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device - * for __constant_-_ variables in a CUDA C kernel in bytes; - * CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; - * CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the - * memory copy functions that involve memory regions allocated - * through cuMemAllocPitch(); - * CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers avail-able to a thread block; this number is shared by - * all thread blocks simultaneously resident on a multiprocessor; - * CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Peak clock frequency in kilohertz; - * CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture - * base addresses aligned to textureAlign bytes do not need an offset - * applied to texture fetches; - * CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not; - * CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device; - * CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not; - * CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not; - * CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not; - * CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently - * in. Available modes are as follows: - * - CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted - * and can have multiple CUDA contexts present at a single time. - * - CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have - * only one CUDA con-text present on it at a time. - * - CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is - * prohibited from creating new CUDA contexts. - * - * \param pi Pointer to an interger instance that will be updated with the - * attribute value. - * \param attrib Device attribute to query. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, - CUdevice dev) -{ - CUresult result = 0; - - if (pi == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "prop is NULL"); - goto error; - } - - result = cuDeviceGetAttribute(pi, attrib, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_ATTRIBUTE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Gets the total no of devices with compute capability greater than or - * equal to 1.0 that are available for execution. 
- * - * \param count Pointer to an integer that will be updated with the device count. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetCount(int *count) -{ - CUresult result = 0; - - if (count == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "count NULL"); - goto error; - } - - result = cuDeviceGetCount(count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_COUNT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the device name, given the device handle. - * - * \param name Pointer to a char buffer which will be updated with the device name. - * \param len Length of the above buffer. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetName(char *name, int len, CUdevice dev) -{ - CUresult result = 0; - - if (name == NULL || len == 0) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "name is NULL or len is 0"); - goto error; - } - - result = cuDeviceGetName(name, len, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_NAME) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the properties of the device. The CUdevprop structure is - * defined as - * - * typedef struct CUdevprop_st { - * int maxThreadsPerBlock; - * int maxThreadsDim[3]; - * int maxGridSize[3]; - * int sharedMemPerBlock; - * int totalConstantMemory; - * int SIMDWidth; - * int memPitch; - * int regsPerBlock; - * int clockRate; - * int textureAlign - * } CUdevprop; - * - * \param prop Pointer to a CUdevprop instance that holds the device properties. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev) -{ - CUresult result = 0; - - if (prop == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "prop is NULL"); - goto error; - } - - result = cuDeviceGetProperties(prop, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PROPERTIES) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Returns the total amount of memory availabe on the device which - * is sent as the argument. - * - * \param bytes Pointer to an unsigned int instance, that will be updated with - * total memory for the device. - * \param dev The device handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaDeviceTotalMem(size_t *bytes, CUdevice dev) -{ - CUresult result = 0; - - if (bytes == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "bytes is NULL"); - goto error; - } - - result = cuDeviceTotalMem(bytes, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_TOTAL_MEM) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \internal - * \brief Creates and returns a new instance of SCCudaDevice. - * - * \retval device Pointer to the new instance of SCCudaDevice. - */ -static SCCudaDevice *SCCudaAllocSCCudaDevice(void) -{ - SCCudaDevice *device = SCMalloc(sizeof(SCCudaDevice)); - if (unlikely(device == NULL)) - return NULL; - memset(device, 0 , sizeof(SCCudaDevice)); - - return device; -} - -/** - * \internal - * \brief Frees an instance of SCCudaDevice. - * - * \param device Pointer to the an instance of SCCudaDevice to be freed. 
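Taken together, the wrappers defined so far (SCCudaInit(), SCCudaDeviceGetCount(), SCCudaDeviceGet(), SCCudaDeviceGetName(), SCCudaDeviceTotalMem()) were enough for a minimal device inventory. A sketch of such a probe, assuming the now-removed util-cuda.h is still on the include path; it mainly shows how the 0/-1 return convention composes:

    #include <stdio.h>
    #include "util-cuda.h"    /* removed by this commit; sketch assumes the old tree */

    static int ProbeCudaDevices(void)
    {
        int count = 0;

        if (SCCudaInit(0) == -1 || SCCudaDeviceGetCount(&count) == -1)
            return -1;

        for (int i = 0; i < count; i++) {
            CUdevice dev;
            char name[SC_CUDA_DEVICE_NAME_MAX_LEN];
            size_t bytes = 0;

            if (SCCudaDeviceGet(&dev, i) == -1 ||
                SCCudaDeviceGetName(name, SC_CUDA_DEVICE_NAME_MAX_LEN, dev) == -1 ||
                SCCudaDeviceTotalMem(&bytes, dev) == -1)
                return -1;

            printf("device %d: %s (%zu bytes of global memory)\n", i, name, bytes);
        }
        return 0;
    }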
- */ -static void SCCudaDeAllocSCCudaDevice(SCCudaDevice *device) -{ - SCFree(device); - - return; -} - -/** - * \internal - * \brief Creates and returns a new instance of SCCudaDevices. - * - * \retval devices Pointer to the new instance of SCCudaDevices. - */ -static SCCudaDevices *SCCudaAllocSCCudaDevices(void) -{ - SCCudaDevices *devices = SCMalloc(sizeof(SCCudaDevices)); - if (unlikely(devices == NULL)) - return NULL; - memset(devices, 0 , sizeof(SCCudaDevices)); - - return devices; -} - -/** - * \internal - * \brief Frees an instance of SCCudaDevices. - * - * \param device Pointer to the an instance of SCCudaDevices to be freed. - */ -static void SCCudaDeAllocSCCudaDevices(SCCudaDevices *devices) -{ - int i = 0; - - if (devices == NULL) - return; - - if (devices->devices != NULL) { - for (i = 0; i < devices->count; i++) - SCCudaDeAllocSCCudaDevice(devices->devices[i]); - - SCFree(devices->devices); - } - - SCFree(devices); - - return; -} - -/** - * \brief Retrieves all the devices and all the information corresponding to - * the devices on the CUDA device available on this system and returns - * a SCCudaDevices instances which holds all this information. - * - * \retval devices Pointer to a SCCudaDevices instance that holds information - * for all the CUDA devices on the system. - */ -static SCCudaDevices *SCCudaGetDevices(void) -{ - SCCudaDevices *devices = SCCudaAllocSCCudaDevices(); - int i = 0; - - if (SCCudaDeviceGetCount(&devices->count) == -1) - goto error; - - devices->devices = SCMalloc(devices->count * sizeof(SCCudaDevice *)); - if (devices->devices == NULL) - goto error; - - /* update the device properties */ - for (i = 0; i < devices->count; i++) { - devices->devices[i] = SCCudaAllocSCCudaDevice(); - - if (SCCudaDeviceGet(&devices->devices[i]->device, i) == -1) - goto error; - - if (SCCudaDeviceComputeCapability(&devices->devices[i]->major_rev, - &devices->devices[i]->minor_rev, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetName(devices->devices[i]->name, - SC_CUDA_DEVICE_NAME_MAX_LEN, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceTotalMem(&devices->devices[i]->bytes, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetProperties(&devices->devices[i]->prop, - devices->devices[i]->device) == -1) { - goto error; - } - - /* retrieve the attributes */ - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_threads_per_block, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_x, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_y, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_z, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_x, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_y, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_z, - 
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_shared_memory_per_block, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_total_constant_memory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_warp_size, - CU_DEVICE_ATTRIBUTE_WARP_SIZE, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_pitch, - CU_DEVICE_ATTRIBUTE_MAX_PITCH, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_registers_per_block, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_clock_rate, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_texture_alignment, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_gpu_overlap, - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_multiprocessor_count, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_kernel_exec_timeout, - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_integrated, - CU_DEVICE_ATTRIBUTE_INTEGRATED, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_can_map_host_memory, - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_compute_mode, - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, - devices->devices[i]->device) == -1) { - goto error; - } - } - -#ifdef DEBUG - SCCudaPrintDeviceList(devices); -#endif - - return devices; - - error: - SCCudaDeAllocSCCudaDevices(devices); - return NULL; -} - -/** - * \brief Prints the information for all the devices for this CUDA platform, - * supplied inside the argument. - * - * \param devices Pointer to a SCCudaDevices instance that holds information on - * the devices. - */ -void SCCudaPrintDeviceList(SCCudaDevices *devices) -{ - int i = 0; - - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. 
" - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return; - } - - SCLogDebug("Printing device info for this CUDA context"); - SCLogDebug("No of devices: %d", devices->count); - - for (i = 0; i < devices->count; i++) { - SCLogDebug("Device ID: %d", devices->devices[i]->device); - SCLogDebug("Device Name: %s", devices->devices[i]->name); - SCLogDebug("Device Major Revision: %d", devices->devices[i]->major_rev); - SCLogDebug("Device Minor Revision: %d", devices->devices[i]->minor_rev); - - /* Cudevprop */ - SCLogDebug("Device Max Threads Per Block: %d", - devices->devices[i]->prop.maxThreadsPerBlock); - SCLogDebug("Device Max Threads Dim: [%d, %d, %d]", - devices->devices[i]->prop.maxThreadsDim[0], - devices->devices[i]->prop.maxThreadsDim[1], - devices->devices[i]->prop.maxThreadsDim[2]); - SCLogDebug("Device Max Grid Size: [%d, %d, %d]", - devices->devices[i]->prop.maxGridSize[0], - devices->devices[i]->prop.maxGridSize[1], - devices->devices[i]->prop.maxGridSize[2]); - SCLogDebug("Device Shared Memory Per Block: %d", - devices->devices[i]->prop.sharedMemPerBlock); - SCLogDebug("Device Total Constant Memory: %d", - devices->devices[i]->prop.totalConstantMemory); - SCLogDebug("Device SIMD Width(Warp Size): %d", - devices->devices[i]->prop.SIMDWidth); - SCLogDebug("Device Maximum Mem Pitch: %d", devices->devices[i]->prop.memPitch); - SCLogDebug("Device Total Registers Available Per Block: %d", - devices->devices[i]->prop.regsPerBlock); - SCLogDebug("Device Clock Frequency: %d", devices->devices[i]->prop.clockRate); - SCLogDebug("Device Texture Alignment Requirement: %d", - devices->devices[i]->prop.textureAlign); - - - /* device attributes */ - SCLogDebug("Device Max Threads Per Block: %d", - devices->devices[i]->attr_max_threads_per_block); - SCLogDebug("Device Max Block Dim X: %d", - devices->devices[i]->attr_max_block_dim_x); - SCLogDebug("Device Max Block Dim Y: %d", - devices->devices[i]->attr_max_block_dim_y); - SCLogDebug("Device Max Block Dim Z: %d", - devices->devices[i]->attr_max_block_dim_z); - SCLogDebug("Device Max Grid Dim X: %d", - devices->devices[i]->attr_max_grid_dim_x); - SCLogDebug("Device Max Grid Dim Y: %d", - devices->devices[i]->attr_max_grid_dim_y); - SCLogDebug("Device Max Grid Dim Z: %d", - devices->devices[i]->attr_max_grid_dim_z); - SCLogDebug("Device Max Shared Memory Per Block: %d", - devices->devices[i]->attr_max_shared_memory_per_block); - SCLogDebug("Device Total Constant Memory: %d", - devices->devices[i]->attr_total_constant_memory); - SCLogDebug("Device Warp Size: %d", devices->devices[i]->attr_warp_size); - SCLogDebug("Device Max Pitch: %d", devices->devices[i]->attr_max_pitch); - SCLogDebug("Device Max Registers Per Block: %d", - devices->devices[i]->attr_max_registers_per_block); - SCLogDebug("Device Clock Rate: %d", devices->devices[i]->attr_clock_rate); - SCLogDebug("Device Texture Alignement: %d", - devices->devices[i]->attr_texture_alignment); - SCLogDebug("Device GPU Overlap: %s", - (devices->devices[i]->attr_gpu_overlap == 1) ? "Yes": "No"); - SCLogDebug("Device Multiprocessor Count: %d", - devices->devices[i]->attr_multiprocessor_count); - SCLogDebug("Device Kernel Exec Timeout: %s", - (devices->devices[i]->attr_kernel_exec_timeout) ? "Yes": "No"); - SCLogDebug("Device Integrated With Memory Subsystem: %s", - (devices->devices[i]->attr_integrated) ? 
"Yes": "No"); - SCLogDebug("Device Can Map Host Memory: %s", - (devices->devices[i]->attr_can_map_host_memory) ? "Yes": "No"); - if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_DEFAULT) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_DEFAULT"); - else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_EXCLUSIVE) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_EXCLUSIVE"); - else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_PROHIBITED) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_PROHIBITED"); - } - - return; -} - -/** - * \brief Prints some basic information for the default device(the first devie) - * we will be using on this cuda platform for use by our engine. This - * function is basically to be used to print some minimal information to - * the user at engine startup. - * - * \param devices Pointer to a SCCudaDevices instance that holds information on - * the devices. - */ -void SCCudaPrintBasicDeviceInfo(SCCudaDevices *devices) -{ - int i = 0; - - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return; - } - - for (i = 0; i < devices->count; i++) { - SCLogInfo("GPU Device %d: %s, %d Multiprocessors, %dMHz, CUDA Compute " - "Capability %d.%d", i + 1, - devices->devices[i]->name, - devices->devices[i]->attr_multiprocessor_count, - devices->devices[i]->attr_clock_rate/1000, - devices->devices[i]->major_rev, - devices->devices[i]->minor_rev); - } - - return; -} - -/** - * \brief Gets the device list, for the CUDA platform environment initialized by - * the engine. - * - * \retval devices Pointer to the CUDA device list on success; NULL on failure. - */ -SCCudaDevices *SCCudaGetDeviceList(void) -{ - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return NULL; - } - - return devices; -} - -/*****************************Context_Management_API***************************/ - -/** - * \brief Creates a new CUDA context and associates it with the calling thread. - * The flags parameter is described below. The context is created with - * a usage count of 1 and the caller of cuCtxCreate() must call - * cuCtxDestroy() or cuCtxDetach() when done using the context. If a - * context is already current to the thread, it is supplanted by the - * newly created context and may be restored by a subsequent call to - * cuCtxPopCurrent(). The two LSBs of the flags parameter can be used - * to control how the OS thread, which owns the CUDA context at the - * time of an API call, interacts with the OS scheduler when waiting for - * results from the GPU. - * - * - CU_CTX_SCHED_AUTO: The default value if the flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in - * the process C and the number of logical processors in the system - * P. If C > P, then CUDA will yield to other OS threads when - * waiting for the GPU, otherwise CUDA will not yield while waiting - * for results and actively spin on the processor. - * - CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can de-crease latency when waiting for - * the GPU, but may lower the performance of CPU threads if they are - * performing work in parallel with the CUDA thread. 
- * - CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting - * for results from the GPU. This can increase latency when waiting - * for the GPU, but can increase the performance of CPU threads - * performing work in parallel with the GPU. - * - CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory - * that is accessible to the GPU. - * - * Note to Linux users: - * Context creation will fail with CUDA_ERROR_UNKNOWN if the compute mode - * of the device is CU_COMPUTEMODE_PROHIBITED. Similarly, context creation - * will also fail with CUDA_ERROR_UNKNOWN if the compute mode for the - * device is set to CU_COMPUTEMODE_EXCLUSIVE and there is already an - * active context on the device. The function cuDeviceGetAttribute() can - * be used with CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute - * mode of the device. The nvidia-smi tool can be used to set the compute - * mode for devices. Documentation for nvidia-smi can be obtained by - * passing a -h option to it. - * - * \param pctx Returned context handle of the current context. - * \param flags Context creation flags. - * \param dev Device to create context on. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) -{ - CUresult result = 0; - - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pctx NULL"); - goto error; - } - - result = cuCtxCreate(pctx, flags, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the CUDA context specified by ctx. If the context usage count - * is not equal to 1, or the context is current to any CPU thread other - * than the current one, this function fails. Floating contexts (detached - * from a CPU thread via cuCtxPopCurrent()) may be destroyed by this - * function. - * - * \param ctx Context to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxDestroy(CUcontext ctx) -{ - CUresult result = 0; - - result = cuCtxDestroy(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetApiVersion(CUcontext ctx, unsigned int *version) -{ - CUresult result = 0; - - if (version == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "version NULL"); - goto error; - } - - result = cuCtxGetApiVersion(ctx, version); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_API_VERSION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetCacheConfig(CUfunc_cache *pconfig) -{ - CUresult result = 0; - - if (pconfig == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pconfig NULL"); - goto error; - } - - result = cuCtxGetCacheConfig(pconfig); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_CACHE_CONFIG) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetCurrent(CUcontext *pctx) -{ - CUresult result = 0; - - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "pctx NULL"); - goto error; - } - - result = cuCtxGetCurrent(pctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *device the ordinal of the current context's device. - * - * \param device Returned device id for the current context. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxGetDevice(CUdevice *device) -{ - CUresult result = 0; - - if (device == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "device NULL"); - goto error; - } - - result = cuCtxGetDevice(device); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_DEVICE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxGetLimit(size_t *pvalue, CUlimit limit) -{ - CUresult result = 0; - - result = cuCtxGetLimit(pvalue, limit); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_LIMIT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Pops the current CUDA context from the CPU thread. The CUDA context - * must have a usage count of 1. CUDA contexts have a usage count of 1 - * upon creation; the usage count may be incremented with cuCtxAttach() - * and decremented with cuCtxDetach(). - * - * If successful, cuCtxPopCurrent() passes back the new context handle - * in *pctx. The old context may then be made current to a different CPU - * thread by calling cuCtxPushCurrent(). - * - * Floating contexts may be destroyed by calling cuCtxDestroy(). - * - * If a context was current to the CPU thread before cuCtxCreate() or - * cuCtxPushCurrent() was called, this function makes that context - * current to the CPU thread again. - * - * \param pctx Returned new context handle. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxPopCurrent(CUcontext *pctx) -{ - CUresult result = 0; - - result = cuCtxPopCurrent(pctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_POP_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Pushes the given context ctx onto the CPU thread's stack of current - * contexts. The speci?ed context becomes the CPU thread's current - * context, so all CUDA functions that operate on the current context - * are affected. - * - * The previous current context may be made current again by calling - * cuCtxDestroy() or cuCtxPopCurrent(). - * - * The context must be "floating," i.e. not attached to any thread. - * Contexts are made to float by calling cuCtxPopCurrent(). - * - * \param ctx Floating context to attach. - * - * \retval 0 On success. - * \retval -1 On failure. 
- */ -int SCCudaCtxPushCurrent(CUcontext ctx) -{ - CUresult result = 0; - - result = cuCtxPushCurrent(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_PUSH_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxSetCacheConfig(CUfunc_cache config) -{ - CUresult result = 0; - - result = cuCtxSetCacheConfig(config); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_CACHE_CONFIG) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxSetCurrent(CUcontext ctx) -{ - CUresult result = 0; - - result = cuCtxSetCurrent(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_CURRENT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaCtxSetLimit(CUlimit limit, size_t value) -{ - CUresult result = 0; - - result = cuCtxSetLimit(value, limit); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_LIMIT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Blocks until the device has completed all preceding requested tasks. - * cuCtxSynchronize() returns an error if one of the preceding tasks failed. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxSynchronize(void) -{ - CUresult result = 0; - - result = cuCtxSynchronize(); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Increments the usage count of the context and passes back a context - * handle in *pctx that must be passed to cuCtxDetach() when the - * application is done with the context. cuCtxAttach() fails if there is - * no context current to the thread. Currently, the flags parameter must - * be 0. - * - * \param pctx Returned context handle of the current context. - * \param flags Context attach flags (must be 0). - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags) -{ - CUresult result = 0; - - SCLogInfo("Cuda API - %s deprecated", - SCMapEnumValueToName(SC_CUDA_CU_CTX_ATTACH, - sc_cuda_api_names_string_map)); - - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pctx NULL"); - goto error; - } - - result = cuCtxAttach(pctx, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_ATTACH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Decrements the usage count of the context ctx, and destroys the - * context if the usage count goes to 0. The context must be a handle - * that was passed back by cuCtxCreate() or cuCtxAttach(), and must be - * current to the calling thread. - * - * \param ctx Context to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxDetach(CUcontext ctx) -{ - CUresult result = 0; - - SCLogInfo("Cuda API - %s deprecated", - SCMapEnumValueToName(SC_CUDA_CU_CTX_DETACH, - sc_cuda_api_names_string_map)); - - result = cuCtxDetach(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DETACH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Module_Management_API****************************/ - -/** - * \brief Returns in *hfunc the handle of the function of name \"name\" located - * in module hmod. If no function of that name exists, - * cuModuleGetFunction() returns CUDA_ERROR_NOT_FOUND. - * - * \param hfunc Returned function handle. - * \param hmod Module to return function from. - * \param name Name of function to retrieve. - * - * \retval 0 On success. 
- * \retval -1 On failure. - */ -int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (hfunc == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "hfunc is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetFunction(hfunc, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_FUNCTION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *dptr and *bytes the base pointer and size of the global - * name \"name\" located in module hmod. If no variable of that name - * exists, cuModuleGetGlobal() returns CUDA_ERROR_NOT_FOUND. Both - * parameters dptr and bytes are optional. If one of them is NULL, - * it is ignored. - * - * \param dptr Returned global device pointer. - * \param bytes Returned global size in bytes. - * \param hmod Module to return function from. - * \param name Name of global to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, - const char *name) -{ - CUresult result = 0; - - if (name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "name is NULL"); - goto error; - } - - result = cuModuleGetGlobal(dptr, bytes, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_GLOBAL) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaModuleGetSurfRef(CUsurfref *p_surf_ref, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (p_surf_ref == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_surf_ref is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetSurfRef(p_surf_ref, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_SURF_REF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *p_tex_ref the handle of the texture reference of name - * \"name\" in the module hmod. If no texture reference of that name - * exists, cuModuleGetTexRef() returns CUDA_ERROR_NOT_FOUND. This texture - * reference handle should not be destroyed, since it will be destroyed - * when the module is unloaded. - * - * \param p_tex_ref Returned global device pointer. - * \param hmod Module to retrieve texture reference from. - * \param name Name of the texture reference to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (p_tex_ref == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_tex_ref is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetTexRef(p_tex_ref, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_TEX_REF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a filename fname and loads the corresponding module \"module\" - * into the current context. The CUDA driver API does not attempt to - * lazily allocate the resources needed by a module; if the memory for - * functions and data (constant and global) needed by the module cannot - * be allocated, cuModuleLoad() fails. The file should be a cubin file - * as output by nvcc or a PTX file, either as output by nvcc or handwrtten. - * - * \param module Returned module. - * \param fname Filename of module to load. 
- * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoad(CUmodule *module, const char *fname) -{ - CUresult result = 0; - - if (module == NULL || fname == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or fname is NULL"); - goto error; - } - - result = cuModuleLoad(module, fname); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a pointer image and loads the corresponding module \"module\" - * into the current context. The pointer may be obtained by mapping a - * cubin or PTX file, passing a cubin or PTX ?le as a NULL-terminated - * text string, or incorporating a cubin object into the executable - * resources and using operating system calls such as Windows - * FindResource() to obtain the pointer. - * - * \param module Returned module. - * \param image Module data to load - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoadData(CUmodule *module, const void *image) -{ - CUresult result = 0; - - if (module == NULL || image == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or image is NULL"); - goto error; - } - - result = cuModuleLoadData(module, image); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a pointer image and loads the corresponding module module into - * the current context. The pointer may be obtained by mapping a cubin or - * PTX file, passing a cubin or PTX file as a NULL-terminated text - * string, or incorporating a cubin object into the executable resources - * and using operating system calls such as Windows FindResource() to - * obtain the pointer. Options are passed as an array via options and any - * corresponding parameters are passed in optionValues. The number of - * total options is supplied via numOptions. Any outputs will be returned - * via optionValues. 
Supported options are: - * - * - CU_JIT_MAX_REGISTERS: input specifies the maximum number of registers - * per thread; - * - CU_JIT_THREADS_PER_BLOCK: input specifies number of threads per block - * to target compilation for; output returns the number of threads - * the compiler actually targeted; - * - CU_JIT_WALL_TIME: output returns the float value of wall clock time, - * in milliseconds, spent compiling the PTX code; - * - CU_JIT_INFO_LOG_BUFFER: input is a pointer to a buffer in which to - * print any informational log messages from PTX assembly; - * - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the - * buffer; output is the number of bytes filled with messages; - * - CU_JIT_ERROR_LOG_BUFFER: input is a pointer to a buffer in which to - * print any error log messages from PTX assembly; - * - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the - * buffer; output is the number of bytes filled with messages; - * - CU_JIT_OPTIMIZATION_LEVEL: input is the level of optimization to apply - * to generated code (0 - 4), with 4 being the default and highest - * level; - * - CU_JIT_TARGET_FROM_CUCONTEXT: causes compilation target to be - * determined based on current attached context (default); - * - CU_JIT_TARGET: input is the compilation target based on supplied - * CUjit_target_enum; possible values are: - * -- CU_TARGET_COMPUTE_10 - * -- CU_TARGET_COMPUTE_11 - * -- CU_TARGET_COMPUTE_12 - * -- CU_TARGET_COMPUTE_13 - * - * \param module Returned module. - * \param image Module data to load. - * \param numOptions Number of options. - * \param options Options for JIT. - * \param optionValues Option values for JIT. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoadDataEx(CUmodule *module, const void *image, - unsigned int num_options, CUjit_option *options, - void **option_values) -{ - CUresult result = 0; - - if (module == NULL || image == NULL || options == NULL || - option_values == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or image is NULL or options is NULL or " - "option_values is NULL"); - goto error; - } - - result = cuModuleLoadDataEx(module, image, num_options, options, option_values); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA_EX) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Takes a pointer fat_cubin and loads the corresponding module \"module\" - * into the current context. The pointer represents a fat binary object, - * which is a collection of different cubin files, all representing the - * same device code, but compiled and optimized for different - * architectures. There is currently no documented API for constructing - * and using fat binary objects by programmers, and therefore this - * function is an internal function in this version of CUDA. More - * information can be found in the nvcc document. - * - * \param module Returned module. - * \param fatCubin Fat binary to load. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin) -{ - CUresult result = 0; - - if (module == NULL || fat_cubin == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "module is NULL or fatCubin is NULL"); - goto error; - } - - result = cuModuleLoadFatBinary(module, fat_cubin); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_FAT_BINARY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Unloads a module hmod from the current context. - * - * \param module Module to unload - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleUnload(CUmodule hmod) -{ - CUresult result = 0; - - result = cuModuleUnload(hmod); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_UNLOAD) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/****************************Memory_Management_API*****************************/ - -/** - * \brief Creates a CUDA array according to the CUDA_ARRAY3D_DESCRIPTOR - * structure pAllocateArray and returns a handle to the new CUDA - * array in *p_handle. The CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * typedef struct { - * unsigned int Width; - * unsigned int Height; - * unsigned int Depth; - * CUarray_format Format; - * unsigned int NumChannels; - * unsigned int Flags; - * } CUDA_ARRAY3D_DESCRIPTOR; - * - * where: - * - * - Width, Height, and Depth are the width, height, and depth of the - * CUDA array (in elements); the CUDA array is one-dimensional if -v * height and depth are 0, two-dimensional if depth is 0, and - * three-dimensional otherwise; - * - Format speci?es the format of the elements; CUarray_format is - * defined as: - * - * typedef enum CUarray_format_enum { - * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - * CU_AD_FORMAT_SIGNED_INT8 = 0x08, - * CU_AD_FORMAT_SIGNED_INT16 = 0x09, - * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - * CU_AD_FORMAT_HALF = 0x10, - * CU_AD_FORMAT_FLOAT = 0x20 - * } CUarray_format; - * - * - NumChannels speci?es the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - Flags provides for future features. For now, it must be set to 0. - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 2048; - * desc.Height = 0; - * desc.Depth = 0; - * - * Description for a 64 x 64 CUDA array of floats: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 64; - * desc.Height = 64; - * desc.Depth = 0; - * - * Description for a width x height x depth CUDA array of 64-bit, - * 4x16-bit float16's: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.FormatFlags = CU_AD_FORMAT_HALF; - * desc.NumChannels = 4; - * desc.Width = width; - * desc.Height = height; - * desc.Depth = depth; - * - * \param p_handle Returned Handle. - * \param p_allocate_array 3D array descriptor. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArray3DCreate(CUarray *p_handle, - const CUDA_ARRAY3D_DESCRIPTOR *p_allocate_array) -{ - CUresult result = 0; - - if (p_handle == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_handle is NULL"); - goto error; - } - - result = cuArray3DCreate(p_handle, p_allocate_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *p_rray_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. 
It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * This function may be called on 1D and 2D arrays, in which case the - * Height and/or Depth members of the descriptor struct will be set to 0. - * - * \param p_array_descriptor Returned 3D array descriptor. - * \param h_array 3D array to get descriptor of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, - CUarray h_array) -{ - CUresult result = 0; - - if (p_array_descriptor == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_array_descriptor is NULL"); - goto error; - } - - result = cuArray3DGetDescriptor(p_array_descriptor, h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Creates a CUDA array according to the CUDA_ARRAY_DESCRIPTOR structure - * p_allocate_array and returns a handle to the new CUDA array in - * p_handle. The CUDA_ARRAY_DESCRIPTOR is defined as: - * - * typedef struct { - * unsigned int Width; - * unsigned int Height; - * CUarray_format Format; - * unsigned int NumChannels; - * } CUDA_ARRAY_DESCRIPTOR; - * - * where: - * - * - Width, and Height are the width, and height of the CUDA array - * (in elements); the CUDA array is one-dimensional if height is 0, - * two-dimensional otherwise; - * - Format speci?es the format of the elements; CUarray_format is - * defined as: - * - * typedef enum CUarray_format_enum { - * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - * CU_AD_FORMAT_SIGNED_INT8 = 0x08, - * CU_AD_FORMAT_SIGNED_INT16 = 0x09, - * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - * CU_AD_FORMAT_HALF = 0x10, - * CU_AD_FORMAT_FLOAT = 0x20 - * } CUarray_format; - * - * - NumChannels specifies the number of packed components per CUDA - * array element; it may be 1, 2, or 4; - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 2048; - * desc.Height = 1; - * - * Description for a 64 x 64 CUDA array of floats: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 64; - * desc.Height = 64; - * - * Description for a width x height CUDA array of 64-bit, 4x16-bit - * float16's: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.FormatFlags = CU_AD_FORMAT_HALF; - * desc.NumChannels = 4; - * desc.Width = width; - * desc.Height = height; - * - * Description for a width x height CUDA array of 16-bit elements, each - * of which is two 8-bit unsigned chars: - * - * CUDA_ARRAY_DESCRIPTOR arrayDesc; - * desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; - * desc.NumChannels = 2; - * desc.Width = width; - * desc.Height = height; - * - * \param p_handle Returned array. - * \param p_allocate_array Array descriptor. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArrayCreate(CUarray *p_handle, - const CUDA_ARRAY_DESCRIPTOR *p_allocate_array) -{ - CUresult result = 0; - - if (p_handle == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "p_handle is NULL"); - goto error; - } - - result = cuArrayCreate(p_handle, p_allocate_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - - -/** - * \brief Destroys the CUDA array h_array. - * - * \param h_array Array to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArrayDestroy(CUarray h_array) -{ - int result = cuArrayDestroy(h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *p_array_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * \param p_array_descriptor Returned array descriptor. - * \param h_array Array to get descriptor of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, - CUarray h_array) -{ - CUresult result = 0; - - if (p_array_descriptor == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_array_descriptor is NULL"); - goto error; - } - - result = cuArrayGetDescriptor(p_array_descriptor, h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_GET_DESCRIPTOR) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaDeviceGetByPCIBusId(CUdevice *dev, char *pci_bus_id) -{ - CUresult result = 0; - - result = cuDeviceGetByPCIBusId(dev, pci_bus_id); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaDeviceGetPCIBusId(char *pci_bus_id, int len, CUdevice dev) -{ - CUresult result = 0; - - result = cuDeviceGetPCIBusId(pci_bus_id, len, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcCloseMemHandle(CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuIpcCloseMemHandle(dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcGetEventHandle(CUipcEventHandle *p_handle, CUevent event) -{ - CUresult result = 0; - - result = cuIpcGetEventHandle(p_handle, event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcGetMemHandle(CUipcMemHandle *p_handle, CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuIpcGetMemHandle(p_handle, dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcOpenEventHandle(CUevent *ph_event, CUipcEventHandle handle) -{ - CUresult result = 0; - - result = cuIpcOpenEventHandle(ph_event, handle); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, - unsigned int flags) -{ - CUresult result = 0; - - result = cuIpcOpenMemHandle(pdptr, handle, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Returns in *p_array_descriptor a descriptor containing information on - * the 
format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * \param p_array_descriptor Returned array descriptor. - * \param h_array Array to get descriptor of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemAlloc(CUdeviceptr *dptr, size_t byte_size) -{ - CUresult result = 0; - - if (dptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "dptr is NULL"); - goto error; - } - - result = cuMemAlloc(dptr, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Allocates bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the vir-tual memory - * ranges allocated with this function and automatically accelerates - * calls to functions such as cuMemcpy(). Since the memory can be - * accessed directly by the device, it can be read or written with - * much higher bandwidth than pageable memory obtained with functions - * such as SCMalloc(). Allocating excessive amounts of memory with - * cuMemAllocHost() may degrade system performance, since it reduces - * the amount of memory available to the system for paging. As a result, - * this function is best used sparingly to allocate staging areas for - * data exchange between host and device. - * - * \param pp Returned host pointer to page-locked memory. - * \param byte_size Requested allocation size in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemAllocHost(void **pp, size_t byte_size) -{ - CUresult result = 0; - - if (pp == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pp is NULL"); - goto error; - } - - result = cuMemAllocHost(pp, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_HOST) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Allocates at least width_in_bytes * height bytes of linear memory on the - * device and returns in *dptr a pointer to the allocated memory. The - * function may pad the allocation to ensure that corresponding pointers in - * any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. ElementSizeBytes - * specifies the size of the largest reads and writes that will be - * performed on the memory range. - * - * element_size_bytes may be 4, 8 or 16 (since coalesced memory - * transactions are not possible on other data sizes). If element_size_bytes - * is smaller than the actual read/write size of a kernel, the kernel will - * run correctly, but possibly at reduced speed. The pitch returned in - * *p_itch by cuMemAllocPitch() is the width in bytes of the allocation. - * The intended usage of pitch is as a separate parameter of the allocation, - * used to compute addresses within the 2D array. Given the row and column - * of an array element of type T, the address is computed as: - * - * T * p_element = (T*)((char*)base_address + row * pitch) + column; - * - * The pitch returned by cuMemAllocPitch() is guaranteed to work with - * cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it - * is recommended that programmers consider performing pitch allocations - * using cuMemAllocPitch(). 
Due to alignment restrictions in the hardware, - * this is especially true if the application will be performing 2D memory - * copies between different regions of device memory (whether linear memory - * or CUDA arrays). - * - * \param dptr Returned device pointer. - * \param p_pitch Returned pitch of allocation in bytes. - * \param width_in_bytes Requested allocation width in bytes. - * \param height Requested allocation width in rows. - * \param element_size_bytes Size of largest reads/writes for range. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemAllocPitch(CUdeviceptr *dptr, size_t *p_pitch, - size_t width_in_bytes, - size_t height, - unsigned int element_size_bytes) -{ - CUresult result = 0; - - if (dptr == NULL || p_pitch == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "dptr is NULL or p_pitch is NULL"); - goto error; - } - - result = cuMemAllocPitch(dptr, p_pitch, width_in_bytes, height, - element_size_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_PITCH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpy(dst, src, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY) == -1) - goto error; - - return 0; - error: - return -1; -} - - -/** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. 
- * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. - * - * \param p_copy Parameters for the memory copy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy2D(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. 
- * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. - * - * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a - * stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param p_copy Parameters for the memory copy. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy2DAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. 
The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. 
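For reference, a minimal sketch of how a host-to-device 2D copy was typically driven through the wrappers above: fill a CUDA_MEMCPY2D descriptor and hand it to SCCudaMemcpy2D(). The helper name CopyFloatImageToDevice is illustrative only, and the sketch assumes a CUDA context is already current and that the pitched destination comes from SCCudaMemAllocPitch().

    #include <string.h>
    #include <cuda.h>

    /* Illustrative sketch only. */
    static int CopyFloatImageToDevice(const float *host_img, size_t cols,
                                      size_t rows, CUdeviceptr *d_img,
                                      size_t *pitch)
    {
        if (SCCudaMemAllocPitch(d_img, pitch, cols * sizeof(float), rows,
                                sizeof(float)) == -1)
            return -1;

        CUDA_MEMCPY2D cp;
        memset(&cp, 0, sizeof(cp));
        cp.srcMemoryType = CU_MEMORYTYPE_HOST;
        cp.srcHost       = host_img;
        cp.srcPitch      = cols * sizeof(float);  /* tightly packed host rows */
        cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        cp.dstDevice     = *d_img;
        cp.dstPitch      = *pitch;                /* pitch from SCCudaMemAllocPitch() */
        cp.WidthInBytes  = cols * sizeof(float);
        cp.Height        = rows;

        if (SCCudaMemcpy2D(&cp) == -1) {
            SCCudaMemFree(*d_img);
            return -1;
        }
        return 0;
    }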
- * - * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a - * stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param p_copy Parameters for the memory copy. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy2DUnaligned(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_UNALIGNED) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 3D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY3D structure is defined as: - * - * typedef struct CUDA_MEMCPY3D_st { - * unsigned int srcXInBytes, srcY, srcZ; - * unsigned int srcLOD; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; // ignored when src is array - * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - * unsigned int dstXInBytes, dstY, dstZ; - * unsigned int dstLOD; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; // ignored when dst is array - * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - * unsigned int WidthInBytes; - * unsigned int Height; - * unsigned int Depth; - * } CUDA_MEMCPY3D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * CUmemorytype_enum is defined as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight - * specify the (host) base address of the source data, the bytes per row, - * and the height of each 2D slice of the 3D array. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and - * srcHeight specify the (device) base address of the source data, the - * bytes per row, and the height of each 2D slice of the 3D array. - * srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle - * of the source data. srcHost, srcDevice, srcPitch and srcHeight are - * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch - * specify the (host) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data, the bytes - * per row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the - * handle of the destination data. dstHost, dstDevice, dstPitch and - * dstHeight are ignored. - * - * - srcXInBytes, srcY and srcZ specify the base address of the source - * data for the copy. 
- * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, dstY and dstZ specify the base address of the destination - * data for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes, Height and Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. Any pitches must be greater - * than or equal to WidthInBytes. - * - * cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be - * set to 0. - * - * \param p_copy Parameters for the memory copy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy3D(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Perform a 3D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY3D structure is defined as: - * - * typedef struct CUDA_MEMCPY3D_st { - * unsigned int srcXInBytes, srcY, srcZ; - * unsigned int srcLOD; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; // ignored when src is array - * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - * unsigned int dstXInBytes, dstY, dstZ; - * unsigned int dstLOD; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; // ignored when dst is array - * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - * unsigned int WidthInBytes; - * unsigned int Height; - * unsigned int Depth; - * } CUDA_MEMCPY3D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * CUmemorytype_enum is defined as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight - * specify the (host) base address of the source data, the bytes per row, - * and the height of each 2D slice of the 3D array. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and - * srcHeight specify the (device) base address of the source data, the - * bytes per row, and the height of each 2D slice of the 3D array. - * srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle - * of the source data. 
srcHost, srcDevice, srcPitch and srcHeight are - * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch - * specify the (host) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data, the bytes - * per row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the - * handle of the destination data. dstHost, dstDevice, dstPitch and - * dstHeight are ignored. - * - * - srcXInBytes, srcY and srcZ specify the base address of the source - * data for the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, dstY and dstZ specify the base address of the destination - * data for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes, Height and Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. Any pitches must be greater - * than or equal to WidthInBytes. - * - * cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * cuMemcpy3DAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be - * set to 0. - * - * \param p_copy Parameters for the memory copy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream) -{ - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy3DAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *p_copy) -{ - CUresult result = 0; - - result = cuMemcpy3DPeer(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_PEER) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *p_copy, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpy3DPeerAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t byte_count, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyAsync(dst, src, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Copies from one 1D CUDA array to another. dstArray and srcArray - * specify the handles of the destination and source CUDA arrays for the - * copy, respectively. dstIndex and srcIndex specify the destination and - * source indices into the CUDA array. These values are in the range - * [0, Width-1] for the CUDA array; they are not byte offsets. ByteCount - * is the number of bytes to be copied. The size of the elements in the - * CUDA arrays need not be the same format, but the elements must be the - * same size; and count must be evenly divisible by that size. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param src_array Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoA(CUarray dst_array, size_t dst_offset, - CUarray src_array, size_t src_offset, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyAtoA(dst_array, dst_offset, src_array, src_offset, - byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_A) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \param Copies from one 1D CUDA array to device memory. dstDevice specifies the - * base pointer of the destination and must be naturally aligned with the - * CUDA array elements. hSrc and SrcIndex specify the CUDA array handle and - * the index (in array elements) of the array element where the copy is - * to begin. ByteCount speci?es the number of bytes to copy and must be - * evenly divisible by the array element size. - * - * \param dst_device Destination device pointer. - * \param h_src Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray src_array, - size_t src_offset, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyAtoD(dst_device, src_array, src_offset, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \param Copies from one 1D CUDA array to host memory. dstHost specifies the - * base pointer of the destination. srcArray and srcIndex specify the - * CUDA array handle and starting index of the source data. 
ByteCount - * specifies the number of bytes to copy. - * - * \param dst_device Destination device pointer. - * \param h_src Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array, size_t src_offset, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyAtoH(dst_host, src_array, src_offset, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \param Copies from one 1D CUDA array to host memory. dstHost specifies the - * base pointer of the destination. srcArray and srcIndex specify the - * CUDA array handle and starting index of the source data. ByteCount - * specifies the number of bytes to copy. - * - * cuMemcpyAtoHAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero stream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_device Destination device pointer. - * \param src_array Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, - size_t src_offset, size_t byte_count, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyAtoHAsync(dst_host, src_array, src_offset, byte_count, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from device memory to a 1D CUDA array. dstArray and dstIndex - * specify the CUDA array handle and starting index of the destination - * data. srcDevice speci?es the base pointer of the source. ByteCount - * specifies the number of bytes to copy. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoA(CUarray dst_array, size_t dst_offset, - CUdeviceptr src_device, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyDtoA(dst_array, dst_offset, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_A) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from device memory to device memory. dstDevice and srcDevice are - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is asynchronous. - * - * \param dst_device Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. 
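As a minimal sketch of the linear-memory copy wrappers documented here: duplicate an existing device buffer with SCCudaMemcpyDtoD() and read the copy back with the SCCudaMemcpyDtoH() wrapper defined below. The helper name DuplicateAndFetch is illustrative; the sketch assumes a CUDA context is current, and relies on the synchronous DtoH copy on the default stream ordering behind the asynchronous DtoD copy.

    #include <cuda.h>

    /* Illustrative sketch only. */
    static int DuplicateAndFetch(CUdeviceptr d_src, size_t len, void *host_out)
    {
        CUdeviceptr d_copy = 0;

        if (SCCudaMemAlloc(&d_copy, len) == -1)
            return -1;

        /* cuMemcpyDtoD() is asynchronous; the synchronous DtoH copy that
         * follows on the same (default) stream waits for it. */
        if (SCCudaMemcpyDtoD(d_copy, d_src, len) == -1 ||
            SCCudaMemcpyDtoH(host_out, d_copy, len) == -1) {
            SCCudaMemFree(d_copy);
            return -1;
        }

        SCCudaMemFree(d_copy);
        return 0;
    }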
- */ -int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyDtoD(dst_device, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpyDtoDAsync(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyDtoDAsync(dst_device, src_device, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - - -/** - * \brief Copies from device to host memory. dst_host and src_device specify - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is synchronous. - * - * \param dst_host Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyDtoH(dst_host, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from device to host memory. dst_host and src_device specify - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. - * - * cuMemcpyDtoHAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works - * on page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_host Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyDtoHAsync(dst_host, src_device, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index - * specify the CUDA array handle and starting index of the destination - * data. p_src specifies the base address of the source. byte_count - * specifies the number of bytes to copy. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param p_src Source host pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoA(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyHtoA(dst_array, dst_offset, src_host, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index - * specify the CUDA array handle and starting index of the destination - * data. p_src specifies the base address of the source. byte_count - * specfies the number of bytes to copy. 
- * - * cuMemcpyHtoAAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works on - * page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param p_src Source host pointer. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoAAsync(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyHtoAAsync(dst_array, dst_offset, src_host, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to device memory. dst_device and src_host - * are the base addresses of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is synchronous. - * - * \param dst_device Destination device pointer. - * \param src_host Source host pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyHtoD(dst_device, src_host,byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies from host memory to device memory. dst_device and src_host are - * the base addresses of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. - * - * cuMemcpyHtoDAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works on - * page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * - * \param dst_device Destination device pointer. - * \param src_host Source host pointer. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. 
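The asynchronous host-to-device path described above only pays off with page-locked staging memory, so a typical caller pairs SCCudaMemAllocHost() with SCCudaMemcpyHtoDAsync() and synchronizes before releasing the staging buffer. A minimal sketch follows; the helper name StageAndUpload is illustrative, and the raw cuStreamSynchronize() call stands in for whatever stream wrapper the rest of the file provides.

    #include <string.h>
    #include <cuda.h>

    /* Illustrative sketch only. */
    static int StageAndUpload(CUdeviceptr d_dst, const void *data, size_t len,
                              CUstream stream)
    {
        void *pinned = NULL;

        if (SCCudaMemAllocHost(&pinned, len) == -1)
            return -1;
        memcpy(pinned, data, len);

        if (SCCudaMemcpyHtoDAsync(d_dst, pinned, len, stream) == -1) {
            SCCudaMemFreeHost(pinned);
            return -1;
        }

        /* Wait for the copy before the staging buffer is reused or freed. */
        cuStreamSynchronize(stream);
        SCCudaMemFreeHost(pinned);
        return 0;
    }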
- */ -int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyHtoDAsync(dst_device, src_host, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemcpyPeer(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count) -{ - CUresult result = 0; - - result = cuMemcpyPeer(dst_device, dst_context, src_device, src_context, - byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_PEER) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemcpyPeerAsync(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemcpyPeerAsync(dst_device, dst_context, src_device, src_context, - byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_PEER_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Frees the memory space pointed to by dptr, which must have been - * returned by a previous call to cuMemAlloc() or cuMemAllocPitch(). - * - * \param dptr Pointer to the memory to free. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemFree(CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuMemFree(dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Frees the memory space pointed to by p, which must have been returned - * by a previous call to cuMemAllocHost(). - * - * \param p Pointer to the memory to free. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemFreeHost(void *p) -{ - CUresult result = 0; - - if (p == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p is NULL"); - goto error; - } - - result = cuMemFreeHost(p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE_HOST) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns the base address in *pbase and size in *psize of the allocation - * by cuMemAlloc() or cuMemAllocPitch() that contains the input pointer - * dptr. Both parameters pbase and psize are optional. If one of them is - * NULL, it is ignored. - * - * \param pbase Returned base address. - * \param psize Returned size of device memory allocation. - * \param dptr Device pointer to query - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, - CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuMemGetAddressRange(pbase, psize, dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_ADDRESS_RANGE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *free and *total respectively, the free and total amount - * of memory available for allocation by the CUDA context, in bytes. - * - * \param free Returned free memory in bytes. - * \param total Returned total memory in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemGetInfo(size_t *free, size_t *total) -{ - CUresult result = 0; - - if (free == NULL || total == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "free is NULL || total is NULL"); - goto error; - } - - result = cuMemGetInfo(free, total); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_INFO) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Allocates bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the virtual memory ranges - * allocated with this function and automatically accelerates calls to - * functions such as cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher - * bandwidth than pageable memory obtained with functions such as - * SCMalloc(). Allocating excessive amounts of pinned memory may degrade - * system performance, since it reduces the amount of memory available - * to the system for paging. As a result, this function is best used - * sparingly to allocate staging areas for data exchange between host - * and device. - * - * The Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one - * that performed the allocation. - * - CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA - * address space. The device pointer to the memory may be obtained by - * calling cuMemHostGetDevicePointer(). This feature is available only - * on GPUs with compute capability greater than or equal to 1.1. - * - CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined - * (WC). WC memory can be transferred across the PCI Express bus more - * quickly on some system con?gurations, but cannot be read efficiently - * by most CPUs. WC memory is a good option for buffers that will be - * written by the CPU and read by the GPU via mapped pinned memory or - * host->device transfers. All of these fags are orthogonal to one - * another: a developer may allocate memory that is portable, mapped - * and/or write-combined with no restrictions. - * - * The CUDA context must have been created with the CU_CTX_MAP_HOST flag - * in order for the CU_MEMHOSTALLOC_MAPPED flag to have any effect. - * - * The CU_MEMHOSTALLOC_MAPPED flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is - * deferred to cuMemHostGetDevicePointer() because the memory may be - * mapped into other CUDA contexts via the CU_MEMHOSTALLOC_PORTABLE flag. - * - * The memory allocated by this function must be freed with cuMemFreeHost(). - * - * \param pp Returned host pointer to page-locked memory. - * \param byte_size Requested allocation size in bytes. - * \param flags Flags for allocation request. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostAlloc(pp, byte_size, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_ALLOC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Passes back the device pointer pdptr corresponding to the mapped, - * pinned host buffer p allocated by cuMemHostAlloc. - * - * cuMemHostGetDevicePointer() will fail if the CU_MEMALLOCHOST_DEVICEMAP - * flag was not speci?ed at the time the memory was allocated, or if the - * function is called on a GPU that does not support mapped pinned memory. - * - * Flags provides for future releases. For now, it must be set to 0. 
- * - * \param pdptr Returned device pointer. - * \param p Host pointer. - * \param flags Options(must be 0). - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostGetDevicePointer(pdptr, p, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Passes back the flags p_flags that were specified when allocating the - * pinned host buffer p allocated by cuMemHostAlloc. - * - * cuMemHostGetFlags() will fail if the pointer does not reside in an - * allocation performed by cuMemAllocHost() or cuMemHostAlloc(). - * - * \param p_flags Returned flags word. - * \param p Host pointer. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p) -{ - CUresult result = 0; - - result = cuMemHostGetFlags(p_flags, p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_FLAGS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemHostRegister(void *p, size_t byte_size, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostRegister(p, byte_size, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_REGISTER) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaMemHostUnregister(void *p) -{ - CUresult result = 0; - - result = cuMemHostUnregister(p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_UNREGISTER) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the memory range of N 16-bit values to the speci?ed value us. - * - * \param dst_device Destination device pointer. - * \param us Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, size_t n) -{ - CUresult result = 0; - - result = cuMemsetD16(dst_device, us, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD16Async(CUdeviceptr dst_device, unsigned short us, - size_t n, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD16Async(dst_device, us, n, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the 2D memory range of Width 16-bit values to the specified - * value us. Height specifies the number of rows to set, and dst_pitch - * specifies the number of bytes between each row. This function - * performs fastest when the pitch is one that has been passed back - * by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param us Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. 
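 *
 * A minimal usage sketch (illustrative 640x480 dimensions; error handling
 * trimmed), assuming a current CUDA context:
 *
 *     CUdeviceptr d_img = 0;
 *     size_t pitch = 0;
 *     if (SCCudaMemAllocPitch(&d_img, &pitch, 640 * sizeof(unsigned short),
 *                             480, sizeof(unsigned short)) == 0) {
 *         /* fill every 16-bit element of the 640x480 region with 0xFFFF */
 *         SCCudaMemsetD2D16(d_img, pitch, 0xFFFF, 640, 480);
 *         SCCudaMemFree(d_img);
 *     }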
- */ -int SCCudaMemsetD2D16(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height) -{ - CUresult result = 0; - - result = cuMemsetD2D16(dst_device, dst_pitch, us, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD2D16Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD2D16Async(dst_device, dst_pitch, us, width, height, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the 2D memory range of Width 32-bit values to the specified value - * ui. Height speci?es the number of rows to set, and dstPitch specifies - * the number of bytes between each row. This function performs fastest - * when the pitch is one that has been passed back by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param ui Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD2D32(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height) -{ - CUresult result = 0; - - result = cuMemsetD2D32(dst_device, dst_pitch, ui, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD2D32Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD2D32Async(dst_device, dst_pitch, ui, width, height, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the 2D memory range of Width 8-bit values to the specified value - * uc. Height speci?es the number of rows to set, and dstPitch specifies - * the number of bytes between each row. This function performs fastest - * when the pitch is one that has been passed back by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param uc Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD2D8(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height) -{ - CUresult result = 0; - - result = cuMemsetD2D8(dst_device, dst_pitch, uc, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD2D8Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD2D8Async(dst_device, dst_pitch, uc, width, height, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the memory range of N 32-bit values to the specified value ui. - * - * \param dst_device Destination device pointer. - * \param ui Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. 
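 *
 * A minimal usage sketch (the element count and fill pattern are arbitrary),
 * assuming a current CUDA context:
 *
 *     CUdeviceptr d_buf = 0;
 *     const size_t n = 1024;
 *     if (SCCudaMemAlloc(&d_buf, n * sizeof(unsigned int)) == 0) {
 *         SCCudaMemsetD32(d_buf, 0xdeadbeef, n);
 *         SCCudaMemFree(d_buf);
 *     }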
- */ -int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, size_t n) -{ - CUresult result = 0; - - result = cuMemsetD32(dst_device, ui, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD32Async(CUdeviceptr dst_device, unsigned int ui, - size_t n, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD32Async(dst_device, ui, n, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Sets the memory range of N 8-bit values to the specified value ui. - * - * \param dst_device Destination device pointer. - * \param uc Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, size_t n) -{ - CUresult result = 0; - - result = cuMemsetD8(dst_device, uc, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaMemsetD8Async(CUdeviceptr dst_device, unsigned char uc, - size_t n, CUstream h_stream) -{ - CUresult result = 0; - - result = cuMemsetD8Async(dst_device, uc, n, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8_ASYNC) == -1) - goto error; - - return 0; - error: - return -1; -} - -/*****************************Unified_Addressing_API****************************/ - -int SCCudaPointerGetAttribute(void *data, CUpointer_attribute attribute, - CUdeviceptr ptr) -{ - CUresult result = 0; - - result = cuPointerGetAttribute(data, attribute, ptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_POINTER_GET_ATTRIBUTE) == -1) - goto error; - - return 0; - error: - return -1; -} - -/*****************************Stream_Management_API****************************/ - -/** - * \brief Creates a stream and returns a handle in ph_stream. Flags is - * required to be 0. - * - * \param ph_stream Returned newly created stream. - * \param flags Parameters for stream creation(must be 0). - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaStreamCreate(CUstream *ph_stream, unsigned int flags) -{ - CUresult result = 0; - - if (ph_stream == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "phStream is NULL"); - goto error; - } - - result = cuStreamCreate(ph_stream, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the stream specified by h_stream. - * - * \param h_stream Stream to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaStreamDestroy(CUstream h_stream) -{ - CUresult result = 0; - - result = cuStreamDestroy(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns CUDA_SUCCESS if all operations in the stream specifed by - * h_stream have completed, or CUDA_ERROR_NOT_READY if not. - * - * \param h_stream Stream to query status of. - * - * \retval 0 On success. - * \retval -1 On failure. 
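 *
 * A condensed stream sketch (d_buf and host_buf are assumed to exist already,
 * host_buf ideally page-locked via SCCudaMemHostAlloc(); the 4096-byte copy
 * size is arbitrary):
 *
 *     CUstream stream = 0;
 *     if (SCCudaStreamCreate(&stream, 0) == 0) {
 *         SCCudaMemcpyHtoDAsync(d_buf, host_buf, 4096, stream);
 *         /* SCCudaStreamQuery(stream) would poll; synchronizing blocks instead */
 *         SCCudaStreamSynchronize(stream);
 *         SCCudaStreamDestroy(stream);
 *     }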
- */ -int SCCudaStreamQuery(CUstream h_stream) -{ - CUresult result = 0; - - result = cuStreamQuery(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_QUERY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Waits until the device has completed all operations in the stream - * specified by h_stream. - * - * \param h_stream Stream to wait for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaStreamSynchronize(CUstream h_stream) -{ - CUresult result = 0; - - result = cuStreamSynchronize(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaStreamWaitEvent(CUstream h_stream, CUevent h_event, - unsigned int flags) -{ - CUresult result = 0; - - result = cuStreamWaitEvent(h_stream, h_event, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_WAIT_EVENT) == -1) - goto error; - - return 0; - error: - return -1; -} - -/*****************************Event_Management_API*****************************/ - -/** - * \brief Creates an event *ph_event with the flags specified via flags. Valid - * flags include: - * - * CU_EVENT_DEFAULT: Default event creation flag. - * CU_EVENT_BLOCKING_SYNC: Specifies that event should use blocking - * synchronization. - * - * \param ph_event Returns newly created event. - * \param flags Event creation flags. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventCreate(CUevent *ph_event, unsigned int flags) -{ - CUresult result = 0; - - if (ph_event == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ph_event is NULL"); - goto error; - } - - result = cuEventCreate(ph_event, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the event specified by h_event. - * - * \param h_event Event to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventDestroy(CUevent h_event) -{ - CUresult result = 0; - - result = cuEventDestroy(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Computes the elapsed time between two events (in milliseconds with - * a resolution of around 0.5 microseconds). If either event has not - * been recorded yet, this function returns CUDA_ERROR_NOT_READY. If - * either event has been recorded with a non-zero stream, the result - * is undefined. - * - * \param p_milli_seconds Returned elapsed time in milliseconds. - * \param h_start Starting event. - * \param h_end Ending event. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, CUevent h_end) -{ - CUresult result = 0; - - if (p_milli_seconds == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_milli_seconds is NULL"); - goto error; - } - - result = cuEventElapsedTime(p_milli_seconds, h_start, h_end); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_ELAPSED_TIME) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns CUDA_SUCCESS if the event has actually been recorded, or - * CUDA_ERROR_NOT_READY if not. If cuEventRecord() has not been called - * on this event, the function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to query. - * - * \retval 0 On success. - * \retval -1 On failure. 
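 *
 * A condensed timing sketch (what is being timed is only hinted at; the
 * default stream 0 and CU_EVENT_DEFAULT flags are used):
 *
 *     CUevent start, stop;
 *     float msec = 0.0f;
 *     if (SCCudaEventCreate(&start, CU_EVENT_DEFAULT) == 0 &&
 *         SCCudaEventCreate(&stop, CU_EVENT_DEFAULT) == 0) {
 *         SCCudaEventRecord(start, 0);
 *         /* ... queue kernels / copies here ... */
 *         SCCudaEventRecord(stop, 0);
 *         SCCudaEventSynchronize(stop);
 *         SCCudaEventElapsedTime(&msec, start, stop);
 *         SCCudaEventDestroy(start);
 *         SCCudaEventDestroy(stop);
 *     }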
- */ -int SCCudaEventQuery(CUevent h_event) -{ - CUresult result = 0; - - result = cuEventQuery(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_QUERY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Records an event. If stream is non-zero, the event is recorded after - * all preceding operations in the stream have been completed; otherwise, - * it is recorded after all preceding operations in the CUDA context have - * been completed. Since operation is asynchronous, cuEventQuery() and/or - * cuEventSynchronize() must be used to determine when the event has - * actually been recorded. - * - * If cuEventRecord() has previously been called and the event has not - * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to record. - * \param h_stream Stream to record event for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventRecord(CUevent h_event, CUstream h_stream) -{ - CUresult result = 0; - - result = cuEventRecord(h_event, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_RECORD) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Waits until the event has actually been recorded. If cuEventRecord() - * has been called on this event, the function returns - * CUDA_ERROR_INVALID_VALUE. - * - * If cuEventRecord() has previously been called and the event has not - * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to wait for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventSynchronize(CUevent h_event) -{ - CUresult result = 0; - - result = cuEventSynchronize(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/***********************Execution_Control_Management_API***********************/ - -/** - * \brief Returns in *pi the integer value of the attribute attrib on the - * kernel given by hfunc. The supported attributes are: - * - * - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The number of threads - * beyond which a launch of the function would fail. This number - * depends on both the function and the device on which the - * function is currently loaded. - * - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of - * statically-allocated shared memory required by this function. - * This does not include dynamically-allocated shared memory - * requested by the user at runtime. - * - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of - * user-allocated constant memory required by this function. - * - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of thread - * local memory used by this function. - * - CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each - * thread of this function. - * - * \param pi Pointer to an integer which would be updated with the returned - * attribute value. - * \param attrib Attribute requested. - * \param hfunc Function to query attribute of. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) -{ - CUresult result = 0; - - if (pi == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "pi is NULL"); - goto error; - } - - result = cuFuncGetAttribute(pi, attrib, hfunc); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_GET_ATTRIBUTE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -int SCCudaFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) -{ - CUresult result = 0; - - result = cuFuncSetCacheConfig(hfunc, config); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_CACHE_CONFIG) == -1) - goto error; - - return 0; - error: - return -1; -} - -int SCCudaLaunchKernel(CUfunction f, unsigned int grid_dim_x, - unsigned int grid_dim_y, unsigned int grid_dim_z, - unsigned int block_dim_x, unsigned int block_dim_y, - unsigned int block_dim_z, unsigned int shared_mem_bytes, - CUstream h_stream, void **kernel_params, void **extra) -{ - CUresult result = 0; - - result = cuLaunchKernel(f, grid_dim_x, grid_dim_y, grid_dim_z, - block_dim_x, block_dim_y, block_dim_z, - shared_mem_bytes, h_stream, kernel_params, extra); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_KERNEL) == -1) - goto error; - - return 0; - error: - return -1; -} - -/** - * \brief Specifies the x, y, and z dimensions of the thread blocks that are - * created when the kernel given by hfunc is launched. - * - * \param hfunc Kernel to specify dimensions of. - * \param x X dimension. - * \param y Y dimension. - * \param z Z dimension. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) -{ - CUresult result = 0; - - result = cuFuncSetBlockShape(hfunc, x, y, z); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets through bytes the amount of dynamic shared memory that will be - * available to each thread block when the kernel given by hfunc is - * launched. - * - * \param hfunc Kernel to specify dynamic shared memory for. - * \param bytes Dynamic shared memory size per thread in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) -{ - CUresult result = 0; - - result = cuFuncSetSharedSize(hfunc, bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_SHARED_SIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Invokes the kernel f on a 1 x 1 x 1 grid of blocks. The block contains - * the number of threads specified by a previous call to - * cuFuncSetBlockShape(). - * - * \param f Kernel to launch. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunch(CUfunction f) -{ - CUresult result = 0; - - result = cuLaunch(f); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. - * Each block contains the number of threads specified by a previous call - * to cuFuncSetBlockShape(). - * - * \param f Kernel to launch. - * \param grid_width Width of grid in blocks. - * \param grib_height Height of grid in blocks. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height) -{ - CUresult result = 0; - - result = cuLaunchGrid(f, grid_width, grid_height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. 
- * Each block contains the number of threads specified by a previous call - * to cuFuncSetBlockShape(). cuLaunchGridAsync() can optionally be - * associated to a stream by passing a non-zero hStream argument. - * - * \param f Kernel to launch. - * \param grid_width Width of grid in blocks. - * \param grib_height Height of grid in blocks. - * \param h_stream Stream identifier. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, - CUstream h_stream) -{ - CUresult result = 0; - - result = cuLaunchGridAsync(f, grid_width, grid_height, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID_ASYNC) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets a foating-point parameter that will be specified the next time - * the kernel corresponding to hfunc will be invoked. offset is a byte - * offset. - * - * \param h_func Kernel to add parameter to. - * \param offset Offset to add parameter to argument list. - * \param value Value of parameter. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetf(CUfunction h_func, int offset, float value) -{ - CUresult result = 0; - - result = cuParamSetf(h_func, offset, value); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets an integer parameter that will be specified the next time - * the kernel corresponding to hfunc will be invoked. offset is a byte - * offset. - * - * \param h_func Kernel to add parameter to. - * \param offset Offset to add parameter to argument list. - * \param value Value of parameter. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value) -{ - CUresult result = 0; - - result = cuParamSeti(h_func, offset, value); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETI) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Sets through numbytes the total size in bytes needed by the function - * parameters of the kernel corresponding to hfunc. - * - * \param h_func Kernel to set parameter size for. - * \param num_bytes Size of paramter list in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes) -{ - CUresult result = 0; - - result = cuParamSetSize(h_func, num_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_SIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Makes the CUDA array or linear memory bound to the texture reference - * h_tex_ref available to a device program as a texture. In this version - * of CUDA, the texture-reference must be obtained via cuModuleGetTexRef() - * and the tex_unit parameter must be set to CU_PARAM_TR_DEFAULT. - * - * \param h_func Kernel to add texture-reference to. - * \param tex_unit Texture unit (must be CU_PARAM_TR_DEFAULT). - * \param h_tex_ref Texture-reference to add to argument list. - * - * \retval 0 On success. - * \retval -1 On failure. 
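 *
 * A condensed binding sketch (module, kernel and d_data are assumed to come
 * from SCCudaModuleLoadData(), SCCudaModuleGetFunction() and SCCudaMemAlloc();
 * the texture name "tex_buf" and the 4096-byte length are made up):
 *
 *     CUtexref tex_ref;
 *     size_t byte_offset = 0;
 *     SCCudaModuleGetTexRef(&tex_ref, module, "tex_buf");
 *     SCCudaTexRefSetFormat(tex_ref, CU_AD_FORMAT_UNSIGNED_INT8, 1);
 *     SCCudaTexRefSetAddress(&byte_offset, tex_ref, d_data, 4096);
 *     SCCudaParamSetTexRef(kernel, CU_PARAM_TR_DEFAULT, tex_ref);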
- */ -int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref) -{ - CUresult result = 0; - - result = cuParamSetTexRef(h_func, tex_unit, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_TEX_REF) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Copies an arbitrary amount of data (specified in numbytes) from ptr - * into the parameter space of the kernel corresponding to hfunc. - * offset is a byte offset. - * - * \param h_func Kernel to add data to. - * \param offset Offset to add data to argument list. - * \param ptr Pointer to arbitrary data. - * \param num_bytes Size of data to copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, - unsigned int num_bytes) -{ - CUresult result = 0; - - if (ptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ptr is NULL"); - goto error; - } - - result = cuParamSetv(h_func, offset, ptr, num_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETV) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/***********************Texture_Reference_Management_API***********************/ - -/** - * \brief Creates a texture reference and returns its handle in *pTexRef. Once - * created, the application must call cuTexRefSetArray() or cuTexRefSetAddress() - * to associate the reference with allocated memory. Other texture reference - * functions are used to specify the format and interpretation (addressing, - * filtering, etc.) to be used when the memory is read through this texture - * reference. To associate the texture reference with a texture ordinal for - * a given function, the application should call cuParamSetTexRef(). - * - * \param p_tex_ref Returned texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefCreate(CUtexref *p_tex_ref) -{ - CUresult result = 0; - - if (p_tex_ref == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_tex_ref is NULL"); - goto error; - } - - result = cuTexRefCreate(p_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_CREATE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Destroys the texture reference specified by hTexRef. - * - * \param h_tex_ref Texture reference to destroy - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefDestroy(CUtexref h_tex_ref) -{ - CUresult result = 0; - - result = cuTexRefDestroy(h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_DESTROY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pdptr the base address bound to the texture reference - * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any device memory range. - * - * \param pdptr Returned device address - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (pdptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. 
" - "pdptr is NULL"); - goto error; - } - - result = cuTexRefGetAddress(pdptr, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pam the addressing mode corresponding to the dimension - * dim of the texture reference hTexRef. Currently, the only valid value - * for dim are 0 and 1. - * - * \param pam Returned addressing mode - * \param h_tex_ref Texture reference - * \param dim Dimension - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, int dim) -{ - CUresult result = 0; - - if (pam == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pam is NULL"); - goto error; - } - - result = cuTexRefGetAddressMode(pam, h_tex_ref, dim); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *phArray the CUDA array bound to the texture reference - * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA array. - * - * \param ph_array Returned array - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (ph_array == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ph_array is NULL"); - goto error; - } - - result = cuTexRefGetArray(ph_array, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ARRAY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pfm the filtering mode of the texture reference hTexRef. - * - * \param pfm Returned filtering mode - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (pfm == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pfm is NULL"); - goto error; - } - - result = cuTexRefGetFilterMode(pfm, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FILTER_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pFlags the flags of the texture reference hTexRef. - * - * \param p_flags Returned flags - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (p_flags == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_flags is NULL"); - goto error; - } - - result = cuTexRefGetFlags(p_flags, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FLAGS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *pFormat and *pNumChannels the format and number of - * components of the CUDA array bound to the texture reference hTexRef. - * If pFormat or pNumChannels is NULL, it will be ignored. - * - * \param p_format Returned format - * \param p_num_channels Returned number of components - * \param h_tex_ref Texture reference - * - * \retval 0 On success. - * \retval -1 On failure. 
- */ -int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, - CUtexref h_tex_ref) -{ - CUresult result = 0; - - if (p_format == NULL || p_num_channels == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_format == NULL || p_num_channels == NULL"); - goto error; - } - - result = cuTexRefGetFormat(p_format, p_num_channels, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FORMAT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Binds a linear address range to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Any memory previously - * bound to hTexRef is unbound. - * - * Since the hardware enforces an alignment requirement on texture - * base addresses, cuTexRefSetAddress() passes back a byte offset in - * *ByteOffset that must be applied to texture fetches in order to read - * from the desired memory. This offset must be divided by the texel - * size and passed to kernels that read from the texture so they can be - * applied to the tex1Dfetch() function. - * - * If the device memory pointer was returned from cuMemAlloc(), the - * offset is guaranteed to be 0 and NULL may be passed as the - * ByteOffset parameter. - * - * \param byte_offset Returned byte offset - * \param h_tex_ref Texture reference to bind - * \param dptr Device pointer to bind - * \param bytes Size of memory to bind in bytes - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetAddress(size_t *byte_offset, CUtexref h_tex_ref, - CUdeviceptr dptr, unsigned int bytes) -{ - CUresult result = 0; - - if (byte_offset == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "byte_offset is NULL"); - goto error; - } - - result = cuTexRefSetAddress(byte_offset, h_tex_ref, dptr, bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Binds a linear address range to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Any memory previously bound - * to hTexRef is unbound. - * - * Using a tex2D() function inside a kernel requires a call to either - * cuTexRefSetArray() to bind the corresponding texture reference to an - * array, or cuTexRefSetAddress2D() to bind the texture reference to - * linear memory. - * - * Function calls to cuTexRefSetFormat() cannot follow calls to - * cuTexRefSetAddress2D() for the same texture reference. - * - * It is required that dptr be aligned to the appropriate hardware- - * specific texture alignment. You can query this value using the device - * attribute CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned dptr - * is supplied, CUDA_ERROR_INVALID_VALUE is returned. - * - * \param h_tex_ref Texture reference to bind - * \param desc Descriptor of CUDA array - * \param dptr Device pointer to bind - * \param pitch Line pitch in bytes - * - * \retval 0 On success. - * \retval -1 On failure. 
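 *
 * The required alignment can be checked up front (dev is assumed to be a
 * CUdevice obtained earlier through SCCudaDeviceGet()):
 *
 *     int tex_align = 0;
 *     if (SCCudaDeviceGetAttribute(&tex_align,
 *                                  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
 *                                  dev) == 0) {
 *         /* dptr passed to this function must be aligned to tex_align bytes */
 *     }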
- */ -int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, const CUDA_ARRAY_DESCRIPTOR *desc, - CUdeviceptr dptr, unsigned int pitch) -{ - CUresult result = 0; - - result = cuTexRefSetAddress2D(h_tex_ref, desc, dptr, pitch); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies the addressing mode am for the given dimension dim of the - * texture reference hTexRef. If dim is zero, the addressing mode is - * applied to the first parameter of the functions used to fetch from - * the texture; if dim is 1, the second, and so on. CUaddress_mode is - * defined as: - * - * typedef enum CUaddress_mode_enum { - * CU_TR_ADDRESS_MODE_WRAP = 0, - * CU_TR_ADDRESS_MODE_CLAMP = 1, - * CU_TR_ADDRESS_MODE_MIRROR = 2, - * } CUaddress_mode; - * - * \param h_tex_ref Texture reference - * \param dim Dimension - * \param am Addressing mode to set - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am) -{ - CUresult result = 0; - - result = cuTexRefSetAddressMode(h_tex_ref, dim, am); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Binds the CUDA array hArray to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Flags must be set to - * CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to hTexRef - * is unbound. - * - * \param h_tex_ref Texture reference to bind - * \param h_array Array to bind - * \param flags Options (must be CU_TRSA_OVERRIDE_FORMAT) - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, unsigned int flags) -{ - CUresult result = 0; - - result = cuTexRefSetArray(h_tex_ref, h_array, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ARRAY) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies the filtering mode fm to be used when reading memory through - * the texture reference hTexRef. CUfilter_mode_enum is defined as: - * - * typedef enum CUfilter_mode_enum { - * CU_TR_FILTER_MODE_POINT = 0, - * CU_TR_FILTER_MODE_LINEAR = 1 - * } CUfilter_mode; - * - * \param h_tex_ref Texture reference - * \param fm Filtering mode to set - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm) -{ - CUresult result = 0; - - result = cuTexRefSetFilterMode(h_tex_ref, fm); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FILTER_MODE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies optional flags via Flags to specify the behavior of data - * returned through the texture reference hTexRef. The valid flags are: - * - * * CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in - * the range [0, 1]; - * * CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default - * behavior of having the texture coordinates range from [0, Dim) where - * Dim is the width or height of the CUDA array. Instead, the texture - * coordinates [0, 1.0) reference the entire breadth of the array - * dimension; - * - * \param h_tex_ref Texture reference - * \param flags Optional flags to set - * - * \retval 0 On success. 
- * \retval -1 On failure. - */ -int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags) -{ - CUresult result = 0; - - result = cuTexRefSetFlags(h_tex_ref, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FLAGS) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Specifies the format of the data to be read by the texture reference - * hTexRef. fmt and NumPackedComponents are exactly analogous to the - * Format and NumChannels members of the CUDA_ARRAY_DESCRIPTOR structure: - * They specify the format of each component and the number of components - * per array element. - * - * \param h_tex_ref Texture reference - * \param fmt Format to set - * \param num_packed_components Number of components per array element - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, - int num_packed_components) -{ - CUresult result = 0; - - result = cuTexRefSetFormat(h_tex_ref, fmt, num_packed_components); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FORMAT) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/**************************Cuda_Env_Initialization_API*************************/ - -/** - * \brief Initialize the CUDA Environment for the engine. - * - * \retval 0 On successfully initializing the CUDA environment for the engine. - * \retval -1 On failure. - */ -int SCCudaInitCudaEnvironment(void) -{ - if (devices != NULL) { - SCLogWarning(SC_ERR_CUDA_ERROR, "CUDA engine already initalized!!!!"); - return 0; - } - - if (SCCudaInit(0) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error initializing CUDA API. SCCudaInit() " - "returned -1"); - goto error; - } - - if ( (devices = SCCudaGetDevices()) == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "Error getting CUDA device list. " - "SCCudaGetDevices() returned NULL"); - goto error; - } - - SCCudaPrintBasicDeviceInfo(devices); - - return 0; - - error: - SCCudaDeAllocSCCudaDevices(devices); - return -1; -} - -/**********************************Cuda_Utility********************************/ - -/** - * \brief List the cuda cards on the system. - * - */ -void SCCudaListCards(void) -{ - int i = 0; - - if (devices == NULL) { - SCLogWarning(SC_ERR_CUDA_ERROR, "CUDA engine not initalized! Please " - "initialize the cuda environment using " - "SCCudaInitCudaEnvironment()."); - return; - } - - printf("CUDA Cards recognized by the suricata CUDA module - \n"); - printf("|-----------------------------------------------------------------------------|\n"); - printf("| %-10s | %-20s | %-10s | %-10s | %-13s |\n", - "Device Id", " Device Name", " Multi-", "Clock Rate", "Cuda Compute"); - printf("| %-10s | %-20s | %-10s | %-10s | %-13s |\n", - "", "", "Processors", " (MHz)", "Capability"); - printf("|-----------------------------------------------------------------------------|\n"); - for (i = 0; i < devices->count; i++) { - printf("| %-10d | %-20s | %-10d | %-10d | %d.%-11d |\n", - i, - devices->devices[i]->name, - devices->devices[i]->attr_multiprocessor_count, - devices->devices[i]->attr_clock_rate/1000, - devices->devices[i]->major_rev, - devices->devices[i]->minor_rev); - } - printf("|-----------------------------------------------------------------------------|\n"); - - return; -} - -int SCCudaIsCudaDeviceIdValid(int cuda_device_id) -{ - if (devices == NULL) { - SCLogWarning(SC_ERR_CUDA_ERROR, "CUDA engine not initalized! 
Please " - "initialize the cuda environment using " - "SCCudaInitCudaEnvironment()."); - return 0; - } - - return (cuda_device_id < devices->count); -} - -/**********************************Unittests***********************************/ - -int SCCudaTest01(void) -{ - SCCudaDevices *devices = SCCudaGetDeviceList(); - - if (devices == NULL) - return 0; - - return (devices->count != 0); -} - -#if defined(__x86_64__) || defined(__ia64__) -/** - * extern "C" __global__ void SCCudaSuricataTest(int *input, int *output) - * { - * output[threadIdx.x] = input[threadIdx.x] * 2; - * } - */ -static const char *sc_cuda_test_kernel_64_bit = - " .version 1.4\n" - " .target sm_10, map_f64_to_f32\n" - " .entry SCCudaSuricataTest (\n" - " .param .u64 __cudaparm_SCCudaSuricataTest_input,\n" - " .param .u64 __cudaparm_SCCudaSuricataTest_output)\n" - "{\n" - " .reg .u32 %r<5>;\n" - " .reg .u64 %rd<8>;\n" - " .loc 15 1 0\n" - " $LBB1_SCCudaSuricataTest:\n" - " .loc 15 3 0\n" - " cvt.u32.u16 %r1, %tid.x;\n" - " cvt.u64.u32 %rd1, %r1;\n" - " mul.lo.u64 %rd2, %rd1, 4;\n" - " ld.param.u64 %rd3, [__cudaparm_SCCudaSuricataTest_input];\n" - " add.u64 %rd4, %rd3, %rd2;\n" - " ld.global.s32 %r2, [%rd4+0];\n" - " mul.lo.s32 %r3, %r2, 2;\n" - " ld.param.u64 %rd5, [__cudaparm_SCCudaSuricataTest_output];\n" - " add.u64 %rd6, %rd5, %rd2;\n" - " st.global.s32 [%rd6+0], %r3;\n" - " .loc 15 4 0\n" - " exit;\n" - " $LDWend_SCCudaSuricataTest:\n" - "} // SCCudaSuricataTest\n" - "\n"; -#else -/** - * extern "C" __global__ void SCCudaSuricataTest(int *input, int *output) - * { - * output[threadIdx.x] = input[threadIdx.x] * 2; - * } - */ -static const char *sc_cuda_test_kernel_32_bit = - " .version 1.4\n" - " .target sm_10, map_f64_to_f32\n" - " .entry SCCudaSuricataTest (\n" - " .param .u32 __cudaparm_SCCudaSuricataTest_input,\n" - " .param .u32 __cudaparm_SCCudaSuricataTest_output)\n" - " {\n" - " .reg .u16 %rh<3>;\n" - " .reg .u32 %r<9>;\n" - " .loc 15 2 0\n" - "$LBB1_SCCudaSuricataTest:\n" - " .loc 15 4 0\n" - " mov.u16 %rh1, %tid.x;\n" - " mul.wide.u16 %r1, %rh1, 4;\n" - " ld.param.u32 %r2, [__cudaparm_SCCudaSuricataTest_input];\n" - " add.u32 %r3, %r2, %r1;\n" - " ld.global.s32 %r4, [%r3+0];\n" - " mul.lo.s32 %r5, %r4, 2;\n" - " ld.param.u32 %r6, [__cudaparm_SCCudaSuricataTest_output];\n" - " add.u32 %r7, %r6, %r1;\n" - " st.global.s32 [%r7+0], %r5;\n" - " .loc 15 5 0\n" - " exit;\n" - "$LDWend_SCCudaSuricataTest:\n" - " } // SCCudaSuricataTest\n" - ""; -#endif - -int SCCudaTest02(void) -{ -#define ALIGN_UP(offset, alignment) do { \ - (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1); \ - } while (0) -#define N 256 - CUcontext context; - CUmodule module; - CUfunction kernel; - CUdeviceptr d_input, d_output; - int h_input[N]; - int h_result[N]; - SCCudaDevices *devices = SCCudaGetDeviceList(); - int result = 0; - int offset = 0; - int i = 0; - - if (devices == NULL) - goto end; - - if (devices->count == 0) - goto end; - - if (SCCudaCtxCreate(&context, 0, devices->devices[0]->device) == -1) - goto end; - -#if defined(__x86_64__) || defined(__ia64__) - if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel_64_bit) == -1) - goto end; -#else - if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel_32_bit) == -1) - goto end; -#endif - - if (SCCudaModuleGetFunction(&kernel, module, "SCCudaSuricataTest") == -1) - goto end; - - for (i = 0; i < N; i++) - h_input[i] = i * 2; - - if (SCCudaMemAlloc(&d_input, N * sizeof(int)) == -1) - goto end; - - if (SCCudaMemcpyHtoD(d_input, h_input, N * sizeof(int)) == -1) - goto 
end; - - if (SCCudaMemAlloc(&d_output, N * sizeof(int)) == -1) - goto end; - - offset = 0; - ALIGN_UP(offset, __alignof(void *)); - if (SCCudaParamSetv(kernel, offset, (void *)&d_input, sizeof(void *)) == -1) - goto end; - offset += sizeof(void *); - - ALIGN_UP(offset, __alignof(void *)); - if (SCCudaParamSetv(kernel, offset, (void *)&d_output, sizeof(void *)) == -1) - goto end; - offset += sizeof(void *); - - if (SCCudaParamSetSize(kernel, offset) == -1) - goto end; - - if (SCCudaFuncSetBlockShape(kernel, N, 1, 1) == -1) - goto end; - - if (SCCudaLaunchGrid(kernel, 1, 1) == -1) - goto end; - - if (SCCudaMemcpyDtoH(h_result, d_output, N * sizeof(int)) == -1) - goto end; - - for (i = 0; i < N; i++) - h_input[i] = i * 4; - - for (i = 0; i < N; i++) { - if (h_result[i] != h_input[i]) - goto end; - } - - if (SCCudaMemFree(d_input) == -1) - goto end; - - if (SCCudaMemFree(d_output) == -1) - goto end; - - if (SCCudaModuleUnload(module) == -1) - goto end; - - if (SCCudaCtxDestroy(context) == -1) - goto end; - - result = 1; - - end: - return result; -} - -void SCCudaRegisterTests(void) -{ -#ifdef UNITTESTS - UtRegisterTest("SCCudaTest01", SCCudaTest01); - UtRegisterTest("SCCudaTest02", SCCudaTest02); -#endif - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/util-cuda.h b/src/util-cuda.h deleted file mode 100644 index 8e544fd04a..0000000000 --- a/src/util-cuda.h +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -/** - * \file - * - * \author Anoop Saldanha - */ - -#ifndef __UTIL_CUDA__H__ -#define __UTIL_CUDA__H__ - -#ifdef __SC_CUDA_SUPPORT__ - -#include - -#define SC_CUDA_DEFAULT_DEVICE 0 -#define SC_CUDA_DEVICE_NAME_MAX_LEN 128 - -typedef struct SCCudaDevice_ { - /* device id */ - CUdevice device; - - /* device name */ - char name[SC_CUDA_DEVICE_NAME_MAX_LEN]; - - /* device compute capability */ - int major_rev; - int minor_rev; - - /* device properties */ - CUdevprop prop; - - /* device total memory */ - size_t bytes; - - /* device attributes. 
We could have used a fixed int array table to hold - * the attributes, but it is better we specify it exclusively this way, - * since the usage would be less error prone */ - int attr_max_threads_per_block; - int attr_max_block_dim_x; - int attr_max_block_dim_y; - int attr_max_block_dim_z; - int attr_max_grid_dim_x; - int attr_max_grid_dim_y; - int attr_max_grid_dim_z; - int attr_max_shared_memory_per_block; - int attr_total_constant_memory; - int attr_warp_size; - int attr_max_pitch; - int attr_max_registers_per_block; - int attr_clock_rate; - int attr_texture_alignment; - int attr_gpu_overlap; - int attr_multiprocessor_count; - int attr_kernel_exec_timeout; - int attr_integrated; - int attr_can_map_host_memory; - int attr_compute_mode; -} SCCudaDevice; - - -typedef struct SCCudaDevices_ { - int count; - SCCudaDevice **devices; -} SCCudaDevices; - - -/**************************Cuda_Initialization_API**************************/ -int SCCudaInit(unsigned int flags); - -/***************************Version_Management_API***************************/ -int SCCudaDriverGetVersion(int *driver_version); - -/***************************Device_Management_API****************************/ -int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev); -int SCCudaDeviceGet(CUdevice *device, int ordinal); -int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, - CUdevice dev); -int SCCudaDeviceGetCount(int *count); -int SCCudaDeviceGetName(char *name, int len, CUdevice dev); -int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev); -int SCCudaDeviceTotalMem(size_t *bytes, CUdevice dev); - -void SCCudaPrintDeviceList(SCCudaDevices *); -void SCCudaPrintBasicDeviceInfo(SCCudaDevices *); -SCCudaDevices *SCCudaGetDeviceList(void); - -/***************************Context_Management_API***************************/ -int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); -int SCCudaCtxDestroy(CUcontext ctx); -int SCCudaCtxGetApiVersion(CUcontext ctx, unsigned int *version); -int SCCudaCtxGetCacheConfig(CUfunc_cache *pconfig); -int SCCudaCtxGetCurrent(CUcontext *pctx); -int SCCudaCtxGetDevice(CUdevice *device); -int SCCudaCtxGetLimit(size_t *pvalue, CUlimit limit); -int SCCudaCtxPopCurrent(CUcontext *pctx); -int SCCudaCtxPushCurrent(CUcontext ctx); -int SCCudaCtxSetCacheConfig(CUfunc_cache config); -int SCCudaCtxSetCurrent(CUcontext ctx); -int SCCudaCtxSetLimit(CUlimit limit, size_t value); -int SCCudaCtxSynchronize(void); -int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags); -int SCCudaCtxDetach(CUcontext ctx); - -/***************************Module_Management_API****************************/ -int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, - const char *name); -int SCCudaModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, - const char *name); -int SCCudaModuleGetSurfRef(CUsurfref *p_surf_ref, CUmodule hmod, - const char *name); -int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, - const char *name); -int SCCudaModuleLoad(CUmodule *module, const char *fname); -int SCCudaModuleLoadData(CUmodule *module, const void *image); -int SCCudaModuleLoadDataEx(CUmodule *module, const void *image, - unsigned int num_options, CUjit_option *options, - void **option_values); -int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin); -int SCCudaModuleUnload(CUmodule hmod); - -/**************************Memory_Management_API*****************************/ -int SCCudaArray3DCreate(CUarray *p_handle, - const CUDA_ARRAY3D_DESCRIPTOR 
*p_allocate_array); -int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, - CUarray h_array); -int SCCudaArrayCreate(CUarray *p_handle, - const CUDA_ARRAY_DESCRIPTOR *p_allocate_array); -int SCCudaArrayDestroy(CUarray h_array); -int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, - CUarray h_array); -int SCCudaDeviceGetByPCIBusId(CUdevice *dev, char *pci_bus_id); -int SCCudaDeviceGetPCIBusId(char *pci_bus_id, int len, CUdevice dev); -int SCCudaIpcCloseMemHandle(CUdeviceptr dptr); -int SCCudaIpcGetEventHandle(CUipcEventHandle *p_handle, CUevent event); -int SCCudaIpcGetMemHandle(CUipcMemHandle *p_handle, CUdeviceptr dptr); -int SCCudaIpcOpenEventHandle(CUevent *ph_event, CUipcEventHandle handle); -int SCCudaIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, - unsigned int flags); -int SCCudaMemAlloc(CUdeviceptr *dptr, size_t byte_size); -int SCCudaMemAllocHost(void **pp, size_t byte_size); -int SCCudaMemAllocPitch(CUdeviceptr *dptr, size_t *p_pitch, - size_t width_in_bytes, - size_t height, - unsigned int element_size_bytes); -int SCCudaMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t byte_count); -int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy); -int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream); -int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy); -int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy); -int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream); -int SCCudaMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *p_copy); -int SCCudaMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *p_copy, - CUstream h_stream); -int SCCudaMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t byte_count, - CUstream h_stream); -int SCCudaMemcpyAtoA(CUarray dst_array, size_t dst_offset, - CUarray src_array, size_t src_offset, - size_t byte_count); -int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray src_array, - size_t src_offset, size_t byte_count); -int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array, size_t src_offset, - size_t byte_count); -int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, - size_t src_offset, size_t byte_count, - CUstream h_stream); -int SCCudaMemcpyDtoA(CUarray dst_array, size_t dst_offset, - CUdeviceptr src_device, size_t byte_count); -int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count); -int SCCudaMemcpyDtoDAsync(CUdeviceptr dst_device, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream); -int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, - size_t byte_count); -int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, - size_t byte_count, CUstream h_stream); -int SCCudaMemcpyHtoA(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count); -int SCCudaMemcpyHtoAAsync(CUarray dst_array, size_t dst_offset, - const void *src_host, size_t byte_count, - CUstream h_stream); -int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, - size_t byte_count); -int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, - size_t byte_count, CUstream h_stream); -int SCCudaMemcpyPeer(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count); -int SCCudaMemcpyPeerAsync(CUdeviceptr dst_device, CUcontext dst_context, - CUdeviceptr src_device, CUcontext src_context, - size_t byte_count, CUstream h_stream); -int SCCudaMemFree(CUdeviceptr dptr); -int SCCudaMemFreeHost(void *p); -int SCCudaMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, - 
CUdeviceptr dptr); -int SCCudaMemGetInfo(size_t *free, size_t *total); -int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags); -int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, - unsigned int flags); -int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p); -int SCCudaMemHostRegister(void *p, size_t byte_size, unsigned int flags); -int SCCudaMemHostUnregister(void *p); -int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, size_t n); -int SCCudaMemsetD16Async(CUdeviceptr dst_device, unsigned short us, - size_t n, CUstream h_stream); -int SCCudaMemsetD2D16(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height); -int SCCudaMemsetD2D16Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned short us, size_t width, - size_t height, CUstream h_stream); -int SCCudaMemsetD2D32(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height); -int SCCudaMemsetD2D32Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned int ui, size_t width, size_t height, - CUstream h_stream); -int SCCudaMemsetD2D8(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height); -int SCCudaMemsetD2D8Async(CUdeviceptr dst_device, size_t dst_pitch, - unsigned char uc, size_t width, size_t height, - CUstream h_stream); -int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, size_t n); -int SCCudaMemsetD32Async(CUdeviceptr dst_device, unsigned int ui, - size_t n, CUstream h_stream); -int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, size_t n); -int SCCudaMemsetD8Async(CUdeviceptr dst_device, unsigned char uc, - size_t n, CUstream h_stream); - -/***************************Unified_Addressing_API****************************/ - -int SCCudaPointerGetAttribute(void *data, CUpointer_attribute attribute, - CUdeviceptr ptr); - -/***************************Stream_Management_API****************************/ -int SCCudaStreamCreate(CUstream *ph_stream, unsigned int flags); -int SCCudaStreamDestroy(CUstream h_stream); -int SCCudaStreamQuery(CUstream h_stream); -int SCCudaStreamSynchronize(CUstream h_stream); -int SCCudaStreamWaitEvent(CUstream h_stream, CUevent h_event, - unsigned int flags); - -/***************************Event_Management_API*****************************/ -int SCCudaEventCreate(CUevent *ph_event, unsigned int flags); -int SCCudaEventDestroy(CUevent h_event); -int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, - CUevent h_end); -int SCCudaEventQuery(CUevent h_event); -int SCCudaEventRecord(CUevent h_event, CUstream h_stream); -int SCCudaEventSynchronize(CUevent h_event); - -/***********************Execution_Control_Management_API***********************/ -int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, - CUfunction hfunc); -int SCCudaFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); -int SCCudaLaunchKernel(CUfunction f, unsigned int grid_dim_x, - unsigned int grid_dim_y, unsigned int grid_dim_z, - unsigned int block_dim_x, unsigned int block_dim_y, - unsigned int block_dim_z, unsigned int shared_mem_bytes, - CUstream h_stream, void **kernel_params, void **extra); -int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); -int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); -int SCCudaLaunch(CUfunction f); -int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height); -int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, - CUstream h_stream); -int 
SCCudaParamSetf(CUfunction h_func, int offset, float value); -int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value); -int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes); -int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref); -int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, - unsigned int num_bytes); - -/*********************Texture_Reference_Management_API***********************/ -int SCCudaTexRefCreate(CUtexref *p_tex_ref); -int SCCudaTexRefDestroy(CUtexref h_tex_ref); -int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref); -int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, - int dim); -int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref); -int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref); -int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref); -int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, - CUtexref h_tex_ref); -int SCCudaTexRefSetAddress(size_t *byte_offset, CUtexref h_tex_ref, - CUdeviceptr dptr, unsigned int bytes); -int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, - const CUDA_ARRAY_DESCRIPTOR *desc, - CUdeviceptr dptr, unsigned int pitch); -int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am); -int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, - unsigned int flags); -int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm); -int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags); -int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, - int num_packed_components); - -/************************Cuda_Env_Initialization_API*************************/ -int SCCudaInitCudaEnvironment(void); - -/********************************Cuda_Utility********************************/ -void SCCudaListCards(void); -int SCCudaIsCudaDeviceIdValid(int cuda_device_id); - -/********************************Unittests***********************************/ -void SCCudaRegisterTests(void); - -#endif /* __SC_CUDA_SUPPORT__ */ -#endif /* __UTIL_CUDA_H__ */ diff --git a/src/util-mpm-ac-bs.c b/src/util-mpm-ac-bs.c index 16fb9ba795..ded1ad18fa 100644 --- a/src/util-mpm-ac-bs.c +++ b/src/util-mpm-ac-bs.c @@ -995,8 +995,6 @@ void SCACBSInitThreadCtx(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx) * \brief Initialize the AC context. * * \param mpm_ctx Mpm context. - * \param module_handle Cuda module handle from the cuda handler API. We don't - * have to worry about this here. */ void SCACBSInitCtx(MpmCtx *mpm_ctx) { diff --git a/src/util-mpm-ac-cuda-kernel.cu b/src/util-mpm-ac-cuda-kernel.cu deleted file mode 100644 index d7cc125bf2..0000000000 --- a/src/util-mpm-ac-cuda-kernel.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (C) 2007-2012 Open Information Security Foundation - * - * You can copy, redistribute or modify this Program under the terms of - * the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -/** - * \file - * - * \author Anoop Saldanha - * - * The Cuda kernel for MPM AC. - * - * \todo - This is a basic version of the kernel. - * - Support 16 bit state tables. - * - Texture memory. - * - Multiple threads per blocks of threads. Make use of - * shared memory/texture memory. - */ - -extern "C" -__global__ void SCACCudaSearch64(unsigned char *d_buffer, - unsigned int d_buffer_start_offset, - unsigned int *o_buffer, - unsigned int *results_buffer, - unsigned int nop, - unsigned char *tolower) -{ - unsigned int u = 0; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= nop) - return; - - unsigned int buflen = *((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset))); - unsigned int (*state_table_u32)[256] = - (unsigned int (*)[256])*((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8)); - unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 16); - - unsigned int state = 0; - unsigned int matches = 0; - unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1); - for (u = 0; u < buflen; u++) { - state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]]; - if (state & 0xFF000000) { - results[matches++] = u; - results[matches++] = state & 0x00FFFFFF; - } - } - - *(results - 1) = matches; - return; -} - -extern "C" -__global__ void SCACCudaSearch32(unsigned char *d_buffer, - unsigned int d_buffer_start_offset, - unsigned int *o_buffer, - unsigned int *results_buffer, - unsigned int nop, - unsigned char *tolower) -{ - unsigned int u = 0; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= nop) - return; - - unsigned int buflen = *((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset))); - unsigned int (*state_table_u32)[256] = - (unsigned int (*)[256])*((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 4)); - unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8); - - unsigned int state = 0; - unsigned int matches = 0; - unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1); - for (u = 0; u < buflen; u++) { - state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]]; - if (state & 0xFF000000) { - results[matches++] = u; - results[matches++] = state & 0x00FFFFFF; - } - } - - *(results - 1) = matches; - return; -} diff --git a/src/util-mpm-ac.c b/src/util-mpm-ac.c index 0331fd0297..42666f111d 100644 --- a/src/util-mpm-ac.c +++ b/src/util-mpm-ac.c @@ -60,15 +60,6 @@ #include "util-mpm-ac.h" #include "util-memcpy.h" -#ifdef __SC_CUDA_SUPPORT__ - -#include "util-mpm.h" -#include "tm-threads.h" -#include "detect-engine-mpm.h" -#include "util-cuda.h" -#include "util-cuda-handlers.h" -#endif /* __SC_CUDA_SUPPORT__ */ - void SCACInitCtx(MpmCtx *); void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *); void SCACDestroyCtx(MpmCtx *); @@ -851,25 +842,6 @@ int SCACPreparePatterns(MpmCtx *mpm_ctx) /* prepare the state table required by AC */ SCACPrepareStateTable(mpm_ctx); -#ifdef __SC_CUDA_SUPPORT__ - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - int r = SCCudaMemAlloc(&ctx->state_table_u32_cuda, - ctx->state_count * sizeof(unsigned int) * 256); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - - r = SCCudaMemcpyHtoD(ctx->state_table_u32_cuda, - ctx->state_table_u32, - ctx->state_count * sizeof(unsigned int) * 256); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - 
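/* [Editor's sketch, not Suricata code: the surrounding hunk of
 * SCACPreparePatterns() uploads the 32-bit AC state table through the
 * SCCudaMemAlloc()/SCCudaMemcpyHtoD() wrappers declared in util-cuda.h above.
 * Assuming those wrappers are thin shims over the CUDA driver API that map
 * CUresult onto the 0 / -1 convention the callers test, the upload step can
 * be illustrated as below. example_upload_state_table() is a hypothetical
 * name and not part of the removed code. */
#include <cuda.h>
#include <stdio.h>

static int example_upload_state_table(CUdeviceptr *out_dev,
                                       const unsigned int *state_table_u32,
                                       unsigned int state_count)
{
    /* one row of 256 next-state entries per AC state, as in the hunk above */
    size_t bytes = (size_t)state_count * 256 * sizeof(unsigned int);

    CUresult res = cuMemAlloc(out_dev, bytes);
    if (res != CUDA_SUCCESS) {
        fprintf(stderr, "cuMemAlloc failed: %d\n", (int)res);
        return -1;
    }
    res = cuMemcpyHtoD(*out_dev, state_table_u32, bytes);
    if (res != CUDA_SUCCESS) {
        fprintf(stderr, "cuMemcpyHtoD failed: %d\n", (int)res);
        cuMemFree(*out_dev);
        return -1;
    }
    return 0;
}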
exit(EXIT_FAILURE); - } - } -#endif - /* free all the stored patterns. Should save us a good 100-200 mbs */ for (i = 0; i < mpm_ctx->pattern_cnt; i++) { if (ctx->parray[i] != NULL) { @@ -1258,549 +1230,6 @@ void SCACPrintInfo(MpmCtx *mpm_ctx) return; } -/****************************Cuda side of things****************************/ - -#ifdef __SC_CUDA_SUPPORT__ - -/* \todo Technically it's generic to all mpms, but since we use ac only, the - * code internally directly references ac and hence it has found its - * home in this file, instead of util-mpm.c - */ -void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx) -{ - MpmCtx *mpm_ctx = NULL; - - int ac_16_tables = 0; - int ac_32_tables = 0; - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0); - if (mpm_ctx->mpm_type == MPM_AC_CUDA) { - SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx; - if (ctx->state_count < 32767) - ac_16_tables++; - else - ac_32_tables++; - } - - if (ac_16_tables > 0 && ac_32_tables > 0) - SCACConstructBoth16and32StateTables(); - - SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables); - SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables); - -} - -void CudaReleasePacket(Packet *p) -{ - if (p->cuda_pkt_vars.cuda_mpm_enabled == 1) { - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - p->cuda_pkt_vars.cuda_done = 0; - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - } - - return; -} - -/* \todos - * - Use texture memory - Can we fit all the arrays into a 3d texture. - * Texture memory definitely offers slightly better performance even - * on gpus that offer cache for global memory. - * - Packetpool - modify to support > 65k max pending packets. We are - * hitting packetpool limit currently even with 65k packets. - * - Use streams. We have tried overlapping parsing results from the - * previous call with invoking the next call. - * - Offer higher priority to decode threads. - * - Modify pcap file mode to support reading from multiple pcap files - * and hence we will have multiple receive threads. - * - Split state table into many small pieces and have multiple threads - * run each small state table on the same payload. - * - Used a config peference of l1 over shared memory with no noticeable - * perf increase. Explore it in detail over cards/architectures. - * - Constant memory performance sucked. Explore it in detail. - * - Currently all our state tables are small. 
Implement 16 bit state - * tables on priority. - * - Introduce profiling. - * - Retrieve sgh before buffer packet. - */ - -void SCACConstructBoth16and32StateTables(void) -{ - construct_both_16_and_32_state_tables = 1; - - return; -} - -/* \todo Reduce offset buffer size. Probably a 100,000 entry would be sufficient. */ -static void *SCACCudaDispatcher(void *arg) -{ -#define BLOCK_SIZE 32 - - int r = 0; - ThreadVars *tv = (ThreadVars *)arg; - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - uint32_t sleep_interval_ms = conf->batching_timeout; - - SCLogInfo("AC Cuda Mpm Dispatcher using a timeout of " - "\"%"PRIu32"\" micro-seconds", sleep_interval_ms); - - CudaBufferData *cb_data = - CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, - MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - - CUcontext cuda_context = - CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "context is NULL."); - exit(EXIT_FAILURE); - } - r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "context push failed."); - exit(EXIT_FAILURE); - } - CUmodule cuda_module = 0; - if (CudaHandlerGetCudaModule(&cuda_module, "util-mpm-ac-cuda-kernel") < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda module."); - exit(EXIT_FAILURE); - } - CUfunction kernel = 0; -#if __WORDSIZE==64 - if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch64") == -1) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel"); - exit(EXIT_FAILURE); - } -#else - if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch32") == -1) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel"); - exit(EXIT_FAILURE); - } -#endif - - uint8_t g_u8_lowercasetable[256]; - for (int c = 0; c < 256; c++) - g_u8_lowercasetable[c] = tolower((uint8_t)c); - CUdeviceptr cuda_g_u8_lowercasetable_d = 0; - CUdeviceptr cuda_packets_buffer_d = 0; - CUdeviceptr cuda_offset_buffer_d = 0; - CUdeviceptr cuda_results_buffer_d = 0; - uint32_t *cuda_results_buffer_h = NULL; - r = SCCudaMemAlloc(&cuda_g_u8_lowercasetable_d, sizeof(g_u8_lowercasetable)); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemcpyHtoD(cuda_g_u8_lowercasetable_d, g_u8_lowercasetable, sizeof(g_u8_lowercasetable)); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAlloc(&cuda_packets_buffer_d, conf->gpu_transfer_size); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAlloc(&cuda_offset_buffer_d, conf->gpu_transfer_size * 4); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAlloc(&cuda_results_buffer_d, conf->gpu_transfer_size * 8); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemAllocHost((void **)&cuda_results_buffer_h, conf->gpu_transfer_size * 8); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure."); - exit(EXIT_FAILURE); - } - - CudaBufferCulledInfo cb_culled_info; - memset(&cb_culled_info, 0, sizeof(cb_culled_info)); - - TmThreadsSetFlag(tv, THV_INIT_DONE); - while (1) { - if (TmThreadsCheckFlag(tv, THV_KILL)) - break; - - usleep(sleep_interval_ms); - - /**************** 1 SEND ****************/ - CudaBufferCullCompletedSlices(cb_data, &cb_culled_info, conf->gpu_transfer_size); - if 
(cb_culled_info.no_of_items == 0) - continue; -#if 0 - SCLogInfo("1 - cb_culled_info.no_of_items-%"PRIu32" " - "cb_culled_info.buffer_len - %"PRIu32" " - "cb_culled_info.average size - %f " - "cb_culled_info.d_buffer_start_offset - %"PRIu32" " - "cb_culled_info.op_buffer_start_offset - %"PRIu32" " - "cb_data.no_of_items - %"PRIu32" " - "cb_data.d_buffer_read - %"PRIu32" " - "cb_data.d_buffer_write - %"PRIu32" " - "cb_data.op_buffer_read - %"PRIu32" " - "cb_data.op_buffer_write - %"PRIu32"\n", - cb_culled_info.no_of_items, - cb_culled_info.d_buffer_len, - cb_culled_info.d_buffer_len / (float)cb_culled_info.no_of_items, - cb_culled_info.d_buffer_start_offset, - cb_culled_info.op_buffer_start_offset, - cb_data->no_of_items, - cb_data->d_buffer_read, - cb_data->d_buffer_write, - cb_data->op_buffer_read, - cb_data->op_buffer_write); -#endif - r = SCCudaMemcpyHtoDAsync(cuda_packets_buffer_d, (cb_data->d_buffer + cb_culled_info.d_buffer_start_offset), cb_culled_info.d_buffer_len, 0); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemcpyHtoDAsync(cuda_offset_buffer_d, (cb_data->o_buffer + cb_culled_info.op_buffer_start_offset), sizeof(uint32_t) * cb_culled_info.no_of_items, 0); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure."); - exit(EXIT_FAILURE); - } - void *args[] = { &cuda_packets_buffer_d, - &cb_culled_info.d_buffer_start_offset, - &cuda_offset_buffer_d, - &cuda_results_buffer_d, - &cb_culled_info.no_of_items, - &cuda_g_u8_lowercasetable_d }; - r = SCCudaLaunchKernel(kernel, - (cb_culled_info.no_of_items / BLOCK_SIZE) + 1, 1, 1, - BLOCK_SIZE, 1, 1, - 0, 0, - args, NULL); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaLaunchKernel failure."); - exit(EXIT_FAILURE); - } - r = SCCudaMemcpyDtoHAsync(cuda_results_buffer_h, cuda_results_buffer_d, sizeof(uint32_t) * (cb_culled_info.d_buffer_len * 2), 0); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyDtoH failure."); - exit(EXIT_FAILURE); - } - - - - /**************** 1 SYNCHRO ****************/ - r = SCCudaCtxSynchronize(); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaCtxSynchronize failure."); - exit(EXIT_FAILURE); - } - - /************* 1 Parse Results ************/ - uint32_t i_op_start_offset = cb_culled_info.op_buffer_start_offset; - uint32_t no_of_items = cb_culled_info.no_of_items; - uint32_t *o_buffer = cb_data->o_buffer; - uint32_t d_buffer_start_offset = cb_culled_info.d_buffer_start_offset; - for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) { - Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset]; - - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - if (p->cuda_pkt_vars.cuda_mpm_enabled == 0) { - p->cuda_pkt_vars.cuda_done = 0; - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - continue; - } - - p->cuda_pkt_vars.cuda_gpu_matches = - cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)]; - if (p->cuda_pkt_vars.cuda_gpu_matches != 0) { - memcpy(p->cuda_pkt_vars.cuda_results, - cuda_results_buffer_h + - ((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2), - (cuda_results_buffer_h[((o_buffer[i_op_start_offset] - - d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4); - } - - p->cuda_pkt_vars.cuda_done = 1; - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - SCCondSignal(&p->cuda_pkt_vars.cuda_cond); - } - if (no_of_items != 0) - CudaBufferReportCulledConsumption(cb_data, &cb_culled_info); - } /* while (1) */ - - r = SCCudaModuleUnload(cuda_module); - if (r 
< 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error unloading cuda module."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFree(cuda_packets_buffer_d); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFree(cuda_offset_buffer_d); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFree(cuda_results_buffer_d); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory."); - exit(EXIT_FAILURE); - } - r = SCCudaMemFreeHost(cuda_results_buffer_h); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - exit(EXIT_FAILURE); - } - - TmThreadsSetFlag(tv, THV_RUNNING_DONE); - TmThreadWaitForFlag(tv, THV_DEINIT); - TmThreadsSetFlag(tv, THV_CLOSED); - - return NULL; - -#undef BLOCK_SIZE -} - -uint32_t SCACCudaPacketResultsProcessing(Packet *p, const MpmCtx *mpm_ctx, - PrefilterRuleStore *pmq) -{ - uint32_t u = 0; - - while (!p->cuda_pkt_vars.cuda_done) { - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - if (p->cuda_pkt_vars.cuda_done) { - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - break; - } else { - SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex); - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - } - } /* while */ - p->cuda_pkt_vars.cuda_done = 0; - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - - uint32_t cuda_matches = p->cuda_pkt_vars.cuda_gpu_matches; - if (cuda_matches == 0) - return 0; - - uint32_t matches = 0; - uint32_t *results = p->cuda_pkt_vars.cuda_results + 1; - uint8_t *buf = p->payload; - SCACCtx *ctx = mpm_ctx->ctx; - SCACOutputTable *output_table = ctx->output_table; - SCACPatternList *pid_pat_list = ctx->pid_pat_list; - - uint8_t bitarray[ctx->pattern_id_bitarray_size]; - memset(bitarray, 0, ctx->pattern_id_bitarray_size); - - for (u = 0; u < cuda_matches; u += 2) { - uint32_t offset = results[u]; - uint32_t state = results[u + 1]; - /* we should technically be doing state & 0x00FFFFFF, but we don't - * since the cuda kernel does that for us */ - uint32_t no_of_entries = output_table[state].no_of_entries; - /* we should technically be doing state & 0x00FFFFFF, but we don't - * since the cuda kernel does that for us */ - uint32_t *pids = output_table[state].pids; - uint32_t k; - /* note that this is not a verbatim copy from SCACSearch(). We - * don't copy the pattern id into the pattern_id_array. 
That's - * the only change */ - for (k = 0; k < no_of_entries; k++) { - if (pids[k] & AC_CASE_MASK) { - uint32_t lower_pid = pids[k] & 0x0000FFFF; - if (SCMemcmp(pid_pat_list[lower_pid].cs, - buf + offset - pid_pat_list[lower_pid].patlen + 1, - pid_pat_list[lower_pid].patlen) != 0) { - /* inside loop */ - continue; - } - if (bitarray[(lower_pid) / 8] & (1 << ((lower_pid) % 8))) { - ; - } else { - bitarray[(lower_pid) / 8] |= (1 << ((lower_pid) % 8)); - PrefilterAddSids(pmq, pid_pat_list[lower_pid].sids, - pid_pat_list[lower_pid].sids_size); - } - matches++; - } else { - if (bitarray[pids[k] / 8] & (1 << (pids[k] % 8))) { - ; - } else { - bitarray[pids[k] / 8] |= (1 << (pids[k] % 8)); - PrefilterAddSids(pmq, pid_pat_list[pids[k]].sids, - pid_pat_list[pids[k]].sids_size); - } - matches++; - } - } - } - - return matches; -} - -void SCACCudaStartDispatcher(void) -{ - /* create the threads */ - ThreadVars *tv = TmThreadCreate("Cuda_Mpm_AC_Dispatcher", - NULL, NULL, - NULL, NULL, - "custom", SCACCudaDispatcher, 0); - if (tv == NULL) { - SCLogError(SC_ERR_THREAD_CREATE, "Error creating a thread for " - "ac cuda dispatcher. Killing engine."); - exit(EXIT_FAILURE); - } - if (TmThreadSpawn(tv) != 0) { - SCLogError(SC_ERR_THREAD_SPAWN, "Failed to spawn thread for " - "ac cuda dispatcher. Killing engine."); - exit(EXIT_FAILURE); - } - - return; -} - -int MpmCudaBufferSetup(void) -{ - int r = 0; - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context."); - return -1; - } - r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context."); - return -1; - } - - uint8_t *d_buffer = NULL; - uint32_t *o_buffer = NULL; - void **p_buffer = NULL; - - r = SCCudaMemAllocHost((void *)&d_buffer, conf->cb_buffer_size); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure."); - return -1; - } - SCLogInfo("Allocated a cuda d_buffer - %"PRIu32" bytes", conf->cb_buffer_size); - r = SCCudaMemAllocHost((void *)&o_buffer, sizeof(uint32_t) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failue."); - return -1; - } - r = SCCudaMemAllocHost((void *)&p_buffer, sizeof(void *) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure."); - return -1; - } - - r = SCCudaCtxPopCurrent(NULL); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure."); - return -1; - } - - CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, conf->cb_buffer_size, o_buffer, p_buffer, UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT); - if (cb == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error registering new cb instance."); - return -1; - } - CudaHandlerModuleStoreData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME, cb); - - return 0; -} - -int MpmCudaBufferDeSetup(void) -{ - int r = 0; - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile."); - return -1; - } - - CudaBufferData *cb_data = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME); - BUG_ON(cb_data == NULL); - - 
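/* [Editor's sketch, not Suricata code: MpmCudaBufferSetup() above and the
 * remainder of MpmCudaBufferDeSetup() below push the module's CUDA context,
 * allocate or free page-locked host buffers via the SCCudaMemAllocHost()/
 * SCCudaMemFreeHost() wrappers, then pop the context again. Assuming those
 * wrappers sit directly on the CUDA driver API, the pattern looks roughly
 * like the hypothetical example_pinned_roundtrip() below. */
#include <cuda.h>
#include <stddef.h>

static int example_pinned_roundtrip(CUcontext ctx, size_t bytes)
{
    void *host_buf = NULL;

    if (cuCtxPushCurrent(ctx) != CUDA_SUCCESS)              /* make ctx current */
        return -1;
    if (cuMemAllocHost(&host_buf, bytes) != CUDA_SUCCESS) { /* page-locked alloc */
        cuCtxPopCurrent(NULL);
        return -1;
    }

    /* ... a real setup would hand host_buf to CudaBufferRegisterNew() here ... */

    cuMemFreeHost(host_buf);        /* release the page-locked buffer */
    cuCtxPopCurrent(NULL);          /* leave the context again */
    return 0;
}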
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id); - if (cuda_context == 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context."); - return -1; - } - r = SCCudaCtxPushCurrent(cuda_context); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context."); - return -1; - } - - r = SCCudaMemFreeHost(cb_data->d_buffer); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - return -1; - } - r = SCCudaMemFreeHost(cb_data->o_buffer); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - return -1; - } - r = SCCudaMemFreeHost(cb_data->p_buffer); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory."); - return -1; - } - - r = SCCudaCtxPopCurrent(NULL); - if (r < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure."); - return -1; - } - - CudaBufferDeRegister(cb_data); - - return 0; -} - -#endif /* __SC_CUDA_SUPPORT */ /************************** Mpm Registration ***************************/ @@ -1825,31 +1254,6 @@ void MpmACRegister(void) return; } -#ifdef __SC_CUDA_SUPPORT__ - -/** - * \brief Register the aho-corasick cuda mpm. - */ -void MpmACCudaRegister(void) -{ - mpm_table[MPM_AC_CUDA].name = "ac-cuda"; - mpm_table[MPM_AC_CUDA].InitCtx = SCACInitCtx; - mpm_table[MPM_AC_CUDA].InitThreadCtx = SCACInitThreadCtx; - mpm_table[MPM_AC_CUDA].DestroyCtx = SCACDestroyCtx; - mpm_table[MPM_AC_CUDA].DestroyThreadCtx = SCACDestroyThreadCtx; - mpm_table[MPM_AC_CUDA].AddPattern = SCACAddPatternCS; - mpm_table[MPM_AC_CUDA].AddPatternNocase = SCACAddPatternCI; - mpm_table[MPM_AC_CUDA].Prepare = SCACPreparePatterns; - mpm_table[MPM_AC_CUDA].Search = SCACSearch; - mpm_table[MPM_AC_CUDA].PrintCtx = SCACPrintInfo; - mpm_table[MPM_AC_CUDA].PrintThreadCtx = SCACPrintSearchStats; - mpm_table[MPM_AC_CUDA].RegisterUnittests = SCACRegisterTests; - - return; -} - -#endif /* __SC_CUDA_SUPPORT__ */ - /*************************************Unittests********************************/ #ifdef UNITTESTS diff --git a/src/util-mpm-ac.h b/src/util-mpm-ac.h index 4fa2f9da25..a9dbd6a090 100644 --- a/src/util-mpm-ac.h +++ b/src/util-mpm-ac.h @@ -28,16 +28,6 @@ #define SC_AC_STATE_TYPE_U16 uint16_t #define SC_AC_STATE_TYPE_U32 uint32_t -#ifdef __SC_CUDA_SUPPORT__ -#include "suricata-common.h" -#include "util-cuda.h" -#include "util-cuda-vars.h" -#include "decode.h" -#include "util-cuda-buffer.h" -#include "util-mpm.h" -#include "flow.h" -#endif /* __SC_CUDA_SUPPORT__ */ - typedef struct SCACPatternList_ { uint8_t *cs; uint16_t patlen; @@ -83,10 +73,6 @@ typedef struct SCACCtx_ { uint32_t allocated_state_count; -#ifdef __SC_CUDA_SUPPORT__ - CUdeviceptr state_table_u16_cuda; - CUdeviceptr state_table_u32_cuda; -#endif /* __SC_CUDA_SUPPORT__ */ } SCACCtx; typedef struct SCACThreadCtx_ { @@ -98,105 +84,4 @@ typedef struct SCACThreadCtx_ { void MpmACRegister(void); - -#ifdef __SC_CUDA_SUPPORT__ - -#define MPM_AC_CUDA_MODULE_NAME "ac_cuda" -#define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb" - -static inline void CudaBufferPacket(CudaThreadVars *ctv, Packet *p) -{ - if (p->cuda_pkt_vars.cuda_mpm_enabled) { - while (!p->cuda_pkt_vars.cuda_done) { - SCMutexLock(&p->cuda_pkt_vars.cuda_mutex); - if (p->cuda_pkt_vars.cuda_done) { - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - break; - } else { - SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex); - SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex); - } - } - } - p->cuda_pkt_vars.cuda_done = 
0; - - if (p->payload_len == 0 || - (p->flags & (PKT_NOPAYLOAD_INSPECTION & PKT_NOPACKET_INSPECTION)) || - (p->flags & PKT_ALLOC) || - (ctv->data_buffer_size_min_limit != 0 && p->payload_len < ctv->data_buffer_size_min_limit) || - (p->payload_len > ctv->data_buffer_size_max_limit && ctv->data_buffer_size_max_limit != 0) ) { - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - - MpmCtx *mpm_ctx = NULL; - if (p->proto == IPPROTO_TCP) { - if (p->flowflags & FLOW_PKT_TOSERVER) - mpm_ctx = ctv->mpm_proto_tcp_ctx_ts; - else - mpm_ctx = ctv->mpm_proto_tcp_ctx_tc; - } else if (p->proto == IPPROTO_UDP) { - if (p->flowflags & FLOW_PKT_TOSERVER) - mpm_ctx = ctv->mpm_proto_udp_ctx_ts; - else - mpm_ctx = ctv->mpm_proto_udp_ctx_tc; - } else { - mpm_ctx = ctv->mpm_proto_other_ctx; - } - if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) { - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - -#if __WORDSIZE==64 - CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb, - p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr), - (void *)p); - if (slice == NULL) { - SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report " - "this to dev."); - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len; - *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; - memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); -#else - CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb, - p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr), - (void *)p); - if (slice == NULL) { - SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report " - "this to dev."); - p->cuda_pkt_vars.cuda_mpm_enabled = 0; - return; - } - *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len; - *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda; - memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len); -#endif - p->cuda_pkt_vars.cuda_mpm_enabled = 1; - SC_ATOMIC_SET(slice->done, 1); - - SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n", - p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda); - - return; -} - -void MpmACCudaRegister(void); -void SCACConstructBoth16and32StateTables(void); -int MpmCudaBufferSetup(void); -int MpmCudaBufferDeSetup(void); -void SCACCudaStartDispatcher(void); -void SCACCudaKillDispatcher(void); -uint32_t SCACCudaPacketResultsProcessing(Packet *p, const MpmCtx *mpm_ctx, - PrefilterRuleStore *pmq); -void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx); - -void CudaReleasePacket(Packet *p); - -#endif /* __SC_CUDA_SUPPORT__ */ - - #endif /* __UTIL_MPM_AC__H__ */ diff --git a/src/util-mpm.c b/src/util-mpm.c index b8b68f26dd..0dfe1793ba 100644 --- a/src/util-mpm.c +++ b/src/util-mpm.c @@ -35,16 +35,11 @@ #include "util-hashlist.h" #include "detect-engine.h" -#include "util-cuda.h" #include "util-misc.h" #include "conf.h" #include "conf-yaml-loader.h" #include "queue.h" #include "util-unittest.h" -#ifdef __SC_CUDA_SUPPORT__ -#include "util-cuda-handlers.h" -#include "detect-engine-mpm.h" -#endif #include "util-memcpy.h" #ifdef BUILD_HYPERSCAN #include "hs.h" @@ -258,142 +253,6 @@ void MpmFactoryDeRegisterAllMpmCtxProfiles(DetectEngineCtx *de_ctx) return; } -#ifdef 
__SC_CUDA_SUPPORT__ - -static void MpmCudaConfFree(void *conf) -{ - SCFree(conf); - return; -} - -static void *MpmCudaConfParse(ConfNode *node) -{ - const char *value; - - MpmCudaConf *conf = SCMalloc(sizeof(MpmCudaConf)); - if (unlikely(conf == NULL)) - exit(EXIT_FAILURE); - memset(conf, 0, sizeof(*conf)); - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "data-buffer-size-min-limit"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT; - } else if (ParseSizeStringU16(value, &conf->data_buffer_size_min_limit) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "data-buffer-size-min-limit - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "data-buffer-size-max-limit"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT; - } else if (ParseSizeStringU16(value, &conf->data_buffer_size_max_limit) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "data-buffer-size-max-limit - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "cudabuffer-buffer-size"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT; - } else if (ParseSizeStringU32(value, &conf->cb_buffer_size) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "cb-buffer-size - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "gpu-transfer-size"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE; - } else if (ParseSizeStringU32(value, &conf->gpu_transfer_size) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "gpu-transfer-size - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "batching-timeout"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT; - } else if ((conf->batching_timeout = atoi(value)) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "batching-timeout - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "device-id"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT; - } else if ((conf->device_id = atoi(value)) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." - "device-id - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - if (node != NULL) - value = ConfNodeLookupChildValue(node, "cuda-streams"); - else - value = NULL; - if (value == NULL) { - /* default */ - conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT; - } else if ((conf->cuda_streams = atoi(value)) < 0) { - SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s." 
- "cuda-streams - \"%s\"", node->name, value); - exit(EXIT_FAILURE); - } - - return conf; -} - -void MpmCudaEnvironmentSetup() -{ - if (PatternMatchDefaultMatcher() != MPM_AC_CUDA) - return; - - CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree); - - MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm"); - if (conf == NULL) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm " - "profile."); - exit(EXIT_FAILURE); - } - - if (MpmCudaBufferSetup() < 0) { - SCLogError(SC_ERR_AC_CUDA_ERROR, "Error setting up env for ac " - "cuda"); - exit(EXIT_FAILURE); - } - - return; -} - -#endif - void MpmInitThreadCtx(MpmThreadCtx *mpm_thread_ctx, uint16_t matcher) { mpm_table[matcher].InitThreadCtx(NULL, mpm_thread_ctx); @@ -446,9 +305,6 @@ void MpmTableSetup(void) MpmHSRegister(); #endif /* HAVE_HS_VALID_PLATFORM */ #endif /* BUILD_HYPERSCAN */ -#ifdef __SC_CUDA_SUPPORT__ - MpmACCudaRegister(); -#endif /* __SC_CUDA_SUPPORT__ */ } int MpmAddPatternCS(struct MpmCtx_ *mpm_ctx, uint8_t *pat, uint16_t patlen, diff --git a/src/util-mpm.h b/src/util-mpm.h index 3b7960a86d..d6a1605139 100644 --- a/src/util-mpm.h +++ b/src/util-mpm.h @@ -33,9 +33,6 @@ enum { /* aho-corasick */ MPM_AC, -#ifdef __SC_CUDA_SUPPORT__ - MPM_AC_CUDA, -#endif MPM_AC_BS, MPM_AC_TILE, MPM_HS, @@ -168,42 +165,6 @@ typedef struct MpmTableElmt_ { MpmTableElmt mpm_table[MPM_TABLE_SIZE]; int mpm_default_matcher; -/* macros decides if cuda is enabled for the platform or not */ -#ifdef __SC_CUDA_SUPPORT__ - -/* the min size limit of a payload(or any other data) to be buffered */ -#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT 0 -/* the max size limit of a payload(or any other data) to be buffered */ -#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT 1500 -/* Default value for data buffer used by cuda mpm engine for CudaBuffer reg */ -#define UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT 500 * 1024 * 1024 -/* Default value for the max data chunk that would be sent to gpu */ -#define UTIL_MPM_CUDA_GPU_TRANSFER_SIZE 50 * 1024 * 1024 -/* Default value for offset/pointer buffer to be used by cuda mpm - * engine for CudaBuffer reg */ -#define UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT 500000 -#define UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT 2000 -#define UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT 2 -#define UTIL_MPM_CUDA_DEVICE_ID_DEFAULT 0 - -/** - * \brief Cuda configuration for "mpm" profile. We can further extend this - * to have conf for specific mpms. For now its common for all mpms. 
- */ -typedef struct MpmCudaConf_ { - uint16_t data_buffer_size_min_limit; - uint16_t data_buffer_size_max_limit; - uint32_t cb_buffer_size; - uint32_t gpu_transfer_size; - int batching_timeout; - int device_id; - int cuda_streams; -} MpmCudaConf; - -void MpmCudaEnvironmentSetup(); - -#endif /* __SC_CUDA_SUPPORT__ */ - struct DetectEngineCtx_; int32_t MpmFactoryRegisterMpmCtxProfile(struct DetectEngineCtx_ *, const char *); diff --git a/src/util-running-modes.c b/src/util-running-modes.c index ebc44a3fb6..e152c5a251 100644 --- a/src/util-running-modes.c +++ b/src/util-running-modes.c @@ -25,7 +25,6 @@ #include "app-layer-detect-proto.h" #include "app-layer.h" #include "app-layer-parser.h" -#include "util-cuda.h" #include "util-unittest.h" #include "util-debug.h" #include "conf-yaml-loader.h" @@ -55,11 +54,3 @@ int ListAppLayerProtocols() exit(EXIT_SUCCESS); } -#ifdef __SC_CUDA_SUPPORT__ -int ListCudaCards() -{ - SCCudaInitCudaEnvironment(); - SCCudaListCards(); - exit(EXIT_SUCCESS); -} -#endif diff --git a/src/util-running-modes.h b/src/util-running-modes.h index 510a86f125..f047e02dde 100644 --- a/src/util-running-modes.h +++ b/src/util-running-modes.h @@ -23,11 +23,7 @@ #ifndef __UTIL_RUNNING_MODES_H__ #define __UTIL_RUNNING_MODES_H__ - int ListKeywords(const char *keyword_info); int ListAppLayerProtocols(void); -#ifdef __SC_CUDA_SUPPORT__ -int ListCudaCards(void); -#endif #endif /* __UTIL_RUNNING_MODES_H__ */ diff --git a/suricata.yaml.in b/suricata.yaml.in index eb89c8c428..4944ba1c81 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -967,10 +967,6 @@ host-mode: auto # Number of packets preallocated per thread. The default is 1024. A higher number # will make sure each CPU will be more easily kept busy, but may negatively # impact caching. -# -# If you are using the CUDA pattern matcher (mpm-algo: ac-cuda), different rules -# apply. In that case try something like 60000 or more. This is because the CUDA -# pattern matcher buffers and scans as many packets as possible in parallel. #max-pending-packets: 1024 # Runmode the engine should use. Please check --list-runmodes to get the available @@ -1345,7 +1341,6 @@ detect: # The supported algorithms are: # "ac" - Aho-Corasick, default implementation # "ac-bs" - Aho-Corasick, reduced memory implementation -# "ac-cuda" - Aho-Corasick, CUDA implementation # "ac-ks" - Aho-Corasick, "Ken Steele" variant # "hs" - Hyperscan, available when built with Hyperscan support # @@ -1358,10 +1353,6 @@ detect: # to be set to "single", because of ac's memory requirements, unless the # ruleset is small enough to fit in one's memory, in which case one can # use "full" with "ac". Rest of the mpms can be run in "full" mode. -# -# There is also a CUDA pattern matcher (only available if Suricata was -# compiled with --enable-cuda: b2g_cuda. Make sure to update your -# max-pending-packets setting above as well if you use b2g_cuda. mpm-algo: auto @@ -1724,40 +1715,6 @@ mpipe: size10386: 0 size16384: 0 -## -## Hardware accelaration -## - -# Cuda configuration. -cuda: - # The "mpm" profile. On not specifying any of these parameters, the engine's - # internal default values are used, which are same as the ones specified in - # in the default conf file. - mpm: - # The minimum length required to buffer data to the gpu. - # Anything below this is MPM'ed on the CPU. - # Can be specified in kb, mb, gb. Just a number indicates it's in bytes. - # A value of 0 indicates there's no limit. 
- data-buffer-size-min-limit: 0 - # The maximum length for data that we would buffer to the gpu. - # Anything over this is MPM'ed on the CPU. - # Can be specified in kb, mb, gb. Just a number indicates it's in bytes. - data-buffer-size-max-limit: 1500 - # The ring buffer size used by the CudaBuffer API to buffer data. - cudabuffer-buffer-size: 500mb - # The max chunk size that can be sent to the gpu in a single go. - gpu-transfer-size: 50mb - # The timeout limit for batching of packets in microseconds. - batching-timeout: 2000 - # The device to use for the mpm. Currently we don't support load balancing - # on multiple gpus. In case you have multiple devices on your system, you - # can specify the device to use, using this conf. By default we hold 0, to - # specify the first device cuda sees. To find out device-id associated with - # the card(s) on the system run "suricata --list-cuda-cards". - device-id: 0 - # No of Cuda streams used for asynchronous processing. All values > 0 are valid. - # For this option you need a device with Compute Capability > 1.0. - cuda-streams: 2 ## ## Include other configs
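Editor's note: the size-style options removed above (cudabuffer-buffer-size: 500mb,
gpu-transfer-size: 50mb) accept kb/mb/gb suffixes, with a bare number meaning bytes;
the removed MpmCudaConfParse() fed them through ParseSizeStringU32(). As a rough
illustration only, not Suricata's parser, such a conversion can be sketched as:

    #include <stdint.h>
    #include <stdlib.h>
    #include <strings.h>

    /* Hypothetical helper: convert "500mb"-style strings to a byte count.
     * The exact behaviour of ParseSizeStringU32() may differ. */
    static int example_parse_size(const char *str, uint64_t *bytes)
    {
        char *end = NULL;
        uint64_t val = strtoull(str, &end, 10);
        if (end == str)
            return -1;
        while (*end == ' ')
            end++;
        if (*end == '\0')
            *bytes = val;                                  /* plain number: bytes */
        else if (strncasecmp(end, "kb", 2) == 0)
            *bytes = val * 1024ULL;
        else if (strncasecmp(end, "mb", 2) == 0)
            *bytes = val * 1024ULL * 1024ULL;
        else if (strncasecmp(end, "gb", 2) == 0)
            *bytes = val * 1024ULL * 1024ULL * 1024ULL;
        else
            return -1;
        return 0;
    }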