--- /dev/null
+# Locate a PCRE source tree to build from. A PCRE_SOURCE that is already set
+# (given on the command line, or remembered from an earlier pass) always wins;
+# otherwise probe the versioned "pcre-<version>" directory first and the plain
+# "pcre" directory second, both under the project source directory.
+if (NOT PCRE_SOURCE)
+    foreach (pcre_candidate_dir pcre-${PCRE_REQUIRED_VERSION} pcre)
+        if (EXISTS ${PROJECT_SOURCE_DIR}/${pcre_candidate_dir})
+            set (PCRE_SOURCE ${PROJECT_SOURCE_DIR}/${pcre_candidate_dir})
+            break ()
+        endif ()
+    endforeach ()
+endif ()
+
+# Whenever a source tree is known, PCRE is built from source.
+if (PCRE_SOURCE)
+    set (PCRE_BUILD_SOURCE TRUE)
+endif ()
+
+# Set up PCRE either from a bundled source tree (PCRE_BUILD_SOURCE) or from a
+# system installation discovered through pkg-config. On success the variables
+# PCRE_INCLUDE_DIRS and PCRE_LDFLAGS are available to the including file.
+if (PCRE_BUILD_SOURCE)
+    # A relative PCRE_SOURCE is resolved against the top-level build directory.
+    if (NOT IS_ABSOLUTE ${PCRE_SOURCE})
+        set(PCRE_SOURCE "${CMAKE_BINARY_DIR}/${PCRE_SOURCE}")
+    endif ()
+
+    if (PCRE_CHECKED)
+        # already processed this file and set up pcre building; only the
+        # result variables have to be re-established. This is done before
+        # CMAKE_REQUIRED_INCLUDES is touched so that the early return cannot
+        # leave a modified value behind in the including file.
+        set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre)
+        set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre)
+        return()
+    endif ()
+
+    # first, check version number
+    set (saved_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
+    # CMAKE_REQUIRED_INCLUDES is a ;-separated list: append, do not space-join.
+    list (APPEND CMAKE_REQUIRED_INCLUDES "${PCRE_SOURCE}")
+    # The probe declares main with an explicit return type; implicit int is
+    # rejected by current C compilers and would fail the check spuriously.
+    CHECK_C_SOURCE_COMPILES("#include <pcre.h.generic>
+        #if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR != ${PCRE_REQUIRED_MINOR_VERSION}
+        #error Incorrect pcre version
+        #endif
+        int main(void) { return 0; }" CORRECT_PCRE_VERSION)
+    set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")
+
+    if (NOT CORRECT_PCRE_VERSION)
+        # Drop the cached failure so the check is repeated on the next run.
+        unset(CORRECT_PCRE_VERSION CACHE)
+        message(FATAL_ERROR "Incorrect version of pcre - version ${PCRE_REQUIRED_VERSION} is required")
+    else()
+        message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} - building from source.")
+    endif()
+
+    # PCRE compile options. option() takes (<name> <help text> <default>), so
+    # the help text is spelled out; the default of every option stays OFF.
+    option(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)" OFF)
+    option(PCRE_BUILD_PCREGREP "Build pcregrep" OFF)
+    option(PCRE_SHOW_REPORT "Show the final PCRE configuration report" OFF)
+    set(PCRE_SUPPORT_UNICODE_PROPERTIES ON CACHE BOOL "Build pcre with unicode")
+    add_subdirectory(${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre EXCLUDE_FROM_ALL)
+    set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre)
+    set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre)
+else ()
+    # pkgconf should save us
+    find_package(PkgConfig)
+    pkg_check_modules(PCRE libpcre=${PCRE_REQUIRED_VERSION})
+    if (PCRE_FOUND)
+        message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION}")
+    else ()
+        message(FATAL_ERROR "PCRE version ${PCRE_REQUIRED_VERSION} not found")
+    endif ()
+endif ()
+
+# Record, in the parent scope, that this file has been processed. The
+# PCRE_CHECKED guard in this file uses the flag to skip the version check and
+# the add_subdirectory() call on a later inclusion.
+set (PCRE_CHECKED TRUE PARENT_SCOPE)
--- /dev/null
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef BOUNDEDQUEUE_H
+#define BOUNDEDQUEUE_H
+
+#include <algorithm>
+#include <cassert>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <type_traits>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+//#define QUEUE_STATS 1
+
+#ifdef QUEUE_STATS
+
+#include <iostream>
+
+// Plain counters describing the traffic seen by one queue. Only compiled in
+// when QUEUE_STATS is defined.
+class BoundedQueueStats {
+public:
+    size_t pop = 0; //!< Number of pop operations.
+    size_t pop_block = 0; //!< Number of pop operations that had to block.
+    size_t push = 0; //!< Number of push operations.
+    size_t push_elements = 0; //!< Number of elements pushed.
+    size_t push_block = 0; //!< Number of push operations that had to block.
+    size_t refill = 0; //!< Number of refills done.
+    size_t stolen_from = 0; //!< Number of times we were stolen from.
+
+    // Write every counter to std::cout, one labelled line per counter.
+    void dump() const {
+        const auto emit = [](const char *label, size_t value) {
+            std::cout << label << value << std::endl;
+        };
+        emit("pop : ", pop);
+        emit("pop_block : ", pop_block);
+        emit("push : ", push);
+        emit("push_elements : ", push_elements);
+        emit("push_block : ", push_block);
+        emit("refill : ", refill);
+        emit("stolen_from : ", stolen_from);
+    }
+};
+#endif
+
+// Bounded producer/consumer queue of std::unique_ptr<T>.
+//
+// Producers push into a single global queue and block while it already holds
+// max_elements entries. Each consumer owns a private queue that is refilled
+// from the global queue; a consumer with nothing to do may also steal from
+// another consumer's queue. A null pointer element is treated as a sentinel.
+// T must expose a member named `id`: refills group consecutive elements that
+// share the same id.
+template<typename T>
+class BoundedQueue : boost::noncopyable {
+private:
+ // Encapsulates a queue and the mutex used to protect access to it.
+ // The forwarded operations do not take the mutex themselves; callers are
+ // expected to hold the lock obtained from lock().
+ class MutexQueue {
+ public:
+ // Forwarded queue operations.
+ void push(std::unique_ptr<T> elem) { q.push(std::move(elem)); }
+ void pop() { q.pop(); }
+ std::unique_ptr<T> &front() { return q.front(); }
+ bool empty() const { return q.empty(); }
+ size_t size() const { return q.size(); }
+
+ // Acquire the mutex lock.
+ // The returned unique_lock owns the mutex until it is destroyed.
+ std::unique_lock<std::mutex> lock() {
+ return std::unique_lock<std::mutex>(mutex);
+ }
+
+#ifdef QUEUE_STATS
+ BoundedQueueStats stats;
+#endif
+
+ private:
+ std::mutex mutex;
+ std::queue<std::unique_ptr<T>> q;
+ };
+
+public:
+ // Create a queue serving `consumers` consumer queues whose global queue
+ // blocks producers once it holds `size` elements. Both must be non-zero.
+ BoundedQueue(size_t consumers, size_t size)
+ : max_elements(size), consumer_q(consumers) {
+ assert(consumers > 0);
+ assert(size > 0);
+ }
+
+#ifdef QUEUE_STATS
+ // Dump the statistics of the global queue and of every consumer queue.
+ ~BoundedQueue() {
+ std::cout << "Global queue stats:" << std::endl;
+ global_q.stats.dump();
+ std::cout << std::endl;
+ for (size_t i = 0; i < consumer_q.size(); i++) {
+ std::cout << "Consumer queue " << i << ":" << std::endl;
+ consumer_q[i].stats.dump();
+ std::cout << std::endl;
+ }
+ }
+#endif // QUEUE_STATS
+
+ // Push a single element onto the global queue, blocking while the global
+ // queue is full, then wake all waiting consumers.
+ void push(std::unique_ptr<T> elem) {
+ auto lock = global_q.lock();
+
+#ifdef QUEUE_STATS
+ global_q.stats.push++;
+ global_q.stats.push_elements++;
+ if (global_q.size() >= max_elements) {
+ global_q.stats.push_block++;
+ }
+#endif // QUEUE_STATS
+
+ // Block until queue is able to accept new elements.
+ cond_can_accept.wait(lock,
+ [&] { return global_q.size() < max_elements; });
+ assert(global_q.size() < max_elements);
+
+ global_q.push(std::move(elem));
+ cond_can_consume.notify_all();
+ }
+
+ // Bulk push: move every element of [begin, end) onto the global queue
+ // under a single lock acquisition. The iterators must refer to
+ // std::unique_ptr<T>. Only the first element waits for free space, so the
+ // global queue may end up holding more than max_elements entries.
+ template<class Iter>
+ void push(Iter begin, Iter end) {
+ using ElemType = typename std::remove_reference<decltype(*begin)>::type;
+ static_assert(std::is_same<ElemType, std::unique_ptr<T>>::value,
+ "Iterator must be over unique_ptr<T>");
+
+ // Nothing to push; do not take the lock or wake anyone.
+ if (begin == end) {
+ return;
+ }
+
+ auto lock = global_q.lock();
+
+#ifdef QUEUE_STATS
+ global_q.stats.push++;
+ global_q.stats.push_elements += std::distance(begin, end);
+ if (global_q.size() >= max_elements) {
+ global_q.stats.push_block++;
+ }
+#endif // QUEUE_STATS
+
+ // Block until queue is able to accept new elements.
+ cond_can_accept.wait(lock,
+ [&] { return global_q.size() < max_elements; });
+ assert(global_q.size() < max_elements);
+
+ for (auto it = begin; it != end; ++it) {
+ global_q.push(std::move(*it));
+ }
+ cond_can_consume.notify_all();
+ }
+
+ // Fetch the next element for consumer `consumer_id`, blocking if nothing
+ // is available. Sources are tried in this order: the consumer's own
+ // queue, a refill from the global queue, stealing from another consumer,
+ // and finally waiting for the global queue to become non-empty.
+ // The result may be a null pointer (a sentinel element).
+ std::unique_ptr<T> pop(size_t consumer_id) {
+ assert(consumer_id < consumer_q.size());
+ auto &q = consumer_q[consumer_id];
+
+ // Try and satisfy the request from our per-consumer queue.
+ {
+ auto consumer_lock = q.lock();
+ if (!q.empty()) {
+ return pop_from_queue(q);
+ }
+ }
+
+ // Try and satisfy the request with a refill from the global queue.
+ // The global lock is taken before the consumer lock.
+ {
+ auto lock = global_q.lock();
+ if (!global_q.empty()) {
+ auto consumer_lock = q.lock();
+ return refill_and_pop(q);
+ }
+ }
+
+ // Try and satisfy the request by stealing it from another queue.
+ // Victims are visited round-robin starting after our own index; only
+ // the victim's lock is held while inspecting it.
+ for (size_t i = 1; i < consumer_q.size(); i++) {
+ size_t victim_id = (consumer_id + i) % consumer_q.size();
+ auto &victim_q = consumer_q[victim_id];
+ auto victim_lock = victim_q.lock();
+ // Note: we don't steal sentinel elements.
+ if (!victim_q.empty() && victim_q.front() != nullptr) {
+#ifdef QUEUE_STATS
+ victim_q.stats.stolen_from++;
+#endif
+ return pop_from_queue(victim_q);
+ }
+ }
+
+ // All avenues exhausted, we must block until we've received a new
+ // element.
+ auto lock = global_q.lock();
+#ifdef QUEUE_STATS
+ global_q.stats.pop_block++;
+#endif
+ cond_can_consume.wait(lock, [&]{ return !global_q.empty(); });
+ assert(!global_q.empty());
+ auto consumer_lock = q.lock();
+ return refill_and_pop(q);
+ }
+
+private:
+ // Remove and return the front element of `q`, which must not be empty.
+ // No locking is done here; every call site in this class holds the lock
+ // of the queue it passes in.
+ std::unique_ptr<T> pop_from_queue(MutexQueue &q) {
+ assert(!q.empty());
+ auto elem = std::move(q.front());
+ q.pop();
+#ifdef QUEUE_STATS
+ q.stats.pop++;
+#endif
+ return elem;
+ }
+
+ // Take the front element of the global queue as the return value and move
+ // the elements that follow it into consumer queue `q` for as long as they
+ // carry the same id. The global queue must not be empty. Both call sites
+ // hold the global lock and the lock of `q`.
+ std::unique_ptr<T> refill_and_pop(MutexQueue &q) {
+ assert(!global_q.empty());
+
+#ifdef QUEUE_STATS
+ q.stats.refill++;
+#endif
+
+ auto elem = pop_from_queue(global_q);
+ if (elem == nullptr) {
+ return elem; // Sentinel.
+ }
+
+ // Grab all subsequent elements that share the same ID.
+ const auto &id = elem->id;
+ while (!global_q.empty()) {
+ auto &first = global_q.front();
+ if (first == nullptr) {
+#ifdef QUEUE_STATS
+ q.stats.push++;
+ q.stats.push_elements++;
+#endif
+ // Sentinel element. We can grab one, but no more.
+ q.push(pop_from_queue(global_q));
+ break;
+ }
+ if (first->id != id) {
+ break;
+ }
+#ifdef QUEUE_STATS
+ q.stats.push++;
+ q.stats.push_elements++;
+#endif
+ q.push(pop_from_queue(global_q));
+ }
+
+ // Space may have been freed in the global queue: wake blocked producers.
+ if (global_q.size() < max_elements) {
+ cond_can_accept.notify_all();
+ }
+
+ return elem;
+ }
+
+ // Maximum number of elements in the global queue (subsequent push
+ // operations will block). Note that we may overshoot this value when
+ // handling bulk push operations.
+ const size_t max_elements;
+
+ // Global queue.
+ MutexQueue global_q;
+
+ // Per-consumer queues.
+ std::vector<MutexQueue> consumer_q;
+
+ // Condition variable for producers to wait on when the queue is full.
+ std::condition_variable cond_can_accept;
+
+ // Condition variable for consumers to wait on when the queue is empty.
+ std::condition_variable cond_can_consume;
+};
+
+#ifdef QUEUE_STATS
+#undef QUEUE_STATS
+#endif
+
+#endif // BOUNDEDQUEUE_H
--- /dev/null
+# we have a fixed requirement for PCRE
+# The major/minor pair is combined into PCRE_REQUIRED_VERSION; all three
+# variables are read by pcre.cmake, which is included right below.
+set(PCRE_REQUIRED_MAJOR_VERSION 8)
+set(PCRE_REQUIRED_MINOR_VERSION 41)
+set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})
+
+# Locates or builds PCRE and provides PCRE_INCLUDE_DIRS and PCRE_LDFLAGS.
+include (${CMAKE_MODULE_PATH}/pcre.cmake)
+
+include_directories(${PCRE_INCLUDE_DIRS})
+
+# NOTE(review): presumably provides HAVE_BACKTRACE, BACKTRACE_CFLAGS and
+# BACKTRACE_LDFLAGS, which are used further down - confirm in backtrace.cmake.
+include(${CMAKE_MODULE_PATH}/backtrace.cmake)
+
+# we need static libs - too much deep magic for shared libs
+# Nothing below this point is configured unless BUILD_STATIC_LIBS is set.
+if (NOT BUILD_STATIC_LIBS)
+ return ()
+endif ()
+
+# Probe for optional functions; each result is stored in the HAVE_* variable.
+CHECK_FUNCTION_EXISTS(sigaltstack HAVE_SIGALTSTACK)
+CHECK_FUNCTION_EXISTS(sigaction HAVE_SIGACTION)
+CHECK_FUNCTION_EXISTS(setrlimit HAVE_SETRLIMIT)
+
+# ColliderCorporaParser.cpp lives in the binary directory, so it is compiled
+# with the Ragel-specific flags and with this source directory on its include
+# path.
+set_source_files_properties(
+ ${CMAKE_CURRENT_BINARY_DIR}/ColliderCorporaParser.cpp
+ PROPERTIES
+ COMPILE_FLAGS "${RAGEL_C_FLAGS} -I${CMAKE_CURRENT_SOURCE_DIR}")
+
+# NOTE(review): ragelmaker is defined elsewhere; presumably it generates
+# ColliderCorporaParser.cpp from the .rl file and defines the
+# ragel_ColliderCorporaParser target - confirm.
+ragelmaker(ColliderCorporaParser.rl)
+
+# only set these after all tests are done
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
+
+# Sources and headers that make up the hscollider executable.
+SET(hscollider_SOURCES
+    common.h
+    BoundedQueue.h
+    Corpora.cpp
+    FileCorpora.h
+    FileCorpora.cpp
+    ColliderCorporaParser.h
+    ColliderCorporaParser.cpp
+    NfaGeneratedCorpora.h
+    NfaGeneratedCorpora.cpp
+    GraphTruth.h
+    GraphTruth.cpp
+    GroundTruth.h
+    GroundTruth.cpp
+    UltimateTruth.h
+    UltimateTruth.cpp
+    ResultSet.h
+    args.cpp
+    args.h
+    limit.cpp
+    pcre_util.cpp
+    sig.cpp
+    sig.h
+    DatabaseProxy.h
+    Thread.h
+    Thread.cpp
+    main.cpp
+)
+
+set_source_files_properties(${hscollider_SOURCES} PROPERTIES
+    INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(hscollider ${hscollider_SOURCES})
+add_dependencies(hscollider ragel_ColliderCorporaParser)
+# pcre.cmake only adds the PCRE subdirectory when PCRE is built from source;
+# on the pkg-config path there is no "pcre" target to depend on, so the
+# dependency is added only when such a target actually exists.
+if (TARGET pcre)
+    add_dependencies(hscollider pcre)
+endif ()
+
+if(NOT WIN32)
+    target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
+        expressionutil corpusomatic crosscompileutil pthread
+        "${BACKTRACE_LDFLAGS}")
+
+    if(HAVE_BACKTRACE)
+        # Apply the backtrace flags to the listed sources. The variable must
+        # be expanded (a bare name would address a file literally called
+        # "hscollider_SOURCES"), and the flags are appended so that
+        # COMPILE_FLAGS already set on a source are kept.
+        set_property(SOURCE ${hscollider_SOURCES} APPEND_STRING PROPERTY
+            COMPILE_FLAGS " ${BACKTRACE_CFLAGS}")
+    endif()
+else() # WIN32
+    target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil
+        expressionutil corpusomatic crosscompileutil)
+endif()
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FILECORPORAPARSER_H
+#define FILECORPORAPARSER_H
+
+#include <string>
+
+struct Corpus;
+
+// Parse one line of a corpus file, turning the escaped corpus text into a
+// real data buffer.
+//
+// Two line formats are accepted: the old `<id>:<corpus>` form and the new
+// `<id>="<corpus>":<match list>` form. The decoded bytes are appended to
+// c.data and the leading number is written to `id`. For the new form
+// c.hasMatches is set and the listed numbers are inserted into c.matches.
+//
+// Returns true when the parser did not end in its error state and consumed
+// the whole line.
+bool parseCorpus(const std::string &line, Corpus &c, unsigned int &id);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ColliderCorporaParser.h"
+#include "Corpora.h"
+
+#include "ue2common.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <cstdio>
+
+using namespace std;
+
+namespace /* anonymous */ {
+
+// Take a string like '\xFF' and convert it to the character it represents.
+//
+// [start, end) must span exactly four characters: a backslash, 'x' and two
+// hexadecimal digits. `end` is only used for the sanity check.
+char unhex(const char *start, UNUSED const char *end) {
+    assert(start + 4 == end);
+    assert(start[0] == '\\');
+    assert(start[1] == 'x');
+    // Check BOTH digits, and go through unsigned char: passing a negative
+    // char value to isxdigit is undefined behaviour.
+    assert(isxdigit(static_cast<unsigned char>(start[2])));
+    assert(isxdigit(static_cast<unsigned char>(start[3])));
+
+    const char temp[3] = {start[2], start[3], 0};
+
+    return static_cast<char>(strtol(temp, nullptr, 16));
+}
+
+%%{
+    machine FileCorporaParser;
+
+    # Fold the current decimal digit into the running number.
+    action accumulateNum {
+        num = (num * 10) + (fc - '0');
+    }
+
+    # Append the byte described by a \xHH escape.
+    action handleHexEscaped {
+        sout.push_back(unhex(ts, te));
+    }
+
+    # Append the control character named by a single-letter escape.
+    action handleSpecial {
+        switch (*(ts+1)) {
+        case '0': sout.push_back('\x00'); break;
+        case 'a': sout.push_back('\x07'); break;
+        case 'e': sout.push_back('\x1b'); break;
+        case 'f': sout.push_back('\x0c'); break;
+        case 'n': sout.push_back('\x0a'); break;
+        case 'v': sout.push_back('\x0b'); break;
+        case 'r': sout.push_back('\x0d'); break;
+        case 't': sout.push_back('\x09'); break;
+        default: fbreak;
+        }
+    }
+
+    # Record the number just scanned as a match offset.
+    action handleMatch {
+        c.matches.insert(num);
+    }
+
+    write data;
+}%%
+
+} // namespace
+
+// Parse one corpus line. The leading number is written to `id`, the decoded
+// corpus bytes are appended to c.data, and for the new line format
+// c.hasMatches is set and the listed match numbers are added to c.matches.
+// Returns true only if the machine did not finish in its error state and the
+// whole line was consumed.
+bool parseCorpus(const string &line, Corpus &c, unsigned int &id) {
+ // Input window handed to the generated scanner: [p, pe), with eof == pe.
+ const char *p = line.c_str();
+ const char *pe = p + line.size();
+ const char *eof = pe;
+ // Token start/end, read by the scanner actions (e.g. unhex(ts, te)).
+ const char *ts;
+ const char *te;
+ // Machine state; compared against FileCorporaParser_error on return.
+ int cs;
+ // NOTE(review): presumably used only by the generated scanner code - confirm.
+ UNUSED int act;
+
+ // For storing integers as they're scanned
+ unsigned int num = 0;
+
+ // Decoded bytes are appended straight onto the corpus data.
+ string &sout = c.data;
+
+ %%{
+ id = ( digit @accumulateNum)+ >{num = 0;} @{id = num;};
+
+ backslashed = '\\' ^alnum;
+ specials = '\\' [0aefnvrt];
+ hexescaped = '\\x' xdigit{2};
+
+ corpus_old := |*
+ hexescaped => handleHexEscaped;
+ specials => handleSpecial;
+ backslashed => { sout.push_back(*(ts + 1)); };
+ any => { sout.push_back(*ts); };
+ *|;
+
+ corpus_new := |*
+ hexescaped => handleHexEscaped;
+ specials => handleSpecial;
+ backslashed => { sout.push_back(*(ts + 1)); };
+ any - '"' => { sout.push_back(*ts); };
+ '"' => { fgoto colon_sep; };
+ *|;
+
+ colon_sep := |*
+ ':' => {fgoto match_list; };
+ *|;
+
+ match_list := |*
+ (' '* (digit @accumulateNum)+ ' '* ','?) >{num = 0;} => handleMatch;
+ *|;
+
+ # Old simple line format
+ line_old = id ':' @{ fgoto corpus_old; };
+
+ # New line format with matches
+ line_new = id "=\"" @{ c.hasMatches = true; fgoto corpus_new; };
+
+ main := ( line_new | line_old );
+
+ # Initialize and execute
+ write init;
+ write exec;
+ }%%
+
+ // Success requires a non-error final state and no unconsumed input.
+ return (cs != FileCorporaParser_error) && (p == pe);
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "Corpora.h"
+
+// Out-of-line definition of the virtual destructor; it has nothing to do.
+CorporaSource::~CorporaSource() = default;
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CORPORA_H
+#define CORPORA_H
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+// One corpus: the data to scan plus, optionally, its pre-calculated matches.
+struct Corpus {
+    // Empty corpus without pre-calculated matches.
+    Corpus() = default;
+
+    // Corpus holding a copy of `s`, without pre-calculated matches.
+    explicit Corpus(const std::string &s) : data(s) {}
+
+    std::string data; // Corpus itself
+    bool hasMatches = false; // Have the matches been pre-calculated?
+    std::set<unsigned int> matches; // end-offsets of matches
+};
+
+// Describes a failure to provide corpora; carries a human-readable message.
+struct CorpusFailure {
+    std::string message;
+
+    explicit CorpusFailure(const std::string &s)
+        : message(s) {
+    }
+};
+
+// Abstract class for a corpora source: new ways to load or generate corpora
+// can be written by subclassing this class and providing its clone and
+// generate methods. Instances cannot be copied directly; use clone().
+class CorporaSource : boost::noncopyable {
+public:
+ // Virtual destructor, defined out of line.
+ virtual ~CorporaSource();
+
+ // Make a copy of this corpora source.
+ virtual CorporaSource *clone() const = 0;
+
+ // Generate corpora for the given signature ID, adding them to the
+ // vector of Corpus objects provided.
+ virtual void generate(unsigned id, std::vector<Corpus> &data) = 0;
+};
+
+#endif // CORPORA_H
--- /dev/null
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef UE2COLLIDER_DATABASEPROXY_H
+#define UE2COLLIDER_DATABASEPROXY_H
+
+#include "UltimateTruth.h"
+
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+
+#include <boost/core/noncopyable.hpp>
+
+/**
+ * Thrown the first time a database fails to compile, so that the compilation
+ * error can be reported to the user. Later requests for the same database
+ * return nullptr instead of throwing again.
+ */
+struct CompileFailed {
+    std::string error;
+
+    explicit CompileFailed(const std::string &err)
+        : error(err) {
+    }
+};
+
+// Hands out a shared HyperscanDB, compiling it on first request when it was
+// not supplied ready-made. get() serialises its callers on an internal mutex.
+class DatabaseProxy : boost::noncopyable {
+public:
+    // Remember the expression IDs; the database is compiled on demand.
+    explicit DatabaseProxy(const std::set<unsigned> &expr_ids)
+        : ids(expr_ids) {}
+
+    // Wrap a database that has already been built.
+    explicit DatabaseProxy(std::shared_ptr<HyperscanDB> built_db)
+        : db(built_db) {}
+
+    // Return the database, compiling it with `ultimate` if that has not
+    // happened yet. The first failed compile throws CompileFailed; every
+    // later call returns nullptr.
+    std::shared_ptr<HyperscanDB> get(const UltimateTruth &ultimate) {
+        std::lock_guard<std::mutex> guard(mutex);
+
+        // We have previously failed to compile this database.
+        if (failed) {
+            return nullptr;
+        }
+
+        if (!db) {
+            // Database hasn't been compiled yet.
+            std::string error;
+            db = ultimate.compile(ids, error);
+            if (!db) {
+                failed = true;
+                throw CompileFailed(error);
+            }
+        }
+
+        return db;
+    }
+
+private:
+    std::mutex mutex;
+    std::shared_ptr<HyperscanDB> db;
+    std::set<unsigned> ids;
+    bool failed = false; // Database failed compilation.
+};
+
+#endif // UE2COLLIDER_DATABASEPROXY_H
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ColliderCorporaParser.h"
+#include "FileCorpora.h"
+#include "common.h"
+#include "util/expression_path.h"
+
+#include <iostream>
+#include <fstream>
+
+#include <boost/algorithm/string/trim.hpp>
+
+using namespace std;
+
+// A line is skipped when it has no characters at all or starts with '#'.
+static
+bool emptyLine(const string& line) {
+    if (line.empty()) {
+        return true;
+    }
+    return line.front() == '#';
+}
+
+// Allocate a new FileCorpora carrying a copy of every corpus loaded so far.
+// The returned object is allocated with new.
+FileCorpora *FileCorpora::clone() const {
+    auto *duplicate = new FileCorpora();
+    duplicate->corpora_by_pat = corpora_by_pat;
+    return duplicate;
+}
+
+// Parse a single corpus line and file the result under its pattern ID.
+// Returns false, storing nothing, when the line cannot be parsed.
+bool FileCorpora::readLine(const string &line) {
+    Corpus parsed;
+    unsigned pattern_id = 0;
+    if (!parseCorpus(line, parsed, pattern_id)) {
+        return false;
+    }
+
+    corpora_by_pat[pattern_id].push_back(parsed);
+    return true;
+}
+
+// Load corpora from `filename`, one corpus per line. Lines are trimmed first;
+// blank lines and lines starting with '#' are ignored. Returns false if the
+// file cannot be opened, if a line fails to parse (the 1-based line number is
+// reported on stderr), or if no corpora are held once the file has been read.
+bool FileCorpora::readFile(const string &filename) {
+    ifstream input(filename.c_str());
+    if (!input.good()) {
+        return false;
+    }
+
+    string text;
+    for (unsigned lineNum = 1; getline(input, text); ++lineNum) {
+        boost::trim(text);
+
+        if (emptyLine(text)) {
+            continue;
+        }
+        if (!readLine(text)) {
+            cerr << "Error in corpora file parsing line " << lineNum << endl;
+            return false;
+        }
+    }
+
+    return !corpora_by_pat.empty();
+}
+
+// Append every corpus stored for pattern `id` to `data`. Throws CorpusFailure
+// when nothing is stored for that pattern.
+void FileCorpora::generate(unsigned id, vector<Corpus> &data) {
+    const auto found = corpora_by_pat.find(id);
+    const bool missing =
+        (found == corpora_by_pat.end()) || found->second.empty();
+    if (missing) {
+        throw CorpusFailure("no corpora found for pattern.");
+    }
+
+    const auto &stored = found->second;
+    data.insert(data.end(), stored.begin(), stored.end());
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FILECORPORA_H
+#define FILECORPORA_H
+
+#include "Corpora.h"
+
+#include <string>
+#include <vector>
+#include <list>
+#include <map>
+
+// Corpora source backed by a corpus file: corpora are read with readFile()
+// and handed out per pattern ID by generate().
+class FileCorpora : public CorporaSource {
+public:
+ // Make a copy of this source, including the corpora stored in it.
+ FileCorpora *clone() const override;
+
+ // Read corpora in from a file; returns false on failure.
+ bool readFile(const std::string &filename);
+
+ // Append the corpora stored for pattern `id` to `data`.
+ void generate(unsigned id, std::vector<Corpus> &data) override;
+
+private:
+ // Parse one line from our file and store the resulting corpus.
+ bool readLine(const std::string &line);
+
+ // Corpora grouped by pattern ID.
+ std::map<unsigned, std::list<Corpus>> corpora_by_pat;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "GraphTruth.h"
+
+#include "common.h"
+#include "expressions.h"
+#include "ExpressionParser.h"
+#include "ng_find_matches.h"
+#include "pcre_util.h"
+
+#include "grey.h"
+#include "hs_compile.h"
+#include "ue2common.h"
+#include "compiler/compiler.h"
+#include "nfagraph/ng.h"
+#include "nfagraph/ng_depth.h"
+#include "nfagraph/ng_dump.h"
+#include "nfagraph/ng_fuzzy.h"
+#include "nfagraph/ng_holder.h"
+#include "nfagraph/ng_util.h"
+#include "parser/Parser.h"
+#include "parser/unsupported.h"
+#include "util/compile_context.h"
+#include "util/make_unique.h"
+#include "util/report_manager.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <ostream>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace ue2;
+
+// Holds the actual compiled NFA graph and the ReportManager built with it; both are moved in and owned here.
+class CompiledNG : boost::noncopyable {
+public:
+ CompiledNG(unique_ptr<NGHolder> g_in,
+ unique_ptr<ReportManager> rm_in)
+ : g(std::move(g_in)), rm(std::move(rm_in)) {}
+ unique_ptr<ue2::NGHolder> g; // the graph; dereferenced by GraphTruth::run for findMatches
+ unique_ptr<ue2::ReportManager> rm; // report manager passed to findMatches with the graph
+};
+
+// Transfers raw (start, end) match offsets into a ResultSet.
+//
+// Each match is first shifted back by the padding that the stream-offset test
+// prepends (when g_streamOffset is set), then dropped if it violates the
+// pattern's min_offset / max_offset / min_length constraints. Start offsets
+// are reported as zero unless the pattern asked for SOM.
+static
+void populateMatchSet(ResultSet &rs, const set<pair<size_t, size_t>> &matches,
+                      const CNGInfo &cngi) {
+    // Padding imposed by the offset test, capped at 100 bytes. It is zero when
+    // the test is off, which turns the adjustments below into no-ops.
+    const u64a shift = g_streamOffset ? min(100ull, g_streamOffset) : 0;
+
+    for (const auto &span : matches) {
+        u64a start = span.first;
+        u64a end = span.second;
+
+        assert(end >= shift);
+        start -= min(shift, start); // clamp at zero rather than wrapping
+        end -= shift;
+
+        const bool within_bounds = end >= cngi.min_offset &&
+                                   end <= cngi.max_offset &&
+                                   end - start >= cngi.min_length;
+        if (!within_bounds) {
+            // this match does not satisfy extparams constraints
+            DEBUG_PRINTF("skipping NFA Match @ (%llu,%llu)\n", start, end);
+            continue;
+        }
+
+        rs.addMatch(cngi.som ? start : 0, end);
+    }
+}
+// Records the expression ID and a reference to the expression map; the graph itself is built later by compile().
+CNGInfo::CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in)
+ : id(id_in), m_expr(m_expr_in) {}
+// Defined in this file, where CompiledNG is a complete type, so the unique_ptr<CompiledNG> member can be destroyed.
+CNGInfo::~CNGInfo() = default;
+
+// Parses this object's expression and builds its NFA graph, storing the
+// result in `cng`.
+//
+// The expression is looked up by `id` in the expression map, its Hyperscan
+// flags and extended parameters are decoded, the global UTF-8 and
+// edit-distance overrides are applied, unsupported constructs are rejected,
+// and finally the graph is built and (for approximate matching) validated.
+//
+// Every failure leaves this function as an NGCompileFailure whose msg
+// describes the cause.
+void CNGInfo::compile() {
+    auto i = m_expr.find(id);
+    if (i == m_expr.end()) {
+        throw NGCompileFailure("ID not found in expression map.");
+    }
+
+    string re;
+    unsigned hs_flags = 0;
+    hs_expr_ext ext;
+
+    // read the flags for NFA compiler
+    if (!readExpression(i->second, re, &hs_flags, &ext)) {
+        throw NGCompileFailure("Cannot parse expression flags.");
+    }
+    // make sure we respect collider's UTF-8 setting
+    if (force_utf8) {
+        hs_flags |= HS_FLAG_UTF8;
+    }
+
+    try {
+        const bool isStreaming = colliderMode == MODE_STREAMING;
+        const bool isVectored = colliderMode == MODE_VECTORED;
+        CompileContext cc(isStreaming, isVectored, get_current_target(),
+                          Grey());
+        ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext);
+
+        // UE-2850: ParsedExpression may have updated the utf8 flag if the
+        // original expression starts with (*UTF8)
+        utf8 |= pe.expr.utf8;
+
+        auto rm = ue2::make_unique<ReportManager>(cc.grey);
+
+        // Expressions containing zero-width assertions and other extended pcre
+        // types aren't supported yet. This call will throw a ParseError
+        // exception if the component tree contains such a construct.
+        checkUnsupported(*pe.component);
+
+        pe.component->checkEmbeddedStartAnchor(true);
+        pe.component->checkEmbeddedEndAnchor(true);
+
+        // edit distance may be set globally
+        if (force_edit_distance) {
+            pe.expr.edit_distance = edit_distance;
+        }
+
+        // validate_fuzzy_compile checks this, but we don't need to build the
+        // graph to know it will fail
+        if (pe.expr.edit_distance && utf8) {
+            throw NGCompileFailure("UTF-8 patterns cannot be "
+                                   "approximately matched");
+        }
+
+        auto built_expr = buildGraph(*rm, cc, pe);
+        auto &expr = built_expr.expr;
+        auto &g = built_expr.g;
+
+        if (expr.edit_distance || expr.hamm_distance) {
+            // check if this pattern can be approximately matched, throws
+            // CompileError on failure
+            const bool hamming = expr.hamm_distance > 0;
+            const u32 e_dist =
+                hamming ? expr.hamm_distance : expr.edit_distance;
+            validate_fuzzy_compile(*g, e_dist, hamming, utf8, cc.grey);
+        }
+
+        if (isVacuous(*g)) {
+            if (som) {
+                throw NGUnsupportedFailure("Vacuous patterns are not supported "
+                                           "in SOM mode");
+            }
+            if (expr.min_length > 0) {
+                throw NGUnsupportedFailure("Vacuous patterns are not supported "
+                                           "in combination with min_length");
+            }
+        }
+
+        // Qualified explicitly: with both std and ue2 pulled in by using
+        // directives, an unqualified make_unique can be ambiguous.
+        cng = ue2::make_unique<CompiledNG>(std::move(g), std::move(rm));
+    } catch (const NGCompileFailure &) {
+        // Already carries a specific message (e.g. the UTF-8 rejection
+        // above); rethrow it untouched so the catch-all below does not
+        // replace it with the generic text.
+        throw;
+    } catch (const CompileError &e) {
+        throw NGCompileFailure(e.reason);
+    } catch (const NGUnsupportedFailure &e) {
+        throw NGCompileFailure(e.msg);
+    } catch (...) {
+        throw NGCompileFailure("NFA graph construction failed");
+    }
+}
+// Binds the output stream (used to echo matches) and the expression map; both are held by reference.
+GraphTruth::GraphTruth(ostream &os, const ExpressionMap &expr)
+ : out(os), m_expr(expr) {}
+
+// Decodes the flags and extended parameters of expression `id` and returns
+// them in a new CNGInfo. The NFA graph itself is not built here.
+//
+// The global overrides (force_utf8, force_edit_distance, somFlags,
+// force_prefilter) are applied on top of what the expression specifies.
+//
+// Throws NGCompileFailure if the ID is unknown or the expression / flags
+// cannot be decoded, and NGUnsupportedFailure if the expression uses extended
+// flags outside the supported set, unless ignoreUnsupported is true.
+unique_ptr<CNGInfo> GraphTruth::preprocess(unsigned id,
+                                           bool ignoreUnsupported) {
+    bool highlander = false;
+    bool prefilter = false;
+    bool som = false;
+
+    auto i = m_expr.find(id);
+    if (i == m_expr.end()) {
+        throw NGCompileFailure("ID not found in expression map.");
+    }
+
+    string re;
+    unsigned pcre_flags = 0; // required by getPcreFlags(), otherwise unused
+    unsigned hs_flags = 0;
+    hs_expr_ext ext;
+
+    // read the flags for NFA compiler
+    if (!readExpression(i->second, re, &hs_flags, &ext)) {
+        throw NGCompileFailure("Cannot parse expression flags.");
+    }
+    // read PCRE flags
+    if (!getPcreFlags(hs_flags, &pcre_flags, &highlander, &prefilter, &som)) {
+        throw NGCompileFailure("Cannot get PCRE flags.");
+    }
+    if (force_utf8) {
+        hs_flags |= HS_FLAG_UTF8;
+    }
+
+    // edit distance might be set globally
+    if (force_edit_distance) {
+        ext.edit_distance = edit_distance;
+    }
+
+    // SOM flags might be set globally.
+    som |= !!somFlags;
+
+    if (force_prefilter) {
+        prefilter = true;
+    }
+
+    const u64a supported_flags =
+        HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_EDIT_DISTANCE |
+        HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
+        HS_EXT_FLAG_MIN_LENGTH;
+    if ((ext.flags & ~supported_flags) && !ignoreUnsupported) {
+        throw NGUnsupportedFailure("Unsupported extended flags specified.");
+    }
+
+    // Qualified explicitly: with both std and ue2 pulled in by using
+    // directives, an unqualified make_unique can be ambiguous.
+    auto cngi = ue2::make_unique<CNGInfo>(id, m_expr);
+    cngi->utf8 = (hs_flags & HS_FLAG_UTF8) != 0;
+    cngi->highlander = highlander;
+    cngi->prefilter = prefilter;
+    cngi->som = som;
+    cngi->min_offset = ext.min_offset;
+    cngi->max_offset = ext.max_offset;
+    cngi->min_length = ext.min_length;
+    cngi->max_edit_distance = ext.edit_distance;
+    cngi->max_hamm_distance = ext.hamming_distance;
+
+    return cngi;
+}
+
+// Scans `buffer` with the compiled NFA graph and stores the resulting matches
+// in `rs`. Returns false as soon as any findMatches() call fails, true
+// otherwise. The ID and error-string parameters are unused.
+//
+// When g_streamOffset is set, the corpus is scanned behind a block of NUL
+// padding (at most 100 bytes); matches that the padding alone produces are
+// discarded.
+bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi,
+                     const string &buffer, ResultSet &rs, string &) {
+    // All scans share everything except the data, the destination set and the
+    // seventh findMatches() argument (the comments in the original code call
+    // it notEod).
+    auto scan = [&](const string &data, set<pair<size_t, size_t>> &found,
+                    bool notEod) {
+        return findMatches(*cng.g, *cng.rm, data, found,
+                           cngi.max_edit_distance, cngi.max_hamm_distance,
+                           notEod, cngi.utf8);
+    };
+
+    set<pair<size_t, size_t>> matches;
+
+    if (!g_streamOffset) {
+        if (!scan(buffer, matches, false)) {
+            return false;
+        }
+    } else {
+        size_t pad_len = MIN(100, g_streamOffset);
+        assert(pad_len > 0);
+        const string padding(pad_len, '\0');
+
+        // First, scan the padding on its own so that the matches it produces
+        // can be discarded after the real scan. notEod is set so that
+        // end-anchors in the expression don't match at the end of the
+        // padding.
+        set<pair<size_t, size_t>> padding_matches;
+        if (!scan(padding, padding_matches, true)) {
+            return false;
+        }
+
+        // Real scan.
+        if (!scan(padding + buffer, matches, false)) {
+            return false;
+        }
+
+        // Erase any matches due entirely to the padding.
+        for (const auto &pm : padding_matches) {
+            matches.erase(pm);
+        }
+    }
+
+    populateMatchSet(rs, matches, cngi);
+
+    if (echo_matches) {
+        for (const auto &m : rs.matches) {
+            out << "NFA Match @ (" << m.from << "," << m.to << ")" << endl;
+        }
+    }
+
+    return true;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef GRAPHTRUTH_H
+#define GRAPHTRUTH_H
+
+#include "expressions.h"
+#include "ResultSet.h"
+
+#include "hs_compile.h" // for hs_expr_ext
+#include "ue2common.h"
+
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include <boost/core/noncopyable.hpp>
+
+namespace ue2 {
+
+class ReportManager;
+struct BoundaryReports;
+
+} // namespace ue2
+// Thrown when an expression cannot be prepared or compiled for the NFA graph matcher; msg says why.
+struct NGCompileFailure {
+ explicit NGCompileFailure(const std::string &msg_s) : msg(msg_s) {}
+ std::string msg; // human-readable reason for the failure
+};
+// Thrown when an expression uses something the graph matcher does not support (e.g. unsupported ext flags).
+struct NGUnsupportedFailure {
+ explicit NGUnsupportedFailure(const std::string &msg_s) : msg(msg_s) {}
+ std::string msg; // human-readable description of what is unsupported
+};
+
+// Struct to store the actual compiled NFA graph.
+class CompiledNG;
+
+// Precompile information about one expression, together with its lazily
+// compiled NFA graph.
+//
+// The public data members are filled in by GraphTruth::preprocess(). The
+// graph is built on the first call to get(). Access to the graph and to the
+// "bad" flag is serialised by separate mutexes.
+class CNGInfo : boost::noncopyable {
+public:
+    CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in);
+    ~CNGInfo();
+
+    // Returns true once mark_bad() has been called.
+    bool is_bad() {
+        std::lock_guard<std::mutex> lock(bad_mutex);
+        return bad;
+    }
+
+    // Flags this graph as bad so that remaining tests for it can be skipped.
+    void mark_bad() {
+        std::lock_guard<std::mutex> lock(bad_mutex);
+        bad = true;
+    }
+
+    // Returns the compiled NFA graph, compiling it on first use.
+    //
+    // Throws NGCompileFailure if compilation fails. An NGUnsupportedFailure
+    // raised during compilation is converted to NGCompileFailure with the
+    // same message.
+    const CompiledNG *get() {
+        std::lock_guard<std::mutex> lock(cng_mutex);
+
+        if (!cng) {
+            // NFA graph hasn't been compiled yet. An NGCompileFailure from
+            // compile() simply propagates; no handler is needed for it.
+            try {
+                compile();
+            } catch (const NGUnsupportedFailure &e) {
+                throw NGCompileFailure(e.msg);
+            }
+        }
+
+        return cng.get();
+    }
+
+    // Extended parameters and flags of the expression.
+    u64a min_offset = 0;
+    u64a max_offset = 0;
+    u64a min_length = 0;
+    u32 max_edit_distance = 0;
+    u32 max_hamm_distance = 0;
+    bool utf8 = false;
+    bool highlander = false;
+    bool prefilter = false;
+    bool som = false;
+private:
+    // Builds the NFA graph and stores it in cng.
+    void compile();
+
+    // If NFA graph scan failed for some reason, we mark it as bad and skip
+    // the remaining tests for it for performance reasons.
+    bool bad = false;
+    std::mutex bad_mutex; // serialised accesses to bad flag.
+
+    std::unique_ptr<CompiledNG> cng; // compiled NFA graph
+    std::mutex cng_mutex; // serialised accesses to NFA graph
+
+    // ID of our expression in the expression map.
+    unsigned id;
+
+    // Our expression map
+    const ExpressionMap &m_expr;
+};
+
+// Produces match results for an expression and corpus by scanning with the NFA graph matcher.
+class GraphTruth : boost::noncopyable {
+public:
+ GraphTruth(std::ostream &os, const ExpressionMap &expr);
+ // Scans buffer with the compiled graph and stores the matches in rs; returns false if a scan fails.
+ bool run(unsigned id, const CompiledNG &cng, const CNGInfo &cngi,
+ const std::string &buffer, ResultSet &rs, std::string &error);
+ // Decodes the flags/extended parameters of expression id into a new CNGInfo; throws on failure.
+ std::unique_ptr<CNGInfo> preprocess(unsigned id,
+ bool ignoreUnsupported = false);
+
+private:
+ // Output stream; run() echoes matches to it when echo_matches is set.
+ std::ostream &out;
+
+ // Our expression map (held by reference, so it must outlive this object).
+ const ExpressionMap &m_expr;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common.h"
+#include "ExpressionParser.h"
+#include "expressions.h"
+#include "GroundTruth.h"
+#include "pcre_util.h"
+
+#include "hs_compile.h" // for hs_expr_ext
+#include "ue2common.h"
+#include "parser/control_verbs.h"
+#include "parser/Parser.h"
+#include "parser/parse_error.h"
+#include "util/make_unique.h"
+#include "util/unicode_def.h"
+#include "util/unordered.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <pcre.h>
+
+/* -X, -Y support
+ * as PCRE performance is `non-linear' and these options add a large amount of
+ * scanning, the following short cuts are used:
+ * 1: the suffix is not scanned - we are more interested in the matches from
+ * the original corpora.
+ * 2: only the last 50 bytes of the prefix are scanned. This may lead to some
+ * minor correctness issues for a few patterns.
+ */
+
+using namespace std;
+using namespace ue2;
+
+// We store matches in a hash table as we're likely to see lots of them. These
+// are moved into a ResultSet at the end.
+using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
+// Per-scan state handed to pcreCallOut via pcre_extra::callout_data.
+namespace {
+struct CalloutContext {
+ explicit CalloutContext(ostream &os) : out(os) {}
+ ostream &out; // stream that matches are echoed to when echo_matches is set
+ PcreMatchSet matches; // (from, to) pairs recorded by the callout so far
+};
+}
+
+// PCRE callout handler. Records the (start, current position) pair of the
+// match in progress in the CalloutContext attached to the block, echoing it
+// first when echo_matches is set. Always returns 1.
+static
+int pcreCallOut(pcre_callout_block *block) {
+    assert(block);
+    assert(block->callout_data);
+    CalloutContext &ctx = *static_cast<CalloutContext *>(block->callout_data);
+
+    if (echo_matches) {
+        ctx.out << "PCRE Match @ (" << block->start_match << ","
+                << block->current_position << ")" << endl;
+    }
+
+    const unsigned int match_from = block->start_match;
+    const unsigned int match_to = block->current_position;
+    assert(match_from <= match_to);
+
+    ctx.matches.insert(pair<unsigned, unsigned>(match_from, match_to));
+    return 1;
+}
+
+// Splits a collider expression into its pattern and PCRE settings.
+//
+// On entry `expr` holds the full expression; once readExpression() succeeds
+// it is replaced by the bare pattern. The PCRE flags, highlander / prefilter /
+// SOM settings and extended parameters are written through the out
+// parameters, after which the global force_utf8 and force_prefilter overrides
+// are applied. Returns false if the expression or its flags cannot be decoded.
+static
+bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
+                    bool *prefilter, bool *som, hs_expr_ext *ext) {
+    unsigned int hs_flags = 0;
+    {
+        string pattern;
+        if (!readExpression(expr, pattern, &hs_flags, ext)) {
+            return false;
+        }
+        pattern.swap(expr);
+    }
+
+    const bool have_flags =
+        getPcreFlags(hs_flags, flags, highlander, prefilter, som);
+    if (have_flags) {
+        if (force_utf8) {
+            *flags |= PCRE_UTF8;
+        }
+        if (force_prefilter) {
+            *prefilter = true;
+        }
+    }
+
+    return have_flags;
+}
+
+static
+string pcreErrStr(int err) {
+ switch (err) {
+ case PCRE_ERROR_NOMATCH:
+ return "PCRE_ERROR_NOMATCH";
+ case PCRE_ERROR_NULL:
+ return "PCRE_ERROR_NULL";
+ case PCRE_ERROR_BADOPTION:
+ return "PCRE_ERROR_BADOPTION";
+ case PCRE_ERROR_BADMAGIC:
+ return "PCRE_ERROR_BADMAGIC";
+#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
+ case PCRE_ERROR_UNKNOWN_OPCODE:
+ return "PCRE_ERROR_UNKNOWN_OPCODE";
+#else
+ case PCRE_ERROR_UNKNOWN_NODE:
+ return "PCRE_ERROR_UNKNOWN_NODE";
+#endif
+ case PCRE_ERROR_NOMEMORY:
+ return "PCRE_ERROR_NOMEMORY";
+ case PCRE_ERROR_NOSUBSTRING:
+ return "PCRE_ERROR_NOSUBSTRING";
+ case PCRE_ERROR_MATCHLIMIT:
+ return "PCRE_ERROR_MATCHLIMIT";
+ case PCRE_ERROR_CALLOUT:
+ return "PCRE_ERROR_CALLOUT";
+ case PCRE_ERROR_BADUTF8:
+ return "PCRE_ERROR_BADUTF8";
+ case PCRE_ERROR_BADUTF8_OFFSET:
+ return "PCRE_ERROR_BADUTF8_OFFSET";
+ case PCRE_ERROR_PARTIAL:
+ return "PCRE_ERROR_PARTIAL";
+ case PCRE_ERROR_BADPARTIAL:
+ return "PCRE_ERROR_BADPARTIAL";
+ case PCRE_ERROR_INTERNAL:
+ return "PCRE_ERROR_INTERNAL";
+ case PCRE_ERROR_BADCOUNT:
+ return "PCRE_ERROR_BADCOUNT";
+#if defined(PCRE_ERROR_RECURSIONLIMIT)
+ case PCRE_ERROR_RECURSIONLIMIT:
+ return "PCRE_ERROR_RECURSIONLIMIT";
+#endif
+ case PCRE_ERROR_DFA_UITEM:
+ return "PCRE_ERROR_DFA_UITEM";
+ case PCRE_ERROR_DFA_UCOND:
+ return "PCRE_ERROR_DFA_UCOND";
+ case PCRE_ERROR_DFA_UMLIMIT:
+ return "PCRE_ERROR_DFA_UMLIMIT";
+ case PCRE_ERROR_DFA_WSSIZE:
+ return "PCRE_ERROR_DFA_WSSIZE";
+ case PCRE_ERROR_DFA_RECURSE:
+ return "PCRE_ERROR_DFA_RECURSE";
+ default:
+ {
+ ostringstream oss;
+ oss << "Unknown PCRE error (value: " << err << ")";
+ return oss.str();
+ }
+ }
+}
+// Binds the output stream and expression map and stores the two PCRE match limits later applied in run().
+GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
+ unsigned long int limit,
+ unsigned long int limit_recursion)
+ : out(os), m_expr(expr), matchLimit(limit),
+ matchLimitRecursion(limit_recursion) {}
+// One-off setup: points pcre_callout at pcreCallOut, the handler that records matches.
+void GroundTruth::global_prep() {
+ // We're using pcre callouts: pcre_callout is assigned once here rather than per scan.
+ pcre_callout = &pcreCallOut;
+}
+
+// Rewrites `re` in place as "<verbs>(?:<pattern>\E)(?C)", so that a PCRE
+// callout fires whenever the whole pattern has matched.
+static
+void addCallout(string &re) {
+    // Leading control verbs such as "(*UTF8)" or "(*UTF8)(*UCP)" have to stay
+    // at the very front, so measure them with the control verbs mini-parser.
+    // If that parser rejects the input, the wrapper goes at offset zero.
+    const char *const begin = re.c_str();
+    size_t verbs_len = 0;
+    try {
+        ue2::ParseMode mode;
+        verbs_len =
+            ue2::read_control_verbs(begin, begin + re.size(), 0, mode) - begin;
+    } catch (const ue2::ParseError &) {
+        // fall through
+    }
+    assert(verbs_len <= re.length());
+
+    re.insert(verbs_len, "(?:");
+    // The \E closes any \Q quoted block left open by the pattern. If there
+    // isn't one, pcre will ignore the \E.
+    re += "\\E)(?C)";
+}
+
+// Compiles expression `id` with libpcre and returns everything needed to scan
+// with it.
+//
+// The expression is split into pattern, PCRE flags and extended parameters.
+// Unless no_callouts is set, the pattern is wrapped so that a PCRE callout
+// fires at the end of every match (see addCallout). The compiled pattern is
+// then studied and its capture count is recorded.
+//
+// Throws SoftPcreCompileFailure for patterns PCRE is known not to handle
+// (edit or Hamming distance, "regular expression is too large", "lookbehind
+// assertion is not fixed length") and PcreCompileFailure for every other
+// failure.
+unique_ptr<CompiledPcre>
+GroundTruth::compile(unsigned id, bool no_callouts) {
+    // pcre_compile2() error codes that are reported as soft failures.
+    static const int ERRCODE_TOO_LARGE = 20; // "regular expression is too large"
+    static const int ERRCODE_LOOKBEHIND = 25; // "lookbehind assertion is not fixed length"
+
+    bool highlander = false;
+    bool prefilter = false;
+    bool som = false;
+
+    // we can still match approximate matching patterns with PCRE if edit
+    // distance 0 is requested
+    if (force_edit_distance && edit_distance) {
+        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
+    }
+
+    const auto i = m_expr.find(id);
+    if (i == m_expr.end()) {
+        throw PcreCompileFailure("ID not found in expression map.");
+    }
+
+    string re(i->second);
+    unsigned flags = 0;
+    hs_expr_ext ext;
+
+    // Decode the flags
+    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
+        throw PcreCompileFailure("Unable to decode flags.");
+    }
+
+    // filter out flags not supported by PCRE
+    const u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
+                           HS_EXT_FLAG_MIN_LENGTH;
+    if (ext.flags & ~supported) {
+        // edit distance is a known unsupported flag, so just throw a soft error
+        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
+            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
+        }
+        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
+            throw SoftPcreCompileFailure(
+                "Hamming distance not supported by PCRE.");
+        }
+        throw PcreCompileFailure("Unsupported extended flags.");
+    }
+
+    // SOM flags might be set globally.
+    som |= !!somFlags;
+
+    // For traditional Hyperscan, add global callout to pattern.
+    if (!no_callouts) {
+        addCallout(re);
+    }
+
+    // Compile the pattern
+    const char *errptr = nullptr;
+    int errloc = 0;
+    int errcode = 0;
+
+    // Qualified explicitly: with both std and ue2 pulled in by using
+    // directives, an unqualified make_unique can be ambiguous.
+    unique_ptr<CompiledPcre> compiled = ue2::make_unique<CompiledPcre>();
+    compiled->utf8 = (flags & PCRE_UTF8) != 0;
+    compiled->highlander = highlander;
+    compiled->prefilter = prefilter;
+    compiled->som = som;
+    compiled->min_offset = ext.min_offset;
+    compiled->max_offset = ext.max_offset;
+    compiled->min_length = ext.min_length;
+    compiled->expression = i->second; // original PCRE
+    flags |= PCRE_NO_AUTO_POSSESS;
+
+    compiled->bytecode =
+        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);
+
+    if (!compiled->bytecode || errptr) {
+        assert(errcode);
+        ostringstream oss;
+        oss << "Failed to compile expression '" << re << '\'';
+        // Streaming a null char pointer is undefined, so guard against it.
+        oss << " (" << (errptr ? errptr : "unknown error") << " at " << errloc
+            << ").";
+        if (errcode == ERRCODE_TOO_LARGE || errcode == ERRCODE_LOOKBEHIND) {
+            throw SoftPcreCompileFailure(oss.str());
+        }
+        throw PcreCompileFailure(oss.str());
+    }
+
+    // Study the pattern. The result is only needed for the pcre_fullinfo()
+    // call below. It is released with pcre_free_study(), the function PCRE
+    // provides for pcre_study() results; that function accepts a null
+    // pointer, which pcre_study() may return.
+    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
+                                 pcre_free_study);
+    if (errptr) {
+        ostringstream oss;
+        oss << "Error studying pattern (" << errptr << ").";
+        throw PcreCompileFailure(oss.str());
+    }
+
+    const int infoRes =
+        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
+                      &compiled->captureCount);
+    if (infoRes < PCRE_ERROR_NOMATCH) {
+        ostringstream oss;
+        oss << "Error determining number of capturing subpatterns ("
+            << pcreErrStr(infoRes) << ").";
+        throw PcreCompileFailure(oss.str());
+    }
+
+    return compiled;
+}
+
+// Keeps only the first match, in the set's iteration order, for each end
+// offset. Later matches with an already-seen end offset are removed as
+// duplicates with a "righter" SOM.
+static
+void filterLeftmostSom(ResultSet &rs) {
+    if (rs.matches.size() < 2) {
+        return; // nothing can be a duplicate
+    }
+
+    set<u64a> seen_ends; // End offsets already kept.
+    for (auto it = rs.matches.begin(); it != rs.matches.end();) {
+        const bool first_with_this_end = seen_ends.insert(it->to).second;
+        if (first_with_this_end) {
+            ++it;
+        } else {
+            it = rs.matches.erase(it);
+        }
+    }
+}
+
+// Removes from `rs` every match that violates the pattern's extended
+// parameters: end offset below min_offset, end offset above max_offset, or
+// length (to - from) below min_length.
+static
+void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
+    for (auto it = rs.matches.begin(); it != rs.matches.end();) {
+        // Use the same 64-bit width as the limits being compared against, so
+        // offsets are never narrowed before the comparison.
+        const unsigned long long from = it->from;
+        const unsigned long long to = it->to;
+        const unsigned long long len = to - from;
+
+        const bool rejected = to < compiled.min_offset ||
+                              to > compiled.max_offset ||
+                              len < compiled.min_length;
+        if (rejected) {
+            it = rs.matches.erase(it);
+        } else {
+            ++it;
+        }
+    }
+}
+
+// Runs a single pcre_exec() over `buffer` and returns its result. Matches
+// are gathered in ctx.matches by the callout.
+//
+// The global corpora prefix and suffix (g_corpora_prefix, g_corpora_suffix)
+// are taken into account: scanning starts at most 50 bytes before the end of
+// the prefix, the scanned length is cut so that at most two bytes of the
+// suffix length remain, and PCRE_NOTEOL is set when a suffix is present.
+// When a prefix is configured, matches are then re-based by the prefix
+// length, and matches ending before the prefix ends or after
+// buffer.size() - suffix_len are dropped.
+static
+int scanBasic(const CompiledPcre &compiled, const string &buffer,
+              const pcre_extra &extra, vector<int> &ovector,
+              CalloutContext &ctx) {
+    const size_t prefix_len = g_corpora_prefix.size();
+    const size_t suffix_len = g_corpora_suffix.size();
+
+    const size_t scan_start = prefix_len > 50 ? prefix_len - 50 : 0;
+    const size_t scan_len =
+        suffix_len > 2 ? buffer.size() - (suffix_len - 2) : buffer.size();
+    const int exec_opts = suffix_len ? PCRE_NOTEOL : 0;
+
+    const int ret =
+        pcre_exec(compiled.bytecode, &extra, buffer.c_str(), scan_len,
+                  scan_start, exec_opts, &ovector[0], ovector.size());
+
+    if (g_corpora_prefix.empty()) {
+        return ret; // offsets need no adjustment
+    }
+
+    // Rebuild the match set with offsets relative to the end of the prefix.
+    PcreMatchSet raw;
+    raw.swap(ctx.matches);
+
+    const size_t last_allowed_end = buffer.size() - suffix_len;
+    for (const auto &m : raw) {
+        const unsigned from = m.first;
+        const unsigned to = m.second;
+        if (to < prefix_len || to > last_allowed_end) {
+            continue;
+        }
+        // A match that starts inside the prefix is reported from offset 0.
+        const unsigned new_from = from < prefix_len ? 0 : from - prefix_len;
+        const unsigned new_to = to - prefix_len;
+        ctx.matches.insert(make_pair(new_from, new_to));
+    }
+
+    return ret;
+}
+
+// Scans `buffer` behind a block of NUL padding (at most 100 bytes, taken from
+// g_streamOffset) and leaves in ctx.matches only the matches that the padding
+// alone does not produce. Match offsets still include the padding.
+//
+// Returns the result of the last pcre_exec() call. A result below
+// PCRE_ERROR_NOMATCH is returned immediately.
+static
+int scanOffset(const CompiledPcre &compiled, const string &buffer,
+               const pcre_extra &extra, vector<int> &ovector,
+               CalloutContext &ctx) {
+    size_t pad_len = MIN(100, g_streamOffset);
+    assert(pad_len > 0);
+
+    string padded(pad_len, '\0');
+    padded += buffer;
+
+    // Both scans start at offset zero of the same data and differ only in the
+    // length scanned and the options used.
+    auto exec = [&](int length, int options) {
+        return pcre_exec(compiled.bytecode, &extra, padded.c_str(), length, 0,
+                         options, &ovector[0], ovector.size());
+    };
+
+    // First, scan our preamble so that we can discard any matches therein
+    // after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
+    // our expression don't match at the end of the preamble.
+    int ret = exec(pad_len, PCRE_NOTEOL);
+    if (ret < PCRE_ERROR_NOMATCH) {
+        return ret;
+    }
+
+    PcreMatchSet preamble_matches;
+    preamble_matches.swap(ctx.matches);
+
+    // Real scan.
+    ret = exec(padded.size(), 0);
+    if (ret < PCRE_ERROR_NOMATCH) {
+        return ret;
+    }
+
+    // Erase any matches due entirely to the preamble.
+    for (const auto &pm : preamble_matches) {
+        ctx.matches.erase(pm);
+    }
+
+    return ret;
+}
+
+// Scans `buffer` with the compiled PCRE and stores the resulting matches in
+// `rs`.
+//
+// Matches are gathered by the callout into a CalloutContext, moved into the
+// ResultSet (minus the stream-offset padding when g_streamOffset is set),
+// reduced to the leftmost start per end offset when SOM is requested, and
+// filtered by the pattern's extended parameters. Without SOM, every start
+// offset is reported as zero.
+//
+// Returns false and sets `error` to the PCRE error name if the scan fails;
+// returns true otherwise. The ID parameter is unused.
+bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
+                      const string &buffer, ResultSet &rs, string &error) {
+    CalloutContext ctx(out);
+
+    // Value-initialise so that the fields we do not set are zero instead of
+    // indeterminate.
+    pcre_extra extra = pcre_extra();
+
+    // Switch on callouts.
+    extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
+    extra.callout_data = &ctx;
+
+    // Set the match_limit (in order to bound execution time on very complex
+    // patterns)
+    extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
+    extra.match_limit = matchLimit;
+    extra.match_limit_recursion = matchLimitRecursion;
+
+    // NOTE: this code used to OR PCRE_NO_START_OPTIMIZE into extra.flags.
+    // That constant is an option for pcre_compile() and pcre_exec(), not one
+    // of the PCRE_EXTRA_* bits read from pcre_extra::flags, so setting it
+    // here had no effect and it has been dropped. If start-up optimisations
+    // need to be disabled, the option must be passed to pcre_compile2() or
+    // pcre_exec().
+
+    // Ensure there's enough room in the ovector for the capture groups in this
+    // pattern.
+    int ovecsize = (compiled.captureCount + 1) * 3;
+    ovector.resize(ovecsize);
+
+    int ret;
+    switch (colliderMode) {
+    case MODE_BLOCK:
+    case MODE_STREAMING:
+    case MODE_VECTORED:
+        if (g_streamOffset) {
+            ret = scanOffset(compiled, buffer, extra, ovector, ctx);
+        } else {
+            ret = scanBasic(compiled, buffer, extra, ovector, ctx);
+        }
+        break;
+    default:
+        assert(0);
+        ret = PCRE_ERROR_NULL;
+        break;
+    }
+
+    if (ret < PCRE_ERROR_NOMATCH) {
+        error = pcreErrStr(ret);
+        return false;
+    }
+
+    // Move matches into a ResultSet.
+    for (const auto &m : ctx.matches) {
+        unsigned long long from = m.first;
+        unsigned long long to = m.second;
+
+        if (g_streamOffset) {
+            // Subtract stream offset imposed by offset test.
+            unsigned long long offset = min(100ull, g_streamOffset);
+            assert(to >= offset);
+            from -= min(offset, from);
+            to -= offset;
+        }
+
+        rs.addMatch(from, to);
+    }
+
+    // If we have no matches, there's no further work to do.
+    if (rs.matches.empty()) {
+        return true;
+    }
+
+    if (compiled.som) {
+        filterLeftmostSom(rs);
+    }
+
+    filterExtParams(rs, compiled);
+
+    // If we haven't been asked for SOM, strip the from offsets.
+    if (!compiled.som) {
+        set<MatchResult> endonly;
+        for (const auto &m : rs.matches) {
+            endonly.insert(MatchResult(0, m.to));
+        }
+        rs.matches.swap(endonly);
+    }
+
+    return true;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef GROUNDTRUTH_H
+#define GROUNDTRUTH_H
+
+#include "expressions.h"
+#include "ResultSet.h"
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include <pcre.h>
+
+#include <boost/core/noncopyable.hpp>
+
+// Thrown by GroundTruth::compile when an expression cannot be prepared for PCRE; msg holds the reason.
+struct PcreCompileFailure {
+ PcreCompileFailure(const std::string &msg_s) : msg(msg_s) {}
+ std::string msg; // human-readable reason for the failure
+};
+
+// Thrown in the event of a "soft" PCRE compile failure, one that we don't want to consider a ue2collider
+// failure (e.g. "regular expression too large", or edit/Hamming distance, which PCRE does not support).
+struct SoftPcreCompileFailure : PcreCompileFailure {
+ SoftPcreCompileFailure(const std::string &msg_s)
+ : PcreCompileFailure(msg_s) {}
+};
+
+// Stores everything about a compiled PCRE. Note that the code assumes that
+// once populated, the data in this object will remain constant while tests
+// are running, except for the bad flag (which is protected by a mutex).
+//
+// The object owns `bytecode` and releases it on destruction.
+class CompiledPcre : boost::noncopyable {
+public:
+    CompiledPcre() {}
+    ~CompiledPcre() {
+        // Compiled patterns are allocated through pcre_malloc, so they must
+        // be released through the matching pcre_free hook rather than by
+        // calling free() directly. Skip the call when nothing was compiled.
+        if (bytecode) {
+            pcre_free(bytecode);
+        }
+    }
+
+    // Returns true once mark_bad() has been called.
+    bool is_bad() {
+        std::lock_guard<std::mutex> lock(bad_mutex);
+        return bad;
+    }
+
+    // Flags this pattern as bad so that remaining tests for it can be skipped.
+    void mark_bad() {
+        std::lock_guard<std::mutex> lock(bad_mutex);
+        bad = true;
+    }
+
+    std::string expression;    // expression text as stored in the expression map
+    pcre *bytecode = nullptr;  // compiled pattern, owned by this object
+    unsigned long long min_offset = 0;     // extended parameter: minimum end offset
+    unsigned long long max_offset = ~0ULL; // extended parameter: maximum end offset
+    unsigned long long min_length = 0;     // extended parameter: minimum match length
+    int captureCount = 0;      // number of capturing subpatterns (sizes the ovector)
+    bool utf8 = false;
+    bool highlander = false;
+    bool prefilter = false;
+    bool som = false;
+
+private:
+    // If a PCRE has hit its match recursion limit when scanning a corpus, we
+    // mark it as bad and skip the remaining tests for it for performance
+    // reasons.
+    bool bad = false;
+    std::mutex bad_mutex; // serialised accesses to bad flag.
+};
+
+// Wrapper around libpcre to generate results (match offsets) for an expression and corpus.
+class GroundTruth : boost::noncopyable {
+public:
+ GroundTruth(std::ostream &os, const ExpressionMap &expr,
+ unsigned long limit, unsigned long limit_recursion);
+ // Installs the PCRE callout handler used to collect matches; static, independent of any instance.
+ static void global_prep();
+ // Compiles expression id with PCRE; throws PcreCompileFailure or SoftPcreCompileFailure on failure.
+ std::unique_ptr<CompiledPcre> compile(unsigned id,
+ bool no_callouts = false);
+ // Scans buffer with the compiled pattern into rs; on failure returns false and sets error.
+ bool run(unsigned id, const CompiledPcre &compiled,
+ const std::string &buffer, ResultSet &rs, std::string &error);
+
+private:
+ // Output stream; the callout echoes matches to it when echo_matches is set.
+ std::ostream &out;
+
+ // Our expression map (held by reference, so it must outlive this object).
+ const ExpressionMap &m_expr;
+
+ // PCRE match limits, copied into pcre_extra (match_limit, match_limit_recursion) for every scan.
+ const unsigned long int matchLimit;
+ const unsigned long int matchLimitRecursion;
+
+ // Persistent ovector used to run tests; run() resizes it for the pattern's capture count.
+ std::vector<int> ovector;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ng_corpus_properties.h"
+#include "ng_corpus_generator.h"
+#include "NfaGeneratedCorpora.h"
+#include "ExpressionParser.h"
+
+#include "grey.h"
+#include "hs_compile.h"
+#include "compiler/compiler.h"
+#include "nfagraph/ng.h"
+#include "parser/parse_error.h"
+#include "parser/Parser.h"
+#include "parser/prefilter.h"
+#include "parser/unsupported.h"
+#include "util/compile_context.h"
+#include "util/compile_error.h"
+#include "util/report_manager.h"
+#include "util/target_info.h"
+
+#include <string>
+#include <sstream>
+#include <vector>
+
+using namespace std;
+using namespace ue2;
+
+// Stores the expression map, the corpus properties and the two "force" flags
+// that generate() later ORs into each expression's Hyperscan flags.
+NfaGeneratedCorpora::NfaGeneratedCorpora(const ExpressionMap &expr,
+ const CorpusProperties &props,
+ bool force_utf8_mode_in,
+ bool force_prefilter_mode_in)
+ : m_expr(expr), m_props(props), force_utf8_mode(force_utf8_mode_in),
+ force_prefilter_mode(force_prefilter_mode_in) {
+ // empty
+}
+
+// Returns a heap-allocated NfaGeneratedCorpora constructed from this object's
+// expression map, properties and force flags. The result is a raw pointer
+// from `new`; the caller is responsible for deleting it.
+NfaGeneratedCorpora *NfaGeneratedCorpora::clone() const {
+ return new NfaGeneratedCorpora(m_expr, m_props, force_utf8_mode,
+ force_prefilter_mode);
+}
+
+// Generates corpora for the expression registered under `id` and appends them
+// to `data`.
+//
+// The expression is parsed and built into a graph with the UE2 parser and
+// compiler, and a corpus generator is then run over that graph. Any failure
+// (unknown id, unreadable expression, parse/compile error, allocation
+// failure, generator failure, or no corpora produced) is reported by throwing
+// CorpusFailure with a descriptive message.
+void NfaGeneratedCorpora::generate(unsigned id, vector<Corpus> &data) {
+    ExpressionMap::const_iterator i = m_expr.find(id);
+    if (i == m_expr.end()) {
+        throw CorpusFailure("Expression not found.");
+    }
+
+    string re;
+    u32 hs_flags;
+    hs_expr_ext ext;
+    if (!readExpression(i->second, re, &hs_flags, &ext)) {
+        throw CorpusFailure("Expression could not be read: " + i->second);
+    }
+
+    if (force_utf8_mode) {
+        hs_flags |= HS_FLAG_UTF8;
+    }
+
+    if (force_prefilter_mode) {
+        hs_flags |= HS_FLAG_PREFILTER;
+    }
+
+    // Wrap the UE2 parser and compiler functionality and use it to generate
+    // corpora for us.
+    vector<string> c;
+
+    try {
+        ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext);
+
+        // Apply prefiltering transformations if desired.
+        if (pe.expr.prefilter) {
+            prefilterTree(pe.component, ParseMode(hs_flags));
+        }
+
+        // Bail on patterns with unsupported constructs.
+        checkUnsupported(*pe.component);
+        pe.component->checkEmbeddedStartAnchor(true);
+        pe.component->checkEmbeddedEndAnchor(true);
+
+        CompileContext cc(false, false, get_current_target(), Grey());
+        ReportManager rm(cc.grey);
+        auto built_expr = buildGraph(rm, cc, pe);
+        if (!built_expr.g) {
+            // A more specific error should probably have been thrown by
+            // buildGraph.
+            throw CorpusFailure("could not build graph.");
+        }
+
+        const auto cg =
+            makeCorpusGenerator(*built_expr.g, built_expr.expr, m_props);
+        cg->generateCorpus(c);
+    }
+    catch (const ParseError &e) {
+        throw CorpusFailure("compilation failed, " + e.reason);
+    }
+    catch (const CompileError &e) {
+        throw CorpusFailure("compilation failed, " + e.reason);
+    }
+    catch (const std::bad_alloc &) {
+        throw CorpusFailure("out of memory.");
+    }
+    catch (const CorpusGenerationFailure &e) {
+        // if corpus generation failed, just pass up the error message
+        throw CorpusFailure("corpus generation failed: " + e.message);
+    }
+    catch (const CorpusFailure &) {
+        // Already in the form callers expect (e.g. "could not build graph."
+        // thrown above). Without this handler the catch-all below would
+        // replace the specific message with "unknown error.".
+        throw;
+    }
+    catch (...) {
+        throw CorpusFailure("unknown error.");
+    }
+
+    if (c.empty()) {
+        throw CorpusFailure("no corpora generated.");
+    }
+
+    data.reserve(data.size() + c.size());
+    for (const auto &e : c) {
+        data.push_back(Corpus(e));
+    }
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NFAGENERATEDCORPORA_H
+#define NFAGENERATEDCORPORA_H
+
+#include "Corpora.h"
+#include "ng_corpus_properties.h"
+#include "expressions.h"
+
+#include <string>
+#include <vector>
+
+// Corpora associated with a pattern set
+class NfaGeneratedCorpora : public CorporaSource {
+public:
+ // expr is kept by reference (see m_expr) and must outlive this object;
+ // props is copied. The two flags force UTF-8 / prefilter mode for every
+ // expression processed by generate().
+ NfaGeneratedCorpora(const ExpressionMap &expr,
+ const CorpusProperties &props, bool force_utf8_mode_in,
+ bool force_prefilter_mode_in);
+
+ // Returns a new, caller-owned object with the same configuration.
+ NfaGeneratedCorpora *clone() const override;
+
+ // Appends the corpora generated for expression `id` to `data`.
+ void generate(unsigned id, std::vector<Corpus> &data) override;
+
+private:
+ // Expressions handled by this corpora object (reference, not owned)
+ const ExpressionMap &m_expr;
+
+ // CorpusProperties policy object (held by value)
+ CorpusProperties m_props;
+
+ // When set, the corresponding mode is applied regardless of the flags
+ // read from the expression itself.
+ bool force_utf8_mode;
+ bool force_prefilter_mode;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RESULTSET_H
+#define RESULTSET_H
+
+#include <iostream>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+// Class representing a single match, encapsulating to/from offsets.
+class MatchResult {
+public:
+    // Builds a match spanning the half-open pair of offsets (start, end).
+    MatchResult(unsigned long long start, unsigned long long end)
+        : from(start), to(end) {}
+
+    // Strict weak ordering: by start offset first, then by end offset.
+    bool operator<(const MatchResult &a) const {
+        return from < a.from || (from == a.from && to < a.to);
+    }
+
+    // Two matches are equal when both offsets agree.
+    bool operator==(const MatchResult &a) const {
+        if (from != a.from) {
+            return false;
+        }
+        return to == a.to;
+    }
+
+    unsigned long long from; // start offset
+    unsigned long long to;   // end offset
+};
+
+// Identifies which engine produced a set of results.
+enum ResultSource {
+    RESULT_FROM_UE2,
+    RESULT_FROM_PCRE,
+    RESULT_FROM_GRAPH,
+};
+
+// Streams a short human-readable label for a ResultSource. Values outside the
+// enumeration write nothing.
+inline
+std::ostream &operator<<(std::ostream &out, ResultSource src) {
+    switch (src) {
+    case RESULT_FROM_UE2:
+        return out << "UE2";
+    case RESULT_FROM_GRAPH:
+        return out << "Graph";
+    case RESULT_FROM_PCRE:
+        return out << "PCRE";
+    }
+    return out;
+}
+
+class ResultSet {
+public:
+    // Creates an empty result set tagged with its source.
+    explicit ResultSet(ResultSource s) : src(s) {}
+
+    // Creates a result set from a set of end-offsets; every match is recorded
+    // with a start offset of zero. Only `matches` is populated.
+    ResultSet(const std::set<unsigned int> &m, ResultSource s) : src(s) {
+        for (auto it = m.begin(); it != m.end(); ++it) {
+            matches.insert(MatchResult(0, *it));
+        }
+    }
+
+    // Equality compares the three error flags and the ordered match set. The
+    // source, the per-block grouping and the duplicate set are ignored.
+    bool operator==(const ResultSet &other) const {
+        if (uoom != other.uoom) {
+            return false;
+        }
+        if (match_after_halt != other.match_after_halt) {
+            return false;
+        }
+        if (invalid_id != other.invalid_id) {
+            return false;
+        }
+        return matches == other.matches;
+    }
+
+    // Inequality, defined as the negation of equality.
+    bool operator!=(const ResultSet &other) const {
+        return !operator==(other);
+    }
+
+    // Records a match. If the same match was already seen within the given
+    // block it is additionally noted as a duplicate.
+    void addMatch(unsigned long long from, unsigned long long to,
+                  int block = 0) {
+        const MatchResult m(from, to);
+        matches.insert(m);
+
+        const bool first_in_block = matches_by_block[block].insert(m).second;
+        if (!first_in_block) {
+            dupe_matches.insert(m);
+        }
+    }
+
+    // Set when a match arrived unexpectedly out of order.
+    bool uoom = false;
+
+    // Set when a match arrived after termination had been requested.
+    bool match_after_halt = false;
+
+    // Set when a match carried an invalid ID.
+    bool invalid_id = false;
+
+    // All matches, ordered.
+    std::set<MatchResult> matches;
+
+    // Matches keyed by the stream write/block in which they were seen.
+    std::map<int, std::set<MatchResult>> matches_by_block;
+
+    // Matches that were reported more than once within a block.
+    std::set<MatchResult> dupe_matches;
+
+    /* Where these results came from (does not take part in comparisons) */
+    ResultSource src;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "Thread.h"
+#include "common.h"
+#include "sig.h"
+
+#include <cstdlib>
+#include <iostream>
+
+#include <pthread.h>
+
+// Minimum stack size requested for collider threads (8MB).
+static const size_t COLLIDER_THREAD_STACK_SIZE = 8192 * 1024;
+
+// Creates the underlying pthread, which will execute runThread(this).
+//
+// Some systems, notably Mac OS X, use a default stack size that is smaller
+// than what we want (particularly given that we're planning on running PCRE,
+// which recurses inside pcre_exec), so we attempt to raise it to
+// COLLIDER_THREAD_STACK_SIZE first. Failure to query or raise the stack size
+// only produces a warning; failure to initialise the attributes or to create
+// the thread terminates the process.
+void Thread::start() {
+    pthread_attr_t attr;
+    int ret = pthread_attr_init(&attr);
+    if (ret) {
+        std::cerr << "pthread_attr_init failed" << std::endl;
+        exit(1);
+    }
+
+    size_t stacksize = 0;
+    ret = pthread_attr_getstacksize(&attr, &stacksize);
+    if (ret) {
+        std::cerr << "Warning: can't query stack size with "
+                     "pthread_attr_getstacksize" << std::endl;
+    } else if (stacksize < COLLIDER_THREAD_STACK_SIZE) {
+        ret = pthread_attr_setstacksize(&attr, COLLIDER_THREAD_STACK_SIZE);
+        if (ret) {
+            std::cerr << "Warning: pthread_attr_setstacksize failed, "
+                         "unable to set stack size to "
+                      << COLLIDER_THREAD_STACK_SIZE << " bytes." << std::endl;
+            // Fall through: this isn't necessarily fatal (yet!)
+        }
+    }
+
+    ret = pthread_create(&thread, &attr, &runThread, this);
+
+    // The attribute object is only needed while creating the thread; release
+    // it whether or not creation succeeded so it is not leaked per start().
+    pthread_attr_destroy(&attr);
+
+    if (ret) {
+        std::cerr << "pthread_create failed for thread id " << thread_id
+                  << std::endl;
+        exit(1);
+    }
+}
+
+// Dispatch
+// pthread entry point: `thr` is the Thread object passed to pthread_create.
+// Installs the signal stack unless signal handling is disabled, then invokes
+// the subclass's run().
+void *Thread::runThread(void *thr) {
+    Thread *self = static_cast<Thread *>(thr);
+
+    if (!no_signal_handler) {
+        setSignalStack();
+    }
+
+    self->run();
+    return nullptr;
+}
+
+// Waits for the underlying pthread to finish; its return value is discarded.
+void Thread::join() {
+    pthread_join(thread, nullptr);
+}
+
+// Records the numeric identifier for this thread; no pthread is created until
+// start() is called.
+Thread::Thread(size_t num)
+    : thread_id(num) {
+}
+
+// Nothing to release here.
+Thread::~Thread() = default;
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef UE2COLLIDER_THREAD_H
+#define UE2COLLIDER_THREAD_H
+
+#include <cstdlib>
+
+#include <pthread.h>
+
+#include <boost/core/noncopyable.hpp>
+
+// Abstract base class wrapping a single pthread. Subclasses implement run().
+class Thread : boost::noncopyable {
+public:
+ // num is stored as thread_id.
+ explicit Thread(size_t num);
+ virtual ~Thread();
+
+ // Launches the thread.
+ virtual void start();
+
+ // Dispatch: static entry point with the void *(*)(void *) signature that
+ // pthread_create expects; thr is the Thread to run.
+ static void *runThread(void *thr);
+
+ // Waits for the thread to finish.
+ // NOTE(review): `thread` has no initialiser here, so this presumably must
+ // only be called after start(); confirm against callers.
+ virtual void join();
+
+ // Implemented by subclasses.
+ virtual void run() = 0;
+
+protected:
+ // Identifier supplied at construction.
+ const size_t thread_id;
+
+private:
+ // Handle of the underlying pthread.
+ pthread_t thread;
+};
+
+#endif // UE2COLLIDER_THREAD_H
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ResultSet.h"
+#include "UltimateTruth.h"
+#include "util/database_util.h"
+#include "util/ExpressionParser.h"
+#include "util/string_util.h"
+
+#include "ue2common.h"
+#include "common.h"
+#include "crc32.h"
+#include "hs.h"
+#include "hs_internal.h"
+#include "util/make_unique.h"
+
+#include "scratch.h"
+#include "nfa/nfa_api_queue.h"
+#include "rose/rose_internal.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+using namespace std;
+using namespace ue2;
+using boost::ptr_vector;
+
+#ifndef RELEASE_BUILD
+
+#include "database.h"
+#include "state.h"
+
+// Opens a stream on `db` and forces its offset to `start_offset`.
+//
+// After opening, up to 100 zero bytes are written to the stream (with no
+// match callback) and then the stream's offset field is overwritten with
+// start_offset. The result of that warm-up scan is not checked. Returns the
+// result of hs_open_stream.
+static
+hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags,
+                             hs_stream_t **stream, hs_scratch_t *scratch,
+                             unsigned long long start_offset) {
+    const hs_error_t open_rv = hs_open_stream(db, flags, stream);
+    if (open_rv != HS_SUCCESS) {
+        return open_rv;
+    }
+
+    const char dummy_data[100] = { 0 };
+
+    // The dummy write must be large enough to cover the history the engine
+    // requires.
+    UNUSED const struct RoseEngine *rose
+        = (const struct RoseEngine *)hs_get_bytecode(db);
+    assert(sizeof(dummy_data) >= rose->historyRequired);
+
+    const unsigned long long warmup_len =
+        MIN(start_offset, sizeof(dummy_data));
+    hs_scan_stream(*stream, dummy_data, warmup_len, 0, scratch, nullptr,
+                   nullptr);
+
+    (*stream)->offset = start_offset;
+    return open_rv;
+}
+
+#endif // RELEASE_BUILD
+
+// Pairs a compiled Hyperscan database with the expression IDs it contains.
+// The database is freed with hs_free_database() when this object is
+// destroyed, so the object takes ownership of the pointer it is given.
+class HyperscanDB : boost::noncopyable {
+public:
+ // Constructor takes iterators over a container of pattern IDs.
+ // `ordered` is left empty; it is not populated here.
+ template <class Iter>
+ HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end)
+ : db(db_in), ids(ids_begin, ids_end) {}
+
+ // Releases the underlying database.
+ ~HyperscanDB() {
+ hs_free_database(db);
+ }
+
+ // Underlying Hyperscan database pointer.
+ hs_database_t *db;
+
+ // The set of expression IDs that must return their matches in order.
+ unordered_set<unsigned> ordered;
+
+ // The complete set of expression IDs associated with this database.
+ unordered_set<unsigned> ids;
+};
+
+// Used to track the ID and result set.
+namespace {
+// Per-scan state handed to the match callback through its void *ctx argument.
+struct MultiContext {
+ MultiContext(unsigned int id_in, const HyperscanDB &db_in, ResultSet *rs_in,
+ bool single_in, ostream &os)
+ : id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {}
+ unsigned int id; // expression ID whose matches are recorded
+ int block = 0; // index of the current stream write/block
+ const HyperscanDB &db; // database being scanned (not owned)
+ ResultSet *rs; // where matches and error flags are recorded (not owned)
+ u64a lastRawMatch = 0; /* store last known unadjusted match location */
+ u64a lastOrderMatch = 0; // last end offset seen for an ordered expression
+ bool single; // true when matches for any other ID are an error
+ bool use_max_offset = false; // enables the max_offset filter below
+ unsigned long long max_offset = 0; /* don't record matches beyond this */
+ bool terminated = false; //!< user has instructed us to stop
+ bool in_scan_call = false; // true only while inside a Hyperscan API call
+ ostream &out; // diagnostic output stream
+};
+}
+
+// Callback used for all (both single and multi-mode) scans.
+// Match callback used for all (both single and multi-mode) scans. `ctx` must
+// point at the MultiContext for the scan in progress.
+//
+// Validates the match (expected ID, known ID, not after termination, ordering
+// constraints), records any violation in the context's ResultSet, then
+// translates the offsets back into corpus coordinates and stores the match.
+// Returns 1 to ask Hyperscan to stop scanning, 0 to continue.
+static
+int callbackMulti(unsigned int id, unsigned long long from,
+                  unsigned long long to, UNUSED unsigned int flags, void *ctx) {
+    auto *mctx = static_cast<MultiContext *>(ctx);
+    assert(mctx);
+    assert(mctx->rs);
+    assert(mctx->in_scan_call);
+
+    ostream &out = mctx->out;
+
+    // Emits one diagnostic line for this match with the given trailing text.
+    // from/to are read at call time, so later adjustments are reflected.
+    auto report = [&](const char *suffix) {
+        out << "UE2 Match @ (" << from << "," << to << ") for " << id
+            << suffix << endl;
+    };
+
+    // Sanity check: in single mode, we'd better not be getting matches for the
+    // wrong ID!
+    if (mctx->single && id != mctx->id) {
+        report(" which is not the id we're looking for");
+        mctx->rs->invalid_id = true;
+        return 1;
+    }
+
+    // In any mode, we should NEVER get a match from an ID outside our known
+    // set.
+    if (!mctx->db.ids.count(id)) {
+        report(" which is not in the pattern set");
+        mctx->rs->invalid_id = true;
+        return 1;
+    }
+
+    if (mctx->terminated) {
+        report(" after termination");
+        mctx->rs->match_after_halt = true;
+    }
+
+#ifndef RELEASE_BUILD
+    // Raw (unadjusted) match locations must never go backwards.
+    const unsigned int adjustment = flags & HS_MATCH_FLAG_ADJUSTED ? 1 : 0;
+    const unsigned long long raw_end = to + adjustment;
+    if (mctx->lastRawMatch > raw_end) {
+        report(" unordered");
+        mctx->rs->uoom = true;
+    }
+    mctx->lastRawMatch = raw_end;
+#endif
+
+    // Expressions flagged as ordered must report non-decreasing end offsets.
+    if (mctx->db.ordered.count(id)) {
+        if (mctx->lastOrderMatch > to) {
+            report(" unordered");
+            mctx->rs->uoom = true;
+        }
+        mctx->lastOrderMatch = to;
+    }
+
+    // Matches past the configured maximum offset are dropped.
+    if (mctx->use_max_offset && to > mctx->max_offset) {
+        if (echo_matches) {
+            report(" ignored");
+        }
+        return 0;
+    }
+
+    // Matches ending inside the corpus prefix are dropped.
+    if (to - g_streamOffset < g_corpora_prefix.size()) {
+        if (echo_matches) {
+            report(" too early");
+        }
+        return 0;
+    }
+
+    const u64a offsetDelta = g_corpora_prefix.size() + g_streamOffset;
+
+    // from only set in SOM mode, otherwise zero. If we wanted to be REALLY
+    // principled about this, we'd probably want to stash the flags somewhere
+    // at compile time. The subtraction is clamped so it cannot go below zero.
+    from = (from > offsetDelta) ? from - offsetDelta : 0;
+    to -= offsetDelta;
+
+    if (echo_matches) {
+        report("");
+    }
+
+    if (!mctx->single && id != mctx->id) {
+        return 0; // not the expression we are recording
+    }
+
+    mctx->rs->addMatch(from, to, mctx->block);
+    if (limit_matches && mctx->rs->matches.size() == limit_matches) {
+        if (echo_matches) {
+            out << "Terminating matching (hit match limit)" << endl;
+        }
+        mctx->terminated = true;
+        return 1; // terminate matching.
+    }
+
+    return 0;
+}
+
+// Keeps, for each distinct end offset, only the first match in set order and
+// removes the rest. Because the set is ordered by start offset first, the
+// surviving match for an end offset is the one with the smallest start.
+static
+void filterLeftmostSom(ResultSet &rs) {
+    auto &matches = rs.matches;
+    if (matches.size() < 2) {
+        return;
+    }
+
+    set<u64a> seen_ends; // End offsets already kept.
+    for (auto it = matches.begin(); it != matches.end();) {
+        const bool first_time = seen_ends.insert(it->to).second;
+        if (first_time) {
+            ++it;
+        } else {
+            it = matches.erase(it);
+        }
+    }
+}
+
+// Stores the configuration and derives the Hyperscan mode flags from the
+// global collider mode. No scratch space is allocated here.
+UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr,
+                             const hs_platform_info_t *plat,
+                             const Grey &grey_in, unsigned int streamBlocks)
+    : grey(grey_in), out(os), m_expr(expr), m_xcompile(false),
+      m_streamBlocks(streamBlocks), scratch(nullptr),
+      platform(plat) {
+    // Build our mode flags.
+    switch (colliderMode) {
+    case MODE_STREAMING:
+        // Streaming mode additionally carries the desired SOM precision.
+        m_mode = HS_MODE_STREAM | somPrecisionMode;
+        break;
+    case MODE_BLOCK:
+        m_mode = HS_MODE_BLOCK;
+        break;
+    case MODE_VECTORED:
+        m_mode = HS_MODE_VECTORED;
+        break;
+    }
+}
+
+// Releases the scratch space held by this object. The constructor sets
+// scratch to nullptr, so it may still be null here.
+UltimateTruth::~UltimateTruth() {
+ hs_free_scratch(scratch);
+}
+
+// Deliberately overwrites the internal state held in `scratch` with arbitrary
+// byte patterns and sentinel values. Presumably this is to expose any scan
+// that depends on scratch contents left over from a previous call.
+// NOTE(review): relies on the private layout of hs_scratch; every buffer
+// pointer touched here is assumed to be valid for the size used.
+static
+void mangle_scratch(hs_scratch_t *scratch) {
+ /* Use our knowledge of the internals of scratch to make a mess */
+
+ memset(&scratch->tctxt, 0xc0, sizeof(scratch->tctxt));
+ memset(scratch->bstate, 0xd0, scratch->bStateSize);
+ memset(scratch->tstate, 0xe0, scratch->tStateSize);
+ memset(scratch->fullState, 0xf0, scratch->fullStateSize);
+
+ // Trash every queue, but restore its back-pointer to the scratch.
+ for (u32 i = 0; i < scratch->queueCount; i++) {
+ struct mq *q = &scratch->queues[i];
+ memset(q, 0x01, sizeof(*q));
+ q->scratch = scratch;
+ }
+
+ memset(scratch->aqa, 0xb0, scratch->activeQueueArraySize);
+ for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) {
+ memset(scratch->delay_slots[i], 0x05, scratch->delay_fatbit_size);
+ }
+
+ // Catch-up priority queue storage and an arbitrary size value.
+ memset(scratch->catchup_pq.qm, 0x06,
+ scratch->queueCount * sizeof(struct queue_match));
+ scratch->catchup_pq.qm_size = 45;
+ memset(&scratch->core_info, 0x07, sizeof(scratch->core_info));
+ // Both halves of each deduper log get a different fill pattern.
+ memset(scratch->deduper.som_start_log[0], 0x90,
+ sizeof(u64a) * scratch->deduper.dkey_count);
+ memset(scratch->deduper.som_start_log[1], 0x09,
+ sizeof(u64a) * scratch->deduper.dkey_count);
+ memset(scratch->deduper.log[0], 0xa0, scratch->deduper.log_size);
+ memset(scratch->deduper.log[1], 0x0a, scratch->deduper.log_size);
+ memset(scratch->deduper.som_log[0], 0xd0, scratch->deduper.log_size);
+ memset(scratch->deduper.som_log[1], 0x0d, scratch->deduper.log_size);
+
+ for (u32 i = 0; i < scratch->anchored_literal_region_len; i++) {
+ memset(scratch->al_log[i], 0xa0, scratch->anchored_literal_fatbit_size);
+ }
+ scratch->al_log_sum=0xf0f;
+
+ // SOM bookkeeping and the remaining fields.
+ memset(scratch->handled_roles, 0x05, scratch->handledKeyFatbitSize);
+ memset(scratch->som_store, 0x06,
+ scratch->som_store_count * sizeof(u64a));
+ memset(scratch->som_attempted_store, 0x06,
+ scratch->som_store_count * sizeof(u64a));
+ memset(scratch->som_set_now, 0x03, scratch->som_fatbit_size);
+ memset(scratch->som_attempted_set, 0x04, scratch->som_fatbit_size);
+ scratch->som_set_now_offset = 45;
+ memset(&scratch->fdr_conf, 0x0d, sizeof(scratch->fdr_conf));
+ scratch->fdr_conf_offset = 0xe4;
+}
+
+// Scans `buffer` against `hdb` with a single block-mode hs_scan() call.
+//
+// The data is first copied into a scan buffer with the requested alignment.
+// `ctx_in` must point at the MultiContext that `callback` expects. Returns
+// true when the scan returned the expected code: HS_SCAN_TERMINATED if the
+// callback asked for termination, HS_SUCCESS otherwise. The scratch is only
+// mangled (when enabled) on the non-terminated path.
+bool UltimateTruth::blockScan(const HyperscanDB &hdb, const string &buffer,
+                              size_t align, match_event_handler callback,
+                              void *ctx_in, ResultSet *) {
+    assert(colliderMode == MODE_BLOCK);
+    assert(!m_xcompile);
+    assert(hdb.db);
+
+    auto *mctx = static_cast<MultiContext *>(ctx_in);
+
+    char *scan_buf = setupScanBuffer(buffer.c_str(), buffer.size(), align);
+    if (!scan_buf) {
+        return false;
+    }
+
+    if (use_copy_scratch && !cloneScratch()) {
+        return false;
+    }
+
+    // in_scan_call brackets the API call so the callback can assert that it
+    // is only invoked from within it.
+    mctx->in_scan_call = true;
+    const hs_error_t rv = hs_scan(hdb.db, scan_buf, buffer.size(), 0, scratch,
+                                  callback, mctx);
+    mctx->in_scan_call = false;
+
+    if (g_verbose) {
+        out << "Scan call returned " << rv << endl;
+    }
+
+    if (mctx->terminated) {
+        const bool terminated_ok = (rv == HS_SCAN_TERMINATED);
+        if (g_verbose && !terminated_ok) {
+            out << "Scan should have returned HS_SCAN_TERMINATED, returned "
+                << rv << " instead." << endl;
+        }
+        return terminated_ok;
+    }
+
+    const bool scan_ok = (rv == HS_SUCCESS);
+    if (g_verbose && !scan_ok) {
+        out << "Scan should have returned HS_SUCCESS, returned " << rv
+            << " instead." << endl;
+    }
+
+    if (use_mangle_scratch) {
+        mangle_scratch(scratch);
+    }
+
+    return scan_ok;
+}
+
+// Serialises `stream` into a byte buffer and closes it.
+//
+// The required size is obtained by calling hs_compress_stream with no buffer
+// (expected to report HS_INSUFFICIENT_SPACE), then the stream is compressed
+// into a buffer of that size. Returns the compressed bytes, or an empty
+// vector on any failure.
+//
+// The stream is consumed in every case: callers drop their handle when this
+// fails, so it is closed on the failure paths too rather than being leaked.
+static
+vector<char> compressAndCloseStream(hs_stream_t *stream) {
+    size_t needed = 0;
+    hs_error_t err = hs_compress_stream(stream, nullptr, 0, &needed);
+    if (err != HS_INSUFFICIENT_SPACE) {
+        hs_close_stream(stream, nullptr, nullptr, nullptr);
+        return {};
+    }
+
+    vector<char> buf(needed);
+    err = hs_compress_stream(stream, buf.data(), needed, &needed);
+    if (err != HS_SUCCESS) {
+        hs_close_stream(stream, nullptr, nullptr, nullptr);
+        return {};
+    }
+    assert(needed == buf.size());
+
+    err = hs_close_stream(stream, nullptr, nullptr, nullptr);
+    if (err != HS_SUCCESS) {
+        return {};
+    }
+
+    return buf;
+}
+
+
+// Round-trips `stream` through compression: the stream is compressed and
+// closed, then a new stream is expanded from the compressed bytes.
+//
+// Returns the new stream, or nullptr if compression or expansion failed. The
+// original stream handle must not be used after this call.
+static
+hs_stream_t *compressAndExpandStream(const hs_database_t *db,
+                                     hs_stream_t *stream) {
+    vector<char> buf = compressAndCloseStream(stream);
+    if (buf.empty()) {
+        // Compression failed; don't hand an empty buffer to the expander.
+        return nullptr;
+    }
+
+    hs_stream_t *out = nullptr;
+    hs_error_t err = hs_expand_stream(db, &out, buf.data(), buf.size());
+
+    if (err != HS_SUCCESS) {
+        return nullptr;
+    }
+
+    return out;
+}
+
+// Round-trips `stream` through compression using the reset-and-expand path:
+// the stream is compressed and closed, a fresh stream is opened, and the
+// compressed state is expanded over it with hs_reset_and_expand_stream.
+//
+// Returns the new stream, or nullptr on any failure. The original stream
+// handle must not be used after this call.
+static
+hs_stream_t *compressAndResetExpandStream(const hs_database_t *db,
+                                          hs_stream_t *stream) {
+    vector<char> buf = compressAndCloseStream(stream);
+    if (buf.empty()) {
+        return nullptr;
+    }
+
+    hs_stream_t *out = nullptr;
+
+    hs_error_t err = hs_open_stream(db, 0, &out);
+
+    if (err != HS_SUCCESS) {
+        return nullptr;
+    }
+
+    err = hs_reset_and_expand_stream(out, buf.data(), buf.size(), nullptr,
+                                     nullptr, nullptr);
+    if (err != HS_SUCCESS) {
+        // The freshly opened stream is ours; close it so it is not leaked.
+        hs_close_stream(out, nullptr, nullptr, nullptr);
+        return nullptr;
+    }
+
+    return out;
+}
+
+// Scans `buffer` against `hdb` in streaming mode.
+//
+// A stream is opened (via open_magic_stream when g_streamOffset is non-zero,
+// which is only available in non-release builds), the buffer is written in
+// chunks of buffer.size() / m_streamBlocks bytes (minimum 1), and the stream
+// is then closed with the match callback attached. After each write the
+// optional stress features are applied in order: scratch cloning, stream
+// copying, scratch mangling, compress/expand and compress/reset/expand.
+// Finally the results are passed through filterLeftmostSom().
+//
+// `ctx_in` must point at the MultiContext that `callback` expects, and `rs`
+// at the ResultSet being filled. Returns true only if every step succeeded.
+//
+// NOTE(review): the early `return false` paths inside the loop leave `stream`
+// open; it is only closed on the success path and by the stream-copy step.
+// NOTE(review): assumes m_streamBlocks is non-zero (it is used as a divisor).
+bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer,
+ size_t align, match_event_handler callback,
+ void *ctx_in, ResultSet *rs) {
+ assert(colliderMode == MODE_STREAMING);
+ assert(!m_xcompile);
+
+ const hs_database_t *db = hdb.db;
+ assert(db);
+ MultiContext *ctx = (MultiContext *)ctx_in;
+
+ // open a stream
+ hs_stream_t *stream;
+ size_t stream_size;
+ int ret;
+
+ // The size itself is not used below; only the success of the query is
+ // checked.
+ ret = hs_stream_size(db, &stream_size);
+ if (ret != HS_SUCCESS) {
+ out << "Unable to size stream." << endl;
+ return false;
+ }
+
+ if (!g_streamOffset) {
+ ret = hs_open_stream(db, 0, &stream);
+ } else {
+#ifndef RELEASE_BUILD
+ ret = open_magic_stream(db, 0, &stream, scratch, g_streamOffset);
+#else
+ // Stream offsets are unsupported in release builds; fail below.
+ ret = HS_INVALID;
+#endif
+ }
+
+ if (ret != HS_SUCCESS) {
+ out << "Unable to open stream." << endl;
+ return false;
+ }
+
+ // scan our data, split into blocks and copied into a temporary buffer
+ // aligned as requested (out of paranoia)
+ unsigned blockSize = buffer.size() / m_streamBlocks;
+ if (blockSize == 0) {
+ blockSize = 1;
+ }
+ const char *ptr = buffer.c_str();
+ const char *end = ptr + buffer.size();
+ ctx->block = 0;
+
+ // We use a do-while loop here so that zero-byte cases still generate at
+ // least one hs_scan_stream call, since it's something users might try.
+ do {
+ if (ptr + blockSize > end) {
+ // last write is a runt
+ blockSize = end - ptr;
+ }
+ char *realigned = setupScanBuffer(ptr, blockSize, align);
+ if (!realigned) {
+ return false;
+ }
+ ctx->in_scan_call = true;
+ DEBUG_PRINTF("scan stream write %u\n", ctx->block);
+ ret = hs_scan_stream(stream, realigned, blockSize, 0, scratch,
+ callback, ctx);
+ DEBUG_PRINTF("scan %u done\n", ctx->block);
+ ctx->in_scan_call = false;
+
+ // Once the match limit has been reached the scan must report
+ // termination; otherwise it must report success.
+ if (limit_matches && rs->matches.size() == limit_matches) {
+ if (ret != HS_SCAN_TERMINATED) {
+ DEBUG_PRINTF("failure to scan %d\n", ret);
+ return false;
+ }
+ } else if (ret != HS_SUCCESS) {
+ DEBUG_PRINTF("failure to scan %d\n", ret);
+ return false;
+ }
+
+ if (use_copy_scratch && !cloneScratch()) {
+ return false;
+ }
+
+ // Optional: continue on a copy of the stream after disturbing and
+ // closing the original.
+ if (use_copy_stream) {
+ hs_stream_t *s2;
+ ret = hs_copy_stream(&s2, stream);
+ if (ret != HS_SUCCESS) {
+ DEBUG_PRINTF("failure to copy %d\n", ret);
+ return false;
+ }
+ /* do a short write to the old stream so that it is in the wrong
+ * state. */
+ char temp[2] = {0, 0};
+ ret = hs_scan_stream(stream, temp, sizeof(temp), 0, scratch,
+ nullptr, nullptr);
+
+ hs_error_t expected = HS_SUCCESS;
+ if (limit_matches && rs->matches.size() == limit_matches) {
+ expected = HS_SCAN_TERMINATED;
+ }
+ if (ret != expected) {
+ DEBUG_PRINTF("failure to scan %d\n", ret);
+ return false;
+ }
+ ret = hs_close_stream(stream, nullptr, nullptr, nullptr);
+ if (ret != HS_SUCCESS) {
+ DEBUG_PRINTF("failure to close %d\n", ret);
+ return false;
+ }
+ stream = s2;
+ }
+ if (use_mangle_scratch) {
+ mangle_scratch(scratch);
+ }
+
+ // Optional: replace the stream with one rebuilt from its compressed
+ // form.
+ if (use_compress_expand) {
+ auto rv = compressAndExpandStream(db, stream);
+ if (!rv) {
+ if (g_verbose) {
+ out << "Compress/Expand failed." << endl;
+ }
+ return false;
+ } else {
+ stream = rv;
+ }
+ }
+
+ // Optional: as above, but expanding over a freshly opened stream.
+ if (use_compress_reset_expand) {
+ auto rv = compressAndResetExpandStream(db, stream);
+ if (!rv) {
+ if (g_verbose) {
+ out << "Compress/Expand failed." << endl;
+ }
+ return false;
+ } else {
+ stream = rv;
+ }
+ }
+
+ ptr += blockSize;
+ ctx->block++;
+ } while (ptr < end);
+
+ // close the stream
+ ctx->in_scan_call = true;
+ DEBUG_PRINTF("close stream %u\n", ctx->block);
+ ret = hs_close_stream(stream, scratch, callback, ctx);
+ DEBUG_PRINTF("close stream done\n");
+ ctx->in_scan_call = false;
+
+ if (ret != HS_SUCCESS) {
+ return false;
+ }
+
+ // UE2 cannot dedupe SOM matches across stream boundaries, so we must
+ // filter them out.
+ filterLeftmostSom(*rs);
+
+ // ret is known to be HS_SUCCESS here, so this is always true.
+ return ret == HS_SUCCESS;
+}
+
+// Vectored-mode scan: splits `buffer` into blocks of buffer.size() /
+// m_streamBlocks bytes (minimum one byte), copies each block into its own
+// realigned buffer via setupVecScanBuffer(), and passes all of them to a
+// single hs_scan_vector() call. Returns false if a block buffer cannot be set
+// up, if scratch cloning fails, or if the scan returns an unexpected status;
+// returns true otherwise.
+bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer,
+ size_t align, match_event_handler callback,
+ void *ctx_in, ResultSet *rs) {
+ assert(colliderMode == MODE_VECTORED);
+ assert(!m_xcompile);
+
+ const hs_database_t *db = hdb.db;
+ assert(db);
+ MultiContext *ctx = (MultiContext *)ctx_in;
+
+ int ret;
+
+ assert(!g_streamOffset);
+
+ // scan our data, split into blocks and copied into a temporary buffer
+ // aligned as requested (out of paranoia)
+ unsigned blockSize = buffer.size() / m_streamBlocks;
+ if (blockSize == 0) {
+ blockSize = 1;
+ }
+ const char *ptr = buffer.c_str();
+ const char *end = ptr + buffer.size();
+ ctx->block = 0;
+
+ // We use a do-while loop below so that zero-byte cases still set up one
+ // (empty) block for the hs_scan_vector call, since it's something users
+ // might try.
+
+ // Parallel arrays handed to hs_scan_vector: block pointers and lengths.
+ vector<const char *> data;
+ vector<unsigned int> length;
+
+ // Number of blocks the loop below will produce (at least one); make sure
+ // there is a backing buffer in raw_blocks for each of them.
+ u32 block_count = (buffer.size() + blockSize - 1) / blockSize;
+ block_count = MAX(block_count, 1);
+
+ if (block_count > raw_blocks.size()) {
+ raw_blocks.resize(block_count);
+ }
+
+ do {
+ if (ptr + blockSize > end) {
+ // last write is a runt
+ blockSize = end - ptr;
+ }
+ // ctx->block doubles as the index of the raw_blocks entry to use.
+ char *realigned = setupVecScanBuffer(ptr, blockSize, align, ctx->block);
+ if (!realigned) {
+ return false;
+ }
+
+ data.push_back(realigned);
+ length.push_back(blockSize);
+
+ ptr += blockSize;
+ ctx->block++;
+
+ } while (ptr < end);
+
+ if (use_copy_scratch && !cloneScratch()) {
+ return false;
+ }
+
+ DEBUG_PRINTF("scan vectored write %u\n", ctx->block);
+ // in_scan_call is set only for the duration of the Hyperscan call.
+ ctx->in_scan_call = true;
+ ret = hs_scan_vector(db, &data[0], &length[0], ctx->block, 0, scratch,
+ callback, ctx);
+ ctx->in_scan_call = false;
+ DEBUG_PRINTF("scan %u done\n", ctx->block);
+ if (use_mangle_scratch) {
+ mangle_scratch(scratch);
+ }
+
+ rs->dupe_matches.clear(); /* TODO: dedupe across vectored blocks */
+
+ // When the match limit has been reached the scan is expected to report
+ // HS_SCAN_TERMINATED; in every other case it must report HS_SUCCESS.
+ if (limit_matches && rs->matches.size() == limit_matches) {
+ if (ret != HS_SCAN_TERMINATED) {
+ DEBUG_PRINTF("failure to scan %d\n", ret);
+ return false;
+ }
+ } else if (ret != HS_SUCCESS) {
+ DEBUG_PRINTF("failure to scan %d\n", ret);
+ return false;
+ }
+
+ // UE2 cannot dedupe SOM matches across vector block boundaries, so we must
+ // filter them out.
+ filterLeftmostSom(*rs);
+
+ return true;
+}
+
+// Scans `buffer` against the database `hdb` using the scan routine that
+// matches the global colliderMode, collecting results in `rs`.
+//
+// Scratch is (re)allocated for `hdb` first; if that fails a message is written
+// to the output stream and false is returned. Otherwise the return value is
+// that of the selected scan routine.
+bool UltimateTruth::run(unsigned int id, shared_ptr<const HyperscanDB> hdb,
+                        const string &buffer, bool single_pattern,
+                        unsigned int align, ResultSet &rs) {
+    assert(hdb);
+    assert(!m_xcompile);
+
+    // Scratch has to suit this particular database before we can scan.
+    const bool scratch_ready = allocScratch(hdb);
+    if (!scratch_ready) {
+        out << "Scratch alloc failed." << endl;
+        return false;
+    }
+
+    const HyperscanDB &database = *hdb;
+    MultiContext ctx(id, database, &rs, single_pattern, out);
+
+    // With a corpora suffix configured, record the offset at which the
+    // suffix begins in the callback context.
+    const bool has_suffix = !g_corpora_suffix.empty();
+    if (has_suffix) {
+        ctx.use_max_offset = true;
+        ctx.max_offset = buffer.size() - g_corpora_suffix.size();
+    }
+
+    if (colliderMode == MODE_BLOCK) {
+        return blockScan(database, buffer, align, callbackMulti, &ctx, &rs);
+    }
+    if (colliderMode == MODE_STREAMING) {
+        return streamingScan(database, buffer, align, callbackMulti, &ctx,
+                             &rs);
+    }
+    if (colliderMode == MODE_VECTORED) {
+        return vectoredScan(database, buffer, align, callbackMulti, &ctx, &rs);
+    }
+
+    // Unknown mode: should be unreachable.
+    assert(0);
+    return false;
+}
+
+// Returns true if hs_expression_info() reports that `expr`, compiled with
+// `flags`, does not produce unordered matches. Patterns using
+// HS_FLAG_SOM_LEFTMOST, and patterns for which hs_expression_info() fails,
+// are reported as not ordered.
+static
+bool isOrdered(const string &expr, unsigned int flags) {
+    // SOM doesn't produce ordered matches?
+    const bool som_leftmost = (flags & HS_FLAG_SOM_LEFTMOST) != 0;
+    if (som_leftmost) {
+        return false;
+    }
+
+    hs_expr_info_t *expr_info = nullptr;
+    hs_compile_error_t *compile_error = nullptr;
+    const hs_error_t rv = hs_expression_info(expr.c_str(), flags, &expr_info,
+                                             &compile_error);
+
+    bool in_order = false;
+    if (rv == HS_SUCCESS) {
+        assert(expr_info);
+        // Any pattern that does not require offset adjustment should produce
+        // matches in order.
+        in_order = !expr_info->unordered_matches;
+    } else {
+        // Expression will fail compilation and report error elsewhere.
+        hs_free_compile_error(compile_error);
+    }
+
+    free(expr_info);
+    return in_order;
+}
+
+// Compiles the given parallel arrays of patterns, flags, IDs and extended
+// parameters with hs_compile_multi_int(). On success returns a HyperscanDB
+// wrapping the compiled database and the ID list; on failure copies the
+// compiler's message into `error`, frees the compile error and returns
+// nullptr.
+static unique_ptr<HyperscanDB>
+compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
+                 vector<unsigned> &idsvec, ptr_vector<hs_expr_ext> &ext,
+                 unsigned mode, const hs_platform_info *platform, string &error,
+                 const Grey &grey) {
+    hs_database_t *compiled = nullptr;
+    hs_compile_error_t *failure;
+    const unsigned num_patterns = patterns.size();
+
+    const hs_error_t rv = hs_compile_multi_int(
+        patterns.data(), flags.data(), idsvec.data(), ext.c_array(),
+        num_patterns, mode, platform, &compiled, &failure, grey);
+
+    if (rv == HS_SUCCESS) {
+        return ue2::make_unique<HyperscanDB>(compiled, idsvec.begin(),
+                                             idsvec.end());
+    }
+
+    error = failure->message;
+    hs_free_compile_error(failure);
+    return nullptr;
+}
+
+// Compiles the expressions identified by `ids` into one Hyperscan database.
+//
+// Each ID is looked up in m_expr and decoded with readExpression(); the
+// globally forced flags (UTF8, prefilter, SOM, edit distance) are then applied
+// on top. After a successful compile, the IDs of patterns that isOrdered()
+// reports as ordered are recorded in the database's `ordered` set.
+//
+// Returns nullptr, with a description in `error`, if an ID is unknown, an
+// expression cannot be decoded, compilation fails, or a pattern that demands
+// ordering is not reported as ordered.
+shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
+                                               string &error) const {
+    // Parallel per-pattern arrays handed to the compiler.
+    const size_t count = ids.size();
+    vector<unsigned> idsvec(ids.begin(), ids.end());
+    vector<string> expressions(count);
+    vector<unsigned> flags(count);
+    vector<bool> check_ordered(count, false);
+    ptr_vector<hs_expr_ext> ext;
+    ext.reserve(count);
+
+    // idsvec was filled from `ids`, so walking it by index visits the IDs in
+    // the same order as iterating the set.
+    for (size_t slot = 0; slot < count; ++slot) {
+        const auto found = m_expr.find(idsvec[slot]);
+        if (found == m_expr.end()) {
+            error = "Unable to find ID.";
+            return nullptr;
+        }
+
+        ext.push_back(new hs_expr_ext);
+        bool must_be_ordered;
+        const bool parsed =
+            readExpression(found->second, expressions[slot], &flags[slot],
+                           &ext[slot], &must_be_ordered);
+        if (!parsed) {
+            ostringstream msg;
+            msg << "Unable to decode flags: '" << found->first << ":"
+                << found->second << "'.";
+            error = msg.str();
+            return nullptr;
+        }
+        check_ordered[slot] = must_be_ordered;
+
+        // Apply the flags forced on every pattern from the command line.
+        if (force_utf8) {
+            flags[slot] |= HS_FLAG_UTF8;
+        }
+        if (force_prefilter) {
+            flags[slot] |= HS_FLAG_PREFILTER;
+        }
+        if (somFlags) {
+            flags[slot] |= somFlags;
+        }
+        if (force_edit_distance) {
+            ext[slot].flags |= HS_EXT_FLAG_EDIT_DISTANCE;
+            ext[slot].edit_distance = edit_distance;
+        }
+    }
+
+    // Our compiler takes an array of plain ol' C strings.
+    vector<const char *> patterns;
+    patterns.reserve(count);
+    for (const string &expression : expressions) {
+        patterns.push_back(expression.c_str());
+    }
+
+    if (count == 0) {
+        /* slight hack to allow us to compile empty sets cleanly */
+        patterns.push_back(nullptr);
+        flags.push_back(0);
+        idsvec.push_back(0);
+    }
+
+    // Compile
+    auto db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, platform,
+                               error, grey);
+    if (!db) {
+        return nullptr;
+    }
+
+    // Track IDs of patterns that require ordering for validation at match
+    // time.
+    for (size_t slot = 0; slot < count; ++slot) {
+        const bool is_ordered = isOrdered(expressions[slot], flags[slot]);
+        if (!is_ordered && check_ordered[slot]) {
+            error = "Ordering required, but hs_expression_info suggests "
+                    "that ordering is not guaranteed.";
+            return nullptr;
+        }
+        if (is_ordered) {
+            db->ordered.insert(idsvec[slot]);
+        }
+    }
+
+    return move(db);
+}
+
+// Makes sure `scratch` has been allocated for `db` via hs_alloc_scratch().
+// Returns false only if that call fails.
+bool UltimateTruth::allocScratch(shared_ptr<const HyperscanDB> db) {
+    assert(db);
+
+    // last_db remembers the database of the most recent successful
+    // allocation, so repeated calls with the same HyperscanDB skip the
+    // allocator entirely.
+    if (last_db != db) {
+        if (hs_alloc_scratch(db->db, &scratch) != HS_SUCCESS) {
+            return false;
+        }
+        last_db = db;
+    }
+
+    return true;
+}
+
+// Replaces `scratch` with a clone made by hs_clone_scratch() and frees the
+// previous scratch. Returns false if either the clone or the free fails; if
+// the clone fails, `scratch` is left untouched.
+bool UltimateTruth::cloneScratch(void) {
+    hs_scratch_t *replacement;
+    hs_error_t rv = hs_clone_scratch(scratch, &replacement);
+    if (rv != HS_SUCCESS) {
+        DEBUG_PRINTF("failure to clone %d\n", rv);
+        return false;
+    }
+
+    // Switch to the clone before releasing the scratch it was made from.
+    hs_scratch_t *const previous = scratch;
+    scratch = replacement;
+
+    rv = hs_free_scratch(previous);
+    if (rv != HS_SUCCESS) {
+        DEBUG_PRINTF("failure to free %d\n", rv);
+        return false;
+    }
+
+    DEBUG_PRINTF("scratch cloned from %p to %p\n", previous, scratch);
+    return true;
+}
+
+// Copies `len` bytes from `begin` into m_scanBuf at an address whose value
+// modulo MAX_MAX_UE2_ALIGN equals `align`, and returns a pointer to the copy.
+// Returns nullptr if `align` is not below MAX_MAX_UE2_ALIGN.
+char * UltimateTruth::setupScanBuffer(const char *begin, size_t len,
+                                      size_t align) {
+    if (align >= MAX_MAX_UE2_ALIGN) {
+        return nullptr;
+    }
+
+    // Grow the buffer so that any shift below MAX_MAX_UE2_ALIGN still leaves
+    // room for `len` bytes.
+    const size_t required = len + MAX_MAX_UE2_ALIGN;
+    if (m_scanBuf.size() < required) {
+        m_scanBuf.resize(required);
+    }
+
+    char *const base = m_scanBuf.data();
+    const size_t misalign = (uintptr_t)base % MAX_MAX_UE2_ALIGN;
+
+    // Smallest forward shift that takes `base` to the requested alignment.
+    const size_t shift = (align >= misalign)
+                             ? align - misalign
+                             : MAX_MAX_UE2_ALIGN - (misalign - align);
+    char *const target = base + shift;
+    assert((uintptr_t)(target) % MAX_MAX_UE2_ALIGN == align);
+
+    // copy the buffer
+    memcpy(target, begin, len);
+    return target;
+}
+
+// Vectored-mode variant of setupScanBuffer(): copies `len` bytes from `begin`
+// into raw_blocks[block_id] at an address whose value modulo
+// MAX_MAX_UE2_ALIGN equals `align`, and returns a pointer to the copy.
+// Returns nullptr if `align` is not below MAX_MAX_UE2_ALIGN. The caller must
+// already have sized raw_blocks to hold `block_id`.
+char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len,
+                                        size_t align, u32 block_id) {
+    if (align >= MAX_MAX_UE2_ALIGN) {
+        return nullptr;
+    }
+
+    assert(block_id < raw_blocks.size());
+    vector<char> &storage = raw_blocks[block_id];
+
+    // Grow the block so that any shift below MAX_MAX_UE2_ALIGN still leaves
+    // room for `len` bytes.
+    const size_t required = len + MAX_MAX_UE2_ALIGN;
+    if (storage.size() < required) {
+        storage.resize(required);
+    }
+    assert(required <= storage.size());
+
+    char *const base = &storage[0];
+    const size_t misalign = (uintptr_t)base % MAX_MAX_UE2_ALIGN;
+
+    // Smallest forward shift that takes `base` to the requested alignment.
+    const size_t shift = (align >= misalign)
+                             ? align - misalign
+                             : MAX_MAX_UE2_ALIGN - (misalign - align);
+    char *const target = base + shift;
+    assert((uintptr_t)(target) % MAX_MAX_UE2_ALIGN == align);
+
+    // copy the buffer
+    memcpy(target, begin, len);
+    return target;
+}
+
+// Forwards to the global ::saveDatabase() helper to write the database held
+// by `hdb` to `filename`, passing along the global verbosity setting, and
+// returns that helper's result.
+bool UltimateTruth::saveDatabase(const HyperscanDB &hdb,
+                                 const string &filename) const {
+    const char *path = filename.c_str();
+    return ::saveDatabase(hdb.db, path, g_verbose);
+}
+
+// Loads a database from `filename` with the global ::loadDatabase() helper
+// and wraps it, together with `ids`, in a HyperscanDB. The `ordered` set is
+// then rebuilt by re-parsing every expression in `ids` and checking it with
+// isOrdered().
+//
+// Returns an empty pointer if the file cannot be loaded, or (after printing a
+// message to cerr and asserting) if an ID is unknown or its expression cannot
+// be parsed.
+shared_ptr<HyperscanDB>
+UltimateTruth::loadDatabase(const string &filename,
+                            const std::set<unsigned> &ids) const {
+    hs_database_t *raw_db = ::loadDatabase(filename.c_str(), g_verbose);
+    if (raw_db == nullptr) {
+        return nullptr;
+    }
+
+    auto db = make_shared<HyperscanDB>(raw_db, ids.begin(), ids.end());
+    assert(db);
+
+    // Fill db::ordered with the expressions that require the ordered flag.
+    for (auto it = ids.begin(); it != ids.end(); ++it) {
+        const unsigned id = *it;
+
+        const auto found = m_expr.find(id);
+        if (found == m_expr.end()) {
+            cerr << "Can't find expression with ID " << id << endl;
+            assert(0);
+            return nullptr;
+        }
+
+        string expr;
+        hs_expr_ext ext;
+        unsigned int flags;
+        const bool parsed = readExpression(found->second, expr, &flags, &ext);
+        if (!parsed) {
+            cerr << "Can't parse expression with ID " << id << ": "
+                 << found->second << endl;
+            assert(0);
+            return nullptr;
+        }
+
+        if (isOrdered(expr, flags)) {
+            db->ordered.insert(id);
+        }
+    }
+
+    return db;
+}
+
+// Returns the compile mode flags (m_mode) as the value describing this
+// object's compile options.
+unsigned int UltimateTruth::describe() const {
+    const unsigned int mode_flags = m_mode;
+    return mode_flags;
+}
+
+// Builds a textual description of a database (the value of describe()
+// followed by every ID in `ids`, space separated), takes its CRC32C, and
+// returns the checksum as eight zero-padded lower-case hex digits, suitable
+// for use as a filename.
+string UltimateTruth::dbSettingsHash(const set<unsigned int> &ids) const {
+    ostringstream description;
+
+    // settings from UltimateTruth::describe()
+    description << ' ' << describe() << ' ';
+
+    // our set
+    for (auto it = ids.begin(); it != ids.end(); ++it) {
+        description << *it << ' ';
+    }
+
+    const string text = description.str();
+    const u32 checksum = Crc32c_ComputeBuf(0, text.data(), text.size());
+
+    // printable version of the digest
+    ostringstream digest;
+    digest << hex << setw(8) << setfill('0') << checksum << dec;
+    return digest.str();
+}
+
+// Returns the path "<serializePath>/<hash>.db" for the database built from
+// `ids`, where <hash> is the value of dbSettingsHash(ids).
+string UltimateTruth::dbFilename(const set<unsigned int> &ids) const {
+    const string digest = dbSettingsHash(ids);
+
+    ostringstream path;
+    path << serializePath << '/' << digest << ".db";
+    return path.str();
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ULTIMATETRUTH_H
+#define ULTIMATETRUTH_H
+
+#include "expressions.h"
+
+#include "hs.h"
+
+#include <memory>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+namespace ue2 {
+
+struct Grey;
+
+} // namespace ue2
+
+class HyperscanDB;
+class ResultSet;
+
+// Wrapper around ue2 to generate results for an expression and corpus.
+//
+// An instance compiles (or loads) Hyperscan databases for sets of expression
+// IDs and scans corpora against them, owning the scratch space and the
+// temporary buffers used to scan data at a chosen alignment.
+class UltimateTruth : boost::noncopyable {
+public:
+ // The stream, expression map and grey box are held by reference and the
+ // platform by pointer, so they must outlive this object.
+ // NOTE(review): streamBlocks presumably initialises m_streamBlocks -
+ // confirm against the constructor definition.
+ UltimateTruth(std::ostream &os, const ExpressionMap &expr,
+ const hs_platform_info *plat, const ue2::Grey &grey,
+ unsigned streamBlocks = 0);
+
+ ~UltimateTruth();
+
+ // Compile the expressions identified by `ids` into a single database.
+ // Returns a null pointer on failure, with a description in `error`.
+ std::shared_ptr<HyperscanDB> compile(const std::set<unsigned> &ids,
+ std::string &error) const;
+
+ // Write the given database to `filename`.
+ bool saveDatabase(const HyperscanDB &db,
+ const std::string &filename) const;
+
+ // Load a database from `filename`; `ids` names the expressions it was
+ // built from. Returns a null pointer on failure.
+ std::shared_ptr<HyperscanDB>
+ loadDatabase(const std::string &filename,
+ const std::set<unsigned> &ids) const;
+
+ // Are we runnable? (i.e. not xcompiling)
+ bool runnable() const {
+ return !m_xcompile;
+ }
+
+ // Scan `buffer` against `db` at the given alignment, in whichever scan
+ // mode is globally selected, collecting results in `rs`. Returns false if
+ // scratch allocation or the scan itself fails.
+ bool run(unsigned id, std::shared_ptr<const HyperscanDB> db,
+ const std::string &buffer, bool single_pattern, unsigned align,
+ ResultSet &rs);
+
+ // Returns a value completely representing this object's compile options.
+ unsigned int describe() const;
+
+ // Path of the serialized database file for the given set of IDs.
+ std::string dbFilename(const std::set<unsigned int> &ids) const;
+
+private:
+ // Per-mode scan implementations dispatched to by run(). `ctx` is the
+ // opaque context pointer that is passed through to `callback`.
+ bool blockScan(const HyperscanDB &db, const std::string &buffer,
+ size_t align, match_event_handler callback, void *ctx,
+ ResultSet *rs);
+ bool streamingScan(const HyperscanDB &db, const std::string &buffer,
+ size_t align, match_event_handler callback, void *ctx,
+ ResultSet *rs);
+ bool vectoredScan(const HyperscanDB &db, const std::string &buffer,
+ size_t align, match_event_handler callback, void *ctx,
+ ResultSet *rs);
+
+ // Copy `buf` into m_scanBuf at the requested alignment and return a
+ // pointer to the copy (nullptr if `align` is out of range).
+ char *setupScanBuffer(const char *buf, size_t len, size_t align);
+
+ // As setupScanBuffer(), but copies into raw_blocks[block_id].
+ char *setupVecScanBuffer(const char *buf, size_t len, size_t align,
+ unsigned int block_id);
+
+ // Ensure `scratch` has been allocated for the given database.
+ bool allocScratch(std::shared_ptr<const HyperscanDB> db);
+
+ // Replace `scratch` with a clone of itself and free the original.
+ bool cloneScratch(void);
+
+ // Hex digest of describe() plus the ID set; used by dbFilename().
+ std::string dbSettingsHash(const std::set<unsigned int> &ids) const;
+
+ // Grey box settings passed to the compiler.
+ const ue2::Grey &grey;
+
+ // Output stream.
+ std::ostream &out;
+
+ // Our expression map
+ const ExpressionMap &m_expr;
+
+ // Are we cross-compiling, and therefore unable to scan at all?
+ bool m_xcompile;
+
+ // Our mode flags to pass into the compiler: calculated from streaming,
+ // etc.
+ unsigned m_mode;
+
+ // In streaming mode, what is the number of blocks to chop data into?
+ unsigned m_streamBlocks;
+
+ // Scratch space for Hyperscan.
+ hs_scratch_t *scratch;
+
+ // Temporary scan buffer used for realigned scanning
+ std::vector<char> m_scanBuf;
+
+ std::vector<std::vector<char> > raw_blocks; /* temp scan buffers used by
+ * vectored mode */
+
+ // Last database we successfully allocated scratch for, so that we can
+ // avoid unnecessarily reallocating for it.
+ std::shared_ptr<const HyperscanDB> last_db;
+
+ // Target platform passed to the compiler.
+ const hs_platform_info *platform;
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ng_corpus_properties.h"
+#include "args.h"
+#include "common.h"
+#include "cross_compile.h"
+#include "util/expression_path.h"
+#include "util/string_util.h"
+
+#include "grey.h"
+#include "ue2common.h"
+#include "hs_compile.h" // for HS_MODE_*
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <getopt.h>
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+using namespace ue2;
+using namespace std;
+
+// Prints the command-line help text to stdout.
+//
+//   name   program name shown on the "Usage:" line (normally argv[0]).
+//   error  optional message printed after the help text as "Error: ...";
+//          pass nullptr when help was requested explicitly.
+//
+// This function only prints; callers decide whether to exit afterwards.
+static
+void usage(const char *name, const char *error) {
+    printf("Usage: %s [OPTIONS...]\n\n", name);
+    printf("General Options:\n\n");
+    printf(" -h Display help and exit.\n");
+    printf(" -G OVERRIDES Overrides for the grey box.\n");
+    printf(" -e PATH Path to expression directory or file.\n");
+    printf(" -s FILE Signature file to use.\n");
+    printf(" -z NUM Signature ID to use.\n");
+    printf(" -c FILE Load corpora from FILE rather than using "
+           "generator.\n");
+    printf(" -w FILE After running, save corpora (with matches) to "
+           "FILE.\n");
+    printf(" -a [BAND] Compile all expressions in UE2 (but still match "
+           "singly).\n");
+    printf(" If BAND, compile patterns in groups of size "
+           "BAND.\n");
+    printf(" -t NUM Use streaming mode, split data into ~NUM "
+           "blocks.\n");
+    printf(" -V NUM Use vectored mode, split data into ~NUM "
+           "blocks.\n");
+    printf(" -Z {R or 0-%d} Only test one alignment, either as given or "
+           "'R' for random.\n", MAX_MAX_UE2_ALIGN - 1);
+    printf(" -q Quiet; display only match differences, no other "
+           "failures.\n");
+    printf(" -v Verbose; display successes as well as "
+           "failures.\n");
+    printf("\n");
+    printf("Pattern flags:\n");
+    printf("\n");
+    printf(" -8 Force UTF8 mode on all patterns.\n");
+    printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n");
+    printf(" -E DISTANCE Match all patterns within edit distance"
+           " DISTANCE.\n");
+    printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
+    printf("\n");
+    printf("Testing mode options:\n");
+    printf("\n");
+    printf(" -d NUM Set SOM precision mode (default: 8 (large)).\n");
+    printf(" -O NUM In streaming mode, set initial offset to NUM.\n");
+    printf(" -k NUM Terminate callback after NUM matches per "
+           "pattern.\n");
+    printf(" --copy-scratch Copy scratch after each scan call.\n");
+    printf(" --copy-stream Copy stream state after each scan call.\n");
+    printf(" --compress-expand Compress and expand stream state after each "
+           "scan call.\n");
+    printf(" --compress-reset-expand Compress, reset and expand stream state "
+           "after each scan call.\n");
+    printf(" --mangle-scratch Mangle scratch space after each scan call.\n");
+    printf(" --no-nfa Disable NFA graph execution engine.\n");
+    printf(" --no-pcre Disable PCRE engine.\n");
+    printf(" --test-nfa Disable UE2 engine (test NFA against PCRE).\n");
+    printf(" --abort-on-fail Abort, rather than exit, on failure.\n");
+    // Fixed: this line used to read "Do not handle handle signals".
+    printf(" --no-signal-handler Do not handle signals (to generate "
+           "backtraces).\n");
+    printf("\n");
+    printf("Memory and resource control options:\n");
+    printf("\n");
+    printf(" -T NUM Run with NUM threads.\n");
+    printf(" -M NUM Set maximum memory allocated to NUM megabytes per"
+           " thread.\n");
+    printf(" (0 means no limit, default is 1000 MB).\n");
+    printf(" -m NUM Set PCRE_MATCH_LIMIT (default: %lu).\n",
+           DEFAULT_PCRE_MATCH_LIMIT);
+    printf(" -r NUM Set PCRE_MATCH_LIMIT_RECURSION (default: %lu).\n",
+           DEFAULT_PCRE_MATCH_RECURSION_LIMIT);
+    printf("\n");
+    printf("Cross-compiling:\n");
+    printf("\n");
+    printf(" -x NAME Cross-compile for arch NAME.\n");
+    printf(" -i DIR Don't compile, load from files in DIR "
+           "instead.\n");
+    printf(" -o DIR After compiling, save to files in DIR.\n");
+    printf("\n");
+    printf("Corpus generation options:\n");
+    printf("\n");
+    printf(" -n NUM Max corpora to generate for a given signature "
+           "(default: %u).\n", DEFAULT_CORPUS_GENERATOR_LIMIT);
+    printf(" -R NUM Random seed to use (default: seeded from "
+           "time()).\n");
+    printf(" -p NUM,NUM,NUM Percentage probabilities of "
+           "(match,unmatch,random) char.\n");
+    printf(" -C NUM,NUM Follow cycles (min,max) times.\n");
+    printf(" -P NUM,NUM Add a random prefix of length between "
+           "(min,max).\n");
+    printf(" -S NUM,NUM Add a random suffix of length between "
+           "(min,max).\n");
+    printf(" -D NUM Apply an edit distance (default: 0) to each "
+           "corpus.\n");
+    printf(" -b NUM Limit alphabet to NUM characters, starting at "
+           "lower-case 'a'.\n");
+    printf("\n");
+
+    if (error) {
+        printf("Error: %s\n", error);
+    }
+}
+
+// Parses the command line with getopt_long() and stores the result in the
+// tool's global settings and in the output parameters.
+//
+//   argc, argv       the command line as passed to main().
+//   corpus_gen_prop  receives the corpus generation options
+//                    (-b, -C, -D, -n, -p, -P, -R, -S).
+//   corpora          receives the corpus file names that follow -c.
+//   grey             grey box settings overridden by -G (only in builds
+//                    without RELEASE_BUILD defined).
+//   plat_out         receives the target platform when -x is given.
+//
+// Any invalid argument or inconsistent combination of options prints the
+// usage text with an error message and terminates the process with exit(1);
+// -h prints the usage text and terminates with exit(0).
+void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
+                 vector<string> *corpora, UNUSED Grey *grey,
+                 unique_ptr<hs_platform_info> *plat_out) {
+    // The leading '-' makes getopt_long() hand back non-option arguments in
+    // place, as option code 1; that is how the optional argument to -a and
+    // the file names after -c are collected (see case 1 below).
+    static const char options[]
+        = "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
+    // Countdowns: set to 2 by -a / -c and decremented after every argument,
+    // so they are still non-zero while the next argument is processed.
+    s32 in_multi = 0;
+    s32 in_corpora = 0;
+    int pcreFlag = 1;
+    int nfaFlag = 1;
+    int ue2Flag = 1;
+    int copyScratch = 0;
+    int copyStream = 0;
+    int mangleScratch = 0;
+    int compressFlag = 0;
+    int compressResetFlag = 0;
+    // Long options only set a flag variable; getopt_long() returns 0 for
+    // them.
+    static const struct option longopts[] = {
+        {"copy-scratch", 0, &copyScratch, 1},
+        {"copy-stream", 0, &copyStream, 1},
+        {"mangle-scratch", 0, &mangleScratch, 1},
+        {"prefilter", 0, &force_prefilter, 1},
+        {"no-pcre", 0, &pcreFlag, 0},
+        {"no-nfa", 0, &nfaFlag, 0},
+        {"test-nfa", 0, &ue2Flag, 0},
+        {"abort-on-fail", 0, &abort_on_failure, 1},
+        {"no-signal-handler", 0, &no_signal_handler, 1},
+        {"compress-expand", 0, &compressFlag, 1},
+        {"compress-reset-expand", 0, &compressResetFlag, 1},
+        {nullptr, 0, nullptr, 0}};
+
+    for (;;) {
+        int c = getopt_long(argc, argv, options, longopts, nullptr);
+        if (c < 0) {
+            break;
+        }
+
+        switch (c) {
+        case 'a':
+            g_ue2CompileAll = true;
+            in_multi = 2;
+            break;
+        case 'b': {
+            unsigned sz;
+            if (!fromString(optarg, sz) || sz > 256) {
+                usage(argv[0], "Must provide an integer argument <= 256 "
+                               "to '-b' flag");
+                exit(1);
+            }
+            corpus_gen_prop.alphabetSize = sz;
+            break;
+        }
+        case 'c':
+            in_corpora = 2;
+            break;
+        case 'C': {
+            vector<unsigned> nums;
+            if (!strToList(optarg, nums) || nums.size() != 2
+                || nums[0] > nums[1]) {
+                usage(argv[0], "Cycle limit '-C' argument takes a list of "
+                               "two integers: MIN,MAX");
+                exit(1);
+            }
+            corpus_gen_prop.setCycleLimit(nums[0], nums[1]);
+            break;
+        }
+        case 'd': {
+            unsigned dist;
+            if (!fromString(optarg, dist)) {
+                usage(argv[0],
+                      "Must provide an integer argument to '-d' flag");
+                exit(1);
+            }
+            switch (dist) {
+            case 2:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
+                break;
+            case 4:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
+                break;
+            case 8:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
+                break;
+            default:
+                usage(argv[0], "SOM precision must be 2, 4 or 8");
+                exit(1);
+            }
+            break;
+        }
+        case 'D': {
+            unsigned dist;
+            if (!fromString(optarg, dist)) {
+                usage(argv[0],
+                      "Must provide an integer argument to '-D' flag");
+                exit(1);
+            }
+            corpus_gen_prop.editDistance = dist;
+            break;
+        }
+        case 'e':
+            g_exprPath.assign(optarg);
+            break;
+        case 'E': {
+            u32 dist;
+            if (!fromString(optarg, dist)) {
+                usage(argv[0], "Argument to '-E' flag must be an integer");
+                exit(1);
+            }
+            force_edit_distance = true;
+            edit_distance = dist;
+            break;
+        }
+#ifndef RELEASE_BUILD
+        case 'G':
+            applyGreyOverrides(grey, string(optarg));
+            break;
+#endif
+        case 'h':
+            usage(argv[0], nullptr);
+            exit(0);
+        case 'i':
+            loadDatabases = true;
+            serializePath = optarg;
+            break;
+        case 'k':
+            if (!fromString(optarg, limit_matches) || limit_matches < 1) {
+                usage(argv[0],
+                      "Must provide a positive integer argument to '-k' "
+                      "flag");
+                exit(1);
+            }
+            break;
+        case 'L':
+            somFlags = HS_FLAG_SOM_LEFTMOST;
+            break;
+        case 'm':
+            if (!fromString(optarg, g_matchLimit) || g_matchLimit < 1) {
+                usage(argv[0],
+                      "Must provide a positive integer argument to '-m' "
+                      "flag");
+                exit(1);
+            }
+            break;
+        case 'M':
+            if (!fromString(optarg, g_memoryLimit)) {
+                usage(argv[0],
+                      "Must provide a positive (or zero) integer argument "
+                      "to '-M' flag");
+                exit(1);
+            }
+            break;
+        case 'n': {
+            unsigned int count;
+            if (!fromString(optarg, count)) {
+                usage(argv[0], "Argument to '-n' flag must be an integer");
+                exit(1);
+            }
+            corpus_gen_prop.corpusLimit = count;
+            break;
+        }
+        case 'o':
+            saveDatabases = true;
+            serializePath = optarg;
+            break;
+        case 'O':
+            if (!fromString(optarg, g_streamOffset)) {
+                usage(argv[0],
+                      "Argument to '-O' flag must be a positive integer");
+                exit(1);
+            }
+            break;
+        case 'p': {
+            vector<unsigned> prob;
+            if (!strToList(optarg, prob) || prob.size() != 3) {
+                usage(argv[0], "Probabilities '-p' argument takes a list "
+                               "of three integers: MATCH,UNMATCH,RANDOM");
+                exit(1);
+            }
+            if (!corpus_gen_prop.setPercentages(prob[0], prob[1],
+                                                prob[2])) {
+                usage(argv[0],
+                      "Unable to set corpus generator probabilities.");
+                exit(1);
+            }
+            break;
+        }
+        case 'P': {
+            vector<unsigned> nums;
+            if (!strToList(optarg, nums) || nums.size() != 2
+                || nums[0] > nums[1]) {
+                usage(argv[0], "Prefix '-P' argument takes a list of two"
+                               " integers: MIN,MAX");
+                exit(1);
+            }
+            corpus_gen_prop.prefixRange = min_max(nums[0], nums[1]);
+            break;
+        }
+        case 'q':
+            g_quiet++;
+            break;
+        case 'r':
+            if (!fromString(optarg, g_matchLimitRecursion)
+                || g_matchLimitRecursion < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-r' flag");
+                exit(1);
+            }
+            break;
+        case 'R': {
+            if (!fromString(optarg, randomSeed)) {
+                usage(argv[0], "Argument to '-R' flag must be an integer");
+                exit(1);
+            }
+            corpus_gen_prop.seed(randomSeed);
+            break;
+        }
+        case 's':
+            g_signatureFiles.push_back(optarg);
+            break;
+        case 'S': {
+            vector<unsigned> nums;
+            if (!strToList(optarg, nums) || nums.size() != 2 ||
+                nums[0] > nums[1]) {
+                usage(argv[0], "Suffix '-S' argument takes a list of two"
+                               " integers: MIN,MAX");
+                exit(1);
+            }
+            corpus_gen_prop.suffixRange = min_max(nums[0], nums[1]);
+            break;
+        }
+        case 't':
+            if (colliderMode != MODE_BLOCK) {
+                usage(argv[0], "You can only use one mode at a time!");
+                exit(1);
+            }
+            colliderMode = MODE_STREAMING;
+            if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-t' flag");
+                exit(1);
+            }
+            break;
+        case 'T':
+            if (!fromString(optarg, numThreads) || numThreads < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-T' flag");
+                exit(1);
+            }
+            break;
+        case 'v':
+            // A second -v additionally turns on echoing of matches.
+            if (g_verbose) {
+                echo_matches = true;
+            }
+            g_verbose = true;
+            break;
+        case 'V':
+            if (colliderMode != MODE_BLOCK) {
+                usage(argv[0], "You can only use one mode at a time!");
+                exit(1);
+            }
+            colliderMode = MODE_VECTORED;
+            // Vectored mode reuses g_streamBlocks for its block count.
+            if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-V' flag");
+                exit(1);
+            }
+            break;
+        case 'w':
+            saveCorpora = true;
+            saveCorporaFile = optarg;
+            break;
+        case 'x':
+            *plat_out = xcompileReadMode(optarg);
+            if (!*plat_out) {
+                usage(argv[0], xcompileUsage().c_str());
+                exit(1);
+            }
+            break;
+        case 'X': {
+            u32 count;
+            if (!fromString(optarg, count)) {
+                usage(argv[0], "Argument to '-X' flag must be an integer");
+                exit(1);
+            }
+            g_corpora_prefix.insert(g_corpora_prefix.end(), count, '~');
+            break;
+        }
+        case 'Y': {
+            u32 count;
+            if (!fromString(optarg, count)) {
+                usage(argv[0], "Argument to '-Y' flag must be an integer");
+                exit(1);
+            }
+            g_corpora_suffix.insert(g_corpora_suffix.end(), count, '~');
+            break;
+        }
+        case 'z':
+            if (!strToList(optarg, g_signatures)) {
+                usage(argv[0],
+                      "Argument to '-z' flag must be a list of integers");
+                exit(1);
+            }
+            break;
+        case 'Z': {
+            static constexpr unsigned ALIGN_LIMIT = MAX_MAX_UE2_ALIGN - 1;
+            if (optarg == string("R")) {
+                // Random min alignment selected.
+                use_random_alignment = true;
+                break;
+            } else if (!fromString(optarg, min_ue2_align)
+                       || min_ue2_align > ALIGN_LIMIT) {
+                usage(argv[0], "Argument must be 'R' or numeric < "
+                               xstr(MAX_MAX_UE2_ALIGN) " to '-Z'");
+                exit(1);
+            }
+            // Test exactly one alignment.
+            max_ue2_align = min_ue2_align + 1;
+            break;
+        }
+        case '8':
+            force_utf8 = true;
+            break;
+        case 1:
+            // Non-option argument: either the band size following -a or a
+            // corpus file following -c. Anything else is ignored.
+            if (in_multi) {
+                if (!fromString(optarg, multicompile_bands)) {
+                    usage(argv[0],
+                          "Argument to '-a' flag must be an integer");
+                    exit(1);
+                }
+            } else if (in_corpora) {
+                corpora->push_back(optarg);
+                // Keep collecting: further file names may follow.
+                in_corpora = 2;
+            }
+            break;
+        case 0:
+            // Long option that only set its flag variable.
+            break;
+        default:
+            usage(argv[0], "Unrecognised command line argument.");
+            exit(1);
+        }
+
+        in_multi = MAX(0, in_multi - 1);
+        in_corpora = MAX(0, in_corpora - 1);
+    }
+
+    if (g_streamOffset && !g_streamBlocks) {
+        usage(argv[0], "stream offset requires streams");
+        exit(1);
+    }
+
+    if (g_exprPath.empty() && !g_signatureFiles.empty()) {
+        /* attempt to infer an expression directory */
+        for (const auto &fname : g_signatureFiles) {
+            string exprPath = inferExpressionPath(fname);
+            if (!g_exprPath.empty() && exprPath != g_exprPath) {
+                usage(argv[0], "Only one expression path is allowed.");
+                // Fatal like every other usage error; previously execution
+                // carried on with the last inferred path.
+                exit(1);
+            }
+            g_exprPath.assign(exprPath);
+        }
+    }
+
+    // Must have a valid expression path
+    if (g_exprPath.empty()) {
+        usage(argv[0], "Must specify an expression path with the -e option.");
+        exit(1);
+    }
+
+    // If we've been handed an expr file and no restrictions, use 'em all!
+    if (!isDir(g_exprPath) && isFile(g_exprPath) && g_signatureFiles.empty()
+        && g_signatures.empty()) {
+        g_allSignatures = true;
+    }
+
+    // Must have a valid signature file
+    if (g_signatureFiles.empty() && g_signatures.empty() && !g_allSignatures) {
+        usage(argv[0], "Must specify a signature file with the -s option.");
+        exit(1);
+    }
+
+    // Cannot ask for both loading and saving
+    if (loadDatabases && saveDatabases) {
+        usage(argv[0], "You cannot both load and save databases.");
+        exit(1);
+    }
+
+    // Cannot ask for cross-compile and loading
+    if (loadDatabases && *plat_out) {
+        usage(argv[0], "You cannot both load and xcompile of databases.");
+        exit(1);
+    }
+
+    // need at least two pattern engines active
+    if (nfaFlag + pcreFlag + ue2Flag < 2) {
+        usage(argv[0], "At least two pattern engines should be active.");
+        exit(1);
+    }
+
+    if (copyStream && !g_streamBlocks) {
+        usage(argv[0], "Copying streams only makes sense in streaming mode.");
+        exit(1);
+    }
+    if (compressFlag && compressResetFlag) {
+        usage(argv[0],
+              "Only use one of --compress-expand and --compress-reset-expand.");
+        exit(1);
+    }
+
+    // set booleans appropriately
+    use_NFA = (bool) nfaFlag;
+    use_PCRE = (bool) pcreFlag;
+    use_UE2 = (bool) ue2Flag;
+    use_copy_scratch = (bool) copyScratch;
+    use_copy_stream = (bool) copyStream;
+    use_mangle_scratch = (bool) mangleScratch;
+    use_compress_expand = (bool)compressFlag;
+    use_compress_reset_expand = (bool)compressResetFlag;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ARGS_H
+#define ARGS_H
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace ue2 {
+struct Grey;
+}
+struct hs_platform_info;
+class CorpusProperties;
+
+/**
+ * Parse and validate the collider's command line.
+ *
+ * Invalid option combinations are reported through usage(), in most cases
+ * followed by exit(1). On success the use_* globals are set from the parsed
+ * engine/feature flags.
+ *
+ * NOTE(review): corpus_gen_prop, corpora and grey look like outputs filled in
+ * from the parsed options -- confirm against the implementation. plat_out is
+ * tested for a non-null platform when rejecting "load" combined with
+ * cross-compilation.
+ */
+void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
+ std::vector<std::string> *corpora, ue2::Grey *grey,
+ std::unique_ptr<hs_platform_info> *plat_out);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+// Scan mode selected for the run. printMode() reports these as "Block",
+// "Streaming-<blocks>" and "Vectored-<blocks>" respectively.
+enum ColliderMode {
+ MODE_BLOCK,
+ MODE_STREAMING,
+ MODE_VECTORED
+};
+
+// Run-wide settings shared between the collider's translation units. These are
+// declarations only; the objects themselves are defined in the .cpp files.
+extern unsigned numThreads;
+extern enum ColliderMode colliderMode;
+extern unsigned int somFlags;
+extern bool loadDatabases;
+extern bool saveDatabases;
+extern bool saveCorpora;
+extern std::string saveCorporaFile;
+extern std::string serializePath;
+extern bool echo_matches;
+extern int g_quiet;
+extern bool g_verbose;
+extern std::string g_exprPath;
+extern std::vector<std::string> g_signatureFiles;
+extern bool g_allSignatures;
+extern bool g_ue2CompileAll;
+extern unsigned g_streamBlocks;
+extern unsigned long long g_streamOffset;
+extern std::string g_corpora_prefix;
+extern std::string g_corpora_suffix;
+extern unsigned multicompile_bands;
+extern std::string g_corporaFile;
+extern std::vector<unsigned> g_signatures;
+extern unsigned long int g_matchLimit;
+extern unsigned long int g_matchLimitRecursion;
+extern unsigned min_ue2_align;
+extern unsigned max_ue2_align;
+extern size_t g_memoryLimit;
+extern bool force_utf8;
+extern int force_prefilter;
+extern unsigned somPrecisionMode;
+extern unsigned limit_matches;
+extern unsigned randomSeed;
+extern bool use_random_alignment;
+extern bool use_PCRE;
+extern bool use_NFA;
+extern bool use_UE2;
+extern bool use_copy_scratch;
+extern bool use_copy_stream;
+extern bool use_mangle_scratch;
+extern bool use_compress_expand;
+extern bool use_compress_reset_expand;
+extern int abort_on_failure;
+extern int no_signal_handler;
+extern bool force_edit_distance;
+extern unsigned edit_distance;
+
+// Constants
+// Initial values of g_matchLimit and g_matchLimitRecursion respectively.
+static const unsigned long int DEFAULT_PCRE_MATCH_LIMIT = 10*1000*1000;
+static const unsigned long int DEFAULT_PCRE_MATCH_RECURSION_LIMIT = 10000;
+// Initial value of max_ue2_align.
+#define MAX_MAX_UE2_ALIGN 64
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "limit.h"
+
+#include <cstdlib>
+
+#if defined(HAVE_SETRLIMIT)
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <sys/resource.h>
+
+/**
+ * Cap the process's data segment (RLIMIT_DATA) and address space (RLIMIT_AS)
+ * at the given number of megabytes. Both the soft and the hard limit are set
+ * to the same value. A failing setrlimit() call is reported on stderr and
+ * otherwise ignored.
+ */
+void setMemoryLimit(size_t mbytes) {
+    const size_t limit_bytes = mbytes * 1024 * 1024;
+
+    struct rlimit lim;
+    lim.rlim_cur = limit_bytes;
+    lim.rlim_max = limit_bytes;
+
+    if (setrlimit(RLIMIT_DATA, &lim) != 0) {
+        std::cerr << "setrlimit(RLIMIT_DATA, ...) failed: "
+                  << strerror(errno) << std::endl;
+    }
+
+    if (setrlimit(RLIMIT_AS, &lim) != 0) {
+        std::cerr << "setrlimit(RLIMIT_AS, ...) failed: "
+                  << strerror(errno) << std::endl;
+    }
+}
+#else // no setrlimit
+// Fallback used when HAVE_SETRLIMIT is not defined: the request is ignored.
+void setMemoryLimit(size_t) {}
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LIMIT_H
+#define LIMIT_H
+
+#include <cstddef>
+
+/**
+ * Restrict the memory available to the process to 'mbytes' megabytes.
+ *
+ * Where setrlimit() is available this sets both RLIMIT_DATA and RLIMIT_AS;
+ * on other platforms the call does nothing.
+ */
+void setMemoryLimit(size_t mbytes);
+
+#endif // LIMIT_H
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "BoundedQueue.h"
+#include "DatabaseProxy.h"
+#include "FileCorpora.h"
+#include "GraphTruth.h"
+#include "GroundTruth.h"
+#include "NfaGeneratedCorpora.h"
+#include "Thread.h"
+#include "UltimateTruth.h"
+#include "args.h"
+#include "common.h"
+#include "cross_compile.h"
+#include "expressions.h"
+#include "limit.h"
+#include "ng_corpus_properties.h"
+#include "sig.h"
+#include "simple_timer.h"
+#include "util/expression_path.h"
+#include "util/string_util.h"
+
+#include "grey.h"
+#include "hs.h"
+#include "parser/utf8_validate.h"
+#include "ue2common.h"
+#include "util/container.h"
+#include "util/make_unique.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <errno.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+using namespace std;
+using namespace ue2;
+
+// Run-wide settings and their built-in defaults. setDefaults() later replaces
+// randomSeed and numThreads (and, in ASan builds, g_memoryLimit and
+// no_signal_handler); the use_* flags are assigned at the end of
+// processArgs().
+unsigned int numThreads = 1;
+unsigned int numScannerThreads = 1;
+unsigned int numGeneratorThreads = 1;
+enum ColliderMode colliderMode = MODE_BLOCK;
+bool echo_matches = false;
+int g_quiet = 0;
+bool g_verbose = false;
+bool g_allSignatures = false;
+string g_exprPath;
+vector<string> g_signatureFiles;
+string g_cmdline;
+bool g_ue2CompileAll = false;
+unsigned g_streamBlocks = 0;
+unsigned long long g_streamOffset = 0;
+unsigned multicompile_bands = 0;
+vector<unsigned> g_signatures;
+unsigned long int g_matchLimit = DEFAULT_PCRE_MATCH_LIMIT;
+unsigned long int g_matchLimitRecursion = DEFAULT_PCRE_MATCH_RECURSION_LIMIT;
+string g_corpora_prefix;
+string g_corpora_suffix;
+size_t g_memoryLimit = 1000; // megabytes per thread
+unsigned int somFlags = 0;
+bool loadDatabases = false;
+bool saveDatabases = false;
+bool saveCorpora = false;
+string saveCorporaFile;
+string serializePath;
+bool force_utf8 = false;
+int force_prefilter = 0;
+int no_groups = 0;
+unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
+unsigned limit_matches = 0;
+unsigned randomSeed = 0;
+bool use_random_alignment = false;
+bool use_PCRE = true;
+bool use_NFA = true;
+bool use_UE2 = true;
+bool use_copy_scratch = false;
+bool use_copy_stream = false;
+bool use_mangle_scratch = false;
+bool use_compress_expand = false;
+bool use_compress_reset_expand = false;
+int abort_on_failure = 0;
+int no_signal_handler = 0;
+size_t max_scan_queue_len = 25000;
+size_t max_generator_queue_len = 25000;
+bool force_edit_distance = false;
+unsigned edit_distance = 0;
+CorpusProperties corpus_gen_prop;
+
+// Semi constants
+// [min_ue2_align, max_ue2_align) is the range of alignments that the result
+// checking and difference printing loops iterate over.
+unsigned min_ue2_align = 0;
+unsigned max_ue2_align = MAX_MAX_UE2_ALIGN;
+
+// When defined, duplicate UE2 matches (ResultSet::dupe_matches) are treated as
+// a test failure and listed in the difference report.
+#define DEDUPE_MATCHES
+
+// Number of hardware threads reported by the standard library, or 1 when the
+// library cannot determine it (hardware_concurrency() returned 0).
+static
+unsigned countCores() {
+    const unsigned detected = std::thread::hardware_concurrency();
+    if (detected == 0) {
+        return 1;
+    }
+    return detected;
+}
+
+// Detect the Address Sanitizer with either GCC or Clang.
+// BUILT_WITH_ASAN is consulted by setDefaults() to switch off the memory limit
+// and the signal handler by default.
+#if defined(__SANITIZE_ADDRESS__)
+# define BUILT_WITH_ASAN
+#elif defined(__has_feature)
+# if __has_feature(address_sanitizer)
+# define BUILT_WITH_ASAN
+# endif
+#endif
+
+// Establish the run-time defaults; any of them may later be overridden from
+// the command line.
+static
+void setDefaults() {
+    // Generators and scanners run side by side, so oversubscribe the cores.
+    numThreads = countCores() * 2;
+    // Corpus generation is seeded from the wall clock.
+    randomSeed = time(nullptr);
+
+#ifdef BUILT_WITH_ASAN
+    cout << "NOTE: Built with AddressSanitizer.\n"
+            "Defaulting to no memory limit and no signal handler.\n"
+         << endl;
+    no_signal_handler = 1;
+    g_memoryLimit = 0;
+#endif
+}
+
+// Terminate the process after echoing the failing command line. Ends with
+// abort() when abort_on_failure is set and with exit(1) otherwise.
+static
+void exit_with_fail(void) {
+    cout << "Failing cmdline was:\n " << g_cmdline << endl;
+    if (!abort_on_failure) {
+        exit(1);
+    }
+    cout << "Calling abort()" << endl;
+    abort();
+}
+
+namespace /* anonymous */ {
+
+// For saving corpora out if the -w flag is specified. Note that we need a
+// mutex to serialise writes from different threads.
+class CorpusWriter {
+public:
+    // Opens 'filename' for writing, discarding any existing contents.
+    explicit CorpusWriter(const string &filename)
+        : stream(filename.c_str(), ios_base::trunc) {}
+
+    // Appends 'str' and flushes; the lock keeps concurrent writers from
+    // interleaving their output.
+    void write(const string &str) {
+        std::lock_guard<std::mutex> guard(write_lock);
+        stream << str;
+        stream.flush();
+    }
+
+private:
+    ofstream stream;
+    std::mutex write_lock;
+};
+
+// Writer used for saving corpora; null until one is created.
+unique_ptr<CorpusWriter> corporaOut = nullptr;
+
+// Counters and id sets that make up the report printed after a test run.
+struct TestSummary {
+    // Totals
+    unsigned totalCorpora = 0;
+    unsigned totalExpressions = 0;
+    // Corpus generation failures
+    unsigned failCorpora = 0;
+    // Compile failures, per engine
+    unsigned failPcreCompile = 0;
+    unsigned failNGCompile = 0;
+    unsigned failUe2Compile = 0;
+    unsigned failCompileDifference = 0; // failed in pcre but not ue2
+    // Scan failures, per engine
+    unsigned failPcreScan = 0;
+    unsigned failNGScan = 0;
+    unsigned failUe2Scan = 0;
+    // Result differences and missing ground truth
+    unsigned failDiff = 0;
+    unsigned failNoGroundTruth = 0;
+    set<unsigned> failIds;
+    set<unsigned> nogtIds;
+
+    // True once any difference or failing id has been recorded.
+    bool hasFailure() const {
+        const bool clean = failDiff == 0 && failIds.empty() &&
+                           failCompileDifference == 0;
+        return !clean;
+    }
+
+    // Accumulates the counters and id sets of 'a' into this summary.
+    void merge(const TestSummary &a) {
+        failIds.insert(a.failIds.begin(), a.failIds.end());
+        nogtIds.insert(a.nogtIds.begin(), a.nogtIds.end());
+
+        totalCorpora += a.totalCorpora;
+        totalExpressions += a.totalExpressions;
+        failCorpora += a.failCorpora;
+
+        failPcreCompile += a.failPcreCompile;
+        failNGCompile += a.failNGCompile;
+        failUe2Compile += a.failUe2Compile;
+        failCompileDifference += a.failCompileDifference;
+
+        failPcreScan += a.failPcreScan;
+        failNGScan += a.failNGScan;
+        failUe2Scan += a.failUe2Scan;
+
+        failDiff += a.failDiff;
+        failNoGroundTruth += a.failNoGroundTruth;
+    }
+};
+
+// Outcome recorded on a TestUnit; a freshly constructed unit starts out as
+// TEST_NO_GROUND_TRUTH.
+enum TestResult {
+ TEST_NO_GROUND_TRUTH,
+ TEST_PASSED,
+ TEST_SKIPPED,
+ TEST_FAILED_COMPILE,
+ TEST_FAILED
+};
+
+// One unit of work: a single expression id paired with a single corpus,
+// together with the compiled forms of the expression for each engine and the
+// flags that select how the results are checked.
+struct TestUnit {
+ shared_ptr<CompiledPcre> pcre; // libpcre bytecode
+ shared_ptr<CNGInfo> cngi; // NFA graph info (compilation is deferred)
+ shared_ptr<DatabaseProxy> ue2; // ue2 bytecode
+ Corpus corpus; // a local copy, as we may modify it
+
+ unsigned id; // expression id
+ unsigned corpus_id; // corpus id
+ bool highlander; // single match flag
+ bool prefilter; // prefilter flag
+ bool som; // start of match flag
+ bool multi; // if false, we're in single mode.
+ bool utf8; // at least one of our patterns is utf8
+
+ // Outcome of the test; initialised to TEST_NO_GROUND_TRUTH.
+ enum TestResult result;
+
+ // Note that the flag arguments arrive in a different order (multi, utf8,
+ // highlander, prefilter, som) from the order the members are declared in.
+ TestUnit(unsigned sig_id, unsigned c_id, const Corpus &c,
+ shared_ptr<CompiledPcre> pcre_in, shared_ptr<CNGInfo> cngi_in,
+ shared_ptr<DatabaseProxy> ue2_in, bool multi_in, bool utf8_in,
+ bool highlander_in, bool prefilter_in, bool som_in)
+ : pcre(pcre_in), cngi(cngi_in), ue2(ue2_in), corpus(c), id(sig_id),
+ corpus_id(c_id), highlander(highlander_in), prefilter(prefilter_in),
+ som(som_in), multi(multi_in), utf8(utf8_in),
+ result(TEST_NO_GROUND_TRUTH) {}
+};
+
+} // namespace
+
+// Stream a match set as "(from,to), (from,to), ..." with no trailing
+// separator; an empty set prints nothing.
+static
+std::ostream &operator<<(std::ostream &os, const set<MatchResult> &v) {
+    bool first = true;
+    for (const auto &m : v) {
+        if (!first) {
+            os << ", ";
+        }
+        first = false;
+        // match offsets
+        os << '(' << m.from << ',' << m.to << ')';
+    }
+    return os;
+}
+
+// Print the corpus payload, i.e. the corpus data with the lengths of the
+// configured corpora prefix and suffix cut off, limited to 1000 characters.
+static
+void printCorpus(ostream &out, const Corpus &corpus) {
+    const auto payload_begin = corpus.data.begin() + g_corpora_prefix.size();
+    const auto payload_end = corpus.data.end() - g_corpora_suffix.size();
+    string payload(payload_begin, payload_end);
+
+    // Long corpora are cut short and marked with a trailing " ...".
+    const bool trimmed = payload.size() > 1000;
+    if (trimmed) {
+        payload.resize(1000);
+    }
+
+    out << " Corpus data: '" << printable(payload) << "'";
+    if (trimmed) {
+        out << " ...";
+    }
+    out << "\n";
+}
+
+// Report a disagreement between the two ground-truth engines (PCRE and the
+// NFA graph): the expression, the corpus, both match sets and the matches
+// unique to each side.
+static
+void printGroundTruthDifference(ostream &out, const ExpressionMap &exprMap,
+                                const TestUnit &unit,
+                                const ResultSet &pcre_results,
+                                const ResultSet &ngw_results) {
+    assert(contains(exprMap, unit.id));
+
+    const auto &pcre_matches = pcre_results.matches;
+    const auto &nfa_matches = ngw_results.matches;
+
+    // Expression, corpus and the raw match sets
+    out << " Expression: '" << exprMap.at(unit.id) << "'\n";
+    printCorpus(out, unit.corpus);
+    out << " PCRE matches: " << pcre_matches << "\n";
+    out << " NFA matches: " << nfa_matches << "\n";
+
+    // Matches seen by PCRE but not by the NFA graph
+    vector<MatchResult> pcre_only;
+    set_difference(pcre_matches.begin(), pcre_matches.end(),
+                   nfa_matches.begin(), nfa_matches.end(),
+                   back_inserter(pcre_only));
+    for (const auto &m : pcre_only) {
+        out << " PCRE only: match (" << m.from << "," << m.to << ")\n";
+    }
+
+    // Matches seen by the NFA graph but not by PCRE
+    vector<MatchResult> nfa_only;
+    set_difference(nfa_matches.begin(), nfa_matches.end(),
+                   pcre_matches.begin(), pcre_matches.end(),
+                   back_inserter(nfa_only));
+    for (const auto &m : nfa_only) {
+        out << " NFA only: match (" << m.from << "," << m.to << ")\n";
+    }
+
+    out.flush();
+}
+
+// Report the difference information when a pattern causes different matches in
+// our engines.
+//
+// Prints the expression, the corpus and the ground-truth matches, then one
+// section per failing alignment. Consecutive failing alignments whose UE2
+// result sets compare equal are collapsed into a single "a - b" section.
+//
+//   gt_results   ground-truth result set (its 'src' names the engine)
+//   ue2_results  UE2 results, indexed by alignment
+//   pass         per-alignment pass flags; passing alignments are skipped
+static
+void printDifference(ostream &out, const ExpressionMap &exprMap,
+                     const TestUnit &unit, const ResultSet &gt_results,
+                     const vector<ResultSet> &ue2_results,
+                     const vector<bool> &pass) {
+    assert(contains(exprMap, unit.id));
+    // Print the expression itself
+    out << " Expression: '" << exprMap.at(unit.id) << "'\n";
+    printCorpus(out, unit.corpus);
+    out << " " << gt_results.src << " matches: " << gt_results.matches << endl;
+
+    for (u32 align = min_ue2_align; align < max_ue2_align; align++) {
+        if (pass[align]) {
+            continue;
+        }
+
+        // Extend the range over following alignments that failed with the
+        // same UE2 results, so they are reported only once.
+        u32 align_in = align;
+        out << " UE2 (" << align;
+        while (align + 1 < max_ue2_align) {
+            if (pass[align + 1] ||
+                ue2_results[align] != ue2_results[align + 1]) {
+                break;
+            }
+            align++;
+        }
+
+        if (align != align_in) {
+            out << " - " << align;
+        }
+
+        out << ") matches: " << ue2_results[align].matches;
+        out << endl;
+
+        vector<MatchResult> only;
+
+        // Print matches only returned by ground truth
+        set_difference(gt_results.matches.begin(),
+                       gt_results.matches.end(),
+                       ue2_results[align].matches.begin(),
+                       ue2_results[align].matches.end(),
+                       back_inserter(only));
+        for (const auto &match : only) {
+            out << " " << gt_results.src << " only: match ("
+                << match.from << "," << match.to << ')' << endl;
+        }
+
+        // Print matches only returned by UE2
+        only.clear();
+
+        set_difference(ue2_results[align].matches.begin(),
+                       ue2_results[align].matches.end(),
+                       gt_results.matches.begin(),
+                       gt_results.matches.end(),
+                       back_inserter(only));
+
+        for (const auto &match : only) {
+            out << " UE2 only: match (" << match.from << "," << match.to << ')'
+                << endl;
+        }
+
+#ifdef DEDUPE_MATCHES
+        for (const auto &match : ue2_results[align].dupe_matches) {
+            out << " UE2 dupe: match (" << match.from << "," << match.to
+                << ')' << endl;
+        }
+#endif
+
+        // Flag the error conditions recorded while scanning.
+        if (ue2_results[align].uoom) {
+            out << " *** UE2 produced matches out of order" << endl;
+        }
+        if (ue2_results[align].match_after_halt) {
+            out << " *** UE2 produced matches after termination" << endl;
+        }
+        if (ue2_results[align].invalid_id) {
+            out << " *** UE2 produced matches for invalid ids" << endl;
+        }
+    }
+}
+
+// Print a one-line description of the active mode to stdout:
+// "<compile mode>/<scan mode>" followed by bracketed tags for the optional
+// stream/scratch features that are switched on.
+static
+void printMode(void) {
+    // Compile mode
+    if (g_ue2CompileAll) {
+        if (multicompile_bands) {
+            cout << "Multi-" << multicompile_bands << "/";
+        } else {
+            cout << "Multi/";
+        }
+    } else {
+        cout << "Single/";
+    }
+
+    // Scan mode
+    switch (colliderMode) {
+    case MODE_BLOCK:
+        cout << "Block";
+        break;
+    case MODE_VECTORED:
+        cout << "Vectored-" << g_streamBlocks;
+        break;
+    case MODE_STREAMING:
+        cout << "Streaming-" << g_streamBlocks;
+        if (g_streamOffset) {
+            cout << " offset " << g_streamOffset;
+        }
+        if (use_copy_stream) {
+            cout << " [copy stream]";
+        }
+        if (use_compress_expand) {
+            cout << " [compress]";
+        }
+        if (use_compress_reset_expand) {
+            cout << " [compress+reset]";
+        }
+        break;
+    }
+
+    // Scratch handling applies to every scan mode
+    if (use_copy_scratch) {
+        cout << " [copy scratch]";
+    }
+    if (use_mangle_scratch) {
+        cout << " [mangle]";
+    }
+    cout << endl;
+}
+
+// Verbose, multi-line form of the end-of-run summary.
+static
+void printSummaryV(const TestSummary &sum) {
+    cout << endl
+         << "Summary:" << endl
+         << "Mode: ";
+    printMode();
+
+    cout << "=========" << endl
+         << "Expressions processed: " << sum.totalExpressions << endl
+         << "Corpora processed: " << sum.totalCorpora << endl
+         << "Expressions with failures: " << sum.failIds.size() << endl
+         << " Corpora generation failures: " << sum.failCorpora << endl;
+
+    // Per-engine compile failures
+    cout << " Compilation failures: "
+         << "pcre:" << sum.failPcreCompile << ", "
+         << "ng:" << sum.failNGCompile << ", "
+         << "ue2:" << sum.failUe2Compile << endl;
+
+    // Per-engine scan failures
+    cout << " Matching failures: "
+         << "pcre:" << sum.failPcreScan << ", "
+         << "ng:" << sum.failNGScan << ", "
+         << "ue2:" << sum.failUe2Scan << endl;
+
+    cout << " Match differences: " << sum.failIds.size() << endl
+         << " No ground truth: " << sum.nogtIds.size() << endl
+         << "Total match differences: " << sum.failDiff << endl;
+}
+
+// Compact form of the end-of-run summary, used when output is quietened.
+static
+void printSummaryQ(const TestSummary &sum) {
+    cout << "Summary: ";
+    printMode();
+
+    cout << "Processed: " << sum.totalExpressions << " expressions, "
+         << sum.totalCorpora << " corpora" << endl;
+
+    // Failures: corpora; compile per engine; match per engine
+    cout << "Failures: " << sum.failIds.size()
+         << " (corpora: " << sum.failCorpora << "; compile: "
+         << "pcre:" << sum.failPcreCompile << ", "
+         << "ng:" << sum.failNGCompile << ", "
+         << "ue2:" << sum.failUe2Compile << "; match: "
+         << "pcre:" << sum.failPcreScan << ", "
+         << "ng:" << sum.failNGScan << ", "
+         << "ue2:" << sum.failUe2Scan << ")" << endl;
+
+    cout << "Differences: " << sum.failIds.size() << " expressions, "
+         << sum.failDiff << " total" << endl;
+    cout << "No ground truth: " << sum.nogtIds.size() << " expressions" << endl;
+}
+
+// Print the summary, choosing the compact form when g_quiet is above 1 and the
+// verbose form otherwise.
+static
+void printSummary(const TestSummary &sum) {
+    if (g_quiet <= 1) {
+        printSummaryV(sum);
+        return;
+    }
+    printSummaryQ(sum);
+}
+
+// Highlander (single-match) check; returns true on success.
+//
+// UE2 has to report exactly one match, and that match must be one of those
+// found by PCRE/GraphTruth (not necessarily the earliest). If the ground truth
+// has no matches, UE2 must have none either.
+static
+bool checkSingleMatch(const ResultSet &ground_truth, const ResultSet &ue2) {
+    if (ground_truth.matches.empty()) {
+        return ue2.matches.empty();
+    }
+    if (ue2.matches.size() != 1) {
+        return false;
+    }
+    return contains(ground_truth.matches, *ue2.matches.begin());
+}
+
+// Prefilter-mode check; returns true on success.
+static
+bool checkPrefilterMatch(const ResultSet &ground_truth, const ResultSet &ue2,
+                         bool highlander) {
+    if (highlander) {
+        // Highlander + prefilter is tricky: when PCRE matched, UE2 has to
+        // produce exactly one match, but it may be an earlier one that PCRE
+        // did not return. With no PCRE matches nothing can be verified.
+        return ground_truth.matches.empty() || ue2.matches.size() == 1;
+    }
+
+    // Once the match limit has been reached, prefilter results are too hard
+    // to verify, so we only accept that "something happened".
+    const bool hit_limit =
+        limit_matches && ue2.matches.size() >= limit_matches;
+    if (hit_limit) {
+        return true;
+    }
+
+    // Otherwise every PCRE match must also be a UE2 match; UE2 is allowed to
+    // report a superset.
+    return std::includes(ue2.matches.begin(), ue2.matches.end(),
+                         ground_truth.matches.begin(),
+                         ground_truth.matches.end());
+}
+
+// Build a copy of 'rs' (same source tag) in which every match keeps its end
+// offset but has its start offset replaced by 0.
+static
+ResultSet makeEndOfMatchOnly(const ResultSet &rs) {
+    ResultSet eom(rs.src);
+    for (const auto &m : rs.matches) {
+        eom.addMatch(0, m.to);
+    }
+    return eom;
+}
+
+// Multi-match (non-Highlander, non-prefilter) check; returns true on success.
+static
+bool checkMultiMatch(const ResultSet &ground_truth, const ResultSet &ue2) {
+    const auto &gt_matches = ground_truth.matches;
+    const auto &ue2_matches = ue2.matches;
+
+    // Out-of-order matches, matches after termination and matches for
+    // invalid ids are always bugs.
+    const bool scan_error = ue2.uoom || ue2.match_after_halt || ue2.invalid_id;
+    if (scan_error) {
+        return false;
+    }
+
+    // UE2 must respect the match limit ...
+    if (limit_matches && ue2_matches.size() > limit_matches) {
+        return false;
+    }
+
+    // ... and can never legitimately report more matches than PCRE.
+    if (ue2_matches.size() > gt_matches.size()) {
+        return false;
+    }
+
+    // Below the limit (or with no limit at all) the result sets have to be
+    // identical.
+    const bool limit_reached =
+        limit_matches && gt_matches.size() >= limit_matches;
+    if (!limit_reached) {
+        return ground_truth == ue2;
+    }
+
+    // The limit has been hit: every UE2 match must be a ground-truth match.
+    // (Trimming the ground truth and testing for equality would not work, as
+    // UE2 may deliver matches slightly out of order.)
+    const bool streamed =
+        colliderMode == MODE_STREAMING || colliderMode == MODE_VECTORED;
+    if (!streamed) {
+        return std::includes(gt_matches.begin(), gt_matches.end(),
+                             ue2_matches.begin(), ue2_matches.end());
+    }
+
+    // In streaming and vectored modes the limit can leave us with a SOM that
+    // is not the leftmost one, so only end offsets are compared.
+    ResultSet gt_eom = makeEndOfMatchOnly(ground_truth);
+    ResultSet ue2_eom = makeEndOfMatchOnly(ue2);
+    return std::includes(gt_eom.matches.begin(), gt_eom.matches.end(),
+                         ue2_eom.matches.begin(), ue2_eom.matches.end());
+}
+
+// Check the UE2 results for every alignment against the ground truth.
+// Returns true if any alignment failed.
+//
+// The check applied depends on the unit's flags: prefilter takes precedence,
+// then highlander, otherwise the full multi-match comparison. With
+// DEDUPE_MATCHES defined, duplicate UE2 matches also count as a failure.
+// summary.failDiff is incremented once per failing alignment. Per-alignment
+// PASSED/FAILED lines are written to 'out' in verbose mode; otherwise a single
+// FAILED line listing the failing alignment ranges is written. Any failure is
+// followed by the detailed difference report.
+static
+bool checkTestResults(ostream &out, TestSummary &summary,
+                      const ExpressionMap &exprMap, TestUnit &unit,
+                      const ResultSet &gt_results,
+                      const vector<ResultSet> &ue2_results) {
+    bool failed = false;
+    bool any_fail = false;
+    vector<bool> pass(max_ue2_align, false);
+
+    for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) {
+        if (unit.prefilter) {
+            failed = !checkPrefilterMatch(gt_results, ue2_results[align],
+                                          unit.highlander);
+        } else if (unit.highlander) {
+            failed = !checkSingleMatch(gt_results, ue2_results[align]);
+        } else {
+            // In non-Highlander mode, the two result sets MUST be equal
+            // don't check PCRE if the scan didn't succeed
+            failed = !checkMultiMatch(gt_results, ue2_results[align]);
+        }
+
+#ifdef DEDUPE_MATCHES
+        if (!failed) {
+            failed = !ue2_results[align].dupe_matches.empty();
+        }
+#endif
+
+        pass[align] = !failed;
+
+        any_fail |= failed;
+
+        summary.failDiff += failed ? 1 : 0;
+
+        if (g_verbose) {
+            if (failed) {
+                out << "FAILED: id " << unit.id << ", alignment " << align
+                    << ", corpus " << unit.corpus_id << ", results differ"
+                    << endl;
+            } else {
+                out << "PASSED: id " << unit.id << ", alignment " << align
+                    << ", corpus " << unit.corpus_id
+                    << " (matched "<< gt_results.src << ":"
+                    << gt_results.matches.size()
+                    << ", ue2:" << ue2_results[align].matches.size() << ")"
+                    << endl;
+            }
+        }
+    }
+
+    if (!any_fail) {
+        return false;
+    }
+
+    if (!g_verbose) {
+        out << "FAILED: id " << unit.id << ", alignment";
+        for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) {
+            if (!pass[align]) {
+                out << " " << align;
+
+                // Collapse a run of consecutive failing alignments into
+                // "first-last"; 'align' is advanced to the end of the run.
+                if (align + 1 < max_ue2_align && !pass[align + 1]) {
+                    while (align + 1 < max_ue2_align && !pass[align + 1]) {
+                        align++;
+                    }
+
+                    out << "-" << align;
+                }
+            }
+        }
+
+        out << ", corpus " << unit.corpus_id << ", results differ" << endl;
+    }
+    printDifference(out, exprMap, unit, gt_results, ue2_results, pass);
+
+    return true;
+}
+
+// Build the UE2 database proxy for the given expression ids, loading it from
+// or saving it to disk when the corresponding global option is set. Returns
+// null when loading or compilation fails.
+static
+shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
+                                            const UltimateTruth &ultimate) {
+    assert(!ids.empty());
+
+    // Load mode: wrap the database read from disk, no compilation involved.
+    if (loadDatabases) {
+        const string db_path = ultimate.dbFilename(ids);
+        shared_ptr<HyperscanDB> loaded = ultimate.loadDatabase(db_path, ids);
+        if (loaded) {
+            return make_shared<DatabaseProxy>(loaded);
+        }
+        if (!g_quiet) {
+            cout << "FAILED: could not load database " << db_path << endl;
+        }
+        return nullptr;
+    }
+
+    shared_ptr<DatabaseProxy> proxy = make_shared<DatabaseProxy>(ids);
+
+    try {
+        // When we cannot run the database (i.e. we're cross-compiling), at
+        // least make sure it builds.
+        if (!ultimate.runnable()) {
+            shared_ptr<HyperscanDB> built = proxy->get(ultimate);
+            assert(built); // throws otherwise
+        }
+
+        // Compile and write out the database if asked to.
+        if (saveDatabases) {
+            const string db_path = ultimate.dbFilename(ids);
+            const bool saved =
+                ultimate.saveDatabase(*(proxy->get(ultimate)), db_path.c_str());
+            if (!saved) {
+                cout << "FAILED: could not save database to file: " << db_path
+                     << endl;
+            }
+        }
+    } catch (const CompileFailed &fail) {
+        if (!g_quiet) {
+            cout << "FAILED: ue2 compile failed for " << *ids.begin() << ": "
+                 << fail.error << endl;
+        }
+        // A null database signals the failure to the caller.
+        return nullptr;
+    }
+
+    return proxy;
+}
+
+// Obtain the NFA-graph ground truth for one test unit.
+//
+// Fills 'ngw_results' either from the match set stored with the corpus or by
+// compiling (on demand) and running the NFA graph over the corpus data.
+// Returns false, after updating 'summary' and marking the pattern bad where
+// appropriate, when the pattern is already marked bad, fails to compile or
+// fails to scan. All diagnostics are written to 'out'.
+static
+bool getGraphTruth(ostream &out, CNGInfo &cngi, GraphTruth &graph,
+                   TestUnit &unit, ResultSet &ngw_results,
+                   TestSummary &summary, const string &expression) {
+    debug_stage = STAGE_GRAPH_RUN;
+
+    // Skip patterns we've previously marked as bad.
+    if (cngi.is_bad()) {
+        summary.failNGScan++;
+        return false;
+    }
+
+    // If we already have match information for this corpus, we don't need to
+    // run the NFA graph at all. At the moment our on-disk format for corpora
+    // with match information only includes the end-of-match offset, so we only
+    // use these in non-som modes. If edit distance is forced, all bets are off
+    // so we ignore this as well.
+    if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cngi.som &&
+        !force_edit_distance) {
+        if (g_verbose) {
+            out << "Using corpus match set rather than NFA graph" << endl;
+        }
+        ngw_results = ResultSet(unit.corpus.matches, RESULT_FROM_GRAPH);
+    } else {
+        // compile the actual graph
+        const CompiledNG *cng = nullptr;
+        try {
+            debug_stage = STAGE_GRAPH_COMPILE;
+            cng = cngi.get();
+            debug_stage = STAGE_UNDEFINED;
+        }
+        catch (const NGCompileFailure &err) {
+            debug_stage = STAGE_UNDEFINED;
+            summary.failNGCompile++;
+            summary.failNGScan++;
+            cngi.mark_bad();
+            if (!g_quiet) {
+                // Report through 'out' like every other message here, so the
+                // text stays with the rest of this unit's output.
+                out << "FAILED: id " << unit.id
+                    << ", NFA graph compile failed (" << err.msg << ")"
+                    << endl;
+            }
+            return false;
+        }
+        debug_stage = STAGE_GRAPH_RUN;
+
+        // Run NFA graph and collect match information.
+        string error;
+        assert(cng);
+        if (!graph.run(unit.id, *cng, cngi, unit.corpus.data, ngw_results,
+                       error)) {
+            if (!g_quiet) {
+                out << "FAILED: id " << unit.id
+                    << ", NFA graph scan failed: " << error << "\n"
+                    << " Expression: '" << expression << "'\n"
+                    << " Corpus data: '" << printable(unit.corpus.data)
+                    << "'\n"
+                    << " (note: marking bad, skipping subsequent tests)"
+                    << endl;
+            }
+            summary.failNGScan++;
+            cngi.mark_bad();
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Obtain ground truth for one test unit from libpcre.
+//
+// Returns true with pcre_results filled in, or false after bumping
+// summary.failPcreScan. A scan failure also marks the compiled pattern bad,
+// which makes later calls for the same pattern bail out immediately.
+static
+bool getGroundTruth(ostream &out, CompiledPcre &cpcre, GroundTruth &ground,
+                    TestUnit &unit, ResultSet &pcre_results,
+                    TestSummary &summary) {
+    debug_stage = STAGE_PCRE_RUN;
+
+    // Patterns already marked as bad are not scanned again.
+    if (cpcre.is_bad()) {
+        summary.failPcreScan++;
+        return false;
+    }
+
+    // Matches stored with the corpus can stand in for a PCRE run. The on-disk
+    // corpus format only records end-of-match offsets, so this shortcut is
+    // limited to non-som modes, and it cannot be trusted when an edit
+    // distance has been forced on all patterns.
+    const bool reuse_corpus_matches = !g_streamOffset &&
+                                      unit.corpus.hasMatches && !force_utf8 &&
+                                      !cpcre.som && !force_edit_distance;
+    if (reuse_corpus_matches) {
+        if (g_verbose) {
+            out << "Using corpus match set rather than PCRE" << endl;
+        }
+        pcre_results = ResultSet(unit.corpus.matches, RESULT_FROM_PCRE);
+        return true;
+    }
+
+    // Otherwise scan with PCRE and collect the matches.
+    string error;
+    if (ground.run(unit.id, cpcre, unit.corpus.data, pcre_results, error)) {
+        return true;
+    }
+
+    if (!g_quiet) {
+        out << "FAILED: id " << unit.id
+            << ", libpcre scan failed: " << error << "\n"
+            << " Expression: '" << cpcre.expression << "'\n"
+            << " Corpus data: '" << printable(unit.corpus.data)
+            << "'\n"
+            << " (note: marking PCRE bad, skipping subsequent tests)"
+            << endl;
+    }
+    summary.failPcreScan++;
+    cpcre.mark_bad();
+    return false;
+}
+
+// Serialize one corpus and its match end offsets as a single line of the form
+// id="data": off,off,... and hand that line to the corpora output writer.
+static
+void writeCorpus(unsigned id, const Corpus &corpus, const ResultSet &results) {
+    assert(corporaOut);
+    ostringstream line;
+    line << id << "=\"" << printable(corpus.data) << "\": ";
+
+    // Only the end offset of each match is written, comma-separated.
+    bool first = true;
+    for (const auto &match : results.matches) {
+        if (!first) {
+            line << ",";
+        }
+        line << match.to;
+        first = false;
+    }
+    line << "\n";
+    corporaOut->write(line.str());
+}
+
+// Run a single test unit: obtain ground truth (PCRE first, then the NFA graph
+// if PCRE failed or was not run), scan the corpus with UE2 at every alignment
+// in [min_ue2_align, max_ue2_align) and compare the results.
+//
+// The outcome is stored in unit.result; failure counters are accumulated in
+// summary and diagnostics are written to out.
+static
+void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
+                 UltimateTruth &ultimate, TestUnit &unit, TestSummary &summary,
+                 const ExpressionMap &exprMap) {
+    assert(use_UE2);
+    Corpus &corpus = unit.corpus;
+
+    shared_ptr<const HyperscanDB> db;
+    if (use_UE2) {
+        // Acquire UE2 database.
+        debug_stage = STAGE_UE2_COMPILE;
+        try {
+            db = unit.ue2->get(ultimate);
+        } catch (const CompileFailed &fail) {
+            summary.failUe2Compile++;
+            if (!g_quiet) {
+                out << "FAILED: ue2 compile failed for " << unit.id << ": "
+                    << fail.error << endl;
+            }
+            // Record the failure whether or not we printed it: the quiet
+            // flag must only affect output, never the test result.
+            unit.result = TEST_FAILED_COMPILE;
+            debug_stage = STAGE_UNDEFINED;
+            return;
+        }
+        debug_stage = STAGE_UNDEFINED;
+
+        if (!db) {
+            // Database previously failed compilation.
+            unit.result = TEST_SKIPPED;
+            return;
+        }
+    }
+
+    // If the user has specified that they want prefix/suffix data added to
+    // their corpora, we do it here; this is as local as possible to the
+    // test, so we don't keep piles of HUGE corpora hanging around. Stored
+    // matches no longer apply to the modified data.
+    if (!g_corpora_prefix.empty()) {
+        corpus.data.insert(0, g_corpora_prefix);
+        corpus.hasMatches = false;
+    }
+    if (!g_corpora_suffix.empty()) {
+        corpus.data.append(g_corpora_suffix);
+        corpus.hasMatches = false;
+    }
+
+    ResultSet gt_results(RESULT_FROM_PCRE);
+    // One result set per alignment, indexed directly by alignment value.
+    vector<ResultSet> ue2_results(max_ue2_align, ResultSet(RESULT_FROM_UE2));
+
+    bool gt_done = false;
+
+    // run PCRE test if enabled and if compile succeeded
+    if (unit.pcre && use_PCRE) {
+        gt_done = getGroundTruth(out, *unit.pcre, ground, unit, gt_results,
+                                 summary);
+    }
+
+    // run NFA if PCRE failed (or wasn't run), or if we don't run UE2
+    if (unit.cngi && (use_NFA && !gt_done)) {
+        gt_done = getGraphTruth(out, *unit.cngi, graph, unit, gt_results,
+                                summary, exprMap.find(unit.id)->second);
+    }
+
+    // both ground truth methods either failed or didn't run
+    if (!gt_done) {
+        unit.result = TEST_NO_GROUND_TRUTH;
+        return;
+    }
+
+    // Write out corpora if we've been told to
+    if (saveCorpora) {
+        writeCorpus(unit.id, unit.corpus, gt_results);
+    }
+
+    debug_stage = STAGE_UE2_RUN;
+    for (unsigned int align = min_ue2_align; align != max_ue2_align; ++align) {
+        bool ok = ultimate.run(unit.id, db, corpus.data, !unit.multi, align,
+                               ue2_results[align]);
+
+        if (!ok) {
+            if (!g_quiet) {
+                out << "FAILED: id " << unit.id << ", ue2 scan at alignment "
+                    << align << " failed" << endl;
+            }
+            unit.result = TEST_FAILED;
+            debug_stage = STAGE_UNDEFINED;
+            return;
+        }
+    }
+
+    // if we're using UE2, check all the different results modes
+    if (checkTestResults(out, summary, exprMap, unit, gt_results,
+                         ue2_results)) {
+        unit.result = TEST_FAILED;
+    } else {
+        unit.result = TEST_PASSED;
+    }
+
+    debug_stage = STAGE_UNDEFINED;
+}
+
+/* Used for testing the graph truth against PCRE: both ground truth sources
+ * are run over the unit's corpus and their match sets are compared. */
+static
+void runGroundCompTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
+                           TestUnit &unit, TestSummary &summary,
+                           const ExpressionMap &exprMap) {
+    assert(!use_UE2);
+    assert(use_PCRE);
+    assert(use_NFA);
+    Corpus &target = unit.corpus;
+
+    // User-requested prefix/suffix data is attached here, as close to the
+    // test as possible, so that enlarged corpora are not kept around. Once
+    // the data has been modified, any stored matches are no longer valid.
+    const bool add_prefix = !g_corpora_prefix.empty();
+    const bool add_suffix = !g_corpora_suffix.empty();
+    if (add_prefix) {
+        target.data.insert(0, g_corpora_prefix);
+    }
+    if (add_suffix) {
+        target.data.append(g_corpora_suffix);
+    }
+    if (add_prefix || add_suffix) {
+        target.hasMatches = false;
+    }
+
+    ResultSet pcre_results(RESULT_FROM_PCRE);
+    ResultSet ngw_results(RESULT_FROM_GRAPH);
+
+    // Both sources are always attempted, even if the first one fails.
+    const bool pcre_ok =
+        unit.pcre && getGroundTruth(out, *unit.pcre, ground, unit,
+                                    pcre_results, summary);
+    const bool graph_ok =
+        unit.cngi && getGraphTruth(out, *unit.cngi, graph, unit, ngw_results,
+                                   summary, exprMap.find(unit.id)->second);
+
+    // Without results from both PCRE and the NFA there is nothing to compare.
+    if (!(pcre_ok && graph_ok)) {
+        unit.result = TEST_NO_GROUND_TRUTH;
+        return;
+    }
+
+    // Write out corpora if we've been told to
+    if (saveCorpora) {
+        writeCorpus(unit.id, unit.corpus, pcre_results);
+    }
+
+    const bool differ = pcre_results.matches != ngw_results.matches;
+    if (!differ) {
+        unit.result = TEST_PASSED;
+        if (g_verbose) {
+            out << "PASSED: id " << unit.id << ", corpus " << unit.corpus_id
+                << " (matched pcre:" << pcre_results.matches.size()
+                << ", matched ng:" << ngw_results.matches.size() << ")" << endl;
+        }
+    } else {
+        unit.result = TEST_FAILED;
+        out << "FAILED: id " << unit.id << ", corpus " << unit.corpus_id
+            << ", results differ" << endl;
+
+        printGroundTruthDifference(out, exprMap, unit, pcre_results,
+                                   ngw_results);
+    }
+
+    debug_stage = STAGE_UNDEFINED;
+}
+
+// Generate the corpora for expression `id` and enqueue one TestUnit per
+// corpus on the scan queue. A corpus generation failure is reported on `out`
+// and counted in summary.failCorpora; nothing is queued in that case.
+static
+void addCorporaToQueue(ostream &out, BoundedQueue<TestUnit> &testq, unsigned id,
+                       CorporaSource &corpora, TestSummary &summary,
+                       shared_ptr<CompiledPcre> cpcre, shared_ptr<CNGInfo> cngi,
+                       shared_ptr<DatabaseProxy> ue2, bool multi, bool utf8) {
+    // Ask the source for this expression's corpora.
+    vector<Corpus> generated;
+    try {
+        corpora.generate(id, generated);
+    }
+    catch (CorpusFailure &err) {
+        if (!g_quiet) {
+            out << "FAILED: id " << id << ", corpora failure: " << err.message
+                << endl;
+        }
+        summary.failCorpora++;
+        return;
+    }
+
+    // Pattern properties come from the PCRE handle when there is one and
+    // from the NFA graph info otherwise.
+    bool som;
+    bool prefilter;
+    bool highlander;
+    if (cpcre) {
+        som = cpcre->som;
+        prefilter = cpcre->prefilter;
+        highlander = cpcre->highlander;
+    } else {
+        som = cngi->som;
+        prefilter = cngi->prefilter;
+        highlander = cngi->highlander;
+    }
+
+    // In UTF-8 mode, drop corpora that are not valid UTF-8: Hyperscan's
+    // behaviour when scanning invalid UTF-8 data in UTF-8 mode is undefined.
+    if (utf8) {
+        generated.erase(
+            remove_if(generated.begin(), generated.end(),
+                      [](const Corpus &candidate) {
+                          return !isValidUtf8(candidate.data.c_str());
+                      }),
+            generated.end());
+    }
+
+    // Build all units first and push them in one call, so that the queue is
+    // not locked once per corpus.
+    vector<unique_ptr<TestUnit>> units;
+    units.reserve(generated.size());
+
+    for (size_t corpus_id = 0; corpus_id < generated.size(); ++corpus_id) {
+        const Corpus &corpus = generated[corpus_id];
+        units.push_back(ue2::make_unique<TestUnit>(id, corpus_id, corpus, cpcre,
+                                                   cngi, ue2, multi, utf8,
+                                                   highlander, prefilter, som));
+    }
+
+    testq.push(begin(units), end(units));
+}
+
+namespace /* anonymous */ {
+
+// Thread subclass that buffers its output in a private stringstream. The
+// buffer is copied to cout by flush_output(), which the destructor also calls
+// so that nothing is lost.
+class OutputThread : public Thread {
+public:
+    OutputThread(size_t id) : Thread(id) {}
+    ~OutputThread() override { flush_output(); }
+
+protected:
+    // Write any buffered text to cout and reset the buffer.
+    void flush_output() {
+        const string pending = out.str();
+        if (pending.empty()) {
+            return;
+        }
+        cout << pending;
+        out.str(""); // make empty
+    }
+
+    // Per-thread output buffer; derived classes write here and then call
+    // flush_output().
+    stringstream out;
+};
+
+// Worker that pops TestUnits from the shared queue and runs them until it
+// receives a null sentinel. Each thread owns its own truth objects and its own
+// TestSummary; the owner merges the summaries after join().
+class ScanThread : public OutputThread {
+public:
+    ScanThread(size_t id, BoundedQueue<TestUnit> &testq, const ExpressionMap &e,
+               const hs_platform_info *plat, const Grey &grey)
+        : OutputThread(id), q(testq),
+          ground(out, e, g_matchLimit, g_matchLimitRecursion), graph(out, e),
+          ultimate(out, e, plat, grey, g_streamBlocks), exprMap(e) {}
+
+    void run() override {
+        DEBUG_PRINTF("thread %zu running\n", thread_id);
+
+        // A null unit is the sentinel meaning there is no more work.
+        while (const auto unit = q.pop(thread_id)) {
+            const auto expr_it = exprMap.find(unit->id);
+            assert(expr_it != exprMap.end());
+
+            // Debug information is stored in TLS and (hopefully) printed out
+            // in the event of a crash.
+            debug_expr = unit->id;
+            debug_expr_ptr = expr_it->second.c_str();
+            debug_corpus = unit->corpus_id;
+            debug_corpus_ptr = unit->corpus.data.c_str();
+            debug_corpus_len = unit->corpus.data.size();
+
+            if (!use_UE2) {
+                runGroundCompTestUnit(out, ground, graph, *unit, summary,
+                                      exprMap);
+            } else {
+                runTestUnit(out, ground, graph, ultimate, *unit, summary,
+                            exprMap);
+            }
+
+            switch (unit->result) {
+            case TEST_NO_GROUND_TRUTH:
+                // this is fine, continue
+                summary.nogtIds.insert(unit->id);
+                break;
+            case TEST_FAILED:
+                summary.failIds.insert(unit->id);
+                break;
+            default:
+                break;
+            }
+
+            count++;
+            summary.totalCorpora++;
+            flush_output();
+        }
+
+        DEBUG_PRINTF("thread %zu stopped\n", thread_id);
+    }
+
+    const TestSummary &getSummary() const { return summary; }
+
+public:
+    size_t count = 0; // number of units processed
+
+private:
+    // Queue shared with the other scanners and with the generators.
+    BoundedQueue<TestUnit> &q;
+
+    // Per-thread state.
+    GroundTruth ground; // independent copy
+    GraphTruth graph; // independent copy
+    UltimateTruth ultimate; // independent copy
+    TestSummary summary;
+
+    // Read-only data shared between threads.
+    const ExpressionMap &exprMap;
+};
+
+/** Represent a work item for the corpus generation threads. This contains
+ * all information relating to an expression. The corpus generator will
+ * generate corpora for this expression and enqueue work items representing
+ * complete test cases for the scanning threads.
+ *
+ * The cngi and pcre handles are owned by the unit. makeCorpusGenUnit() only
+ * creates a unit when at least one of the two is non-null, so either one may
+ * be null on its own.
+ */
+struct CorpusGenUnit {
+ // Takes ownership of both compiled-pattern handles; the ue2 proxy is shared.
+ CorpusGenUnit(unique_ptr<CNGInfo> cngi_in, unique_ptr<CompiledPcre> pcre_in,
+ shared_ptr<DatabaseProxy> ue2_in, unsigned expr_id,
+ bool multi_in, bool utf8_in)
+ : cngi(move(cngi_in)), pcre(move(pcre_in)), ue2(ue2_in), id(expr_id),
+ multi(multi_in), utf8(utf8_in) {}
+
+ // Ground truth handles. CorpusGenThread::run() moves both out of the unit
+ // when it passes them to addCorporaToQueue().
+ unique_ptr<CNGInfo> cngi;
+ unique_ptr<CompiledPcre> pcre;
+
+ /* ue2 shared_ptr as in multicompile and banded compile it is shared amongst
+ * various corpus units (with differing expression ids). */
+ shared_ptr<DatabaseProxy> ue2;
+
+ unsigned id; // expression id
+ bool multi; // ue2 contains more than one expression
+ bool utf8; // ue2 can be run against utf8 corpora
+};
+
+// Worker that pops CorpusGenUnits from the generator queue, generates their
+// corpora and pushes the resulting test units onto the scan queue. It stops
+// on the first null unit it pops.
+class CorpusGenThread : public OutputThread {
+public:
+    CorpusGenThread(size_t id, BoundedQueue<TestUnit> &testq_in,
+                    BoundedQueue<CorpusGenUnit> &corpq_in,
+                    const CorporaSource &corpora_in)
+        : OutputThread(id), testq(testq_in), corpq(corpq_in),
+          corpora(corpora_in.clone()) {}
+
+    void run() override {
+        DEBUG_PRINTF("thread %zu running\n", thread_id);
+
+        // A null unit is the sentinel that ends the loop.
+        while (auto work = corpq.pop(thread_id)) {
+            // The compiled-pattern handles leave the work item here; the ue2
+            // proxy is copied because other units may share it.
+            addCorporaToQueue(out, testq, work->id, *corpora, summary,
+                              move(work->pcre), move(work->cngi), work->ue2,
+                              work->multi, work->utf8);
+
+            count++;
+            flush_output();
+        }
+    }
+
+    const TestSummary &getSummary() const { return summary; }
+
+public:
+    size_t count = 0; // number of units processed
+
+private:
+    // Scan queue that generated test units are pushed to; shared.
+    BoundedQueue<TestUnit> &testq;
+
+    // Queue this thread takes its work from; shared between generators.
+    BoundedQueue<CorpusGenUnit> &corpq;
+
+    // Per-thread state.
+    const unique_ptr<CorporaSource> corpora; // independent copy
+    TestSummary summary;
+};
+
+} // namespace
+
+// Preprocess expression `id` into NFA graph info for use as ground truth.
+//
+// Returns:
+//  - the graph info when preprocessing succeeds;
+//  - graph info marked bad when the pattern is valid but uses something the
+//    graph truth does not support (counted in summary.failNGCompile);
+//  - null when the pattern does not compile. In that case UE2 is asked to
+//    compile it too; if UE2 succeeds, the discrepancy is reported and counted
+//    in summary.failCompileDifference.
+//
+// Each failing pattern is counted exactly once in summary.failNGCompile.
+static
+unique_ptr<CNGInfo> makeNGInfo(const unsigned id, TestSummary &summary,
+                               GraphTruth &graph, UltimateTruth &ultimate,
+                               shared_ptr<DatabaseProxy> ue2) {
+    string nfaErr;
+
+    try {
+        debug_stage = STAGE_GRAPH_PREPROCESS;
+        auto cngi = graph.preprocess(id);
+        debug_stage = STAGE_UNDEFINED;
+        return cngi;
+    }
+    catch (const NGCompileFailure &err) {
+        nfaErr = err.msg;
+        debug_stage = STAGE_UNDEFINED;
+        // fall through
+    }
+    catch (const NGUnsupportedFailure &err) {
+        // unsupported error happens when the pattern appears to be valid, but
+        // there are things that we don't yet support (e.g. SOM).
+        // in this case, try again, suppressing the errors
+        debug_stage = STAGE_UNDEFINED;
+
+        // try again and suppress unsupported errors
+        try {
+            debug_stage = STAGE_GRAPH_PREPROCESS;
+            auto cngi = graph.preprocess(id, true);
+            debug_stage = STAGE_UNDEFINED;
+
+            // Count the failure only once we know the retry succeeded. If
+            // the retry throws instead, the fall-through code below does the
+            // counting, and counting here as well would record this pattern
+            // twice.
+            summary.failNGCompile++;
+
+            // preprocess succeeded - that means the pattern itself is valid.
+            // however, we can't use it, so we have to mark it as bad
+            // only print the error in the following cases:
+            // 1) if verbose is specified
+            // 2) if we are not using UE2 and quiet is NOT specified
+            if ((!use_UE2 && !g_quiet) || g_verbose) {
+                cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
+                     << err.msg << ")" << endl;
+            }
+            cngi->mark_bad();
+            return cngi;
+        }
+        catch (const NGCompileFailure &e) {
+            // compile failed
+            nfaErr = e.msg;
+            debug_stage = STAGE_UNDEFINED;
+            // fall through
+        }
+    }
+
+    // We should ensure that we also fail compilation with UE2, otherwise we
+    // likely have a pattern support bug.
+    try {
+        auto db = ue2->get(ultimate);
+        if (db) {
+            // if we made it this far, that means UE2 compile succeeded while
+            // NFA compile failed.
+            cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
+                 << nfaErr << ") but UE2 compile succeeded." << endl;
+            summary.failNGCompile++;
+            summary.failCompileDifference++;
+            return nullptr;
+        }
+        // If db is nullptr, we have previously failed compilation of this
+        // database.
+    }
+    catch (const CompileFailed &) {
+        // Everything's OK: compilation failed in Hyperscan as well. Fall
+        // through.
+    }
+    summary.failNGCompile++;
+    if (!g_quiet) {
+        cout << "FAILED: id " << id << ", NFA graph preprocess failed ("
+             << nfaErr << ")" << endl;
+    }
+    return nullptr;
+}
+
+// Compile expression `id` with libpcre for use as ground truth.
+//
+// Returns the compiled pattern, or null when compilation fails. Every failure
+// is counted in summary.failPcreCompile. When PCRE fails hard but UE2 compiles
+// the pattern, and an unwrapped PCRE compile fails as well, the mismatch is
+// additionally counted in summary.failCompileDifference.
+static
+unique_ptr<CompiledPcre> makePcre(const unsigned id, TestSummary &summary,
+                                  GroundTruth &ground, UltimateTruth &ultimate,
+                                  shared_ptr<DatabaseProxy> ue2) {
+    string compile_error;
+
+    debug_stage = STAGE_PCRE_COMPILE;
+    try {
+        auto compiled = ground.compile(id);
+        debug_stage = STAGE_UNDEFINED;
+        return compiled;
+    }
+    catch (const SoftPcreCompileFailure &soft) {
+        // Soft errors are only reported in verbose mode.
+        debug_stage = STAGE_UNDEFINED;
+        summary.failPcreCompile++;
+        if (g_verbose) {
+            cout << "FAILED: id " << id
+                 << ", libpcre compile failed with soft error: " << soft.msg
+                 << endl;
+        }
+        return nullptr;
+    }
+    catch (const PcreCompileFailure &hard) {
+        debug_stage = STAGE_UNDEFINED;
+        compile_error = hard.msg;
+        // continue below with the UE2 cross-check
+    }
+
+    // UE2 should reject the pattern as well; if it does not, we probably have
+    // a pattern support bug.
+    try {
+        const auto db = ue2->get(ultimate);
+        if (db) {
+            // PCRE failed but UE2 succeeded. This is legal in one case:
+            // patterns beginning with (*UTF8) throw an error because of the
+            // callback wrapping we do for PCRE. Compiling an "unwrapped" PCRE
+            // tells the two cases apart: if this call does not throw, the
+            // difference is not worth reporting and we carry on below.
+            ground.compile(id, true);
+        }
+    }
+    catch (const CompileFailed &) {
+        // Fine: Hyperscan failed to compile the pattern too.
+    }
+    catch (const PcreCompileFailure &) {
+        cout << "FAILED: id " << id << ", libpcre compile failed ("
+             << compile_error << ") but UE2 compile succeeded." << endl;
+        summary.failPcreCompile++;
+        summary.failCompileDifference++;
+        return nullptr;
+    }
+
+    summary.failPcreCompile++;
+    if (!g_quiet) {
+        cout << "FAILED: id " << id
+             << ", libpcre compile failed: " << compile_error << endl;
+    }
+    return nullptr;
+}
+
+// Stop the generator threads: queue one null sentinel per thread, then join
+// each thread and merge its summary into `summary`.
+static
+void drainGenerators(BoundedQueue<CorpusGenUnit> &corpq,
+                     vector<unique_ptr<CorpusGenThread>> &generators,
+                     TestSummary &summary) {
+    // One sentinel for every generator thread.
+    size_t sentinels = generators.size();
+    while (sentinels > 0) {
+        corpq.push(nullptr);
+        --sentinels;
+    }
+
+    // Wait for each worker to finish, then collect its results.
+    for (const auto &gen : generators) {
+        gen->join();
+        summary.merge(gen->getSummary());
+    }
+}
+
+// Compile the ground truth sources for expression `id` and wrap them in a
+// CorpusGenUnit. Returns null if neither PCRE nor the NFA graph is available.
+//
+// Note: In multi-pattern cases, utf8 is true if any pattern to be run against
+// this corpus is in UTF-8 mode.
+static
+unique_ptr<CorpusGenUnit> makeCorpusGenUnit(unsigned id, TestSummary &summary,
+                                            GroundTruth &ground,
+                                            GraphTruth &graph,
+                                            UltimateTruth &ultimate,
+                                            shared_ptr<DatabaseProxy> ue2,
+                                            bool multi, bool utf8) {
+    // PCRE bytecode, when PCRE is in use.
+    unique_ptr<CompiledPcre> pcre_truth;
+    if (use_PCRE) {
+        pcre_truth = makePcre(id, summary, ground, ultimate, ue2);
+    }
+
+    // NFA graph info, when the NFA is in use.
+    unique_ptr<CNGInfo> graph_truth;
+    if (use_NFA) {
+        graph_truth = makeNGInfo(id, summary, graph, ultimate, ue2);
+    }
+
+    // Nothing to test against if both compiles failed.
+    if (!pcre_truth && !graph_truth) {
+        return nullptr;
+    }
+
+    // The caller may already have set the UTF-8 property (in multi cases);
+    // the pattern's own property can only switch it on.
+    const bool pattern_utf8 = pcre_truth ? pcre_truth->utf8 : graph_truth->utf8;
+    if (pattern_utf8) {
+        utf8 = true;
+    }
+
+    return ue2::make_unique<CorpusGenUnit>(move(graph_truth), move(pcre_truth),
+                                           ue2, id, multi, utf8);
+}
+
+// Returns true if any expression in [it, end) compiles with PCRE in UTF-8
+// mode. Expressions that fail to compile are ignored.
+static
+bool hasUTF8Pattern(GroundTruth &ground, ExpressionMap::const_iterator it,
+                    ExpressionMap::const_iterator end) {
+    /* note: we cannot just check the flags as utf8 can be enabled in the
+     * pattern itself with (*UTF) */
+    debug_stage = STAGE_PCRE_COMPILE;
+
+    bool found = false;
+    while (!found && it != end) {
+        try {
+            auto compiled = ground.compile(it->first);
+            assert(compiled); // Would have thrown PcreCompileFailure otherwise.
+            found = compiled->utf8;
+        }
+        catch (const PcreCompileFailure &) {
+            // Not compilable: move on to the next expression.
+        }
+        ++it;
+    }
+
+    if (found) {
+        DEBUG_PRINTF("UTF8 mode\n");
+    }
+    debug_stage = STAGE_UNDEFINED;
+    return found;
+}
+
+// Fill the generator queue with single-pattern tests: one UE2 database and
+// one CorpusGenUnit per expression.
+static
+void buildSingle(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
+                 GroundTruth &ground, GraphTruth &graph,
+                 UltimateTruth &ultimate, const ExpressionMap &exprMap) {
+    for (auto it = exprMap.begin(); it != exprMap.end(); ++it) {
+        const unsigned id = it->first;
+        debug_expr = id;
+        debug_expr_ptr = it->second.c_str();
+
+        const set<unsigned int> single{id};
+        shared_ptr<DatabaseProxy> ue2 = constructDatabase(single, ultimate);
+        if (!ue2) {
+            summary.failUe2Compile++;
+            continue;
+        }
+
+        // When cross-compiling we only emit database bytecode: no PCRE
+        // build and no scans.
+        if (!ultimate.runnable()) {
+            continue;
+        }
+
+        auto work = makeCorpusGenUnit(id, summary, ground, graph, ultimate,
+                                      ue2, /* multi */ false,
+                                      /* utf8 */ false);
+        if (work) {
+            corpq.push(move(work));
+        }
+    }
+}
+
+// Fill a test queue with multi-pattern tests of size N, where N is the band
+// size specified on the command line.
+//
+// The expression map is walked in bands of up to multicompile_bands
+// expressions. Each band is compiled into a single UE2 database shared by the
+// CorpusGenUnits created for the expressions in that band.
+static
+void buildBanded(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
+                 GroundTruth &ground, GraphTruth &graph,
+                 UltimateTruth &ultimate, const ExpressionMap &exprMap) {
+    for (auto i = exprMap.begin(), e = exprMap.end(); i != e;) {
+        debug_expr = i->first;
+        debug_expr_ptr = i->second.c_str();
+
+        // Build a set of IDs in this band from the expression map
+        set<unsigned> bandIds;
+
+        if (g_verbose) {
+            cout << "Building set:";
+        }
+
+        ExpressionMap::const_iterator band_end = i;
+        for (u32 j = 0; j < multicompile_bands && band_end != e;
+             j++, ++band_end) {
+            bandIds.insert(bandIds.end(), band_end->first);
+            if (g_verbose) {
+                cout << " " << band_end->first;
+            }
+        }
+
+        if (g_verbose) {
+            cout << endl;
+        }
+
+        // compile UE2 bytecode
+        shared_ptr<DatabaseProxy> ue2 = constructDatabase(bandIds, ultimate);
+        if (!ue2) {
+            summary.failUe2Compile++;
+            i = band_end;
+            continue;
+        }
+
+        // if we're cross-compiling, then we don't bother building PCRE and
+        // running scans, we're just going to output the database bytecode.
+        if (!ultimate.runnable()) {
+            i = band_end;
+            continue;
+        }
+
+        bool utf8 = hasUTF8Pattern(ground, i, band_end);
+
+        for (; i != band_end; ++i) {
+            unsigned id = i->first;
+            // Keep the crash-debug info pointing at the pattern actually
+            // being compiled, as buildSingle() and buildMulti() do, rather
+            // than at the first pattern of the band.
+            debug_expr = id;
+            debug_expr_ptr = i->second.c_str();
+            bool multi = true;
+            auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate,
+                                       ue2, multi, utf8);
+            if (u) {
+                corpq.push(move(u));
+            }
+        }
+    }
+}
+
+// Fill the generator queue with multi-pattern tests: every expression is
+// compiled into one shared UE2 database and gets its own CorpusGenUnit.
+static
+void buildMulti(BoundedQueue<CorpusGenUnit> &corpq, TestSummary &summary,
+                GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate,
+                const ExpressionMap &exprMap) {
+    // Collect every expression id.
+    set<unsigned> all_ids;
+    for (auto it = exprMap.begin(); it != exprMap.end(); ++it) {
+        all_ids.insert(it->first);
+    }
+
+    // One UE2 compile for the whole set.
+    shared_ptr<DatabaseProxy> ue2 = constructDatabase(all_ids, ultimate);
+    if (!ue2) {
+        summary.failUe2Compile++;
+        return;
+    }
+
+    // When cross-compiling we only emit database bytecode: no PCRE build and
+    // no scans.
+    if (!ultimate.runnable()) {
+        return;
+    }
+
+    const bool any_utf8 =
+        hasUTF8Pattern(ground, exprMap.begin(), exprMap.end());
+
+    for (const auto &entry : exprMap) {
+        const unsigned id = entry.first;
+        debug_expr = id;
+        debug_expr_ptr = entry.second.c_str();
+
+        auto work = makeCorpusGenUnit(id, summary, ground, graph, ultimate,
+                                      ue2, /* multi */ true, any_utf8);
+        if (work) {
+            corpq.push(move(work));
+        }
+    }
+}
+
+// Start the corpus generator threads, feed them one work item per expression
+// (single, banded or multi compile, depending on the command line), and wait
+// for them to finish. Generated test units end up on `testq`.
+static
+void generateTests(CorporaSource &corpora_src, const ExpressionMap &exprMap,
+                   TestSummary &summary, const hs_platform_info *plat,
+                   const Grey &grey, BoundedQueue<TestUnit> &testq) {
+    GraphTruth graph(cout, exprMap);
+    GroundTruth ground(cout, exprMap, g_matchLimit, g_matchLimitRecursion);
+    UltimateTruth ultimate(cout, exprMap, plat, grey, g_streamBlocks);
+
+    // Generator queue and the threads that consume it.
+    BoundedQueue<CorpusGenUnit> corpq(numGeneratorThreads,
+                                      max_generator_queue_len);
+    vector<unique_ptr<CorpusGenThread>> generators;
+    for (size_t i = 0; i < numGeneratorThreads; i++) {
+        auto gen = make_unique<CorpusGenThread>(i, testq, corpq, corpora_src);
+        gen->start();
+        generators.push_back(move(gen));
+    }
+
+    if (!g_ue2CompileAll) {
+        printf("Running single-pattern/single-compile test for %zu "
+               "expressions.\n\n", exprMap.size());
+        buildSingle(corpq, summary, ground, graph, ultimate, exprMap);
+    } else if (multicompile_bands) {
+        printf("Running single-pattern/banded-multi-compile test for %zu "
+               "expressions.\n\n", exprMap.size());
+        buildBanded(corpq, summary, ground, graph, ultimate, exprMap);
+    } else {
+        printf("Running single-pattern/multi-compile test for %zu "
+               "expressions.\n\n", exprMap.size());
+        buildMulti(corpq, summary, ground, graph, ultimate, exprMap);
+    }
+
+    drainGenerators(corpq, generators, summary);
+}
+
+// Print the full, human-readable description of the run's settings to cout.
+// `platform` is non-null only when a cross-compile target was given.
+static
+void printSettingsV(const vector<string> &corporaFiles,
+                    const hs_platform_info *platform) {
+    cout << "hscollider: The Pattern Collider Mark II\n\n";
+    cout << "Number of threads: " << numThreads << " (" << numScannerThreads
+         << " scanner, " << numGeneratorThreads << " generator)\n";
+    cout << "Expression path: " << g_exprPath << "\n";
+
+    // Signature files, one per line; continuation lines are indented.
+    cout << "Signature files: ";
+    if (g_signatureFiles.empty()) {
+        cout << "none" << endl;
+    }
+    bool first_file = true;
+    for (const string &fname : g_signatureFiles) {
+        if (!first_file) {
+            cout << string(20, ' ');
+        }
+        cout << fname << endl;
+        first_file = false;
+    }
+
+    cout << "Mode of operation: ";
+    if (colliderMode == MODE_BLOCK) {
+        cout << "block mode";
+    } else if (colliderMode == MODE_STREAMING) {
+        cout << "streaming mode";
+    } else if (colliderMode == MODE_VECTORED) {
+        cout << "vectored mode";
+    }
+    cout << endl;
+
+    if (limit_matches) {
+        cout << "Terminate scanning after " << limit_matches << " matches."
+             << endl;
+    }
+
+    if (platform) {
+        cout << "Cross-compile for: " << to_string(*platform) << endl;
+    }
+
+    if (loadDatabases) {
+        cout << "Loading DBs from: " << serializePath << endl;
+    }
+    if (saveDatabases) {
+        cout << "Saving DBs to: " << serializePath << endl;
+    }
+
+    if (colliderMode == MODE_STREAMING) {
+        cout << "Stream block count: " << g_streamBlocks << endl;
+    } else if (colliderMode == MODE_VECTORED) {
+        cout << "Vectored block count: " << g_streamBlocks << endl;
+    }
+
+    if (use_UE2) {
+        // A single alignment is shown as a value, several as a range.
+        if (max_ue2_align != min_ue2_align + 1) {
+            cout << "UE2 scan alignment: [" << min_ue2_align << ", "
+                 << max_ue2_align << ")" << endl;
+        } else {
+            cout << "UE2 scan alignment: " << min_ue2_align << endl;
+        }
+    }
+
+    if (corporaFiles.empty()) {
+        // Generated corpora: describe the generator properties.
+        cout << "Corpora properties: \n";
+        cout << " random seed: " << corpus_gen_prop.getSeed() << "\n";
+        cout << " percentages: " << corpus_gen_prop.percentMatch()
+             << "% match, ";
+        cout << corpus_gen_prop.percentUnmatch() << "% unmatch, ";
+        cout << corpus_gen_prop.percentRandom() << "% random" << endl;
+
+        // prefix and suffix info
+        const min_max &prefix = corpus_gen_prop.prefixRange;
+        const min_max &suffix = corpus_gen_prop.suffixRange;
+        if (!prefix.max) {
+            cout << " random prefix: none" << endl;
+        } else {
+            cout << " random prefix: " << prefix.min << " to "
+                 << prefix.max << endl;
+        }
+        if (!suffix.max) {
+            cout << " random suffix: none" << endl;
+        } else {
+            cout << " random suffix: " << suffix.min
+                 << " to " << suffix.max << endl;
+        }
+
+        // cycle info
+        pair<unsigned, unsigned> cycles = corpus_gen_prop.getCycleLimit();
+        cout << " follow cycles: " << cycles.first << " to "
+             << cycles.second << " times" << endl;
+    } else {
+        for (const auto &path : corporaFiles) {
+            cout << "Corpora read from file: " << path << endl;
+        }
+    }
+
+    if (saveCorpora) {
+        cout << "Saving corpora to: " << saveCorporaFile << endl;
+    }
+
+    cout << endl;
+}
+
+// Print the compact description of the run's settings to cout. This variant
+// is selected by printSettings() when g_quiet > 1. For generated corpora the
+// properties are printed in command-line option form (-R, -p, -P, -S, -C).
+// `platform` is non-null only when a cross-compile target was given.
+static
+void printSettingsQ(const vector<string> &corporaFiles,
+                    const hs_platform_info *platform) {
+    cout << "Number of threads: " << numThreads << endl
+         << "Expression path: " << g_exprPath << endl
+         << "Signature files: ";
+    if (g_signatureFiles.empty()) {
+        cout << "none" << endl;
+    } else {
+        for (unsigned i = 0; i < g_signatureFiles.size(); i++) {
+            string &fname = g_signatureFiles[i];
+            if (i > 0) {
+                // Indent continuation lines under the first file name.
+                cout << string(20, ' ');
+            }
+            cout << fname << endl;
+        }
+    }
+    cout << "Mode of operation: ";
+
+    switch (colliderMode) {
+    case MODE_BLOCK: cout << "block mode"; break;
+    case MODE_STREAMING: cout << "streaming mode"; break;
+    case MODE_VECTORED: cout << "vectored mode"; break;
+    }
+    cout << endl;
+
+    if (limit_matches) {
+        cout << "Terminate scanning after " << limit_matches << " matches."
+             << endl;
+    }
+
+    if (platform) {
+        cout << "Cross-compile for: " << to_string(*platform) << endl;
+    }
+
+    if (colliderMode == MODE_STREAMING) {
+        cout << "Stream block count: " << g_streamBlocks << endl;
+    }
+    if (colliderMode == MODE_VECTORED) {
+        cout << "Vectored block count: " << g_streamBlocks << endl;
+    }
+
+    if (max_ue2_align == min_ue2_align + 1) {
+        cout << "UE2 scan alignment: " << min_ue2_align << endl;
+    } else {
+        cout << "UE2 scan alignment: [" << min_ue2_align << ", "
+             << max_ue2_align << ")" << endl;
+    }
+
+    // A space separates the size from its unit; without it the output read
+    // "Prefix of 12bytes".
+    if (!g_corpora_prefix.empty()) {
+        cout << "Prefix of " << g_corpora_prefix.size() << " bytes" << endl;
+    }
+    if (!g_corpora_suffix.empty()) {
+        cout << "Suffix of " << g_corpora_suffix.size() << " bytes" << endl;
+    }
+
+    if (!corporaFiles.empty()) {
+        cout << "Corpora: from file" << endl;
+    } else {
+        cout << "Corpora: -R " << corpus_gen_prop.getSeed() << " -p "
+             << corpus_gen_prop.percentMatch() << ","
+             << corpus_gen_prop.percentUnmatch() << ","
+             << corpus_gen_prop.percentRandom();
+
+        // prefix and suffix info
+        const min_max &prefixSpan = corpus_gen_prop.prefixRange;
+        const min_max &suffixSpan = corpus_gen_prop.suffixRange;
+        if (prefixSpan.max) {
+            cout << " -P " << prefixSpan.min << "," << prefixSpan.max;
+        }
+        if (suffixSpan.max) {
+            cout << " -S " << suffixSpan.min << "," << suffixSpan.max;
+        }
+
+        // cycle info
+        pair<unsigned, unsigned> cycleSpan = corpus_gen_prop.getCycleLimit();
+        cout << " -C " << cycleSpan.first << "," << cycleSpan.second;
+        cout << endl;
+    }
+}
+
+// Print the run settings: the compact form when g_quiet is greater than one,
+// the full form otherwise.
+static
+void printSettings(const vector<string> &c, const hs_platform_info *plat) {
+    const bool compact = g_quiet > 1;
+    if (!compact) {
+        printSettingsV(c, plat);
+        return;
+    }
+    printSettingsQ(c, plat);
+}
+
+// Create the corpora source for this run: corpora read from the given files
+// when any were supplied, otherwise corpora generated from the expressions.
+// A file that cannot be read is reported and exit_with_fail() is called.
+static
+unique_ptr<CorporaSource> buildCorpora(const vector<string> &corporaFiles,
+                                       const ExpressionMap &exprMap) {
+    if (corporaFiles.empty()) {
+        return ue2::make_unique<NfaGeneratedCorpora>(
+            exprMap, corpus_gen_prop, force_utf8, force_prefilter);
+    }
+
+    auto from_files = ue2::make_unique<FileCorpora>();
+    for (const auto &path : corporaFiles) {
+        const bool loaded = from_files->readFile(path);
+        if (!loaded) {
+            cout << "Error reading corpora from file: " << path << endl;
+            exit_with_fail();
+        }
+    }
+    return from_files;
+}
+
+// Returns true if `s` must be wrapped in quotes when echoed as part of a
+// command line: that is, when it is empty or contains a blank (space or tab
+// in the default locale).
+static
+bool needsQuotes(const char *s) {
+    const size_t len = strlen(s);
+    if (len == 0) {
+        return true;
+    }
+
+    // Go through unsigned char: handing std::isblank a negative plain char
+    // (any non-ASCII byte where char is signed) is undefined behaviour.
+    // Calling with a single argument also selects the <cctype> overload
+    // rather than the two-argument template from <locale>.
+    auto is_blank = [](char c) {
+        return std::isblank(static_cast<unsigned char>(c)) != 0;
+    };
+
+    return std::find_if(s, s + len, is_blank) != s + len;
+}
+
+// Append the program's command line to g_cmdline, with arguments separated by
+// single spaces and any argument for which needsQuotes() is true wrapped in
+// double quotes.
+static
+void storeCmdline(int argc, char **argv) {
+    for (int i = 0; i < argc; i++) {
+        // Separator goes between arguments, never after the last one.
+        if (i > 0) {
+            g_cmdline += " ";
+        }
+
+        const char *arg = argv[i];
+        const bool quoted = needsQuotes(arg);
+        if (quoted) {
+            g_cmdline += '"';
+        }
+        g_cmdline += arg;
+        if (quoted) {
+            g_cmdline += '"';
+        }
+    }
+}
+
+// Run the whole collider: start the scanner threads, generate and queue the
+// tests, wait for the scanners to drain the queue, then print the merged
+// summary. Returns true when the summary records no failure.
+static
+bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap,
+              const hs_platform_info *plat, const Grey &grey) {
+    TestSummary summary;
+    summary.totalExpressions = exprMap.size();
+    BoundedQueue<TestUnit> testq(numScannerThreads, max_scan_queue_len);
+
+    // Scanner threads are started first so they can consume units as soon as
+    // the generators produce them.
+    vector<unique_ptr<ScanThread>> scanners;
+    for (size_t i = 0; i < numScannerThreads; i++) {
+        auto scanner =
+            ue2::make_unique<ScanThread>(i, testq, exprMap, plat, grey);
+        scanner->start();
+        scanners.push_back(move(scanner));
+    }
+
+    generateTests(corpora_source, exprMap, summary, plat, grey, testq);
+
+    // One sentinel per scanner, so that every thread sees the end of the
+    // work.
+    for (size_t pending = scanners.size(); pending > 0; pending--) {
+        testq.push(nullptr);
+    }
+
+    // Join each scanner and fold its results into the overall summary.
+    size_t index = 0;
+    for (const auto &scanner : scanners) {
+        scanner->join();
+
+        if (g_verbose) {
+            cout << "Thread " << index << " processed " << scanner->count
+                 << " units." << endl;
+        }
+
+        summary.merge(scanner->getSummary());
+        index++;
+    }
+
+    printSummary(summary);
+    return !summary.hasFailure();
+}
+
+/*
+ * Program entry point.
+ *
+ * Echoes any "-G" overrides found on the command line, applies defaults,
+ * records the command line in g_cmdline, parses the arguments, loads (and
+ * optionally filters) the expressions, builds the corpus source and runs the
+ * tests.
+ *
+ * Returns 0 when runTests() reports success; otherwise exit_with_fail() is
+ * called. Also exits with status 0 when there are no signatures to scan.
+ */
+int main(int argc, char *argv[]) {
+ Grey grey;
+ vector<string> corporaFiles;
+
+ // Echo each "-G" option together with the argument that follows it. The
+ // loop stops at argc - 1 so that argv[i + 1] is always a valid argument.
+ for (int i = 1; i < argc - 1; i++) {
+ if (!strcmp(argv[i], "-G")) {
+ cout << "Override: " << argv[i + 1] << endl;
+ }
+ }
+
+ setDefaults();
+ // Keep a copy of the command line; the crash handler prints g_cmdline.
+ storeCmdline(argc, argv);
+ unique_ptr<hs_platform_info> plat;
+ corpus_gen_prop.seed(randomSeed);
+
+ processArgs(argc, argv, corpus_gen_prop, &corporaFiles, &grey, &plat);
+
+ // If the user has asked for a random alignment, we select it here (after
+ // random number seed applied).
+ if (use_random_alignment) {
+ min_ue2_align = corpus_gen_prop.rand(0, 15);
+ max_ue2_align = min_ue2_align + 1;
+ }
+
+ // Limit memory usage, unless the user has specified zero on the command
+ // line or in a config file.
+ if (g_memoryLimit) {
+ setMemoryLimit(g_memoryLimit * numThreads);
+ }
+
+ // Split threads available up amongst scanner and generator threads.
+ // Each side is clamped to at least one thread.
+ numGeneratorThreads = max(1u, static_cast<unsigned int>(numThreads * 0.5));
+ numScannerThreads = max(1u, numThreads - numGeneratorThreads);
+
+ ExpressionMap exprMap;
+ loadExpressions(g_exprPath, exprMap);
+
+ // Unless all signatures were requested, restrict the expression map to
+ // the signatures listed in the signature files or, when no files were
+ // given, to those in g_signatures.
+ if (!g_allSignatures) {
+ SignatureSet signatures;
+ if (!g_signatureFiles.empty()) {
+ for (string &fname : g_signatureFiles) {
+ loadSignatureList(fname, signatures);
+ }
+ } else {
+ signatures.insert(signatures.end(), g_signatures.begin(),
+ g_signatures.end());
+ }
+
+ exprMap = limitToSignatures(exprMap, signatures);
+ }
+
+ printSettings(corporaFiles, plat.get());
+
+ // Nothing to scan is reported as a warning, not as a failure.
+ if (exprMap.empty()) {
+ cout << "Warning: no signatures to scan. Exiting." << endl;
+ exit(0);
+ }
+
+ if (!no_signal_handler) {
+ installSignalHandler();
+ }
+
+ // When databases are saved or loaded, the serialize path must be
+ // stat()-able before any test runs.
+ if (saveDatabases || loadDatabases) {
+ struct stat st;
+ if (stat(serializePath.c_str(), &st) < 0) {
+ cout << "Unable to stat serialize path '" << serializePath
+ << "': " << strerror(errno) << endl;
+ exit_with_fail();
+ }
+ }
+
+ // If we're saving corpora out, truncate the output file.
+ if (saveCorpora) {
+ corporaOut = ue2::make_unique<CorpusWriter>(saveCorporaFile);
+ }
+
+ GroundTruth::global_prep();
+
+ auto corpora_source = buildCorpora(corporaFiles, exprMap);
+
+ if (!g_verbose && g_quiet < 2) {
+ cout << "Only failed tests are displayed." << endl;
+ }
+
+ // Time the test run itself; everything above is excluded.
+ SimpleTimer timer;
+ bool success = runTests(*corpora_source, exprMap, plat.get(), grey);
+ cout << "\nTotal elapsed time: " << timer.elapsed() << " secs." << endl;
+ exprMap.clear();
+
+ if (!success) {
+ exit_with_fail();
+ }
+
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pcre_util.h"
+
+#include "hs.h"
+
+#include <assert.h>
+#include <pcre.h> /* for pcre flags */
+
+/*
+ * Splits a Hyperscan flag word into the equivalent PCRE compile options
+ * (written to *flags) and three booleans for the Hyperscan-only modes
+ * (single match, prefilter, leftmost start of match). All four output
+ * pointers must be non-null.
+ *
+ * Returns true if every bit set in hs_flags was recognised; otherwise asserts
+ * and returns false.
+ */
+bool getPcreFlags(unsigned int hs_flags, unsigned int *flags,
+                  bool *highlander, bool *prefilter, bool *som) {
+    assert(flags);
+    assert(highlander);
+    assert(prefilter);
+    assert(som);
+
+    // Hyperscan flags that map one-to-one onto a PCRE compile option.
+    static const struct {
+        unsigned int hs;
+        unsigned int pcre;
+    } direct[] = {
+        {HS_FLAG_CASELESS, PCRE_CASELESS},
+        {HS_FLAG_DOTALL, PCRE_DOTALL},
+        {HS_FLAG_MULTILINE, PCRE_MULTILINE},
+        {HS_FLAG_UCP, PCRE_UCP},
+        {HS_FLAG_UTF8, PCRE_UTF8},
+    };
+
+    // Bits that are understood; HS_FLAG_ALLOWEMPTY is irrelevant to PCRE and
+    // is simply accepted.
+    unsigned int handled = HS_FLAG_SINGLEMATCH | HS_FLAG_PREFILTER |
+                           HS_FLAG_SOM_LEFTMOST | HS_FLAG_ALLOWEMPTY;
+
+    *flags = 0;
+    for (const auto &entry : direct) {
+        if (hs_flags & entry.hs) {
+            *flags |= entry.pcre;
+        }
+        handled |= entry.hs;
+    }
+
+    // Modes with no PCRE option are reported through the boolean outputs.
+    *highlander = (hs_flags & HS_FLAG_SINGLEMATCH) != 0;
+    *prefilter = (hs_flags & HS_FLAG_PREFILTER) != 0;
+    *som = (hs_flags & HS_FLAG_SOM_LEFTMOST) != 0;
+
+    const unsigned int unknown = hs_flags & ~handled;
+    if (unknown == 0) {
+        return true;
+    }
+
+    // A flag this function has not been taught about.
+    assert(0);
+    return false;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PCRE_UTIL_H
+#define PCRE_UTIL_H
+
+/** Translates the given hyperscan flags into pcre flags (where appropriate)
+ * and other bools (for flags which are not directly translatable).
+ *
+ * \param hs_flags   Hyperscan HS_FLAG_* bits to translate.
+ * \param pcre_flags Receives the PCRE options for the caseless, dotall,
+ *                   multiline, UCP and UTF8 flags.
+ * \param highlander Set to true if HS_FLAG_SINGLEMATCH is present.
+ * \param prefilter  Set to true if HS_FLAG_PREFILTER is present.
+ * \param som        Set to true if HS_FLAG_SOM_LEFTMOST is present.
+ *
+ * All output pointers must be non-null (this is asserted).
+ * HS_FLAG_ALLOWEMPTY is accepted and ignored.
+ *
+ * Returns false if an unknown hyperscan flag is encountered.
+ */
+bool getPcreFlags(unsigned int hs_flags, unsigned int *pcre_flags,
+ bool *highlander, bool *prefilter, bool *som);
+
+#endif /* PCRE_UTIL_H */
+
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "sig.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctype.h>
+#include <string>
+
+#ifdef HAVE_SIGACTION
+#include <signal.h>
+#endif
+
+#ifdef HAVE_BACKTRACE
+#include <execinfo.h>
+#include <unistd.h>
+#endif
+
+// Maximum number of stack frames requested from backtrace() by the crash
+// handler.
+#define BACKTRACE_BUFFER_SIZE 200
+
+// State describing what the current thread is working on. The crash handler
+// prints these values; each is declared TLS_VARIABLE and volatile.
+
+// Current processing stage (one of the STAGE_* values).
+TLS_VARIABLE volatile int debug_stage = STAGE_UNDEFINED;
+// Number of the expression being processed.
+TLS_VARIABLE volatile int debug_expr = 0;
+// Text of the expression being processed; printed with %s when non-null.
+TLS_VARIABLE const char * volatile debug_expr_ptr = nullptr;
+// Number of the corpus being scanned.
+TLS_VARIABLE volatile int debug_corpus = 0;
+// Bytes of the corpus being scanned; debug_corpus_len gives their count.
+TLS_VARIABLE const char * volatile debug_corpus_ptr = nullptr;
+TLS_VARIABLE volatile size_t debug_corpus_len = 0;
+
+// Command line of this process, defined elsewhere; printed on a crash.
+extern std::string g_cmdline;
+
+#ifdef HAVE_SIGACTION
+/*
+ * Crash reporter installed for fatal signals. Prints the signal number, the
+ * failing command line, the expression and stage recorded in the debug_*
+ * variables, an escaped dump of the corpus for the ue2/pcre scan stages and
+ * (where available) a backtrace, then terminates with _exit(signum).
+ */
+static void sighandler(int signum) {
+    /* NOTE: printf() and backtrace() are not async-signal-safe. They are
+     * called here on purpose: the handler exists solely to give more
+     * information about a crash in ue2collider, and the process is about to
+     * exit in any case. */
+
+    fflush(stdout);
+    printf("signal %d\n", signum);
+    printf("\nFailing cmdline was:\n%s\n\n", g_cmdline.c_str());
+    printf("expression %d ", debug_expr);
+    switch (debug_stage) {
+    case STAGE_UE2_COMPILE:
+        printf("ue2 compile\n");
+        break;
+    case STAGE_PCRE_COMPILE:
+        printf("pcre compile\n");
+        break;
+    case STAGE_GRAPH_PREPROCESS:
+        printf("graph preprocess\n");
+        break;
+    case STAGE_GRAPH_COMPILE:
+        printf("graph compile\n");
+        break;
+    case STAGE_UE2_RUN:
+        printf("corpus %d ue2 scan\n", debug_corpus);
+        break;
+    case STAGE_PCRE_RUN:
+        printf("corpus %d pcre scan\n", debug_corpus);
+        break;
+    case STAGE_GRAPH_RUN:
+        printf("corpus %d graph scan\n", debug_corpus);
+        break;
+    case STAGE_UNDEFINED:
+    default:
+        printf("unknown stage\n");
+        break;
+    }
+    printf("\n");
+
+    if (debug_expr_ptr != nullptr) {
+        printf("expression %p\n", debug_expr_ptr);
+        printf("%d:%s\n\n", debug_expr, debug_expr_ptr);
+    }
+
+    if (debug_stage == STAGE_PCRE_RUN || debug_stage == STAGE_UE2_RUN) {
+        printf("corpus %p len %zu\n", debug_corpus_ptr, debug_corpus_len);
+
+        // Dump the corpus on one line: printable ASCII as-is, everything
+        // else (and the backslash itself) as an escape sequence.
+        printf("%d:", debug_expr);
+        for (size_t pos = 0; pos < debug_corpus_len && debug_corpus_ptr;
+             pos++) {
+            const unsigned char byte = debug_corpus_ptr[pos];
+            switch (byte) {
+            case '\n':
+                printf("\\n");
+                break;
+            case '\t':
+                printf("\\t");
+                break;
+            case '\r':
+                printf("\\r");
+                break;
+            default:
+                if (byte >= 0x20 && byte <= 0x7e && byte != '\\') {
+                    printf("%c", byte);
+                } else {
+                    printf("\\x%02hhx", byte);
+                }
+                break;
+            }
+        }
+        printf("\n\n");
+    }
+
+    // Flush buffered output before the backtrace is written straight to the
+    // stdout file descriptor.
+    fflush(stdout);
+
+#ifdef HAVE_BACKTRACE
+    static void *frames[BACKTRACE_BUFFER_SIZE];
+    const int depth = backtrace(frames, BACKTRACE_BUFFER_SIZE);
+    if (depth == 0) {
+        printf("(Call to backtrace() returns zero count.)\n");
+    } else {
+        backtrace_symbols_fd(frames, depth, STDOUT_FILENO);
+    }
+#else
+    printf("(Backtrace unavailable on this platform.)\n");
+#endif
+
+    _exit(signum);
+}
+#endif // HAVE_SIGACTION
+
+/*
+ * Routes SIGBUS, SIGFPE, SIGILL, SIGABRT and SIGSEGV to sighandler() and then
+ * calls setSignalStack() for the calling thread. Does nothing when
+ * HAVE_SIGACTION is not defined.
+ */
+void installSignalHandler(void) {
+#ifdef HAVE_SIGACTION
+    // Signals handled by sighandler(), in the order they are installed.
+    static const int crash_signals[] = {SIGBUS, SIGFPE, SIGILL, SIGABRT,
+                                        SIGSEGV};
+
+    struct sigaction handler_cfg;
+    memset(&handler_cfg, 0, sizeof(handler_cfg));
+    handler_cfg.sa_handler = sighandler;
+    handler_cfg.sa_flags = 0;
+
+    // While the handler runs, all of the crash signals are blocked.
+    sigemptyset(&handler_cfg.sa_mask);
+    for (int sig : crash_signals) {
+        sigaddset(&handler_cfg.sa_mask, sig);
+    }
+
+    for (int sig : crash_signals) {
+        sigaction(sig, &handler_cfg, nullptr);
+    }
+    setSignalStack();
+#endif // HAVE_SIGACTION
+}
+
+#ifdef HAVE_SIGALTSTACK
+// Per-thread memory used as the alternate signal stack.
+static TLS_VARIABLE char alt_stack_loc[SIGSTKSZ];
+#endif
+
+/*
+ * Gives the calling thread an alternate signal stack and re-registers the
+ * SIGSEGV handler so that it runs on that stack (SA_ONSTACK) when the stack
+ * could be set up. Does nothing when HAVE_SIGALTSTACK is not defined.
+ */
+void setSignalStack(void) {
+#ifdef HAVE_SIGALTSTACK
+    struct sigaction act;
+    memset(&act, 0, sizeof(act));
+    act.sa_handler = sighandler;
+    act.sa_flags = 0;
+
+    // This call replaces the SIGSEGV disposition, so build the same signal
+    // mask that installSignalHandler() uses. With an empty mask, the other
+    // crash signals would no longer be blocked while the SIGSEGV handler is
+    // running.
+    sigemptyset(&act.sa_mask);
+    sigaddset(&act.sa_mask, SIGSEGV);
+    sigaddset(&act.sa_mask, SIGBUS);
+    sigaddset(&act.sa_mask, SIGFPE);
+    sigaddset(&act.sa_mask, SIGILL);
+    sigaddset(&act.sa_mask, SIGABRT);
+
+    stack_t alt_stack;
+    memset(&alt_stack, 0, sizeof(alt_stack));
+    alt_stack.ss_flags = 0;
+    alt_stack.ss_size = SIGSTKSZ;
+    alt_stack.ss_sp = alt_stack_loc;
+    // Only ask for the alternate stack if it was actually installed.
+    if (!sigaltstack(&alt_stack, nullptr)) {
+        act.sa_flags |= SA_ONSTACK;
+    }
+    sigaction(SIGSEGV, &act, nullptr);
+#endif
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SIG_H
+#define SIG_H
+
+#include <cstddef> // for size_t
+
+// Values for debug_stage, naming the step a thread is currently performing.
+#define STAGE_UNDEFINED 0
+#define STAGE_UE2_COMPILE 1
+#define STAGE_UE2_RUN 2
+#define STAGE_PCRE_COMPILE 3
+#define STAGE_PCRE_RUN 4
+#define STAGE_GRAPH_PREPROCESS 5
+#define STAGE_GRAPH_COMPILE 6
+#define STAGE_GRAPH_RUN 7
+
+// Storage-class specifier applied to the debug_* variables below.
+#define TLS_VARIABLE __thread
+
+// State reported by the crash handler: current stage, expression number and
+// text, corpus number, and corpus bytes with their length.
+extern TLS_VARIABLE volatile int debug_stage;
+extern TLS_VARIABLE volatile int debug_expr;
+extern TLS_VARIABLE const char * volatile debug_expr_ptr;
+extern TLS_VARIABLE volatile int debug_corpus;
+extern TLS_VARIABLE const char * volatile debug_corpus_ptr;
+extern TLS_VARIABLE volatile size_t debug_corpus_len;
+
+// Installs the crash-reporting handler for fatal signals and sets up the
+// signal stack for the calling thread.
+void installSignalHandler(void);
+
+// Must be called by every thread.
+void setSignalStack(void);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SIMPLE_TIMER_H
+#define SIMPLE_TIMER_H
+
+#include <chrono>
+
+/**
+ * Minimal stopwatch: starts timing on construction and reports the time
+ * elapsed since then.
+ *
+ * The member functions are defined inside the class, which makes them
+ * implicitly inline, so this header can be included from several translation
+ * units without multiple-definition link errors. A steady (monotonic) clock
+ * is used so that wall-clock adjustments cannot distort the measurement.
+ */
+class SimpleTimer {
+private:
+    typedef std::chrono::steady_clock clock_type;
+
+public:
+    /** Starts the timer. */
+    SimpleTimer() : start(clock_type::now()) {}
+
+    /** Returns the number of seconds since construction. */
+    double elapsed() const {
+        const std::chrono::duration<double> delta = clock_type::now() - start;
+        return delta.count();
+    }
+
+private:
+    clock_type::time_point start;
+};
+
+#endif // SIMPLE_TIMER_H