FDR runtime simplification

author Mohammad Abdul Awal <mohammad.abdul.awal@intel.com>

Tue, 17 Nov 2015 17:50:23 +0000 (17:50 +0000)

committer Matthew Barr <matthew.barr@intel.com>

Fri, 20 Nov 2015 03:44:43 +0000 (14:44 +1100)
author Mohammad Abdul Awal <mohammad.abdul.awal@intel.com>
Tue, 17 Nov 2015 17:50:23 +0000 (17:50 +0000)
committer Matthew Barr <matthew.barr@intel.com>
Fri, 20 Nov 2015 03:44:43 +0000 (14:44 +1100)
diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py

index 36e4c16c36b22e8e2e20933b2297d7ac25e5f19f..e5b4f39e7de361a349004af1e470f7a9ad59e57b 100755 (executable)
--- a/src/fdr/autogen.py
+++ b/src/fdr/autogen.py
@@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
  
  def build_fdr_matchers():
      all_matchers = [ ]
-    domains = [8, 10, 11, 12, 13]
-    big_domains = [ 14, 15 ]
+    strides = [ 1, 2, 4 ]
  
      common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
-    for d in domains:
-        all_matchers += [ M3(stride = 1, domain = d, **common) ]
-        all_matchers += [ M3(stride = 2, domain = d, **common) ]
-        all_matchers += [ M3(stride = 4, domain = d, **common) ]
-    for d in big_domains:
-        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+    for s in strides:
+        all_matchers += [ M3(stride = s, **common) ]
  
      return all_matchers
  
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c

index 082800f10ce98976ffc99e6132b40b09a32d53ab..f83a42652191dbad82b63c4dc118c2bab3c564d2 100644 (file)
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -40,27 +40,6 @@
  #include "fdr_confirm_runtime.h"
  #include "fdr_streaming_runtime.h"
  #include "fdr_loadval.h"
-
-static really_inline UNUSED
-u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
-    u32 r = 0;
-    if (a->start_offset == 0) {
-        if (numBits <= 8) {
-            r = a->buf_history[a->len_history - 1];
-        } else {
-            r = a->buf_history[a->len_history - 1];
-            r |= (a->buf[0] << 8);
-        }
-    } else {
-        if (numBits <= 8) {
-            r = a->buf[a->start_offset - 1];
-        } else {
-            r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
-        }
-    }
-    return r & ((1 << numBits) - 1);
-}
-
  #include "fdr_autogen.c"
  
  #define FAKE_HISTORY_SIZE 16
diff --git a/src/fdr/fdr_autogen.py b/src/fdr/fdr_autogen.py

index 685cca3b84290874ac5ca2e923d6c05cb448d855..748d811f2bd850eb3b8a1699bd3658617e8e3da6 100755 (executable)
--- a/src/fdr/fdr_autogen.py
+++ b/src/fdr/fdr_autogen.py
@@ -74,12 +74,12 @@ class ValueExtractStep(Step):
          dsb = m.datasize_bytes
          modval = offset % dsb
  
-        if m.domain > 8 and modval == dsb - 1:
+        if modval == dsb - 1:
              # Case 1: reading more than one byte over the end of the bulk load
  
              self.latency = 4
              if sub_load_cautious:
-                code_string = "cautious_forward" 
+                code_string = "cautious_forward"
              else:
                  code_string = "normal"
              load_string = m.single_load_type.load_expr_data(self.offset, code_string)
@@ -101,7 +101,7 @@ class ValueExtractStep(Step):
                      temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
  
  
-        init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+        init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
          v_var = self.nv(m.value_extract_type, "v%d" % offset)
          self.val = v_var.gen_initializer_stmt(init_string)
  
@@ -173,14 +173,10 @@ class ConfirmStep(Step):
                                            enable_confirmless = m.stride == 1, do_bailout = False)
  
  class M3(MatcherBase):
-    def get_hash_safety_parameters(self):
-        h_size = self.single_load_type.size_in_bytes()
-        return (0, h_size - 1)
-
      def produce_compile_call(self):
-        print "    { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+        print "    { %d, %d, %d, %d, %s, %d, %d }," % (
                self.id, self.state_width, self.num_buckets,
-              self.stride, self.domain,
+              self.stride,
                self.arch.target, self.conf_pull_back, self.conf_top_level_split)
  
      def produce_main_loop(self, switch_variant = False):
@@ -192,8 +188,8 @@ class M3(MatcherBase):
          ctxt = CodeGenContext(self)
  
          if switch_variant:
-            print " ptr -= (iterBytes - dist);"
-            print " { " # need an extra scope around switch variant to stop its globals escaping
+            print "    ptr -= (iterBytes - dist);"
+            print "    { " # need an extra scope around switch variant to stop its globals escaping
          else:
              print "    if (doMainLoop) {"
              print "    for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
@@ -349,25 +345,30 @@ class M3(MatcherBase):
          shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
  
          s = Template("""
-            $TYPENAME s;
-            if (a->len_history) {
-                u32 tmp = getPreStartVal(a, $DOMAIN);
-                s = *((const $TYPENAME *)ft + tmp);
-                $SHIFT_EXPR;
-            } else {
-                s = *(const $TYPENAME *)&fdr->start;
-            }
+    $TYPENAME s;
+    if (a->len_history) {
+        u32 tmp = 0;
+        if (a->start_offset == 0) {
+            tmp = a->buf_history[a->len_history - 1];
+            tmp |= (a->buf[0] << 8);
+        } else {
+            tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
+        }
+        tmp &= fdr->domainMask;
+        s = *((const $TYPENAME *)ft + tmp);
+        $SHIFT_EXPR;
+    } else {
+        s = *(const $TYPENAME *)&fdr->start;
+    }
  """).substitute(TYPENAME = s_type.get_name(),
                  ZERO_EXPR = s_type.zero_expression(),
-                DOMAIN = self.domain,
                  SHIFT_EXPR = shift_expr)
          return s
  
      def produce_code(self):
  
-        (behind, ahead) = self.get_hash_safety_parameters()
-        loop_read_behind = behind
-        loop_read_ahead = self.loop_bytes + ahead
+        loop_read_behind = 0
+        loop_read_ahead = self.loop_bytes + 1
  
          # we set up mask and shift stuff for extracting our masks from registers
          #
@@ -380,7 +381,7 @@ class M3(MatcherBase):
          ssb = self.state_type.size / 8 # state size in bytes
  
          # Intel path
-        if ssb == 16 and self.domain == 16:
+        if ssb == 16:
              # obscure corner - we don't have the room in the register to
              # do this for all values so we don't. domain==16 is pretty
              # bad anyhow, of course
@@ -390,7 +391,6 @@ class M3(MatcherBase):
  
          shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
          self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
-        self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
  
          print self.produce_header(visible = False)
  
@@ -398,21 +398,19 @@ class M3(MatcherBase):
          print " Arch: " + self.arch.name,
          print " State type: " + self.state_type.get_name(),
          print " Num buckets: %d" % self.num_buckets,
-        print " Domain: %d" % self.domain,
          print " Stride: %d" % self.stride
  
          print self.produce_common_declarations()
-        print
  
-        print "\tconst size_t tabSize = %d;" % self.table_size
-        print """
-    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
-    const u32 * confBase = (const u32 *)(ft + tabSize);
-"""
+        print "    assert(fdr->domain > 8 && fdr->domain < 16);"
+        print
+        print "    u64a domain_mask = fdr->domainMask;"
+        print "    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
+        print "    const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
          print self.produce_init_state()
-        print "\tconst size_t iterBytes = %d;" % self.loop_bytes
-        print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
-        print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+        print "    const size_t iterBytes = %d;" % self.loop_bytes
+        print "    const size_t START_MOD = %d;" % self.datasize_bytes
+        print "    const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
  
          print """
      while (ptr < buf + len) {
@@ -451,9 +449,9 @@ class M3(MatcherBase):
          print self.produce_footer()
  
      def get_name(self):
-        return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+        return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
  
-    def __init__(self, state_width, domain, stride,
+    def __init__(self, state_width, stride,
                   arch,
                   table_state_width = None,
                   num_buckets = 8,
@@ -474,17 +472,9 @@ class M3(MatcherBase):
          self.table_state_width = state_width
          self.table_state_type = getRequiredType(self.table_state_width)
  
-        # domain is the number of bits that we draw from our input to
-        # index our 'reach' table
-        if not 8 <= domain <= 16:
-            fail_out("Unsupported domain: %d" % domain)
-        self.domain = domain
-        # this is the load type required for this domain if we want to
+        # this is the load type required for domains 9 to 15 (inclusive) if we want to
          # load it one at a time
-        self.single_load_type = getRequiredType(self.domain)
-
-        # table size
-        self.table_size = 2**domain * table_state_width // 8
+        self.single_load_type = IntegerType(16)
  
          # stride is the frequency with which we make data-driven
          # accesses to our reach table
diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp

index 8be443708f8f516555c696819d8276509e8e8016..ccf176267f4a941276899fa47262d78c3c81a24f 100644 (file)
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@@ -184,6 +184,13 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
      ptr += floodControlTmp.second;
      aligned_free(floodControlTmp.first);
  
+    /*  we are allowing domains 9 to 15 only */
+    assert(eng.bits > 8 && eng.bits < 16);
+    fdr->domain = eng.bits;
+    fdr->schemeWidthByte = eng.schemeWidth / 8;
+    fdr->domainMask = (1 << eng.bits) - 1;
+    fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
+
      if (link.first) {
          fdr->link = verify_u32(ptr - fdr_base);
          memcpy(ptr, link.first, link.second);
@@ -534,6 +541,11 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
          return nullptr;
      }
  
+    // temporary hack for unit testing: force a 9-bit domain when an engine hint is given
+    if (hint != HINT_INVALID) {
+        des->bits = 9;
+    }
+
      FDRCompiler fc(lits, *des, make_small);
      return fc.build(link);
  }
diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp

index ae246270ff6767b5fb75baa48355267fcb005ebc..158170c2655d4f07d64d5a325473eb2540273523 100644 (file)
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
          unique_ptr<FDREngineDescription> des =
              getFdrDescription(fdr->engineID);
          if (des) {
+            fprintf(f, "    domain     %u\n", des->bits);
              fprintf(f, "    stride     %u\n", des->stride);
              fprintf(f, "    buckets    %u\n", des->getNumBuckets());
              fprintf(f, "    width      %u\n", des->schemeWidth);
diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp

index 2a6fda790f966113413aab9a2c5c4a2f25d3cdc9..5d470c7e221f25aab7b3b98231d25b9c8b2ecd84 100644 (file)
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
      : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
                          def.numBuckets, def.confirmPullBackDistance,
                          def.confirmTopLevelSplit),
-      schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
+      schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
  
  u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
      // rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@@ -105,76 +105,83 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
      DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
                   desiredStride);
  
-    const FDREngineDescription *best = nullptr;
+    FDREngineDescription *best = nullptr;
      u32 best_score = 0;
  
-    for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
-        const FDREngineDescription &eng = allDescs[engineID];
-        if (!eng.isValidOnTarget(target)) {
-            continue;
-        }
-        if (msl < eng.stride) {
-            continue;
-        }
+    for (u32 domain = 9; domain <= 15; domain++) {
+        for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
+            // domains >= 14 keep stride 1 only (engine 0), as in the original tables
+            if (domain > 13 && engineID > 0) {
+                continue;
+            }
+            FDREngineDescription &eng = allDescs[engineID];
+            if (!eng.isValidOnTarget(target)) {
+                continue;
+            }
+            if (msl < eng.stride) {
+                continue;
+            }
  
-        u32 score = 100;
+            u32 score = 100;
  
-        score -= absdiff(desiredStride, eng.stride);
+            score -= absdiff(desiredStride, eng.stride);
  
-        if (eng.stride <= desiredStride) {
-            score += eng.stride;
-        }
+            if (eng.stride <= desiredStride) {
+                score += eng.stride;
+            }
  
-        u32 effLits = vl.size(); /* * desiredStride;*/
-        u32 ideal;
-        if (effLits < eng.getNumBuckets()) {
-            if (eng.stride == 1) {
-                ideal = 8;
-            } else {
+            u32 effLits = vl.size(); /* * desiredStride;*/
+            u32 ideal;
+            if (effLits < eng.getNumBuckets()) {
+                if (eng.stride == 1) {
+                    ideal = 8;
+                } else {
+                    ideal = 10;
+                }
+            } else if (effLits < 20) {
                  ideal = 10;
+            } else if (effLits < 100) {
+                ideal = 11;
+            } else if (effLits < 1000) {
+                ideal = 12;
+            } else if (effLits < 10000) {
+                ideal = 13;
+            } else {
+                ideal = 15;
              }
-        } else if (effLits < 20) {
-            ideal = 10;
-        } else if (effLits < 100) {
-            ideal = 11;
-        } else if (effLits < 1000) {
-            ideal = 12;
-        } else if (effLits < 10000) {
-            ideal = 13;
-        } else {
-            ideal = 15;
-        }
  
-        if (ideal != 8 && eng.schemeWidth == 32) {
-            ideal += 1;
-        }
+            if (ideal != 8 && eng.schemeWidth == 32) {
+                ideal += 1;
+            }
  
-        if (make_small) {
-            ideal -= 2;
-        }
+            if (make_small) {
+                ideal -= 2;
+            }
  
-        if (eng.stride > 1) {
-            ideal++;
-        }
+            if (eng.stride > 1) {
+                ideal++;
+            }
  
-        DEBUG_PRINTF("effLits %u\n", effLits);
+            DEBUG_PRINTF("effLits %u\n", effLits);
  
-        if (target.is_atom_class() && !make_small && effLits < 4000) {
-            /* Unless it is a very heavy case, we want to build smaller tables
-             * on lightweight machines due to their small caches. */
-            ideal -= 2;
-        }
+            if (target.is_atom_class() && !make_small && effLits < 4000) {
+                /* Unless it is a very heavy case, we want to build smaller tables
+                 * on lightweight machines due to their small caches. */
+                ideal -= 2;
+            }
  
-        score -= absdiff(ideal, eng.bits);
+            score -= absdiff(ideal, domain);
  
-        DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
-                     "-> score=%u\n",
-                     eng.getID(), eng.schemeWidth, eng.bits,
-                     eng.getNumBuckets(), eng.stride, score);
+            DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
+                         "-> score=%u\n",
+                         eng.getID(), eng.schemeWidth, eng.bits,
+                         eng.getNumBuckets(), eng.stride, score);
  
-        if (!best || score > best_score) {
-            best = &eng;
-            best_score = score;
+            if (!best || score > best_score) {
+                eng.bits = domain;
+                best = &eng;
+                best_score = score;
+            }
          }
      }
  
diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h

index d936095b1dff625696457955da76d16e1acb3ea7..45f64ac0f9a67db1c85c6d0d6b87c067a83aeead 100644 (file)
--- a/src/fdr/fdr_engine_description.h
+++ b/src/fdr/fdr_engine_description.h
@@ -43,7 +43,6 @@ struct FDREngineDef {
      u32 schemeWidth;
      u32 numBuckets;
      u32 stride;
-    u32 bits;
      u64a cpu_features;
      u32 confirmPullBackDistance;
      u32 confirmTopLevelSplit;
diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h

index 6c7227779e696d083b40d85a67f449f2a8042f2a..607e039c8b6ad999746f9e5f5f07fe89a1bbf542 100644 (file)
--- a/src/fdr/fdr_internal.h
+++ b/src/fdr/fdr_internal.h
@@ -76,9 +76,11 @@ struct FDR {
       * structures (spillover strings and hash table) if we're a secondary
       * structure. */
      u32 link;
+    u8 domain; /* dynamic domain info */
+    u8 schemeWidthByte;  /* scheme width in bytes */
+    u16 domainMask; /* pre-computed domain mask */
+    u32 tabSize; /* pre-computed hashtable size in bytes */
      u32 pad1;
-    u32 pad2;
-    u32 pad3;
  
      union {
          u32 s_u32;
author	Mohammad Abdul Awal <mohammad.abdul.awal@intel.com>
	Tue, 17 Nov 2015 17:50:23 +0000 (17:50 +0000)
committer	Matthew Barr <matthew.barr@intel.com>
	Fri, 20 Nov 2015 03:44:43 +0000 (14:44 +1100)
src/fdr/autogen.py		patch \| blob \| blame \| history
src/fdr/fdr.c		patch \| blob \| blame \| history
src/fdr/fdr_autogen.py		patch \| blob \| blame \| history
src/fdr/fdr_compile.cpp		patch \| blob \| blame \| history
src/fdr/fdr_dump.cpp		patch \| blob \| blame \| history
src/fdr/fdr_engine_description.cpp		patch \| blob \| blame \| history
src/fdr/fdr_engine_description.h		patch \| blob \| blame \| history
src/fdr/fdr_internal.h		patch \| blob \| blame \| history