Removed static specialisation of domains.
def build_fdr_matchers():
all_matchers = [ ]
- domains = [8, 10, 11, 12, 13]
- big_domains = [ 14, 15 ]
+ strides = [ 1, 2, 4 ]
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
- for d in domains:
- all_matchers += [ M3(stride = 1, domain = d, **common) ]
- all_matchers += [ M3(stride = 2, domain = d, **common) ]
- all_matchers += [ M3(stride = 4, domain = d, **common) ]
- for d in big_domains:
- all_matchers += [ M3(stride = 1, domain = d, **common) ]
+ for s in strides:
+ all_matchers += [ M3(stride = s, **common) ]
return all_matchers
#include "fdr_confirm_runtime.h"
#include "fdr_streaming_runtime.h"
#include "fdr_loadval.h"
-
-static really_inline UNUSED
-u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
- u32 r = 0;
- if (a->start_offset == 0) {
- if (numBits <= 8) {
- r = a->buf_history[a->len_history - 1];
- } else {
- r = a->buf_history[a->len_history - 1];
- r |= (a->buf[0] << 8);
- }
- } else {
- if (numBits <= 8) {
- r = a->buf[a->start_offset - 1];
- } else {
- r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
- }
- }
- return r & ((1 << numBits) - 1);
-}
-
#include "fdr_autogen.c"
#define FAKE_HISTORY_SIZE 16
dsb = m.datasize_bytes
modval = offset % dsb
- if m.domain > 8 and modval == dsb - 1:
+ if modval == dsb - 1:
# Case 1: reading more than one byte over the end of the bulk load
self.latency = 4
if sub_load_cautious:
- code_string = "cautious_forward"
+ code_string = "cautious_forward"
else:
code_string = "normal"
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
- init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+ init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
v_var = self.nv(m.value_extract_type, "v%d" % offset)
self.val = v_var.gen_initializer_stmt(init_string)
enable_confirmless = m.stride == 1, do_bailout = False)
class M3(MatcherBase):
- def get_hash_safety_parameters(self):
- h_size = self.single_load_type.size_in_bytes()
- return (0, h_size - 1)
-
def produce_compile_call(self):
- print " { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+ print " { %d, %d, %d, %d, %s, %d, %d }," % (
self.id, self.state_width, self.num_buckets,
- self.stride, self.domain,
+ self.stride,
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
def produce_main_loop(self, switch_variant = False):
ctxt = CodeGenContext(self)
if switch_variant:
- print " ptr -= (iterBytes - dist);"
- print " { " # need an extra scope around switch variant to stop its globals escaping
+ print " ptr -= (iterBytes - dist);"
+ print " { " # need an extra scope around switch variant to stop its globals escaping
else:
print " if (doMainLoop) {"
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
s = Template("""
- $TYPENAME s;
- if (a->len_history) {
- u32 tmp = getPreStartVal(a, $DOMAIN);
- s = *((const $TYPENAME *)ft + tmp);
- $SHIFT_EXPR;
- } else {
- s = *(const $TYPENAME *)&fdr->start;
- }
+ $TYPENAME s;
+ if (a->len_history) {
+ u32 tmp = 0;
+ if (a->start_offset == 0) {
+ tmp = a->buf_history[a->len_history - 1];
+ tmp |= (a->buf[0] << 8);
+ } else {
+ tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
+ }
+ tmp &= fdr->domainMask;
+ s = *((const $TYPENAME *)ft + tmp);
+ $SHIFT_EXPR;
+ } else {
+ s = *(const $TYPENAME *)&fdr->start;
+ }
""").substitute(TYPENAME = s_type.get_name(),
ZERO_EXPR = s_type.zero_expression(),
- DOMAIN = self.domain,
SHIFT_EXPR = shift_expr)
return s
def produce_code(self):
- (behind, ahead) = self.get_hash_safety_parameters()
- loop_read_behind = behind
- loop_read_ahead = self.loop_bytes + ahead
+ loop_read_behind = 0
+ loop_read_ahead = self.loop_bytes + 1
# we set up mask and shift stuff for extracting our masks from registers
#
ssb = self.state_type.size / 8 # state size in bytes
# Intel path
- if ssb == 16 and self.domain == 16:
+ if ssb == 16:
# obscure corner - we don't have the room in the register to
# do this for all values so we don't. domain==16 is pretty
# bad anyhow, of course
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
- self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
print self.produce_header(visible = False)
print " Arch: " + self.arch.name,
print " State type: " + self.state_type.get_name(),
print " Num buckets: %d" % self.num_buckets,
- print " Domain: %d" % self.domain,
print " Stride: %d" % self.stride
print self.produce_common_declarations()
- print
- print "\tconst size_t tabSize = %d;" % self.table_size
- print """
- const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
- const u32 * confBase = (const u32 *)(ft + tabSize);
-"""
+ print " assert(fdr->domain > 8 && fdr->domain < 16);"
+ print
+ print " u64a domain_mask = fdr->domainMask;"
+ print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
+ print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
print self.produce_init_state()
- print "\tconst size_t iterBytes = %d;" % self.loop_bytes
- print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
- print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+ print " const size_t iterBytes = %d;" % self.loop_bytes
+ print " const size_t START_MOD = %d;" % self.datasize_bytes
+ print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print """
while (ptr < buf + len) {
print self.produce_footer()
def get_name(self):
- return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+ return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
- def __init__(self, state_width, domain, stride,
+ def __init__(self, state_width, stride,
arch,
table_state_width = None,
num_buckets = 8,
self.table_state_width = state_width
self.table_state_type = getRequiredType(self.table_state_width)
- # domain is the number of bits that we draw from our input to
- # index our 'reach' table
- if not 8 <= domain <= 16:
- fail_out("Unsupported domain: %d" % domain)
- self.domain = domain
- # this is the load type required for this domain if we want to
+ # this is the load type required for domain [9:15] if we want to
# load it one at a time
- self.single_load_type = getRequiredType(self.domain)
-
- # table size
- self.table_size = 2**domain * table_state_width // 8
+ self.single_load_type = IntegerType(16)
# stride is the frequency with which we make data-driven
# accesses to our reach table
ptr += floodControlTmp.second;
aligned_free(floodControlTmp.first);
+ /* we are allowing domains 9 to 15 only */
+ assert(eng.bits > 8 && eng.bits < 16);
+ fdr->domain = eng.bits;
+ fdr->schemeWidthByte = eng.schemeWidth / 8;
+ fdr->domainMask = (1 << eng.bits) - 1;
+ fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
+
if (link.first) {
fdr->link = verify_u32(ptr - fdr_base);
memcpy(ptr, link.first, link.second);
return nullptr;
}
+ // temporary hack for unit testing
+ if (hint != HINT_INVALID) {
+ des->bits = 9;
+ }
+
FDRCompiler fc(lits, *des, make_small);
return fc.build(link);
}
unique_ptr<FDREngineDescription> des =
getFdrDescription(fdr->engineID);
if (des) {
+ fprintf(f, " domain %u\n", des->bits);
fprintf(f, " stride %u\n", des->stride);
fprintf(f, " buckets %u\n", des->getNumBuckets());
fprintf(f, " width %u\n", des->schemeWidth);
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit),
- schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
+ schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
desiredStride);
- const FDREngineDescription *best = nullptr;
+ FDREngineDescription *best = nullptr;
u32 best_score = 0;
- for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
- const FDREngineDescription &eng = allDescs[engineID];
- if (!eng.isValidOnTarget(target)) {
- continue;
- }
- if (msl < eng.stride) {
- continue;
- }
+ for (u32 domain = 9; domain <= 15; domain++) {
+ for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
+ // domains >= 14 are only tried with engine 0 (presumably the stride-1 engine), matching the original static set where domains 14 and 15 were built with stride 1 only
+ if (domain > 13 && engineID > 0) {
+ continue;
+ }
+ FDREngineDescription &eng = allDescs[engineID];
+ if (!eng.isValidOnTarget(target)) {
+ continue;
+ }
+ if (msl < eng.stride) {
+ continue;
+ }
- u32 score = 100;
+ u32 score = 100;
- score -= absdiff(desiredStride, eng.stride);
+ score -= absdiff(desiredStride, eng.stride);
- if (eng.stride <= desiredStride) {
- score += eng.stride;
- }
+ if (eng.stride <= desiredStride) {
+ score += eng.stride;
+ }
- u32 effLits = vl.size(); /* * desiredStride;*/
- u32 ideal;
- if (effLits < eng.getNumBuckets()) {
- if (eng.stride == 1) {
- ideal = 8;
- } else {
+ u32 effLits = vl.size(); /* * desiredStride;*/
+ u32 ideal;
+ if (effLits < eng.getNumBuckets()) {
+ if (eng.stride == 1) {
+ ideal = 8;
+ } else {
+ ideal = 10;
+ }
+ } else if (effLits < 20) {
ideal = 10;
+ } else if (effLits < 100) {
+ ideal = 11;
+ } else if (effLits < 1000) {
+ ideal = 12;
+ } else if (effLits < 10000) {
+ ideal = 13;
+ } else {
+ ideal = 15;
}
- } else if (effLits < 20) {
- ideal = 10;
- } else if (effLits < 100) {
- ideal = 11;
- } else if (effLits < 1000) {
- ideal = 12;
- } else if (effLits < 10000) {
- ideal = 13;
- } else {
- ideal = 15;
- }
- if (ideal != 8 && eng.schemeWidth == 32) {
- ideal += 1;
- }
+ if (ideal != 8 && eng.schemeWidth == 32) {
+ ideal += 1;
+ }
- if (make_small) {
- ideal -= 2;
- }
+ if (make_small) {
+ ideal -= 2;
+ }
- if (eng.stride > 1) {
- ideal++;
- }
+ if (eng.stride > 1) {
+ ideal++;
+ }
- DEBUG_PRINTF("effLits %u\n", effLits);
+ DEBUG_PRINTF("effLits %u\n", effLits);
- if (target.is_atom_class() && !make_small && effLits < 4000) {
- /* Unless it is a very heavy case, we want to build smaller tables
- * on lightweight machines due to their small caches. */
- ideal -= 2;
- }
+ if (target.is_atom_class() && !make_small && effLits < 4000) {
+ /* Unless it is a very heavy case, we want to build smaller tables
+ * on lightweight machines due to their small caches. */
+ ideal -= 2;
+ }
- score -= absdiff(ideal, eng.bits);
+ score -= absdiff(ideal, domain);
- DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
- "-> score=%u\n",
- eng.getID(), eng.schemeWidth, eng.bits,
- eng.getNumBuckets(), eng.stride, score);
+ DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
+ "-> score=%u\n",
+ eng.getID(), eng.schemeWidth, eng.bits,
+ eng.getNumBuckets(), eng.stride, score);
- if (!best || score > best_score) {
- best = &eng;
- best_score = score;
+ if (!best || score > best_score) {
+ eng.bits = domain;
+ best = &eng;
+ best_score = score;
+ }
}
}
u32 schemeWidth;
u32 numBuckets;
u32 stride;
- u32 bits;
u64a cpu_features;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;
* structures (spillover strings and hash table) if we're a secondary
* structure. */
u32 link;
+ u8 domain; /* dynamic domain info */
+ u8 schemeWidthByte; /* scheme width in bytes */
+ u16 domainMask; /* pre-computed domain mask */
+ u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad1;
- u32 pad2;
- u32 pad3;
union {
u32 s_u32;