failures, but they can be fixed up in-repo. This resolves bug 198395.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10444
helgrind \
drd
-EXP_TOOLS = exp-ptrcheck
+EXP_TOOLS = exp-ptrcheck \
+ exp-bbv
# DDD: once all tools work on Darwin, TEST_TOOLS and TEST_EXP_TOOLS can be
# replaced with TOOLS and EXP_TOOLS.
lackey \
none
- TEST_EXP_TOOLS =
+ TEST_EXP_TOOLS = exp-bbv
endif
# Put docs last because building the HTML is slow and we want to get
* XXX: something about improved Wine support?
+* XXX: exp-bbv has been added...
+
* A new Memcheck client request VALGRIND_COUNT_LEAK_BLOCKS has been added.
It is similar to VALGRIND_COUNT_LEAKS but counts blocks instead of bytes.
[XXX: consider adding VALGRIND_COUNT_LEAK_BYTES as a synonym and
drd/docs/Makefile
drd/scripts/download-and-build-splash2
drd/tests/Makefile
+ exp-bbv/Makefile
+ exp-bbv/docs/Makefile
+ exp-bbv/tests/Makefile
+ exp-bbv/tests/x86/Makefile
+ exp-bbv/tests/x86-linux/Makefile
+ exp-bbv/tests/amd64-linux/Makefile
+ exp-bbv/tests/ppc32-linux/Makefile
])
AC_OUTPUT
xmlns:xi="http://www.w3.org/2001/XInclude" />
<xi:include href="../../exp-ptrcheck/docs/pc-manual.xml" parse="xml"
xmlns:xi="http://www.w3.org/2001/XInclude" />
+ <xi:include href="../../exp-bbv/docs/bbv-manual.xml" parse="xml"
+ xmlns:xi="http://www.w3.org/2001/XInclude" />
<xi:include href="../../none/docs/nl-manual.xml" parse="xml"
xmlns:xi="http://www.w3.org/2001/XInclude" />
<xi:include href="../../lackey/docs/lk-manual.xml" parse="xml"
</refsect1>
+<refsect1 id="bbv-options">
+<title>BBV Options</title>
+
+<xi:include href="../../exp-bbv/docs/bbv-manual.xml"
+ xpointer="bbv.opts.list"
+ xmlns:xi="http://www.w3.org/2001/XInclude" />
+
+</refsect1>
<refsect1 id="lackey-options">
</refsect1>
-
<refsect1 id="see_also">
<title>See Also</title>
--- /dev/null
+include $(top_srcdir)/Makefile.tool.am
+
+#----------------------------------------------------------------------------
+# exp-bbv-<platform>
+#----------------------------------------------------------------------------
+
+noinst_PROGRAMS = exp-bbv-@VGCONF_ARCH_PRI@-@VGCONF_OS@
+if VGCONF_HAVE_PLATFORM_SEC
+noinst_PROGRAMS += exp-bbv-@VGCONF_ARCH_SEC@-@VGCONF_OS@
+endif
+
+BBV_SOURCES_COMMON = bbv_main.c
+
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = $(BBV_SOURCES_COMMON)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS = \
+ $(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS = \
+ $(AM_CFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_DEPENDENCIES = \
+ $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDADD = \
+ $(TOOL_LDADD_@VGCONF_PLATFORM_PRI_CAPS@)
+exp_bbv_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS = \
+ $(TOOL_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@)
+if VGCONF_HAVE_PLATFORM_SEC
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES = $(BBV_SOURCES_COMMON)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS = \
+ $(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS = \
+ $(AM_CFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_DEPENDENCIES = \
+ $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDADD = \
+ $(TOOL_LDADD_@VGCONF_PLATFORM_SEC_CAPS@)
+exp_bbv_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS = \
+ $(TOOL_LDFLAGS_@VGCONF_PLATFORM_SEC_CAPS@)
+endif
--- /dev/null
+//--------------------------------------------------------------------*/
+//--- BBV: a SimPoint basic block vector generator bbv_main.c ---*/
+//--------------------------------------------------------------------*/
+
+/*
+ This file is part of BBV, a Valgrind tool for generating SimPoint
+ basic block vectors.
+
+ Copyright (C) 2006-2009 Vince Weaver
+ vince _at_ csl.cornell.edu
+
+ pcfile code is Copyright (C) 2006-2009 Oriol Prat
+ oriol.prat _at _ bsc.es
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+
+#include "pub_tool_basics.h"
+#include "pub_tool_tooliface.h"
+#include "pub_tool_options.h" /* command line options */
+
+#include "pub_tool_vki.h" /* vki_stat */
+#include "pub_tool_libcbase.h" /* VG_(strlen) */
+#include "pub_tool_libcfile.h" /* VG_(write) */
+#include "pub_tool_libcprint.h" /* VG_(printf) */
+#include "pub_tool_libcassert.h" /* VG_(exit) */
+#include "pub_tool_mallocfree.h" /* plain_free */
+#include "pub_tool_machine.h" /* VG_(fnptr_to_fnentry) */
+#include "pub_tool_debuginfo.h" /* VG_(get_fnname) */
+
+#include "pub_tool_oset.h" /* ordered set stuff */
+
+ /* instruction special cases: bit flags returned by get_inst_type() */
+#define REP_INSTRUCTION   0x1
+#define FLDCW_INSTRUCTION 0x2
+
+ /* interval variables */
+#define DEFAULT_GRAIN_SIZE 100000000 /* 100 million by default */
+static Int interval_size=DEFAULT_GRAIN_SIZE; /* set by --interval-size */
+
+ /* filenames: the clo_* variables hold the patterns taken from the   */
+ /* command line, the other two the names returned by                 */
+ /* VG_(expand_file_name)                                             */
+static UChar *clo_bb_out_file="bb.out.%p";
+static UChar *clo_pc_out_file="pc.out.%p";
+static UChar *pc_out_file=NULL;
+static UChar *bb_out_file=NULL;
+
+
+ /* output parameters */
+static Bool instr_count_only=False; /* --instr-count-only given */
+static Bool generate_pc_file=False; /* set when --pc-out-file is given */
+
+ /* write buffer: scratch space shared by all output formatting */
+static UChar buf[1024];
+
+ /* Global values */
+static OSet* instr_info_table; /* table that holds the basic block info */
+static Int block_num=1; /* global next block number */
+static Int current_thread=0; /* thread id last passed to bbv_thread_called */
+static Int allocated_threads=1; /* number of entries in bbv_thread */
+ /* Per-thread state, indexed by thread id.  Only used inside this    */
+ /* file, so it gets internal linkage like every other global here.   */
+static struct thread_info *bbv_thread=NULL;
+
+ /* Per-thread variables */
+ /* One entry per thread id lives in the bbv_thread array; entries    */
+ /* are initialised by allocate_new_thread().                         */
+struct thread_info {
+ ULong dyn_instr; /* Current retired instruction count */
+ /* dyn_instr is reduced by interval_size each time handle_overflow() */
+ /* closes an interval, so it counts within the current interval      */
+ ULong total_instr; /* Total retired instruction count */
+ Addr last_rep_addr; /* rep counting values */
+ /* iterations seen since the last close_out_reps() */
+ ULong rep_count;
+ /* sum of every rep_count folded in by close_out_reps() */
+ ULong global_rep_count;
+ /* number of times close_out_reps() ran for this thread */
+ ULong unique_rep_count;
+ ULong fldcw_count; /* fldcw count */
+ /* stays at -1 until the trace file is opened on first use */
+ Int bbtrace_fd; /* file descriptor */
+};
+
+ /* Size in bytes of the buffer that holds a block's function name */
+#define FUNCTION_NAME_LENGTH 20
+
+ /* One node of instr_info_table, created the first time a block     */
+ /* starting at BB_addr is instrumented.                             */
+struct BB_info {
+ Addr BB_addr; /* used as key, must be first */
+ /* bbv_instrument() currently always stores 1 in n_instrs */
+ Int n_instrs; /* instructions in the basic block */
+ Int block_num; /* unique block identifier */
+ /* array with one counter per thread, indexed by thread id and      */
+ /* grown by allocate_new_thread()                                   */
+ Int *inst_counter; /* times entered * num_instructions */
+ Bool is_entry; /* is this block a function entry point */
+ UChar fn_name[FUNCTION_NAME_LENGTH]; /* Function block is in */
+};
+
+
+ /* Dump the optional PC file, which maps each basic block number to */
+ /* its address and function name.  One line is written per block:   */
+ /*    F:<block number>:<block address in hex>:<function name>       */
+ /* The file name is clo_pc_out_file passed through                  */
+ /* VG_(expand_file_name).  The tool exits with status 1 if the file */
+ /* cannot be created.                                               */
+static void dumpPcFile(void)
+{
+   struct BB_info *bb_elem;
+   Int pctrace_fd;
+   SysRes sres;
+
+   pc_out_file =
+      VG_(expand_file_name)("--pc-out-file", clo_pc_out_file);
+
+   sres = VG_(open)(pc_out_file, VKI_O_CREAT|VKI_O_TRUNC|VKI_O_WRONLY,
+                    VKI_S_IRUSR|VKI_S_IWUSR|VKI_S_IRGRP|VKI_S_IWGRP);
+   if (sr_isError(sres)) {
+      VG_UMSG("Error: cannot create pc file %s\n", pc_out_file);
+      VG_(exit)(1);
+   }
+   pctrace_fd = sr_Res(sres);
+
+   /* Loop through the table, printing the number, address, */
+   /* and function name for each basic block                */
+   VG_(OSetGen_ResetIter)(instr_info_table);
+   while ( (bb_elem = VG_(OSetGen_Next)(instr_info_table)) ) {
+      /* Print the address as a full machine word: casting it to Int */
+      /* and using %x would drop the upper half on 64-bit targets.   */
+      VG_(sprintf)( buf,"F:%d:%lx:%s\n",
+                    bb_elem->block_num,
+                    (UWord)bb_elem->BB_addr,
+                    bb_elem->fn_name);
+      VG_(write)(pctrace_fd, (void*)buf, VG_(strlen)(buf));
+   }
+
+   VG_(close)(pctrace_fd);
+}
+
+ /* Create (or truncate) the basic block output file for thread     */
+ /* thread_num and return its file descriptor.  The tool exits with */
+ /* status 1 if the file cannot be created.                         */
+static Int open_tracefile(Int thread_num)
+{
+   SysRes sres;
+   UChar temp_string[2048];
+
+   /* For thread 1, don't append any thread number  */
+   /* This lets the single-thread case not have any */
+   /* extra values appended to the file name.       */
+   /* The name is built with a bounded, always-terminated format so */
+   /* an over-long --bb-out-file value is truncated instead of      */
+   /* overrunning temp_string or leaving it unterminated.           */
+   if (thread_num==1) {
+      VG_(snprintf)(temp_string, sizeof(temp_string), "%s", bb_out_file);
+   }
+   else {
+      VG_(snprintf)(temp_string, sizeof(temp_string), "%s.%d",
+                    bb_out_file, thread_num);
+   }
+
+   sres = VG_(open)(temp_string, VKI_O_CREAT|VKI_O_TRUNC|VKI_O_WRONLY,
+                    VKI_S_IRUSR|VKI_S_IWUSR|VKI_S_IRGRP|VKI_S_IWGRP);
+
+   if (sr_isError(sres)) {
+      VG_UMSG("Error: cannot create bb file %s\n",temp_string);
+      VG_(exit)(1);
+   }
+
+   return sr_Res(sres);
+}
+
+ /* Close the current interval of the current thread if its         */
+ /* instruction count has passed interval_size.  Unless only        */
+ /* instruction counts were requested, one line is appended to the  */
+ /* thread's bb file: a "T" followed by a ":block:count " pair for  */
+ /* every block with a non-zero count, and those counts are zeroed. */
+ /* The excess over interval_size carries into the next interval.   */
+static void handle_overflow(void)
+{
+   struct thread_info *thr = &bbv_thread[current_thread];
+   struct BB_info *block;
+   Int count;
+
+   /* Interval not full yet, nothing to do */
+   if (thr->dyn_instr <= interval_size) {
+      return;
+   }
+
+   if (!instr_count_only) {
+
+      /* The output file is opened lazily, on the first dump */
+      if (thr->bbtrace_fd < 0) {
+         thr->bbtrace_fd = open_tracefile(current_thread);
+      }
+
+      VG_(write)(thr->bbtrace_fd, "T", 1);
+
+      VG_(OSetGen_ResetIter)(instr_info_table);
+      for (block = VG_(OSetGen_Next)(instr_info_table);
+           block != NULL;
+           block = VG_(OSetGen_Next)(instr_info_table)) {
+
+         count = block->inst_counter[current_thread];
+         if (count == 0) {
+            continue;
+         }
+
+         VG_(sprintf)( buf, ":%d:%d ", block->block_num, count);
+         VG_(write)(thr->bbtrace_fd, (void*)buf, VG_(strlen)(buf));
+         block->inst_counter[current_thread] = 0;
+      }
+
+      VG_(write)(thr->bbtrace_fd, "\n", 1);
+   }
+
+   thr->dyn_instr -= interval_size;
+}
+
+
+ /* Fold the current thread's pending rep iteration count into its */
+ /* totals and clear it.                                           */
+static void close_out_reps(void)
+{
+   struct thread_info *thr = &bbv_thread[current_thread];
+
+   thr->unique_rep_count += 1;
+   thr->global_rep_count += thr->rep_count;
+   thr->rep_count = 0;
+}
+
+ /* Counting helper called before every ordinary instruction.      */
+ /* bbInfo is the record of the block the instruction was          */
+ /* instrumented under.                                            */
+static VG_REGPARM(1) void per_instruction_BBV(struct BB_info *bbInfo)
+{
+   struct thread_info *thr;
+   Int counted;
+
+   tl_assert(bbInfo);
+
+   thr = &bbv_thread[current_thread];
+
+   /* A rep sequence that ended just before this instruction has  */
+   /* not been accounted for yet: count it as one extra           */
+   /* instruction and close it out.                               */
+   if (thr->rep_count) {
+      counted = 2;
+      close_out_reps();
+   } else {
+      counted = 1;
+   }
+
+   bbInfo->inst_counter[current_thread] += counted;
+   thr->total_instr += counted;
+   thr->dyn_instr   += counted;
+
+   handle_overflow();
+}
+
+ /* Counting helper called for each execution of a rep-prefixed    */
+ /* instruction located at addr.  Iterations are only accumulated  */
+ /* in rep_count here.                                             */
+static VG_REGPARM(1) void per_instruction_BBV_rep(Addr addr)
+{
+   struct thread_info *thr = &bbv_thread[current_thread];
+   Bool different_rep = (thr->last_rep_addr != addr);
+
+   /* Back-to-back rep instructions: the previous one is finished, */
+   /* so close it out and count it as a single instruction.        */
+   if (different_rep && thr->rep_count) {
+      close_out_reps();
+      thr->total_instr += 1;
+      thr->dyn_instr   += 1;
+   }
+   if (different_rep) {
+      thr->last_rep_addr = addr;
+   }
+
+   thr->rep_count += 1;
+}
+
+ /* Counting helper called before every fldcw instruction.  It     */
+ /* does the same accounting as per_instruction_BBV and also bumps */
+ /* the thread's fldcw counter.                                    */
+static VG_REGPARM(1) void per_instruction_BBV_fldcw(struct BB_info *bbInfo)
+{
+   struct thread_info *thr;
+   Int counted;
+
+   tl_assert(bbInfo);
+
+   thr = &bbv_thread[current_thread];
+
+   /* A rep sequence ended just before us and is still pending */
+   if (thr->rep_count) {
+      counted = 2;
+      close_out_reps();
+   } else {
+      counted = 1;
+   }
+
+   thr->fldcw_count += 1;
+
+   bbInfo->inst_counter[current_thread] += counted;
+   thr->total_instr += counted;
+   thr->dyn_instr   += counted;
+
+   handle_overflow();
+}
+
+ /* Classify the instruction of len bytes found at addr.  The       */
+ /* result is a bit mask of REP_INSTRUCTION and FLDCW_INSTRUCTION;  */
+ /* it is always 0 when not built for x86 or amd64.                 */
+static Int get_inst_type(Int len, Addr addr)
+{
+   int flags=0;
+
+#if defined(VGA_x86) || defined(VGA_amd64)
+
+   const unsigned char *code=(const unsigned char *)addr;
+   unsigned char cur=0;
+   int pos,saw_rep=0;
+
+   /* A rep-prefixed string instruction is counted as a single    */
+   /* instruction by x86 hardware, so it needs special handling.  */
+   /* The same prefix bytes are also part of SSE opcodes, so the  */
+   /* byte after the prefixes has to be one of: movs, cmps, scas, */
+   /* lods, stos, ins, outs.                                      */
+
+   /* Skip over prefix bytes; cur ends up holding the first byte  */
+   /* that is not a recognised prefix (or the last byte read).    */
+   for (pos=0; pos<len; pos++) {
+      cur=code[pos];
+
+      if ( (cur == 0xf2) || (cur == 0xf3) ) {     /* repne / rep prefix */
+         saw_rep=1;
+         continue;
+      }
+      if ( (cur != 0x66) && (cur != 0x67) &&      /* size override prefixes */
+           (cur != 0x48) ) {                      /* 64-bit prefix */
+         break;                                   /* other byte, stop */
+      }
+   }
+
+   if (saw_rep) {
+      if ( ( (cur >= 0xa4) && (cur <= 0xaf) ) ||  /* movs,cmps,scas,lods,stos */
+           ( (cur >= 0x6c) && (cur <= 0x6f) ) ) { /* ins,outs */
+         flags|=REP_INSTRUCTION;
+      }
+   }
+
+   /* fldcw is double-counted by the Pentium 4 hardware performance */
+   /* counters, so a separate count helps when validating results.  */
+   /* Opcode is 0xd9 /5, ie 1101 1001 oo10 1mmm; second bytes from  */
+   /* 0xb0 up are excluded so that fldz and friends do not match.   */
+   if ( (len > 1) &&
+        (code[0] == 0xd9) &&
+        (code[1] < 0xb0) &&
+        ( (code[1] & 0x38) == 0x28 ) ) {
+      flags|=FLDCW_INSTRUCTION;
+   }
+
+#endif
+   return flags;
+}
+
+
+
+ /* Our instrumentation function                                    */
+ /*   closure = callback closure (unused)                           */
+ /*   sbIn    = super block to translate                            */
+ /*   layout  = guest layout (unused)                               */
+ /*   vge     = guest extents (unused)                              */
+ /*   gWordTy = size of guest word                                  */
+ /*   hWordTy = size of host word                                   */
+ /* Returns a copy of sbIn in which every IMark is preceded by a    */
+ /* call to one of the per_instruction_BBV* counting helpers.  All  */
+ /* instructions of the superblock are charged to the BB_info       */
+ /* record keyed by the address of its first instruction.           */
+static IRSB* bbv_instrument ( VgCallbackClosure* closure,
+                              IRSB* sbIn, VexGuestLayout* layout,
+                              VexGuestExtents* vge,
+                              IRType gWordTy, IRType hWordTy )
+{
+   Int i,n_instrs=1;
+   IRSB *sbOut;
+   IRStmt *st;
+   struct BB_info *bbInfo;
+   Addr64 origAddr,ourAddr;
+   Addr bbKey;
+   IRDirty *di;
+   IRExpr **argv, *arg1;
+   Int regparms,opcode_type;
+
+   /* We don't handle a host/guest word size mismatch */
+   if (gWordTy != hWordTy) {
+      VG_(tool_panic)("host/guest word size mismatch");
+   }
+
+   /* Set up SB */
+   sbOut = deepCopyIRSBExceptStmts(sbIn);
+
+   /* Copy verbatim any IR preamble preceding the first IMark */
+   i = 0;
+   while ( (i < sbIn->stmts_used) && (sbIn->stmts[i]->tag!=Ist_IMark)) {
+      addStmtToIRSB( sbOut, sbIn->stmts[i] );
+      i++;
+   }
+
+   /* The loop above may have run off the end of the statement     */
+   /* list; make sure an IMark was really found before reading it. */
+   tl_assert(i < sbIn->stmts_used);
+   st = sbIn->stmts[i];
+
+   /* double check we are at a Mark statement */
+   tl_assert(Ist_IMark == st->tag);
+
+   origAddr=st->Ist.IMark.addr;
+
+   /* The table is keyed on BB_info.BB_addr, which is an Addr, so  */
+   /* the lookup key must be an Addr too: a pointer to the 64-bit  */
+   /* origAddr does not have the same layout on 32-bit hosts.      */
+   bbKey=(Addr)origAddr;
+
+   /* Get the BB_info */
+   bbInfo = VG_(OSetGen_Lookup)(instr_info_table, &bbKey);
+
+   if (bbInfo==NULL) {
+
+      /* BB never translated before (at this address, at least;          */
+      /* could have been unloaded and then reloaded elsewhere in memory) */
+
+      /* allocate and initialize a new basic block structure */
+      bbInfo=VG_(OSetGen_AllocNode)(instr_info_table, sizeof(struct BB_info));
+      bbInfo->BB_addr = bbKey;
+      bbInfo->n_instrs = n_instrs;
+      bbInfo->inst_counter=VG_(calloc)("bbv_instrument",
+                                       allocated_threads,
+                                       sizeof(Int));
+
+      /* assign a unique block number */
+      bbInfo->block_num=block_num;
+      block_num++;
+
+      /* get function name and entry point information */
+      /* Start from an empty name so that dumpPcFile never prints an */
+      /* uninitialised buffer if no symbol is found for the address  */
+      /* NOTE(review): assumes the lookups leave the buffer alone on */
+      /* failure -- confirm against the debuginfo implementation     */
+      bbInfo->fn_name[0]='\0';
+      VG_(get_fnname)(origAddr,bbInfo->fn_name,FUNCTION_NAME_LENGTH);
+      bbInfo->is_entry=VG_(get_fnname_if_entry)(origAddr, bbInfo->fn_name,
+                                                FUNCTION_NAME_LENGTH);
+      /* insert structure into table */
+      VG_(OSetGen_Insert)( instr_info_table, bbInfo );
+   }
+
+   /* Iterate through the block, putting the original instructions */
+   /* in place, plus a call to a counting helper in front of each  */
+   /* original instruction                                         */
+
+   /* This is less efficient than only instrumenting the BB  */
+   /* But it gives proper results given the fact that        */
+   /* valgrind uses superblocks (not basic blocks) by default */
+
+   regparms=1;
+
+   while(i < sbIn->stmts_used) {
+      st=sbIn->stmts[i];
+
+      if (st->tag == Ist_IMark) {
+
+         ourAddr = st->Ist.IMark.addr;
+
+         opcode_type=get_inst_type(st->Ist.IMark.len,ourAddr);
+
+         if (opcode_type&REP_INSTRUCTION) {
+            /* the rep helper takes the instruction address */
+            arg1= mkIRExpr_HWord( (HWord)ourAddr);
+            argv= mkIRExprVec_1(arg1);
+            di= unsafeIRDirty_0_N( regparms, "per_instruction_BBV_rep",
+                       VG_(fnptr_to_fnentry)( &per_instruction_BBV_rep ),
+                       argv);
+         }
+         else if (opcode_type&FLDCW_INSTRUCTION) {
+            arg1= mkIRExpr_HWord( (HWord)bbInfo);
+            argv= mkIRExprVec_1(arg1);
+            di= unsafeIRDirty_0_N( regparms, "per_instruction_BBV_fldcw",
+                       VG_(fnptr_to_fnentry)( &per_instruction_BBV_fldcw ),
+                       argv);
+         }
+         else {
+            arg1= mkIRExpr_HWord( (HWord)bbInfo);
+            argv= mkIRExprVec_1(arg1);
+            di= unsafeIRDirty_0_N( regparms, "per_instruction_BBV",
+                       VG_(fnptr_to_fnentry)( &per_instruction_BBV ),
+                       argv);
+         }
+
+         /* Insert our call */
+         addStmtToIRSB( sbOut, IRStmt_Dirty(di));
+      }
+
+      /* Insert the original instruction */
+      addStmtToIRSB( sbOut, st );
+
+      i++;
+   }
+
+   return sbOut;
+}
+
+ /* Grow the per-thread bookkeeping from old_number to new_number   */
+ /* entries.  The thread array `old` is reallocated and its new     */
+ /* entries are reset; the inst_counter array of every known basic  */
+ /* block is grown the same way.  Returns the reallocated array.    */
+static struct thread_info *allocate_new_thread(struct thread_info *old,
+                                     Int old_number, Int new_number)
+{
+   struct thread_info *grown;
+   struct thread_info *slot;
+   struct BB_info *block;
+   Int idx;
+
+   grown=VG_(realloc)("bbv_main.c allocate_threads",
+                      old,
+                      new_number*sizeof(struct thread_info));
+
+   /* Reset every new slot; there can be more than one when the */
+   /* new thread id is not contiguous with the old ones         */
+   for (idx=old_number; idx<new_number; idx++) {
+      slot=&grown[idx];
+      slot->dyn_instr=0;
+      slot->total_instr=0;
+      slot->last_rep_addr=0;
+      slot->rep_count=0;
+      slot->global_rep_count=0;
+      slot->unique_rep_count=0;
+      slot->fldcw_count=0;
+      slot->bbtrace_fd=-1;
+   }
+
+   /* Give every existing basic block a zeroed counter per new thread */
+   VG_(OSetGen_ResetIter)(instr_info_table);
+   while ( (block = VG_(OSetGen_Next)(instr_info_table)) != NULL ) {
+      Int *counters = VG_(realloc)("bbv_main.c inst_counter",
+                                   block->inst_counter,
+                                   new_number*sizeof(Int));
+      for (idx=old_number; idx<new_number; idx++) {
+         counters[idx]=0;
+      }
+      block->inst_counter = counters;
+   }
+
+   return grown;
+}
+
+ /* Registered with VG_(track_start_client_code).  Records tid as  */
+ /* the current thread, growing the per-thread tables first when   */
+ /* tid has not been seen before.  nDisp is unused.                */
+static void bbv_thread_called ( ThreadId tid, ULong nDisp )
+{
+   if (tid < allocated_threads) {
+      current_thread=tid;
+      return;
+   }
+
+   bbv_thread=allocate_new_thread(bbv_thread,allocated_threads,tid+1);
+   allocated_threads=tid+1;
+   current_thread=tid;
+}
+
+
+
+
+/*--------------------------------------------------------------------*/
+/*--- Setup ---*/
+/*--------------------------------------------------------------------*/
+
+ /* Runs once the command line has been processed. */
+static void bbv_post_clo_init(void)
+{
+   /* Ask for translations that are closer to real basic blocks; */
+   /* this has the same effect as passing                        */
+   /* --vex-guest-chase-thresh=0 on the command line             */
+   VG_(clo_vex_control).guest_chase_thresh = 0;
+
+   /* Turn the --bb-out-file pattern into the real file name */
+   bb_out_file = VG_(expand_file_name)("--bb-out-file", clo_bb_out_file);
+}
+
+ /* Parse one command line option.  Returns True if arg was one of  */
+ /* the tool's options and had an acceptable value, False otherwise */
+static Bool bbv_process_cmd_line_option(Char* arg)
+{
+   if VG_INT_CLO (arg, "--interval-size", interval_size) {
+      /* An interval of zero would make bbv_fini divide by zero, and */
+      /* a negative one can never be reached; refuse both.           */
+      if (interval_size < 1) {
+         VG_UMSG("Error: --interval-size must be at least 1 (got %d)\n",
+                 interval_size);
+         return False;
+      }
+   }
+   else if VG_STR_CLO (arg, "--bb-out-file", clo_bb_out_file) {}
+   else if VG_STR_CLO (arg, "--pc-out-file", clo_pc_out_file) {
+      /* naming a pc file is what turns its generation on */
+      generate_pc_file = True;
+   }
+   else if VG_XACT_CLO (arg, "--instr-count-only", instr_count_only, True) {}
+   else {
+      return False;
+   }
+
+   return True;
+}
+
+ /* Print the help text for the tool's command line options. */
+static void bbv_print_usage(void)
+{
+   VG_(printf)(
+      " --bb-out-file=<file> filename for basic block vector info\n"
+      " --pc-out-file=<file> filename for basic block addresses and function names\n"
+      " --interval-size=<num> interval size\n"
+      " --instr-count-only only print total instruction count\n"
+   );
+}
+
+ /* Print the help text for debugging options; the tool has none. */
+static void bbv_print_debug_usage(void)
+{
+ VG_(printf)(" (none)\n");
+}
+
+ /* Called at program exit (exitcode is unused).  Writes the PC     */
+ /* file if one was requested, then, for every thread that executed */
+ /* at least one instruction, prints a summary to the user and      */
+ /* appends it to that thread's bb file before closing the file.    */
+static void bbv_fini(Int exitcode)
+{
+   Int i;
+   Int n_intervals;
+
+   if (generate_pc_file) {
+      dumpPcFile();
+   }
+
+   for(i=0;i<allocated_threads;i++) {
+
+      if (bbv_thread[i].total_instr==0) {
+         continue;
+      }
+
+      /* Never divide by a zero interval size */
+      n_intervals=0;
+      if (interval_size > 0) {
+         n_intervals=(Int)(bbv_thread[i].total_instr/(ULong)interval_size);
+      }
+
+      /* The counters are unsigned 64-bit values, hence %llu */
+      VG_(sprintf)(buf,"\n\n"
+                       "# Thread %d\n"
+                       "# Total intervals: %d (Interval Size %d)\n"
+                       "# Total instructions: %llu\n"
+                       "# Total reps: %llu\n"
+                       "# Unique reps: %llu\n"
+                       "# Total fldcw instructions: %llu\n\n",
+                   i,
+                   n_intervals,
+                   interval_size,
+                   bbv_thread[i].total_instr,
+                   bbv_thread[i].global_rep_count,
+                   bbv_thread[i].unique_rep_count,
+                   bbv_thread[i].fldcw_count);
+
+      /* Print results to display */
+      VG_UMSG("%s", buf);
+
+      /* open the output file if it hasn't already */
+      if (bbv_thread[i].bbtrace_fd < 0) {
+         bbv_thread[i].bbtrace_fd=open_tracefile(i);
+      }
+      /* Also print to results file */
+      VG_(write)(bbv_thread[i].bbtrace_fd,(void*)buf,VG_(strlen)(buf));
+      VG_(close)(bbv_thread[i].bbtrace_fd);
+   }
+}
+
+ /* Tool registration, run before the command line is processed:   */
+ /* describes the tool, registers its callbacks and sets up the    */
+ /* basic block table and the initial per-thread state.            */
+static void bbv_pre_clo_init(void)
+{
+   /* Who we are */
+   VG_(details_name)("exp-bbv");
+   VG_(details_version)(NULL);
+   VG_(details_description)("a SimPoint basic block vector generator");
+   VG_(details_copyright_author)("Copyright (C) 2006-2009 Vince Weaver");
+   VG_(details_bug_reports_to)(VG_BUGS_TO);
+
+   /* Callbacks */
+   VG_(track_start_client_code)( bbv_thread_called );
+   VG_(needs_command_line_options)(bbv_process_cmd_line_option,
+                                   bbv_print_usage,
+                                   bbv_print_debug_usage);
+   VG_(basic_tool_funcs)(bbv_post_clo_init, bbv_instrument, bbv_fini);
+
+   /* Basic block table: keyed on the word at offset 0 of each    */
+   /* node, with no custom comparison function                    */
+   instr_info_table = VG_(OSetGen_Create)(/*keyOff*/0,
+                                          NULL,
+                                          VG_(malloc), "bbv.1", VG_(free));
+
+   /* Initial thread slots; this walks the table, so it has to come */
+   /* after the table has been created                              */
+   bbv_thread=allocate_new_thread(bbv_thread,0,allocated_threads);
+}
+
+VG_DETERMINE_INTERFACE_VERSION(bbv_pre_clo_init)
+
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
--- /dev/null
+EXTRA_DIST = bbv-manual.xml
+
--- /dev/null
+<?xml version="1.0"?> <!-- -*- sgml -*- -->
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+
+<chapter id="bbv-manual" xreflabel="BBV">
+ <title>BBV: a Basic Block Vector generation tool</title>
+
+<para>To use this tool, you must specify
+<computeroutput>--tool=exp-bbv</computeroutput> on the Valgrind
+command line.</para>
+
+<sect1 id="bbv-manual.background" xreflabel="BBV Background">
+<title>Basic Block Profiling and SimPoint</title>
+
+<para>
+ A Basic Block Vector (BBV) is a list of all basic blocks entered
+ during program execution, and a count of how many times each
+ block was run (a basic block is a section of code
+ with only one entry point and one exit point).
+</para>
+
+<para>
+ This tool was written to generate basic block vectors
+ for use with the SimPoint analysis tool
+ (http://www.cse.ucsd.edu/~calder/simpoint/).
+ The SimPoint methodology enables speeding up architectural
+ simulations by only running a small portion of a program
+ and then extrapolating total behavior from this
+ small portion. Most programs exhibit phase-based behavior, which
+ means that at various times during execution a program will encounter
+ intervals of time where the code behaves similarly to a previous
+ interval. If you can detect these intervals and group them together,
+ an approximation of the total program behavior can be obtained
+ by only simulating a bare minimum number of intervals, and then scaling
+ the results.
+</para>
+
+<para>
+ In computer architecture research, running a
+ benchmark on a cycle-accurate simulator can cause slowdowns on the order
+ of 1000 times, making it take days, weeks, or even longer to run full
+ benchmarks. By utilizing SimPoint this can be reduced significantly
+ while still retaining reasonable accuracy, usually in the 5-10% range.
+</para>
+
+<para>
+ A more complete introduction to how SimPoint works can be
+ found in the paper "Automatically Characterizing Large Scale
+ Program Behavior" by T. Sherwood, E. Perelman, G. Hamerly, and
+ B. Calder.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.quickstart" xreflabel="Quick Start">
+<title>Using Basic Block Vectors to create SimPoints</title>
+
+<para>
+ To quickly create a basic block vector file, you will call Valgrind
+ like this:
+ <computeroutput>valgrind --tool=exp-bbv /bin/ls</computeroutput>
+ In this case we are running on the "ls" program, but this
+ can be any executable. By default a file called
+ <computeroutput>bb.out.PID</computeroutput> will be created,
+ where PID is replaced by the process ID of the running process.
+ This file is the basic block vector. For long-running programs
+ this file can be quite large, so it might be wise to compress
+ it with gzip or some other compression program.
+</para>
+
+<para>
+ To create actual SimPoint results, you will need the
+ SimPoint utility, available from the SimPoint webpage
+ (http://www.cse.ucsd.edu/~calder/simpoint/).
+ Assuming you have downloaded SimPoint 3.2 and compiled it,
+ create SimPoint results with a command like the following:
+
+ <computeroutput>./SimPoint.3.2/bin/simpoint -inputVectorsGzipped \
+ -loadFVFile bb.out.1234.gz \
+ -k 5 -saveSimpoints results.simpts \
+ -saveSimpointWeights results.weights
+ </computeroutput>
+ where bb.out.1234.gz is your compressed basic block vector file
+ generated by Valgrind exp-bbv.
+</para>
+
+<para>
+ The SimPoint utility does random linear projection using 15 dimensions,
+ then does k-means clustering to calculate which intervals are
+ of interest. In this example we specify 5 intervals with the
+ -k 5 option.
+</para>
+
+<para>
+ The outputs from the SimPoint run are the
+ <computeroutput>results.simpts</computeroutput>
+ and <computeroutput>results.weights</computeroutput> files.
+ The first holds the 5 most relevant intervals of the program.
+ The second holds the weight to scale each interval by when
+ extrapolating full-program behavior. The intervals and the weights
+ can be used in conjunction with a simulator that supports
+ fast-forwarding; you fast-forward to the interval of interest,
+ collect stats for the desired interval length, then use
+ statistics gathered in conjunction with the weights to
+ calculate your results.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.usage" xreflabel="BBV Usage">
+<title>BBV Command Line Options</title>
+
+<para>
+ BBV has various options that control the behavior of the plugin:
+<!-- start of xi:include in the manpage -->
+<variablelist id="bbv.opts.list">
+
+ <varlistentry id="opt.interval-size" xreflabel="--interval-size">
+ <term>
+ <option><![CDATA[--interval-size=<number> [default: 100000000] ]]></option>
+ </term>
+ <listitem>
+ <para>
+ This option selects the size of the interval to use.
+ The default is 100
+ million instructions, which is a commonly used value.
+ Other sizes can be used; smaller intervals can help programs
+ with finer-grained phases. However smaller interval size
+ can lead to accuracy issues due to warm-up effects
+ (When fast-forwarding the various architectural features
+ will be un-initialized, and it will take some number
+ of instructions before they "warm up" to the state a
+ full simulation would be at without the fast-forwarding.
+ Large interval sizes tend to mitigate this.)
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry id="opt.instr-count-only" xreflabel="--instr-count-only">
+ <term>
+ <option><![CDATA[--instr-count-only [default: no] ]]></option>
+ </term>
+ <listitem>
+ <para>
+ This option tells the tool to only display instruction
+ count totals, and to not generate the
+ actual BBV file. This is useful for debugging, and for
+ gathering instruction count info without generating
+ the large BBV files.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry id="opt.bb-out-file" xreflabel="--bb-out-file">
+ <term>
+ <option><![CDATA[--bb-out-file=<name> [default: bb.out.%p] ]]></option>
+ </term>
+ <listitem>
+ <para>
+ This option selects the name of the basic block file. Default is
+ bb.out.%p. The
+ <option>%p</option> and <option>%q</option> format specifiers can be
+ used to embed the process ID and/or the contents of an environment
+ variable in the name, as is the case for the core option
+ <option>--log-file</option>.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry id="opt.pc-out-file" xreflabel="--pc-out-file">
+ <term>
+ <option><![CDATA[--pc-out-file=<name> [default: pc.out.%p] ]]></option>
+ </term>
+ <listitem>
+ <para>
+ This option selects the name of the PC file.
+ This file holds program counter addresses
+ and function name info for the various basic blocks.
+ This can be used in conjunction
+ with the bbv file to fast-forward via function names
+ instead of just instruction counts.
+ The default filename is pc.out.%p.
+ <option>%p</option> and <option>%q</option> format specifiers can be
+ used to embed the process ID and/or the contents of an environment
+ variable in the name, as is the case for the core option
+ <option>--log-file</option>.
+
+ </para>
+ </listitem>
+ </varlistentry>
+</variablelist>
+<!-- end of xi:include in the manpage -->
+
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.fileformat" xreflabel="BBV File Format">
+<title>Basic Block Vector File Format</title>
+
+<para>
+ The Basic Block Vector is dumped at fixed intervals. This
+ is commonly done every 100 million instructions; the
+ <computeroutput>--interval-size</computeroutput> option can be
+ used to change this.
+</para>
+
+<para>
+ The output file looks like this:
+</para>
+
+<programlisting><![CDATA[
+T:45:1024 :189:99343
+T:11:78573 :15:1353 :56:1
+T:18:45 :12:135353 :56:78 :314:4324263]]></programlisting>
+
+<para>
+ Each new interval starts with a T. This is followed by a colon,
+ then by a unique number identifying the basic block. This is followed
+ by another colon, then followed by the frequency (which is scaled
+ by the number of instructions in the basic block).
+</para>
+
+<para>
+ The entry count is multiplied by the number of instructions that are
+ in the basic block, in order to weigh the count so that instructions in
+ small Basic Blocks aren't counted as more important than instructions
+ in large Basic Blocks.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.implementation" xreflabel="Implementation">
+<title>Implementation</title>
+
+<para>
+ Valgrind provides all of the information necessary to create
+ BBV files. In the current implementation, all instructions
+ are instrumented. This is slower (by approximately a factor
+ of two) than a method that instruments at the basic-block level,
+ but there are some complications (especially with rep prefix
+ detection) that make that method more difficult.
+</para>
+
+<para>
+ Valgrind actually provides instrumentation at a super-block level.
+ A super-block has one entry point but unlike basic-blocks can
+ have multiple exit points. Once a branch occurs into the middle
+ of a block, it is split into a new basic-block. Because
+ Valgrind cannot produce "true" basic blocks, the generated
+ BBV vectors will be different than those generated by other tools.
+ In practice this does not seem to affect the accuracy of the
+ SimPoint results. We do internally force the
+ <computeroutput>--vex-guest-chase-thresh=0</computeroutput>
+ option to Valgrind which forces a more basic-block like
+ behavior.
+</para>
+
+<para>
+ When a super block is run for the first time, it is instrumented
+ with our BBV routine. This adds a call to our instruction
+ counting function for each original instruction.
+ The current superblock is looked up in an Ordered Set to find
+ a structure that holds block-specific statistics (the entry point
+ address is the key into the ordered set). We increment the
+ instruction count for this superblock and
+ also update the master instruction count.
+ If the master count overflows the interval size
+ then we print out the basic block statistics for the current interval
+ to disk, and then reset all the superblock counters to zero.
+</para>
+
+<para>
+ On the x86 and amd64 architectures the code takes special
+ care with rep-prefixed string instructions. This is because
+ actual hardware counts a rep-prefixed instruction
+ as one instruction, while a naive Valgrind implementation
+ would count it as many (possibly hundreds, thousands or even millions)
+ of instructions. We have special code to handle
+ this properly, which makes the results match hardware performance
+ counter results.
+</para>
+
+<para>
+ The exp-bbv tool also counts the fldcw instruction. This
+ instruction is used on x86 machines when converting numbers
+ from floating point to integer (among other uses).
+ On Pentium 4 systems the retired instruction performance
+ counter counts this instruction as two
+ instructions (all other known processors only count it as one).
+ This can affect results when using SimPoint on Pentium 4 systems,
+ so we provide the count for use in mitigating this at analysis time.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.threadsupport" xreflabel="BBV Threaded Support">
+<title>Threaded Executable Support</title>
+
+<para>
+ BBV supports threaded programs. When a program has multiple threads,
+ an additional BBV file is created for each thread (each additional
+ file is the specified filename with the thread number
+ appended at the end).
+</para>
+
+<para>
+ There is no official method of using SimPoint with
+ threaded workloads. The most common method is to run
+ SimPoint on each thread's results independently, and use
+ some method of deterministic execution to try to match the
+ original workload. This should be possible with current
+ exp-bbv.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.validation" xreflabel="BBV Validation">
+<title>Validation</title>
+
+<para>
+ This plugin has been tested on x86, amd64, and ppc32 platforms.
+ An earlier version of the plugin was tested in detail using
+ hardware performance counters; this work is described in a paper
+ from the HiPEAC'08 conference, "Using Dynamic Binary Instrumentation
+ to Generate Multi-Platform SimPoints: Methodology and Accuracy" by
+ V.M. Weaver and S.A. McKee.
+</para>
+
+</sect1>
+
+<sect1 id="bbv-manual.performance" xreflabel="BBV Performance">
+<title>Performance</title>
+
+<para>
+ Using this program slows down execution by roughly a factor of 40
+ over native execution. This varies depending on the machine
+ used and the benchmark being run.
+ On the SPEC CPU 2000 benchmarks running on a 3.4GHz Pentium D
+ processor, the slowdown ranges from 24x (mcf) to 340x (vortex.2).
+</para>
+
+</sect1>
+
+</chapter>
--- /dev/null
+
+include $(top_srcdir)/Makefile.tool-tests.am
+
+SUBDIRS = .
+
+# Platform-specific tests
+if VGCONF_ARCHS_INCLUDE_X86
+SUBDIRS += x86
+endif
+if VGCONF_PLATFORMS_INCLUDE_X86_LINUX
+SUBDIRS += x86-linux
+endif
+if VGCONF_PLATFORMS_INCLUDE_AMD64_LINUX
+SUBDIRS += amd64-linux
+endif
+if VGCONF_PLATFORMS_INCLUDE_PPC32_LINUX
+SUBDIRS += ppc32-linux
+endif
+
+DIST_SUBDIRS = x86 x86-linux amd64-linux ppc32-linux .
+
+EXTRA_DIST = \
+ logo.include logo.lzss_new
+
+check_PROGRAMS =
+
+AM_CFLAGS += $(AM_FLAG_M3264_PRI)
+AM_CXXFLAGS += $(AM_FLAG_M3264_PRI)
+
--- /dev/null
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+check_PROGRAMS = \
+ million rep_prefix ll fldcw_check complex_rep clone_test
+
+EXTRA_DIST = \
+ clone_test.stderr.exp \
+ clone_test.post.exp \
+ clone_test.vgtest \
+ complex_rep.stderr.exp \
+ complex_rep.vgtest \
+ fldcw_check.stderr.exp \
+ fldcw_check.vgtest \
+ ll.stderr.exp \
+ ll.stdout.exp \
+ ll.post.exp \
+ ll.vgtest \
+ million.stderr.exp \
+ million.post.exp \
+ million.vgtest \
+ rep_prefix.stderr.exp \
+ rep_prefix.vgtest
+
+AM_CCASFLAGS += -ffreestanding
+
+LDFLAGS += -nostartfiles -nodefaultlibs
+
+# Each test program is built from a single freestanding assembly source.
+# Every entry must use the <program>_SOURCES form so that Automake
+# associates the .S file with the matching check_PROGRAMS entry.
+clone_test_SOURCES = clone_test.S
+complex_rep_SOURCES = complex_rep.S
+fldcw_check_SOURCES = fldcw_check.S
+ll_SOURCES = ll.S
+million_SOURCES = million.S
+rep_prefix_SOURCES = rep_prefix.S
+
--- /dev/null
+ # count for ~1 million instructions thread 1
+ # count for ~2 million instructions thread 2
+ # count for additional 500 thousand each before exit
+
+ .globl _start
+_start:
+
+ #################################################
+ # 1000 cycles in initial thread #
+ #################################################
+
+ xor %rax,%rax
+ mov $499,%rcx # load counter
+initial_loop:
+ dec %rcx # repeat count times
+ jnz initial_loop
+
+
+ #####################################################
+ # Spawn a thread! #
+ #####################################################
+clone:
+ mov $56,%rax # clone syscall
+
+ # Note, clone syscall is different than the glibc implementation
+
+# int clone (flags, stack_pointer,parent_tidptr,child_tidptr,tls)
+
+
+ # Flags in
+ #/usr/include/bits/sched.h
+ # CLONE_THREAD 0x10000
+ # CLONE_SIGHAND 0x800
+ # CLONE_VM 0x100
+ # above must be called together
+ # Below required for Valgrind
+ # CLONE_FS 0x200
+ # CLONE_FILES 0x400
+
+ mov $0x10f00,%rdi
+
+
+ mov $(new_stack+4096),%rsi # new stack
+
+
+
+ mov $0,%rdx # args (none)
+
+ syscall
+
+ cmp $0,%rax # are we in new thread?
+ jz thread2 # if so, jump to thread2
+
+
+ ###############################################
+ # thread1 #
+ ###############################################
+
+thread1:
+
+ mov $499997,%rcx # load counter
+thread1_loop:
+ dec %rcx # repeat count times
+ jnz thread1_loop
+
+ xor %rdi,%rdi # we return 0
+ jmp exit
+
+thread2:
+ mov $999997,%rcx # load counter
+thread2_loop:
+ dec %rcx # repeat count times
+ jnz thread2_loop
+
+ mov $5,%rdi # we return 5
+
+
+ #================================
+ # Exit
+ #================================
+ # Both threads end up here with their return value already in rdi
+ # (thread1 jumps here, thread2 falls through).
+exit:
+
+ # count an additional 500 thousand instructions
+ # (250000 iterations of the two-instruction loop below)
+
+ mov $250000,%rcx # load counter
+exit_loop:
+ dec %rcx # repeat count times
+ jnz exit_loop
+
+actual_exit:
+ mov $60,%rax # put exit syscall number (60) in rax
+ syscall
+
+.bss
+.lcomm new_stack,4096
--- /dev/null
+T 4 996 5 2 3 98991
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 1001 2 3 98994
+T 100000
+T 100000
+T 100000
+T 100000
+
+
+# Thread 1
+# Total intervals: 15 (Interval Size 100000)
+# Total instructions: 1501007
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
+T 2 3 99996
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 99996 4
+T 100000
+T 100000
+T 100000
+T 100000
+T 99998 2
+
+
+# Thread 2
+# Total intervals: 25 (Interval Size 100000)
+# Total instructions: 2500001
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 15 (Interval Size 100000)
+# Total instructions: 1501007
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+# Thread 2
+# Total intervals: 25 (Interval Size 100000)
+# Total instructions: 2500001
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+prog: clone_test
+vgopts: --interval-size=100000 --bb-out-file=clone_test.out.bb --pc-out-file=clone_test.out.pc
+post: cat clone_test.out.bb clone_test.out.bb.2 | ../filter_bb
+cleanup: rm -f clone_test.out.bb clone_test.out.bb.2 clone_test.out.pc
+
--- /dev/null
+# When trying (and failing) to instrument at the basic block level
+# I thought up a lot of corner-cases in the rep code. This tries
+# to catch some of them
+
+# Performance counters give us 8207 insns
+# 11 + 8*1024 + 3 = 8206
+
+ .globl _start
+_start:
+ cld # we want these to happen forward
+
+ mov $0xfeb131978,%rax # value to store
+
+ # test back-to-back rep/stosb's
+
+ mov $1024,%rcx
+ mov $buffer1, %rdi # set destination
+ rep stosb # store 1024 times
+ rep stosb # should store 0 times
+ rep stosb # should store 0 times
+
+
+ # test stosb where cx is 0
+
+ xor %rcx,%rcx
+ mov $buffer1, %rdi # set destination
+ rep stosb # should not load at all
+
+ # test rep inside of a loop
+
+ mov $1024, %rbx
+rep_loop:
+
+ mov $1024,%rcx
+ mov $buffer1, %rdi # set destination
+ rep stosb
+
+ mov $1024,%rcx
+ mov $buffer1, %rdi # set destination
+ rep stosb
+
+ dec %rbx
+ jnz rep_loop
+
+
+ #================================
+ # Exit
+ #================================
+exit:
+ mov $60,%rax
+ xor %rdi,%rdi # we return 0
+ syscall # and exit
+
+
+.bss
+
+.lcomm buffer1, 16384
+
--- /dev/null
+# Thread 1
+# Total intervals: 0 (Interval Size 100000)
+# Total instructions: 8206
+# Total reps: 2100228
+# Unique reps: 2052
+# Total fldcw instructions: 0
--- /dev/null
+prog: complex_rep
+vgopts: --interval-size=100000 --bb-out-file=complex_rep.out.bb
+cleanup: rm complex_rep.out.bb
+
--- /dev/null
+#! /bin/sh
+
+../filter_stderr
+
--- /dev/null
+
+.globl _start
+
+_start:
+ # This code tests for the fldcw "load floating point command word"
+ # instruction. On most x86 processors the retired_instruction
+ # performance counter counts this as one instruction. However,
+ # on Pentium 4 systems it counts as two. Therefore this can
+ # affect BBV results on such a system.
+ # fldcw is most often used to set the rounding mode when doing
+ # floating point to integer conversions
+
+ # It is encoded as "d9 /5" which means
+ # 1101 1001 xx10 1yyy
+ # Where xx is the "mod" which will be 00, 01, or 10 indicating offset
+ # and yyy is the register field
+
+ # these are instructions with similar encodings to fldcw
+ # that can cause false positives if the test isn't explicit enough
+similar:
+ fld1 # d9 e8
+ fldl2t # d9 e9
+ fldl2e # d9 ea
+ fldpi # d9 eb
+ fldlg2 # d9 ec
+ fldln2 # d9 ed
+ fldz # d9 ee
+
+ # check some varied ways of calling fldcw
+
+ # offset on stack
+stack:
+ sub $8,%rsp # allocate space on stack
+ fnstcw 2(%rsp)
+ fldcw 2(%rsp)
+ add $8,%rsp # restore stack
+
+ # 64-bit register
+sixtyfour_reg:
+ fnstcw cw
+ mov $cw,%rax
+ fldcw 0(%rax) # rax
+ mov $cw,%rbx
+ fldcw 0(%rbx) # rbx
+ mov $cw,%rcx
+ fldcw 0(%rcx) # rcx
+ mov $cw,%rdx
+ fldcw 0(%rdx) # rdx
+
+ # 32-bit register
+thirtytwo_reg:
+ fnstcw cw
+ mov $cw,%eax
+ fldcw 0(%eax) # eax
+ mov $cw,%ebx
+ fldcw 0(%ebx) # ebx
+ mov $cw,%ecx
+ fldcw 0(%ecx) # ecx
+ mov $cw,%edx
+ fldcw 0(%edx) # edx
+
+ # register + 8-bit offset
+eight_bit:
+ mov $cw,%eax
+ sub $32,%eax
+
+ fldcw 32(%eax) # eax + 8 bit offset
+ mov %eax,%ebx
+ fldcw 32(%ebx) # ebx + 8 bit offset
+ mov %eax,%ecx
+ fldcw 32(%ecx) # ecx + 8 bit offset
+ mov %eax,%edx
+ fldcw 32(%edx) # edx + 8 bit offset
+
+ # register + 32-bit offset
+ # (30000 does not fit in a signed 8-bit displacement, so each fldcw
+ # below is addressed as register + 30000 rather than register + 8-bit)
+thirtytwo_bit:
+ mov $cw,%eax
+ sub $30000,%eax
+
+ fldcw 30000(%eax) # eax + 32 bit offset
+ mov %eax,%ebx
+ fldcw 30000(%ebx) # ebx + 32 bit offset
+ mov %eax,%ecx
+ fldcw 30000(%ecx) # ecx + 32 bit offset
+ mov %eax,%edx
+ fldcw 30000(%edx) # edx + 32 bit offset
+
+ # check an fp/integer conversion
+ # in a loop to give a bigger count
+ # (each of the 1024 iterations executes two fldcw instructions)
+
+ mov $1024,%rcx
+big_loop:
+
+ fldl three # load value onto fp stack
+ fnstcw saved_cw # store control word to mem
+ movzwl saved_cw, %eax # load cw from mem, zero extending
+ movb $12, %ah # set cw for "round to zero"
+ movw %ax, cw # store 16-bit cw back to memory (movw needs %ax)
+ fldcw cw # load new rounding mode
+ fistpl result # save stack value as integer to mem
+ fldcw saved_cw # restore old cw
+
+ loop big_loop # loop to make the count more obvious
+
+ movl result, %ebx # sanity check to see if the
+ cmp $3,%rbx # result is the expected one
+ je exit
+
+print_error:
+ mov $1,%rax # write syscall
+ mov $1,%rdi # stdout
+ mov $error,%rsi # string
+ mov $22,%rdx # length of string
+ syscall
+
+exit:
+ xor %rdi, %rdi # return 0
+ mov $60, %rax # SYSCALL_EXIT
+ syscall
+
+
+
+.data
+saved_cw: .long 0
+cw: .long 0
+result: .long 0
+three: .long 0 # a floating point 3.0
+ .long 1074266112
+error: .asciz "Error! Wrong result!\n"
--- /dev/null
+# Thread 1
+# Total intervals: 0 (Interval Size 10000)
+# Total instructions: 9270
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 2053
--- /dev/null
+prog: fldcw_check
+vgopts: --interval-size=10000 --bb-out-file=fldcw_check.out.bb
+cleanup: rm fldcw_check.out.bb
+
--- /dev/null
+#
+# linux_logo in x86_64 assembly language
+# based on the code from ll_asm-0.36
+#
+# By Vince Weaver <vince _at_ deater.net>
+#
+# Modified to remove non-deterministic system calls
+# And to avoid reading from /proc
+#
+
+
+.include "../logo.include"
+
+# offsets into the results returned by the uname syscall
+.equ U_SYSNAME,0
+.equ U_NODENAME,65
+.equ U_RELEASE,65*2
+.equ U_VERSION,(65*3)
+.equ U_MACHINE,(65*4)
+.equ U_DOMAINNAME,65*5
+
+# offset into the results returned by the sysinfo syscall
+.equ S_TOTALRAM,32
+
+# Sycscalls
+.equ SYSCALL_EXIT, 60
+.equ SYSCALL_READ, 0
+.equ SYSCALL_WRITE, 1
+.equ SYSCALL_OPEN, 2
+.equ SYSCALL_CLOSE, 3
+.equ SYSCALL_SYSINFO, 99
+.equ SYSCALL_UNAME, 63
+
+#
+.equ STDIN,0
+.equ STDOUT,1
+.equ STDERR,2
+
+ .globl _start
+_start:
+ #=========================
+ # PRINT LOGO
+ #=========================
+
+# LZSS decompression algorithm implementation
+# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
+# optimized some more by Vince Weaver
+
+ # we used to fill the buffer with FREQUENT_CHAR
+ # but, that only gains us one byte of space in the lzss image.
+ # the lzss algorithm does automatic RLE... pretty clever
+ # so we compress with NUL as FREQUENT_CHAR and it is pre-done for us
+
+ mov $(N-F), %ebp # R
+
+ mov $logo, %esi # %esi points to logo (for lodsb)
+
+ mov $out_buffer, %edi # point to out_buffer
+ push %rdi # save this value for later
+
+ xor %ecx, %ecx
+
+decompression_loop:
+ lodsb # load in a byte
+
+ mov $0xff, %bh # re-load top as a hackish 8-bit counter
+ mov %al, %bl # move in the flags
+
+test_flags:
+ cmp $logo_end, %esi # have we reached the end?
+ je done_logo # ! if so, exit
+
+ shr $1, %ebx # shift bottom bit into carry flag
+ jc discrete_char # ! if set, we jump to discrete char
+
+offset_length:
+ lodsw # get match_length and match_position
+ mov %eax,%edx # copy to edx
+ # no need to mask dx, as we do it
+ # by default in output_loop
+
+ shr $(P_BITS),%eax
+ add $(THRESHOLD+1),%al
+ mov %al,%cl # cl = (ax >> P_BITS) + THRESHOLD + 1
+ # (=match_length)
+
+output_loop:
+ and $POSITION_MASK,%dh # mask it
+ mov text_buf(%rdx), %al # load byte from text_buf[]
+ inc %edx # advance pointer in text_buf
+store_byte:
+ stosb # store it
+
+ mov %al, text_buf(%rbp) # store also to text_buf[r]
+ inc %ebp # r++
+ and $(N-1), %bp # mask r
+
+ loop output_loop # repeat until k>j
+
+ or %bh,%bh # ! if 0 we shifted through 8 and must
+ jnz test_flags # re-load flags
+
+ jmp decompression_loop
+
+discrete_char:
+ lodsb # load a byte
+ inc %ecx # we set ecx to one so byte
+ # will be output once
+ # (how do we know ecx is zero?)
+
+ jmp store_byte # and cleverly store it
+
+
+# end of LZSS code
+
+done_logo:
+
+ pop %rbp # get out_buffer and keep in bp
+ mov %ebp,%ecx # move out_buffer to ecx
+
+ call write_stdout # print the logo
+
+ #
+ # Setup
+ #
+setup:
+ mov $strcat,%edx # use rdx as call pointer (smaller op)
+
+
+ #==========================
+ # PRINT VERSION
+ #==========================
+
+# push $SYSCALL_UNAME # uname syscall
+# pop %rax # in 3 bytes
+ mov $uname_info,%edi # uname struct (0 extend address)
+# syscall # do syscall
+
+ mov %ebp,%edi # point %edi to out_buffer
+
+ mov $(uname_info+U_SYSNAME),%esi # os-name from uname "Linux"
+ call *%rdx # call strcat
+
+ mov $ver_string,%esi # source is " Version "
+ call *%rdx # call strcat
+ push %rsi # save our .txt pointer
+
+ mov $(uname_info+U_RELEASE),%esi # version from uname "2.4.1"
+ call *%rdx # call strcat
+
+ pop %rsi # restore .txt pointer
+ # source is ", Compiled "
+ call *%rdx # call strcat
+ push %rsi # store for later
+
+ mov $(uname_info+U_VERSION),%esi # compiled date
+ call *%rdx # call strcat
+
+ mov %ebp,%ecx # move out_buffer to ecx
+
+ mov $0xa,%ax # store linefeed on end
+ stosw # and zero
+
+ call *%rdx # call strcat
+
+ call center_and_print # center and print
+
+ #===============================
+ # Middle-Line
+ #===============================
+middle_line:
+ #=========
+ # Load /proc/cpuinfo into buffer
+ #=========
+
+ push %rdx # save call pointer
+
+# push $SYSCALL_OPEN # load 5 [ open() ]
+# pop %rax # in 3 bytes
+
+# mov $cpuinfo,%edi # '/proc/cpuinfo'
+# xor %esi,%esi # 0 = O_RDONLY <bits/fcntl.h>
+# cdq # clear edx in clever way
+# syscall # syscall. fd in eax.
+ # we should check that eax>=0
+
+# mov %eax,%edi # save our fd
+
+# xor %eax,%eax # SYSCALL_READ make== 0
+
+ mov $disk_buffer,%esi
+
+# mov $16,%dh # 4096 is maximum size of proc file #)
+ # we load sneakily by knowing
+ # 16<<8 = 4096. be sure edx clear
+
+# syscall
+
+# push $SYSCALL_CLOSE # close (to be correct)
+# pop %rax
+# syscall
+
+ #=============
+ # Number of CPUs
+ #=============
+number_of_cpus:
+
+ xor %ebx,%ebx # chip count
+
+ # $disk_buffer still in %rsi
+bogo_loop:
+ mov (%rsi), %eax # load 4 bytes into eax
+ inc %esi # increment pointer
+
+ cmp $0,%al # check for end of file
+ je done_bogo
+
+ cmp $('o'<<24+'g'<<16+'o'<<8+'b'),%eax
+ # "bogo" in little-endian
+
+ jne bogo_loop # ! if not equal, keep going
+ add $2,%ebx # otherwise, we have a bogo
+ # 2 times too for future magic
+ jmp bogo_loop
+
+done_bogo:
+ lea one-6(%rbx,%rbx,2), %esi
+ # Load into esi
+ # [one]+(num_cpus*6)
+ #
+ # the above multiplies by three
+ # esi = (ebx+(ebx*2))
+ # and we double-incremented ebx
+ # earlier
+
+ mov %ebp,%edi # move output buffer to edi
+
+ pop %rdx # restore call pointer
+ call *%rdx # copy it (call strcat)
+
+ mov $' ',%al # print a space
+ stosb
+
+ push %rbx
+ push %rdx # store strcat pointer
+
+ #=========
+ # MHz
+ #=========
+print_mhz:
+ mov $('z'<<24+'H'<<16+'M'<<8+' '),%ebx
+ # find ' MHz' and grab up to .
+ # we are little endian
+ mov $'.',%ah
+
+ # below is same as "sub $(strcat-find_string),%edx
+ # gas won't let us force the one-byte constant
+ .byte 0x83,0xEA,strcat-find_string
+
+ call *%rdx # call find string
+
+ mov %ebx,%eax # clever way to get MHz in, sadly
+ ror $8,%eax # not any smaller than a mov
+ stosl
+
+ #=========
+ # Chip Name
+ #=========
+chip_name:
+ mov $('e'<<24+'m'<<16+'a'<<8+'n'),%ebx
+ # find 'name\t: ' and grab up to \n
+ # we are little endian
+ mov $' ',%ah
+ call *%rdx # call find_string
+ stosb
+ call skip_spaces
+
+ pop %rdx
+ pop %rbx # restore chip count
+ pop %rsi
+
+ call *%rdx # ' Processor'
+ cmpb $2,%bl
+ jne print_s
+ inc %rsi # ! if singular, skip the s
+print_s:
+ call *%rdx # 's, '
+
+ push %rsi # restore the values
+ push %rdx
+
+ #========
+ # RAM
+ #========
+
+# push %rdi
+# push $SYSCALL_SYSINFO # sysinfo() syscall
+# pop %rax
+# mov $sysinfo_buff,%edi
+# syscall
+# pop %rdi
+
+ # The following has to be a 64 bit load, to support
+ # Ram > 4GB
+ mov (sysinfo_buff+S_TOTALRAM),%rax # size in bytes of RAM
+ shr $20,%rax # divide by 1024*1024 to get M
+ adc $0, %eax # round
+
+ call num_to_ascii
+
+ pop %rdx # restore strcat pointer
+
+ pop %rsi # print 'M RAM, '
+ call *%rdx # call strcat
+
+ push %rsi
+
+ #========
+ # Bogomips
+ #========
+
+ mov $('s'<<24+'p'<<16+'i'<<8+'m'),%ebx
+ # find 'mips\t: ' and grab up to \n
+ mov $0xa,%ah
+ call find_string
+
+ pop %rsi # bogo total follows RAM
+
+ call *%rdx # call strcat
+
+ push %rsi
+
+ mov %ebp,%ecx # point ecx to out_buffer
+
+ push %rcx
+ call center_and_print # center and print
+
+ #=================================
+ # Print Host Name
+ #=================================
+last_line:
+ mov %ebp,%edi # point to output_buffer
+
+ mov $(uname_info+U_NODENAME),%esi # host name from uname()
+ call *%rdx # call strcat
+
+ pop %rcx # ecx is unchanged
+ call center_and_print # center and print
+
+ pop %rcx # (.txt) pointer to default_colors
+
+ call write_stdout
+
+ #================================
+ # Exit
+ #================================
+exit:
+ push $SYSCALL_EXIT # Put exit syscall in rax
+ pop %rax
+
+ xor %edi,%edi # Make return value $0
+ syscall
+
+
+ #=================================
+ # FIND_STRING
+ #=================================
+ # ah is char to end at
+ # ebx is 4-char ascii string to look for
+ # edi points at output buffer
+
+find_string:
+
+ mov $disk_buffer-1,%esi # look in cpuinfo buffer
+find_loop:
+ inc %esi
+ cmpb $0, (%rsi) # are we at EOF?
+ je done # ! if so, done
+
+ cmp (%rsi), %ebx # do the strings match?
+ jne find_loop # ! if not, loop
+
+ # ! if we get this far, we matched
+
+find_colon:
+ lodsb # repeat till we find colon
+ cmp $0,%al
+ je done
+ cmp $':',%al
+ jne find_colon
+
+skip_spaces:
+ lodsb # skip spaces
+ cmp $0x20,%al # Loser new intel chips have lots??
+ je skip_spaces
+
+store_loop:
+ cmp $0,%al
+ je done
+ cmp %ah,%al # is it end string?
+ je almost_done # ! if so, finish
+ cmp $'\n',%al
+ je almost_done
+ stosb # ! if not store and continue
+ lodsb
+
+ jmp store_loop
+
+almost_done:
+ movb $0, (%rdi) # replace last value with NUL
+done:
+ ret
+
+
+ #================================
+ # strcat
+ #================================
+ # Copies the NUL-terminated string at rsi to rdi, NUL included.
+ # On return rsi points just past the source NUL and edi points at
+ # the NUL that was written, so a following call appends in place.
+ # al is overwritten (it ends up zero).
+
+strcat:
+ lodsb # load a byte from (%rsi), advancing rsi
+ stosb # store the byte to (%rdi), advancing rdi
+ cmp $0,%al # is it zero?
+ jne strcat # ! if not loop
+ dec %edi # back up so edi points at the NUL just stored
+ ret # return
+
+ #==============================
+ # center_and_print
+ #==============================
+ # string to center in ecx
+
+center_and_print:
+ push %rdx # save strcat pointer
+ push %rcx # save the string pointer
+ inc %edi # move to a clear buffer
+ push %rdi # save for later
+
+ mov $('['<<8+27),%ax # we want to output ^[[
+ stosw
+
+ cdq # clear dx
+
+str_loop2: # find end of string
+ inc %edx
+ cmpb $0,(%rcx,%rdx) # repeat till we find zero
+ jne str_loop2
+
+ push $81 # one added to cheat, we don't
+ # count the trailing '\n'
+ pop %rax
+
+ cmp %eax,%edx # see if we are >=80
+ jl not_too_big # ! if so, don't center
+ push $80
+ pop %rdx
+
+not_too_big:
+ sub %edx,%eax # subtract size from 80
+
+ shr %eax # then divide by 2
+
+ call num_to_ascii # print number of spaces
+ mov $'C',%al # tack a 'C' on the end
+ # ah is zero from num_to_ascii
+ stosw # store C and a NULL
+ pop %rcx # pop the pointer to ^[[xC
+
+ call write_stdout # write to the screen
+
+done_center:
+ pop %rcx # restore string pointer
+ # and trickily print the real string
+
+ pop %rdx # restore strcat pointer
+
+ #================================
+ # WRITE_STDOUT
+ #================================
+ # ecx has the address of a NUL-terminated string
+ # The length is found by scanning for the NUL, starting at offset 1.
+ # eax, esi and edi are overwritten; rdx is saved and restored.
+ # NOTE(review): the syscall instruction itself also overwrites rcx
+ # and r11 on real hardware -- callers should not rely on rcx afterwards.
+write_stdout:
+ push %rdx
+ push $SYSCALL_WRITE # put SYSCALL_WRITE (1) in rax
+ pop %rax # in 3 bytes of code
+
+ cdq # clear edx (sign-extends eax, which is 1)
+
+ lea 1(%rdx),%edi # put 1 in edi (fd = stdout)
+ # in 3 bytes of code
+
+ mov %ecx,%esi # buffer pointer is the second syscall argument
+
+str_loop1:
+ inc %edx # edx counts the bytes to write
+ cmpb $0,(%rcx,%rdx) # repeat till zero
+ jne str_loop1
+
+ syscall # run the syscall
+ pop %rdx
+ ret
+
+ ##############################
+ # num_to_ascii
+ ##############################
+ # eax = unsigned value to print (in decimal)
+ # edi points to where we want it; it is advanced past the digits
+ # ebx, ecx and edx are overwritten
+ # On return rax holds the last ASCII digit, so ah is zero.
+
+num_to_ascii:
+ push $10
+ pop %rbx # ebx = 10, the divisor
+ xor %ecx,%ecx # clear ecx
+div_by_10:
+ cdq # clear edx
+ div %ebx # divide
+ push %rdx # save remainder (one digit) for later
+ inc %ecx # add to length counter
+ or %eax,%eax # was Q zero?
+ jnz div_by_10 # ! if not divide again
+
+write_out:
+ pop %rax # restore in reverse order
+ add $0x30, %al # convert to ASCII
+ stosb # save digit
+ loop write_out # loop till done
+ ret
+
+#===========================================================================
+# section .data
+#===========================================================================
+.data
+
+ver_string: .ascii " Version \0"
+compiled_string: .ascii ", Compiled \0"
+processor: .ascii " Processor\0"
+s_comma: .ascii "s, \0"
+ram_comma: .ascii "M RAM, \0"
+bogo_total: .ascii " Bogomips Total\n\0"
+
+default_colors: .ascii "\033[0m\n\n\0"
+
+cpuinfo: .ascii "/proc/cpuinfo\0"
+
+
+one: .ascii "One\0\0\0"
+two: .ascii "Two\0\0\0"
+three: .ascii "Three\0"
+four: .ascii "Four\0"
+
+.include "../logo.lzss_new"
+
+disk_buffer:
+.ascii "processor : 0\n"
+.ascii "vendor_id : GenuineIntel\n"
+.ascii "cpu family : 15\n"
+.ascii "model : 6\n"
+.ascii "model name : Intel(R) Xeon(TM) CPU 3.46GHz\n"
+.ascii "stepping : 4\n"
+.ascii "cpu MHz : 3200.000\n"
+.ascii "cache size : 2048 KB\n"
+.ascii "physical id : 0\n"
+.ascii "siblings : 2\n"
+.ascii "core id : 0\n"
+.ascii "cpu cores : 2\n"
+.ascii "apicid : 0\n"
+.ascii "initial apicid : 0\n"
+.ascii "fpu : yes\n"
+.ascii "fpu_exception : yes\n"
+.ascii "cpuid level : 6\n"
+.ascii "wp : yes\n"
+.ascii "flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc pebs bts pni dtes64 monitor ds_cpl vmx est cid cx16 xtpr pdcm lahf_lm tpr_shadow\n"
+.ascii "bogomips : 6934.38\n"
+.ascii "clflush size : 64\n"
+.ascii "cache_alignment : 128\n"
+.ascii "address sizes : 36 bits physical, 48 bits virtual\n"
+.ascii "power management:\n"
+.ascii "\n"
+.ascii "processor : 1\n"
+.ascii "vendor_id : GenuineIntel\n"
+.ascii "cpu family : 15\n"
+.ascii "model : 6\n"
+.ascii "model name : Intel(R) Xeon(TM) CPU 3.46GHz\n"
+.ascii "stepping : 4\n"
+.ascii "cpu MHz : 3200.000\n"
+.ascii "cache size : 2048 KB\n"
+.ascii "physical id : 1\n"
+.ascii "siblings : 2\n"
+.ascii "core id : 0\n"
+.ascii "cpu cores : 2\n"
+.ascii "apicid : 4\n"
+.ascii "initial apicid : 4\n"
+.ascii "fpu : yes\n"
+.ascii "fpu_exception : yes\n"
+.ascii "cpuid level : 6\n"
+.ascii "wp : yes\n"
+.ascii "flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc pebs bts pni dtes64 monitor ds_cpl vmx est cid cx16 xtpr pdcm lahf_lm tpr_shadow\n"
+.ascii "bogomips : 6934.13\n"
+.ascii "clflush size : 64\n"
+.ascii "cache_alignment : 128\n"
+.ascii "address sizes : 36 bits physical, 48 bits virtual\n"
+.ascii "power management:\n\0"
+
+uname_info:
+.ascii "Linux\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "domori\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "2.6.29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "#1 SMP Mon May 4 09:51:54 EDT 2009\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+sysinfo_buff:
+.long 0,0,0,0,0,0,0,0,2048*1024*1024,0,0,0,0,0,0,0
+
+
+#============================================================================
+# section .bss
+#============================================================================
+.bss
+
+.lcomm text_buf, (N+F-1)
+.lcomm out_buffer,16384
--- /dev/null
+T:1:10 :7:10 :5:38 :2:44 :8:65 :9:662 :4:119 :6:2 :3:51
+T:7:5 :5:16 :2:18 :8:52 :9:858 :4:35 :6:1 :3:15
+T:7:5 :5:16 :2:18 :8:52 :9:858 :4:35 :6:1 :3:15
+T:7:5 :5:14 :2:16 :8:91 :9:863 :4:7 :6:1 :3:3
+T:7:5 :5:12 :2:14 :8:78 :9:880 :4:7 :6:1 :3:3
+T:7:5 :5:6 :2:8 :8:52 :9:928 :6:1
+T:7:5 :5:10 :2:10 :8:65 :9:909 :6:1
+T:7:5 :5:14 :2:18 :8:117 :9:845 :6:1
+T:5:8 :2:8 :8:52 :9:932
+T:7:5 :5:8 :2:10 :8:65 :9:911 :6:1
+T:5:8 :2:8 :8:52 :9:932
+T:7:5 :5:6 :2:8 :8:52 :9:928 :6:1
+T:5:6 :2:6 :8:39 :9:949
+T:7:5 :5:6 :2:8 :8:52 :9:928 :6:1
+T:5:4 :2:4 :8:26 :9:966
+T:7:5 :5:12 :2:14 :8:78 :9:880 :4:7 :6:1 :3:3
+T:5:6 :2:6 :8:39 :9:949
+T:7:5 :5:8 :2:10 :8:65 :9:911 :6:1
+T:7:5 :5:14 :2:16 :8:91 :9:863 :4:7 :6:1 :3:3
+T:5:8 :2:8 :8:52 :9:932
+T:7:5 :5:10 :2:12 :8:78 :9:894 :6:1
+T:7:5 :5:10 :2:12 :8:74 :9:898 :6:1
+T:5:12 :2:12 :8:82 :9:894
+T:7:5 :5:8 :2:8 :8:39 :9:390 :4:7 :6:1 :3:3 :10:3 :11:9 :12:527
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:15:5 :18:2 :19:3 :20:2 :21:3 :22:4 :16:281 :17:10 :12:687 :13:1 :14:2
+T:23:1 :32:7 :34:351 :33:176 :16:3 :17:2 :24:10 :25:195 :26:4 :27:3 :30:4 :31:11 :11:9 :12:204 :13:2 :14:4 :28:9 :29:5
+T:34:666 :33:334
+T:34:667 :33:333
+T:34:665 :33:333 :35:2
+T:34:667 :33:333
+T:34:667 :33:333
+T:34:666 :33:334
+T:34:666 :33:332 :35:2
+T:34:357 :33:178 :36:4 :37:8 :38:4 :40:258 :39:173 :16:16 :17:2
+T:49:6 :50:2 :51:4 :52:2 :53:1 :54:6 :56:3 :38:4 :40:333 :39:225 :41:39 :42:26 :43:15 :44:46 :45:46 :46:40 :47:60 :48:6 :16:88 :17:4 :28:9 :55:18 :29:17
+T:57:4 :38:4 :40:591 :39:395 :16:4 :17:2
+T:40:600 :39:400
+T:58:2 :59:4 :40:453 :39:303 :41:18 :42:12 :43:6 :44:16 :45:16 :46:14 :47:21 :48:2 :16:68 :17:2 :24:10 :25:53
+
+
+# Thread 1
+# Total intervals: 45 (Interval Size 1000)
+# Total instructions: 45639
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 45 (Interval Size 1000)
+# Total instructions: 45639
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+\e[0;1;37;47m#################################################################\e[0;30;47m#####\e[1;37m#########\e[1;37;40m
+\e[0;1;37;47m################################################################\e[0;30;47m#######\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m###################\e[31m#\e[37m############################################\e[0;30;47m##\e[1;37mO\e[0;30;47m#\e[1;37mO\e[0;30;47m##\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m##\e[0;30;47m######\e[1;37m##########\e[31m##\e[0;30;47m#\e[1;37m###########################################\e[0;30;47m#\e[1;33m#####\e[0;30;47m#\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#############\e[0;30;47m#\e[1;37m##########################################\e[0;30;47m##\e[1;37m##\e[33m###\e[37m##\e[0;30;47m##\e[1;37m######\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#########\e[31m###\e[37m###\e[0;30;47m###\e[1;37m#\e[0;30;47m####\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m###\e[1;37m##\e[0;30;47m#####\e[1;37m#\e[0;30;47m######\e[1;37m#####\e[0;30;47m#\e[1;37m##########\e[0;30;47m##\e[1;37m#####\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m########\e[31m#\e[37m##\e[31m#\e[0;30;47m#\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m####\e[0;30;47m##\e[1;37m###\e[0;30;47m##\e[1;37m#######\e[0;30;47m#\e[1;37m############\e[0;30;47m##\e[1;37m####\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#######\e[31m#\e[37m###\e[31m#\e[0;30;47m#\e[1;37m###\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m######\e[0;30;47m###\e[1;37m#########\e[0;30;47m#\e[1;37m############\e[0;30;47m###\e[1;37m###\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m##########\e[31m##\e[0;30;47m#\e[1;37m###\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m######\e[0;30;47m###\e[1;37m########\e[33m##\e[0;30;47m#\e[1;37m###########\e[0;30;47m##\e[1;33m#\e[37m###\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#######\e[0;30;47m#\e[1;37m#\e[31m##\e[0;30;47m#\e[1;37m####\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m#\e[0;30;47m##\e[1;37m#####\e[33m######\e[0;30;47m#\e[1;37m#######\e[30m#\e[33m######\e[37m#\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m######\e[0;30;47m##\e[1;37m#\e[31m##\e[0;30;47m#\e[1;37m#\e[0;30;47m#\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m###\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m##\e[1;37m###\e[0;30;47m##\e[1;37m####\e[33m#######\e[0;30;47m#\e[1;37m#####\e[0;30;47m#\e[1;33m#######\e[37m#\e[1;37;40m
+\e[0;1;37;47m##\e[0;30;47m############\e[1;37m##\e[0;30;47m###\e[1;37m##\e[0;30;47m####\e[1;37m###\e[0;30;47m####\e[1;37m###\e[0;30;47m####\e[1;37m#\e[0;30;47m###\e[1;37m#\e[0;30;47m#####\e[1;37m#\e[0;30;47m######\e[1;37m###\e[33m#####\e[30m#\e[0;30;47m#####\e[1m#\e[33m#####\e[37m###\e[1;37;40m
+
+\e[7CLinux Version 2.6.29, Compiled #1 SMP Mon May 4 09:51:54 EDT 2009
+\e[2CTwo 3200MHz Intel(R) Xeon(TM) Processors, 2048M RAM, 6934.38 Bogomips Total
+\e[37Cdomori\e[0m
+
--- /dev/null
+prog: ll
+vgopts: --interval-size=1000 --bb-out-file=ll.out.bb
+post: cat ll.out.bb
+cleanup: rm ll.out.bb
+
--- /dev/null
+
+ # count for 1 million instructions
+ # total is 2 + 1 + 499997*2 + 3
+
+ .globl _start
+_start:
+ xor %rcx,%rcx # not needed, pads total to 1M
+ xor %rax,%rax # not needed, pads total to 1M
+
+ mov $499997,%rcx # load counter
+test_loop:
+ dec %rcx # repeat count times
+ jnz test_loop
+
+ #================================
+ # Exit
+ #================================
+exit:
+ xor %rdi,%rdi # we return 0
+ mov $60,%rax # put exit syscall number (60) in rax
+ syscall
+
--- /dev/null
+T:1:5 :2:99996
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+
+
+# Thread 1
+# Total intervals: 10 (Interval Size 100000)
+# Total instructions: 1000000
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
+F:1:400078:
+F:2:400085:
+F:3:40008a:
--- /dev/null
+# Thread 1
+# Total intervals: 10 (Interval Size 100000)
+# Total instructions: 1000000
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+prog: million
+vgopts: --interval-size=100000 --bb-out-file=million.out.bb --pc-out-file=million.out.pc
+post: cat million.out.bb million.out.pc
+cleanup: rm million.out.bb million.out.pc
+
--- /dev/null
+#
+# rep, repe (repz) and repne (repnz) prefixed string instructions
+# only count as one instruction, even though they repeat many times.
+# This test makes sure the bbv plugin counts these instructions properly.
+# The answer was validated against hardware performance counters.
+#
+
+ .globl _start
+_start:
+ cld # we want these to happen forward
+
+
+ #===============================================
+ # Some SSE2 instructions start with 0xf2 or 0xf3
+ # Check for them, to make sure our rep detection
+ # handles things properly.
+ # We should check this on x86 too, but then we'd
+ # have to check for SSE2 capability somehow?
+ #===============================================
+false_positives:
+
+ movdqu %xmm1,%xmm2 # encoding starts with 0xf3, not a rep
+ movdqu %xmm2,%xmm1 # encoding starts with 0xf3, not a rep
+ addsd %xmm1,%xmm2 # encoding starts with 0xf2, not a repne
+ pause # encoded as 0xf3 0x90, not a rep
+
+ #===================================
+ # Check varied order of the size prefix
+ # with the rep prefix. Older binutils
+ # did this one way, newer binutils the other
+ #===================================
+
+size_prefix:
+ # test 16-bit load, hand-encoded so that both prefix orders are covered
+
+ mov $8192, %rcx # repeat count
+ mov $buffer1, %rsi # set source
+ .byte 0x66, 0xf3, 0xad # rep lodsw: size prefix (0x66) before rep (0xf3)
+
+ mov $8192, %rcx # repeat count
+ mov $buffer1, %rsi # set source
+ .byte 0xf3, 0x66, 0xad # rep lodsw: rep (0xf3) before size prefix (0x66)
+
+
+
+
+ #===================================
+ # Load and Store Instructions (rep stos / rep lods at 8, 16, 32, 64 bits)
+ #===================================
+loadstore:
+ xor %rax, %rax # clear all of rax
+ mov $0xd, %al # set al to 0xd
+
+ # test 8-bit store
+
+ mov $16384, %rcx
+ mov $buffer1, %rdi # set destination
+ rep stosb # store 0xd 16384 times, auto-increment
+
+ # test 8-bit load
+
+ mov $16384, %rcx
+ mov $buffer1, %rsi # set source
+ rep lodsb # load byte 16384 times, auto-increment
+
+ cmp $0xd,%al # if we loaded wrong value
+ jne print_error # print an error
+
+ # test 16-bit store
+
+ mov $0x020d,%ax # store 0x020d
+
+ mov $8192, %rcx
+ mov $buffer1, %rdi # set destination
+ rep stosw # store 8192 times, auto-increment
+
+ # test 16-bit load
+
+ mov $8192, %rcx
+ mov $buffer1, %rsi # set source
+ rep lodsw # load 8192 times, auto-increment
+
+ cmp $0x020d,%ax # if we loaded wrong value
+ jne print_error # print an error
+
+ # test 32-bit store
+
+ mov $0x0feb1378,%eax # store 0x0feb1378
+
+ mov $4096, %rcx
+ mov $buffer1, %rdi # set destination
+ rep stosl # store 4096 times, auto-increment
+
+ # test 32-bit load
+
+ mov $4096, %rcx
+ mov $buffer1, %rsi # set source
+ rep lodsl # load 4096 times, auto-increment
+
+ cmp $0x0feb1378,%eax # if we loaded wrong value
+ jne print_error # print an error
+
+ # test 64-bit store
+
+ mov $0xfeb131978a5a5a5a,%rax # 64-bit pattern to store
+
+ mov $2048, %rcx
+ mov $buffer1, %rdi # set destination
+ rep stosq # store 2048 times, auto-increment
+
+ # test 64-bit load
+
+ mov $2048, %rcx
+ mov $buffer1, %rsi # set source
+ rep lodsq # load 2048 times, auto-increment
+
+ cmp $0x8a5a5a5a,%eax # only the low 32 bits of rax are compared
+ # !if we loaded wrong value
+ jne print_error # print an error
+
+
+ #=============================
+ # Move instructions (rep movs between buffer1 and buffer2)
+ #=============================
+moves:
+ # test 8-bit move
+
+ mov $16384, %rcx # 16384 bytes
+ mov $buffer1, %rsi # source
+ mov $buffer2, %rdi # destination
+ rep movsb
+
+ # test 16-bit move (this one copies buffer2 back to buffer1)
+
+ mov $8192, %rcx # 8192 words
+ mov $buffer2, %rsi # source
+ mov $buffer1, %rdi # destination
+ rep movsw
+
+ # test 32-bit move
+
+ mov $4096, %rcx # 4096 longs
+ mov $buffer1, %rsi # source
+ mov $buffer2, %rdi # destination
+ rep movsl
+
+ # test 64-bit move
+
+ mov $2048, %rcx # 2048 quads
+ mov $buffer1, %rsi # source
+ mov $buffer2, %rdi # destination
+ rep movsq
+
+
+ #==================================
+ # Compare equal instructions (repe cmps on two identical buffers)
+ #==================================
+compare_equal:
+ # first set up the areas to compare: both buffers get 0xa5 bytes
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer1, %rdi
+ mov $4096, %rcx
+ rep stosl
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer2, %rdi
+ mov $4096, %rcx
+ rep stosl
+
+
+ # test 8-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $16384, %rcx
+ repe cmpsb # repeats while the elements compare equal
+ jnz print_error # ZF clear means a mismatch was seen
+
+ # test 16-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $8192, %rcx
+ repe cmpsw
+ jnz print_error
+
+ # test 32-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $4096, %rcx
+ repe cmpsl
+ jnz print_error
+
+ # test 64-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $2048, %rcx
+ repe cmpsq
+ jnz print_error
+
+
+
+ #==================================
+ # Compare not equal instructions
+ # The repne cmps instructions themselves are disabled (see FIXMEs);
+ # only the register setup around them still executes.
+ #==================================
+compare_noteq:
+ # change second buffer so that it differs from buffer1 everywhere
+
+ mov $0x5a5a5a5a,%eax
+ mov $buffer2, %rdi
+ mov $4096, %rcx
+ rep stosl
+
+ # test 8-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $16384, %rcx
+# repne cmpsb FIXME! Not implemented in valgrind
+# je print_error
+
+ # test 16-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $8192, %rcx
+# repne cmpsw FIXME! Not implemented in valgrind
+# je print_error
+
+ # test 32-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $4096, %rcx
+# repne cmpsl FIXME! Not implemented in valgrind
+# je print_error
+
+ # test 64-bit
+
+ mov $buffer1,%rsi
+ mov $buffer2,%rdi
+ mov $2048, %rcx
+# repne cmpsq FIXME! Not implemented in valgrind
+# je print_error
+
+ #====================================
+ # Check scan equal instruction (repe scas over buffer1, all 0xa5 bytes)
+ #====================================
+scan_eq:
+ # test 8-bit
+
+ mov $0xa5,%al # value to scan for
+ mov $buffer1,%rdi
+ mov $16384, %rcx
+ repe scasb # repeats while each element equals al
+ jnz print_error # ZF clear means a differing element was seen
+
+ # test 16-bit
+
+ mov $0xa5a5,%ax
+ mov $buffer1,%rdi
+ mov $8192, %rcx
+ repe scasw
+ jnz print_error
+
+ # test 32-bit
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer1,%rdi
+ mov $4096, %rcx
+ repe scasl
+ jnz print_error
+
+ # test 64-bit
+
+ mov $0xa5a5a5a5a5a5a5a5,%rax
+ mov $buffer1,%rdi
+ mov $2048, %rcx
+ repe scasq
+ jnz print_error
+
+
+ #====================================
+ # Check scan not-equal instruction (repne scas over buffer2, all 0x5a bytes)
+ #====================================
+
+ # test 8-bit
+scan_ne:
+ mov $0xa5,%al # value that must not be found
+ mov $buffer2,%rdi
+ mov $16384, %rcx
+ repne scasb # repeats while each element differs from al
+ jz print_error # ZF set means the value was found
+
+ # test 16-bit
+
+ mov $0xa5a5,%ax
+ mov $buffer2,%rdi
+ mov $8192, %rcx
+ repne scasw
+ jz print_error
+
+ # test 32-bit
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer2,%rdi
+ mov $4096, %rcx
+ repne scasl
+ jz print_error
+
+ # test 64-bit
+
+ mov $0xa5a5a5a5a5a5a5a5,%rax
+ mov $buffer2,%rdi
+ mov $2048, %rcx
+ repne scasq
+ jz print_error
+
+ jmp exit # no error, skip to exit
+
+print_error: # reached from any failed check; falls through to exit (status 0)
+
+ mov $1, %rax # Write syscall
+ mov $1, %rdi # print to stdout
+ mov $error_string, %rsi # string to print
+ mov $16, %edx # strlen of error_string (without the trailing NUL)
+ syscall # call syscall
+
+ #================================
+ # Exit (also reached by fall-through from print_error)
+ #================================
+exit:
+ mov $60,%rax # exit syscall number (60)
+ xor %rdi,%rdi # we return 0
+ syscall # and exit
+
+
+.data
+error_string: .asciz "Error detected!\n"
+
+.bss
+
+.lcomm buffer1, 16384
+.lcomm buffer2, 16384
--- /dev/null
+# Thread 1
+# Total intervals: 0 (Interval Size 100000)
+# Total instructions: 152
+# Total reps: 165917
+# Unique reps: 29
+# Total fldcw instructions: 0
--- /dev/null
+prog: rep_prefix
+vgopts: --interval-size=100000 --bb-out-file=rep_prefix.out.bb
+cleanup: rm rep_prefix.out.bb
+
--- /dev/null
+#! /bin/sh
+
+# Replace every ":<digits>:" field with a single space. This drops the
+# basic block numbers while keeping the counts; the block numbers are
+# non-deterministic on a multi-threaded benchmark.
+
+dir=$(dirname $0)
+
+$dir/../../tests/filter_stderr_basic | sed 's/:[0-9]*:/ /g'
--- /dev/null
+#! /bin/sh
+
+# Keep only the lines that begin with "#":
+#   first script  - delete lines whose first character is not "#"
+#   second script - delete empty lines
+
+dir=$(dirname $0)
+
+$dir/../../tests/filter_stderr_basic | sed -e '/^[^#]/d' -e '/^$/d'
+
+
--- /dev/null
+.equ FREQUENT_CHAR,0 # byte the logo was compressed against (NUL)
+.equ N,1024 # text_buf index wraps with (N-1); presumably the LZSS window size
+.equ F,64 # initial text_buf write index is N-F; presumably the max match length - confirm
+.equ THRESHOLD,2 # match length = (word >> P_BITS) + THRESHOLD + 1
+.equ P_BITS,10 # number of low bits holding the match position
+.equ POSITION_MASK,3 # mask for the high byte of the match position
--- /dev/null
+logo:
+ .byte 255,27,91,48,59,49,59,51,55
+ .byte 159,59,52,55,109,35,204,247,192,7,51
+ .byte 141,48,200,27,27,91,196,7,203,31,28,12,59
+ .byte 15,52,48,109,10,192,247,1,96,26,56,44,156
+ .byte 31,27,91,51,49,109,204,4,65,172,13,36
+ .byte 2,28,16,79,13,32,16,65,147,152,131,52,28,52,204,16
+ .byte 16,12,36,111,57,236,167,28,8,51,22,20,137,85,44,96
+ .byte 0,43,97,214,113,226,200,203,8,212,9,211,16,43,89,245,209
+ .byte 0,128,17,210,24,13,40,28,20,13,44,28,28,240,74,26,91
+ .byte 0,13,80,95,101,135,101,43,85,245,205,205,40,205,20,137,65
+ .byte 0,29,135,66,75,114,83,28,120,15,98,135,109,85,88,247,193
+ .byte 0,232,43,244,151,73,120,61,176,27,95,151,176,18,43,171,202
+ .byte 16,223,22,26,245,90,245,217,63,51,27,86,146,91,176,2
+ .byte 0,12,29,211,200,172,57,23,102,50,246,110,109,236,68,96,94
+ .byte 8,175,10,166,105,20,1,48,51,11,222,31,49,15,211,188
+ .byte 0,175,79,25,86,170,69,82,219,40,82,70,127,8,83,219,35
+ .byte 0,169,85,170,53,24,33,18,104,145,42,200,34,178,104,112,45
+ .byte 0,198,80,178,121,145,74,112,49,248,81,243,40,221,23,255,23
+ .byte 8,2,54,3,36,229,66,10
+logo_end:
--- /dev/null
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+# Test programs, each built from a single assembly source (see *_SOURCES).
+check_PROGRAMS = \
+ million ll
+
+EXTRA_DIST = \
+ ll.stderr.exp \
+ ll.stdout.exp \
+ ll.post.exp \
+ ll.vgtest \
+ million.stderr.exp \
+ million.post.exp \
+ million.vgtest
+
+AM_CCASFLAGS += -ffreestanding
+
+# The sources define _start themselves, so link without the C startup
+# files and without the default libraries.
+LDFLAGS += -nostartfiles -nodefaultlibs
+
+ll_SOURCES = ll.S
+million_SOURCES = million.S
--- /dev/null
+#! /bin/sh
+
+../filter_stderr
+
+
--- /dev/null
+#
+# linux_logo in ppc assembly language
+# based on the code from ll_asm-0.36
+#
+# By Vince Weaver <vince _at_ deater.net>
+#
+# Modified to remove non-deterministic system calls
+# And to avoid reading from /proc
+#
+
+# offsets into uname_info (laid out like the uname syscall result,
+# one field every 65 bytes)
+.equ U_SYSNAME,0
+.equ U_NODENAME,65
+.equ U_RELEASE,65*2
+.equ U_VERSION,(65*3)
+.equ U_MACHINE,(65*4)
+.equ U_DOMAINNAME,65*5
+
+# offset of the total-RAM word inside sysinfo_buff
+.equ S_TOTALRAM,16
+
+# Syscall numbers (the commented-out ones are no longer used)
+.equ SYSCALL_EXIT, 1
+#.equ SYSCALL_READ, 3
+.equ SYSCALL_WRITE, 4
+#.equ SYSCALL_OPEN, 5
+#.equ SYSCALL_CLOSE, 6
+#.equ SYSCALL_SYSINFO,116
+#.equ SYSCALL_UNAME, 122
+
+# standard file descriptors
+.equ STDIN, 0
+.equ STDOUT,1
+.equ STDERR,2
+
+.equ BSS_BEGIN,25 # register number: r25 is loaded with bss_begin in _start
+.equ DATA_BEGIN,26 # register number: r26 is loaded with data_begin in _start
+
+.include "../logo.include"
+
+ .globl _start
+_start:
+
+ #========================
+ # Initialization
+ #========================
+
+
+# eieio # coolest opcode of all time ;)
+ # not needed, but I had to put it here
+ # the hack loading BSS_BEGIN and DATA_BEGIN
+ # saves one instruction on any future load from memory
+ # as we can just do an addi rather than an lis;addi
+
+ lis 25,bss_begin@ha # r25 (BSS_BEGIN) = address of bss_begin
+ addi 25,25,bss_begin@l
+
+ lis 26,data_begin@ha # r26 (DATA_BEGIN) = address of data_begin
+ addi 26,26,data_begin@l
+
+ addi 14,BSS_BEGIN,(out_buffer-bss_begin)
+ # the output buffer
+
+ addi 21,BSS_BEGIN,(text_buf-bss_begin) # r21 = text_buf
+
+
+ mr 17,14 # store out-buffer for later
+
+ #=========================
+ # PRINT LOGO
+ #=========================
+
+# LZSS decompression algorithm implementation
+# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
+# optimized some more by Vince Weaver
+
+
+ li 8,(N-F) # grab "R", the write index into text_buf
+
+ addi 9,DATA_BEGIN,(logo-data_begin)-1
+ # logo_pointer (-1 because lbzu pre-increments)
+
+ addi 12,DATA_BEGIN,(logo_end-data_begin)-1
+ # end of the logo
+
+
+ mr 16,17 # r16 = output pointer, advanced by stbu below
+
+decompression_loop:
+ lbzu 10,1(9) # load in a byte of flags
+ # auto-update
+ mr 11,10 # copy to 11
+ ori 11,11,0xff00 # re-load top as a hackish
+ # 8-bit counter
+
+test_flags:
+ cmpw 0,12,9 # have we reached the end?
+ ble done_logo # ! if so exit
+
+ andi. 13,11,0x1 # test the lowest flag bit
+ srawi 11,11,1 # shift flags and counter right by one
+
+ bne 0,discrete_char # flag bit set: next byte is a literal
+
+offset_length:
+ lbzu 10,1(9) # low byte of the 16-bit match word
+ lbzu 24,1(9) # high byte of the 16-bit match word
+ slwi 24,24,8
+ or 24,24,10 # r24 = (high << 8) | low
+
+ mr 10,24
+
+ srawi 15,10,P_BITS
+ addi 15,15,THRESHOLD+1 # r15 = (word >> P_BITS) + THRESHOLD + 1
+ # = match length
+
+output_loop:
+ andi. 24,24,(POSITION_MASK<<8+0xff) # mask it; gas binds << tighter than +
+ lbzx 10,21,24 # load byte from text_buf[r24]
+ addi 24,24,1 # advance position in text_buf
+
+store_byte:
+ stbu 10,1(16) # store to the output buffer, pre-increment
+
+ stbx 10,21,8 # also store into text_buf[R]
+ addi 8,8,1
+ andi. 8,8,(N-1) # wrap R
+
+ addic. 15,15,-1 # one byte of this match done
+ bne 0,output_loop
+
+ andi. 13,11,0xff00 # any counter bits left in the top byte?
+ bne test_flags # yes: test the next flag bit
+
+ b decompression_loop # no: fetch a new flag byte
+
+discrete_char:
+
+ lbzu 10,1(9) # literal byte
+ li 15,1 # length of one
+
+ b store_byte
+
+done_logo:
+
+ addi 4,17,1 # output starts at r17+1 because stbu pre-increments
+ bl write_stdout # and print the logo
+
+
+ #==========================
+ # First Line
+ #==========================
+
+
+ #==========================
+ # PRINT VERSION (from the static uname_info data, no syscall)
+ #==========================
+
+# li 0,SYSCALL_UNAME # uname syscall
+# addi 3,BSS_BEGIN,(uname_info-bss_begin)
+ # uname struct
+# sc # do syscall
+
+
+ addi 16,DATA_BEGIN,(uname_info-data_begin)+U_SYSNAME@l-1
+ # os-name from uname "Linux" (-1: strcat pre-increments)
+ bl strcat
+
+ addi 16,DATA_BEGIN,(ver_string-data_begin)-1
+ # source is " Version "
+ bl strcat
+
+ addi 16,DATA_BEGIN,(uname_info-data_begin)+U_RELEASE@l-1
+ # release from uname "2.6.29"
+ bl strcat
+
+ addi 16,DATA_BEGIN,(compiled_string-data_begin)-1
+ # source is ", Compiled "
+ bl strcat
+
+ addi 16,DATA_BEGIN,(uname_info-data_begin)+U_VERSION-1
+ # compiled date
+ bl strcat
+
+ bl center_and_print # write it to screen
+
+
+ #===============================
+ # Middle-Line
+ #===============================
+
+ #=========
+ # Load /proc/cpuinfo into buffer
+ #=========
+
+# li 0,SYSCALL_OPEN # open()
+# addi 3,DATA_BEGIN,(cpuinfo-data_begin)
+ # '/proc/cpuinfo'
+# li 4,0 # O_RDONLY <bits/fcntl.h>
+# sc # syscall. fd in r0.
+ # we should check that r0>=0
+
+# mr 13,3 # save fd in r13
+
+# li 0,SYSCALL_READ # read
+# addi 4,BSS_BEGIN,(disk_buffer-bss_begin)
+# li 5,4096 # 4096 is maximum size of proc file ;)
+# sc
+
+# mr 3,13 # restore fd
+# li 0,6 # close
+# sc
+
+ #=============
+ # Number of CPUs
+ #=============
+
+ mr 14,17 # point output to out_buf
+
+ # Assume 1 CPU for now
+ # my iBook's /proc/cpuinfo does not have a "processor" line ???
+
+ addi 16,DATA_BEGIN,(one-data_begin)-1
+ bl strcat # append "One "
+
+ #=========
+ # MHz
+ #=========
+
+ lis 20,('l'<<8)+'o' # find 'lock' and grab up to M
+ addi 20,20,('c'<<8)+'k'
+ li 23,'M'
+ bl find_string
+
+ addi 16,DATA_BEGIN,(megahertz-data_begin)-1
+ # append 'MHz PPC '
+ bl strcat
+
+
+ #=========
+ # Chip Name
+ #=========
+
+ lis 20,('c'<<8)+'p' # find 'cpu\t' and grab up to \n
+ addi 20,20,('u'<<8)+'\t'
+ li 23,'\n'
+ bl find_string
+
+ addi 16,DATA_BEGIN,(comma-data_begin)-1
+ # append ', '
+ bl strcat
+
+ #========
+ # RAM
+ #========
+
+# li 0,SYSCALL_SYSINFO # sysinfo() syscall
+# addi 3,BSS_BEGIN,(sysinfo_buff-bss_begin)
+ # sysinfo_buffer
+
+# sc
+
+ lwz 4,(sysinfo_buff+S_TOTALRAM-data_begin)(DATA_BEGIN)
+ # load bytes of RAM into r4 (from static sysinfo_buff)
+
+ srawi 4,4,20 # divide by 2^20 to get MB
+ li 5,0 # r5 = 0: num_to_ascii appends via strcat
+
+ bl num_to_ascii
+
+ addi 16,DATA_BEGIN,(ram_comma-data_begin)-1
+ # append 'M RAM, '
+
+ bl strcat
+
+ #========
+ # Bogomips
+ #========
+
+ lis 20,('m'<<8)+'i' # find 'mips' and grab up to \n
+ addi 20,20,('p'<<8)+'s'
+ li 23,'\n'
+ bl find_string
+
+ addi 16,DATA_BEGIN,(bogo_total-data_begin)-1
+ # append " Bogomips Total"
+ bl strcat
+
+ bl center_and_print # center it
+
+
+ #=================================
+ # Print Host Name (nodename field of the static uname_info data)
+ #=================================
+
+ mr 14,17 # restore out buffer
+
+ addi 16,DATA_BEGIN,((uname_info-data_begin)+U_NODENAME)-1
+ # hostname (-1: strcat pre-increments)
+
+ bl strcat
+
+ bl center_and_print
+
+ #================================
+ # Exit
+ #================================
+exit:
+ li 3,0 # 0 exit value
+ li 0,SYSCALL_EXIT # put the exit syscall number in r0
+ sc # and exit
+
+
+
+
+ #=================================
+ # FIND_STRING
+ #=================================
+ # r23 is char to end at
+ # r20 is the 4-char ascii string to look for
+ # r14 points at output buffer
+ # r13,r16,r21 trashed
+
+find_string:
+
+ addi 16,DATA_BEGIN,(disk_buffer-data_begin)-1
+ # look in cpuinfo buffer
+ # -1 so we can use the pre-incrementing loads
+
+find_loop:
+ lwzu 13,1(16) # load in 32 bits, advancing one byte at a time
+ cmpwi 13,0 # ! if the whole word is zero, we are done
+ beq done
+ cmpw 13,20 # compare with our 4 char string
+ bne find_loop # ! if no match, keep looping
+
+
+ # ! if we get this far, we matched
+
+ li 21,':'
+find_colon:
+ lbzu 13,1(16) # repeat till we find colon
+ cmpwi 13,0 # give up at a NUL byte
+ beq done
+ cmpw 13,21
+ bne find_colon
+
+ addi 16,16,1 # skip a char [should be space]
+
+store_loop:
+ lbzu 13,1(16) # next source byte
+ cmpwi 13,0 # give up at a NUL byte
+ beq done
+ cmpw 13,23 # is it end string?
+ beq almost_done # ! if so, finish
+ stbu 13,1(14) # ! if not store and continue
+ b store_loop
+
+almost_done:
+ li 13,0 # replace last value with null
+ stb 13,1(14) # NUL goes after the last byte; r14 is not advanced
+
+done:
+ blr
+
+ #================================
+ # strcat
+ #================================
+ # r13 = "temp"
+ # r16 = "source" (one byte before the first byte to copy)
+ # r14 = "destination" (one byte before the first byte to write)
+strcat:
+ lbzu 13,1(16) # load a byte from [r16+1], pre-increment
+ stbu 13,1(14) # store a byte to [r14+1], pre-increment
+ cmpwi 13,0 # is it zero?
+ bne strcat # ! if not loop
+ subi 14,14,1 # point to one less than null
+ blr # return
+
+ #==============================
+ # center_and_print
+ #==============================
+ # r14 is end of buffer
+ # r17 is start of buffer
+ # r29 = saved link register
+ # r4-r10, r16, r19-r23, r30 trashed
+
+center_and_print:
+
+ mflr 29 # back up return address
+
+ subf 5,17,14 # see how long the output
+ # buffer is
+
+ cmpwi 5,80 # see if we are >80
+ bgt done_center # ! if so, bail
+
+ li 4,80 # 80 column screen
+ subf 4,5,4 # subtract strlen
+ srawi 23,4,1 # divide by two
+
+ lis 4,escape@ha
+ addi 4,4,escape@l
+ bl write_stdout # print the escape sequence start
+
+ mr 4,23 # number of columns to move right
+ li 5,1 # print to stdout
+ bl num_to_ascii # print number
+
+ lis 4,c@ha
+ addi 4,4,c@l
+ bl write_stdout # print "C" to finish the escape sequence
+
+
+done_center:
+
+ addi 4,17,1 # move string to output+1
+ bl write_stdout # call write stdout
+
+ lis 4,linefeed@ha
+ addi 4,4,linefeed@l
+
+ mtlr 29 # restore link register
+ # and fall through so that write_stdout
+ # returns for us
+
+
+
+ #================================
+ # WRITE_STDOUT
+ #================================
+ # r4 has string (NUL terminated)
+ # r0,r3,r4,r5,r6 trashed
+
+write_stdout:
+ li 0,SYSCALL_WRITE # write syscall
+ li 3,STDOUT # stdout
+
+ li 5,0 # string length counter
+strlen_loop:
+ lbzx 6,4,5 # get byte from (r4+r5)
+ addi 5,5,1 # increment counter
+ cmpi 0,6,0 # is it zero?
+ bne strlen_loop # ! if not keep counting
+ addi 5,5,-1 # do not count the terminating NUL
+ sc # syscall
+
+ blr # return
+
+
+ ##############################
+ # Num to Ascii
+ ##############################
+ # num is in r4
+ # r5 =0 then strcat, otherwise stdout
+ # r5-r10,r16,r19,r20,r21,r22,r30 trashed
+
+num_to_ascii:
+
+ mflr 30 # save the link register
+
+ addi 16,BSS_BEGIN,(num_to_ascii_end-bss_begin)
+ # the end of a backwards growing
+ # 10 byte long buffer.
+
+ li 20,10 # we will divide by 10
+ mr 19,4 # load in the value passed
+
+div_by_10:
+ divw 21,19,20 # divide r19 by r20 put into r21
+
+ mullw 22,21,20 # find remainder. 1st q*divisor
+ subf 22,22,19 # then subtract from original = R
+ addi 22,22,0x30 # convert remainder to ascii
+
+ stbu 22,-1(16) # Store to backwards buffer
+
+ mr 19,21 # move Quotient as new dividend
+ cmpwi 19,0 # was quotient zero?
+ bne div_by_10 # ! if not keep dividing
+
+write_out:
+ cmpwi 5,0 # r5 == 0 selects the strcat path below
+ bne stdout_num # r5 != 0: write the digits to stdout
+
+ addi 16,16,-1 # back up one byte, strcat pre-increments r16
+ bl strcat # and strcat it
+
+ mtlr 30 # restore link register
+
+ blr # return
+
+stdout_num:
+ mr 4,16 # point to our buffer
+ mtlr 30 # restore link register
+ b write_stdout # stdout will return for us
+
+
+#===========================================================================
+.data
+#===========================================================================
+
+
+data_begin:
+
+.include "../logo.lzss_new"
+
+ver_string: .ascii " Version \0"
+compiled_string: .ascii ", Compiled \0"
+megahertz: .ascii "MHz PPC \0"
+.equ space, ram_comma+6
+.equ comma, ram_comma+5
+linefeed: .ascii "\n\0"
+escape: .ascii "\033[\0"
+c: .ascii "C\0"
+ram_comma: .ascii "M RAM, \0"
+
+bogo_total: .ascii " Bogomips Total\0"
+
+default_colors: .ascii "\033[0m\n\n\0"
+
+cpuinfo: .ascii "/proc/cpuinfo\0"
+
+one: .ascii "One \0"
+
+disk_buffer:
+.ascii "processor : 0\n"
+.ascii "cpu : 745/755\n"
+.ascii "temperature : 22-24 C (uncalibrated)\n"
+.ascii "clock : 600.000000MHz\n"
+.ascii "revision : 51.17 (pvr 0008 3311)\n"
+.ascii "bogomips : 49.79\n"
+.ascii "timebase : 24960000\n"
+.ascii "platform : PowerMac\n"
+.ascii "model : PowerBook4,1\n"
+.ascii "machine : PowerBook4,1\n"
+.ascii "motherboard : PowerBook4,1 MacRISC2 MacRISC Power Macintosh\n"
+.ascii "detected as : 257 (iBook 2)\n"
+.ascii "pmac flags : 0000001b\n"
+.ascii "L2 cache : 256K unified\n"
+.ascii "pmac-generation : NewWorld\n\0"
+
+uname_info:
+.ascii "Linux\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "henparma\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "2.6.29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "#1 Wed May 13 15:51:54 UTC 2009\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+
+sysinfo_buff:
+.long 0,0,0,0,512*1024*1024,0,0,0
+
+#============================================================================
+#.bss
+#============================================================================
+
+.lcomm bss_begin,0
+.lcomm num_to_ascii_buff,10
+.lcomm num_to_ascii_end,1
+.lcomm text_buf, (N+F-1) # These buffers must follow each other
+.lcomm out_buffer,16384
+
+
+
+
+
+
+
+
+
+
--- /dev/null
+T:1:16 :8:10 :6:32 :2:56 :9:48 :10:666 :4:90 :5:36 :7:2 :3:45
+T:8:5 :6:20 :2:34 :9:80 :10:775 :4:42 :5:22 :7:1 :3:21
+T:8:5 :6:16 :2:27 :9:64 :10:824 :4:30 :5:18 :7:1 :3:15
+T:8:5 :6:10 :2:18 :9:80 :10:865 :4:6 :5:12 :7:1 :3:3
+T:8:5 :6:10 :2:18 :9:96 :10:858 :5:12 :7:1
+T:8:5 :6:10 :2:18 :9:80 :10:865 :4:6 :5:12 :7:1 :3:3
+T:6:6 :2:9 :9:36 :10:943 :5:6
+T:8:5 :6:8 :2:15 :9:92 :10:869 :5:10 :7:1
+T:6:14 :2:21 :9:112 :10:839 :5:14
+T:8:5 :6:6 :2:12 :9:64 :10:902 :5:10 :7:1
+T:8:5 :6:8 :2:15 :9:80 :10:883 :5:8 :7:1
+T:6:8 :2:12 :9:64 :10:908 :5:8
+T:6:6 :2:9 :9:48 :10:931 :5:6
+T:8:5 :6:4 :2:9 :9:48 :10:927 :5:6 :7:1
+T:6:6 :2:9 :9:48 :10:931 :5:6
+T:8:5 :6:6 :2:12 :9:64 :10:904 :5:8 :7:1
+T:6:2 :2:3 :9:16 :10:977 :5:2
+T:8:5 :6:12 :2:21 :9:96 :10:842 :4:6 :5:14 :7:1 :3:3
+T:6:6 :2:9 :9:48 :10:931 :5:6
+T:6:6 :2:9 :9:48 :10:931 :5:6
+T:8:5 :6:14 :2:24 :9:112 :10:819 :4:6 :5:16 :7:1 :3:3
+T:8:5 :6:6 :2:12 :9:64 :10:904 :5:8 :7:1
+T:6:6 :2:9 :9:48 :10:931 :5:6
+T:8:5 :6:8 :2:15 :9:80 :10:881 :5:10 :7:1
+T:8:5 :6:10 :2:18 :9:96 :10:858 :5:12 :7:1
+T:6:10 :2:15 :9:80 :10:885 :5:10
+T:8:5 :6:10 :2:15 :9:64 :10:470 :4:6 :5:12 :7:1 :3:3 :11:2 :12:7 :13:405
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:13:1000
+T:16:2 :19:2 :20:2 :21:2 :22:2 :23:1 :17:268 :18:10 :24:4 :25:6 :26:3 :30:3 :31:2 :12:28 :13:636 :14:8 :15:4 :27:12 :28:2 :29:3
+T:33:3 :34:4 :46:2 :47:4 :48:2 :49:4 :53:2 :54:4 :35:8 :37:246 :36:168 :38:8 :40:27 :39:22 :41:8 :44:51 :42:38 :43:34 :45:6 :17:116 :18:10 :32:10 :13:183 :14:4 :15:2 :27:12 :50:16 :28:2 :51:2 :52:2
+T:55:2 :56:1 :35:4 :37:381 :36:256 :38:4 :40:12 :39:10 :41:4 :44:15 :42:12 :43:10 :45:3 :17:64 :18:2 :24:4 :25:6 :26:3 :30:3 :31:2 :12:28 :13:148 :14:6 :15:3 :27:12 :28:2 :29:3
+
+
+# Thread 1
+# Total intervals: 40 (Interval Size 1000)
+# Total instructions: 40330
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 40 (Interval Size 1000)
+# Total instructions: 40330
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+\e[0;1;37;47m#################################################################\e[0;30;47m#####\e[1;37m#########\e[1;37;40m
+\e[0;1;37;47m################################################################\e[0;30;47m#######\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m###################\e[31m#\e[37m############################################\e[0;30;47m##\e[1;37mO\e[0;30;47m#\e[1;37mO\e[0;30;47m##\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m##\e[0;30;47m######\e[1;37m##########\e[31m##\e[0;30;47m#\e[1;37m###########################################\e[0;30;47m#\e[1;33m#####\e[0;30;47m#\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#############\e[0;30;47m#\e[1;37m##########################################\e[0;30;47m##\e[1;37m##\e[33m###\e[37m##\e[0;30;47m##\e[1;37m######\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#########\e[31m###\e[37m###\e[0;30;47m###\e[1;37m#\e[0;30;47m####\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m###\e[1;37m##\e[0;30;47m#####\e[1;37m#\e[0;30;47m######\e[1;37m#####\e[0;30;47m#\e[1;37m##########\e[0;30;47m##\e[1;37m#####\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m########\e[31m#\e[37m##\e[31m#\e[0;30;47m#\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m####\e[0;30;47m##\e[1;37m###\e[0;30;47m##\e[1;37m#######\e[0;30;47m#\e[1;37m############\e[0;30;47m##\e[1;37m####\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#######\e[31m#\e[37m###\e[31m#\e[0;30;47m#\e[1;37m###\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m######\e[0;30;47m###\e[1;37m#########\e[0;30;47m#\e[1;37m############\e[0;30;47m###\e[1;37m###\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m##########\e[31m##\e[0;30;47m#\e[1;37m###\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m######\e[0;30;47m###\e[1;37m########\e[33m##\e[0;30;47m#\e[1;37m###########\e[0;30;47m##\e[1;33m#\e[37m###\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#######\e[0;30;47m#\e[1;37m#\e[31m##\e[0;30;47m#\e[1;37m####\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m#\e[0;30;47m##\e[1;37m#####\e[33m######\e[0;30;47m#\e[1;37m#######\e[30m#\e[33m######\e[37m#\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m######\e[0;30;47m##\e[1;37m#\e[31m##\e[0;30;47m#\e[1;37m#\e[0;30;47m#\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m###\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m##\e[1;37m###\e[0;30;47m##\e[1;37m####\e[33m#######\e[0;30;47m#\e[1;37m#####\e[0;30;47m#\e[1;33m#######\e[37m#\e[1;37;40m
+\e[0;1;37;47m##\e[0;30;47m############\e[1;37m##\e[0;30;47m###\e[1;37m##\e[0;30;47m####\e[1;37m###\e[0;30;47m####\e[1;37m###\e[0;30;47m####\e[1;37m#\e[0;30;47m###\e[1;37m#\e[0;30;47m#####\e[1;37m#\e[0;30;47m######\e[1;37m###\e[33m#####\e[30m#\e[0;30;47m#####\e[1m#\e[33m#####\e[37m###\e[1;37;40m
+
+\e[9CLinux Version 2.6.29, Compiled #1 Wed May 13 15:51:54 UTC 2009
+\e[9COne 600.000000MHz PPC 745/755, 512M RAM, 49.79 Bogomips Total
+\e[36Chenparma
--- /dev/null
+prog: ll
+vgopts: --interval-size=1000 --bb-out-file=ll.out.bb
+post: cat ll.out.bb
+cleanup: rm ll.out.bb
+
--- /dev/null
+
+ # count for 1 million instructions
+ # total is 3 (nop, lis, addi) + 499997*2 (loop) + 3 (exit)
+
+ .globl _start
+_start:
+ nop # to give us an even million
+ lis 15,499997@ha # load high 16-bits of counter
+ addi 15,15,499997@l # load low 16-bits of counter
+test_loop:
+ addic. 15,15,-1 # decrement counter, recording the result in cr0
+ bne 0,test_loop # loop until zero
+
+ #================================
+ # Exit
+ #================================
+
+exit:
+ li 3,0 # 0 exit value
+ li 0,1 # put the exit syscall number (1) in r0
+ sc # and exit
+
+
--- /dev/null
+T:1:5 :2:99996
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+
+
+# Thread 1
+# Total intervals: 10 (Interval Size 100000)
+# Total instructions: 1000000
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 10 (Interval Size 100000)
+# Total instructions: 1000000
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+prog: million
+vgopts: --interval-size=100000 --bb-out-file=million.out.bb
+post: cat million.out.bb
+cleanup: rm million.out.bb
+
--- /dev/null
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+# Test programs, each built from a single assembly source (see *_SOURCES).
+check_PROGRAMS = \
+ ll clone_test
+
+EXTRA_DIST = \
+ clone_test.stderr.exp \
+ clone_test.post.exp \
+ clone_test.vgtest \
+ ll.stderr.exp \
+ ll.stdout.exp \
+ ll.post.exp \
+ ll.vgtest
+
+AM_CCASFLAGS += -ffreestanding
+
+# Static link without the C startup files or default libraries; the
+# sources define _start themselves. @FLAG_M32@ is presumably the
+# configure-provided flag for 32-bit output - confirm in configure.in.
+LDFLAGS += @FLAG_M32@ -static -nostartfiles -nodefaultlibs
+
+clone_test_SOURCES = clone_test.S
+ll_SOURCES = ll.S
+
+AM_CCASFLAGS += @FLAG_M32@
--- /dev/null
+ # count for ~1 million instructions thread 1
+ # count for ~2 million instructions thread 2
+ # count for additional ~500 thousand each before exit
+
+ .globl _start
+_start:
+
+ #################################################
+ # First 1000 instructions in initial thread #
+ #################################################
+
+ xor %eax,%eax
+ mov $499,%ecx # load counter
+initial_loop:
+ dec %ecx # repeat count times
+ jnz initial_loop # 2 + 499*2 = 1000 instructions so far
+
+
+ #####################################################
+ # Spawn a thread! #
+ #####################################################
+clone:
+ mov $120,%eax # clone syscall
+
+ # Note, clone syscall is different than the glibc implementation
+
+# int clone (flags, stack_pointer,parent_tidptr,child_tidptr,tls)
+
+
+ # Flags in
+ #/usr/include/bits/sched.h
+ # CLONE_THREAD 0x10000
+ # CLONE_SIGHAND 0x800
+ # CLONE_VM 0x100
+ # above must be specified together
+ # Below required for Valgrind
+ # CLONE_FS 0x200
+ # CLONE_FILES 0x400
+
+ mov $0x10f00,%ebx # the five flags above OR-ed together
+
+
+ mov $(new_stack+4096),%ecx # new stack, starting at the end of new_stack
+
+
+
+ mov $0,%edx # args (none)
+
+ int $0x80
+
+ cmp $0,%eax # are we in new thread?
+ jz thread2 # if so, jump to thread2
+
+
+ ###############################################
+ # thread1 #
+ ###############################################
+
+thread1:
+
+ mov $499997,%ecx # load counter
+thread1_loop:
+ dec %ecx # repeat count times
+ jnz thread1_loop
+
+ xor %ebx,%ebx # we return 0
+ jmp exit
+
+thread2:
+ mov $999997,%ecx # load counter
+thread2_loop:
+ dec %ecx # repeat count times
+ jnz thread2_loop
+
+ mov $5,%ebx # we return 5
+
+
+ #================================
+ # Exit (shared by both threads; ebx already holds the exit status)
+ #================================
+exit:
+
+ # count an additional 500 thousand (250000 iterations of 2 instructions)
+
+ mov $250000,%ecx # load counter
+exit_loop:
+ dec %ecx # repeat count times
+ jnz exit_loop
+
+actual_exit:
+ mov $1,%eax # put exit syscall number (1) in eax
+ int $0x80
+
+.bss
+.lcomm new_stack,4096
--- /dev/null
+T 4 996 5 2 3 98991
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 1001 2 3 98994
+T 100000
+T 100000
+T 100000
+T 100000
+
+
+# Thread 1
+# Total intervals: 15 (Interval Size 100000)
+# Total instructions: 1501007
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
+T 2 3 99996
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 100000
+T 99996 4
+T 100000
+T 100000
+T 100000
+T 100000
+T 99998 2
+
+
+# Thread 2
+# Total intervals: 25 (Interval Size 100000)
+# Total instructions: 2500001
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 15 (Interval Size 100000)
+# Total instructions: 1501007
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+# Thread 2
+# Total intervals: 25 (Interval Size 100000)
+# Total instructions: 2500001
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+prog: clone_test
+vgopts: --interval-size=100000 --bb-out-file=clone_test.out.bb --pc-out-file=clone_test.out.pc
+post: cat clone_test.out.bb clone_test.out.bb.2 | ../filter_bb
+cleanup: rm -f clone_test.out.bb clone_test.out.bb.2 clone_test.out.pc
+
--- /dev/null
+#! /bin/sh
+
+../filter_stderr
+
+
--- /dev/null
+#
+# linux_logo in i386 assembly language
+# based on the code from ll_asm-0.36
+#
+# By Vince Weaver <vince _at_ deater.net>
+#
+# Modified to remove non-deterministic system calls
+# And to avoid reading from /proc
+#
+
+.include "../logo.include"
+
+# offsets into the results returned by the uname syscall
+.equ U_SYSNAME,0
+.equ U_NODENAME,65
+.equ U_RELEASE,65*2
+.equ U_VERSION,(65*3)
+.equ U_MACHINE,(65*4)
+.equ U_DOMAINNAME,65*5
+
+# offset into the results returned by the sysinfo syscall
+.equ S_TOTALRAM,16
+
+# Syscalls
+.equ SYSCALL_EXIT, 1
+.equ SYSCALL_WRITE, 4
+
+#
+.equ STDIN,0
+.equ STDOUT,1
+.equ STDERR,2
+
+ .globl _start
+_start:
+ #=========================
+ # PRINT LOGO
+ #=========================
+
+# LZSS decompression algorithm implementation
+# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
+# optimized some more by Vince Weaver
+
+ # we used to fill the buffer with FREQUENT_CHAR
+ # but, that only gains us one byte of space in the lzss image.
+ # the lzss algorithm does automatic RLE... pretty clever
+ # so we compress with NUL as FREQUENT_CHAR and it is pre-done for us
+
+ mov $(N-F), %bp # R
+
+ mov $logo, %esi # %esi points to logo (for lodsb)
+
+ mov $out_buffer, %edi # point to out_buffer
+ push %edi # save this value for later
+
+decompression_loop:
+ lodsb # load in a byte
+
+ mov $0xff, %bh # re-load top as a hackish 8-bit counter
+ mov %al, %bl # move in the flags
+
+test_flags:
+ cmp $logo_end, %esi # have we reached the end?
+ je done_logo # if so, exit
+
+ shr $1, %ebx # shift bottom bit into carry flag
+ jc discrete_char # if set, we jump to discrete char
+
+offset_length:
+ lodsw # get match_length and match_position
+ mov %eax,%edx # copy to edx
+ # no need to mask dx, as we do it
+ # by default in output_loop
+
+ shr $(P_BITS),%eax
+ add $(THRESHOLD+1),%al
+ mov %al,%cl # cl = (ax >> P_BITS) + THRESHOLD + 1
+ # (=match_length)
+
+output_loop:
+ and $POSITION_MASK,%dh # mask it
+ mov text_buf(%edx), %al # load byte from text_buf[]
+ inc %edx # advance pointer in text_buf
+store_byte:
+ stosb # store it
+
+ mov %al, text_buf(%ebp) # store also to text_buf[r]
+ inc %ebp # r++
+ and $(N-1), %bp # mask r
+
+ loop output_loop # repeat until k>j
+
+ or %bh,%bh # if 0 we shifted through 8 and must
+ jnz test_flags # re-load flags
+
+ jmp decompression_loop
+
+discrete_char:
+ lodsb # load a byte
+ inc %ecx # we set ecx to one so byte
+ # will be output once
+ # (how do we know ecx is zero?)
+
+ jmp store_byte # and cleverly store it
+
+
+# end of LZSS code
+
+done_logo:
+
+ pop %ebp # get out_buffer and keep in bp
+ mov %ebp,%ecx # move out_buffer to ecx
+
+ call write_stdout # print the logo
+
+ #
+ # Setup
+ #
+setup:
+ mov $strcat,%edx # use edx as call pointer
+
+
+ #==========================
+ # PRINT VERSION
+ #==========================
+
+# push $SYSCALL_UNAME # uname syscall
+# pop %eax # in 3 bytes
+# mov $uname_info,%ebx # uname struct
+# int $0x80 # do syscall
+
+ mov %ebp,%edi # point %edi to out_buffer
+
+ mov $(uname_info+U_SYSNAME),%esi # os-name from uname "Linux"
+ call *%edx # call strcat
+
+ mov $ver_string,%esi # source is " Version "
+ call *%edx # call strcat
+ push %esi # save our .txt pointer
+
+ mov $(uname_info+U_RELEASE),%esi # version from uname "2.4.1"
+ call *%edx # call strcat
+
+ pop %esi # restore .txt pointer
+ # source is ", Compiled "
+ call *%edx # call strcat
+ push %esi # store for later
+
+ mov $(uname_info+U_VERSION),%esi # compiled date
+ call *%edx # call strcat
+
+ mov %ebp,%ecx # move out_buffer to ecx
+
+ mov $0xa,%ax # store linefeed on end
+ stosw # and zero
+
+ call *%edx # call strcat
+
+ call center_and_print # center and print
+
+ #===============================
+ # Middle-Line
+ #===============================
+
+ #=========
+ # Load /proc/cpuinfo into buffer
+ #=========
+
+ push %edx # save call pointer
+
+# push $SYSCALL_OPEN # load 5 [ open() ]
+# pop %eax # in 3 bytes
+
+# mov $cpuinfo,%ebx # '/proc/cpuinfo'
+# xor %ecx,%ecx # 0 = O_RDONLY <bits/fcntl.h>
+# cdq # clear edx in clever way
+# int $0x80 # syscall. fd in eax.
+ # we should check that eax>=0
+
+# mov %eax,%ebx # save our fd
+
+# push $SYSCALL_READ # load 3 = read()
+# pop %eax # in 3 bytes
+
+ mov $disk_buffer,%ecx
+
+# mov $16,%dh # 4096 is maximum size of proc file #)
+ # we load sneakily by knowing
+ # 16<<8 = 4096. be sure edx clear
+
+
+# int $0x80
+
+# push $SYSCALL_CLOSE # close (to be correct)
+# pop %eax
+# int $0x80
+
+ #=============
+ # Number of CPUs
+ #=============
+number_of_cpus:
+
+ xor %ebx,%ebx # chip count
+
+ # $disk_buffer still in ecx
+bogo_loop:
+ mov (%ecx), %eax # load 4 bytes into eax
+ inc %ecx # increment pointer
+
+ cmp $0,%al # check for end of file
+ je done_bogo
+
+ cmp $('o'<<24+'g'<<16+'o'<<8+'b'),%eax
+ # "bogo" in little-endian
+
+ jne bogo_loop # if not equal, keep going
+
+ inc %ebx # otherwise, we have a bogo
+ inc %ebx # times two for future magic
+ jmp bogo_loop
+
+done_bogo:
+ lea one-6(%ebx,%ebx,2), %esi
+ # Load into esi
+ # [one]+(num_cpus*6)
+ #
+ # the above multiplies by three
+ # esi = (ebx+(ebx*2))
+ # and we double-incremented ebx
+ # earlier
+
+ mov %ebp,%edi # move output buffer to edi
+
+ pop %edx # restore call pointer
+ call *%edx # copy it (call strcat)
+
+ mov $' ',%al # print a space
+ stosb
+
+ push %ebx # store cpu count
+ push %edx # store strcat pointer
+
+ #=========
+ # MHz
+ #=========
+print_mhz:
+ mov $('z'<<24+'H'<<16+'M'<<8+' '),%ebx
+ # find ' MHz' and grab up to .
+ # we are little endian
+ mov $'.',%ah
+
+ # below is same as "sub $(strcat-find_string),%edx
+ # gas won't let us force the one-byte constant
+ .byte 0x83,0xEA,strcat-find_string
+
+ call *%edx # call find string
+
+ mov %ebx,%eax # clever way to get MHz in, sadly
+ ror $8,%eax # not any smaller than a mov
+ stosl
+
+ #=========
+ # Chip Name
+ #=========
+chip_name:
+
+ # because of ugly newer cpuinfos from intel I had to hack this
+ # now we grab the first two words in the name field and use that
+ # it works on all recent Intel and AMD chips. Older things
+ # might choke
+
+ mov $('e'<<24+'m'<<16+'a'<<8+'n'),%ebx
+ # find 'name', skip past the colon and spaces,
+ # and copy up to the next space (or \n)
+ # we are little endian
+ mov $' ',%ah
+ call *%edx # edx points at find_string here: copy first word to the output buffer
+ stosb # store a space
+ call skip_spaces # enter find_string at skip_spaces: copy the next word
+
+ pop %edx # restore strcat pointer
+ pop %ebx # restore chip count
+ pop %esi # restore .txt pointer saved after ", Compiled "
+
+ call *%edx # ' Processor'
+ cmpb $2,%bl # ebx was incremented twice per cpu, so 2 means one cpu
+ jne print_s
+ inc %esi # if singular, skip the s
+print_s:
+ call *%edx # 's, '
+
+ push %esi # save the .txt pointer for later
+ push %edx # save the strcat pointer
+
+ #========
+ # RAM
+ #========
+
+# push $SYSCALL_SYSINFO # sysinfo() syscall
+# pop %eax
+# mov $sysinfo_buff,%ebx
+# int $0x80
+
+ mov (sysinfo_buff+S_TOTALRAM),%eax # size in bytes of RAM
+ shr $20,%eax # divide by 1024*1024 to get M
+ adc $0, %eax # round
+
+
+ call num_to_ascii
+
+ pop %edx # restore strcat pointer
+
+ pop %esi # print 'M RAM, '
+ call *%edx # call strcat
+
+ push %esi
+
+
+ #========
+ # Bogomips
+ #========
+
+ mov $('s'<<24+'p'<<16+'i'<<8+'m'),%ebx
+ # find 'mips\t: ' and grab up to \n
+ mov $0xa,%ah
+ call find_string
+
+ pop %esi # bogo total follows RAM
+
+ call *%edx # call strcat
+
+ push %esi
+
+ mov %ebp,%ecx # point ecx to out_buffer
+
+
+ call center_and_print # center and print
+
+ #=================================
+ # Print Host Name
+ #=================================
+
+ mov %ebp,%edi # point to output_buffer
+
+ mov $(uname_info+U_NODENAME),%esi # host name from uname()
+ call *%edx # call strcat
+
+ # ecx is unchanged
+ call center_and_print # center and print
+
+ pop %ecx # (.txt) pointer to default_colors
+
+ call write_stdout
+
+
+ #================================
+ # Exit
+ #================================
+exit:
+ xor %ebx,%ebx
+ xor %eax,%eax
+ inc %eax # put exit syscall number (1) in eax
+ int $0x80 # and exit
+
+
+ #=================================
+ # FIND_STRING
+ #=================================
+ # ah is char to end at
+ # ebx is 4-char ascii string to look for
+ # edi points at output buffer
+
+find_string:
+
+ mov $disk_buffer-1,%esi # look in cpuinfo buffer
+find_loop:
+ inc %esi
+ cmpb $0, (%esi) # are we at EOF?
+ je done # if so, done
+
+ cmp (%esi), %ebx # do the strings match?
+ jne find_loop # if not, loop
+
+ # ! if we get this far, we matched
+
+find_colon:
+ lodsb # repeat till we find colon
+ cmp $0,%al # this is actually smaller code
+ je done # than an or ecx/repnz scasb
+ cmp $':',%al
+ jne find_colon
+
+
+skip_spaces:
+ lodsb # skip spaces
+ cmp $0x20,%al # Loser new intel chips have lots??
+ je skip_spaces
+
+store_loop:
+ cmp $0,%al
+ je done
+ cmp %ah,%al # is it end string?
+ je almost_done # if so, finish
+ cmp $'\n',%al # also end if linefeed
+ je almost_done
+ stosb # if not store and continue
+ lodsb # load value
+ jmp store_loop
+
+almost_done:
+
+ movb $0, (%edi) # replace last value with NUL
+done:
+ ret
+
+
+ #================================
+ # strcat
+ #================================
+
+strcat: # append the NUL-terminated string at esi to the buffer at edi
+ lodsb # load a byte from [ds:esi]
+ stosb # store a byte to [es:edi]
+ cmp $0,%al # was that the terminating NUL?
+ jne strcat # if not, keep copying
+ dec %edi # back up onto the NUL so the next append overwrites it
+ ret # esi is left just past the NUL of the source string
+
+ #==============================
+ # center_and_print
+ #==============================
+ # string to center in ecx
+
+center_and_print:
+ push %edx
+ push %ecx # save the string pointer
+ inc %edi # move to a clear buffer
+ push %edi # save for later
+
+ mov $('['<<8+27),%ax # we want to output ^[[
+ stosw
+
+ cdq # clear dx
+
+str_loop2: # find end of string
+ inc %edx
+ cmpb $0,(%ecx,%edx) # repeat till we find zero
+ jne str_loop2
+
+ push $81 # one added to cheat, we don't
+ # count the trailing '\n'
+ pop %eax
+
+ cmp %eax,%edx # see if we are >=80
+ jl not_too_big # if so, don't center
+ push $80
+ pop %edx
+
+not_too_big:
+ sub %edx,%eax # subtract size from 80
+
+ shr %eax # then divide by 2
+
+ call num_to_ascii # print number of spaces
+ mov $'C',%al # tack a 'C' on the end
+ # ah is zero from num_to_ascii
+ stosw # store C and a NULL
+ pop %ecx # pop the pointer to ^[[xC
+
+ call write_stdout # write to the screen
+
+done_center:
+ pop %ecx # restore string pointer
+ # and trickily print the real string
+
+ pop %edx
+
+ #================================
+ # WRITE_STDOUT
+ #================================
+ # ecx has string (NUL-terminated; the first byte is never tested, so it must be non-empty)
+ # eax,ebx trashed; edx is saved and restored
+write_stdout: # center_and_print falls through into here to print the real string
+ push %edx # preserve edx, which callers use as the strcat call pointer
+ push $SYSCALL_WRITE # put 4 in eax (write syscall)
+ pop %eax # in 3 bytes of code
+
+ cdq # clear edx (eax is 4, so the sign extension is zero)
+
+ xor %ebx,%ebx # put 1 in ebx (stdout)
+ inc %ebx # in 3 bytes of code
+
+ # another way of doing this: lea 1(%edx), %ebx
+
+str_loop1:
+ inc %edx # edx counts the bytes before the NUL
+ cmpb $0,(%ecx,%edx) # repeat till zero
+ jne str_loop1
+
+ int $0x80 # run the syscall: write(ebx=1, ecx=string, edx=length)
+ pop %edx # restore the saved edx
+ ret
+
+ ##############################
+ # num_to_ascii
+ ##############################
+ # eax = value to print
+ # edi points to where we want it
+
+num_to_ascii:
+ push $10 # ebx = 10, the divisor
+ pop %ebx
+ xor %ecx,%ecx # clear ecx (digit counter)
+div_by_10:
+ cdq # clear edx (sign-extends eax, so this assumes bit 31 of eax is clear)
+ div %ebx # eax = quotient, edx = remainder (the next digit)
+ push %edx # save digit; least significant digit is pushed first
+ inc %ecx # add to length counter
+ or %eax,%eax # was Q zero?
+ jnz div_by_10 # if not divide again
+
+write_out:
+ pop %eax # restore in reverse order (most significant digit first)
+ add $0x30, %al # convert to ASCII
+ stosb # save digit at edi
+ loop write_out # loop till all ecx digits are written; leaves ecx zero
+ ret
+
+#===========================================================================
+# section .data
+#===========================================================================
+.data
+
+ver_string: .ascii " Version \0"
+compiled_string: .ascii ", Compiled \0"
+processor: .ascii " Processor\0"
+s_comma: .ascii "s, \0"
+ram_comma: .ascii "M RAM, \0"
+bogo_total: .ascii " Bogomips Total\n\0"
+
+default_colors: .ascii "\033[0m\n\n\0"
+
+cpuinfo: .ascii "/proc/cpuinfo\0"
+
+
+one: .ascii "One\0\0\0"
+two: .ascii "Two\0\0\0"
+three: .ascii "Three\0"
+four: .ascii "Four\0"
+
+.include "../logo.lzss_new"
+
+disk_buffer:
+.ascii "processor : 0\n"
+.ascii "vendor_id : AuthenticAMD\n"
+.ascii "cpu family : 6\n"
+.ascii "model : 6\n"
+.ascii "model name : AMD Athlon(tm) XP 2000+\n"
+.ascii "stepping : 2\n"
+.ascii "cpu MHz : 1665.267\n"
+.ascii "cache size : 256 KB\n"
+.ascii "fdiv_bug : no\n"
+.ascii "hlt_bug : no\n"
+.ascii "f00f_bug : no\n"
+.ascii "coma_bug : no\n"
+.ascii "fpu : yes\n"
+.ascii "fpu_exception : yes\n"
+.ascii "cpuid level : 1\n"
+.ascii "wp : yes\n"
+.ascii "flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow up\n"
+.ascii "bogomips : 3330.53\n"
+.ascii "clflush size : 32\n"
+.ascii "power management: ts\n\0"
+
+uname_info:
+.ascii "Linux\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "tobler\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "2.6.29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "#1 SMP Mon May 4 09:51:54 EDT 2009\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+.ascii "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+
+
+sysinfo_buff:
+.long 0,0,0,0,512*1024*1024,0,0,0,0
+.long 0,0,0,0,0,0,0,0,0
+
+#============================================================================
+# section .bss
+#============================================================================
+.bss
+
+.lcomm text_buf, (N+F-1)
+.lcomm out_buffer,16384
+
+
+
+
+
--- /dev/null
+T:1:9 :7:10 :5:38 :2:44 :8:65 :9:663 :4:119 :6:2 :3:51
+T:7:5 :5:16 :2:18 :8:52 :9:858 :4:35 :6:1 :3:15
+T:7:5 :5:16 :2:18 :8:52 :9:858 :4:35 :6:1 :3:15
+T:7:5 :5:14 :2:16 :8:91 :9:863 :4:7 :6:1 :3:3
+T:7:5 :5:12 :2:14 :8:78 :9:880 :4:7 :6:1 :3:3
+T:7:5 :5:6 :2:8 :8:52 :9:928 :6:1
+T:7:5 :5:10 :2:11 :8:65 :9:908 :6:1
+T:7:5 :5:14 :2:17 :8:117 :9:846 :6:1
+T:5:8 :2:8 :8:52 :9:932
+T:7:5 :5:8 :2:10 :8:65 :9:911 :6:1
+T:5:8 :2:8 :8:52 :9:932
+T:7:5 :5:6 :2:8 :8:52 :9:928 :6:1
+T:5:6 :2:6 :8:39 :9:949
+T:7:5 :5:6 :2:8 :8:52 :9:928 :6:1
+T:5:4 :2:4 :8:26 :9:966
+T:7:5 :5:12 :2:14 :8:78 :9:880 :4:7 :6:1 :3:3
+T:5:6 :2:6 :8:39 :9:949
+T:7:5 :5:8 :2:10 :8:65 :9:911 :6:1
+T:7:5 :5:14 :2:16 :8:91 :9:863 :4:7 :6:1 :3:3
+T:5:8 :2:8 :8:52 :9:932
+T:7:5 :5:10 :2:12 :8:78 :9:894 :6:1
+T:7:5 :5:10 :2:12 :8:75 :9:897 :6:1
+T:5:12 :2:12 :8:81 :9:895
+T:7:5 :5:8 :2:8 :8:39 :9:389 :4:7 :6:1 :3:3 :10:3 :11:9 :12:528
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:12:1000
+T:15:4 :18:2 :19:3 :20:2 :21:3 :22:4 :16:283 :17:10 :12:686 :13:1 :14:2
+T:23:1 :32:7 :34:352 :33:177 :16:1 :17:2 :24:10 :25:195 :26:4 :27:3 :30:4 :31:11 :11:9 :12:204 :13:2 :14:4 :28:9 :29:5
+T:34:667 :33:333
+T:34:665 :33:332 :35:3
+T:34:128 :33:64 :36:4 :37:8 :49:6 :38:8 :40:407 :39:274 :41:21 :42:14 :43:6 :44:10 :45:10 :46:8 :47:12 :48:2 :16:16 :17:2
+T:50:2 :51:4 :52:2 :53:2 :54:6 :56:3 :57:4 :38:4 :40:405 :39:272 :41:18 :42:12 :43:9 :44:30 :45:30 :46:26 :47:39 :48:4 :16:88 :17:6 :28:9 :55:12 :29:13
+T:40:600 :39:400
+T:58:2 :59:3 :40:352 :39:236 :41:18 :42:12 :43:6 :44:16 :45:16 :46:14 :47:21 :48:2 :16:68 :17:2 :24:10 :25:210 :26:4 :27:3 :28:5
+
+
+# Thread 1
+# Total intervals: 39 (Interval Size 1000)
+# Total instructions: 39439
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 39 (Interval Size 1000)
+# Total instructions: 39439
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+\e[0;1;37;47m#################################################################\e[0;30;47m#####\e[1;37m#########\e[1;37;40m
+\e[0;1;37;47m################################################################\e[0;30;47m#######\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m###################\e[31m#\e[37m############################################\e[0;30;47m##\e[1;37mO\e[0;30;47m#\e[1;37mO\e[0;30;47m##\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m##\e[0;30;47m######\e[1;37m##########\e[31m##\e[0;30;47m#\e[1;37m###########################################\e[0;30;47m#\e[1;33m#####\e[0;30;47m#\e[1;37m########\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#############\e[0;30;47m#\e[1;37m##########################################\e[0;30;47m##\e[1;37m##\e[33m###\e[37m##\e[0;30;47m##\e[1;37m######\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#########\e[31m###\e[37m###\e[0;30;47m###\e[1;37m#\e[0;30;47m####\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m###\e[1;37m##\e[0;30;47m#####\e[1;37m#\e[0;30;47m######\e[1;37m#####\e[0;30;47m#\e[1;37m##########\e[0;30;47m##\e[1;37m#####\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m########\e[31m#\e[37m##\e[31m#\e[0;30;47m#\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m####\e[0;30;47m##\e[1;37m###\e[0;30;47m##\e[1;37m#######\e[0;30;47m#\e[1;37m############\e[0;30;47m##\e[1;37m####\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#######\e[31m#\e[37m###\e[31m#\e[0;30;47m#\e[1;37m###\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m######\e[0;30;47m###\e[1;37m#########\e[0;30;47m#\e[1;37m############\e[0;30;47m###\e[1;37m###\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m##########\e[31m##\e[0;30;47m#\e[1;37m###\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m######\e[0;30;47m###\e[1;37m########\e[33m##\e[0;30;47m#\e[1;37m###########\e[0;30;47m##\e[1;33m#\e[37m###\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m#######\e[0;30;47m#\e[1;37m#\e[31m##\e[0;30;47m#\e[1;37m####\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m#\e[0;30;47m##\e[1;37m#####\e[33m######\e[0;30;47m#\e[1;37m#######\e[30m#\e[33m######\e[37m#\e[1;37;40m
+\e[0;1;37;47m####\e[0;30;47m##\e[1;37m######\e[0;30;47m##\e[1;37m#\e[31m##\e[0;30;47m#\e[1;37m#\e[0;30;47m#\e[1;37m##\e[0;30;47m##\e[1;37m#####\e[0;30;47m##\e[1;37m##\e[0;30;47m###\e[1;37m###\e[0;30;47m###\e[1;37m####\e[0;30;47m##\e[1;37m###\e[0;30;47m##\e[1;37m####\e[33m#######\e[0;30;47m#\e[1;37m#####\e[0;30;47m#\e[1;33m#######\e[37m#\e[1;37;40m
+\e[0;1;37;47m##\e[0;30;47m############\e[1;37m##\e[0;30;47m###\e[1;37m##\e[0;30;47m####\e[1;37m###\e[0;30;47m####\e[1;37m###\e[0;30;47m####\e[1;37m#\e[0;30;47m###\e[1;37m#\e[0;30;47m#####\e[1;37m#\e[0;30;47m######\e[1;37m###\e[33m#####\e[30m#\e[0;30;47m#####\e[1m#\e[33m#####\e[37m###\e[1;37;40m
+
+\e[7CLinux Version 2.6.29, Compiled #1 SMP Mon May 4 09:51:54 EDT 2009
+\e[5COne 1665MHz AMD Athlon(tm) Processor, 512M RAM, 3330.53 Bogomips Total
+\e[37Ctobler\e[0m
+
--- /dev/null
+prog: ll
+vgopts: --interval-size=1000 --bb-out-file=ll.out.bb
+post: cat ll.out.bb
+cleanup: rm ll.out.bb
+
--- /dev/null
+include $(top_srcdir)/Makefile.tool-tests.am
+
+dist_noinst_SCRIPTS = filter_stderr
+
+check_PROGRAMS = \
+ million rep_prefix fldcw_check complex_rep
+
+EXTRA_DIST = \
+ complex_rep.stderr.exp \
+ complex_rep.vgtest \
+ fldcw_check.stderr.exp \
+ fldcw_check.vgtest \
+ million.stderr.exp \
+ million.post.exp \
+ million.vgtest \
+ rep_prefix.stderr.exp \
+ rep_prefix.vgtest
+
+AM_CCASFLAGS += -ffreestanding
+
+LDFLAGS += @FLAG_M32@ -static -nostartfiles -nodefaultlibs
+
+complex_rep_SOURCES = complex_rep.S
+fldcw_check_SOURCES = fldcw_check.S
+million_SOURCES = million.S
+rep_prefix_SOURCES = rep_prefix.S
+
+AM_CCASFLAGS += @FLAG_M32@
--- /dev/null
+# When trying (and failing) to instrument at the basic block level
+# I thought up a lot of corner-cases in the rep code. This tries
+# to catch some of them
+
+# Performance counters give us 8207 insns
+# 11 + 8*1024 + 3 = 8206
+
+ .globl _start
+_start:
+ cld # we want these to happen forward
+
+ mov $0xfeb1378,%eax # value to store
+
+ # test back-to-back rep/stosb's
+
+ mov $1024,%ecx
+ mov $buffer1, %edi # set destination
+ rep stosb # store 1024 times
+ rep stosb # should store 0 times
+ rep stosb # should store 0 times
+
+
+ # test stosb where cx is 0
+
+ xor %ecx,%ecx
+ mov $buffer1, %edi # set destination
+ rep stosb # should not store at all
+
+ # test rep inside of a loop
+
+ mov $1024, %ebx # outer loop count
+rep_loop:
+
+ mov $1024,%ecx
+ mov $buffer1, %edi # set destination
+ rep stosb
+
+ mov $1024,%ecx
+ mov $buffer1, %edi # set destination
+ rep stosb
+
+ dec %ebx
+ jnz rep_loop
+
+
+ #================================
+ # Exit
+ #================================
+exit:
+ mov $1,%eax
+#ifdef VGO_darwin
+ pushl $0
+#else
+ xor %ebx,%ebx # we return 0
+#endif
+ int $0x80 # and exit
+
+
+#.bss
+
+.lcomm buffer1, 16384
+
--- /dev/null
+# Thread 1
+# Total intervals: 0 (Interval Size 100000)
+# Total instructions: 8206
+# Total reps: 2100228
+# Unique reps: 2052
+# Total fldcw instructions: 0
--- /dev/null
+prog: complex_rep
+vgopts: --interval-size=100000 --bb-out-file=complex_rep.out.bb
+cleanup: rm complex_rep.out.bb
+
--- /dev/null
+#! /bin/sh
+
+../filter_stderr
+
+
--- /dev/null
+
+.globl _start
+
+_start:
+ # This code tests for the fldcw "load floating point control word"
+ # instruction. On most x86 processors the retired_instruction
+ # performance counter counts this as one instruction. However,
+ # on Pentium 4 systems it counts as two. Therefore this can
+ # affect BBV results on such a system.
+ # fldcw is most often used to set the rounding mode when doing
+ # floating point to integer conversions
+
+ # It is encoded as "d9 /5" which means
+ # 1101 1001 xx10 1yyy
+ # Where xx is the "mod" which will be 00, 01, or 10 indicating offset
+ # and yyy is the register field
+
+
+
+ # these are instructions with similar encodings to fldcw
+ # that can cause false positives if the test isn't explicit enough
+similar:
+ fld1 # d9 e8
+ fldl2t # d9 e9
+ fldl2e # d9 ea
+ fldpi # d9 eb
+ fldlg2 # d9 ec
+ fldln2 # d9 ed
+ fldz # d9 ee
+
+ # check some varied ways of calling fldcw
+
+
+ # offset on stack
+stack:
+ sub $4,%esp # allocate space on stack
+ fnstcw 2(%esp)
+ fldcw 2(%esp)
+ add $4,%esp # restore stack
+
+ # 32-bit register
+
+ fnstcw cw
+ mov $cw,%eax
+ fldcw 0(%eax) # eax
+ mov $cw,%ebx
+ fldcw 0(%ebx) # ebx
+ mov $cw,%ecx
+ fldcw 0(%ecx) # ecx
+ mov $cw,%edx
+ fldcw 0(%edx) # edx
+
+ # register + 8-bit offset
+eight_bit:
+ mov $cw,%eax
+ sub $32,%eax
+
+ fldcw 32(%eax) # eax + 8 bit offset
+ mov %eax,%ebx
+ fldcw 32(%ebx) # ebx + 8 bit offset
+ mov %eax,%ecx
+ fldcw 32(%ecx) # ecx + 8 bit offset
+ mov %eax,%edx
+ fldcw 32(%edx) # edx + 8 bit offset
+
+ # register + 32-bit offset
+thirtytwo_bit:
+ mov $cw,%eax
+ sub $30000,%eax # bias the pointer so that 30000(%eax) addresses cw again
+
+ fldcw 30000(%eax) # eax + 32 bit offset
+ mov %eax,%ebx
+ fldcw 30000(%ebx) # ebx + 32 bit offset
+ mov %eax,%ecx
+ fldcw 30000(%ecx) # ecx + 32 bit offset
+ mov %eax,%edx
+ fldcw 30000(%edx) # edx + 32 bit offset
+
+ # check an fp/integer conversion
+ # in a loop to give a bigger count
+
+ mov $1024,%ecx
+big_loop:
+
+ fldl three # load value onto fp stack
+ fnstcw saved_cw # store control word to mem
+ movzwl saved_cw, %eax # load cw from mem, zero extending
+ movb $12, %ah # set cw for "round to zero"
+ movw %ax, cw # store back to memory
+ fldcw cw # load new rounding mode
+ fistpl result # save stack value as integer to mem
+ fldcw saved_cw # restore old cw
+
+ loop big_loop # loop to make the count more obvious (two fldcw per iteration)
+
+ movl result, %ebx # sanity check to see if the
+ cmp $3,%ebx # result is the expected one
+ je exit
+
+print_error:
+ mov $4,%eax # write syscall
+#ifdef VGO_darwin
+ pushl $1
+ pushl $error
+ pushl $21
+#else
+ mov $1,%ebx # stdout
+ mov $error,%ecx # string
+ mov $21,%edx # length of string (21 chars, excluding the NUL that .asciz appends)
+#endif
+ int $0x80
+
+exit:
+#ifdef VGO_darwin
+ pushl result
+#else
+ movl result, %ebx # load converted value
+#endif
+ movl $1, %eax # SYSCALL_EXIT
+ int $0x80
+
+
+
+.data
+saved_cw: .long 0
+cw: .long 0
+result: .long 0
+three: .long 0 # a floating point 3.0
+ .long 1074266112
+error: .asciz "Error! Wrong result!\n"
--- /dev/null
+# Thread 1
+# Total intervals: 0 (Interval Size 10000)
+# Total instructions: 9261
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 2061
--- /dev/null
+prog: fldcw_check
+vgopts: --interval-size=10000 --bb-out-file=fldcw_check.out.bb
+cleanup: rm fldcw_check.out.bb
+
--- /dev/null
+ # many thanks to David Fang
+ # for providing an OSX 10.5 machine to test on
+
+ # count for 1 million instructions
+ # total is 1 + 1 + 499997*2 + 4
+
+ .globl _start
+_start:
+ xor %ecx,%ecx # not needed, pads total to 1M
+ mov $499997,%ecx # load counter
+test_loop:
+ dec %ecx # repeat count times
+ jnz test_loop
+
+ #================================
+ # Exit
+ #================================
+
+ # syscall numbers in /usr/include/sys/syscall.h on OSX
+ # in arch/x86/include/asm/unistd_32.h on Linux
+ # disassemble on OSX otool -tV
+exit:
+#ifdef VGO_darwin
+ pushl $0 # we return 0
+ xor %eax,%eax
+ inc %eax # put exit syscall number (1) in eax
+ int $0x80 # and exit
+#else
+ xor %ebx,%ebx # we return 0
+ xor %eax,%eax
+ inc %eax # put exit syscall number (1) in eax
+ int $0x80 # and exit
+#endif
--- /dev/null
+T:1:4 :2:99997
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+T:2:100000
+
+
+# Thread 1
+# Total intervals: 10 (Interval Size 100000)
+# Total instructions: 1000000
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
+
--- /dev/null
+# Thread 1
+# Total intervals: 10 (Interval Size 100000)
+# Total instructions: 1000000
+# Total reps: 0
+# Unique reps: 0
+# Total fldcw instructions: 0
--- /dev/null
+prog: million
+vgopts: --interval-size=100000 --bb-out-file=million.out.bb --pc-out-file=million.out.pc
+post: cat million.out.bb
+cleanup: rm million.out.bb million.out.pc
+
--- /dev/null
+#
+# rep, repe (repz) and repne (repnz) prefixed string instructions
+# only count as one instruction, even though they repeat many times
+# This test makes sure the bbv plugin counts these instructions properly
+# The answer is validated to hw perf counters.
+#
+
+ .globl _start
+_start:
+ cld # we want these to happen forward
+
+ #===================================
+ # Check varied order of the size prefix
+ # with the rep prefix. Older binutils
+ # did this one way, newer binutils the other
+ #===================================
+
+size_prefix:
+ # test 16-bit load
+
+ mov $8192, %ecx
+ mov $buffer1, %esi # set source
+ .byte 0x66, 0xf3, 0xad # lodsw
+
+ mov $8192, %ecx
+ mov $buffer1, %esi # set source
+ .byte 0xf3, 0x66, 0xad # lodsw
+
+
+
+
+ #===================================
+ # Load and Store Instructions
+ #===================================
+loadstore:
+ xor %eax, %eax
+ mov $0xd, %al # set eax to d
+
+ # test 8-bit store
+
+ mov $16384, %ecx
+ mov $buffer1, %edi # set destination
+ rep stosb # store d 16384 times, auto-increment
+
+ # test 8-bit load
+
+ mov $16384, %ecx
+ mov $buffer1, %esi # set source
+ rep lodsb # load byte 16384 times, auto-increment
+
+ cmp $0xd,%al # if we loaded wrong value
+ jne print_error # print an error
+
+ # test 16-bit store
+
+ mov $0x020d,%ax # store 0x020d
+
+ mov $8192, %ecx
+ mov $buffer1, %edi # set destination
+ rep stosw # store 8192 times, auto-increment
+
+ # test 16-bit load
+
+ mov $8192, %ecx
+ mov $buffer1, %esi # set source
+ rep lodsw # load 8192 times, auto-increment
+
+ cmp $0x020d,%ax # if we loaded wrong value
+ jne print_error # print an error
+
+ # test 32-bit store
+
+ mov $0x0feb1378,%eax # store 0x0feb1378
+
+ mov $4096, %ecx
+ mov $buffer1, %edi # set destination
+ rep stosl # store 4096 times, auto-increment
+
+ # test 32-bit load
+
+ mov $4096, %ecx
+ mov $buffer1, %esi # set source
+ rep lodsl # load 4096 times, auto-increment
+
+ cmp $0x0feb1378,%eax # if we loaded wrong value
+ jne print_error # print an error
+
+ #=============================
+ # Move instructions
+ #=============================
+moves:
+ # test 8-bit move
+
+ mov $16384, %ecx
+ mov $buffer1, %esi
+ mov $buffer2, %edi
+ rep movsb
+
+ # test 16-bit move
+
+ mov $8192, %ecx
+ mov $buffer2, %esi
+ mov $buffer1, %edi
+ rep movsw
+
+ # test 32-bit move
+
+ mov $4096, %ecx
+ mov $buffer1, %esi
+ mov $buffer2, %edi
+ rep movsl
+
+ #==================================
+ # Compare equal instructions
+ #==================================
+compare_equal:
+ # first set up the areas to compare
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer1, %edi
+ mov $4096, %ecx
+ rep stosl
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer2, %edi
+ mov $4096, %ecx
+ rep stosl
+
+ # test 8-bit
+
+ mov $buffer1,%esi
+ mov $buffer2,%edi
+ mov $16384, %ecx
+ repe cmpsb
+ jnz print_error
+
+ # test 16-bit
+
+ mov $buffer1,%esi
+ mov $buffer2,%edi
+ mov $8192, %ecx
+ repe cmpsw
+ jnz print_error
+
+ # test 32-bit
+
+ mov $buffer1,%esi
+ mov $buffer2,%edi
+ mov $4096, %ecx
+ repe cmpsl
+ jnz print_error
+
+ #==================================
+ # Compare not equal instructions
+ #==================================
+compare_noteq:
+ # change second buffer
+
+ mov $0x5a5a5a5a,%eax
+ mov $buffer2, %edi
+ mov $4096, %ecx
+ rep stosl
+
+ # test 8-bit
+
+ mov $buffer1,%esi
+ mov $buffer2,%edi
+ mov $16384, %ecx
+ repne cmpsb
+ je print_error
+
+ # test 16-bit
+
+ mov $buffer1,%esi
+ mov $buffer2,%edi
+ mov $8192, %ecx
+ repne cmpsw
+ je print_error
+
+ # test 32-bit
+
+ mov $buffer1,%esi
+ mov $buffer2,%edi
+ mov $4096, %ecx
+ repne cmpsl
+ je print_error
+
+ #====================================
+ # Check scan equal instruction
+ #====================================
+
+ # test 8-bit
+
+ mov $0xa5,%al
+ mov $buffer1,%edi
+ mov $16384, %ecx
+ repe scasb
+ jnz print_error
+
+ # test 16-bit
+
+ mov $0xa5a5,%ax
+ mov $buffer1,%edi
+ mov $8192, %ecx
+ repe scasw
+ jnz print_error
+
+ # test 32-bit
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer1,%edi
+ mov $4096, %ecx
+ repe scasl
+ jnz print_error
+
+ #====================================
+ # Check scan not-equal instruction
+ #====================================
+
+ # test 8-bit
+
+ mov $0xa5,%al
+ mov $buffer2,%edi
+ mov $16384, %ecx
+ repne scasb
+ jz print_error
+
+ # test 16-bit
+
+ mov $0xa5a5,%ax
+ mov $buffer2,%edi
+ mov $8192, %ecx
+ repne scasw
+ jz print_error
+
+ # test 32-bit
+
+ mov $0xa5a5a5a5,%eax
+ mov $buffer2,%edi
+ mov $4096, %ecx
+ repne scasl
+ jz print_error
+
+ jmp exit # no error, skip to exit
+
+print_error:
+
+ mov $4, %eax # Write syscall
+#ifdef VGO_darwin
+ pushl $1
+ pushl $error_string
+ pushl $16
+#else
+ mov $1, %ebx # print to stdout
+ mov $error_string, %ecx # string to print
+ mov $16, %edx # strlen
+#endif
+ int $0x80 # call syscall
+
+ #================================
+ # Exit
+ #================================
+exit:
+#ifdef VGO_darwin
+ pushl $0 # we return 0
+#else
+ xor %ebx,%ebx # we return 0 (Linux takes the exit status in ebx)
+#endif
+ xor %eax,%eax
+ inc %eax # put exit syscall number (1) in eax
+ int $0x80 # and exit
+
+
+.data
+error_string: .asciz "Error detected!\n"
+
+#.bss
+
+.lcomm buffer1, 16384
+.lcomm buffer2, 16384
--- /dev/null
+# Thread 1
+# Total intervals: 0 (Interval Size 100000)
+# Total instructions: 124
+# Total reps: 229402
+# Unique reps: 26
+# Total fldcw instructions: 0
--- /dev/null
+prog: rep_prefix
+vgopts: --interval-size=100000 --bb-out-file=rep_prefix.out.bb
+cleanup: rm rep_prefix.out.bb
+