diff --git a/.gitignore b/.gitignore index 50d2256..99c9b7e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ zsim-ev.h5 zsim-cmp.h5 out.cfg heartbeat -myPatchRoot/ \ No newline at end of file +myPatchRoot/ +.cache/ +pin.log +compile_commands.json diff --git a/SConstruct b/SConstruct index 8c17800..67c27fb 100644 --- a/SConstruct +++ b/SConstruct @@ -17,7 +17,7 @@ def buildSim(cppFlags, dir, type, pgo=None): versionFile = joinpath(buildDir, "version.h") if os.path.exists(".git"): env.Command(versionFile, allSrcs + [".git/index", "SConstruct"], - 'printf "#define ZSIM_BUILDDATE \\"`date`\\"\\n#define ZSIM_BUILDVERSION \\"`python misc/gitver.py`\\"" >>' + versionFile) + 'printf "#define ZSIM_BUILDDATE \\"`date`\\"\\n#define ZSIM_BUILDVERSION \\"`python3 misc/gitver.py`\\"" >>' + versionFile) else: env.Command(versionFile, allSrcs + ["SConstruct"], 'printf "#define ZSIM_BUILDDATE \\"`date`\\"\\n#define ZSIM_BUILDVERSION \\"no git repo\\"" >>' + versionFile) @@ -33,23 +33,6 @@ def buildSim(cppFlags, dir, type, pgo=None): env['CC'] = 'icc' env['CXX'] = 'icpc -ipo' - # Required paths - if "PINPATH" in os.environ: - PINPATH = os.environ["PINPATH"] - else: - print("ERROR: You need to define the $PINPATH environment variable with Pin's path") - sys.exit(1) - - # Pin 3 introduces PinCRT. - withPinCrt = os.path.exists(joinpath(PINPATH, "extras/crt")) - if withPinCrt: - pinCrtDir = joinpath(PINPATH, "extras/crt") - pinCrtLibDir = joinpath(PINPATH, "intel64/runtime/pincrt") - assert os.path.exists(pinCrtDir) - assert os.path.exists(pinCrtLibDir) - # Pin 3.24 starts to support most C++11 - withPinCrtCXX11 = os.path.exists(joinpath(PINPATH, "extras/cxx")) - ROOT = Dir('.').abspath # NOTE: These flags are for the 28/02/2011 2.9 PIN kit (rev39599). Older versions will not build. @@ -58,49 +41,12 @@ def buildSim(cppFlags, dir, type, pgo=None): # NOTE (dsm 16 Apr 2015): Update flags code to support Pin 2.14 while retaining backwards compatibility # NOTE (gaomy May 2019): Set ABI version # NOTE (gaomy Sept 2020): Add -Wno-unused-function for the template ilog2 - env["CPPFLAGS"] += " -g -std=c++0x -Wall -Wno-unknown-pragmas -fomit-frame-pointer -fno-stack-protector" + env["CPPFLAGS"] += " -g -std=c++20 -Wall -Wno-unknown-pragmas -fomit-frame-pointer -fno-stack-protector" env["CPPFLAGS"] += " -MMD -DBIGARRAY_MULTIPLIER=1 -DUSING_XED -DTARGET_IA32E -DHOST_IA32E -fPIC -DTARGET_LINUX" env["CPPFLAGS"] += " -fabi-version=2" env["CPPFLAGS"] += " -Wno-unused-function" - # Add more flags and system paths for pintool if with PinCRT. - if withPinCrt: - env["CPPFLAGS"] += " -D__PIN__=1 -DPIN_CRT=1" - env["CPPFLAGS"] += " -fno-exceptions -fno-rtti -funwind-tables -fasynchronous-unwind-tables" - env["CPPFLAGS"] += " -Ddynamic_cast=static_cast" - if withPinCrtCXX11: - env["CPPFLAGS"] += " -isystem " + joinpath(PINPATH, "extras/cxx/include") - else: - env["CPPFLAGS"] += " -isystem " + joinpath(PINPATH, "extras/stlport/include") - env["CPPFLAGS"] += " -isystem " + joinpath(PINPATH, "extras/libstdc++/include") - env["CPPFLAGS"] += " -isystem " + joinpath(PINPATH, "extras/libunwind/include") - env["CPPFLAGS"] += " -isystem " + joinpath(pinCrtDir, "include") - env["CPPFLAGS"] += " -isystem " + joinpath(pinCrtDir, "include/arch-x86_64") - env["CPPFLAGS"] += " -isystem " + joinpath(pinCrtDir, "include/kernel/uapi") - env["CPPFLAGS"] += " -isystem " + joinpath(pinCrtDir, "include/kernel/uapi/asm-x86") - - # Pin 2.12+ kits have changed the layout of includes, detect whether we need - # source/include/ or source/include/pin/ - pinInclDir = joinpath(PINPATH, "source/include/") - if not os.path.exists(joinpath(pinInclDir, "pin.H")): - pinInclDir = joinpath(pinInclDir, "pin") - assert os.path.exists(joinpath(pinInclDir, "pin.H")) - - # Pin 2.14 changes location of XED - # Pin 3 changes location of XED again - xedName = "xed2" # used below - xedPath = joinpath(PINPATH, "extras/" + xedName + "-intel64/include") - if not os.path.exists(xedPath): - xedName = "xed" - xedPath = joinpath(PINPATH, "extras/" + xedName + "-intel64/include") - if os.path.exists(joinpath(xedPath, "xed")): - xedPath = joinpath(xedPath, "xed") - assert os.path.exists(xedPath) - - env["CPPPATH"] = [xedPath, - pinInclDir, joinpath(pinInclDir, "gen"), - joinpath(PINPATH, "extras/components/include")] - + env["CPPPATH"] = [] # Perform trace logging? ##env["CPPFLAGS"] += " -D_LOG_TRACE_=1" @@ -110,47 +56,6 @@ def buildSim(cppFlags, dir, type, pgo=None): # Be a Warning Nazi? (recommended) # env["CPPFLAGS"] += " -Werror " - # Enables lib and harness to use the same info/log code, - # but only lib uses pin locks for thread safety - env["PINCPPFLAGS"] = " -DMT_SAFE_LOG " - - # PIN-specific libraries - env["PINLINKFLAGS"] = " -Wl,--hash-style=sysv -Wl,-Bsymbolic -Wl,--version-script=" + joinpath(pinInclDir, "pintool.ver") - - # To prime system libs, we include /usr/lib and /usr/lib/x86_64-linux-gnu - # first in lib path. In particular, this solves the issue that, in some - # systems, Pin's libelf takes precedence over the system's, but it does not - # include symbols that we need or it's a different variant (we need - # libelfg0-dev in Ubuntu systems) - # NOTE(gaomy May 2019): PinCRT use its own libs and the system libs are - # disallowed, so libelf dependency is removed. - env["PINLIBPATH"] = [joinpath(PINPATH, "extras/" + xedName + "-intel64/lib"), - joinpath(PINPATH, "intel64/lib"), joinpath(PINPATH, "intel64/lib-ext")] - - # Libdwarf is provided in static and shared variants, Ubuntu only provides - # static, and I don't want to add -R because - # there are some other old libraries provided there (e.g., libelf) and I - # want to use the system libs as much as possible. So link directly to the - # static version of libdwarf. - - # Pin 2.14 uses unambiguous libpindwarf - # Pin 3 uses libpin3dwarf - # Pin 3.25 changes back to libpindwarf and changes its path - pindwarfPath = joinpath(PINPATH, "intel64/lib-ext/libdwarf.a") - pindwarfLib = File(pindwarfPath) - if not os.path.exists(pindwarfPath): - pindwarfPath = joinpath(PINPATH, "intel64/lib-ext/libpindwarf.a") - pindwarfLib = "pindwarf" - if not os.path.exists(pindwarfPath): - pindwarfPath = joinpath(PINPATH, "intel64/lib-ext/libpin3dwarf.so") - pindwarfLib = "pin3dwarf" - if not os.path.exists(pindwarfPath): - pindwarfPath = joinpath(PINPATH, "intel64/lib/libpindwarf.so") - pindwarfLib = "pindwarf" - assert os.path.exists(pindwarfPath) - - env["PINLIBS"] = ["pin", "xed", pindwarfLib] - # Non-pintool libraries env["LIBPATH"] = [] env["LIBS"] = ["config", "hdf5", "hdf5_hl"] @@ -177,26 +82,20 @@ def buildSim(cppFlags, dir, type, pgo=None): if "MBEDTLSPATH" in os.environ: MBEDTLSPATH = os.environ["MBEDTLSPATH"] env["LINKFLAGS"] += " -Wl,-R" + joinpath(MBEDTLSPATH, "lib") - env["PINLIBPATH"] += [joinpath(MBEDTLSPATH, "lib")] env["CPPPATH"] += [joinpath(MBEDTLSPATH, "include")] - env["PINLIBS"] += ["mbedcrypto"] env["CPPFLAGS"] += " -D_WITH_MBEDTLS_=1 " if "POLARSSLPATH" in os.environ: POLARSSLPATH = os.environ["POLARSSLPATH"] env["LINKFLAGS"] += " -Wl,-R" + joinpath(POLARSSLPATH, "lib") - env["PINLIBPATH"] += [joinpath(POLARSSLPATH, "library")] env["CPPPATH"] += [joinpath(POLARSSLPATH, "include")] - env["PINLIBS"] += ["polarssl"] env["CPPFLAGS"] += " -D_WITH_POLARSSL_=1 " # Only include DRAMSim if available if "DRAMSIMPATH" in os.environ: DRAMSIMPATH = os.environ["DRAMSIMPATH"] env["LINKFLAGS"] += " -Wl,-R" + DRAMSIMPATH - env["PINLIBPATH"] += [DRAMSIMPATH] env["CPPPATH"] += [DRAMSIMPATH] - env["PINLIBS"] += ["dramsim"] env["CPPFLAGS"] += " -D_WITH_DRAMSIM_=1 " # addr2line from GNU binutils is used as an independent third-party executable called by zsim when backtracing bugs. @@ -208,47 +107,8 @@ def buildSim(cppFlags, dir, type, pgo=None): env["CPPPATH"] += ["."] - # PinCRT libs. These libs are needed by both the shared lib pintool and the utilities. - if withPinCrt: - # PinCRT has issues, and we allow to apply our patches to it. - if "PINCRTPATCHPATH" in os.environ: - PINCRTPATCHPATH = os.environ["PINCRTPATCHPATH"] - env["LINKFLAGS"] += " -Wl,-R" + PINCRTPATCHPATH + " -Wl,--no-as-needed" - env["LIBPATH"] += [PINCRTPATCHPATH] - env["LIBS"] += ["pincrtpatch"] - env["LINKFLAGS"] += " -nostdlib -Wl,-R" + pinCrtLibDir - env["LIBPATH"] += [pinCrtLibDir] - if withPinCrtCXX11: - env["LIBS"] += ["dl-dynamic", "m-dynamic", "c-dynamic", "c++", "c++abi", "unwind-dynamic"] - else: - env["LIBS"] += ["dl-dynamic", "stlport-dynamic", "m-dynamic", "c-dynamic", "unwind-dynamic"] - env["CRTBEGIN"] = [joinpath(pinCrtLibDir, "crtbegin.o")] - env["CRTEND"] = [joinpath(pinCrtLibDir, "crtend.o")] - env["CRTBEGINS"] = [joinpath(pinCrtLibDir, "crtbeginS.o")] - env["CRTENDS"] = [joinpath(pinCrtLibDir, "crtendS.o")] - env["LINKCOM"] = "$LINK -o $TARGET $LINKFLAGS $CRTBEGIN $SOURCES $_LIBDIRFLAGS $_LIBFLAGS $CRTEND" - env["SHLINKCOM"] = "$SHLINK -o $TARGET $SHLINKFLAGS $CRTBEGINS $SOURCES $_LIBDIRFLAGS $_LIBFLAGS $CRTENDS" - # Harness needs these defined - env["CPPFLAGS"] += ' -DPIN_PATH="' + joinpath(PINPATH, "intel64/bin/pinbin") + '" ' env["CPPFLAGS"] += ' -DZSIM_PATH="' + joinpath(ROOT, joinpath(buildDir, "libzsim.so")) + '" ' - env["CPPFLAGS"] += ' -DLDLIB_PATH="' + ":".join(env["LIBPATH"] + env["PINLIBPATH"]) + '" ' - if withPinCrt: - env["CPPFLAGS"] += ' -DPIN_CRT_TZDATA="' + joinpath(PINPATH, "extras/crt/tzdata") + '" ' - # PinCRT header misses NULL declaration - env["CPPFLAGS"] += ' -DNULL=0 ' - - # Do PGO? - if pgo == "generate": - genFlags = " -prof-gen " if useIcc else " -fprofile-generate " - env["PINCPPFLAGS"] += genFlags - env["PINLINKFLAGS"] += genFlags - elif pgo == "use": - if useIcc: useFlags = " -prof-use " - else: useFlags = " -fprofile-use -fprofile-correction " - # even single-threaded sims use internal threads, so we need correction - env["PINCPPFLAGS"] += useFlags - env["PINLINKFLAGS"] += useFlags env.SConscript("src/SConscript", variant_dir=buildDir, exports= {'env' : env.Clone()}) diff --git a/make.bash b/make.bash new file mode 100755 index 0000000..9209429 --- /dev/null +++ b/make.bash @@ -0,0 +1,5 @@ +#!/bin/bash + +export LIBCONFIGPATH=$HOME/.local +export HDF5PATH=$HOME/.local +bear -- scons --d -j64 diff --git a/src/SConscript b/src/SConscript index daf8c1f..424f194 100644 --- a/src/SConscript +++ b/src/SConscript @@ -3,8 +3,8 @@ import os Import("env") -commonSrcs = ["config.cpp", "galloc.cpp", "log.cpp", "pin_cmd.cpp"] -harnessSrcs = ["zsim_harness.cpp", "debug_harness.cpp"] +commonSrcs = ["config.cpp", "galloc.cpp", "log.cpp"] +harnessSrcs = ["debug_harness.cpp"] # By default, we compile all cpp files in libzsim.so. List the cpp files that # should be excluded below (one per line and in order, to ease merges) @@ -16,22 +16,12 @@ excludeSrcs = [ excludeSrcs += harnessSrcs libEnv = env.Clone() -libEnv["CPPFLAGS"] += libEnv["PINCPPFLAGS"] -libEnv["LINKFLAGS"] += libEnv["PINLINKFLAGS"] -libEnv["LIBPATH"] += libEnv["PINLIBPATH"] -libEnv["LIBS"] += libEnv["PINLIBS"] - -# Build syscall name file -def getSyscalls(): return os.popen("python ../../misc/list_syscalls.py").read().strip() -syscallSrc = libEnv.Substfile("virt/syscall_name.cpp", "virt/syscall_name.cpp.in", - SUBST_DICT = {"SYSCALL_NAME_LIST" : getSyscalls()}) # Build libzsim.so -globSrcNodes = Glob("*.cpp") + Glob("virt/*.cpp") +globSrcNodes = Glob("*.cpp") libSrcs = [str(x) for x in globSrcNodes if str(x) not in excludeSrcs] -libSrcs += [str(x) for x in syscallSrc] -libSrcs = list(set(libSrcs)) # ensure syscallSrc is not duplicated -libEnv.SharedLibrary("zsim.so", libSrcs) +libSrcs = list(set(libSrcs)) +libEnv.Program("zsim", libSrcs) # Build tracing utilities traceEnv = env.Clone() @@ -42,8 +32,8 @@ traceEnv.Program("sorttrace", ["sorttrace.cpp", "access_tracing.cpp"] + commonSr # Build harness (static to make it easier to run across environments) # NOTE(gaomy): with PinCRT we cannot build static as CRT only provides shared libs. # env["LINKFLAGS"] += " --static " -env["LIBS"] += ["c"] # required by wordexp in zsim harness -env.Program("zsim", harnessSrcs + commonSrcs) +# env["LIBS"] += ["c"] # required by wordexp in zsim harness +# env.Program("zsim", commonSrcs) -# Build additional utilities below -env.Program("fftoggle", ["fftoggle.cpp"] + commonSrcs) +# # Build additional utilities below +# env.Program("fftoggle", ["fftoggle.cpp"] + commonSrcs) diff --git a/src/contention_sim.cpp b/src/contention_sim.cpp index 50ca88b..77d4bd5 100644 --- a/src/contention_sim.cpp +++ b/src/contention_sim.cpp @@ -51,8 +51,8 @@ bool ContentionSim::CompareDomains::operator()(DomainData* d1, DomainData* d2) c } -void ContentionSim::SimThreadTrampoline(void* arg) { - ContentionSim* csim = static_cast(arg); +void ContentionSim::SimThreadTrampoline(ContentionSim* arg) { + ContentionSim* csim = arg; uint32_t thid = __sync_fetch_and_add(&csim->threadTicket, 1); csim->simThreadLoop(thid); } @@ -93,7 +93,7 @@ ContentionSim::ContentionSim(uint32_t _numDomains, uint32_t _numSimThreads) { threadTicket = 0; __sync_synchronize(); for (uint32_t i = 0; i < numSimThreads; i++) { - PIN_SpawnInternalThread(SimThreadTrampoline, this, 1024*1024, nullptr); + simThreadVec.emplace_back(SimThreadTrampoline, this); } lastCrossing = gm_calloc(numDomains*numDomains*MAX_THREADS); //TODO: refine... this allocs too much diff --git a/src/contention_sim.h b/src/contention_sim.h index c84b320..3c1e9ad 100644 --- a/src/contention_sim.h +++ b/src/contention_sim.h @@ -29,6 +29,7 @@ #include #include #include +#include #include "bithacks.h" #include "event_recorder.h" #include "g_std/g_vector.h" @@ -160,10 +161,12 @@ class ContentionSim : public GlobAlloc { #endif private: + std::vector simThreadVec; + void simThreadLoop(uint32_t thid); void simulatePhaseThread(uint32_t thid); - static void SimThreadTrampoline(void* arg); + static void SimThreadTrampoline(ContentionSim* arg); }; #endif // CONTENTION_SIM_H_ diff --git a/src/core.h b/src/core.h index a0a662a..cb3046d 100644 --- a/src/core.h +++ b/src/core.h @@ -27,16 +27,112 @@ #define CORE_H_ #include -#include "decoder.h" #include "g_std/g_string.h" #include "stats.h" +#include -struct BblInfo { - uint32_t instrs; - uint32_t bytes; - DynBbl oooBbl[0]; //0 bytes, but will be 1-sized when we have an element (and that element has variable size as well) +typedef uint32_t INS; +typedef uint64_t THREADID; +typedef uint64_t ADDRINT; +typedef bool BOOL; + +struct BranchInformation { + uint8_t branchTaken; + ADDRINT branchTakenNpc; +}; + +struct BasicBlockLoadStore { + ADDRINT addr1; + ADDRINT addr2; + ADDRINT addr3; + uint8_t entryValid; + struct BasicBlockLoadStore *next; +}; + +struct BasicBlock { + size_t codeBytes; + uint8_t *code; + size_t loadStores; + struct BasicBlockLoadStore *loadStore; + struct BranchInformation branchInfo; + uint64_t virtualPc; + uint64_t midgardPc; + uint64_t physicalPc; + size_t programIndex; + + void resetProgramIndex() { + programIndex = 0; + } + + INS getHeadInstruction(size_t *index = nullptr, uint8_t *instLength = nullptr, bool lookahead = false) { + if (programIndex >= codeBytes) { + programIndex += 4; + return 0xffffffff; + } + if (index) { + *index = programIndex; + } + INS tryFetch = *(INS *)(code + programIndex); + uint8_t firstTwoBits = tryFetch & 0x03; + switch (firstTwoBits) { + case 0x0: + case 0x1: + case 0x2: + if (instLength) { + *instLength = 2; + } + programIndex += lookahead ? 0 : 2; + return tryFetch & 0xffff; + default: + if (instLength) { + *instLength = 4; + } + programIndex += lookahead ? 0 : 4; + return tryFetch; + } + } + + size_t getInstructionCount() { + size_t ret = 0; + resetProgramIndex(); + for (getHeadInstruction(); !endOfBlock(); getHeadInstruction()) { + ret++; + } + resetProgramIndex(); + return ret; + } + + bool endOfBlock() { + return programIndex > codeBytes; + } + + ~BasicBlock() { + delete[] code; + for (size_t i = 0; i < loadStores; i++) { + auto next = loadStore[i].next; + while (next) { + auto cur = next; + next = next->next; + delete[] cur; + } + } + delete[] loadStore; + } }; +struct FrontendTrace { + struct BasicBlock *blocks; + size_t count; + + FrontendTrace() = default; + + ~FrontendTrace() { + delete[] blocks; + } +}; + +struct BblInfo; + /* Analysis function pointer struct * As an artifact of having a shared code cache, we need these to be the same for different core types. */ diff --git a/src/decoder.cpp b/src/decoder.cpp index d2aaf70..f6a8048 100644 --- a/src/decoder.cpp +++ b/src/decoder.cpp @@ -35,14 +35,6 @@ #include "locks.h" #include "log.h" -extern "C" { -#include "xed-interface.h" -} - -//XED expansion macros (enable us to type opcodes at a reasonable speed) -#define XC(cat) (XED_CATEGORY_##cat) -#define XO(opcode) (XED_ICLASS_##opcode) - //PORT defines. You might want to change these to affect scheduling #define PORT_0 (0x1) #define PORT_1 (0x2) @@ -58,104 +50,63 @@ void DynUop::clear() { } Decoder::Instr::Instr(INS _ins) : ins(_ins), numLoads(0), numInRegs(0), numOutRegs(0), numStores(0) { - uint32_t numOperands = INS_OperandCount(ins); - for (uint32_t op = 0; op < numOperands; op++) { - bool read = INS_OperandRead(ins, op); - bool write = INS_OperandWritten(ins, op); - assert(read || write); - if (INS_OperandIsMemory(ins, op)) { - if (read) loadOps[numLoads++] = op; - if (write) storeOps[numStores++] = op; - } else if (INS_OperandIsReg(ins, op) && INS_OperandReg(ins, op)) { //it's apparently possible to get INS_OperandIsReg to be true and an invalid reg ... WTF Pin? - REG reg = INS_OperandReg(ins, op); - assert(reg); // can't be invalid - reg = REG_FullRegName(reg); // eax -> rax, etc; o/w we'd miss a bunch of deps! - if (read) inRegs[numInRegs++] = reg; - if (write) outRegs[numOutRegs++] = reg; - } - } - - //By convention, we move flags regs to the end - reorderRegs(inRegs, numInRegs); - reorderRegs(outRegs, numOutRegs); -} - -static inline bool isFlagsReg(uint32_t reg) { - return (reg == REG_EFLAGS || reg == REG_FLAGS || reg == REG_MXCSR); -} - -void Decoder::Instr::reorderRegs(uint32_t* array, uint32_t regs) { - if (regs == 0) return; - //Unoptimized bubblesort -- when arrays are this short, regularity wins over O(n^2). - uint32_t swaps; - do { - swaps = 0; - for (uint32_t i = 0; i < regs-1; i++) { - if (isFlagsReg(array[i]) && !isFlagsReg(array[i+1])) { - std::swap(array[i], array[i+1]); - swaps++; - } - } - } while (swaps > 0); + // auto op = riscvOpCode(_ins); + // bool read = INS_OperandRead(ins, op); + // bool write = INS_OperandWritten(ins, op); + // assert(read || write); + // if (INS_OperandIsMemory(ins, op)) { + // if (read) loadOps[numLoads++] = op; + // if (write) storeOps[numStores++] = op; + // } else if (INS_OperandIsReg(ins, op) && INS_OperandReg(ins, op)) { //it's apparently possible to get INS_OperandIsReg to be true and an invalid reg ... WTF Pin? + // REG reg = INS_OperandReg(ins, op); + // assert(reg); // can't be invalid + // reg = REG_FullRegName(reg); // eax -> rax, etc; o/w we'd miss a bunch of deps! + // if (read) inRegs[numInRegs++] = reg; + // if (write) outRegs[numOutRegs++] = reg; + // } + + // //By convention, we move flags regs to the end + // reorderRegs(inRegs, numInRegs); + // reorderRegs(outRegs, numOutRegs); } -//Helper function -static std::string regsToString(uint32_t* regs, uint32_t numRegs) { - std::string str = ""; //if efficiency was a concern, we'd use a stringstream - if (numRegs) { - str += "("; - for (uint32_t i = 0; i < numRegs - 1; i++) { - str += REG_StringShort((REG)regs[i]) + ", "; - } - str += REG_StringShort((REG)regs[numRegs - 1]) + ")"; - } - return str; -} +// static inline bool isFlagsReg(uint32_t reg) { +// return (reg == REG_EFLAGS || reg == REG_FLAGS || reg == REG_MXCSR); +// } + +// void Decoder::Instr::reorderRegs(uint32_t* array, uint32_t regs) { +// if (regs == 0) return; +// //Unoptimized bubblesort -- when arrays are this short, regularity wins over O(n^2). +// uint32_t swaps; +// do { +// swaps = 0; +// for (uint32_t i = 0; i < regs-1; i++) { +// if (isFlagsReg(array[i]) && !isFlagsReg(array[i+1])) { +// std::swap(array[i], array[i+1]); +// swaps++; +// } +// } +// } while (swaps > 0); +// } void Decoder::reportUnhandledCase(Instr& instr, const char* desc) { - warn("Unhandled case: %s | %s | loads=%d stores=%d inRegs=%d %s outRegs=%d %s", desc, INS_Disassemble(instr.ins).c_str(), - instr.numLoads, instr.numStores, instr.numInRegs, regsToString(instr.inRegs, instr.numInRegs).c_str(), - instr.numOutRegs, regsToString(instr.outRegs, instr.numOutRegs).c_str()); + warn("Unhandled case: %s | %08x", desc, instr.ins); } -void Decoder::emitLoad(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t destReg) { - assert(idx < instr.numLoads); - uint32_t op = instr.loadOps[idx]; - uint32_t baseReg = INS_OperandMemoryBaseReg(instr.ins, op); - uint32_t indexReg = INS_OperandMemoryIndexReg(instr.ins, op); - - if (destReg == 0) destReg = REG_LOAD_TEMP + idx; - +void Decoder::emitLoad(DynUopVec& uops, uint16_t destReg, uint16_t baseReg) { DynUop uop; uop.clear(); uop.rs[0] = baseReg; - uop.rs[1] = indexReg; uop.rd[0] = destReg; uop.type = UOP_LOAD; uop.portMask = PORT_2; uops.push_back(uop); //FIXME: The interface should support in-place grow... } -void Decoder::emitStore(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t srcReg) { - assert(idx < instr.numStores); - uint32_t op = instr.storeOps[idx]; - uint32_t baseReg = INS_OperandMemoryBaseReg(instr.ins, op); - uint32_t indexReg = INS_OperandMemoryIndexReg(instr.ins, op); - - if (srcReg == 0) srcReg = REG_STORE_TEMP + idx; - - uint32_t addrReg; - - //Emit store address uop - //NOTE: Although technically one uop would suffice with <=1 address register, - //stores always generate 2 uops. The store address uop is especially important, - //as in Nehalem loads don't issue after all prior store addresses have been resolved. - addrReg = REG_STORE_ADDR_TEMP + idx; - +void Decoder::emitStore(DynUopVec& uops, uint16_t dataReg, uint16_t addrReg) { DynUop addrUop; addrUop.clear(); - addrUop.rs[0] = baseReg; - addrUop.rs[1] = indexReg; + addrUop.rs[0] = dataReg; addrUop.rd[0] = addrReg; addrUop.lat = 1; addrUop.portMask = PORT_3; @@ -166,25 +117,12 @@ void Decoder::emitStore(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t sr DynUop uop; uop.clear(); uop.rs[0] = addrReg; - uop.rs[1] = srcReg; + uop.rs[1] = dataReg; uop.portMask = PORT_4; uop.type = UOP_STORE; uops.push_back(uop); } - -void Decoder::emitLoads(Instr& instr, DynUopVec& uops) { - for (uint32_t i = 0; i < instr.numLoads; i++) { - emitLoad(instr, i, uops); - } -} - -void Decoder::emitStores(Instr& instr, DynUopVec& uops) { - for (uint32_t i = 0; i < instr.numStores; i++) { - emitStore(instr, i, uops); - } -} - void Decoder::emitFence(DynUopVec& uops, uint32_t lat) { DynUop uop; uop.clear(); @@ -208,965 +146,804 @@ void Decoder::emitExecUop(uint32_t rs0, uint32_t rs1, uint32_t rd0, uint32_t rd1 uops.push_back(uop); } -void Decoder::emitBasicMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports) { - if (instr.numLoads + instr.numInRegs > 1 || instr.numStores + instr.numOutRegs != 1) { - reportUnhandledCase(instr, "emitBasicMove"); - } +void Decoder::emitBasicMove(DynUopVec& uops, uint16_t rd, uint32_t lat, uint8_t ports) { //Note that we can have 0 loads and 0 input registers. In this case, we are loading from an immediate, and we set the input register to 0 so there is no dependence - uint32_t inReg = (instr.numInRegs == 1)? instr.inRegs[0] : 0; - if (!instr.numLoads && !instr.numStores) { //reg->reg - emitExecUop(inReg, 0, instr.outRegs[0], 0, uops, lat, ports); - } else if (instr.numLoads && !instr.numStores) { //mem->reg - emitLoad(instr, 0, uops, instr.outRegs[0]); - } else if (!instr.numLoads && instr.numStores) { //reg->mem - emitStore(instr, 0, uops, inReg); - } else { //mem->mem - emitLoad(instr, 0, uops); - emitStore(instr, 0, uops, REG_LOAD_TEMP /*chain with load*/); - } + emitExecUop(0, 0, rd, 0, uops, lat, ports); } -void Decoder::emitXchg(Instr& instr, DynUopVec& uops) { - if (instr.numLoads) { // mem <-> reg - assert(instr.numLoads == 1 && instr.numStores == 1); - assert(instr.numInRegs == 1 && instr.numOutRegs == 1); - assert(instr.inRegs[0] == instr.outRegs[0]); - - emitLoad(instr, 0, uops); - emitExecUop(instr.inRegs[0], 0, REG_EXEC_TEMP, 0, uops, 1, PORTS_015); //r -> temp - emitExecUop(REG_LOAD_TEMP, 0, instr.outRegs[0], 0, uops, 1, PORTS_015); // load -> r - emitStore(instr, 0, uops, REG_EXEC_TEMP); //temp -> out - if (!INS_LockPrefix(instr.ins)) emitFence(uops, 14); //xchg has an implicit lock prefix (TODO: Check we don't introduce two fences...) - } else { // reg <-> reg - assert(instr.numInRegs == 2 && instr.numOutRegs == 2); - assert(instr.inRegs[0] == instr.outRegs[0]); - assert(instr.inRegs[1] == instr.outRegs[1]); - - emitExecUop(instr.inRegs[0], 0, REG_EXEC_TEMP, 0, uops, 1, PORTS_015); - emitExecUop(instr.inRegs[1], 0, instr.outRegs[0], 0, uops, 1, PORTS_015); - emitExecUop(REG_EXEC_TEMP, 0, instr.outRegs[1], 0, uops, 1, PORTS_015); - } +void Decoder::emitXchg(DynUopVec& uops, uint16_t rd, uint16_t rs1, uint16_t rs2) { + emitLoad(uops, rd, rs1); + emitExecUop(rd, 0, 0, 0, uops, 1, PORTS_015); //r -> temp + emitExecUop(0, 0, rs2, 0, uops, 1, PORTS_015); // load -> r + emitStore(uops, 0, rs1); //temp -> out } - -void Decoder::emitConditionalMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports) { - uint32_t initialUops = uops.size(); - assert(instr.numOutRegs == 1); //always move to reg - assert(instr.numStores == 0); - - if (instr.numLoads) { - assert(instr.numLoads == 1); - assert(instr.numInRegs == 1); - uint32_t flagsReg = instr.inRegs[0]; - emitExecUop(flagsReg, 0, REG_EXEC_TEMP, 0, uops, lat, ports); - emitLoad(instr, 0, uops); - uint32_t numUops = uops.size(); - assert(numUops - initialUops == 2); - //We need to make the load depend on the result. This is quite crude, but works: - uops[numUops - 2].rs[1] = uops[numUops - 1].rs[1]; //comparison uop gets source of load (possibly 0) - uops[numUops - 1].rs[1] = REG_EXEC_TEMP; //load uop is made to depend on comparison uop - //TODO: Make this follow codepath below + load - } else { - assert(instr.numInRegs == 2); - assert(instr.numOutRegs == 1); - uint32_t flagsReg = instr.inRegs[1]; - //Since this happens in 2 instructions, we'll assume we need to read the output register - emitExecUop(flagsReg, instr.inRegs[0], REG_EXEC_TEMP, 0, uops, 1, ports); - emitExecUop(instr.outRegs[0], REG_EXEC_TEMP, instr.outRegs[0], 0, uops, lat, ports); - } +void Decoder::emitMul(DynUopVec& uops, uint16_t rd, uint16_t rs1, uint16_t rs2) { + emitExecUop(rs1, rs2, rd, 0, uops, 3, PORT_1); } -void Decoder::emitCompareAndExchange(Instr& instr, DynUopVec& uops) { - emitLoads(instr, uops); - - uint32_t srcs = instr.numLoads + instr.numInRegs; - uint32_t dsts = instr.numStores + instr.numOutRegs; - - uint32_t srcRegs[srcs + 2]; - uint32_t dstRegs[dsts + 2]; - populateRegArrays(instr, srcRegs, dstRegs); - - assert(srcs == 3); - assert(dsts == 3); - - //reportUnhandledCase(instr, "XXXX"); - //info("%d %d %d | %d %d %d", srcRegs[0], srcRegs[1], srcRegs[2], dstRegs[0], dstRegs[1], dstRegs[2]); - - uint32_t rflags = dstRegs[2]; - uint32_t rax = dstRegs[1]; //note: can be EAX, etc - assert(srcRegs[2] == rax); //if this fails, pin has changed the register orderings... - - //Compare destination (first operand) w/ RAX. If equal, copy source (second operand) into destination and set the zero flag; o/w copy destination into RAX - if (!instr.numLoads) { - //2 swaps, implemented in 2 stages: first, and all sources with rflags.zf; then or results pairwise. This is pure speculation, but matches uops required. - emitExecUop(srcRegs[0], rax, REG_EXEC_TEMP, rflags, uops, 1, PORTS_015); //includes compare - emitExecUop(srcRegs[1], rflags, REG_EXEC_TEMP+1, 0, uops, 2, PORTS_015); - emitExecUop(srcRegs[2], rflags, REG_EXEC_TEMP+2, 0, uops, 2, PORTS_015); - - emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[0], 0, uops, 2, PORTS_015); - emitExecUop(REG_EXEC_TEMP+1, REG_EXEC_TEMP+2, dstRegs[1] /*rax*/, 0, uops, 2, PORTS_015); - } else { - //6 uops (so 3 exec), and critical path is 4 (for rax), GO FIGURE - emitExecUop(srcRegs[0], rax, REG_EXEC_TEMP, rflags, uops, 2, PORTS_015); - emitExecUop(srcRegs[1], rflags, dstRegs[0], 0, uops, 2, PORTS_015); //let's assume we can do a fancy conditional store - emitExecUop(srcRegs[2], REG_EXEC_TEMP, dstRegs[1] /*rax*/, 0, uops, 2, PORTS_015); //likewise +void Decoder::emitDiv(DynUopVec& uops, uint8_t width, uint16_t rd, uint16_t rs1, uint16_t rs2) { + uint32_t lat = 0; + switch (width) { + case 8: + lat = 15; + break; + case 16: + lat = 19; + break; + case 32: + lat = 23; + break; + case 64: + lat = 63; + break; + default: + panic("emitDiv: Invalid reg size"); } - - //NOTE: While conceptually srcRegs[0] == dstRegs[0], when it's a memory location they map to different temporary regs - - emitStores(instr, uops); + uint8_t extraSlots = lat-1; + emitExecUop(rs1, rs2, rd, 0, uops, lat, PORTS_015, extraSlots); } - - -void Decoder::populateRegArrays(Instr& instr, uint32_t* srcRegs, uint32_t* dstRegs) { - uint32_t curSource = 0; - for (uint32_t i = 0; i < instr.numLoads; i++) { - srcRegs[curSource++] = REG_LOAD_TEMP + i; - } - for (uint32_t i = 0; i < instr.numInRegs; i++) { - srcRegs[curSource++] = instr.inRegs[i]; - } - srcRegs[curSource++] = 0; - srcRegs[curSource++] = 0; - - uint32_t curDest = 0; - for (uint32_t i = 0; i < instr.numStores; i++) { - dstRegs[curDest++] = REG_STORE_TEMP + i; - } - for (uint32_t i = 0; i < instr.numOutRegs; i++) { - dstRegs[curDest++] = instr.outRegs[i]; - } - dstRegs[curDest++] = 0; - dstRegs[curDest++] = 0; +uint8_t Decoder::riscvInsOpCode(INS ins) { + return ins & 0x7f; } -void Decoder::emitBasicOp(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports, uint8_t extraSlots, bool reportUnhandled) { - emitLoads(instr, uops); - - uint32_t srcs = instr.numLoads + instr.numInRegs; - uint32_t dsts = instr.numStores + instr.numOutRegs; - - uint32_t srcRegs[srcs + 2]; - uint32_t dstRegs[dsts + 2]; - populateRegArrays(instr, srcRegs, dstRegs); - - if (reportUnhandled && (srcs > 2 || dsts > 2)) reportUnhandledCase(instr, "emitBasicOp"); //We're going to be ignoring some dependencies - - emitExecUop(srcRegs[0], srcRegs[1], dstRegs[0], dstRegs[1], uops, lat, ports, extraSlots); - - emitStores(instr, uops); +uint8_t Decoder::riscvInsFunct3(INS ins) { + return (ins >> 12) & 0x07; } -void Decoder::emitChainedOp(Instr& instr, DynUopVec& uops, uint32_t numUops, uint32_t* latArray, uint8_t* portsArray) { - emitLoads(instr, uops); - - uint32_t srcs = instr.numLoads + instr.numInRegs; - uint32_t dsts = instr.numStores + instr.numOutRegs; - - uint32_t srcRegs[srcs + 2]; - uint32_t dstRegs[dsts + 2]; - populateRegArrays(instr, srcRegs, dstRegs); - - assert(numUops > 1); - //if (srcs != numUops + 1) reportUnhandledCase(instr, "emitChainedOps"); - assert(srcs + 2 >= numUops + 1); // note equality is not necessary in case one or more operands are immediates - - emitExecUop(srcRegs[0], srcRegs[1], REG_EXEC_TEMP, 0, uops, latArray[0], portsArray[0]); - for (uint32_t i = 1; i < numUops-1; i++) { - emitExecUop(REG_EXEC_TEMP, srcRegs[i+1], REG_EXEC_TEMP, 0, uops, latArray[i], portsArray[i]); - } - emitExecUop(REG_EXEC_TEMP, srcRegs[numUops-1], dstRegs[0], dstRegs[1], uops, latArray[numUops-1], portsArray[numUops-1]); - - emitStores(instr, uops); +uint8_t Decoder::riscvInsFunct7(INS ins) { + return (ins >> 25) & 0x7f; } -//Some convert ops are implemented in 2 uops, even though they could just use one given src/dst reg constraints -void Decoder::emitConvert2Op(Instr& instr, DynUopVec& uops, uint32_t lat1, uint32_t lat2, uint8_t ports1, uint8_t ports2) { - if (instr.numStores > 0 || instr.numLoads > 1 || instr.numOutRegs != 1 || instr.numLoads + instr.numInRegs != 1) { - reportUnhandledCase(instr, "convert"); - } else { - //May have single load, has single output - uint32_t src; - if (instr.numLoads) { - emitLoads(instr, uops); - src = REG_LOAD_TEMP; - } else { - src = instr.inRegs[0]; - } - uint32_t dst = instr.outRegs[0]; - emitExecUop(src, 0, REG_EXEC_TEMP, 0, uops, lat1, ports1); - emitExecUop(REG_EXEC_TEMP, 0, dst, 0, uops, lat2, ports2); - } +uint8_t Decoder::riscvInsIsAtomic(INS ins) { + return riscvInsOpCode(ins) == RISCV_OPCODE_ATOMIC; } +uint8_t Decoder::riscvInsArithRd(INS ins) { + return (ins >> 7) & 0x1f; +} -void Decoder::emitMul(Instr& instr, DynUopVec& uops) { - uint32_t dsts = instr.numStores + instr.numOutRegs; - if (dsts == 3) { - emitLoads(instr, uops); - - uint32_t srcs = instr.numLoads + instr.numInRegs; - - uint32_t srcRegs[srcs + 2]; - uint32_t dstRegs[dsts + 2]; - populateRegArrays(instr, srcRegs, dstRegs); - - assert(srcs <= 2); - - emitExecUop(srcRegs[0], srcRegs[1], dstRegs[0], REG_EXEC_TEMP, uops, 3, PORT_1); - emitExecUop(srcRegs[0], srcRegs[1], dstRegs[1], REG_EXEC_TEMP+1, uops, 3, PORT_1); - emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[2], 0, uops, 1, PORTS_015); +uint8_t Decoder::riscvInsArithRs1(INS ins) { + return (ins >> 15) & 0x1f; +} - emitStores(instr, uops); - } else { - emitBasicOp(instr, uops, 3, PORT_1); - } +uint8_t Decoder::riscvInsArithRs2(INS ins) { + return (ins >> 20) & 0x1f; } -void Decoder::emitDiv(Instr& instr, DynUopVec& uops) { - uint32_t srcs = instr.numLoads + instr.numInRegs; - uint32_t dsts = instr.numStores + instr.numOutRegs; +bool Decoder::riscvInsIsLoad(INS ins) { + auto opcode = riscvInsOpCode(ins); + return opcode == RISCV_OPCODE_LOAD || opcode == RISCV_OPCODE_VECTOR_LOAD; +} - /* div and idiv are microsequenced, with a variable number of uops on all ports, and have fixed - * input and output regs (rdx:rax is the input, rax is the quotient and rdx is the remainder). - * Also, the number of uops and latency depends on the data. We approximate this with a 4-uop - * sequence that sorta kinda emulates the typical latency. - */ +bool Decoder::riscvInsIsStore(INS ins) { + auto opcode = riscvInsOpCode(ins); + return opcode == RISCV_OPCODE_STORE || opcode == RISCV_OPCODE_VECTOR_STORE; +} - uint32_t srcRegs[srcs + 2]; - uint32_t dstRegs[dsts + 2]; - populateRegArrays(instr, srcRegs, dstRegs); +bool Decoder::riscvInsIsBranch(INS ins) { + auto opcode = riscvInsOpCode(ins); + return opcode == RISCV_OPCODE_BRANCH; +} - //assert(srcs == 3); //there is a variant of div that uses only 2 regs --> see below - //assert(dsts == 3); - assert(instr.numInRegs > 1); +uint8_t Decoder::riscvCompressedRegDecode(uint8_t reg) { + assert(reg <= 7); + return reg + 8; +} - uint32_t width = INS_OperandWidth(instr.ins, 1); - uint32_t lat = 0; - switch (width) { - case 8: - lat = 15; +bool Decoder::riscvInsIsMemAccess(INS ins) { + uint8_t opcode = riscvInsOpCode(ins); + switch (opcode) { + case RISCV_OPCODE_INTEGER: break; - case 16: - lat = 19; + case RISCV_OPCODE_INTEGER_32: break; - case 32: - lat = 23; + case RISCV_OPCODE_INTEGER_IMM: break; - case 64: - lat = 63; + // RV64I I-type word instructions (opcode = 0x1B) + case RISCV_OPCODE_INTEGER_IMM_32: + break; + case RISCV_OPCODE_LOAD: + return true; + case RISCV_OPCODE_STORE: + return true; + case RISCV_OPCODE_BRANCH: + case RISCV_OPCODE_JAL: + break; + case RISCV_OPCODE_JALR: + break; + case RISCV_OPCODE_LUI: + break; + case RISCV_OPCODE_AUIPC: + break; + case RISCV_OPCODE_ATOMIC: + return true; + case RISCV_OPCODE_SYSTEM: + break; + case RISCV_OPCODE_FENCE: + break; + case RISCV_OPCODE_MADD_FP: // FMADD.S, FMADD.D + case RISCV_OPCODE_MSUB_FP: // FMSUB.S, FMSUB.D + case RISCV_OPCODE_NMSUB_FP: // FNMSUB.S, FNMSUB.D + case RISCV_OPCODE_NMADD_FP: // FNMADD.S, FNMADD.D + break; + case RISCV_OPCODE_FP: // Floating point operations + break; + /* To some floating compress instructions: + * We will simply simulated it as a general purposed register + * since the value does not matter + */ + case RISCV_OPCODE_C0: + { + uint8_t func = (ins >> 13) & 0x07; + switch(func) { + case 0: + break; + case 1: + case 2: + case 3: + return true; + case 5: + case 6: + case 7: + return false; + default: + break; + } + } + break; + case RISCV_OPCODE_C1: + break; + case RISCV_OPCODE_C2: + { + uint8_t func = (ins >> 13) & 0x07; + switch (func) { + case 0: // C.SLLI64 + break; + case 1: // C.FLDSP + case 2: // C.LWSP + case 3: // C.LDSP + return true; + case 4: // C.JR C.MV C.EBREAK C.JALR C.ADD + break; + case 5: // C.FSDSP + case 6: // C.SWSP + case 7: // C.SDSP + return true; + } + } + break; + /* Use OOO's intrinsic register renaming to simulate vector instructions */ + case RISCV_OPCODE_VECTOR_LOAD: // Vector loads + case RISCV_OPCODE_VECTOR_STORE: + return true; + case RISCV_OPCODE_VECTOR_ARITH: // Vector arithmetic operations break; default: - panic("emitDiv: Invalid reg size"); + break; } - uint8_t extraSlots = lat-1; - if (srcs == 3 && dsts == 3) { - emitLoads(instr, uops); - emitExecUop(srcRegs[0], srcRegs[1], REG_EXEC_TEMP, 0, uops, lat, PORTS_015, extraSlots); - emitExecUop(srcRegs[0], srcRegs[2], REG_EXEC_TEMP+1, 0, uops, lat, PORTS_015, extraSlots); - emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[0], dstRegs[1], uops, 1, PORTS_015); //quotient and remainder - emitExecUop(REG_EXEC_TEMP, REG_EXEC_TEMP+1, dstRegs[2], 0, uops, 1, PORTS_015); //flags - - emitStores(instr, uops); - } else if (srcs <= 2 && dsts <= 2) { - emitBasicOp(instr, uops, lat, PORTS_015, extraSlots); - } else { - reportUnhandledCase(instr, "emitDiv"); - } + return false; } -//Helper function -static bool dropRegister(uint32_t targetReg, uint32_t* regs, uint32_t& numRegs) { - for (uint32_t i = 0; i < numRegs; i++) { - uint32_t reg = regs[i]; - if (reg == targetReg) { - //Shift rest of regs - for (uint32_t j = i; j < numRegs - 1; j++) regs[j] = regs[j+1]; - numRegs--; - return true; - } +bool Decoder::riscvInsIsStoreCond(INS ins) { + uint8_t opcode = riscvInsOpCode(ins); + uint8_t funct3 = riscvInsFunct3(ins); + uint8_t funct7 = riscvInsFunct7(ins); + switch (opcode) { + case RISCV_OPCODE_ATOMIC: + switch (funct3) { + case 0x2: // Word operations + case 0x3: // Double word operations + { + uint8_t amo_type = funct7 >> 2; + switch (amo_type) { + case 0x0: // AMOADD + case 0x4: // AMOXOR + case 0x8: // AMOOR + case 0xC: // AMOAND + case 0x10: // AMOMIN + case 0x14: // AMOMAX + case 0x18: // AMOMINU + case 0x1C: // AMOMAXU + case 0x1: // AMOSWAP + case 0x2: // LR (Load Reserved) + break; + case 0x3: // SC (Store Conditional) + return true; + default: + break; + } + break; + } + default: + break; + } + break; + default: + break; } - return false; -} -void Decoder::dropStackRegister(Instr& instr) { - bool dropIn = dropRegister(REG_RSP, instr.inRegs, instr.numInRegs); - bool dropOut = dropRegister(REG_RSP, instr.outRegs, instr.numOutRegs); - if (!dropIn && !dropOut) /*reportUnhandledCase(instr, "dropStackRegister (no RSP found)")*/; - else reportUnhandledCase(instr, "dropStackRegister (RSP found)"); + return false; } - bool Decoder::decodeInstr(INS ins, DynUopVec& uops) { uint32_t initialUops = uops.size(); bool inaccurate = false; - xed_category_enum_t category = (xed_category_enum_t) INS_Category(ins); - xed_iclass_enum_t opcode = (xed_iclass_enum_t) INS_Opcode(ins); + uint8_t opcode = riscvInsOpCode(ins); + uint8_t funct3 = riscvInsFunct3(ins); + uint8_t funct7 = riscvInsFunct7(ins); Instr instr(ins); bool isLocked = false; // NOTE(dsm): IsAtomicUpdate == xchg or LockPrefix (xchg has in implicit lock prefix) - if (INS_IsAtomicUpdate(instr.ins)) { + if (riscvInsIsAtomic(ins)) { isLocked = true; emitFence(uops, 0); //serialize the initial load w.r.t. all prior stores } - - switch (category) { - //NOPs are optimized out in the execution pipe, but they still grab a ROB entry - case XC(NOP): - case XC(WIDENOP): - emitExecUop(0, 0, 0, 0, uops, 1, PORTS_015); - break; - - /* Moves */ - case XC(DATAXFER): - switch (opcode) { - case XO(BSWAP): - emitBasicMove(instr, uops, 1, PORT_1); + switch (opcode) { + case RISCV_OPCODE_INTEGER: + switch (funct3) { + case 0x0: // ADD/SUB + if (funct7 == 0x00) { + // ADD + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x20) { + // SUB + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // MUL (M extension) + emitMul(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(MOV): - emitBasicMove(instr, uops, 1, PORTS_015); + case 0x1: // SLL + if (funct7 == 0x00) { + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + } else if (funct7 == 0x01) { + // MULH (M extension) + emitMul(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(MOVAPS): - case XO(MOVAPD): - case XO(MOVUPS): - case XO(MOVUPD): - case XO(MOVSS): - case XO(MOVSD): - case XO(MOVSD_XMM): - case XO(MOVHLPS): - case XO(MOVLHPS): - case XO(MOVDDUP): - case XO(MOVSHDUP): - case XO(MOVSLDUP): - emitBasicMove(instr, uops, 1, PORT_5); + case 0x2: // SLT + if (funct7 == 0x00) { + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // MULHSU (M extension) + emitMul(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(MOVHPS): - case XO(MOVHPD): - case XO(MOVLPS): - case XO(MOVLPD): - //A bit unclear... could be 2 or 3 cycles, and current microbenchmarks are not enough to tell - emitBasicOp(instr, uops, /*2*/ 1, PORT_5); + case 0x3: // SLTU + if (funct7 == 0x00) { + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // MULHU (M extension) + emitMul(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(MOVMSKPS): - case XO(MOVMSKPD): - emitBasicMove(instr, uops, 1, PORT_0); + case 0x4: // XOR + if (funct7 == 0x00) { + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // DIV (M extension) + emitDiv(uops, 64, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(MOVD): - case XO(MOVQ): - case XO(MOVDQA): - case XO(MOVDQU): - case XO(MOVDQ2Q): - case XO(MOVQ2DQ): - emitBasicMove(instr, uops, 1, PORTS_015); //like mov + case 0x5: // SRL/SRA + if (funct7 == 0x00) { + // SRL + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + } else if (funct7 == 0x20) { + // SRA + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + } else if (funct7 == 0x01) { + // DIVU (M extension) + emitDiv(uops, 64, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(MOVSX): - case XO(MOVSXD): - case XO(MOVZX): - emitBasicMove(instr, uops, 1, PORTS_015); //like mov + case 0x6: // OR + if (funct7 == 0x00) { + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // REM (M extension) + emitDiv(uops, 64, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; - case XO(XCHG): - emitXchg(instr, uops); + case 0x7: // AND + if (funct7 == 0x00) { + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // REMU (M extension) + emitDiv(uops, 64, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } break; default: - //TODO: MASKMOVQ, MASKMOVDQ, MOVBE (Atom only), MOVNTxx variants (nontemporal), MOV_CR and MOV_DR (privileged?), VMOVxxxx variants (AVX) inaccurate = true; - emitBasicMove(instr, uops, 1, PORTS_015); } break; - - case XC(CMOV): - emitConditionalMove(instr, uops, 1, PORTS_015); - break; - case XC(FCMOV): - emitConditionalMove(instr, uops, 1, PORT_0); + case RISCV_OPCODE_INTEGER_32: + switch (funct3) { + case 0x0: // ADDW/SUBW + if (funct7 == 0x00) { + // ADDW + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x20) { + // SUBW + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + } else if (funct7 == 0x01) { + // MULW (M extension) + emitMul(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } + break; + case 0x1: // SLLW + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + break; + case 0x4: // DIVW (M extension) + emitDiv(uops, 32, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + break; + case 0x5: // SRLW/SRAW + if (funct7 == 0x00) { + // SRLW + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + } else if (funct7 == 0x20) { + // SRAW + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + } else if (funct7 == 0x01) { + // DIVUW (M extension) + emitDiv(uops, 32, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + } + break; + case 0x6: // REMW (M extension) + case 0x7: // REMUW (M extension) + emitDiv(uops, 32, riscvInsArithRd(ins), riscvInsArithRs1(ins), + riscvInsArithRs2(ins)); + break; + default: + inaccurate = true; + } break; - - /* Barebones arithmetic instructions */ - case XC(BINARY): - { - if (opcode == XO(ADC) || opcode == XO(SBB)) { - uint32_t lats[] = {1, 1}; - uint8_t ports[] = {PORTS_015, PORTS_015}; - emitChainedOp(instr, uops, 2, lats, ports); - } else if (opcode == XO(MUL) || opcode == XO(IMUL)) { - emitMul(instr, uops); - } else if (opcode == XO(DIV) || opcode == XO(IDIV)) { - emitDiv(instr, uops); - } else { - //ADD, SUB, CMP, DEC, INC, NEG are 1 cycle - emitBasicOp(instr, uops, 1, PORTS_015); - } + case RISCV_OPCODE_INTEGER_IMM: + switch (funct3) { + case 0x0: // ADDI + case 0x2: // SLTI + case 0x3: // SLTIU + case 0x4: // XORI + case 0x6: // ORI + case 0x7: // ANDI + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + break; + case 0x1: // SLLI + case 0x5: // SRLI/SRAI + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + break; + default: + inaccurate = true; } break; - case XC(BITBYTE): - { - uint32_t opLat = 1; - switch (opcode) { - case XO(BSF): - case XO(BSR): - opLat = 3; - break; - //TODO: EXTRQ, INSERTQ, LZCNT - default: {} //BT, BTx, SETcc ops are 1 cycle - } - emitBasicOp(instr, uops, opLat, PORTS_015); + // RV64I I-type word instructions (opcode = 0x1B) + case RISCV_OPCODE_INTEGER_IMM_32: + switch (funct3) { + case 0x0: // ADDIW + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + break; + case 0x1: // SLLIW + case 0x5: // SRLIW/SRAIW + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); + break; + default: + inaccurate = true; } break; - case XC(LOGICAL): - //AND, OR, XOR, TEST are 1 cycle - emitBasicOp(instr, uops, 1, PORTS_015); + case RISCV_OPCODE_LOAD: + emitLoad(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); break; - case XC(ROTATE): - { - uint32_t opLat = 1; //ROR, ROL 1 cycle - if (opcode == XO(RCR) || opcode == XO(RCL)) opLat = 2; - emitBasicOp(instr, uops, opLat, PORT_0 | PORT_5); - } + case RISCV_OPCODE_STORE: + emitStore(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); break; - case XC(SHIFT): - { - if (opcode == XO(SHLD)|| opcode == XO(SHRD)) { - uint32_t lats[] = {2, opcode == XO(SHLD)? 1u : 2u}; //SHRD takes 4 cycles total, SHLD takes 3 - uint8_t ports[] = {PORTS_015, PORTS_015}; - emitChainedOp(instr, uops, 2, lats, ports); - } else { - uint32_t opLat = 1; //SHR SHL SAR are 1 cycle - emitBasicOp(instr, uops, opLat, PORT_0 | PORT_5); - } - } + case RISCV_OPCODE_BRANCH: + case RISCV_OPCODE_JAL: + emitExecUop(0, 0, + riscvInsArithRd(ins), 0, uops, 1, PORT_5); break; - case XC(DECIMAL): //pack/unpack BCD, these seem super-deprecated - { - uint32_t opLat = 1; - switch (opcode) { - case XO(AAA): - case XO(AAS): - case XO(DAA): - case XO(DAS): - opLat = 3; - break; - case XO(AAD): - opLat = 15; - break; - case XO(AAM): - opLat = 20; + case RISCV_OPCODE_JALR: + // All branches have the same uop profile in this model + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1, PORT_5); + break; + case RISCV_OPCODE_LUI: + emitExecUop(0, 0, + riscvInsArithRd(ins), 0, uops, 1, PORTS_015); + break; + case RISCV_OPCODE_AUIPC: + emitExecUop(0, 0, + riscvInsArithRd(ins), 0, uops, 1, PORT_1); + break; + case RISCV_OPCODE_ATOMIC: + switch (funct3) { + case 0x2: // Word operations + case 0x3: // Double word operations + { + uint8_t amo_type = funct7 >> 2; + switch (amo_type) { + case 0x0: // AMOADD + case 0x4: // AMOXOR + case 0x8: // AMOOR + case 0xC: // AMOAND + case 0x10: // AMOMIN + case 0x14: // AMOMAX + case 0x18: // AMOMINU + case 0x1C: // AMOMAXU + { + emitLoad(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); + emitExecUop(riscvInsArithRd(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 3, PORTS_015); + emitStore(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); + } + break; + case 0x1: // AMOSWAP + /* TODO: check with uops */ + emitXchg(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins), riscvInsArithRs2(ins)); + break; + case 0x2: // LR (Load Reserved) + /* TODO: check with the register */ + emitLoad(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); + break; + case 0x3: // SC (Store Conditional) + emitStore(uops, riscvInsArithRs2(ins), riscvInsArithRs1(ins)); + emitExecUop(0, 0, + riscvInsArithRd(ins), 0, uops, 3, PORTS_015); + break; + default: + inaccurate = true; + } break; - default: - panic("Invalid opcode for this class"); - } - emitBasicOp(instr, uops, opLat, PORTS_015); + } + default: + inaccurate = true; } break; - case XC(FLAGOP): - switch (opcode) { - case XO(LAHF): - case XO(SAHF): - emitBasicOp(instr, uops, 1, PORTS_015); - break; - case XO(CLC): - case XO(STC): - case XO(CMC): - emitBasicOp(instr, uops, 1, PORTS_015); + case RISCV_OPCODE_SYSTEM: + switch (funct3) { + case 0x0: // ECALL, EBREAK, MRET, SRET, URET, WFI + emitExecUop(0, 0, 0, 0, uops, 1, PORTS_015); break; - case XO(CLD): - emitExecUop(0, 0, REG_EXEC_TEMP, 0, uops, 2, PORTS_015); - emitExecUop(REG_EXEC_TEMP, 0, REG_RFLAGS, 0, uops, 2, PORTS_015); + case 0x1: // CSRRW + case 0x2: // CSRRS + case 0x3: // CSRRC + case 0x5: // CSRRWI + case 0x6: // CSRRSI + case 0x7: // CSRRCI + // CSR operations are more complex + emitExecUop(0, 0, 0, 0, uops, 5, PORTS_015); break; - case XO(STD): - emitExecUop(0, 0, REG_EXEC_TEMP, 0, uops, 3, PORTS_015); - emitExecUop(REG_EXEC_TEMP, 0, REG_RFLAGS, 0, uops, 2, PORTS_015); + case 9: // SFENCE.VMA + /* TODO: check the latency value */ + emitFence(uops, 25); break; default: inaccurate = true; } break; - - case XC(SEMAPHORE): //atomic ops, these must involve memory - //reportUnhandledCase(instr, "SEM"); - //emitBasicOp(instr, uops, 1, PORTS_015); - - switch (opcode) { - case XO(CMPXCHG): - case XO(CMPXCHG8B): - //case XO(CMPXCHG16B): //not tested... - emitCompareAndExchange(instr, uops); + case RISCV_OPCODE_FENCE: + switch (funct3) { + case 0x0: // FENCE + emitFence(uops, 9); break; - case XO(XADD): - { - uint32_t lats[] = {2, 2}; - uint8_t ports[] = {PORTS_015, PORTS_015}; - emitChainedOp(instr, uops, 2, lats, ports); - } + case 0x1: // FENCE.I + emitFence(uops, 12); break; default: inaccurate = true; } break; - - /* FP, SSE and other extensions */ - case /*XC(X)87_ALU*/ XC(X87_ALU): - //emitBasicOp(instr, uops, 1, PORTS_015); - break; - - case XED_CATEGORY_3DNOW: - //emitBasicOp(instr, uops, 1, PORTS_015); - break; - - case XC(MMX): - //emitBasicOp(instr, uops, 1, PORTS_015); + case RISCV_OPCODE_MADD_FP: // FMADD.S, FMADD.D + case RISCV_OPCODE_MSUB_FP: // FMSUB.S, FMSUB.D + case RISCV_OPCODE_NMSUB_FP: // FNMSUB.S, FNMSUB.D + case RISCV_OPCODE_NMADD_FP: // FNMADD.S, FNMADD.D + { + /* TODO: check fp register */ + uint8_t fmt = (ins >> 25) & 0x3; + if (fmt == 0x0) { // Single precision + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 4, PORT_0); + } else if (fmt == 0x1) { // Double precision + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 5, PORT_0); + } else { + inaccurate = true; + } + } break; - - case XC(SSE): + case RISCV_OPCODE_FP: // Floating point operations { - //TODO: Multi-uop BLENDVXX, DPXX - - uint32_t lat = 1; - uint8_t ports = PORTS_015; - uint8_t extraSlots = 0; - switch (opcode) { - case XO(ADDPD): - case XO(ADDPS): - case XO(ADDSD): - case XO(ADDSS): - case XO(SUBPD): - case XO(SUBPS): - case XO(SUBSD): - case XO(SUBSS): - case XO(ADDSUBPD): - case XO(ADDSUBPS): - lat = 3; - ports = PORT_1; + uint32_t fmt = (ins >> 25) & 0x3; + uint32_t fp_opcode = ins >> 27; + + switch (fp_opcode) { + case 0x0: // FADD.S/FADD.D + case 0x1: // FSUB.S/FSUB.D + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 3, PORT_1); break; - - case XO(BLENDPS): - case XO(BLENDPD): - case XO(SHUFPS): - case XO(SHUFPD): - case XO(UNPCKHPD): - case XO(UNPCKHPS): - case XO(UNPCKLPD): - case XO(UNPCKLPS): - lat = 1; - ports = PORT_5; + case 0x2: // FMUL.S/FMUL.D + if (fmt == 0x0) { // FMUL.S + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 4, PORT_1); + } else if (fmt == 0x1) { // FMUL.D + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 5, PORT_1); + } break; - - case XO(CMPPD): - case XO(CMPPS): - case XO(CMPSD): - case XO(CMPSS): - lat = 3; - ports = PORT_1; + case 0x3: // FDIV.S/FDIV.D + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 7, PORT_0, 6); break; - - case XO(COMISD): - case XO(COMISS): - case XO(UCOMISD): - case XO(UCOMISS): - lat = 1+2; //writes rflags, always crossing xmm -> int domains - ports = PORT_1; + case 0x4: // FSGNJ.S/FSGNJ.D, FSGNJN.S/FSGNJN.D, FSGNJX.S/FSGNJX.D + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 1, PORT_0 | PORT_5); break; - - case XO(DIVPS): - case XO(DIVSS): - lat = 7; //from mubench - ports = PORT_0; - extraSlots = lat - 1; //non-pipelined + case 0x5: // FMIN.S/FMIN.D, FMAX.S/FMAX.D + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 3, PORT_1); break; - case XO(DIVPD): - case XO(DIVSD): - lat = 7; //from mubench - ports = PORT_0; //non-pipelined - extraSlots = lat - 1; + case 0x6: // FCVT.W.S/FCVT.W.D, FCVT.WU.S/FCVT.WU.D + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 3 + 2, PORT_1); break; - - case XO(MAXPD): - case XO(MAXPS): - case XO(MAXSD): - case XO(MAXSS): - case XO(MINPD): - case XO(MINPS): - case XO(MINSD): - case XO(MINSS): - lat = 3; - ports = PORT_1; + case 0x7: // FMV.X.W/FMV.X.D, FCLASS.S/FCLASS.D + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1 + 2, PORT_1); break; - - case XO(MULSS): - case XO(MULPS): - lat = 4; - ports = PORT_0; + case 0x8: // FCMP.S/FCMP.D (FEQ, FLT, FLE) + emitExecUop(riscvInsArithRs1(ins), riscvInsArithRs2(ins), + riscvInsArithRd(ins), 0, uops, 3, PORT_1); break; - case XO(MULSD): - case XO(MULPD): - lat = 5; - ports = PORT_0; + case 0x9: // FCVT.S.W/FCVT.S.D, FCVT.S.WU/FCVT.D.WU + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 3 + 2, PORT_1); break; - - case XO(RCPPS): - case XO(RCPSS): - lat = 3; - ports = PORT_1; + case 0xA: // FMV.W.X/FMV.D.X + case 0xB: // FCVT.D.S/FCVT.S.D + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 1 + 2, PORT_0); break; - - case XO(ROUNDPD): - case XO(ROUNDPS): - case XO(ROUNDSD): - case XO(ROUNDSS): - lat = 3; - ports = PORT_1; + case 0xC: // FSQRT.S/FSQRT.D + emitExecUop(riscvInsArithRs1(ins), 0, + riscvInsArithRd(ins), 0, uops, 7, PORT_0); break; - - case XO(RSQRTPS): - case XO(RSQRTSS): - lat = 3; - ports = PORT_1; - extraSlots = 1; //from mubench, has reciprocal thput of 2 + default: + inaccurate = true; + } + } + break; + /* To some floating compress instructions: + * We will simply simulated it as a general purposed register + * since the value does not matter + */ + case RISCV_OPCODE_C0: + { + uint8_t func = (ins >> 13) & 0x07; + uint8_t rd = riscvCompressedRegDecode((ins >> 2) & 0x7); + uint8_t rs = riscvCompressedRegDecode((ins >> 7) & 0x7); + switch(func) { + case 0: + emitExecUop(0, 0, rd, 0, uops, 1, PORTS_015); break; - - case XO(SQRTSS): - case XO(SQRTPS): - lat = 7; //from mubench - ports = PORT_0; - extraSlots = lat-1; //unpiped + case 1: + case 2: + case 3: + emitLoad(uops, rd, rs); break; - - case XO(SQRTSD): - case XO(SQRTPD): - lat = 7; //from mubench - ports = PORT_0; - extraSlots = lat-1; //unpiped + case 5: + case 6: + case 7: + emitStore(uops, rs, rd); break; - - case XO(POPCNT): - case XO(CRC32): - lat = 3; - ports = PORT_1; + default: + inaccurate = true; + } + } + break; + case RISCV_OPCODE_C1: + { + uint8_t func = (ins >> 13) & 0x07; + switch (func) { + case 0: // C.ADDI + case 1: // C.ADDIW + { + uint8_t rsrd = (ins >> 7) & 0x1f; + emitExecUop(rsrd, 0, rsrd, 0, uops, 1, PORTS_015); + } break; - - //Packed arith; these are rare, so I'm implementing only what I've seen used (and simple variants) - case XO(PADDB): - case XO(PADDD): - case XO(PADDQ): - case XO(PADDSB): - case XO(PADDSW): - case XO(PADDUSB): - case XO(PADDUSW): - case XO(PADDW): - case XO(PSUBB): - case XO(PSUBD): - case XO(PSUBQ): - case XO(PSUBSB): - case XO(PSUBSW): - case XO(PSUBUSB): - case XO(PSUBUSW): - case XO(PSUBW): - - case XO(PALIGNR): - - case XO(PCMPEQB): - case XO(PCMPEQD): - case XO(PCMPEQQ): - case XO(PCMPEQW): - case XO(PCMPGTB): - case XO(PCMPGTD): - case XO(PCMPGTW): - - case XO(PUNPCKHBW): - case XO(PUNPCKHDQ): - case XO(PUNPCKHQDQ): - case XO(PUNPCKHWD): - case XO(PUNPCKLBW): - case XO(PUNPCKLDQ): - case XO(PUNPCKLQDQ): - case XO(PUNPCKLWD): - - case XO(PSHUFB): - case XO(PSHUFD): - case XO(PSHUFHW): - case XO(PSHUFLW): - lat = 1; - ports = PORT_0 | PORT_5; + case 2: // C.Li + case 3: // C.ADDI16SP C.LUI + { + uint8_t rsrd = (ins >> 7) & 0x1f; + emitExecUop(0, 0, rsrd, 0, uops, 1, PORTS_015); + } break; - - case XO(PCMPGTQ): //weeeird, only packed comparison that's done differently - lat = 3; - ports = PORT_1; + case 4: // Shift and Arith + { + uint8_t rsrd = riscvCompressedRegDecode((ins >> 7) & 0x07); + uint8_t subFunc = (ins >> 10) & 0x03; + if (subFunc != 3) { + emitExecUop(rsrd, 0, rsrd, 0, uops, 1, PORT_0 | PORT_5); + } else { + uint8_t rs2 = riscvCompressedRegDecode((ins >> 2) & 0x07); + emitExecUop(rsrd, rs2, rsrd, 0, uops, 1, PORTS_015); + } + } break; - - case XO(PMOVMSKB): - lat = 2+2; - ports = PORT_0; + case 5: // C.J + emitExecUop(0, 0, 1, 0, uops, 1, PORT_5); + break; + case 6: // C.BEQZ + case 7: // C.BNEZ + { + uint8_t rs = riscvCompressedRegDecode((ins >> 7) & 0x07); + emitExecUop(rs, 0, 0, 0, uops, 1, PORT_5); + } break; - - default: - inaccurate = true; } - emitBasicOp(instr, uops, lat, ports, extraSlots); } break; - - case XC(STTNI): //SSE 4.2 - break; - - case XC(CONVERT): //part of SSE - switch (opcode) { - case XO(CVTPD2PS): - case XO(CVTSD2SS): - emitConvert2Op(instr, uops, 2, 2, PORT_1, PORT_5); - break; - case XO(CVTPS2PD): - emitConvert2Op(instr, uops, 1, 1, PORT_0, PORT_5); - break; - case XO(CVTSS2SD): - emitBasicOp(instr, uops, 1, PORT_0); - break; - case XO(CVTDQ2PS): - case XO(CVTPS2DQ): - case XO(CVTTPS2DQ): - emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); - break; - case XO(CVTDQ2PD): - case XO(CVTPD2DQ): - case XO(CVTTPD2DQ): - emitConvert2Op(instr, uops, 2, 2+2 /*domain change*/, PORT_1, PORT_5); - break; - case XO(CVTPI2PS): - case XO(CVTPS2PI): - case XO(CVTTPS2PI): - emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); - break; - case XO(CVTPI2PD): - case XO(CVTPD2PI): - case XO(CVTTPD2PI): - emitConvert2Op(instr, uops, 2, 2+2 /*domain change*/, PORT_1, PORT_0 | PORT_5); - break; - case XO(CVTSI2SS): - case XO(CVTSS2SI): - case XO(CVTTSS2SI): - emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); - break; - case XO(CVTSI2SD): - emitConvert2Op(instr, uops, 2, 2+2 /*domain change*/, PORT_1, PORT_0); - break; - case XO(CVTSD2SI): - case XO(CVTTSD2SI): - emitBasicOp(instr, uops, 3+2 /*domain change*/, PORT_1); - break; - case XO(CBW): - case XO(CWDE): - case XO(CDQE): - emitBasicOp(instr, uops, 1, PORTS_015); - break; - case XO(CWD): - case XO(CDQ): - case XO(CQO): - emitBasicOp(instr, uops, 1, PORT_0 | PORT_5); - break; - - default: // AVX converts - inaccurate = true; + case RISCV_OPCODE_C2: + { + uint8_t func = (ins >> 13) & 0x07; + uint8_t rsrd = (ins >> 7) & 0x1f; + uint8_t rs2 = (ins >> 2) & 0x1f; + switch (func) { + case 0: // C.SLLI64 + emitExecUop(rsrd, 0, rsrd, 0, uops, 1, PORT_0 | PORT_5); + break; + case 1: // C.FLDSP + case 2: // C.LWSP + case 3: // C.LDSP + emitLoad(uops, rsrd, 2); + break; + case 4: // C.JR C.MV C.EBREAK C.JALR C.ADD + { + uint8_t funct1 = (ins >> 12) & 0x01; + if (funct1 == 0) { + if (rs2 == 0) { + emitExecUop(0, 0, rsrd, 0, uops, 1, PORT_5); + } else { + emitExecUop(rs2, 0, rsrd, 0, uops, 1, PORT_5); + } + } else { + if (rsrd == 0 && rs2 == 0) { + /* EBREAK */ + emitExecUop(0, 0, 0, 0, uops, 1, PORTS_015); + } else if (rsrd != 0 && rs2 == 0) { + /* C.JALR */ + emitExecUop(rsrd, 0, 1, 0, uops, 1, PORT_5); + } else { + /* C.ADD */ + emitExecUop(rsrd, rs2, rsrd, 0, uops, 1, PORTS_015); + } + } + } + break; + case 5: // C.FSDSP + case 6: // C.SWSP + case 7: // C.SDSP + emitStore(uops, 2, rs2); + break; + } } break; - - case XC(AVX): - //TODO: Whatever, Nehalem has no AVX - break; - - case XC(BROADCAST): //part of AVX - //TODO: Same as AVX - break; - - case XC(AES): - break; - - case XC(PCLMULQDQ): //CLMUL extension (carryless multiply, generally related to AES-NI) - break; - - case XC(XSAVE): - case XC(XSAVEOPT): //hold your horses, it's optimized!! (AVX) - break; - - /* Control flow ops (branches, jumps) */ - case XC(COND_BR): - case XC(UNCOND_BR): - // We model all branches and jumps with a latency of 1. Far jumps are really expensive, but they should be exceedingly rare (from Intel's manual, they are used for call gates, task switches, etc.) - emitBasicOp(instr, uops, 1, PORT_5); - if (opcode == XO(JMP_FAR)) inaccurate = true; - break; - - /* Stack operations */ - case XC(CALL): - case XC(RET): - /* Call and ret are both unconditional branches and stack operations; however, Pin does not list RSP as source or destination for them */ - //dropStackRegister(instr); //stack engine kills accesses to RSP - emitBasicOp(instr, uops, 1, PORT_5); - if (opcode != XO(CALL_NEAR) && opcode != XO(RET_NEAR)) inaccurate = true; //far call/ret or irets are far more complex - break; - - case XC(POP): - case XC(PUSH): - //Again, RSP is not included here, so no need to remove it. - switch (opcode) { - case XO(POP): - case XO(PUSH): - //Basic PUSH/POP are just moves. They are always to/from memory, so PORTS is irrelevant - emitBasicMove(instr, uops, 1, PORTS_015); - break; - case XO(POPF): - case XO(POPFD): - case XO(POPFQ): - //Java uses POPFx/PUSHFx variants. POPF is complicated, 8 uops... microsequenced - inaccurate = true; - emitBasicOp(instr, uops, 14, PORTS_015); - break; - case XO(PUSHF): - case XO(PUSHFD): - case XO(PUSHFQ): - //This one we can handle... 2 exec uops + store and reciprocal thput of 1 - { - uint32_t lats[] = {1, 1}; - uint8_t ports[] = {PORTS_015, PORTS_015}; - emitChainedOp(instr, uops, 2, lats, ports); + /* Use OOO's intrinsic register renaming to simulate vector instructions */ + case RISCV_OPCODE_VECTOR_LOAD: // Vector loads + case RISCV_OPCODE_VECTOR_STORE: + { + if (funct3 != 0) { + if (opcode == RISCV_OPCODE_VECTOR_LOAD) { + emitLoad(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); + } else { + emitStore(uops, riscvInsArithRd(ins), riscvInsArithRs1(ins)); } - break; - - default: - inaccurate = true; - } - break; - - /* Prefetches */ - case XC(PREFETCH): - //A prefetch is just a load that doesn't feed into any register (or REG_TEMP in this case) - //NOTE: Not exactly, because this will serialize future loads under TSO - emitLoads(instr, uops); - break; - - /* Stuff on the system side (some of these are privileged) */ - case XC(INTERRUPT): - case XC(SYSCALL): - case XC(SYSRET): - case XC(IO): - break; - - case XC(SYSTEM): - //TODO: Privileged ops are not included - /*switch(opcode) { - case XO(RDTSC): - case XO(RDTSCP): - opLat = 24; - break; - case XO(RDPMC): - opLat = 40; - break; - default: ; - }*/ - break; - - case XC(SEGOP): - //TODO: These are privileged, right? They are expensive but rare anyhow - break; - - case XC(VTX): //virtualization, hmmm - //TODO - break; - - - /* String ops (I'm reading the manual and they seem just like others... wtf?) */ - case XC(STRINGOP): - switch (opcode) { - case XO(STOSB): - case XO(STOSW): - case XO(STOSD): - case XO(STOSQ): - //mov [rdi] <- rax - //add rdi, 8 - //emitBasicOp(instr, uops, 1, PORTS_015); //not really, this emits the store later and there's no dep (the load is direct to reg) - emitStore(instr, 0, uops, REG_RAX); - emitExecUop(REG_RDI, 0, REG_RDI, 0, uops, 1, PORTS_015); - break; - case XO(LODSB): - case XO(LODSW): - case XO(LODSD): - case XO(LODSQ): - //mov rax <- [rsi] - //add rsi, 8 - emitLoad(instr, 0, uops, REG_RAX); - emitExecUop(REG_RSI, 0, REG_RSI, 0, uops, 1, PORTS_015); - break; - case XO(MOVSB): - case XO(MOVSW): - case XO(MOVSD): - case XO(MOVSQ): - //lodsX + stosX - emitLoad(instr, 0, uops, REG_RAX); - emitStore(instr, 0, uops, REG_RAX); - emitExecUop(REG_RSI, 0, REG_RSI, 0, uops, 1, PORTS_015); - emitExecUop(REG_RDI, 0, REG_RDI, 0, uops, 1, PORTS_015); - break; - case XO(CMPSB): - case XO(CMPSW): - case XO(CMPSD): - case XO(CMPSQ): - //load [rsi], [rdi], compare them, and add the other 2 - //Agner's tables say all exec uops can go anywhere, but I'm betting the comp op only goes in port5 - emitLoad(instr, 0, uops, REG_LOAD_TEMP); - emitLoad(instr, 0, uops, REG_LOAD_TEMP+1); - emitExecUop(REG_LOAD_TEMP, REG_LOAD_TEMP+1, REG_RFLAGS, 0, uops, 1, PORT_5); - emitExecUop(REG_RSI, 0, REG_RSI, 0, uops, 1, PORTS_015); - emitExecUop(REG_RDI, 0, REG_RDI, 0, uops, 1, PORTS_015); - break; - default: //SCAS and other dragons I have not seen yet - inaccurate = true; + } else { + /* Loads and stores of these instructions will be provided + * by the frontend + */ + } } break; - case XC(IOSTRINGOP): - //TODO: These seem to make sense with REP, which Pin unfolds anyway. Are they used al all? - break; - - /* Stuff not even the Intel guys know how to classify :P */ - case XC(MISC): - if (opcode == XO(LEA)) { - emitBasicOp(instr, uops, 1, PORT_1); - } else if (opcode == XO(PAUSE)) { - //Pause is weird. It takes 9 cycles, issues 5 uops (to be treated like a complex instruction and put a wrench on the decoder?), - //and those uops are issued to PORT_015. No idea about how individual uops are sized, but in ubenchs I cannot put even an ADD - //between pauses for free, so I'm assuming it's 9 solid cycles total. - emitExecUop(0, 0, 0, 0, uops, 9, PORTS_015, 8); //9, longest first - emitExecUop(0, 0, 0, 0, uops, 5, PORTS_015, 4); //NOTE: latency does not matter - emitExecUop(0, 0, 0, 0, uops, 5, PORTS_015, 4); - emitExecUop(0, 0, 0, 0, uops, 4, PORTS_015, 3); - emitExecUop(0, 0, 0, 0, uops, 4, PORTS_015, 3); + case RISCV_OPCODE_VECTOR_ARITH: // Vector arithmetic operations + { + // uint32_t vectorOp = INS_VectorOp(ins); + // uint32_t width = INS_VectorWidth(ins); + // uint32_t elements = INS_VectorElements(ins); + uint32_t width = 32, elements = 16; + + // Approximate vector operations based on width and element count + uint32_t baseLatency = 1; + uint8_t port = PORTS_015; + + /* TODO: check this */ + // Adjust latency based on operation type + // switch (vectorOp) { + // case RV_VOP_ADD: + // case RV_VOP_SUB: + // case RV_VOP_AND: + // case RV_VOP_OR: + // case RV_VOP_XOR: + // baseLatency = 1; + // break; + // case RV_VOP_MUL: + // baseLatency = 4; + // port = PORT_0; + // break; + // case RV_VOP_DIV: + // baseLatency = 20; + // port = PORT_0; + // break; + // case RV_VOP_FP_ADD: + // case RV_VOP_FP_SUB: + // baseLatency = 3; + // port = PORT_1; + // break; + // case RV_VOP_FP_MUL: + // baseLatency = 4; + // port = PORT_0; + // break; + // case RV_VOP_FP_DIV: + // baseLatency = 10; + // port = PORT_0; + // break; + // default: + // baseLatency = 1; + // } + + baseLatency = 8; + port = PORT_0; + + // Emit vector operations based on vector length + uint32_t numUops = (width * elements + 63) / 64; // Rough approximation + numUops = numUops == 0 ? 1 : numUops; // At least one uop + + for (uint32_t i = 0; i < numUops; i++) { + emitExecUop(0, 0, 0, 0, uops, baseLatency, port); + } } - /*switch (opcode) { - case CPUID: - case ENTER: - case LEAVE: - case LEA: - case LFENCE: - case MFENCE: - case SFENCE: - case MONITOR: - case MWAIT: - case UD2: - case XLAT: - }*/ - //TODO break; - - default: {} - //panic("Invalid instruction category"); - } - - //Try to produce something approximate... - if (uops.size() - initialUops == isLocked? 1 : 0) { //if it's locked, we have the initial fence for an empty instr - emitBasicOp(instr, uops, 1, PORTS_015, 0, false /* don't report unhandled cases */); - inaccurate = true; + + default: + inaccurate = true; + // Try to produce something approximate + emitExecUop(0, 0, 0, 0, uops, 1, PORTS_015); } //NOTE: REP instructions are unrolled by PIN, so they are accurately simulated (they are treated as predicated in Pin) @@ -1185,176 +962,69 @@ bool Decoder::decodeInstr(INS ins, DynUopVec& uops) { // See Agner Fog's uarch doc, macro-op fusion for Core 2 / Nehalem bool Decoder::canFuse(INS ins) { - xed_iclass_enum_t opcode = (xed_iclass_enum_t) INS_Opcode(ins); - if (!(opcode == XO(CMP) || opcode == XO(TEST))) return false; - //Discard if immediate - for (uint32_t op = 0; op < INS_OperandCount(ins); op++) if (INS_OperandIsImmediate(ins, op)) return false; - - //OK so far, let's check the branch - INS nextIns = INS_Next(ins); - if (!INS_Valid(nextIns)) return false; - xed_iclass_enum_t nextOpcode = (xed_iclass_enum_t) INS_Opcode(nextIns); - xed_category_enum_t nextCategory = (xed_category_enum_t) INS_Category(nextIns); - if (nextCategory != XC(COND_BR)) return false; - if (!INS_IsDirectBranch(nextIns)) return false; //according to PIN's API, this s only true for PC-rel near branches - - switch (nextOpcode) { - case XO(JZ): //or JZ - case XO(JNZ): //or JNE - case XO(JB): - case XO(JBE): - case XO(JNBE): //or JA - case XO(JNB): //or JAE - case XO(JL): - case XO(JLE): - case XO(JNLE): //or JG - case XO(JNL): //or JGE + uint8_t opcode = riscvInsOpCode(ins); + switch (opcode) { + case RISCV_OPCODE_BRANCH: + case RISCV_OPCODE_JAL: return true; - case XO(JO): - case XO(JNO): - case XO(JP): - case XO(JNP): - case XO(JS): - case XO(JNS): - return opcode == XO(TEST); //CMP cannot fuse with these + case RISCV_OPCODE_JALR: default: - return false; //other instrs like LOOP don't fuse + return false; } } -bool Decoder::decodeFusedInstrs(INS ins, DynUopVec& uops) { - //assert(canFuse(ins)); //this better be true :) - - Instr instr(ins); - Instr branch(INS_Next(ins)); - - //instr should have 2 inputs (regs/mem), and 1 output (rflags), and branch should have 2 inputs (rip, rflags) and 1 output (rip) - - if (instr.numOutRegs != 1 || instr.outRegs[0] != REG_RFLAGS || - branch.numOutRegs != 1 || branch.outRegs[0] != REG_RIP) - { - reportUnhandledCase(instr, "decodeFusedInstrs"); - reportUnhandledCase(branch, "decodeFusedInstrs"); - } else { - instr.outRegs[1] = REG_RIP; - instr.numOutRegs++; - } - - emitBasicOp(instr, uops, 1, PORT_5); - return false; //accurate -} - - -#ifdef BBL_PROFILING - -//All is static for now... -#define MAX_BBLS (1<<24) //16M - -static lock_t bblIdxLock = 0; -static uint64_t bblIdx = 0; - -static uint64_t bblCount[MAX_BBLS]; -static std::vector* bblApproxOpcodes[MAX_BBLS]; - -#endif - -BblInfo* Decoder::decodeBbl(BBL bbl, bool oooDecoding) { - uint32_t instrs = BBL_NumIns(bbl); - uint32_t bytes = BBL_Size(bbl); +BblInfo* Decoder::decodeBbl(struct BasicBlock &bbl, bool oooDecoding) { + auto bytes = bbl.codeBytes; BblInfo* bblInfo; - if (oooDecoding) { - //Decode BBL - uint32_t approxInstrs = 0; - uint32_t curIns = 0; - DynUopVec uopVec; - -#ifdef BBL_PROFILING - std::vector approxOpcodes; - - //XED decoder init - xed_state_t dstate; - xed_decoded_inst_t xedd; - xed_state_zero(&dstate); - xed_state_init(&dstate, XED_MACHINE_MODE_LONG_64, XED_ADDRESS_WIDTH_64b, XED_ADDRESS_WIDTH_64b); - xed_decoded_inst_zero_set_mode(&xedd, &dstate); -#endif - - //Gather some info about instructions needed to model decode stalls - std::vector instrAddr; - std::vector instrBytes; - std::vector instrUops; - std::vector instrDesc; - - //Decode - for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins)) { - bool inaccurate = false; - uint32_t prevUops = uopVec.size(); - if (Decoder::canFuse(ins)) { - inaccurate = Decoder::decodeFusedInstrs(ins, uopVec); - instrAddr.push_back(INS_Address(ins)); - instrBytes.push_back(INS_Size(ins)); - instrUops.push_back(uopVec.size() - prevUops); - instrDesc.push_back(ins); - - ins = INS_Next(ins); //skip the JMP - - instrAddr.push_back(INS_Address(ins)); - instrBytes.push_back(INS_Size(ins)); - instrUops.push_back(0); - instrDesc.push_back(ins); - - curIns+=2; - } else { - inaccurate = Decoder::decodeInstr(ins, uopVec); - - instrAddr.push_back(INS_Address(ins)); - instrBytes.push_back(INS_Size(ins)); - instrUops.push_back(uopVec.size() - prevUops); - instrDesc.push_back(ins); - - curIns++; - } -#ifdef PROFILE_ALL_INSTRS - inaccurate = true; //uncomment to profile everything -#endif - if (inaccurate) { - approxInstrs++; -#ifdef BBL_PROFILING - xed_decoded_inst_zero_keep_mode(&xedd); //need to do this per instruction - xed_iform_enum_t iform = XED_IFORM_INVALID; - uint8_t buf[16]; - //Using safecopy, we bypass pagefault uglyness due to out-of-bounds accesses - size_t insBytes = PIN_SafeCopy(buf, INS_Address(ins), 15); - xed_error_enum_t err = xed_decode(&xedd, buf, insBytes); - if (err != XED_ERROR_NONE) { - panic("xed_decode failed: %s", xed_error_enum_t2str(err)); - } else { - iform = xed_decoded_inst_get_iform_enum(&xedd); - } - approxOpcodes.push_back((uint32_t)iform); -#endif - //info("Approx decoding: %s", INS_Disassemble(ins).c_str()); - } + bbl.resetProgramIndex(); + //Gather some info about instructions needed to model decode stalls + std::vector instrAddr; + std::vector instrBytes; + std::vector instrUops; + std::vector instrDesc; + + //Decode BBL + uint32_t approxInstrs = 0; + DynUopVec uopVec; + //Decode + size_t instIndex = 0; + uint8_t instLength = 0; + for (INS ins = bbl.getHeadInstruction(&instIndex, &instLength); !bbl.endOfBlock(); + ins = bbl.getHeadInstruction(&instIndex, &instLength)) { + bool inaccurate = false; + uint32_t prevUops = uopVec.size(); + inaccurate = Decoder::decodeInstr(ins, uopVec); + + instrAddr.push_back(bbl.virtualPc + instIndex); + instrBytes.push_back(instLength); + instrUops.push_back(uopVec.size() - prevUops); + instrDesc.push_back(ins); + + if (inaccurate) { + approxInstrs++; + } + if (Decoder::canFuse(ins)) { + break; } - assert(curIns == instrs); + } + if (oooDecoding) { //Instr predecoder and decode stage modeling; we assume clean slate between BBLs, which is typical because //optimizing compilers 16B-align most branch targets (and if it doesn't happen, the error introduced is fairly small) //1. Predecoding - uint32_t predecCycle[instrs]; + std::vector predecCycle; uint32_t pcyc = 0; uint32_t psz = 0; uint32_t pcnt = 0; uint32_t pblk = 0; - ADDRINT startAddr = (INS_Address(instrDesc[0]) >> 4) << 4; + ADDRINT startAddr = bbl.virtualPc & ~0xfUL; - for (uint32_t i = 0; i < instrs; i++) { - INS ins = instrDesc[i]; - ADDRINT addr = INS_Address(ins); - uint32_t bytes = INS_Size(ins); + for (uint32_t i = 0; i < instrDesc.size(); i++) { + ADDRINT addr = instrAddr[i]; + uint32_t bytes = instrBytes[i]; uint32_t block = (addr - startAddr) >> 4; psz += bytes; pcnt++; @@ -1385,7 +1055,7 @@ BblInfo* Decoder::decodeBbl(BBL bbl, bool oooDecoding) { uint32_t dsimple = 0; uint32_t dcomplex = 0; - for (uint32_t i = 0; i < instrs; i++) { + for (uint32_t i = 0; i < instrDesc.size(); i++) { if (instrUops[i] == 0) continue; //fused branch uint32_t pcyc = predecCycle[i]; @@ -1423,7 +1093,7 @@ BblInfo* Decoder::decodeBbl(BBL bbl, bool oooDecoding) { //Initialize ooo part DynBbl& dynBbl = bblInfo->oooBbl[0]; - dynBbl.addr = BBL_Address(bbl); + dynBbl.addr = bbl.virtualPc; dynBbl.uops = uopVec.size(); dynBbl.approxInstrs = approxInstrs; for (uint32_t i = 0; i < dynBbl.uops; i++) dynBbl.uop[i] = uopVec[i]; @@ -1444,8 +1114,9 @@ BblInfo* Decoder::decodeBbl(BBL bbl, bool oooDecoding) { } //Initialize generic part - bblInfo->instrs = instrs; + bblInfo->instrs = instrDesc.size(); bblInfo->bytes = bytes; + // printf("PC: %016lx, inst count: %d\n", bbl.virtualPc, bblInfo->instrs); return bblInfo; } diff --git a/src/decoder.h b/src/decoder.h index c0fe1cb..f74b939 100644 --- a/src/decoder.h +++ b/src/decoder.h @@ -26,9 +26,10 @@ #ifndef DECODER_H_ #define DECODER_H_ -#include +#include "core.h" +#include +#include #include -#include "pin.H" // Uncomment to get a count of BBLs run. This is currently used to get a distribution of inaccurate instructions decoded that are actually run // NOTE: This is not multiprocess-safe @@ -77,7 +78,11 @@ struct DynBbl { } }; -struct BblInfo; // defined in core.h +struct BblInfo { + uint32_t instrs; + uint32_t bytes; + DynBbl oooBbl[0]; //0 bytes, but will be 1-sized when we have an element (and that element has variable size as well) +}; /* These are absolute maximums per instruction. If there is some non-conforming instruction, either increase these limits or * treat it as a special case. @@ -89,16 +94,37 @@ struct BblInfo; // defined in core.h #define MAX_UOPS_PER_INSTR 12 // technically, even full decoders produce 1-4 uops; we increase this for common microsequenced instructions (e.g. xchg). -/* Temporary register offsets */ -#define REG_LOAD_TEMP (REG_LAST + 1) // REG_LAST defined by PIN -#define REG_STORE_TEMP (REG_LOAD_TEMP + MAX_INSTR_LOADS) -#define REG_STORE_ADDR_TEMP (REG_STORE_TEMP + MAX_INSTR_STORES) -#define REG_EXEC_TEMP (REG_STORE_ADDR_TEMP + MAX_INSTR_STORES) - -#define MAX_REGISTERS (REG_EXEC_TEMP + 64) +#define MAX_REGISTERS 32 typedef std::vector DynUopVec; +/* RISC-V definitions */ +#define RISCV_OPCODE_ATOMIC 0x2f +#define RISCV_OPCODE_INTEGER 0x33 +#define RISCV_OPCODE_INTEGER_IMM 0x13 +#define RISCV_OPCODE_INTEGER_32 0x3b +#define RISCV_OPCODE_INTEGER_IMM_32 0x1b +#define RISCV_OPCODE_LOAD 0x03 +#define RISCV_OPCODE_STORE 0x23 +#define RISCV_OPCODE_BRANCH 0x63 +#define RISCV_OPCODE_JAL 0x6f +#define RISCV_OPCODE_JALR 0x67 +#define RISCV_OPCODE_LUI 0x37 +#define RISCV_OPCODE_AUIPC 0x17 +#define RISCV_OPCODE_SYSTEM 0x73 +#define RISCV_OPCODE_FENCE 0x0f +#define RISCV_OPCODE_MADD_FP 0x43 +#define RISCV_OPCODE_MSUB_FP 0x47 +#define RISCV_OPCODE_NMSUB_FP 0x4b +#define RISCV_OPCODE_NMADD_FP 0x4f +#define RISCV_OPCODE_FP 0x53 +#define RISCV_OPCODE_C0 0x0 +#define RISCV_OPCODE_C1 0x1 +#define RISCV_OPCODE_C2 0x2 +#define RISCV_OPCODE_VECTOR_LOAD 0x07 +#define RISCV_OPCODE_VECTOR_STORE 0x27 +#define RISCV_OPCODE_VECTOR_ARITH 0x57 + //Nehalem-style decoder. Fully static for now class Decoder { private: @@ -121,12 +147,26 @@ class Decoder { private: //Put registers in some canonical order -- non-flags first - void reorderRegs(uint32_t* regArray, uint32_t numRegs); + // void reorderRegs(uint32_t* regArray, uint32_t numRegs); }; public: //If oooDecoding is true, produces a DynBbl with DynUops that can be used in OOO cores - static BblInfo* decodeBbl(BBL bbl, bool oooDecoding); + static BblInfo* decodeBbl(struct BasicBlock &bbl, bool oooDecoding); + + static uint8_t riscvInsOpCode(INS ins); + static uint8_t riscvInsFunct3(INS ins); + static uint8_t riscvInsFunct7(INS ins); + static uint8_t riscvInsIsAtomic(INS ins); + static uint8_t riscvInsArithRd(INS ins); + static uint8_t riscvInsArithRs1(INS ins); + static uint8_t riscvInsArithRs2(INS ins); + static uint8_t riscvCompressedRegDecode(uint8_t reg); + static bool riscvInsIsLoad(INS ins); + static bool riscvInsIsStore(INS ins); + static bool riscvInsIsBranch(INS ins); + static bool riscvInsIsMemAccess(INS ins); + static bool riscvInsIsStoreCond(INS ins); #ifdef BBL_PROFILING static void profileBbl(uint64_t bblIdx); @@ -140,12 +180,8 @@ class Decoder { /* Every emit function can produce 0 or more uops; it returns the number of uops. These are basic templates to make our life easier */ //By default, these emit to temporary registers that depend on the index; this can be overriden, e.g. for moves - static void emitLoad(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t destReg = 0); - static void emitStore(Instr& instr, uint32_t idx, DynUopVec& uops, uint32_t srcReg = 0); - - //Emit all loads and stores for this uop - static void emitLoads(Instr& instr, DynUopVec& uops); - static void emitStores(Instr& instr, DynUopVec& uops); + static void emitLoad(DynUopVec& uops, uint16_t destReg, uint16_t baseReg); + static void emitStore(DynUopVec& uops, uint16_t dataReg, uint16_t addrReg); //Emits a load-store fence uop static void emitFence(DynUopVec& uops, uint32_t lat); @@ -155,37 +191,18 @@ class Decoder { /* Instruction emits */ - static void emitBasicMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports); - static void emitConditionalMove(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports); - - // 1 "exec" uop, 0-2 inputs, 0-2 outputs - static void emitBasicOp(Instr& instr, DynUopVec& uops, uint32_t lat, uint8_t ports, - uint8_t extraSlots = 0, bool reportUnhandled = true); - - // >1 exec uops in a chain: each uop takes 2 inputs, produces 1 output to the next op - // in the chain; the final op writes to the 0-2 outputs - static void emitChainedOp(Instr& instr, DynUopVec& uops, uint32_t numUops, - uint32_t* latArray, uint8_t* portsArray); - - // Some convert ops need 2 chained exec uops, though they have a single input and output - static void emitConvert2Op(Instr& instr, DynUopVec& uops, uint32_t lat1, uint32_t lat2, - uint8_t ports1, uint8_t ports2); + static void emitBasicMove(DynUopVec& uops, uint16_t rd, uint32_t lat, uint8_t ports); /* Specific cases */ - static void emitXchg(Instr& instr, DynUopVec& uops); - static void emitMul(Instr& instr, DynUopVec& uops); - static void emitDiv(Instr& instr, DynUopVec& uops); - - static void emitCompareAndExchange(Instr&, DynUopVec&); + static void emitXchg(DynUopVec& uops, uint16_t rd, uint16_t rs1, uint16_t rs2); + static void emitMul(DynUopVec& uops, uint16_t rd, uint16_t rs1, uint16_t rs2); + static void emitDiv(DynUopVec& uops, uint8_t width, uint16_t rd, uint16_t rs1, uint16_t rs2); /* Other helper functions */ static void reportUnhandledCase(Instr& instr, const char* desc); - static void populateRegArrays(Instr& instr, uint32_t* srcRegs, uint32_t* dstRegs); - static void dropStackRegister(Instr& instr); /* Macro-op (ins) fusion */ static bool canFuse(INS ins); - static bool decodeFusedInstrs(INS ins, DynUopVec& uops); }; #endif // DECODER_H_ diff --git a/src/init.cpp b/src/init.cpp index e31f0eb..e8abc75 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -64,7 +64,6 @@ #include "numa_map.h" #include "ooo_core.h" #include "part_repl_policies.h" -#include "pin_cmd.h" #include "prefetcher.h" #include "proc_stats.h" #include "process_stats.h" @@ -82,7 +81,6 @@ #include "timing_event.h" #include "trace_driver.h" #include "tracing_cache.h" -#include "virt/port_virtualizer.h" #include "weave_md1_mem.h" //validation, could be taken out... #include "zsim.h" @@ -1312,10 +1310,8 @@ void SimInit(const char* configFile, const char* outputDir, uint32_t shmid) { zinfo->attachDebugger = config.get("sim.attachDebugger", false); zinfo->harnessPid = getppid(); zinfo->debugPortId = static_cast(config.get("sim.debugPortId", 0)); - getLibzsimAddrs(&zinfo->libzsimAddrs); if (zinfo->attachDebugger) { - gm_set_secondary_ptr(&zinfo->libzsimAddrs); notifyHarnessForDebugger(zinfo->harnessPid, zinfo->debugPortId); } @@ -1414,16 +1410,11 @@ void SimInit(const char* configFile, const char* outputDir, uint32_t shmid) { if (zinfo->pageSize < zinfo->lineSize) panic("Page size must be no smaller than line size."); if (!isPow2(zinfo->pageSize)) panic("Page size must be power-of-two."); - //Port virtualization - for (uint32_t i = 0; i < MAX_PORT_DOMAINS; i++) zinfo->portVirt[i] = new PortVirtualizer(); - //Process hierarchy //NOTE: Due to partitioning, must be done before initializing memory hierarchy CreateProcessTree(config); zinfo->procArray[0]->notifyStart(); //called here so that we can detect end-before-start races - zinfo->pinCmd = new PinCmd(&config, nullptr /*don't pass config file to children --- can go either way, it's optional*/, outputDir, shmid); - //NUMA map InitNUMA(config); @@ -1465,7 +1456,7 @@ void SimInit(const char* configFile, const char* outputDir, uint32_t shmid) { config.get("sim.aslr", false); //Write config out - bool strictConfig = config.get("sim.strictConfig", true); //if true, panic on unused variables + bool strictConfig = config.get("sim.strictConfig", false); //if true, panic on unused variables config.writeAndClose((string(zinfo->outputDir) + "/out.cfg").c_str(), strictConfig); zinfo->contentionSim->postInit(); diff --git a/src/interval_recorder.h b/src/interval_recorder.h index ee60e87..d09e41b 100644 --- a/src/interval_recorder.h +++ b/src/interval_recorder.h @@ -1,6 +1,7 @@ #ifndef INTERVAL_RECORDER_H_ #define INTERVAL_RECORDER_H_ +#include #include "galloc.h" #include "intrusive_list.h" diff --git a/src/ipc_handler.cpp b/src/ipc_handler.cpp new file mode 100644 index 0000000..e811f4d --- /dev/null +++ b/src/ipc_handler.cpp @@ -0,0 +1,151 @@ +#include "ipc_handler.h" +#include "core.h" +#include "log.h" +#include +#include +#include +#include +#include +#include + +#define panicAndEnd(args...) \ +{ \ + fprintf(logFdErr, "%sPanic on %s:%d: ", logHeader, __FILE__, __LINE__); \ + fprintf(logFdErr, args); \ + fprintf(logFdErr, "\n"); \ + fflush(logFdErr); \ + *endOfThread = true; \ + return nullptr; \ +} + +IPCHandler::IPCHandler(THREADID tid): thread_id(tid) { + std::stringstream ss; + ss << SOCKET_PATH << "trace_in_uds_" << tid; + socketPath = ss.str(); + + serverFd = socket(AF_UNIX, SOCK_STREAM, 0); + if (serverFd < 0) { + panic("Failed to create Unix domain socket"); + } + + sockaddr_un serverAddr; + memset(&serverAddr, 0, sizeof(serverAddr)); + serverAddr.sun_family = AF_UNIX; + strncpy(serverAddr.sun_path, socketPath.c_str(), sizeof(serverAddr.sun_path) - 1); + + /* delete existing domain socket file */ + unlink(socketPath.c_str()); + + if (bind(serverFd, (sockaddr*)&serverAddr, sizeof(serverAddr)) < 0) { + panic("Bind failed"); + close(serverFd); + return; + } + + if (listen(serverFd, 1) < 0) { + panic("Listen failed"); + close(serverFd); + } +} + +IPCHandler::~IPCHandler() { + if (clientFd != -1) { + close(clientFd); + } + if (serverFd != -1) { + close(serverFd); + } + unlink(socketPath.c_str()); +} + +void IPCHandler::waitAccept() { + clientFd = accept(serverFd, nullptr, nullptr); + if (clientFd < 0) { + panic("Accept failed"); + } +} + +size_t IPCHandler::readExactBytes(int fd, void *buffer, size_t size) { + size_t recv = 0; + while (recv < size) { + auto result = read(fd, (char*)buffer + recv, size - recv); + if (result <= 0) return result; + recv += result; + } + return recv; +} + +void *IPCHandler::readData(bool *endOfThread, enum TraceDataType dataType, void *buffer) { + uint32_t expectSize; + if (readExactBytes(clientFd, &expectSize, sizeof(expectSize)) <= 0) { + panicAndEnd("Cannot receive data size"); + } + uint32_t actualDataType; + if (readExactBytes(clientFd, &actualDataType, sizeof(actualDataType)) <= 0) { + panicAndEnd("Cannot receive data type"); + } + if ((uint32_t) dataType != actualDataType) { + panicAndEnd("Cannot resolve packet"); + } + char *buf__ = nullptr; + if (!buffer) { + buffer = new char[expectSize]; + buf__ = (char *)buffer; + } + if (readExactBytes(clientFd, buffer, expectSize) <= 0) { + if (buf__) { + delete[] buf__; + } + panicAndEnd("Cannot resolve packet body"); + } + return buffer; +} + +void IPCHandler::acknowledgeTrace() { + int sent = write(clientFd, "ACK", 4); + assert(sent != -1); +} + +/* + * packet format: size(word) type(word) data + */ +struct FrontendTrace *IPCHandler::receiveTrace() { + bool needEnd = false; + auto frontendTrace = (struct FrontendTrace *)readData(&needEnd, TRACE_DATA_START_TRACE); + if (needEnd) { + return nullptr; + } + frontendTrace->blocks = new struct BasicBlock[frontendTrace->count]; + for (size_t i = 0; i < frontendTrace->count; i++) { + readData(&needEnd, TRACE_DATA_BASIC_BLOCK, (void *) &frontendTrace->blocks[i]); + if (needEnd) { + return nullptr; + } + /* receive basic block data */ + /* receive code */ + frontendTrace->blocks[i].code = reinterpret_cast(readData(&needEnd, TRACE_DATA_CODE)); + if (needEnd) { + return nullptr; + } + /* receive load and store */ + if (frontendTrace->blocks[i].loadStores) { + frontendTrace->blocks[i].loadStore = new struct BasicBlockLoadStore[frontendTrace->blocks[i].loadStores]; + for (size_t j = 0; j < frontendTrace->blocks[i].loadStores; j++) { + readData(&needEnd, TRACE_DATA_LOAD_STORE, &frontendTrace->blocks[i].loadStore[j]); + if (needEnd) { + return nullptr; + } + auto next = &frontendTrace->blocks[i].loadStore[j].next; + while (*next) { + *next = reinterpret_cast(readData(&needEnd, TRACE_DATA_LOAD_STORE)); + if (needEnd) { + return nullptr; + } + next = &((*next)->next); + } + } + } + } + acknowledgeTrace(); + return frontendTrace; +} diff --git a/src/ipc_handler.h b/src/ipc_handler.h new file mode 100644 index 0000000..dd406f1 --- /dev/null +++ b/src/ipc_handler.h @@ -0,0 +1,38 @@ +#ifndef IPC_HANDLER_H +#define IPC_HANDLER_H +#include "decoder.h" +#include +#include + +#define SOCKET_PATH "./" + +enum TraceDataType { + TRACE_DATA_START_TRACE = 0, + TRACE_DATA_BASIC_BLOCK = 1, + TRACE_DATA_CODE = 2, + TRACE_DATA_LOAD_STORE = 3 +}; + +class IPCHandler { +private: + int thread_id; + std::string socketPath = ""; + int serverFd = -1; + int clientFd = -1; + + size_t readExactBytes(int fd, void *buffer, size_t size); + + void *readData(bool *endOfThread, enum TraceDataType dataType, void *buffer = nullptr); + + void acknowledgeTrace(); +public: + IPCHandler(THREADID tid); + + ~IPCHandler(); + + void waitAccept(); + + struct FrontendTrace *receiveTrace(); +}; + +#endif diff --git a/src/network.h b/src/network.h index a9956cf..6acb008 100644 --- a/src/network.h +++ b/src/network.h @@ -32,6 +32,7 @@ * This is a basic model that should be extended as appropriate. */ +#include #include #include diff --git a/src/null_core.cpp b/src/null_core.cpp index 06d83c4..dd61d40 100644 --- a/src/null_core.cpp +++ b/src/null_core.cpp @@ -23,6 +23,7 @@ * this program. If not, see . */ +#include #include "null_core.h" #include "zsim.h" @@ -52,7 +53,7 @@ void NullCore::bbl(BblInfo* bblInfo) { void NullCore::contextSwitch(int32_t gid) {} void NullCore::join() { - curCycle = MAX(curCycle, zinfo->globPhaseCycles); + curCycle = std::max(curCycle, zinfo->globPhaseCycles); phaseEndCycle = zinfo->globPhaseCycles + zinfo->phaseLength; } diff --git a/src/null_core.h b/src/null_core.h index 83619e7..6097738 100644 --- a/src/null_core.h +++ b/src/null_core.h @@ -28,7 +28,7 @@ //A core model with IPC=1 and no hooks into the memory hierarchy. Useful to isolate threads that need to be run for simulation purposes. -#include "core.h" +#include "decoder.h" #include "pad.h" class NullCore : public Core { diff --git a/src/ooo_core.cpp b/src/ooo_core.cpp index 4a2305e..b56285d 100644 --- a/src/ooo_core.cpp +++ b/src/ooo_core.cpp @@ -266,6 +266,7 @@ inline void OOOCore::bbl(Address bblAddr, BblInfo* bblInfo) { dispatchCycle = MAX(lastStoreAddrCommitCycle+1, dispatchCycle); Address addr = loadAddrs[loadIdx++]; + /* TODO: check if uops match the load stores */ uint64_t reqSatisfiedCycle = dispatchCycle; if (addr != ((Address)-1L)) { reqSatisfiedCycle = l1d->load(addr, dispatchCycle) + L1D_LAT; diff --git a/src/ooo_core.h b/src/ooo_core.h index 8bada6e..3109a39 100644 --- a/src/ooo_core.h +++ b/src/ooo_core.h @@ -29,7 +29,7 @@ #include #include #include -#include "core.h" +#include "decoder.h" #include "g_std/g_multimap.h" #include "memory_hierarchy.h" #include "ooo_core_recorder.h" diff --git a/src/pin_cmd.cpp b/src/pin_cmd.cpp deleted file mode 100644 index 985da5f..0000000 --- a/src/pin_cmd.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include "pin_cmd.h" -#include -#include -#include -#include -#include "config.h" - -//Funky macro expansion stuff -#define QUOTED_(x) #x -#define QUOTED(x) QUOTED_(x) - -PinCmd::PinCmd(Config* conf, const char* configFile, const char* outputDir, uint64_t shmid) { - //Figure the program paths - const char* zsimEnvPath = getenv("ZSIM_PATH"); - g_string pinPath, zsimPath; - if (zsimEnvPath) { - info("Using env path %s", zsimEnvPath); - pinPath = zsimEnvPath; - pinPath += "/pinbin"; - zsimPath = zsimEnvPath; - zsimPath += "/libzsim.so"; - } else { - pinPath = QUOTED(PIN_PATH); - zsimPath = QUOTED(ZSIM_PATH); - } - - args.push_back(pinPath); - - //Global pin options - args.push_back("-follow_execv"); //instrument child processes - // The following option is removed starting from Pin 3.0 - //args.push_back("-tool_exit_timeout"); //don't wait much of internal threads - //args.push_back("1"); - - //Additional options (e.g., -smc_strict for Java), parsed from config - const char* pinOptions = conf->get("sim.pinOptions", ""); - std::vector tokens; - Tokenize(pinOptions, tokens, " "); - for (auto t : tokens) if (t != "") args.push_back(g_string(t.c_str())); - - //Load tool - args.push_back("-t"); - args.push_back(zsimPath); - - //Tool options - if (configFile) { - //Check configFile is an absolute path - //NOTE: We check rather than canonicalizing it ourselves because by the time we're created, we might be in another directory - char* absPath = realpath(configFile, nullptr); - if (std::string(configFile) != std::string(absPath)) { - panic("Internal zsim bug, configFile should be absolute"); - } - free(absPath); - - args.push_back("-config"); - args.push_back(configFile); - } - - args.push_back("-outputDir"); - args.push_back(outputDir); - - std::stringstream shmid_ss; - shmid_ss << shmid; - - args.push_back("-shmid"); - args.push_back(shmid_ss.str().c_str()); - - if (conf->get("sim.logToFile", false)) { - args.push_back("-logToFile"); - } - - //Read the per-process params of the processes run directly by the harness - while (true) { - std::stringstream p_ss; - p_ss << "process" << procInfo.size(); - - if (!conf->exists(p_ss.str().c_str())) break; - - const char* cmd = conf->get(p_ss.str() + ".command"); - const char* input = conf->get(p_ss.str() + ".input", ""); - const char* loader = conf->get(p_ss.str() + ".loader", ""); - const char* env = conf->get(p_ss.str() + ".env", ""); - - ProcCmdInfo pi = {g_string(cmd), g_string(input), g_string(loader), g_string(env)}; - procInfo.push_back(pi); - } - - // Set env vars required before invoking pintool. - // See launcher_u.c and os_specific_l.c - // These env vars are generally required; the others are set per process. -#ifdef PIN_CRT_TZDATA - assert(setenv("PIN_CRT_TZDATA", QUOTED(PIN_CRT_TZDATA), 1) == 0); -#endif - assert(setenv("PIN_VM64_LD_LIBRARY_PATH", QUOTED(LDLIB_PATH), 1) == 0); - assert(setenv("PIN_INJECTOR64_LD_LIBRARY_PATH", QUOTED(LDLIB_PATH), 1) == 0); - assert(setenv("PIN_LD_RESTORE_REQUIRED", "t", 1) == 0); -} - -g_vector PinCmd::getPinCmdArgs(uint32_t procIdx) { - g_vector res = args; - - std::stringstream procIdx_ss; - procIdx_ss << procIdx; - res.push_back("-procIdx"); - res.push_back(procIdx_ss.str().c_str()); - res.push_back("--"); - return res; -} - -g_vector PinCmd::getFullCmdArgs(uint32_t procIdx, const char** inputFile, wordExpFunc f) { - assert(procIdx < procInfo.size()); //must be one of the topmost processes - g_vector res = getPinCmdArgs(procIdx); - - g_string cmd = procInfo[procIdx].cmd; - - /* Loader injection: Turns out that Pin mingles with the simulated binary, which decides the loader used, - * even when PIN_VM_LIBRARY_PATH is used. This kill the invariance on libzsim.so's loaded address, because - * loaders in different children have different sizes. So, if specified, we prefix the program with the - * given loader. This is optional because it won't work with statically linked binaries. - * - * BTW, thinking of running pin under a specific loaderto fix this instead? Nope, it gets into an infinite loop. - */ - if (procInfo[procIdx].loader != "") { - cmd = procInfo[procIdx].loader + " " + cmd; - info("Injected loader on process%d, command line: %s", procIdx, cmd.c_str()); - warn("Loader injection makes Pin unaware of symbol routines, so things like routine patching" - "will not work! You can homogeneize the loaders instead by editing the .interp ELF section"); - } - - //Parse command - for (auto s : f(cmd.c_str())) res.push_back(s); - - //Input redirect - *inputFile = (procInfo[procIdx].input == "")? nullptr : procInfo[procIdx].input.c_str(); - return res; -} - -void PinCmd::setEnvVars(uint32_t procIdx, wordExpFunc f) { - assert(procIdx < procInfo.size()); //must be one of the topmost processes - if (procInfo[procIdx].env != "") { - for (auto s : f(procInfo[procIdx].env.c_str())) { - char* var = strdup(s.c_str()); //putenv() does not make copies, and takes non-const char* in - if (putenv(var) != 0) { - panic("putenv(%s) failed", var); - } - } - } - - // Backup env vars required by the app but not by pintool. - const char* libraryPath = getenv("LD_LIBRARY_PATH"); - if (libraryPath) { - assert(setenv("PIN_APP_LD_LIBRARY_PATH", libraryPath, 1) == 0); - } - assert(setenv("LD_LIBRARY_PATH", QUOTED(LDLIB_PATH), 1) == 0); - const char* assumeKernel = getenv("LD_ASSUME_KERNEL"); - if (assumeKernel) { - assert(setenv("PIN_APP_LD_ASSUME_KERNEL", assumeKernel, 1) == 0); - assert(unsetenv("LD_ASSUME_KERNEL") == 0); - } - const char* bindNow = getenv("LD_BIND_NOW"); - if (bindNow) { - assert(setenv("PIN_APP_LD_BIND_NOW", bindNow, 1) == 0); - assert(unsetenv("LD_BIND_NOW") == 0); - } - const char* preload = getenv("LD_PRELOAD"); - if (preload) { - assert(setenv("PIN_APP_LD_PRELOAD", preload, 1) == 0); - assert(unsetenv("LD_PRELOAD") == 0); - } -} - diff --git a/src/pin_cmd.h b/src/pin_cmd.h deleted file mode 100644 index b213ddc..0000000 --- a/src/pin_cmd.h +++ /dev/null @@ -1,64 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef PIN_CMD_H_ -#define PIN_CMD_H_ - -/* Interface to get pin command line */ - -#include -#include "g_std/g_string.h" -#include "g_std/g_vector.h" -#include "galloc.h" - -class Config; - -class PinCmd : public GlobAlloc { - private: - g_vector args; - - struct ProcCmdInfo { - g_string cmd; - g_string input; - g_string loader; - g_string env; - }; - - g_vector procInfo; //one entry for each process that the harness launches (not for child procs) - - public: - // Callback type for shell expansion. - typedef g_vector (*wordExpFunc)(const char*); - - public: - PinCmd(Config* conf, const char* configFile, const char* outputDir, uint64_t shmid); - g_vector getPinCmdArgs(uint32_t procIdx); - g_vector getFullCmdArgs(uint32_t procIdx, const char** inputFile, wordExpFunc f); - void setEnvVars(uint32_t procIdx, wordExpFunc f); - - uint32_t getNumCmdProcs() {return procInfo.size();} -}; - -#endif // PIN_CMD_H_ diff --git a/src/scheduler.cpp b/src/scheduler.cpp index f2e7d16..47ae840 100644 --- a/src/scheduler.cpp +++ b/src/scheduler.cpp @@ -24,15 +24,14 @@ * this program. If not, see . */ +#include #include "scheduler.h" #include #include // POSIX regex instead of C++11 regex #include "config.h" // for ParseList -#include "pin.H" #include "process_tree.h" #include "profile_stats.h" #include "str.h" -#include "virt/syscall_name.h" //The scheduler class started simple, but at some point having it all in the header is too ridiculous. Migrate non perf-intensive calls here! (all but sync, really) @@ -123,11 +122,12 @@ void Scheduler::watchdogThreadFunc() { if (lastPhase == curPhase && !fakeLeaves.empty() && (fakeLeaves.front()->th->futexJoin.action != FJA_WAKE)) { if (++fakeLeaveStalls >= WATCHDOG_STALL_THRESHOLD) { + assert(false); info("Detected possible stall due to fake leaves (%ld current)", fakeLeaves.size()); // Uncomment to print all leaves FakeLeaveInfo* pfl = fakeLeaves.front(); while (pfl) { - info(" [%d/%d] %s (%d) @ 0x%lx", getPid(pfl->th->gid), getTid(pfl->th->gid), GetSyscallName(pfl->syscallNumber), pfl->syscallNumber, pfl->pc); + info(" [%d/%d] %s (%d) @ 0x%lx", getPid(pfl->th->gid), getTid(pfl->th->gid), "GetSyscallName(pfl->syscallNumber)", pfl->syscallNumber, pfl->pc); pfl = pfl->next; } @@ -142,13 +142,13 @@ void Scheduler::watchdogThreadFunc() { regex_t sbRegex; if (regcomp(&sbRegex, sbRegexStr.c_str(), REG_EXTENDED | REG_NOSUB)) panic("Scheduler fails to compile syscall blacklist regex (%s)", sbRegexStr.c_str()); - if (regexec(&sbRegex, GetSyscallName(fl->syscallNumber), 0, nullptr, 0) == 0) { + if (regexec(&sbRegex, "GetSyscallName(fl->syscallNumber)", 0, nullptr, 0) == 0) { // If this is the last leave we catch, it is the culprit for sure -> blacklist it // Over time, this will blacklist every blocking syscall // The root reason for being conservative though is that we don't have a sure-fire // way to distinguish IO waits from truly blocking syscalls (TODO) if (fakeLeaves.size() == 1) { - info("Blacklisting from future fake leaves: [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx", pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1); + info("Blacklisting from future fake leaves: [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx", pid, "GetSyscallName(fl->syscallNumber)", fl->pc, fl->arg0, fl->arg1); blockingSyscalls[pid].insert(fl->pc); } @@ -178,7 +178,7 @@ void Scheduler::watchdogThreadFunc() { } while (fakeLeaves.size() > 8); } else { info("Skipping, [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx does not match blacklist regex (%s)", - pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1, sbRegexStr.c_str()); + pid, "GetSyscallName(fl->syscallNumber)", fl->pc, fl->arg0, fl->arg1, sbRegexStr.c_str()); } fakeLeaveStalls = 0; } @@ -267,13 +267,9 @@ void Scheduler::watchdogThreadFunc() { info("Finished scheduler watchdog thread"); } -void Scheduler::threadTrampoline(void* arg) { - Scheduler* sched = static_cast(arg); - sched->watchdogThreadFunc(); -} - -void Scheduler::startWatchdogThread() { - PIN_SpawnInternalThread(threadTrampoline, this, 64*1024, nullptr); +void Scheduler::threadTrampoline(Scheduler* arg) { + while (arg->schedInitialized.load()); + arg->watchdogThreadFunc(); } @@ -315,7 +311,7 @@ void Scheduler::notifyFutexWakeStart(uint32_t pid, uint32_t tid, uint32_t maxWak // Programs sometimes call FUTEX_WAIT with maxWakes = UINT_MAX to wake // everyone waiting on it; we cap to a reasonably high number to avoid // overflows on maxAllowedFutexWakeups - maxWakes = MIN(maxWakes, 1<<24 /*16M wakes*/); + maxWakes = std::min((unsigned long)maxWakes, 1UL<<24 /*16M wakes*/); maxAllowedFutexWakeups += maxWakes; th->futexJoin.maxWakes = maxWakes; diff --git a/src/scheduler.h b/src/scheduler.h index 728c018..4610f61 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include "barrier.h" #include "constants.h" #include "core.h" @@ -171,9 +173,13 @@ class Scheduler : public GlobAlloc, public Callee { inline uint32_t getPid(uint32_t gid) const {return gid >> 16;} inline uint32_t getTid(uint32_t gid) const {return gid & 0x0FFFF;} + std::atomic schedInitialized; + std::thread watchDogThread; + public: Scheduler(void (*_atSyncFunc)(void), uint32_t _parallelThreads, uint32_t _numCores, uint32_t _schedQuantum) : - atSyncFunc(_atSyncFunc), bar(_parallelThreads, this), numCores(_numCores), schedQuantum(_schedQuantum), rnd(0x5C73D9134) + atSyncFunc(_atSyncFunc), bar(_parallelThreads, this), numCores(_numCores), schedQuantum(_schedQuantum), rnd(0x5C73D9134), + schedInitialized(true), watchDogThread(threadTrampoline, this) { contexts.resize(numCores); for (uint32_t i = 0; i < numCores; i++) { @@ -194,7 +200,8 @@ class Scheduler : public GlobAlloc, public Callee { info("Started RR scheduler, quantum=%d phases", schedQuantum); terminateWatchdogThread = false; - startWatchdogThread(); + + schedInitialized.store(false); } ~Scheduler() {} @@ -828,10 +835,9 @@ class Scheduler : public GlobAlloc, public Callee { * Instead, we have an auxiliary thread check for this condition periodically, and if all threads are sleeping or blocked, we just drive time * forward. */ - void startWatchdogThread(); void watchdogThreadFunc(); - static void threadTrampoline(void* arg); + static void threadTrampoline(Scheduler* arg); /* Accurate and adaptive join-leave * diff --git a/src/simple_core.h b/src/simple_core.h index 8a3a143..e8b7fe2 100644 --- a/src/simple_core.h +++ b/src/simple_core.h @@ -28,7 +28,7 @@ //A simple core model with IPC=1 except on memory accesses -#include "core.h" +#include "decoder.h" #include "memory_hierarchy.h" #include "pad.h" diff --git a/src/timing_core.h b/src/timing_core.h index 956907c..46c643c 100644 --- a/src/timing_core.h +++ b/src/timing_core.h @@ -26,7 +26,7 @@ #ifndef TIMING_CORE_H_ #define TIMING_CORE_H_ -#include "core.h" +#include "decoder.h" #include "core_recorder.h" #include "event_recorder.h" #include "memory_hierarchy.h" diff --git a/src/virt/common.h b/src/virt/common.h deleted file mode 100644 index 74c1af1..0000000 --- a/src/virt/common.h +++ /dev/null @@ -1,132 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef VIRT_COMMON_H_ -#define VIRT_COMMON_H_ - -// Typedefs and common functions for Virt implementation -// This is internal to virt, and should only be included withing virt/ files - -#include "galloc.h" -#include "log.h" -#include "pin.H" -#include "virt/virt.h" - -struct PrePatchArgs { - uint32_t tid; - CONTEXT* ctxt; - SYSCALL_STANDARD std; - const char* patchRoot; - bool isNopThread; -}; - -struct PostPatchArgs { - uint32_t tid; - CONTEXT* ctxt; - SYSCALL_STANDARD std; -}; - -// Define our own wrapper for lambda closures instead of std::function. -class PostPatchFn { -private: - class LambdaWrapperBase : public GlobAlloc { - public: - virtual ~LambdaWrapperBase() = default; - virtual PostPatchAction invoke(PostPatchArgs&) = 0; - virtual LambdaWrapperBase* clone() const = 0; - }; - - template - class LambdaWrapper : public LambdaWrapperBase { - private: - Lambda l; - public: - LambdaWrapper(const Lambda& _l) : l(_l) {} - ~LambdaWrapper() override = default; - PostPatchAction invoke(PostPatchArgs& args) { return l(args); } - LambdaWrapperBase* clone() const { return new LambdaWrapper(l); } - }; - -private: - LambdaWrapperBase* w; - -public: - template - PostPatchFn(Lambda l) { - w = new LambdaWrapper(l); - } - - PostPatchFn() { - w = nullptr; - } - - PostPatchFn(const PostPatchFn& other) { - w = other.w ? other.w->clone() : nullptr; - } - - PostPatchFn& operator=(const PostPatchFn& other) { - w = other.w ? other.w->clone() : nullptr; - return *this; - } - - PostPatchFn(PostPatchFn&& other) { - delete w; - w = other.w; - other.w = nullptr; - } - - PostPatchFn& operator=(PostPatchFn&& other) { - delete w; - w = other.w; - other.w = nullptr; - return *this; - } - - ~PostPatchFn() { - delete w; - w = nullptr; - } - - PostPatchAction operator()(PostPatchArgs args) { - return w ? w->invoke(args) : PPA_NOTHING; - } -}; - -typedef PostPatchFn (*PrePatchFn)(PrePatchArgs); - -extern const PostPatchFn NullPostPatch; // defined in virt.cpp - -// PIN_SafeCopy wrapper. We expect the default thing to be correct access -template -static inline bool safeCopy(const T* src, T* dst, const char* file = __FILE__, int line = __LINE__) { - size_t copiedBytes = PIN_SafeCopy(dst, src, sizeof(T)); - if (copiedBytes != sizeof(T)) { - warn("[%d] %s:%d Failed app<->tool copy (%ld/%ld bytes copied)", PIN_ThreadId(), file, line, copiedBytes, sizeof(T)); - return false; - } - return true; -} - -#endif // VIRT_COMMON_H_ diff --git a/src/virt/control.cpp b/src/virt/control.cpp deleted file mode 100644 index ef02d70..0000000 --- a/src/virt/control.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include "log.h" -#include "process_tree.h" -#include "scheduler.h" -#include "virt/common.h" -#include "zsim.h" - -static void sleepUntilPhase(uint32_t tid, uint64_t wakeupPhase, CONTEXT* ctxt, SYSCALL_STANDARD std) { - auto futexWord = zinfo->sched->markForSleep(procIdx, tid, wakeupPhase); - // Turn this into a non-timed FUTEX_WAIT syscall - PIN_SetSyscallNumber(ctxt, std, SYS_futex); - PIN_SetSyscallArgument(ctxt, std, 0, (ADDRINT)futexWord); - PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT)FUTEX_WAIT); - PIN_SetSyscallArgument(ctxt, std, 2, (ADDRINT)1 /*by convention, see sched code*/); - PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)nullptr); -} - -PostPatchFn PatchExitGroup(PrePatchArgs args) { - if (args.isNopThread || zinfo->procArray[procIdx]->isInFastForward()) { - // Already in FF, i.e., left the barrier. No need to patch. - return NullPostPatch; - } - /* We need to play a trick here. If we directly do exit_group, other - * threads may be killed without calling leave() first, resulting in - * deadlock at the phase barrier. Our solution is to mark the process as in - * a group-exit state, which every thread will check at the next end of - * phase, and call leave() at the beginning of next phase. The caller waits - * until then and re-executes exit_group, to finish the whole process. - */ - // Mark the process as in group-exit. - info("PatchExitGroup: thread %u in process %u calls exit_group", args.tid, procIdx); - zinfo->procArray[procIdx]->exitGroup(); - - // Save args. - ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - ADDRINT arg1 = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - ADDRINT arg2 = PIN_GetSyscallArgument(args.ctxt, args.std, 2); - ADDRINT arg3 = PIN_GetSyscallArgument(args.ctxt, args.std, 3); - // Save current PC for retry. - ADDRINT prevIp = PIN_GetContextReg(args.ctxt, REG_INST_PTR); - - // Sleep for 2 phases until other threads leave in the next phase. - uint64_t wakeupPhase = zinfo->numPhases + 2; - sleepUntilPhase(args.tid, wakeupPhase, args.ctxt, args.std); - - return [wakeupPhase, prevIp, arg0, arg1, arg2, arg3](PostPatchArgs args) { - if (wakeupPhase > zinfo->numPhases) { - warn("PatchExitGroup: thread was waken up too early (current %lu < expected %lu); retry", zinfo->numPhases, wakeupPhase); - sleepUntilPhase(args.tid, wakeupPhase, args.ctxt, args.std); - } else { - // Re-execute exit_group - PIN_SetSyscallNumber(args.ctxt, args.std, SYS_exit_group); - // Restore pre-call args - PIN_SetSyscallArgument(args.ctxt, args.std, 0, arg0); - PIN_SetSyscallArgument(args.ctxt, args.std, 1, arg1); - PIN_SetSyscallArgument(args.ctxt, args.std, 2, arg2); - PIN_SetSyscallArgument(args.ctxt, args.std, 3, arg3); - } - PIN_SetContextReg(args.ctxt, REG_INST_PTR, prevIp); - return PPA_USE_RETRY_PTRS; - // A successful exit_group will never return, which ends retry. - }; -} - diff --git a/src/virt/cpu.cpp b/src/virt/cpu.cpp deleted file mode 100644 index 45b430b..0000000 --- a/src/virt/cpu.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/** $glic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * Copyright (C) 2011 Google Inc. - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include "bithacks.h" -#include "cpuenum.h" -#include "log.h" -#include "virt/common.h" -#include "scheduler.h" - -// SYS_getcpu - -// Call without CPU from vdso, with CPU from syscall version -void VirtGetcpu(uint32_t tid, uint32_t cpu, ADDRINT arg0, ADDRINT arg1) { - unsigned resCpu; - unsigned resNode = 0; - if (!arg0) { - info("getcpu() called with null cpu arg"); - } - if (!safeCopy((unsigned*)arg0, &resCpu)) { - info("getcpu() called with invalid cpu arg"); - return; - } - if (arg1 && !safeCopy((unsigned*)arg1, &resNode)) { - info("getcpu() called with invalid node arg"); - return; - } - - trace(TimeVirt, "Patching getcpu()"); - trace(TimeVirt, "Orig cpu %d, node %d, patching core %d / node 0", resCpu, resNode, cpu); - resCpu = cpu; - resNode = 0; - - safeCopy(&resCpu, (unsigned*)arg0); - if (arg1) safeCopy(&resNode, (unsigned*)arg1); -} - -PostPatchFn PatchGetcpu(PrePatchArgs args) { - uint32_t cpu = cpuenumCpu(procIdx, getCid(args.tid)); // still valid, may become invalid when we leave() - assert(cpu != (uint32_t)-1); - return [cpu](PostPatchArgs args) { - trace(TimeVirt, "[%d] Post-patching SYS_getcpu", tid); - ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - ADDRINT arg1 = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - VirtGetcpu(args.tid, cpu, arg0, arg1); - return PPA_NOTHING; - }; -} - -// Scheduler affinity - -PostPatchFn PatchSchedGetaffinity(PrePatchArgs args) { - return [](PostPatchArgs args) { - int err = -PIN_GetSyscallNumber(args.ctxt, args.std); - if (err == EINVAL || err == EFAULT) { - // SYS_sched_getaffinity may return EINVAL if the given cpusetsize is too small. - // If error, directly return to the user. - return PPA_NOTHING; - } - // On success, the syscall returns the size of cpumask_t in bytes. - uint32_t minSize = -err; - // Get the required size from the simulated number of cores. - uint32_t reqSize = CPU_ALLOC_SIZE(cpuenumNumCpus(procIdx)); - // Get the allocated size of the argument. - uint32_t size = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - if (reqSize > minSize) { - // Extend cpumask_t size. - minSize = reqSize; - warn("[%u/%u] Increase cpumask_t size to %d to support %u cores. This may break some applications. " - "Try patch root or disable this change.", procIdx, args.tid, minSize, cpuenumNumCpus(procIdx)); - PIN_SetSyscallNumber(args.ctxt, args.std, minSize); - } - if (size < minSize) { - // CPU set size is not large enough. Return EINVAL. - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EINVAL); - return PPA_NOTHING; - } - - uint32_t linuxTid = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - uint32_t tid = (linuxTid == 0 ? args.tid : zinfo->sched->getTidFromLinuxTid(linuxTid)); - std::vector cpumask(cpuenumNumCpus(procIdx), true); // all core eligible - if (tid == (uint32_t)-1) { - warn("SYS_sched_getaffinity cannot find thread with OS id %u (maybe in FF?), default to be all core eligible", linuxTid); - return PPA_NOTHING; - } else { - cpumask = cpuenumMask(procIdx, tid); - } - cpu_set_t* set = (cpu_set_t*)PIN_GetSyscallArgument(args.ctxt, args.std, 2); - if (set) { //TODO: use SafeCopy, this can still segfault - CPU_ZERO_S(size, set); - for (uint32_t i = 0; i < MIN(cpumask.size(), size*8 /*size is in bytes, supports 1 cpu/bit*/); i++) { - if (cpumask[i]) CPU_SET_S(i, (size_t)size, set); - } - } - info("[%d] Post-patching SYS_sched_getaffinity size %d cpuset %p", tid, size, set); - return PPA_NOTHING; - }; -} - -PostPatchFn PatchSchedSetaffinity(PrePatchArgs args) { - uint32_t linuxTid = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - uint32_t tid = (linuxTid == 0 ? args.tid : zinfo->sched->getTidFromLinuxTid(linuxTid)); - if (tid == (uint32_t)-1) { - warn("SYS_sched_setaffinity cannot find thread with OS id %u (maybe in FF?), ignored!", linuxTid); - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT) SYS_getpid); // squash - return [](PostPatchArgs args) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EPERM); - return PPA_NOTHING; - }; - } - uint32_t size = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - if (size*8 < cpuenumNumCpus(procIdx)) { - // CPU set size is not large enough. Return EINVAL. - return [](PostPatchArgs args) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EINVAL); - return PPA_NOTHING; - }; - } - cpu_set_t* set = (cpu_set_t*)PIN_GetSyscallArgument(args.ctxt, args.std, 2); - info("[%d] Pre-patching SYS_sched_setaffinity size %d cpuset %p", tid, size, set); - if (set) { - std::vector cpumask(cpuenumNumCpus(procIdx)); - for (uint32_t i = 0; i < MIN(cpumask.size(), size*8 /*size is in bytes, supports 1 cpu/bit*/); i++) { - cpumask[i] = CPU_ISSET_S(i, (size_t)size, set); - } - cpuenumUpdateMask(procIdx, tid, cpumask); - } - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT) SYS_getpid); // squash - return [](PostPatchArgs args) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)0); // return 0 on success - // SyscallEnter() in zsim.cpp makes sure that (fake) leaving syscalls will use join ptrs. - return PPA_NOTHING; - }; -} - diff --git a/src/virt/fs.cpp b/src/virt/fs.cpp deleted file mode 100644 index b6cf9b1..0000000 --- a/src/virt/fs.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/** $glic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * Copyright (C) 2011 Google Inc. - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include -#include -#include -#include -#include -#include "config.h" // for Tokenize -#include "process_tree.h" -#include "str.h" -#include "virt/common.h" - - -using std::string; -using std::vector; - -/* Helper functions to perform robust, incremental name resolution - * See http://man7.org/linux/man-pages/man7/path_resolution.7.html - * Tested against several corner cases - */ - -static string getcwd() { - char buf[PATH_MAX+1]; - char* res = getcwd(buf, PATH_MAX); - assert(res); - return string(res); -} - -static string abspath(const string& path, const string& basepath) { - if (path.length() == 0) return path; - if (path[0] == '/') return path; - return basepath + "/" + path; -} - -static string dirnamepath(const string& path) { - char* buf = strdup(path.c_str()); - string res = dirname(buf); - free(buf); - return res; -} - -// Resolves at most one symlink, returns an absolute path -// Works fine if file does not exist --- it will return the same path -string resolvepath(const string& path) { - string ap = abspath(path, getcwd()); - if (ap.length() == 0) return ap; - - vector comps; - Tokenize(ap, comps, "/"); - - // Remove empty comps - for (int32_t i = comps.size() - 1; i >= 0; i--) { - if (comps[i].length() == 0) comps.erase(comps.begin() + i); - } - if (comps.size() == 0) return "/"; - - std::string cur = "/"; - for (uint32_t i = 0; i < comps.size(); i++) { - if (comps[i] == "..") { - cur = dirnamepath(cur); // reaching / is safe, (/.. returns /) - if ((i+1) < comps.size()) cur += "/"; - continue; - } - string p = cur + comps[i]; - - char buf[PATH_MAX+1]; - int res = readlink(p.c_str(), buf, PATH_MAX); - if (res < 0) { - // not a symlink, keep going - cur = p; - if ((i+1) < comps.size()) cur += "/"; - } else { - // NULL-terminate the string (readlink doesn't) - assert(res <= PATH_MAX); - buf[res] = '\0'; - - // Reconstruct rest of the path - string link = buf; - string newpath = abspath(link, cur); - for (uint32_t j = i+1; j < comps.size(); j++) { - newpath += "/" + comps[j]; - } - cur = newpath; - break; - } - } - return cur; -} - -/* Path generation from patchRoot */ - -vector listdir(string dir) { - vector files; - - DIR* d = opendir(dir.c_str()); - if (!d) panic("Invalid dir %s", dir.c_str()); - - struct dirent* de; - while ((de = readdir(d)) != nullptr) { - string s = de->d_name; - if (s == ".") continue; - if (s == "..") continue; - files.push_back(s); - } - - closedir(d); - return files; -} - -vector* getFakedPaths(const char* patchRoot) { - vector rootFiles = listdir(patchRoot); - auto pi = std::find(rootFiles.begin(), rootFiles.end(), "proc"); - - // HACK: We soft-patch on /proc (only patch files that exist) - if (pi != rootFiles.end()) { - rootFiles.erase(pi); - vector procFiles = listdir(patchRoot + string("/proc")); - for (auto pf : procFiles) { - rootFiles.push_back("proc/" + pf); - } - } - - vector* res = new vector(); - for (auto f : rootFiles) { - res->push_back("/" + f); - } - info("PatchRoot %s, faking paths %s", patchRoot, Str(*res).c_str()); - return res; -} - -static const vector* fakedPaths = nullptr; //{"/proc/cputinfo", "/proc/stat", "/sys", "/lib", "/usr"}; -static uint32_t numInfos = 0; -static const uint32_t MAX_INFOS = 100; - -// SYS_open and SYS_openat; these are ALWAYS patched -PostPatchFn PatchOpen(PrePatchArgs args) { - CONTEXT* ctxt = args.ctxt; - SYSCALL_STANDARD std = args.std; - const char* patchRoot = args.patchRoot; - - uint32_t syscall = PIN_GetSyscallNumber(ctxt, std); - assert(syscall == SYS_open || syscall == SYS_openat); - - if (!patchRoot) return NullPostPatch; // process does not want patched system... - - string fileName; - int pathReg = (syscall == SYS_open)? 0 : 1; - ADDRINT pathArg = PIN_GetSyscallArgument(ctxt, std, pathReg); - if (pathArg) fileName = (const char*) pathArg; // TODO(dsm): SafeCopy - if (syscall == SYS_openat) { - // Get path relative to dirfd's path; if AT_CWDFD, readlink() should fail - int dirfd = PIN_GetSyscallArgument(ctxt, std, 0); - char buf[PATH_MAX+1]; - string fd = "/proc/self/fd/" + Str(dirfd); - int res = readlink(fd.c_str(), buf, PATH_MAX); - if (res > 0) { - buf[res] = '\0'; // argh... readlink does not null-terminate strings! - // Double-check deref'd symlink is valid - char* rp = realpath(buf, nullptr); - if (rp) { - fileName = string(buf) + "/" + fileName; - free(rp); - } else { - panic("Not a valid path, but readlink() succeeded! %s fd %d res %d", buf, dirfd, res); - } - } - } - - // Try to match the path with out path matches, and resolve symlinks in path one at a time. - // This ensures we always catch any symlink that gets us to one of the paths we intercept. - vector bases; - string curPath = abspath(fileName, getcwd()); - uint32_t numSymlinks = 0; - - while (numSymlinks < 1024 /*avoid symlink loops*/) { - bool match = false; - if (!fakedPaths) fakedPaths = getFakedPaths(patchRoot); - for (uint32_t i = 0; i < fakedPaths->size(); i++) { - uint32_t diff = strncmp(curPath.c_str(), fakedPaths->at(i).c_str(), fakedPaths->at(i).length()); - if (!diff) { - match = true; - break; - } - } - - if (match) { - std::string patchPath = patchRoot; - patchPath += curPath; - - bool patch = true; - //Try to open the patched file to see if it exists - //NOTE: We now rely on always patching; uncomment to do selectively, but this leaks info - //FILE * patchedFd = fopen(patchPath.c_str(), "r"); - //if (patchedFd) fclose(patchedFd); else patch = false; - if (patch) { - char* patchPathMem = strdup(patchPath.c_str()); // in heap - if (numInfos <= MAX_INFOS) { - info("Patched SYS_open, original %s, patched %s", fileName.c_str(), patchPathMem); - if (numInfos == MAX_INFOS) { - info("(Omitting future SYS_open path messages...)"); - } - numInfos++; - } - PIN_SetSyscallArgument(ctxt, std, pathReg, (ADDRINT) patchPathMem); - - // Restore old path on syscall exit - return [pathReg, pathArg, patchPathMem](PostPatchArgs args) { - PIN_SetSyscallArgument(args.ctxt, args.std, pathReg, pathArg); - free(patchPathMem); - return PPA_NOTHING; - }; - } else { - info("Patched SYS_open to match %s, left unpatched (no patch)", fileName.c_str()); - return NullPostPatch; - } - } else { - string newPath = resolvepath(curPath); - if (newPath == curPath) { - break; // we've already resolved all the symlinks - } else { - numSymlinks++; - curPath = newPath; - } - } - } - // info("Leaving SYS_open unpatched, %s", fileName.c_str()); - return NullPostPatch; -} - diff --git a/src/virt/numa.cpp b/src/virt/numa.cpp deleted file mode 100644 index 00e10ea..0000000 --- a/src/virt/numa.cpp +++ /dev/null @@ -1,460 +0,0 @@ -#include -#include "cpuenum.h" -#include "log.h" -#include "memory_hierarchy.h" -#include "numa_map.h" -#include "virt/common.h" -#include "zsim.h" - -#define BITS_PER_ULONG (sizeof(unsigned long) * 8) -#define ULONGS_FOR_BIT(n) (((n)+BITS_PER_ULONG-1)/BITS_PER_ULONG) - -/* Help functions. */ - -// Manipulate nodemask. Return error code. -// nodemask -> vector. -static inline int nodemask2vector(unsigned long* nodemask, unsigned long maxnode, g_vector& vec) { - // Initialize to empty. - uint32_t sysMaxNode = zinfo->numaMap->getMaxNode(); - vec.assign(sysMaxNode + 1, false); - - // The number of nodes in nodemask. - // It is a little confusing what maxnode means. If nodemask is not nullptr, maxnode is included; - // but if nodemask is nullptr, 0 maxnode means no nodes are specified at all. - if (nodemask == nullptr) return 0; - // Only look at nodes up to max node. - size_t num = MIN(sysMaxNode, maxnode) + 1; - - for (size_t iw = 0; iw < ULONGS_FOR_BIT(num); iw++) { - unsigned long m; - if (!safeCopy(nodemask + iw, &m)) return EFAULT; - for (size_t ib = 0; ib < BITS_PER_ULONG; ib++) { - size_t idx = ib + iw * BITS_PER_ULONG; - if (idx >= num) break; - vec[idx] = m & (1uL << ib); - } - } - return 0; -} - -// vector -> nodemask. -static inline int vector2nodemask(const g_vector& vec, unsigned long* nodemask, unsigned long maxnode) { - // Check input vector. - if (vec.empty()) return 0; - uint32_t sysMaxNode = zinfo->numaMap->getMaxNode(); - assert(vec.size() == sysMaxNode + 1); - - // The number of nodes in nodemask. - if (maxnode < sysMaxNode) return EINVAL; - size_t num = sysMaxNode + 1; - - for (size_t iw = 0; iw < ULONGS_FOR_BIT(num); iw++) { - unsigned long m = 0; - for (size_t ib = 0; ib < BITS_PER_ULONG; ib++) { - size_t idx = ib + iw * BITS_PER_ULONG; - if (idx >= num) break; - if (vec[idx]) m |= (1uL << ib); - } - if (!safeCopy(&m, nodemask + iw)) return EFAULT; - } - return 0; -} - -// Empty vector (all false). -static inline bool isEmptyVector(const g_vector& vec) { - for (const auto& b : vec) if (b) return false; - return true; -} - - -// Validate policy and nodemask (as vector). -static inline int validate(int mode, const g_vector& vec, const char* name) { -#ifdef MPOL_F_STATIC_NODES -#ifdef MPOL_F_RELATIVE_NODES - if ((mode & MPOL_F_STATIC_NODES) || (mode & MPOL_F_RELATIVE_NODES)) { - warn("%s does not support MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES!", name); - return EINVAL; - } -#endif // MPOL_F_RELATIVE_NODES -#endif // MPOL_F_STATIC_NODES - - switch (mode) { - case MPOL_DEFAULT: -#ifdef MPOL_LOCAL - case MPOL_LOCAL: -#endif // MPOL_LOCAL - // Nodemask must be empty. - if (!isEmptyVector(vec)) return EINVAL; - break; - case MPOL_BIND: - case MPOL_INTERLEAVE: - // Nodemask must be non-empty. - if (isEmptyVector(vec)) return EINVAL; - break; - case MPOL_PREFERRED: - // Nodemask could be empty or non-empty. - break; - default: - // Invalid mode. - return EINVAL; - } - return 0; -} - - -// Core-to-node mapping. -static inline uint32_t getNodeOfCore(uint32_t cid) { - assert(cid < zinfo->numCores); - return zinfo->numaMap->getNodeOfCore(cid); -} - - -// Address-to-node mapping. -static inline Address getPageAddress(void* addr) { - return zinfo->numaMap->getPageAddress((Address)addr); -} - -static inline Address getPageAddressEnd(void* addr, unsigned long len) { - return getPageAddress(reinterpret_cast(reinterpret_cast(addr) + len - 1)) + 1; -} - -static inline uint32_t getNodeOfAddr(void* addr) { - return zinfo->numaMap->getNodeOfPage(getPageAddress(addr)); -} - -static inline size_t addAddrRangeToNode(void* addr, unsigned long len, uint32_t node) { - auto begin = getPageAddress(addr); - auto end = getPageAddressEnd(addr, len); - return zinfo->numaMap->addPagesToNode(begin, end - begin, node); -} - -static inline void removeAddrRange(void* addr, unsigned long len) { - auto begin = getPageAddress(addr); - auto end = getPageAddressEnd(addr, len); - zinfo->numaMap->removePages(begin, end - begin); -} - -static inline size_t addAddrRangeThreadPolicy(void* addr, unsigned long len, uint32_t tid, uint32_t cid, NUMAPolicy* policy = nullptr) { - auto begin = getPageAddress(addr); - auto end = getPageAddressEnd(addr, len); - return zinfo->numaMap->addPagesThreadPolicy(begin, end - begin, procIdx, tid, cid, policy); -} - - -/* Patches. */ - -PostPatchFn getErrorPostPatch(int err) { - return [err](PostPatchArgs args) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-err); - return PPA_NOTHING; - }; -} - -// SYS_get_mempolicy -PostPatchFn PatchGetMempolicy(PrePatchArgs args) { - if (!zinfo->numaMap) { - warn("[%d] NUMA is not modeled in the simulated system configuration, syscall: SYS_get_mempolicy (%d)", args.tid, SYS_get_mempolicy); - return getErrorPostPatch(ENOSYS); - } - - int* mode = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 0)); - unsigned long* nodemask = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 1)); - unsigned long maxnode = PIN_GetSyscallArgument(args.ctxt, args.std, 2); - void* addr = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 3)); - unsigned long flags = PIN_GetSyscallArgument(args.ctxt, args.std, 4); - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)SYS_getpid); // no effect on host - - // Validate. - if (((flags & MPOL_F_ADDR) && addr == nullptr) || (!(flags & MPOL_F_ADDR) && addr != nullptr) - || ((flags & MPOL_F_MEMS_ALLOWED) && ((flags & MPOL_F_ADDR) || (flags & MPOL_F_NODE)))) - return getErrorPostPatch(EINVAL); - - return [=](PostPatchArgs args){ - if (!flags) { - // Return policy through mode and nodemask. - const auto& policy = zinfo->numaMap->getThreadPolicy(procIdx, args.tid); - if (mode != nullptr) { - int resMode = static_cast(policy.getMode()); - if (!safeCopy(&resMode, mode)) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EFAULT); - return PPA_NOTHING; - } - } - if (nodemask != nullptr) { - const auto& resMask = policy.getMask(); - auto err = vector2nodemask(resMask, nodemask, maxnode); - if (err) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-err); - return PPA_NOTHING; - } - } - } else if (flags & MPOL_F_MEMS_ALLOWED) { - // Return allowed nodes through nodemask. Argument mode is ignored. - if (nodemask != nullptr) { - // By default all nodes are allowed for mbind(). - g_vector resMask(zinfo->numaMap->getMaxNode() + 1, true); - auto err = vector2nodemask(resMask, nodemask, maxnode); - if (err) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-err); - return PPA_NOTHING; - } - } - } else if ((flags & MPOL_F_ADDR) && (flags & MPOL_F_NODE)) { - // Return node ID in mode for addr. - const auto node = getNodeOfAddr(addr); - if (mode != nullptr) { - int resMode = static_cast(node); - if (!safeCopy(&resMode, mode)) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EFAULT); - return PPA_NOTHING; - } - } - if (nodemask != nullptr) { - g_vector resMask(zinfo->numaMap->getMaxNode() + 1, false); - resMask[node] = true; - auto err = vector2nodemask(resMask, nodemask, maxnode); - if (err) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-err); - return PPA_NOTHING; - } - } - } else if (flags & MPOL_F_ADDR) { - // Return policy for addr through mode and nodemask, if not null. - // FIXME(mgao12): currently we do not store the allocation policy. - warn("SYS_get_mempolicy does not support MPOL_F_ADDR for allocation policy!"); - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EINVAL); - return PPA_NOTHING; - } else if (flags & MPOL_F_NODE) { - // Return next interleaving node ID. - const auto& policy = zinfo->numaMap->getThreadPolicy(procIdx, args.tid); - if (policy.getMode() != MPOL_INTERLEAVE) { - // The policy must be MPOL_INTERLEAVE. - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EINVAL); - return PPA_NOTHING; - } - const auto nextNode = zinfo->numaMap->getThreadNextAllocNode(procIdx, args.tid); - assert(nextNode != NUMAMap::INVALID_NODE); - if (mode != nullptr) { - int resMode = static_cast(nextNode); - if (!safeCopy(&resMode, mode)) { - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EFAULT); - return PPA_NOTHING; - } - } - } else { - // Invalid flags. - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EINVAL); - return PPA_NOTHING; - } - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)0); // return 0 on success - return PPA_NOTHING; - }; -} - -// SYS_set_mempolicy -PostPatchFn PatchSetMempolicy(PrePatchArgs args) { - if (!zinfo->numaMap) { - warn("[%d] NUMA is not modeled in the simulated system configuration, syscall: SYS_set_mempolicy (%d)", args.tid, SYS_set_mempolicy); - return getErrorPostPatch(ENOSYS); - } - - int mode = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - unsigned long* nodemask = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 1)); - unsigned long maxnode = PIN_GetSyscallArgument(args.ctxt, args.std, 2); - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)SYS_getpid); // no effect on host - - // Translate nodemask. - g_vector vec; - int err = nodemask2vector(nodemask, maxnode, vec); - if (err) return getErrorPostPatch(err); - - // Validate. - err = validate(mode, vec, "SYS_set_mempolicy"); - if (err) return getErrorPostPatch(err); - - // Update policy. - return [mode, vec](PostPatchArgs args) { - zinfo->numaMap->setThreadPolicy(procIdx, args.tid, mode, vec); - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)0); // return 0 on success - return PPA_NOTHING; - }; -} - -// SYS_mbind -PostPatchFn PatchMbind(PrePatchArgs args) { - if (!zinfo->numaMap) { - warn("[%d] NUMA is not modeled in the simulated system configuration, syscall: SYS_mbind (%d)", args.tid, SYS_mbind); - return getErrorPostPatch(ENOSYS); - } - - void* addr = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 0)); - unsigned long len = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - int mode = PIN_GetSyscallArgument(args.ctxt, args.std, 2); - unsigned long* nodemask = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 3)); - unsigned long maxnode = PIN_GetSyscallArgument(args.ctxt, args.std, 4); - unsigned long flags = PIN_GetSyscallArgument(args.ctxt, args.std, 5); - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)SYS_getpid); // no effect on host - - // Translate nodemask. - g_vector vec; - int err = nodemask2vector(nodemask, maxnode, vec); - if (err) return getErrorPostPatch(err); - - // Validate. - err = validate(mode, vec, "SYS_mbind"); - if (err) return getErrorPostPatch(err); - if (flags & MPOL_MF_MOVE_ALL) { - warn("SYS_mbind does not support MPOL_MF_MOVE_ALL!"); - return getErrorPostPatch(EPERM); - } - - // We must get the core info now, since thread will leave after entering syscall. - uint32_t cid = getCid(args.tid); - if (mode == MPOL_DEFAULT && cid >= zinfo->numCores) { - warn("Thread %u uses default mempolicy but runs on core %u (are we in FF?); fall back to default core 0", args.tid, cid); - cid = 0; - } - - return [=](PostPatchArgs args) { - // Construct the policy if not default. - NUMAPolicy* policy = nullptr; - if (mode != MPOL_DEFAULT) { - policy = new NUMAPolicy(mode, vec); - } - - // Add all non-existing pages; either move or ignore existing pages depending on flags. - bool isStrict = (flags & MPOL_MF_STRICT); - bool movePages = (flags & MPOL_MF_MOVE); - if (movePages) { - removeAddrRange(addr, len); - } - auto ignoredCount = addAddrRangeThreadPolicy(addr, len, args.tid, cid, policy); - if (isStrict && ignoredCount != 0) { - // Some pages do not follow the policy or could not be moved. - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EIO); - return PPA_NOTHING; - } - - delete policy; - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)0); // return 0 on success - return PPA_NOTHING; - }; -} - -// SYS_migrate_pages -PostPatchFn PatchMigratePages(PrePatchArgs args) { - if (!zinfo->numaMap) { - warn("[%d] NUMA is not modeled in the simulated system configuration, syscall: SYS_migrate_pages (%d)", args.tid, SYS_migrate_pages); - return getErrorPostPatch(ENOSYS); - } - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)SYS_getpid); // no effect on host - - return [](PostPatchArgs args) { - // FIXME(mgao12): current NUMAMap does not provide interface to migrate all pages - // associated with a node in a process, so we do not patch migrate_page for now. - warn("SYS_migrate_pages is not supported for now!"); - // Make it a failure. - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-EPERM); - return PPA_NOTHING; - }; -} - -// SYS_move_pages -PostPatchFn PatchMovePages(PrePatchArgs args) { - if (!zinfo->numaMap) { - warn("[%d] NUMA is not modeled in the simulated system configuration, syscall: SYS_move_pages (%d)", args.tid, SYS_move_pages); - return getErrorPostPatch(ENOSYS); - } - - uint32_t linuxTid = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - unsigned long count = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - void** pages = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 2)); - const int* nodes = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 3)); - int* status = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 4)); - int flags = PIN_GetSyscallArgument(args.ctxt, args.std, 5); - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)SYS_getpid); // no effect on host - - // Validate. - if (linuxTid != 0 || (flags & MPOL_MF_MOVE_ALL)) { - warn("SYS_move_pages does not support non-zero pid or MPOL_MF_MOVE_ALL!"); - return getErrorPostPatch(EPERM); - } - if (pages == nullptr) return getErrorPostPatch(EINVAL); - - return [=](PostPatchArgs args) { - int err = 0; - for (unsigned long idx = 0; idx < count; idx++) { - // Get page. - void* page = nullptr; - if (!safeCopy(pages + idx, &page)) { - err = EFAULT; - break; - }; - - int stat = 0; - if (nodes != nullptr) { - // Move pages. - int resNode = 0; - if (!safeCopy(nodes + idx, &resNode)) { - err = EFAULT; - break; - } - uint32_t node = static_cast(resNode); - if (node > zinfo->numaMap->getMaxNode()) { - err = ENODEV; - break; - } - removeAddrRange(page, 1); - assert(addAddrRangeToNode(page, 1, node) == 0); - stat = static_cast(node); - } else { - // Get current node. - uint32_t node = getNodeOfAddr(page); - stat = static_cast(node); - } - if (status != nullptr) { - if (!safeCopy(&stat, status + idx)) { - err = EFAULT; - break; - } - } - } - - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT)-err); - return PPA_NOTHING; - }; -} - -// SYS_munmap -PostPatchFn PatchMunmap(PrePatchArgs args) { - if (zinfo->numaMap) { - void* addr = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 0)); - size_t len = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - - removeAddrRange(addr, len); - } - - return NullPostPatch; -} - -// SYS_mremap -PostPatchFn PatchMremap(PrePatchArgs args) { - if (zinfo->numaMap) { - void* old_addr = reinterpret_cast(PIN_GetSyscallArgument(args.ctxt, args.std, 0)); - size_t old_size = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - - warn("mremap will NOT preserve the NUMA memory policy with the original allocation!"); - - removeAddrRange(old_addr, old_size); - } - - return NullPostPatch; -} - diff --git a/src/virt/patchdefs.h b/src/virt/patchdefs.h deleted file mode 100644 index 528b9bd..0000000 --- a/src/virt/patchdefs.h +++ /dev/null @@ -1,71 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -// Definitions of which patch functions handle which syscalls -// Uses macros, assumes you'll include this from somewhere else - -// Unconditional patches - -// File system -- fs.cpp -PF(SYS_open, PatchOpen); -PF(SYS_openat, PatchOpen); - -// Port virtualization -- ports.cpp -PF(SYS_bind, PatchBind); -PF(SYS_getsockname, PatchGetsockname); -PF(SYS_connect, PatchConnect); - -// CPU virtualization -- cpu.cpp -PF(SYS_getcpu, PatchGetcpu); -PF(SYS_sched_getaffinity, PatchSchedGetaffinity); -PF(SYS_sched_setaffinity, PatchSchedSetaffinity); - -// NUMA virtualization -- numa.cpp -PF(SYS_get_mempolicy, PatchGetMempolicy); -PF(SYS_set_mempolicy, PatchSetMempolicy); -PF(SYS_mbind, PatchMbind); -PF(SYS_migrate_pages, PatchMigratePages); -PF(SYS_move_pages, PatchMovePages); -PF(SYS_munmap, PatchMunmap); -PF(SYS_mremap, PatchMremap); - -// Thread control -- control.cpp -PF(SYS_exit_group, PatchExitGroup); - -// Conditional patches, only when not fast-forwarded - -// Time virtualization -- time.cpp -PF(SYS_gettimeofday, PatchGettimeofday); -PF(SYS_time, PatchTime); -PF(SYS_clock_gettime, PatchClockGettime); -PF(SYS_nanosleep, PatchNanosleep); -PF(SYS_clock_nanosleep, PatchNanosleep); - -// Timeout virtualization -- timeout.cpp -PF(SYS_futex, PatchTimeoutSyscall); -PF(SYS_epoll_wait, PatchTimeoutSyscall); -PF(SYS_epoll_pwait, PatchTimeoutSyscall); -PF(SYS_poll, PatchTimeoutSyscall); - diff --git a/src/virt/port_virtualizer.h b/src/virt/port_virtualizer.h deleted file mode 100644 index 1d6e7a1..0000000 --- a/src/virt/port_virtualizer.h +++ /dev/null @@ -1,73 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef VIRT_PORT_VIRTUALIZER_H_ -#define VIRT_PORT_VIRTUALIZER_H_ - -/* Simple class to keep tabs on virtualized ports */ - -#include "g_std/g_unordered_map.h" -#include "galloc.h" -#include "locks.h" - -class PortVirtualizer : public GlobAlloc { - private: - g_unordered_map realToVirt; - g_unordered_map virtToReal; - - lock_t pvLock; - - public: - PortVirtualizer() { - futex_init(&pvLock); - } - - //Must always lock before any operation, and unlock after! - //lock() unlock() are external because bind() spans multiple methods - void lock() { futex_lock(&pvLock); } - void unlock() { futex_unlock(&pvLock); } - - //Note there's no error checking for a bind that binds on a previous one. - //If someone previous bound to that port, the virtualization code should just go ahead with that mapping and - //either let bind() fail (if the previous bind is stil active) or succeed (if the previous bind ended) - void registerBind(int virt, int real) { - realToVirt[real] = virt; - virtToReal[virt] = real; - } - - //Returns -1 if not in map. For connect() and bind() - int lookupReal(int virt) { - g_unordered_map::iterator it = virtToReal.find(virt); - return (it == virtToReal.end())? -1 : it->second; - } - - //Returns -1 if not in map. For getsockname(), where the OS returns real and we need virt - int lookupVirt(int real) { - g_unordered_map::iterator it = realToVirt.find(real); - return (it == realToVirt.end())? -1 : it->second; - } -}; - -#endif // VIRT_PORT_VIRTUALIZER_H_ diff --git a/src/virt/ports.cpp b/src/virt/ports.cpp deleted file mode 100644 index 4238f6d..0000000 --- a/src/virt/ports.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include -#include -#include -#include "process_tree.h" -#include "virt/common.h" -#include "virt/port_virtualizer.h" -#include "zsim.h" - -// Helper function -static struct sockaddr_in* GetSockAddr(ADDRINT guestAddr, size_t guestSize) { - if (guestSize != sizeof(struct sockaddr_in)) return nullptr; - struct sockaddr_in* res = (struct sockaddr_in*) malloc(sizeof(struct sockaddr_in)); - if (!safeCopy((struct sockaddr_in*) guestAddr, res) || res->sin_family != AF_INET) { - free(res); - return nullptr; - } - return res; -} - -// Patch functions - -PostPatchFn PatchBind(PrePatchArgs args) { - CONTEXT* ctxt = args.ctxt; - SYSCALL_STANDARD std = args.std; - - ADDRINT sAddrPtr = PIN_GetSyscallArgument(ctxt, std, 1); - ADDRINT sLen = PIN_GetSyscallArgument(ctxt, std, 2); - struct sockaddr_in* servAddr = GetSockAddr(sAddrPtr, sLen); - if (!servAddr) return NullPostPatch; // invalid input or socketaddr family - - int port = ntohs(servAddr->sin_port); - if (port != 0) { // if port is 0, we don't need to virtualize, OS will assign a free one - uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); - info("Virtualizing bind() to port %d (domain %d)", port, portDomain); - zinfo->portVirt[portDomain]->lock(); //unlocked either on write failure below, or after the syscall - int prevPort = zinfo->portVirt[portDomain]->lookupReal(port); - if (prevPort == -1) { - // No previous bind(), request whatever - servAddr->sin_port = htons(0); - } else { - // There was a previous bind() on this port, so we reuse the translation - // This should work in MOST cases, but may fail if the port is reused by something else and we conflict. Should be quite rare, since Linux tries to space out anonymous reassigns to the same port - warn("bind() to port %d, this port already has a translation %d, using it --- in rare cases this may fail when the unvirtualized case should succeed", port, prevPort); - servAddr->sin_port = htons(prevPort); - } - PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT) servAddr); - - auto postFn = [sAddrPtr](PostPatchArgs args) { - struct sockaddr_in* servAddr = (struct sockaddr_in*) PIN_GetSyscallArgument(args.ctxt, args.std, 1); - int virtPort = ntohs(((struct sockaddr_in*)sAddrPtr)->sin_port); - - uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); - REG out = (REG) PIN_GetSyscallNumber(args.ctxt, args.std); - if (out == 0) { - int sockfd = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - struct sockaddr_in sockName; //NOTE: sockaddr_in to sockaddr casts are fine - socklen_t sockLen = sizeof(sockName); - if (getsockname(sockfd, (struct sockaddr*)&sockName, &sockLen) != 0) { - panic("bind() succeeded, but getsockname() failed..."); - } - int realPort = ntohs(sockName.sin_port); - - info("Virtualized bind(), v: %d r: %d (domain %d)", virtPort, realPort, portDomain); - zinfo->portVirt[portDomain]->registerBind(virtPort, realPort); - } else { - info("bind(): tried to virtualize port, but bind() failed, not registering (domain %d)", portDomain); - } - zinfo->portVirt[portDomain]->unlock(); // note lock was in prepatch - - // Restore original descriptor, free alloc - PIN_SetSyscallArgument(args.ctxt, args.std, 1, sAddrPtr); - free(servAddr); - return PPA_NOTHING; - }; - return postFn; - } else { - free(servAddr); - return NullPostPatch; - } -} - -PostPatchFn PatchGetsockname(PrePatchArgs args) { - return [](PostPatchArgs args) { - CONTEXT* ctxt = args.ctxt; - SYSCALL_STANDARD std = args.std; - - REG out = (REG) PIN_GetSyscallNumber(ctxt, std); - if (out == 0) { - ADDRINT sockAddrPtr = PIN_GetSyscallArgument(ctxt, std, 1); - struct sockaddr_in sockAddr; - //safecopy may fail here and that's OK, it's just not a sockaddr_in, so not IPv4 - if (safeCopy((struct sockaddr_in*) sockAddrPtr, &sockAddr) && sockAddr.sin_family == AF_INET) { - int realPort = ntohs(sockAddr.sin_port); - uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); - zinfo->portVirt[portDomain]->lock(); - int virtPort = zinfo->portVirt[portDomain]->lookupVirt(realPort); - zinfo->portVirt[portDomain]->unlock(); - if (virtPort != -1) { - info("Virtualizing getsockname() on previously bound port, r: %d, v: %d (domain %d)", realPort, virtPort, portDomain); - sockAddr.sin_port = htons(virtPort); - if (!safeCopy(&sockAddr, (struct sockaddr_in*) sockAddrPtr)) { - panic("getsockname() virt fail"); - } - } - } - } //else this failed, no need to virtualize - return PPA_NOTHING; - }; -} - -PostPatchFn PatchConnect(PrePatchArgs args) { - CONTEXT* ctxt = args.ctxt; - SYSCALL_STANDARD std = args.std; - - ADDRINT sAddrPtr = PIN_GetSyscallArgument(ctxt, std, 1); - ADDRINT sLen = PIN_GetSyscallArgument(ctxt, std, 2); - struct sockaddr_in* servAddr = GetSockAddr(sAddrPtr, sLen); - if (!servAddr) return NullPostPatch; // invalid input or socketaddr family - - int virtPort = ntohs(servAddr->sin_port); - uint32_t portDomain = zinfo->procArray[procIdx]->getPortDomain(); - zinfo->portVirt[portDomain]->lock(); - int realPort = zinfo->portVirt[portDomain]->lookupReal(virtPort); - zinfo->portVirt[portDomain]->unlock(); - if (realPort != -1) { - info("Virtualizing connect(), v: %d r: %d (domain %d)", virtPort, realPort, portDomain); - servAddr->sin_port = htons(realPort); - PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT) servAddr); - - auto postFn = [sAddrPtr, servAddr](PostPatchArgs args) { - //Restore original (virt) port (NOTE: regardless of whether connect() succeeded or not) - PIN_SetSyscallArgument(args.ctxt, args.std, 1, sAddrPtr); - free(servAddr); - return PPA_NOTHING; - }; - return postFn; - } else { - free(servAddr); - return NullPostPatch; - } -} - diff --git a/src/virt/syscall_name.cpp.in b/src/virt/syscall_name.cpp.in deleted file mode 100644 index 1f2574e..0000000 --- a/src/virt/syscall_name.cpp.in +++ /dev/null @@ -1,36 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2014 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -static const char* syscallNames[] = { -// Auto-generated at build time -SYSCALL_NAME_LIST -}; - -#include - -const char* GetSyscallName(uint32_t syscall) { - return (syscall >= sizeof(syscallNames)/sizeof(syscallNames[0]))? "INVALID" : syscallNames[syscall]; -} - diff --git a/src/virt/syscall_name.h b/src/virt/syscall_name.h deleted file mode 100644 index 4c29063..0000000 --- a/src/virt/syscall_name.h +++ /dev/null @@ -1,31 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef VIRT_SYSCALL_NAME_H_ -#define VIRT_SYSCALL_NAME_H_ - -const char* GetSyscallName(uint32_t syscall); - -#endif // VIRT_SYSCALL_NAME_H_ diff --git a/src/virt/time.cpp b/src/virt/time.cpp deleted file mode 100644 index c0ba417..0000000 --- a/src/virt/time.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/** $glic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * Copyright (C) 2011 Google Inc. - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include -#include -#include "log.h" -#include "process_tree.h" -#include "rdtsc.h" -#include "scheduler.h" -#include "virt/common.h" -#include "virt/time_conv.h" -#include "zsim.h" - -static bool SkipTimeVirt(PrePatchArgs args) { - // having both conditions ensures that we don't virtualize in the interim of toggling ff ON - return args.isNopThread || zinfo->procArray[procIdx]->isInFastForward(); -} - -// General virtualization functions, used for both syscall and vsyscall/vdso virtualization - -void VirtGettimeofday(uint32_t tid, ADDRINT arg0) { - trace(TimeVirt, "[%d] Post-patching gettimeofday", tid); - if (arg0) { - struct timeval tv; - if (!safeCopy((struct timeval*) arg0, &tv)) { - info("Failed read of gettimeofday() input"); - return; - } - trace(TimeVirt, "Orig %ld sec, %ld usec", tv.tv_sec, tv.tv_usec); - uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - tv = nsToTimeval(zinfo->clockDomainInfo[domain].realtimeOffsetNs + simNs); - - trace(TimeVirt, " Patched %ld sec, %ld usec", tv.tv_sec, tv.tv_usec); - if (!safeCopy(&tv, (struct timeval*) arg0)) { - info("Failed write of gettimeofday() output"); - } - } -} - -void VirtTime(uint32_t tid, REG* out, ADDRINT arg0) { - time_t origRes = (time_t)out; - if (origRes == ((time_t)-1) || origRes == ((time_t)-EFAULT)) { //glibc will return -1; raw syscall will return -EFAULT - info("[%d] post-patch time(), returned error or EFAULT (%ld)", tid, origRes); - return; - } - - uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - time_t tm = (zinfo->clockDomainInfo[domain].realtimeOffsetNs + simNs)/NSPS; - - trace(TimeVirt, "[%d] Post-patching time(), orig %ld, new %ld", tid, (time_t)*out, tm); - *out = (REG)tm; - if (arg0) { - if (!safeCopy(&tm, (time_t*) arg0)) { - info("Failed write of time() output"); - } - } -} - -void VirtClockGettime(uint32_t tid, ADDRINT arg0, ADDRINT arg1) { - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - ClockDomainInfo& dom = zinfo->clockDomainInfo[domain]; - - //arg0 indicates clock type - uint64_t offset = 0; - switch (arg0) { - case CLOCK_MONOTONIC: - offset = dom.monotonicOffsetNs; - break; - case CLOCK_REALTIME: - offset = dom.realtimeOffsetNs; - break; - case CLOCK_PROCESS_CPUTIME_ID: - offset = dom.processOffsetNs; - break; - case CLOCK_THREAD_CPUTIME_ID: - offset = dom.processOffsetNs; - warn("clock_gettime() called with CLOCK_THREAD_CPUTIME_ID, faking with CLOCK_PROCESS_CPUTIME_ID"); - break; - } //with others, the result does not matter --- actual clock_gettime has returned -1 and EINVAL - - if (arg1) { - struct timespec ts; - if (!safeCopy((struct timespec*) arg1, &ts)) { - info("Failed read of clock_gettime() input"); - return; - } - - trace(TimeVirt, "Patching clock_gettime()"); - trace(TimeVirt, "Orig %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); - - syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &ts); - trace(TimeVirt, "MONOTONIC %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); - syscall(SYS_clock_gettime, CLOCK_REALTIME, &ts); - trace(TimeVirt, "REALTIME %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); - syscall(SYS_clock_gettime, CLOCK_PROCESS_CPUTIME_ID, &ts); - trace(TimeVirt, "PROCESS_CPUTIME_ID %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); - syscall(SYS_clock_gettime, CLOCK_THREAD_CPUTIME_ID, &ts); - trace(TimeVirt, "THREAD_CPUTIME_ID %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); - - uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); - ts = nsToTimespec(offset + simNs); - trace(TimeVirt, "Patched %ld sec, %ld nsec", ts.tv_sec, ts.tv_nsec); - - if (!safeCopy(&ts, (struct timespec*) arg1)) { - info("Failed write of gettimeofday() output"); - } - } -} - -// Syscall patch wrappers - -PostPatchFn PatchGettimeofday(PrePatchArgs args) { - if (SkipTimeVirt(args)) return NullPostPatch; - return [](PostPatchArgs args) { - trace(TimeVirt, "[%d] Post-patching SYS_gettimeofday", args.tid); - ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - VirtGettimeofday(args.tid, arg0); - return PPA_NOTHING; - }; -} - -PostPatchFn PatchTime(PrePatchArgs args) { - if (SkipTimeVirt(args)) return NullPostPatch; - return [](PostPatchArgs args) { - trace(TimeVirt, "[%d] Post-patching SYS_time", args.tid); - ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - REG out = (REG)PIN_GetSyscallNumber(args.ctxt, args.std); - VirtTime(args.tid, &out, arg0); - PIN_SetSyscallNumber(args.ctxt, args.std, (ADDRINT) out); // hack, we have no way of setting the result, this changes rax just as well - return PPA_NOTHING; - }; -} - -PostPatchFn PatchClockGettime(PrePatchArgs args) { - if (SkipTimeVirt(args)) return NullPostPatch; - return [](PostPatchArgs args) { - trace(TimeVirt, "[%d] Post-patching SYS_clock_gettime", args.tid); - ADDRINT arg0 = PIN_GetSyscallArgument(args.ctxt, args.std, 0); - ADDRINT arg1 = PIN_GetSyscallArgument(args.ctxt, args.std, 1); - VirtClockGettime(args.tid, arg0, arg1); - return PPA_NOTHING; - }; -} - -// SYS_nanosleep & SYS_clock_nanosleep - -PostPatchFn PatchNanosleep(PrePatchArgs args) { - if (SkipTimeVirt(args)) return NullPostPatch; - - CONTEXT* ctxt = args.ctxt; - SYSCALL_STANDARD std = args.std; - uint32_t syscall = PIN_GetSyscallNumber(ctxt, std); - bool isClock = (syscall == SYS_clock_nanosleep); - assert(isClock || syscall == SYS_nanosleep); - - struct timespec* ts; - uint64_t offsetNsec = 0; - if (isClock) { - trace(TimeVirt, "[%d] Pre-patching SYS_clock_nanosleep", tid); - int flags = (int) PIN_GetSyscallArgument(ctxt, std, 1); - ts = (struct timespec*) PIN_GetSyscallArgument(ctxt, std, 2); - if (flags == TIMER_ABSTIME) { - trace(TimeVirt, "[%d] SYS_clock_nanosleep requests TIMER_ABSTIME, offsetting", tid); - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); - offsetNsec = simNs + zinfo->clockDomainInfo[domain].realtimeOffsetNs; - } - } else { - trace(TimeVirt, "[%d] Pre-patching SYS_nanosleep", tid); - ts = (struct timespec*) PIN_GetSyscallArgument(ctxt, std, 0); - } - - // Check preconditions - // FIXME, shouldn't this use safeCopy?? - if (!ts) return NullPostPatch; // kernel will return EFAULT - if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec > 999999999) return NullPostPatch; // kernel will return EINVAL - - uint64_t waitNsec = timespecToNs(*ts); - if (waitNsec >= offsetNsec) waitNsec -= offsetNsec; - else waitNsec = 0; - - uint64_t waitCycles = nsToCycles(waitNsec); - uint64_t waitPhases = waitCycles/zinfo->phaseLength + 1; //wait at least 1 phase - uint64_t wakeupPhase = zinfo->numPhases + waitPhases; - - volatile uint32_t* futexWord = zinfo->sched->markForSleep(procIdx, args.tid, wakeupPhase); - - // Save args - ADDRINT arg0 = PIN_GetSyscallArgument(ctxt, std, 0); - ADDRINT arg1 = PIN_GetSyscallArgument(ctxt, std, 1); - ADDRINT arg2 = PIN_GetSyscallArgument(ctxt, std, 2); - ADDRINT arg3 = PIN_GetSyscallArgument(ctxt, std, 3); - struct timespec* rem = (struct timespec*) PIN_GetSyscallArgument(ctxt, std, isClock? 3 : 1); - - // Turn this into a non-timed FUTEX_WAIT syscall - PIN_SetSyscallNumber(ctxt, std, SYS_futex); - PIN_SetSyscallArgument(ctxt, std, 0, (ADDRINT)futexWord); - PIN_SetSyscallArgument(ctxt, std, 1, (ADDRINT)FUTEX_WAIT); - PIN_SetSyscallArgument(ctxt, std, 2, (ADDRINT)1 /*by convention, see sched code*/); - PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)nullptr); - - return [isClock, wakeupPhase, arg0, arg1, arg2, arg3, rem](PostPatchArgs args) { - CONTEXT* ctxt = args.ctxt; - SYSCALL_STANDARD std = args.std; - - if (isClock) { - trace(TimeVirt, "[%d] Post-patching SYS_clock_nanosleep", tid); - } else { - trace(TimeVirt, "[%d] Post-patching SYS_nanosleep", tid); - } - - int res = (int)(-PIN_GetSyscallNumber(ctxt, std)); - if (res == EWOULDBLOCK) { - trace(TimeVirt, "Fixing EWOULDBLOCK --> 0"); - PIN_SetSyscallNumber(ctxt, std, 0); // this is fine, you just called a very very short sleep - } else if (res == EINTR) { - PIN_SetSyscallNumber(ctxt, std, -EINTR); // we got an interrupt - } else { - trace(TimeVirt, "%d", res); - assert(res == 0); - } - - // Restore pre-call args - PIN_SetSyscallArgument(ctxt, std, 0, arg0); - PIN_SetSyscallArgument(ctxt, std, 1, arg1); - PIN_SetSyscallArgument(ctxt, std, 2, arg2); - PIN_SetSyscallArgument(ctxt, std, 3, arg3); - - // Handle remaining time stuff - if (rem) { - if (res == EINTR) { - assert(wakeupPhase >= zinfo->numPhases); // o/w why is this EINTR... - uint64_t remainingCycles = wakeupPhase - zinfo->numPhases; - uint64_t remainingNsecs = remainingCycles*1000/zinfo->freqMHz; - rem->tv_sec = remainingNsecs/1000000000; - rem->tv_nsec = remainingNsecs % 1000000000; - } else { - rem->tv_sec = 0; - rem->tv_nsec = 0; - } - } - - return PPA_NOTHING; - }; -} - -// Clock domain query functions - -void VirtCaptureClocks(bool isDeffwd) { - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - ClockDomainInfo& dom = zinfo->clockDomainInfo[domain]; - futex_lock(&dom.lock); - if (isDeffwd || dom.realtimeOffsetNs == 0) { - info("[%d] Adjusting clocks, domain %d, de-ffwd %d", procIdx, domain, isDeffwd); - - struct timespec realtime; - struct timespec monotonic; - struct timespec process; - syscall(SYS_clock_gettime, CLOCK_REALTIME, &realtime); - syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &monotonic); - syscall(SYS_clock_gettime, CLOCK_PROCESS_CPUTIME_ID, &process); - uint64_t realRdtsc = rdtsc(); - - uint64_t curCycles = zinfo->globPhaseCycles; - uint64_t curNs = cyclesToNs(curCycles); - - uint64_t realtimeNs = timespecToNs(realtime); - uint64_t monotonicNs = timespecToNs(monotonic); - uint64_t processNs = timespecToNs(process); - - dom.realtimeOffsetNs = realtimeNs - curNs; - dom.monotonicOffsetNs = monotonicNs - curNs; - dom.processOffsetNs = processNs - curNs; - dom.rdtscOffset = realRdtsc - curCycles; - - //info("Offsets: %ld %ld %ld %ld", dom.realtimeOffsetNs, dom.monotonicOffsetNs, dom.processOffsetNs, dom.rdtscOffset) - } - futex_unlock(&dom.lock); -} - -uint64_t VirtGetPhaseRDTSC() { - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - return zinfo->clockDomainInfo[domain].rdtscOffset + zinfo->globPhaseCycles; -} - diff --git a/src/virt/time_conv.h b/src/virt/time_conv.h deleted file mode 100644 index fd218ba..0000000 --- a/src/virt/time_conv.h +++ /dev/null @@ -1,67 +0,0 @@ -/** $glic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * Copyright (C) 2011 Google Inc. - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef VIRT_TIME_CONV_H_ -#define VIRT_TIME_CONV_H_ - -#include - -// Helper functions to translate between ns, timespec/timeval, and cycles - -// ns per s :) -#define NSPS (1000*1000*1000L) - -static inline uint64_t timevalToNs(struct timeval tv) { - return tv.tv_sec*NSPS + tv.tv_usec*1000L; -} - -static inline uint64_t timespecToNs(struct timespec ts) { - return ts.tv_sec*NSPS + ts.tv_nsec; -} - -static inline struct timeval nsToTimeval(uint64_t ns) { - struct timeval res; - res.tv_sec = ns/NSPS; - res.tv_usec = (ns % NSPS)/1000; - return res; -} - -static inline struct timespec nsToTimespec(uint64_t ns) { - struct timespec res; - res.tv_sec = ns/NSPS; - res.tv_nsec = (ns % NSPS); - return res; -} - -static inline uint64_t cyclesToNs(uint64_t cycles) { - return cycles*1000/zinfo->freqMHz; -} - -static inline uint64_t nsToCycles(uint64_t cycles) { - return cycles*zinfo->freqMHz/1000; -} - -#endif // VIRT_TIME_CONV_H_ diff --git a/src/virt/timeout.cpp b/src/virt/timeout.cpp deleted file mode 100644 index dcd2963..0000000 --- a/src/virt/timeout.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include "constants.h" -#include "log.h" -#include "scheduler.h" -#include "process_tree.h" -#include "virt/common.h" -#include "virt/syscall_name.h" -#include "virt/time_conv.h" -#include "zsim.h" - -static struct timespec fakeTimeouts[MAX_THREADS]; //for syscalls that use timespec to indicate a timeout -static bool inFakeTimeoutMode[MAX_THREADS]; - -static bool SkipTimeoutVirt(PrePatchArgs args) { - // having both conditions ensures that we don't virtualize in the interim of toggling ff ON - return args.isNopThread || zinfo->procArray[procIdx]->isInFastForward(); -} - -// Helper function, see /usr/include/linux/futex.h -static bool isFutexWaitOp(int op) { - switch (op & FUTEX_CMD_MASK) { //handles PRIVATE / REALTIME as well - case FUTEX_WAIT: - case FUTEX_WAIT_BITSET: - case FUTEX_WAIT_REQUEUE_PI: - return true; - default: - return false; - } -} - -static bool isFutexWakeOp(int op) { - switch (op & FUTEX_CMD_MASK) { - case FUTEX_WAKE: - case FUTEX_REQUEUE: - case FUTEX_CMP_REQUEUE: - case FUTEX_WAKE_OP: - case FUTEX_WAKE_BITSET: - case FUTEX_CMP_REQUEUE_PI: - return true; - default: - return false; - } -} - - -static int getTimeoutArg(int syscall) { - if (syscall == SYS_poll) return 2; - return 3; // futex, epoll_wait, epoll_pwait -} - -static bool PrePatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std, int syscall) { - assert(!inFakeTimeoutMode[tid]); // canary: this will probably fail... - int64_t waitNsec = 0; - - // Per-syscall manipulation. This code either succeeds, fakes timeout value and sets waitNsec, or returns false - int timeoutArg = getTimeoutArg(syscall); - if (syscall == SYS_futex) { - // Check preconditions - assert(timeoutArg == 3); - int* uaddr = (int*) PIN_GetSyscallArgument(ctxt, std, 0); - int op = (int) PIN_GetSyscallArgument(ctxt, std, 1); - const struct timespec* timeout = (const struct timespec*) PIN_GetSyscallArgument(ctxt, std, 3); - - //info("FUTEX op %d waitOp %d uaddr %p ts %p", op, isFutexWaitOp(op), uaddr, timeout); - if (!(uaddr && isFutexWaitOp(op) && timeout)) return false; // not a timeout FUTEX_WAIT - - waitNsec = timeout->tv_sec*1000000000L + timeout->tv_nsec; - - if (op & FUTEX_CLOCK_REALTIME) { - // NOTE: FUTEX_CLOCK_REALTIME is not a documented interface AFAIK, but looking at the Linux source code + with some verification, this is the xlat - uint32_t domain = zinfo->procArray[procIdx]->getClockDomain(); - uint64_t simNs = cyclesToNs(zinfo->globPhaseCycles); - uint64_t offsetNs = simNs + zinfo->clockDomainInfo[domain].realtimeOffsetNs; - //info(" REALTIME FUTEX: %ld %ld %ld %ld", waitNsec, simNs, offsetNs, waitNsec-offsetNs); - waitNsec = (waitNsec > (int64_t)offsetNs)? (waitNsec - offsetNs) : 0; - } - - if (waitNsec <= 0) return false; // while technically waiting, this does not block. I'm guessing this is done for trylocks? It's weird. - - fakeTimeouts[tid].tv_sec = 0; - fakeTimeouts[tid].tv_nsec = 20*1000*1000; // timeout every 20ms of actual host time - PIN_SetSyscallArgument(ctxt, std, 3, (ADDRINT)&fakeTimeouts[tid]); - } else { - assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll); - int timeout = (int) PIN_GetSyscallArgument(ctxt, std, timeoutArg); - if (timeout <= 0) return false; - //info("[%d] pre-patch epoll_wait/pwait", tid); - - PIN_SetSyscallArgument(ctxt, std, timeoutArg, 20); // 20ms timeout - waitNsec = ((uint64_t)timeout)*1000*1000; // timeout is in ms - } - - //info("[%d] pre-patch %s (%d) waitNsec = %ld", tid, GetSyscallName(syscall), syscall, waitNsec); - - uint64_t waitCycles = waitNsec*zinfo->freqMHz/1000; - uint64_t waitPhases = waitCycles/zinfo->phaseLength; - if (waitPhases < 2) waitPhases = 2; // at least wait 2 phases; this should basically eliminate the chance that we get a SIGSYS before we start executing the syscal instruction - uint64_t wakeupPhase = zinfo->numPhases + waitPhases; - - /*volatile uint32_t* futexWord =*/ zinfo->sched->markForSleep(procIdx, tid, wakeupPhase); // we still want to mark for sleep, bear with me... - inFakeTimeoutMode[tid] = true; - return true; -} - -static bool PostPatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std, int syscall, ADDRINT prevIp, ADDRINT timeoutArgVal) { - assert(inFakeTimeoutMode[tid]); - int res = (int)PIN_GetSyscallNumber(ctxt, std); - - // Decide if it timed out - bool timedOut; - if (syscall == SYS_futex) { - timedOut = (res == -ETIMEDOUT); - } else { - timedOut = (res == 0); - } - - bool isSleeping = zinfo->sched->isSleeping(procIdx, tid); - - // Decide whether to retry - bool retrySyscall; - if (!timedOut) { - if (isSleeping) zinfo->sched->notifySleepEnd(procIdx, tid); - retrySyscall = false; - } else { - retrySyscall = isSleeping; - } - - if (retrySyscall && zinfo->procArray[procIdx]->isInFastForward()) { - warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall)); - retrySyscall = false; - assert(isSleeping); - zinfo->sched->notifySleepEnd(procIdx, tid); - } - - if (retrySyscall) { - // ADDRINT curIp = PIN_GetContextReg(ctxt, REG_INST_PTR); - //info("[%d] post-patch, retrying, IP: 0x%lx -> 0x%lx", tid, curIp, prevIp); - PIN_SetContextReg(ctxt, REG_INST_PTR, prevIp); - PIN_SetSyscallNumber(ctxt, std, syscall); - } else { - // Restore timeout arg - PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutArgVal); - inFakeTimeoutMode[tid] = false; - - // Restore arg? I don't think we need this! - /*if (syscall == SYS_futex) { - PIN_SetSyscallNumber(ctxt, std, -ETIMEDOUT); - } else { - assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll); - PIN_SetSyscallNumber(ctxt, std, 0); //no events returned - }*/ - } - - //info("[%d] post-patch %s (%d), timedOut %d, sleeping (orig) %d, retrying %d, orig res %d, patched res %d", tid, GetSyscallName(syscall), syscall, timedOut, isSleeping, retrySyscall, res, (int)PIN_GetSyscallNumber(ctxt, std)); - return retrySyscall; -} - -/* Notify scheduler about FUTEX_WAITs woken up by FUTEX_WAKEs, FUTEX_WAKE entries, and FUTEX_WAKE exits */ - -struct FutexInfo { - int op; - int val; -}; - -FutexInfo PrePatchFutex(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std) { - FutexInfo fi; - fi.op = (int) PIN_GetSyscallArgument(ctxt, std, 1); - fi.val = (int) PIN_GetSyscallArgument(ctxt, std, 2); - if (isFutexWakeOp(fi.op)) { - zinfo->sched->notifyFutexWakeStart(procIdx, tid, fi.val); - } - return fi; -} - -void PostPatchFutex(uint32_t tid, FutexInfo fi, CONTEXT* ctxt, SYSCALL_STANDARD std) { - int res = (int) PIN_GetSyscallNumber(ctxt, std); - if (isFutexWaitOp(fi.op) && res == 0) { - zinfo->sched->notifyFutexWaitWoken(procIdx, tid); - } else if (isFutexWakeOp(fi.op) && res >= 0) { - /* In contrast to the futex manpage, from the kernel's futex.c - * (do_futex), WAKE and WAKE_OP return the number of threads woken up, - * but the REQUEUE and CMP_REQUEUE and REQUEUE_PI ops return the number - * of threads woken up + requeued. However, these variants - * (futex_requeue) first try to wake the specified threads, then - * requeue as many other threads as they can. - * - * Therefore, this wokenUp expression should be correct for all variants - * of SYS_futex that wake up threads (WAKE, REQUEUE, CMP_REQUEUE, ...) - */ - uint32_t wokenUp = std::min(res, fi.val); - zinfo->sched->notifyFutexWakeEnd(procIdx, tid, wokenUp); - } -} - -PostPatchFn PatchTimeoutSyscall(PrePatchArgs args) { - if (SkipTimeoutVirt(args)) return NullPostPatch; - - int syscall = PIN_GetSyscallNumber(args.ctxt, args.std); - assert_msg(syscall == SYS_futex || syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll, - "Invalid timeout syscall %d", syscall); - - FutexInfo fi = {0, 0}; - if (syscall == SYS_futex) fi = PrePatchFutex(args.tid, args.ctxt, args.std); - - if (PrePatchTimeoutSyscall(args.tid, args.ctxt, args.std, syscall)) { - ADDRINT prevIp = PIN_GetContextReg(args.ctxt, REG_INST_PTR); - ADDRINT timeoutArgVal = PIN_GetSyscallArgument(args.ctxt, args.std, getTimeoutArg(syscall)); - return [syscall, prevIp, timeoutArgVal, fi](PostPatchArgs args) { - if (PostPatchTimeoutSyscall(args.tid, args.ctxt, args.std, syscall, prevIp, timeoutArgVal)) { - return PPA_USE_RETRY_PTRS; // retry - } else { - if (syscall == SYS_futex) PostPatchFutex(args.tid, fi, args.ctxt, args.std); - return PPA_USE_JOIN_PTRS; // finish - } - }; - } else { - if (syscall == SYS_futex) { - return [fi](PostPatchArgs args) { - PostPatchFutex(args.tid, fi, args.ctxt, args.std); - return PPA_NOTHING; - }; - } else { - return NullPostPatch; - } - } -} - diff --git a/src/virt/virt.cpp b/src/virt/virt.cpp deleted file mode 100644 index a97f3de..0000000 --- a/src/virt/virt.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/** $glic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * Copyright (C) 2011 Google Inc. - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include -#include "constants.h" -#include "log.h" -#include "virt/common.h" -#include "virt/syscall_name.h" -#include "virt/virt.h" - -#define MAX_SYSCALLS 350 // doesn't need to be accurate - -PrePatchFn prePatchFunctions[MAX_SYSCALLS]; -PostPatchFn postPatchFunctions[MAX_THREADS]; - -const PostPatchFn NullPostPatch; - -// Common prepatch functions -PostPatchFn NullPatch(PrePatchArgs) { - return NullPostPatch; -} - -PostPatchFn WarnTimingRelated(PrePatchArgs args) { - uint32_t syscall = PIN_GetSyscallNumber(args.ctxt, args.std); - warn("[%d] Executing unvirtualized potentially timing-sensitive syscall: %s (%d)", args.tid, GetSyscallName(syscall), syscall); - return NullPostPatch; -} - -// Define all patch functions -#define PF(syscall, pfn) PostPatchFn pfn(PrePatchArgs args); -#include "virt/patchdefs.h" -#undef PF - -void VirtInit() { - for (uint32_t i = 0; i < MAX_SYSCALLS; i++) prePatchFunctions[i] = NullPatch; - - // Issue warnings on timing-sensitive syscalls - uint32_t timingSyscalls[] = {SYS_select, SYS_getitimer, SYS_alarm, SYS_setitimer, SYS_semop, - SYS_gettimeofday, SYS_times, SYS_rt_sigtimedwait, SYS_time, SYS_futex, SYS_mq_timedsend, - SYS_mq_timedreceive, SYS_pselect6, SYS_ppoll}; - for (uint32_t syscall : timingSyscalls) { - prePatchFunctions[syscall] = WarnTimingRelated; - } - - // Bind all patch functions - #define PF(syscall, pfn) prePatchFunctions[syscall] = pfn; - #include "virt/patchdefs.h" - #undef PF -} - - -// Dispatch methods -void VirtSyscallEnter(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, const char* patchRoot, bool isNopThread) { - uint32_t syscall = PIN_GetSyscallNumber(ctxt, std); - // glibc version 2.28+, if built with GCC's -fcf-protection, will have - // init_cpu_features() (which runs early on during the execution of any - // process) attempt to call the nonexisting ARCH_CET_STATUS (0x3001) - // subfunction of arch_prctl. See: - // https://sourceware.org/git/?p=glibc.git;a=commit;h=394df3815e8ceec750fd06583eee4896174ce808 - // This became the default in Ubuntu 19.10+. See: - // https://wiki.ubuntu.com/ToolChain/CompilerFlags#A-fcf-protection - // Pin v2.14 crashes when it sees this unexpected arch_prctl subfunction. - // Avoid the crash by just pretending to execute the syscall instruction - // while skipping over it. - if (syscall == SYS_arch_prctl && PIN_GetContextReg(ctxt, REG_RDI) == 0x3001) { - PIN_SetContextReg(ctxt, REG_INST_PTR, PIN_GetContextReg(ctxt, REG_INST_PTR) + 2); - PIN_SetContextReg(ctxt, REG_RAX, -1UL); - return; - } - // glibc version 2.34+ uses the clone3 syscall, but will fallback to clone - // if errno is ENOSYS. So pretend to fail with this errno. To produce - // portable binaries, do this even if compiling on a machine where - // SYS_clone3 is undefined. - if (syscall == 435/*SYS_clone3*/) { - PIN_SetContextReg(ctxt, REG_RAX, -ENOSYS); - PIN_SetContextReg(ctxt, REG_INST_PTR, PIN_GetContextReg(ctxt, REG_INST_PTR) + 2); - return; - } - if (syscall >= MAX_SYSCALLS) { - warn("syscall %d out of range", syscall); - postPatchFunctions[tid] = NullPostPatch; - } else { - postPatchFunctions[tid] = prePatchFunctions[syscall]({tid, ctxt, std, patchRoot, isNopThread}); - } -} - -PostPatchAction VirtSyscallExit(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std) { - return postPatchFunctions[tid]({tid, ctxt, std}); -} - diff --git a/src/virt/virt.h b/src/virt/virt.h deleted file mode 100644 index 83e05be..0000000 --- a/src/virt/virt.h +++ /dev/null @@ -1,54 +0,0 @@ -/** $glic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * Copyright (C) 2011 Google Inc. - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef VIRT_VIRT_H_ -#define VIRT_VIRT_H_ - -// External virt interface - -#include "pin.H" - -enum PostPatchAction { - PPA_NOTHING, - PPA_USE_RETRY_PTRS, - PPA_USE_JOIN_PTRS, -}; - -void VirtInit(); // per-process, not global -void VirtSyscallEnter(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, const char* patchRoot, bool isNopThread); -PostPatchAction VirtSyscallExit(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std); - -// VDSO / external virt functions -void VirtGettimeofday(uint32_t tid, ADDRINT arg0); -void VirtTime(uint32_t tid, REG* retVal, ADDRINT arg0); -void VirtClockGettime(uint32_t tid, ADDRINT arg0, ADDRINT arg1); -void VirtGetcpu(uint32_t tid, uint32_t cpu, ADDRINT arg0, ADDRINT arg1); - -// Time virtualization direct functions -void VirtCaptureClocks(bool isDeffwd); // called on start and ffwd to get all clocks together -uint64_t VirtGetPhaseRDTSC(); - -#endif // VIRT_VIRT_H_ diff --git a/src/zsim.cpp b/src/zsim.cpp index d2c2101..386c310 100644 --- a/src/zsim.cpp +++ b/src/zsim.cpp @@ -28,6 +28,9 @@ #include "zsim.h" #include +#include +#include +#include #include #include #include @@ -40,26 +43,30 @@ #include #include #include +#include +#include +#include +#include +#include #include #include "access_tracing.h" +#include "config.h" #include "constants.h" #include "contention_sim.h" #include "core.h" +#include "decoder.h" #include "cpuenum.h" #include "cpuid.h" #include "debug_zsim.h" #include "event_queue.h" #include "galloc.h" #include "init.h" +#include "ipc_handler.h" #include "log.h" -#include "pin.H" -#include "pin_cmd.h" #include "process_tree.h" #include "profile_stats.h" #include "scheduler.h" #include "stats.h" -#include "trace_driver.h" -#include "virt/virt.h" using namespace std; @@ -67,32 +74,6 @@ using namespace std; /* Command-line switches (used to pass info from harness that cannot be passed through the config file, most config is file-based) */ -KNOB KnobProcIdx(KNOB_MODE_WRITEONCE, "pintool", - "procIdx", "0", "zsim process idx (internal)"); - -KNOB KnobShmid(KNOB_MODE_WRITEONCE, "pintool", - "shmid", "0", "SysV IPC shared memory id used when running in multi-process mode"); - -KNOB KnobConfigFile(KNOB_MODE_WRITEONCE, "pintool", - "config", "zsim.cfg", "config file name (only needed for the first simulated process)"); - -//We need to know these as soon as we start, otherwise we could not log anything until we attach and read the config -KNOB KnobLogToFile(KNOB_MODE_WRITEONCE, "pintool", - "logToFile", "false", "true if all messages should be logged to a logfile instead of stdout/err"); - -KNOB KnobOutputDir(KNOB_MODE_WRITEONCE, "pintool", - "outputDir", "./", "absolute path to write output files into"); - - - -/* ===================================================================== */ - -INT32 Usage() { - cerr << "zsim simulator pintool" << endl; - cerr << KNOB_BASE::StringKnobSummary(); - cerr << endl; - return -1; -} /* Global Variables */ @@ -142,19 +123,14 @@ uint32_t getCid(uint32_t tid) { void EnterFastForward(); void ExitFastForward(); -VOID SimThreadStart(THREADID tid); -VOID SimThreadFini(THREADID tid); -VOID SimEnd(); - -VOID HandleMagicOp(THREADID tid, ADDRINT op); - -VOID FakeCPUIDPre(THREADID tid, REG eax, REG ecx); -VOID FakeCPUIDPost(THREADID tid, ADDRINT* eax, ADDRINT* ebx, ADDRINT* ecx, ADDRINT* edx); //REG* eax, REG* ebx, REG* ecx, REG* edx); +void SimThreadStart(THREADID tid); +void SimThreadFini(THREADID tid); +void SimEnd(); -VOID FakeRDTSCPost(THREADID tid, REG* eax, REG* edx); +void HandleMagicOp(THREADID tid, ADDRINT op); -VOID VdsoInstrument(INS ins); -VOID FFThread(VOID* arg); +void VdsoInstrument(INS ins); +void FFThread(void* arg); /* Indirect analysis calls to work around PIN's synchronization * @@ -169,27 +145,27 @@ VOID FFThread(VOID* arg); InstrFuncPtrs fPtrs[MAX_THREADS] ATTR_LINE_ALIGNED; //minimize false sharing -VOID PIN_FAST_ANALYSIS_CALL IndirectLoadSingle(THREADID tid, ADDRINT addr) { +void IndirectLoadSingle(THREADID tid, ADDRINT addr) { fPtrs[tid].loadPtr(tid, addr); } -VOID PIN_FAST_ANALYSIS_CALL IndirectStoreSingle(THREADID tid, ADDRINT addr) { +void IndirectStoreSingle(THREADID tid, ADDRINT addr) { fPtrs[tid].storePtr(tid, addr); } -VOID PIN_FAST_ANALYSIS_CALL IndirectBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { +void IndirectBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { fPtrs[tid].bblPtr(tid, bblAddr, bblInfo); } -VOID PIN_FAST_ANALYSIS_CALL IndirectRecordBranch(THREADID tid, ADDRINT branchPc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) { +void IndirectRecordBranch(THREADID tid, ADDRINT branchPc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) { fPtrs[tid].branchPtr(tid, branchPc, taken, takenNpc, notTakenNpc); } -VOID PIN_FAST_ANALYSIS_CALL IndirectPredLoadSingle(THREADID tid, ADDRINT addr, BOOL pred) { +void IndirectPredLoadSingle(THREADID tid, ADDRINT addr, BOOL pred) { fPtrs[tid].predLoadPtr(tid, addr, pred); } -VOID PIN_FAST_ANALYSIS_CALL IndirectPredStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) { +void IndirectPredStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) { fPtrs[tid].predStorePtr(tid, addr, pred); } @@ -211,44 +187,44 @@ void Join(uint32_t tid) { fPtrs[tid] = cores[tid]->GetFuncPtrs(); //back to normal pointers } -VOID JoinAndLoadSingle(THREADID tid, ADDRINT addr) { +void JoinAndLoadSingle(THREADID tid, ADDRINT addr) { Join(tid); fPtrs[tid].loadPtr(tid, addr); } -VOID JoinAndStoreSingle(THREADID tid, ADDRINT addr) { +void JoinAndStoreSingle(THREADID tid, ADDRINT addr) { Join(tid); fPtrs[tid].storePtr(tid, addr); } -VOID JoinAndBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { +void JoinAndBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { Join(tid); fPtrs[tid].bblPtr(tid, bblAddr, bblInfo); } -VOID JoinAndRecordBranch(THREADID tid, ADDRINT branchPc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) { +void JoinAndRecordBranch(THREADID tid, ADDRINT branchPc, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) { Join(tid); fPtrs[tid].branchPtr(tid, branchPc, taken, takenNpc, notTakenNpc); } -VOID JoinAndPredLoadSingle(THREADID tid, ADDRINT addr, BOOL pred) { +void JoinAndPredLoadSingle(THREADID tid, ADDRINT addr, BOOL pred) { Join(tid); fPtrs[tid].predLoadPtr(tid, addr, pred); } -VOID JoinAndPredStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) { +void JoinAndPredStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) { Join(tid); fPtrs[tid].predStorePtr(tid, addr, pred); } // NOP variants: Do nothing -VOID NOPLoadStoreSingle(THREADID tid, ADDRINT addr) {} -VOID NOPBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) {} -VOID NOPRecordBranch(THREADID tid, ADDRINT addr, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) {} -VOID NOPPredLoadStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) {} +void NOPLoadStoreSingle(THREADID tid, ADDRINT addr) {} +void NOPBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) {} +void NOPRecordBranch(THREADID tid, ADDRINT addr, BOOL taken, ADDRINT takenNpc, ADDRINT notTakenNpc) {} +void NOPPredLoadStoreSingle(THREADID tid, ADDRINT addr, BOOL pred) {} // FF is basically NOP except for basic blocks -VOID FFBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { +void FFBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { if (unlikely(!procTreeNode->isInFastForward())) { SimThreadStart(tid); } @@ -282,7 +258,7 @@ static uint64_t* ffiPrevFFStartInstrs; static const InstrFuncPtrs& GetFFPtrs(); -VOID FFITrackNFFInterval() { +void FFITrackNFFInterval() { assert(!procTreeNode->isInFastForward()); assert(ffiInstrsDone < ffiInstrsLimit); //unless you have ~10-instr FFWds, this does not happen @@ -309,7 +285,7 @@ VOID FFITrackNFFInterval() { } // Called on process start -VOID FFIInit() { +void FFIInit() { const g_vector& ffiPoints = procTreeNode->getFFIPoints(); if (!ffiPoints.empty()) { if (zinfo->ffReinstrument) panic("FFI and reinstrumenting on FF switches are incompatible"); @@ -329,7 +305,7 @@ VOID FFIInit() { } //Set the next ffiPoint, or finish -VOID FFIAdvance() { +void FFIAdvance() { const g_vector& ffiPoints = procTreeNode->getFFIPoints(); ffiPoint++; if (ffiPoint >= ffiPoints.size()) { @@ -341,7 +317,7 @@ VOID FFIAdvance() { } } -VOID FFIBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { +void FFIBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { ffiInstrsDone += bblInfo->instrs; if (unlikely(ffiInstrsDone >= ffiInstrsLimit)) { FFIAdvance(); @@ -357,7 +333,7 @@ VOID FFIBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { } // One-off, called after we go from NFF to FF -VOID FFIEntryBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { +void FFIEntryBasicBlock(THREADID tid, ADDRINT bblAddr, BblInfo* bblInfo) { ffiInstrsDone += *ffiFFStartInstrs - *ffiPrevFFStartInstrs; //add all instructions executed in the NFF phase FFIAdvance(); assert(ffiNFF); @@ -385,10 +361,6 @@ void EnterFastForward() { procTreeNode->enterFastForward(); __sync_synchronize(); //Make change globally visible - //Re-instrument; VM/client lock are not needed - if (zinfo->ffReinstrument) { - PIN_RemoveInstrumentation(); - } //Transition to FF; we have the ff lock, this should be safe with end of phase code. This avoids profiling the end of a simulation as bound time //NOTE: Does not work well with multiprocess runs zinfo->profSimTime->transition(PROF_FF); @@ -398,15 +370,8 @@ void EnterFastForward() { void ExitFastForward() { assert(procTreeNode->isInFastForward()); - VirtCaptureClocks(true /*exiting ffwd*/); - procTreeNode->exitFastForward(); __sync_synchronize(); //make change globally visible - - //Re-instrument; VM/client lock are not needed - if (zinfo->ffReinstrument) { - PIN_RemoveInstrumentation(); - } } @@ -414,9 +379,9 @@ void ExitFastForward() { //Termination volatile uint32_t perProcessEndFlag; -VOID SimEnd(); +void SimEnd(); -VOID CheckForTermination() { +void CheckForTermination() { assert(zinfo->terminationConditionMet == false); if (zinfo->maxPhases && zinfo->numPhases >= zinfo->maxPhases) { zinfo->terminationConditionMet = true; @@ -472,7 +437,7 @@ VOID CheckForTermination() { /* This is called by the scheduler at the end of a phase. At that point, zinfo->numPhases * has not incremented, so it denotes the END of the current phase */ -VOID EndOfPhaseActions() { +void EndOfPhaseActions() { zinfo->profSimTime->transition(PROF_WEAVE); if (zinfo->globalPauseFlag) { info("Simulation entering global pause"); @@ -538,47 +503,40 @@ static void PrintIp(THREADID tid, ADDRINT ip) { } #endif -VOID Instruction(INS ins) { +void PrepareNextInstruction(THREADID tid, INS ins, ADDRINT instAddr, struct BasicBlockLoadStore *loadStore, + struct BranchInformation *branchInfo) { //Uncomment to print an instruction trace //INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)PrintIp, IARG_THREAD_ID, IARG_REG_VALUE, REG_INST_PTR, IARG_END); if (!procTreeNode->isInFastForward() || !zinfo->ffReinstrument) { - AFUNPTR LoadFuncPtr = (AFUNPTR) IndirectLoadSingle; - AFUNPTR StoreFuncPtr = (AFUNPTR) IndirectStoreSingle; + void (*LoadStoreFuncPtr)(THREADID, ADDRINT) = nullptr; - AFUNPTR PredLoadFuncPtr = (AFUNPTR) IndirectPredLoadSingle; - AFUNPTR PredStoreFuncPtr = (AFUNPTR) IndirectPredStoreSingle; - - if (INS_IsMemoryRead(ins)) { - if (!INS_IsPredicated(ins)) { - INS_InsertCall(ins, IPOINT_BEFORE, LoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD_EA, IARG_END); - } else { - INS_InsertCall(ins, IPOINT_BEFORE, PredLoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD_EA, IARG_EXECUTING, IARG_END); - } + /* TODO: atomic instructions */ + bool isLoad = Decoder::riscvInsIsLoad(ins); + bool isStore = Decoder::riscvInsIsStore(ins); + if (isLoad) { + LoadStoreFuncPtr = IndirectLoadSingle; } - - if (INS_HasMemoryRead2(ins)) { - if (!INS_IsPredicated(ins)) { - INS_InsertCall(ins, IPOINT_BEFORE, LoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD2_EA, IARG_END); - } else { - INS_InsertCall(ins, IPOINT_BEFORE, PredLoadFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYREAD2_EA, IARG_EXECUTING, IARG_END); - } + if (isStore) { + LoadStoreFuncPtr = IndirectStoreSingle; } - - if (INS_IsMemoryWrite(ins)) { - if (!INS_IsPredicated(ins)) { - INS_InsertCall(ins, IPOINT_BEFORE, StoreFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYWRITE_EA, IARG_END); - } else { - INS_InsertCall(ins, IPOINT_BEFORE, PredStoreFuncPtr, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, IARG_MEMORYWRITE_EA, IARG_EXECUTING, IARG_END); + if (isLoad || isStore) { + struct BasicBlockLoadStore *loadStoreList = loadStore; + while (loadStoreList != nullptr) { + assert(loadStoreList->entryValid); + LoadStoreFuncPtr(tid, loadStoreList->addr1); + loadStoreList = loadStoreList->next; } } - // Instrument only conditional branches - // IARG_BRANCH_TARGET_ADDR is invalid in some cases, such as far-call and XEND. - if (INS_Category(ins) == XED_CATEGORY_COND_BR - && !(INS_IsFarCall(ins) || INS_IsXbegin(ins) || INS_IsXend(ins))) { - INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) IndirectRecordBranch, IARG_FAST_ANALYSIS_CALL, IARG_THREAD_ID, - IARG_INST_PTR, IARG_BRANCH_TAKEN, IARG_BRANCH_TARGET_ADDR, IARG_FALLTHROUGH_ADDR, IARG_END); + if (Decoder::riscvInsIsBranch(ins)) { + uint8_t firstTwoBits = ins & 0x03; + uint8_t nextPcAdd = 2; + if (firstTwoBits == 0x03) { + nextPcAdd = 4; + } + IndirectRecordBranch(tid, instAddr, branchInfo->branchTaken, + branchInfo->branchTakenNpc, instAddr + nextPcAdd); } } @@ -587,250 +545,63 @@ VOID Instruction(INS ins) { * is never emitted by any x86 compiler, as they use other (recommended) nop * instructions or sequences. */ - if (INS_IsXchg(ins) && INS_OperandReg(ins, 0) == REG_RCX && INS_OperandReg(ins, 1) == REG_RCX) { - //info("Instrumenting magic op"); - INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) HandleMagicOp, IARG_THREAD_ID, IARG_REG_VALUE, REG_ECX, IARG_END); - } - - if (INS_Opcode(ins) == XED_ICLASS_CPUID) { - INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) FakeCPUIDPre, IARG_THREAD_ID, IARG_REG_VALUE, REG_EAX, IARG_REG_VALUE, REG_ECX, IARG_END); - INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR) FakeCPUIDPost, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_EAX, - IARG_REG_REFERENCE, REG_EBX, IARG_REG_REFERENCE, REG_ECX, IARG_REG_REFERENCE, REG_EDX, IARG_END); - } - - if (INS_IsRDTSC(ins)) { - //No pre; note that this also instruments RDTSCP - INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR) FakeRDTSCPost, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_EAX, IARG_REG_REFERENCE, REG_EDX, IARG_END); - } - - //Must run for every instruction - VdsoInstrument(ins); -} - - -VOID Trace(TRACE trace, VOID *v) { - if (!procTreeNode->isInFastForward() || !zinfo->ffReinstrument) { - // Visit every basic block in the trace - for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl)) { - BblInfo* bblInfo = Decoder::decodeBbl(bbl, zinfo->oooDecode); - BBL_InsertCall(bbl, IPOINT_BEFORE /*could do IPOINT_ANYWHERE if we redid load and store simulation in OOO*/, (AFUNPTR)IndirectBasicBlock, IARG_FAST_ANALYSIS_CALL, - IARG_THREAD_ID, IARG_ADDRINT, BBL_Address(bbl), IARG_PTR, bblInfo, IARG_END); - } - } - - //Instruction instrumentation now here to ensure proper ordering - for (BBL bbl = TRACE_BblHead(trace); BBL_Valid(bbl); bbl = BBL_Next(bbl)) { - for (INS ins = BBL_InsHead(bbl); INS_Valid(ins); ins = INS_Next(ins)) { - Instruction(ins); - } - } -} - -/***** vDSO instrumentation and patching code *****/ - -// Helper function to find section address -// adapted from http://outflux.net/aslr/aslr.c -struct Section { - uintptr_t start; - uintptr_t end; -}; - -static Section FindSection(const char* sec) { - /* locate the vdso from the maps file */ - char buf[129]; - buf[128] = '\0'; - FILE * fp = fopen("/proc/self/maps", "r"); - Section res = {0x0, 0x0}; - if (fp) { - while (fgets(buf, 128, fp)) { - if (strstr(buf, sec)) { - char * dash = strchr(buf, '-'); - if (dash) { - *dash='\0'; - res.start = strtoul(buf, nullptr, 16); - res.end = strtoul(dash+1, nullptr, 16); - } - } - } - } - - //Uncomment to print maps - //fseek(fp, 0, SEEK_SET); - //while (fgets(buf, 128, fp)) info("%s", buf); - return res; -} - -// Initialization code and global per-process data - -enum VdsoFunc {VF_CLOCK_GETTIME, VF_GETTIMEOFDAY, VF_TIME, VF_GETCPU}; - -static std::unordered_map vdsoEntryMap; -static uintptr_t vdsoStart; -static uintptr_t vdsoEnd; - -//Used to warn -static uintptr_t vsyscallStart; -static uintptr_t vsyscallEnd; -static bool vsyscallWarned = false; - -// Helper function from parse_vsdo.cpp -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void *vdso_sym(const char *version, const char *name); - -void VdsoInsertFunc(const char* fName, VdsoFunc func) { - ADDRINT vdsoFuncAddr = (ADDRINT) vdso_sym("LINUX_2.6", fName); - if (vdsoFuncAddr == 0) { - warn("Did not find %s in vDSO", fName); - } else { - vdsoEntryMap[vdsoFuncAddr] = func; - } -} - -void VdsoInit() { - Section vdso = FindSection("vdso"); - vdsoStart = vdso.start; - vdsoEnd = vdso.end; - - if (!vdsoEnd) { - // Non-fatal, but should not happen --- even static binaries get vDSO AFAIK - warn("vDSO not found"); - return; - } - - vdso_init_from_sysinfo_ehdr(vdsoStart); - - VdsoInsertFunc("clock_gettime", VF_CLOCK_GETTIME); - VdsoInsertFunc("__vdso_clock_gettime", VF_CLOCK_GETTIME); - - VdsoInsertFunc("gettimeofday", VF_GETTIMEOFDAY); - VdsoInsertFunc("__vdso_gettimeofday", VF_GETTIMEOFDAY); - - VdsoInsertFunc("time", VF_TIME); - VdsoInsertFunc("__vdso_time", VF_TIME); - - VdsoInsertFunc("getcpu", VF_GETCPU); - VdsoInsertFunc("__vdso_getcpu", VF_GETCPU); - - info("vDSO info initialized"); - - Section vsyscall = FindSection("vsyscall"); - vsyscallStart = vsyscall.start; - vsyscallEnd = vsyscall.end; - // Could happen in the future when vsyscall is phased out, kill the warn then - if (!vsyscallEnd) warn("vsyscall page not found"); + // if (INS_IsXchg(ins) && INS_OperandReg(ins, 0) == REG_RCX && INS_OperandReg(ins, 1) == REG_RCX) { + // //info("Instrumenting magic op"); + // INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) HandleMagicOp, IARG_THREAD_ID, IARG_REG_VALUE, REG_ECX, IARG_END); + // } + + // if (INS_Opcode(ins) == XED_ICLASS_CPUID) { + // INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) FakeCPUIDPre, IARG_THREAD_ID, IARG_REG_VALUE, REG_EAX, IARG_REG_VALUE, REG_ECX, IARG_END); + // INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR) FakeCPUIDPost, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_EAX, + // IARG_REG_REFERENCE, REG_EBX, IARG_REG_REFERENCE, REG_ECX, IARG_REG_REFERENCE, REG_EDX, IARG_END); + // } + + // if (INS_IsRDTSC(ins)) { + // //No pre; note that this also instruments RDTSCP + // INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR) FakeRDTSCPost, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_EAX, IARG_REG_REFERENCE, REG_EDX, IARG_END); + // } } -// Register hooks to intercept and virtualize time-related vsyscalls and vdso syscalls, as they do not show up as syscalls! -// NOTE: getcpu is also a VDSO syscall, but is not patched for now - -// Per-thread VDSO data -struct VdsoPatchData { - // Input arguments --- must save them because they are not caller-saved - // Careful: REG is 32 bits; PIN_REGISTER, which is the actual type of the - // pointer, is 64 bits but opaque. We just use ADDRINT, it works - ADDRINT arg0, arg1; - VdsoFunc func; - uint32_t level; // if 0, invalid. Used for VDSO-internal calls -}; -VdsoPatchData vdsoPatchData[MAX_THREADS]; +static std::atomic activeThreads[MAX_THREADS]; // set in ThreadStart, reset in ThreadFini, we need this for exec() (see FollowChild) +#ifdef HARD_CODED_TRACE_TEST +static std::vector> queuePerThread; +static std::vector> queueMutexPerThread; +static std::vector> queueHasDataPerThread; +#endif -// Analysis functions +void ThreadStart(THREADID tid); -VOID VdsoEntryPoint(THREADID tid, uint32_t func, ADDRINT arg0, ADDRINT arg1) { - if (vdsoPatchData[tid].level) { - // common, in Ubuntu 11.10 several vdso functions jump back to the callpoint - // info("vDSO function (%d) called from vdso (%d), level %d, skipping", func, vdsoPatchData[tid].func, vdsoPatchData[tid].level); - } else { - vdsoPatchData[tid].arg0 = arg0; - vdsoPatchData[tid].arg1 = arg1; - vdsoPatchData[tid].func = (VdsoFunc)func; - vdsoPatchData[tid].level++; - } -} +void TraceThreadInit(std::vector &threads, int which) { +#ifdef HARD_CODED_TRACE_TEST + queuePerThread.emplace_back(); -VOID VdsoCallPoint(THREADID tid) { - assert(vdsoPatchData[tid].level); - vdsoPatchData[tid].level++; - // info("vDSO internal callpoint, now level %d", vdsoPatchData[tid].level); //common -} + auto mutexPtr = std::make_unique(); + queueMutexPerThread.push_back(std::move(mutexPtr)); -VOID VdsoRetPoint(THREADID tid, REG* raxPtr) { - if (vdsoPatchData[tid].level == 0) { - warn("vDSO return without matching call --- did we instrument all the functions?"); - return; - } - vdsoPatchData[tid].level--; - if (vdsoPatchData[tid].level) { - // info("vDSO return post level %d, skipping ret handling", vdsoPatchData[tid].level); //common - return; - } - if (fPtrs[tid].type != FPTR_NOP || vdsoPatchData[tid].func == VF_GETCPU) { - // info("vDSO patching for func %d", vdsoPatchData[tid].func); // common - ADDRINT arg0 = vdsoPatchData[tid].arg0; - ADDRINT arg1 = vdsoPatchData[tid].arg1; - switch (vdsoPatchData[tid].func) { - case VF_CLOCK_GETTIME: - VirtClockGettime(tid, arg0, arg1); - break; - case VF_GETTIMEOFDAY: - VirtGettimeofday(tid, arg0); - break; - case VF_TIME: - VirtTime(tid, raxPtr, arg0); - break; - case VF_GETCPU: - { - uint32_t cpu = cpuenumCpu(procIdx, getCid(tid)); - VirtGetcpu(tid, cpu, arg0, arg1); - } - break; - default: - panic("vDSO garbled func %d", vdsoPatchData[tid].func); - } - } + auto cvPtr = std::make_unique(); + queueHasDataPerThread.push_back(std::move(cvPtr)); +#endif + threads.emplace_back(ThreadStart, which); + while (!activeThreads[0].load()); } -// Instrumentation function, called for EVERY instruction -VOID VdsoInstrument(INS ins) { - ADDRINT insAddr = INS_Address(ins); - if (unlikely(insAddr >= vdsoStart && insAddr < vdsoEnd)) { - if (vdsoEntryMap.find(insAddr) != vdsoEntryMap.end()) { - VdsoFunc func = vdsoEntryMap[insAddr]; - INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) VdsoEntryPoint, IARG_THREAD_ID, IARG_UINT32, (uint32_t)func, IARG_REG_VALUE, REG_RDI, IARG_REG_VALUE, REG_RSI, IARG_END); - } else if (INS_IsCall(ins)) { - INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) VdsoCallPoint, IARG_THREAD_ID, IARG_END); - } else if (INS_IsRet(ins)) { - INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR) VdsoRetPoint, IARG_THREAD_ID, IARG_REG_REFERENCE, REG_RAX /* return val */, IARG_END); - } - } - - //Warn on the first vsyscall code translation - if (unlikely(insAddr >= vsyscallStart && insAddr < vsyscallEnd && !vsyscallWarned)) { - warn("Instrumenting vsyscall page code --- this process executes vsyscalls, which zsim does not virtualize!"); - vsyscallWarned = true; +#ifdef HARD_CODED_TRACE_TEST +void Trace(THREADID tid, struct FrontendTrace trace) { + { + std::unique_lock lock(*queueMutexPerThread[tid]); + queuePerThread[tid].push(trace); } + queueHasDataPerThread[tid]->notify_one(); } +#endif /* ===================================================================== */ - -bool activeThreads[MAX_THREADS]; // set in ThreadStart, reset in ThreadFini, we need this for exec() (see FollowChild) -bool inSyscall[MAX_THREADS]; // set in SyscallEnter, reset in SyscallExit, regardless of state. We MAY need this for ContextChange - -uint32_t CountActiveThreads() { - // Finish all threads in this process w.r.t. the global scheduler - uint32_t activeCount = 0; - for (uint32_t i = 0; i < MAX_THREADS; i++) { - if (activeThreads[i]) activeCount++; - } - return activeCount; -} - void SimThreadStart(THREADID tid) { info("Thread %d starting", tid); if (tid > MAX_THREADS) panic("tid > MAX_THREADS"); zinfo->sched->start(procIdx, tid, procTreeNode->getMask()); - activeThreads[tid] = true; + activeThreads[tid].store(true); //Pinning #if 0 @@ -852,7 +623,20 @@ void SimThreadStart(THREADID tid) { clearCid(tid); //just in case, set an invalid cid } -VOID ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, VOID *v) { +void ThreadFini(THREADID tid) { + //NOTE: Thread has no valid cid here! + if (fPtrs[tid].type == FPTR_NOP) { + info("Shadow/NOP thread %d finished", tid); + return; + } else { + SimThreadFini(tid); + info("Thread %d finished", tid); + } +} + +void ThreadStart(THREADID tid) { + IPCHandler ipcHandler(tid); + /* This should only fire for the first thread; I know this is a callback, * everything is serialized etc; that's the point, we block everything. * It's here and not in main() because that way the auxiliary threads can @@ -876,130 +660,74 @@ VOID ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, VOID *v) { //Start normal thread SimThreadStart(tid); } -} - -VOID SimThreadFini(THREADID tid) { - // zinfo->sched->leave(); //exit syscall (SyscallEnter) already leaves - zinfo->sched->finish(procIdx, tid); - activeThreads[tid] = false; - cids[tid] = UNINITIALIZED_CID; //clear this cid, it might get reused -} - -VOID ThreadFini(THREADID tid, const CONTEXT *ctxt, INT32 flags, VOID *v) { - //NOTE: Thread has no valid cid here! - if (fPtrs[tid].type == FPTR_NOP) { - info("Shadow/NOP thread %d finished", tid); - return; - } else { - SimThreadFini(tid); - info("Thread %d finished", tid); - } -} -//Need to remove ourselves from running threads in case the syscall is blocking -VOID SyscallEnter(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, VOID *v) { - bool isNopThread = fPtrs[tid].type == FPTR_NOP; - bool isRetryThread = fPtrs[tid].type == FPTR_RETRY; - - if (!isRetryThread) { - VirtSyscallEnter(tid, ctxt, std, procTreeNode->getPatchRoot(), isNopThread); - } + ipcHandler.waitAccept(); - assert(!inSyscall[tid]); inSyscall[tid] = true; - - if (isNopThread || isRetryThread) return; - - /* NOTE: It is possible that we take 2 syscalls back to back with any - * intervening instrumentation, so we need to check. In that case, this is - * treated as a single syscall scheduling-wise (no second leave without - * join). - */ - if (fPtrs[tid].type != FPTR_JOIN && !zinfo->blockingSyscalls) { - uint32_t cid = getCid(tid); - // set an invalid cid, ours is property of the scheduler now! - clearCid(tid); - - zinfo->sched->syscallLeave(procIdx, tid, cid, PIN_GetContextReg(ctxt, REG_INST_PTR), - PIN_GetSyscallNumber(ctxt, std), PIN_GetSyscallArgument(ctxt, std, 0), - PIN_GetSyscallArgument(ctxt, std, 1)); - //zinfo->sched->leave(procIdx, tid, cid); - fPtrs[tid] = joinPtrs; // will join at the next instr point - //info("SyscallEnter %d", tid); - } -} - -VOID SyscallExit(THREADID tid, CONTEXT *ctxt, SYSCALL_STANDARD std, VOID *v) { - assert(inSyscall[tid]); inSyscall[tid] = false; - - PostPatchAction ppa = VirtSyscallExit(tid, ctxt, std); - if (ppa == PPA_USE_JOIN_PTRS) { - if (!zinfo->blockingSyscalls) { - fPtrs[tid] = joinPtrs; - } else { - fPtrs[tid] = cores[tid]->GetFuncPtrs(); //go back to normal pointers, directly + while (true) { +#ifdef HARD_CODED_TRACE_TEST + std::unique_lock lock(*queueMutexPerThread[tid]); + queueHasDataPerThread[tid]->wait(lock, [tid] { + return !queuePerThread[tid].empty() || !activeThreads[tid].load(); + }); +#endif + if (!activeThreads[tid].load()) { + break; } - } else if (ppa == PPA_USE_RETRY_PTRS) { - fPtrs[tid] = retryPtrs; - } else { - assert(ppa == PPA_NOTHING); - } - - //Avoid joining at all if we are in FF! - if (fPtrs[tid].type == FPTR_JOIN && procTreeNode->isInFastForward()) { - assert(activeThreads[tid]); - info("Thread %d entering fast-forward (from syscall exit)", tid); - //We are not in the scheduler, and have no cid assigned. So, no need to leave() - SimThreadFini(tid); - fPtrs[tid] = GetFFPtrs(); - } - - - if (zinfo->terminationConditionMet) { - info("Caught termination condition on syscall exit, exiting"); - SimEnd(); +#ifdef HARD_CODED_TRACE_TEST + while (!queuePerThread[tid].empty()) { + auto trace = queuePerThread[tid].front(); + queuePerThread[tid].pop(); +#else + auto traceBarePtr = ipcHandler.receiveTrace(); + if (traceBarePtr == nullptr) { + ThreadFini(tid); + SimEnd(); + } + auto tracePtr = std::unique_ptr(traceBarePtr); + auto &trace = *tracePtr; +#endif + if (!procTreeNode->isInFastForward() || !zinfo->ffReinstrument) { + // Visit every basic block in the trace + for (size_t i = 0; i < trace.count; i++) { + struct BasicBlock &bbl = trace.blocks[i]; + BblInfo* bblInfo = Decoder::decodeBbl(bbl, zinfo->oooDecode); + IndirectBasicBlock(tid, bbl.virtualPc, bblInfo); + } + } + + for (size_t i = 0; i < trace.count; i++) { + struct BasicBlock &bbl = trace.blocks[i]; + bbl.resetProgramIndex(); + size_t instIndex = 0; + size_t ldstCount = 0; + for (INS ins = bbl.getHeadInstruction(&instIndex); !bbl.endOfBlock(); + ins = bbl.getHeadInstruction(&instIndex)) { + struct BasicBlockLoadStore *ldstList = nullptr; + if (Decoder::riscvInsIsMemAccess(ins) && !Decoder::riscvInsIsStoreCond(ins)) { + assert(bbl.loadStore); + ldstList = &(bbl.loadStore[ldstCount++]); + } + PrepareNextInstruction(tid, ins, bbl.virtualPc + instIndex, ldstList, &bbl.branchInfo); + } + } +#ifdef HARD_CODED_TRACE_TEST + } +#endif } } -/* NOTE: We may screw up programs with frequent signals / SIG on syscall. If - * you see this warning and simulations misbehave, it's time to do some testing - * to figure out how to make syscall post-patching work in this case. - */ -VOID ContextChange(THREADID tid, CONTEXT_CHANGE_REASON reason, const CONTEXT* from, CONTEXT* to, INT32 info, VOID* v) { - const char* reasonStr = "?"; - switch (reason) { - case CONTEXT_CHANGE_REASON_FATALSIGNAL: - reasonStr = "FATAL_SIGNAL"; - break; - case CONTEXT_CHANGE_REASON_SIGNAL: - reasonStr = "SIGNAL"; - break; - case CONTEXT_CHANGE_REASON_SIGRETURN: - reasonStr = "SIGRETURN"; - break; - case CONTEXT_CHANGE_REASON_APC: - reasonStr = "APC"; - break; - case CONTEXT_CHANGE_REASON_EXCEPTION: - reasonStr = "EXCEPTION"; - break; - case CONTEXT_CHANGE_REASON_CALLBACK: - reasonStr = "CALLBACK"; - break; - } - - warn("[%d] ContextChange, reason %s, inSyscall %d", tid, reasonStr, inSyscall[tid]); - if (inSyscall[tid]) { - SyscallExit(tid, to, SYSCALL_STANDARD_IA32E_LINUX, nullptr); - } - - if (reason == CONTEXT_CHANGE_REASON_FATALSIGNAL) { - info("[%d] Fatal signal caught, finishing", tid); - zinfo->sched->queueProcessCleanup(procIdx, getpid()); //the scheduler watchdog will remove all our state when we are really dead - SimEnd(); +void SimThreadFini(THREADID tid) { +#ifdef HARD_CODED_TRACE_TEST + while (!queuePerThread[tid].empty()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } - - //If this is an issue, we might need to call syscallexit on occasion. I very much doubt it - //SyscallExit(tid, to, SYSCALL_STANDARD_IA32E_LINUX, nullptr); //NOTE: For now it is safe to do spurious syscall exits, but careful... +#endif + activeThreads[tid].store(false); +#ifdef HARD_CODED_TRACE_TEST + queueHasDataPerThread[tid]->notify_one(); +#endif + zinfo->sched->finish(procIdx, tid); + cids[tid] = UNINITIALIZED_CID; //clear this cid, it might get reused } /* Fork and exec instrumentation */ @@ -1008,92 +736,7 @@ VOID ContextChange(THREADID tid, CONTEXT_CHANGE_REASON reason, const CONTEXT* fr #define QUOTED_(x) #x #define QUOTED(x) QUOTED_(x) -// Pre-exec -BOOL FollowChild(CHILD_PROCESS childProcess, VOID * userData) { - //Finish all threads in this process w.r.t. the global scheduler - - uint32_t activeCount = CountActiveThreads(); - if (activeCount > 1) warn("exec() of a multithreaded process! (%d live threads)", activeCount); - - // You can always run process0 = { command = "ls"; startPaused = True; startFastForwarded = True; }; to avoid this - if (procIdx == 0) panic("process0 cannot exec(), it spawns globally needed internal threads (scheduler and contention); run a dummy process0 instead!"); - - //Set up Pin command - //NOTE: perProcessDir may be active, we don't care much... run in the same dir as parent process - //NOTE: we recycle our own procIdx on an exec, but fork() changed it so we need to update Pin's command line - g_vector args = zinfo->pinCmd->getPinCmdArgs(procIdx); - uint32_t numArgs = args.size(); - const char* pinArgs[numArgs]; - for (uint32_t i = 0; i < numArgs; i++) pinArgs[i] = args[i].c_str(); - CHILD_PROCESS_SetPinCommandLine(childProcess, numArgs, pinArgs); - - //As a convenience, print the command we are going to execute - const char* const* cArgv; - int cArgc; - CHILD_PROCESS_GetCommandLine(childProcess, &cArgc, &cArgv); - - std::string childCmd = cArgv[0]; - for (int i = 1; i < cArgc; i++) { - childCmd += " "; - childCmd += cArgv[i]; - } - - info("Following exec(): %s", childCmd.c_str()); - - return true; //always follow -} - -static ProcessTreeNode* forkedChildNode = nullptr; - -VOID BeforeFork(THREADID tid, const CONTEXT* ctxt, VOID * arg) { - forkedChildNode = procTreeNode->getNextChild(); - info("Thread %d forking, child procIdx=%d", tid, forkedChildNode->getProcIdx()); -} - -VOID AfterForkInParent(THREADID tid, const CONTEXT* ctxt, VOID * arg) { - forkedChildNode = nullptr; -} - -VOID AfterForkInChild(THREADID tid, const CONTEXT* ctxt, VOID * arg) { - assert(forkedChildNode); - procTreeNode = forkedChildNode; - procIdx = procTreeNode->getProcIdx(); - bool wasNotStarted = procTreeNode->notifyStart(); - assert(wasNotStarted); //it's a fork, should be new - procMask = ((uint64_t)procIdx) << (64-lineBits); - - char header[64]; - snprintf(header, sizeof(header), "[S %dF] ", procIdx); //append an F to distinguish forked from fork/exec'd - std::stringstream logfile_ss; - logfile_ss << zinfo->outputDir << "/zsim.log." << procIdx; - InitLog(header, KnobLogToFile.Value()? logfile_ss.str().c_str() : nullptr); - - info("Forked child (tid %d/%d), PID %d, parent PID %d", tid, PIN_ThreadId(), PIN_GetPid(), getppid()); - - //Initialize process-local per-thread state, even if ThreadStart does so later - for (uint32_t i = 0; i < MAX_THREADS; i++) { - fPtrs[i] = joinPtrs; - cids[i] = UNINITIALIZED_CID; - activeThreads[i] = false; - inSyscall[i] = false; - cores[i] = nullptr; - } - - //We need to launch another copy of the FF control thread - PIN_SpawnInternalThread(FFThread, nullptr, 64*1024, nullptr); - - ThreadStart(tid, nullptr, 0, nullptr); -} - -/** Finalization **/ - -VOID Fini(int code, VOID * v) { - info("Finished, code %d", code); - //NOTE: In fini, it appears that info() and writes to stdout in general won't work; warn() and stderr still work fine. - SimEnd(); -} - -VOID SimEnd() { +void SimEnd() { if (__sync_bool_compare_and_swap(&perProcessEndFlag, 0, 1) == false) { //failed, note DEPENDS ON STRONG CAS while (true) { //sleep until thread that won exits for us struct timespec tm; @@ -1146,154 +789,154 @@ VOID SimEnd() { #define ZSIM_MAGIC_OP_REGISTER_THREAD (1027) #define ZSIM_MAGIC_OP_HEARTBEAT (1028) -VOID HandleMagicOp(THREADID tid, ADDRINT op) { - switch (op) { - case ZSIM_MAGIC_OP_ROI_BEGIN: - if (!zinfo->ignoreHooks) { - //TODO: Test whether this is thread-safe - futex_lock(&zinfo->ffLock); - if (procTreeNode->isInFastForward()) { - info("ROI_BEGIN, exiting fast-forward"); - ExitFastForward(); - } else { - warn("Ignoring ROI_BEGIN magic op, not in fast-forward"); - } - futex_unlock(&zinfo->ffLock); - } - return; - case ZSIM_MAGIC_OP_ROI_END: - if (!zinfo->ignoreHooks) { - //TODO: Test whether this is thread-safe - futex_lock(&zinfo->ffLock); - if (procTreeNode->getSyncedFastForward()) { - warn("Ignoring ROI_END magic op on synced FF to avoid deadlock"); - } else if (!procTreeNode->isInFastForward()) { - info("ROI_END, entering fast-forward"); - EnterFastForward(); - //If we don't do this, we'll enter FF on the next phase. Which would be OK, except with synced FF - //we stay in the barrier forever. And deadlock. And the deadlock code does nothing, since we're in FF - //So, force immediate entry if we're sync-ffwding - if (procTreeNode->getSyncedFastForward()) { - info("Thread %d entering fast-forward (immediate)", tid); - uint32_t cid = getCid(tid); - assert(cid != INVALID_CID); - clearCid(tid); - zinfo->sched->leave(procIdx, tid, cid); - SimThreadFini(tid); - fPtrs[tid] = GetFFPtrs(); - } - } else { - warn("Ignoring ROI_END magic op, already in fast-forward"); - } - futex_unlock(&zinfo->ffLock); - } - return; - case ZSIM_MAGIC_OP_REGISTER_THREAD: - if (!zinfo->registerThreads) { - info("Thread %d: Treating REGISTER_THREAD magic op as NOP", tid); - } else { - if (fPtrs[tid].type == FPTR_NOP) { - SimThreadStart(tid); - } else { - warn("Thread %d: Treating REGISTER_THREAD magic op as NOP, thread already registered", tid); - } - } - return; - case ZSIM_MAGIC_OP_HEARTBEAT: - procTreeNode->heartbeat(); //heartbeats are per process for now - return; - - // HACK: Ubik magic ops - case 1029: - case 1030: - case 1031: - case 1032: - case 1033: - return; - default: - panic("Thread %d issued unknown magic op %ld!", tid, op); - } -} - -//CPUIID faking -static uint32_t cpuidEax[MAX_THREADS]; -static uint32_t cpuidEcx[MAX_THREADS]; - -VOID FakeCPUIDPre(THREADID tid, REG eax, REG ecx) { - //info("%d precpuid", tid); - cpuidEax[tid] = eax; - cpuidEcx[tid] = ecx; -} - -VOID FakeCPUIDPost(THREADID tid, ADDRINT* eax, ADDRINT* ebx, ADDRINT* ecx, ADDRINT* edx) { - uint32_t eaxIn = cpuidEax[tid]; - uint32_t ecxIn = cpuidEcx[tid]; - - // Point to record at same (eax,ecx) or immediately before - CpuIdRecord val = {eaxIn, ecxIn, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1}; - CpuIdRecord* pos = std::lower_bound(cpuid_core2, cpuid_core2+(sizeof(cpuid_core2)/sizeof(CpuIdRecord)), val); - if (pos->eaxIn > eaxIn) { - assert(pos > cpuid_core2); - pos--; - } - assert(pos->eaxIn <= eaxIn); - assert(pos->ecxIn <= ecxIn); - - //info("%x %x : %x %x / %x %x %x %x", eaxIn, ecxIn, pos->eaxIn, pos->ecxIn, pos->eax, pos->ebx, pos->ecx, pos->edx); - - uint32_t eaxOut = pos->eax; - uint32_t ebxOut = pos->ebx; - - // patch eax to give the number of cores - if (eaxIn == 4) { - uint32_t ncpus = cpuenumNumCpus(procIdx); - uint32_t eax3126 = ncpus - 1; - // Overflowing 6 bits? - if (zinfo->numCores > 64) eax3126 = 63; //looked into swarm2.csail (4P Westmere-EX, 80 HTs), it sets this to 63 - eaxOut = (eaxOut & ((1<<26)-1)) | (eax3126<<26); - } - - // HT siblings and APIC (core) ID (apparently used; seems Intel-specific) - if (eaxIn == 0x1) { - uint32_t cid = getCid(tid); - uint32_t cpu = cpuenumCpu(procIdx, cid); - uint32_t ncpus = cpuenumNumCpus(procIdx); - uint32_t siblings = MIN(ncpus, (uint32_t)255); - uint32_t apicId = (cpu < ncpus)? MIN(cpu, (uint32_t)255) : 0 /*not scheduled, ffwd?*/; - ebxOut = (ebxOut & 0xffff) | (siblings << 16) | (apicId << 24); - } - - //info("[%d] postcpuid, inEax 0x%x, pre 0x%lx 0x%lx 0x%lx 0x%lx", tid, eaxIn, *eax, *ebx, *ecx, *edx); - //Preserve high bits - *reinterpret_cast(eax) = eaxOut; - *reinterpret_cast(ebx) = ebxOut; - *reinterpret_cast(ecx) = pos->ecx; - *reinterpret_cast(edx) = pos->edx; - //info("[%d] postcpuid, inEax 0x%x, post 0x%lx 0x%lx 0x%lx 0x%lx", tid, eaxIn, *eax, *ebx, *ecx, *edx); -} - - -//RDTSC faking -VOID FakeRDTSCPost(THREADID tid, REG* eax, REG* edx) { - if (fPtrs[tid].type == FPTR_NOP) return; //avoid virtualizing NOP threads. - - uint32_t cid = getCid(tid); - uint64_t curCycle = VirtGetPhaseRDTSC(); - if (cid < zinfo->numCores) { - curCycle += zinfo->cores[cid]->getPhaseCycles(); - } - - uint32_t lo = (uint32_t)curCycle; - uint32_t hi = (uint32_t)(curCycle >> 32); - - assert((((uint64_t)hi) << 32) + lo == curCycle); - - //uint64_t origTSC = (((uint64_t)*edx) << 32) + (uint32_t)*eax; - //info("[t%d/c%d] Virtualizing RDTSC, pre = %x %x (%ld), post = %x %x (%ld)", tid, cid, *edx, *eax, origTSC, hi, lo, curCycle); - - *eax = (REG)lo; - *edx = (REG)hi; -} +// VOID HandleMagicOp(THREADID tid, ADDRINT op) { +// switch (op) { +// case ZSIM_MAGIC_OP_ROI_BEGIN: +// if (!zinfo->ignoreHooks) { +// //TODO: Test whether this is thread-safe +// futex_lock(&zinfo->ffLock); +// if (procTreeNode->isInFastForward()) { +// info("ROI_BEGIN, exiting fast-forward"); +// ExitFastForward(); +// } else { +// warn("Ignoring ROI_BEGIN magic op, not in fast-forward"); +// } +// futex_unlock(&zinfo->ffLock); +// } +// return; +// case ZSIM_MAGIC_OP_ROI_END: +// if (!zinfo->ignoreHooks) { +// //TODO: Test whether this is thread-safe +// futex_lock(&zinfo->ffLock); +// if (procTreeNode->getSyncedFastForward()) { +// warn("Ignoring ROI_END magic op on synced FF to avoid deadlock"); +// } else if (!procTreeNode->isInFastForward()) { +// info("ROI_END, entering fast-forward"); +// EnterFastForward(); +// //If we don't do this, we'll enter FF on the next phase. Which would be OK, except with synced FF +// //we stay in the barrier forever. And deadlock. And the deadlock code does nothing, since we're in FF +// //So, force immediate entry if we're sync-ffwding +// if (procTreeNode->getSyncedFastForward()) { +// info("Thread %d entering fast-forward (immediate)", tid); +// uint32_t cid = getCid(tid); +// assert(cid != INVALID_CID); +// clearCid(tid); +// zinfo->sched->leave(procIdx, tid, cid); +// SimThreadFini(tid); +// fPtrs[tid] = GetFFPtrs(); +// } +// } else { +// warn("Ignoring ROI_END magic op, already in fast-forward"); +// } +// futex_unlock(&zinfo->ffLock); +// } +// return; +// case ZSIM_MAGIC_OP_REGISTER_THREAD: +// if (!zinfo->registerThreads) { +// info("Thread %d: Treating REGISTER_THREAD magic op as NOP", tid); +// } else { +// if (fPtrs[tid].type == FPTR_NOP) { +// SimThreadStart(tid); +// } else { +// warn("Thread %d: Treating REGISTER_THREAD magic op as NOP, thread already registered", tid); +// } +// } +// return; +// case ZSIM_MAGIC_OP_HEARTBEAT: +// procTreeNode->heartbeat(); //heartbeats are per process for now +// return; + +// // HACK: Ubik magic ops +// case 1029: +// case 1030: +// case 1031: +// case 1032: +// case 1033: +// return; +// default: +// panic("Thread %d issued unknown magic op %ld!", tid, op); +// } +// } + +// //CPUIID faking +// static uint32_t cpuidEax[MAX_THREADS]; +// static uint32_t cpuidEcx[MAX_THREADS]; + +// VOID FakeCPUIDPre(THREADID tid, REG eax, REG ecx) { +// //info("%d precpuid", tid); +// cpuidEax[tid] = eax; +// cpuidEcx[tid] = ecx; +// } + +// VOID FakeCPUIDPost(THREADID tid, ADDRINT* eax, ADDRINT* ebx, ADDRINT* ecx, ADDRINT* edx) { +// uint32_t eaxIn = cpuidEax[tid]; +// uint32_t ecxIn = cpuidEcx[tid]; + +// // Point to record at same (eax,ecx) or immediately before +// CpuIdRecord val = {eaxIn, ecxIn, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1}; +// CpuIdRecord* pos = std::lower_bound(cpuid_core2, cpuid_core2+(sizeof(cpuid_core2)/sizeof(CpuIdRecord)), val); +// if (pos->eaxIn > eaxIn) { +// assert(pos > cpuid_core2); +// pos--; +// } +// assert(pos->eaxIn <= eaxIn); +// assert(pos->ecxIn <= ecxIn); + +// //info("%x %x : %x %x / %x %x %x %x", eaxIn, ecxIn, pos->eaxIn, pos->ecxIn, pos->eax, pos->ebx, pos->ecx, pos->edx); + +// uint32_t eaxOut = pos->eax; +// uint32_t ebxOut = pos->ebx; + +// // patch eax to give the number of cores +// if (eaxIn == 4) { +// uint32_t ncpus = cpuenumNumCpus(procIdx); +// uint32_t eax3126 = ncpus - 1; +// // Overflowing 6 bits? +// if (zinfo->numCores > 64) eax3126 = 63; //looked into swarm2.csail (4P Westmere-EX, 80 HTs), it sets this to 63 +// eaxOut = (eaxOut & ((1<<26)-1)) | (eax3126<<26); +// } + +// // HT siblings and APIC (core) ID (apparently used; seems Intel-specific) +// if (eaxIn == 0x1) { +// uint32_t cid = getCid(tid); +// uint32_t cpu = cpuenumCpu(procIdx, cid); +// uint32_t ncpus = cpuenumNumCpus(procIdx); +// uint32_t siblings = MIN(ncpus, (uint32_t)255); +// uint32_t apicId = (cpu < ncpus)? MIN(cpu, (uint32_t)255) : 0 /*not scheduled, ffwd?*/; +// ebxOut = (ebxOut & 0xffff) | (siblings << 16) | (apicId << 24); +// } + +// //info("[%d] postcpuid, inEax 0x%x, pre 0x%lx 0x%lx 0x%lx 0x%lx", tid, eaxIn, *eax, *ebx, *ecx, *edx); +// //Preserve high bits +// *reinterpret_cast(eax) = eaxOut; +// *reinterpret_cast(ebx) = ebxOut; +// *reinterpret_cast(ecx) = pos->ecx; +// *reinterpret_cast(edx) = pos->edx; +// //info("[%d] postcpuid, inEax 0x%x, post 0x%lx 0x%lx 0x%lx 0x%lx", tid, eaxIn, *eax, *ebx, *ecx, *edx); +// } + + +// //RDTSC faking +// VOID FakeRDTSCPost(THREADID tid, REG* eax, REG* edx) { +// if (fPtrs[tid].type == FPTR_NOP) return; //avoid virtualizing NOP threads. + +// uint32_t cid = getCid(tid); +// uint64_t curCycle = VirtGetPhaseRDTSC(); +// if (cid < zinfo->numCores) { +// curCycle += zinfo->cores[cid]->getPhaseCycles(); +// } + +// uint32_t lo = (uint32_t)curCycle; +// uint32_t hi = (uint32_t)(curCycle >> 32); + +// assert((((uint64_t)hi) << 32) + lo == curCycle); + +// //uint64_t origTSC = (((uint64_t)*edx) << 32) + (uint32_t)*eax; +// //info("[t%d/c%d] Virtualizing RDTSC, pre = %x %x (%ld), post = %x %x (%ld)", tid, cid, *edx, *eax, origTSC, hi, lo, curCycle); + +// *eax = (REG)lo; +// *edx = (REG)hi; +// } /* Fast-forward control */ @@ -1331,7 +974,7 @@ class SyncEvent: public Event { } }; -VOID FFThread(VOID* arg) { +void FFThread(void* arg) { futex_lock(&zinfo->ffToggleLocks[procIdx]); //initialize info("FF control Thread TID %ld", syscall(SYS_gettid)); @@ -1375,145 +1018,65 @@ VOID FFThread(VOID* arg) { panic("Should not be reached!"); } +/* linux kernel memset */ +static uint32_t bbl0_code[] = { + 0x00c286b3, 0x00b28023, 0xede30285, 0x0000fed2 +}; -/* Internal Exception Handler */ -//When firing a debugger was an easy affair, this was not an issue. Now it's not so easy, so let's try to at least capture the backtrace and print it out - -//Use unlocked output, who knows where this happens. -static EXCEPT_HANDLING_RESULT InternalExceptionHandler(THREADID tid, EXCEPTION_INFO *pExceptInfo, PHYSICAL_CONTEXT *pPhysCtxt, VOID *) { - fprintf(stderr, "%s[%d] Internal exception detected:\n", logHeader, tid); - fprintf(stderr, "%s[%d] Code: %d\n", logHeader, tid, PIN_GetExceptionCode(pExceptInfo)); - fprintf(stderr, "%s[%d] Address: 0x%lx\n", logHeader, tid, PIN_GetExceptionAddress(pExceptInfo)); - fprintf(stderr, "%s[%d] Description: %s\n", logHeader, tid, PIN_ExceptionToString(pExceptInfo).c_str()); - - ADDRINT faultyAccessAddr; - if (PIN_GetFaultyAccessAddress(pExceptInfo, &faultyAccessAddr)) { - const char* faultyAccessStr = ""; - FAULTY_ACCESS_TYPE fat = PIN_GetFaultyAccessType(pExceptInfo); - if (fat == FAULTY_ACCESS_READ) faultyAccessStr = "READ "; - else if (fat == FAULTY_ACCESS_WRITE) faultyAccessStr = "WRITE "; - else if (fat == FAULTY_ACCESS_EXECUTE) faultyAccessStr = "EXECUTE "; - - fprintf(stderr, "%s[%d] Caused by invalid %saccess to address 0x%lx\n", logHeader, tid, faultyAccessStr, faultyAccessAddr); - } - -#ifdef PIN_CRT - // With PinCRT, it seems Pin cannot backtrace to the original pintool stack from the exception handler. - // So we extract the instruction and stack pointers and manually unwind with libunwind. - - PIN_LockClient(); - fprintf(stderr, "%s[%d] Backtrace\n", logHeader, tid); - - unw_context_t ctxt; - unw_cursor_t cur; - unw_word_t ip; - unw_getcontext(&ctxt); - unw_init_local(&cur, &ctxt); - - // Restore to the original pintool stack. - unw_set_reg(&cur, UNW_REG_IP, PIN_GetPhysicalContextReg(pPhysCtxt, REG_INST_PTR)); - unw_set_reg(&cur, UNW_REG_SP, PIN_GetPhysicalContextReg(pPhysCtxt, REG_STACK_PTR)); - - // Get libzsim info. - struct LibInfo libzsimAddrs; - getLibzsimAddrs(&libzsimAddrs); - std::string libzsimFile = QUOTED(ZSIM_PATH); - - do { - // Get instruction pointer. - unw_get_reg(&cur, UNW_REG_IP, &ip); - - // We need to use relative instruction address as Pin seems to load libzsim.so in an unusual way. - ADDRINT relInstrAddr = ip - reinterpret_cast(libzsimAddrs.textAddr); - std::stringstream ss; - ss << "0x" << std::hex << relInstrAddr; - std::string relInstrAddrStr(ss.str()); - - std::string s = "Unknown instruction addr: " + libzsimFile + "+" + relInstrAddrStr; - // Get symbol. - std::string exe = -#ifdef ADDR2LINEBIN - QUOTED(ADDR2LINEBIN); -#else - "addr2line"; -#endif - std::string cmd = exe + " -f -C -e " + libzsimFile + " -j .text " + relInstrAddrStr + " 2>/dev/null"; - FILE* f = popen(cmd.c_str(), "r"); - if (f) { - char buf[1024]; - std::string func, loc; - func = fgets(buf, 1024, f); //first line is function name - loc = fgets(buf, 1024, f); //second is location - //Remove line breaks - func = func.substr(0, func.size()-1); - loc = loc.substr(0, loc.size()-1); - - int status = pclose(f); - if (status == 0) { - s = loc + " / " + func; - } - } - - fprintf(stderr, "%s[%d] %s\n", logHeader, tid, s.c_str()); - } while (unw_step(&cur) > 0); - fflush(stderr); - - PIN_UnlockClient(); -#else // PIN_CRT - void* array[40]; - size_t size = backtrace(array, 40); - char** strings = backtrace_symbols(array, size); - fprintf(stderr, "%s[%d] Backtrace (%ld/%d max frames)\n", logHeader, tid, size, 40); - for (uint32_t i = 0; i < size; i++) { - //For libzsim.so addresses, call addr2line to get symbol info (can't use -rdynamic on libzsim.so because of Pin's linker script) - //NOTE: May be system-dependent, may not handle malformed strings well. We're going to die anyway, so in for a penny, in for a pound... - std::string s = strings[i]; - uint32_t lp = s.find_first_of("("); - uint32_t cp = s.find_first_of(")"); - std::string fname = s.substr(0, lp); - std::string faddr = s.substr(lp+1, cp-(lp+1)); - if (fname.find("libzsim.so") != std::string::npos) { - std::string cmd = "addr2line -f -C -e " + fname + " " + faddr; - FILE* f = popen(cmd.c_str(), "r"); - if (f) { - char buf[1024]; - std::string func, loc; - func = fgets(buf, 1024, f); //first line is function name - loc = fgets(buf, 1024, f); //second is location - //Remove line breaks - func = func.substr(0, func.size()-1); - loc = loc.substr(0, loc.size()-1); - - int status = pclose(f); - if (status == 0) { - s = loc + " / " + func; - } - } - } - - fprintf(stderr, "%s[%d] %s\n", logHeader, tid, s.c_str()); - } - fflush(stderr); -#endif // PIN_CRT - - return EHR_CONTINUE_SEARCH; //we never solve anything at all :P +static struct BasicBlock bbl0, bbl1; +static struct FrontendTrace testTrace0, testTrace1; + +void buildTestTrace() { + bbl0.codeBytes = 14; + bbl0.code = (uint8_t *) bbl0_code; + bbl0.loadStore = (struct BasicBlockLoadStore *)malloc(4 * sizeof(struct BasicBlockLoadStore)); + bbl0.branchInfo.branchTaken = true; + bbl0.branchInfo.branchTakenNpc = 0xffffffff8090f174; + bbl0.virtualPc = 0xffffffff8090f170; + bbl0.resetProgramIndex(); + bbl0.loadStore[0].entryValid = false; + bbl0.loadStore[1].entryValid = true; + bbl0.loadStore[1].addr1 = 0xffffffff8190f170; + bbl0.loadStore[1].next = nullptr; + bbl0.loadStore[2].entryValid = false; + bbl0.loadStore[3].entryValid = false; + testTrace0.blocks = &bbl0; + testTrace0.count = 1; + + bbl1.codeBytes = 10; + bbl1.code = (uint8_t *) (bbl0_code) + 4; + bbl1.loadStore = (struct BasicBlockLoadStore *)malloc(3 * sizeof(struct BasicBlockLoadStore)); + bbl1.branchInfo.branchTaken = true; + bbl1.branchInfo.branchTakenNpc = 0xffffffff8090f174; + bbl1.virtualPc = 0xffffffff8090f174; + bbl1.resetProgramIndex(); + bbl1.loadStore[0].entryValid = true; + bbl1.loadStore[0].addr1 = 0x8190fa70; /* force a cache miss */ + bbl1.loadStore[0].next = nullptr; + bbl1.loadStore[1].entryValid = false; + bbl1.loadStore[2].entryValid = false; + testTrace1.blocks = &bbl1; + testTrace1.count = 1; } /* ===================================================================== */ int main(int argc, char *argv[]) { - PIN_InitSymbols(); - if (PIN_Init(argc, argv)) return Usage(); - - //Register an internal exception handler (ASAP, to catch segfaults in init) - PIN_AddInternalExceptionHandler(InternalExceptionHandler, nullptr); - - procIdx = KnobProcIdx.Value(); + procIdx = 0; char header[64]; snprintf(header, sizeof(header), "[S %d] ", procIdx); + + /* argv 1 output directory argv 2 log_to_file argv 3 config file */ + if (argc != 4) { + std::cout << "You must specify output directory, log to file and config file option" << std::endl; + std::cout << "Default value: ./ false zsim.cfg" << std::endl; + return 1; + } + char *outputDir = argv[1], *logToFile = argv[2], *configFile = argv[3]; + std::stringstream logfile_ss; - logfile_ss << KnobOutputDir.Value() << "/zsim.log." << procIdx; - InitLog(header, KnobLogToFile.Value()? logfile_ss.str().c_str() : nullptr); + logfile_ss << outputDir << "/zsim.log." << procIdx; + InitLog(header, !strcmp(logToFile, "log_to_file") ? logfile_ss.str().c_str() : nullptr); //If parent dies, kill us //This avoids leaving strays running in any circumstances, but may be too heavy-handed with arbitrary process hierarchies. @@ -1524,37 +1087,21 @@ int main(int argc, char *argv[]) { info("Started instance"); - //Decrease priority to avoid starving system processes (e.g. gluster) - //setpriority(PRIO_PROCESS, getpid(), 10); - //info("setpriority, new prio %d", getpriority(PRIO_PROCESS, getpid())); - - gm_attach(KnobShmid.Value()); + Config conf(configFile); + uint32_t gmSize = conf.get("sim.gmMBytes", (1<<10) /*default 1024MB*/); + info("Creating global segment, %d MBs", gmSize); + int shmid = gm_init(((size_t)gmSize) << 20 /*MB to Bytes*/); + info("Global segment shmid = %d", shmid); bool masterProcess = false; if (procIdx == 0 && !gm_isready()) { // process 0 can exec() without fork()ing first, so we must check gm_isready() to ensure we don't initialize twice masterProcess = true; - SimInit(KnobConfigFile.Value().c_str(), KnobOutputDir.Value().c_str(), KnobShmid.Value()); + SimInit(configFile, outputDir, 0); } else { while (!gm_isready()) usleep(1000); // wait till proc idx 0 initializes everything zinfo = static_cast(gm_get_glob_ptr()); } - //If assertion below fails, use this to print maps -#if 0 - futex_lock(&zinfo->ffLock); //whatever lock, just don't interleave - std::ifstream infile("/proc/self/maps"); - std::string line; - while (std::getline(infile, line)) info(" %s", line.c_str()); - futex_unlock(&zinfo->ffLock); - usleep(100000); -#endif - //LibzsimAddrs sanity check: Ensure that they match across processes - struct LibInfo libzsimAddrs; - getLibzsimAddrs(&libzsimAddrs); - if (memcmp(&libzsimAddrs, &zinfo->libzsimAddrs, sizeof(libzsimAddrs)) != 0) { - panic("libzsim.so address mismatch! text: %p != %p. Perform loader injection to homogenize offsets!", libzsimAddrs.textAddr, zinfo->libzsimAddrs.textAddr); - } - //Attach to debugger if needed (master process does so in SimInit, to be able to debug initialization) //NOTE: Pin fails to follow exec()'s when gdb is attached. The simplest way to avoid it is to kill the debugger manually before an exec(). If this is common, we could automate it if (!masterProcess && zinfo->attachDebugger) { @@ -1586,61 +1133,42 @@ int main(int argc, char *argv[]) { info("Started process, PID %d", getpid()); //NOTE: external scripts expect this line, please do not change without checking first - //Unless things change substantially, keep this disabled; it causes higher imbalance and doesn't solve large system time with lots of processes. - //Affinity testing code - /*cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(procIdx % 8, &cpuset); - int result = sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpuset); - info("Affinity result %d", result);*/ - info("procMask: 0x%lx", procMask); if (zinfo->sched) zinfo->sched->processCleanup(procIdx); - VirtCaptureClocks(false); FFIInit(); - VirtInit(); - - //Register instrumentation - TRACE_AddInstrumentFunction(Trace, 0); - VdsoInit(); //initialized vDSO patching information (e.g., where all the possible vDSO entry points are) - - PIN_AddThreadStartFunction(ThreadStart, 0); - PIN_AddThreadFiniFunction(ThreadFini, 0); - - PIN_AddSyscallEntryFunction(SyscallEnter, 0); - PIN_AddSyscallExitFunction(SyscallExit, 0); - PIN_AddContextChangeFunction(ContextChange, 0); - - PIN_AddFiniFunction(Fini, 0); - - //Follow exec and fork - PIN_AddFollowChildProcessFunction(FollowChild, 0); - PIN_AddForkFunction(FPOINT_BEFORE, BeforeFork, 0); - PIN_AddForkFunction(FPOINT_AFTER_IN_PARENT, AfterForkInParent, 0); - PIN_AddForkFunction(FPOINT_AFTER_IN_CHILD, AfterForkInChild, 0); - - //FFwd control - //OK, screw it. Launch this on a separate thread, and forget about signals... the caller will set a shared memory var. PIN is hopeless with signal instrumentation on multithreaded processes! - PIN_SpawnInternalThread(FFThread, nullptr, 64*1024, nullptr); - - // Start trace-driven or exec-driven sim - if (zinfo->traceDriven) { - info("Running trace-driven simulation"); - while (!zinfo->terminationConditionMet && zinfo->traceDriver->executePhase()) { - // info("Phase done"); - EndOfPhaseActions(); - zinfo->numPhases++; - zinfo->globPhaseCycles += zinfo->phaseLength; - } - info("Finished trace-driven simulation"); - SimEnd(); - } else { - // Never returns - PIN_StartProgram(); + std::vector threads; + threads.emplace_back(FFThread, nullptr); + + TraceThreadInit(threads, 0); + +#ifdef HARD_CODED_TRACE_TEST + buildTestTrace(); + Trace(0, testTrace0); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + Trace(0, testTrace1); + EndOfPhaseActions(); +#else + for (auto &thd: threads) { + thd.join(); } +#endif return 0; } diff --git a/src/zsim.h b/src/zsim.h index f100203..f0d7419 100644 --- a/src/zsim.h +++ b/src/zsim.h @@ -45,7 +45,6 @@ class EventQueue; class ContentionSim; class EventRecorder; class MemInterconnectEventRecorder; -class PinCmd; class PortVirtualizer; class VectorCounter; class AccessTraceWriter; @@ -126,7 +125,6 @@ struct GlobSimInfo { PAD(); ClockDomainInfo clockDomainInfo[MAX_CLOCK_DOMAINS]; - PortVirtualizer* portVirt[MAX_PORT_DOMAINS]; lock_t ffLock; //global, grabbed in all ff entry/exit ops. @@ -157,8 +155,6 @@ struct GlobSimInfo { uint32_t numProcs; uint32_t numProcGroups; - PinCmd* pinCmd; //enables calls to exec() to modify Pin's calling arguments, see zsim.cpp - // If true, threads start as shadow and have no effect on simulation until they call the register magic op bool registerThreads; @@ -172,8 +168,6 @@ struct GlobSimInfo { int harnessPid; //used for debugging purposes int debugPortId; - struct LibInfo libzsimAddrs; - bool ffReinstrument; //true if we should reinstrument on ffwd, works fine with ST apps and it's faster since we run with basically no instrumentation, but it's not precise with MT apps //fftoggle stuff diff --git a/src/zsim_harness.cpp b/src/zsim_harness.cpp deleted file mode 100644 index 91d355b..0000000 --- a/src/zsim_harness.cpp +++ /dev/null @@ -1,507 +0,0 @@ -/** $lic$ - * Copyright (C) 2012-2015 by Massachusetts Institute of Technology - * Copyright (C) 2010-2013 by The Board of Trustees of Stanford University - * - * This file is part of zsim. - * - * zsim is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License as published by the Free Software - * Foundation, version 2. - * - * If you use this software in your research, we request that you reference - * the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of - * Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the - * source of the simulator in any publications that use this software, and that - * you send us a citation of your work. - * - * zsim is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -/* ZSim master process. Handles global heap creation, configuration, launching - * slave pin processes, coordinating and terminating runs, and stats printing. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "config.h" -#include "constants.h" -#include "debug_harness.h" -#include "galloc.h" -#include "log.h" -#include "pin_cmd.h" -#include "version.h" //autogenerated, in build dir, see SConstruct -#include "zsim.h" - -/* Globals */ - -typedef enum { - OK, - GRACEFUL_TERMINATION, - KILL_EM_ALL, -} TerminationStatus; - -TerminationStatus termStatus = OK; - -typedef enum { - PS_INVALID, - PS_RUNNING, - PS_DONE, -} ProcStatus; - -struct ProcInfo { - int pid; - volatile ProcStatus status; -}; - -//At most as many processes as threads, plus one extra process per child if we launch a debugger -#define MAX_CHILDREN (2*MAX_THREADS) -ProcInfo childInfo[MAX_CHILDREN]; - -volatile uint32_t debuggerChildIdx = MAX_THREADS; - -GlobSimInfo* globzinfo = nullptr; //used very sparingly, only in sig handlers. Should probably promote to a global like in zsim processes. - -bool perProcessDir, aslr; - -PinCmd* pinCmd; - -/* Defs & helper functions */ - -// Posix shell command expansion. #include -extern "C" { -typedef struct { - size_t we_wordc; - char **we_wordv; - size_t we_offs; -} wordexp_t; -extern int wordexp(const char *s, wordexp_t *p, int flags); -extern void wordfree(wordexp_t *p); -} - -g_vector wordexpfunc(const char *s) { - g_vector words; - wordexp_t p; - wordexp(s, &p, 0); - for (uint32_t i = 0; i < p.we_wordc; i++) { - words.push_back(g_string(p.we_wordv[i])); - } - wordfree(&p); - return words; -} - -void LaunchProcess(uint32_t procIdx); - -int getNumChildren() { - int num = 0; - for (int i = 0; i < MAX_CHILDREN; i++) { - if (childInfo[i].status == PS_RUNNING) num++; - } - return num; -} - -int eraseChild(int pid) { - for (int i = 0; i < MAX_CHILDREN; i++) { - if (childInfo[i].pid == pid) { - assert_msg(childInfo[i].status == PS_RUNNING, "i=%d pid=%d status=%d", i, pid, childInfo[i].status); - childInfo[i].status = PS_DONE; - return i; - } - } - panic("Could not erase child!!"); -} - -/* Signal handlers */ - -void chldSigHandler(int sig) { - assert(sig == SIGCHLD); - int status; - int cpid; - while ((cpid = waitpid(-1, &status, WNOHANG)) > 0) { - int idx = eraseChild(cpid); - if (idx < MAX_THREADS) { - info("Child %d done", cpid); - int exitCode = WIFEXITED(status)? WEXITSTATUS(status) : 0; - if (exitCode == PANIC_EXIT_CODE) { - panic("Child issued a panic, killing simulation"); - } - //Stricter check: See if notifyEnd was called (i.e. zsim caught this termination) - //Only works for direct children though - if (globzinfo && !globzinfo->procExited[idx]) { - panic("Child %d (idx %d) exit was anomalous, killing simulation", cpid, idx); - } - - if (globzinfo && globzinfo->procExited[idx] == PROC_RESTARTME) { - info("Restarting procIdx %d", idx); - globzinfo->procExited[idx] = PROC_RUNNING; - LaunchProcess(idx); - } - } else { - info("Child %d done (debugger)", cpid); - } - } -} - -void sigHandler(int sig) { - if (termStatus == KILL_EM_ALL) return; //a kill was already issued, avoid infinite recursion - - switch (sig) { - case SIGSEGV: - warn("Segmentation fault"); - termStatus = KILL_EM_ALL; - break; - case SIGINT: - info("Received interrupt"); - termStatus = (termStatus == OK)? GRACEFUL_TERMINATION : KILL_EM_ALL; - break; - case SIGTERM: - info("Received SIGTERM"); - termStatus = KILL_EM_ALL; - break; - default: - warn("Received signal %d", sig); - termStatus = KILL_EM_ALL; - } - - if (termStatus == KILL_EM_ALL) { - warn("Hard death, killing the whole process tree"); - kill(-getpid(), SIGKILL); - //Exit, we have already killed everything, there should be no strays - panic("SIGKILLs sent -- exiting"); - } else { - info("Attempting graceful termination"); - for (int i = 0; i < MAX_CHILDREN; i++) { - int cpid = childInfo[i].pid; - if (childInfo[i].status == PS_RUNNING) { - info("Killing process %d", cpid); - kill(-cpid, SIGKILL); - usleep(100000); - kill(cpid, SIGKILL); - } - } - - info("Done sending kill signals"); - } -} - -void exitHandler() { - // If for some reason we still have children, kill everything - uint32_t children = getNumChildren(); - if (children) { - warn("Hard death at exit (%d children running), killing the whole process tree", children); - kill(-getpid(), SIGKILL); - } -} - -void debugSigHandler(int signum, siginfo_t* siginfo, void* dummy) { - assert(signum == SIGUSR1); - uint32_t callerPid = siginfo->si_pid; - // Child better have this initialized... - struct LibInfo* zsimAddrs = (struct LibInfo*) gm_get_secondary_ptr(); - int debugPortId = siginfo->si_int; - assert(debugPortId >= 0); - if (debugPortId > 0) { - // use separate debugger client - launchGDBDebugger(callerPid, zsimAddrs, debugPortId); - } else { - // use legacy xterm debugger - uint32_t debuggerPid = launchXtermDebugger(callerPid, zsimAddrs); - childInfo[debuggerChildIdx].pid = debuggerPid; - childInfo[debuggerChildIdx++].status = PS_RUNNING; - } -} - -/* Heartbeats */ - -static time_t startTime; -static time_t lastHeartbeatTime; -static uint64_t lastCycles = 0; - -static void printHeartbeat(GlobSimInfo* zinfo) { - uint64_t cycles = zinfo->numPhases*zinfo->phaseLength; - time_t curTime = time(nullptr); - time_t elapsedSecs = curTime - startTime; - time_t heartbeatSecs = curTime - lastHeartbeatTime; - - if (elapsedSecs == 0) return; - if (heartbeatSecs == 0) return; - - char time[128]; - char hostname[256]; - gethostname(hostname, 256); - - std::ofstream hb("heartbeat"); - hb << "Running on: " << hostname << std::endl; - hb << "Start time: " << ctime_r(&startTime, time); - hb << "Heartbeat time: " << ctime_r(&curTime, time); - hb << "Stats since start:" << std:: endl; - hb << " " << zinfo->numPhases << " phases" << std::endl; - hb << " " << cycles << " cycles" << std::endl; - hb << " " << (cycles)/elapsedSecs << " cycles/s" << std::endl; - hb << "Stats since last heartbeat (" << heartbeatSecs << "s):" << std:: endl; - hb << " " << (cycles-lastCycles)/heartbeatSecs << " cycles/s" << std::endl; - - lastHeartbeatTime = curTime; - lastCycles = cycles; -} - - -void LaunchProcess(uint32_t procIdx) { - int cpid = fork(); - if (cpid) { //parent - assert(cpid > 0); - childInfo[procIdx].pid = cpid; - childInfo[procIdx].status = PS_RUNNING; - } else { //child - // Set the child's vars and get the command - // NOTE: We set the vars first so that, when parsing the command, wordexp takes those vars into account - pinCmd->setEnvVars(procIdx, wordexpfunc); - const char* inputFile; - g_vector args = pinCmd->getFullCmdArgs(procIdx, &inputFile, wordexpfunc); - - //Copy args to a const char* [] for exec - int nargs = args.size()+1; - const char* aptrs[nargs]; - - trace(Harness, "Calling arguments:"); - for (unsigned int i = 0; i < args.size(); i++) { - trace(Harness, " arg%d = %s", i, args[i].c_str()); - aptrs[i] = args[i].c_str(); - } - aptrs[nargs-1] = nullptr; - - //Chdir to process dir if needed - if (perProcessDir) { - std::stringstream dir_ss; - dir_ss << "p" << procIdx << "/"; - int res = chdir(dir_ss.str().c_str()); - if (res == -1) { - perror("Coud not chdir"); - panic("chdir to %s failed", dir_ss.str().c_str()); - } - } - - //Input redirection if needed - if (inputFile) { - int fd = open(inputFile, O_RDONLY); - if (fd == -1) { - perror("open() failed"); - panic("Could not open input redirection file %s", inputFile); - } - dup2(fd, 0); - } - - /* In a modern kernel, we must disable address space randomization. Otherwise, - * different zsim processes will load zsim.so on different addresses, - * which would be fine except that the vtable pointers will be different - * per process, and virtual functions will not work. - * - * WARNING: The harness itself is run with randomization on, which should - * be fine because it doesn't load zsim.so anyway. If this changes at some - * point, we'll need to have the harness be executed via a wrapper that just - * changes the personalily and forks, or run the harness with setarch -R - */ - if (!aslr) { - //Get old personality flags & update - int pers = personality(((unsigned int)-1) /*returns current pers flags; arg is a long, hence the cast, see man*/); - if (pers == -1 || personality(pers | ADDR_NO_RANDOMIZE) == -1) { - perror("personality() call failed"); - panic("Could not change personality to disable address space randomization!"); - } - int newPers = personality(((unsigned int)-1)); - if ((newPers & ADDR_NO_RANDOMIZE) == 0) panic("personality() call was not honored! old 0x%x new 0x%x", pers, newPers); - } - - if (execvp(aptrs[0], (char* const*)aptrs) == -1) { - perror("Could not exec, killing child"); - panic("Could not exec %s", aptrs[0]); - } else { - panic("Something is SERIOUSLY wrong. This should never execute!"); - } - } -} - - -int main(int argc, char *argv[]) { - if (argc == 2 && std::string(argv[1]) == "-v") { - printf("%s\n", ZSIM_BUILDVERSION); - exit(0); - } - - InitLog("[H] ", nullptr /*log to stdout/err*/); - info("Starting zsim, built %s (rev %s)", ZSIM_BUILDDATE, ZSIM_BUILDVERSION); - startTime = time(nullptr); - - if (argc != 2) { - info("Usage: %s config_file", argv[0]); - exit(1); - } - - //Canonicalize paths --- because we change dirs, we deal in absolute paths - const char* configFile = realpath(argv[1], nullptr); - const char* outputDir = getcwd(nullptr, 0); //already absolute - - Config conf(configFile); - - if (atexit(exitHandler)) panic("Could not register exit handler"); - - signal(SIGSEGV, sigHandler); - signal(SIGINT, sigHandler); - signal(SIGABRT, sigHandler); - signal(SIGTERM, sigHandler); - - signal(SIGCHLD, chldSigHandler); - - //SIGUSR1 is used by children processes when they want to get a debugger session started; - struct sigaction debugSa; - debugSa.sa_flags = SA_SIGINFO; - sigemptyset(&debugSa.sa_mask); //NOTE: We might want to start using sigfullsets in other signal handlers to avoid races... - debugSa.sa_sigaction = debugSigHandler; - if (sigaction(SIGUSR1, &debugSa, nullptr) != 0) - panic("sigaction() failed"); - - syscall(SYS_waitid, P_ALL, 0, nullptr, WEXITED); - - //Remove all zsim.log.* files (we append to them, and want to avoid outputs from multiple simulations) - uint32_t removedLogfiles = 0; - while (true) { - std::stringstream ss; - ss << "zsim.log." << removedLogfiles; - if (remove(ss.str().c_str()) != 0) break; - removedLogfiles++; - } - if (removedLogfiles) info("Removed %d old logfiles", removedLogfiles); - - uint32_t gmSize = conf.get("sim.gmMBytes", (1<<10) /*default 1024MB*/); - info("Creating global segment, %d MBs", gmSize); - int shmid = gm_init(((size_t)gmSize) << 20 /*MB to Bytes*/); - info("Global segment shmid = %d", shmid); - //fprintf(stderr, "%sGlobal segment shmid = %d\n", logHeader, shmid); //hack to print shmid on both streams - //fflush(stderr); - - trace(Harness, "Created global segment, starting pin processes, shmid = %d", shmid); - - //Do we need per-process direcories? - perProcessDir = conf.get("sim.perProcessDir", false); - - if (perProcessDir) { - info("Running each process in a different subdirectory"); //p0, p1, ... - } - - bool deadlockDetection; - bool attachDebugger = conf.get("sim.attachDebugger", false); - - if (attachDebugger) { - info("Pausing PIN to attach debugger, and not running deadlock detection"); - deadlockDetection = false; - } else { - deadlockDetection = conf.get("sim.deadlockDetection", true); - } - - info("Deadlock detection %s", deadlockDetection? "ON" : "OFF"); - - aslr = conf.get("sim.aslr", false); - if (aslr) info("Not disabling ASLR, multiprocess runs will fail"); - - //Create children processes - pinCmd = new PinCmd(&conf, configFile, outputDir, shmid); - uint32_t numProcs = pinCmd->getNumCmdProcs(); - - for (uint32_t procIdx = 0; procIdx < numProcs; procIdx++) { - LaunchProcess(procIdx); - } - - if (numProcs == 0) panic("No process config found. Config file needs at least a process0 entry"); - - //Wait for all processes to finish - int sleepLength = 10; - GlobSimInfo* zinfo = nullptr; - int32_t secsStalled = 0; - - int64_t lastNumPhases = 0; - - while (getNumChildren() > 0) { - if (!gm_isready()) { - usleep(1000); // wait till proc idx 0 initializes everyhting - continue; - } - - if (zinfo == nullptr) { - zinfo = static_cast(gm_get_glob_ptr()); - globzinfo = zinfo; - info("Attached to global heap"); - } - - printHeartbeat(zinfo); // ensure we dump hostname etc on early crashes - - int left = sleep(sleepLength); - int secsSlept = sleepLength - left; - //info("Waking up, secs elapsed %d", secsSlept); - - __sync_synchronize(); - - uint32_t activeProcs = zinfo->globalActiveProcs; - uint32_t ffProcs = zinfo->globalFFProcs; - uint32_t sffProcs = zinfo->globalSyncedFFProcs; - bool simShouldAdvance = (ffProcs < activeProcs) && (sffProcs == 0); - - int64_t numPhases = zinfo->numPhases; - - if (deadlockDetection) { - if (simShouldAdvance) { - //info("In deadlock check zone"); - if (numPhases <= lastNumPhases) { - secsStalled += secsSlept; - if (secsStalled > 10) warn("Stalled for %d secs so far", secsStalled); - } else { - //info("Not stalled, did %ld phases since last check", numPhases-lastNumPhases); - lastNumPhases = numPhases; - secsStalled = 0; - } - } else if (activeProcs) { - if (numPhases == lastNumPhases) info("Some fast-forwarding is going on, not doing deadlock detection (a: %d, ff: %d, sff: %d)", activeProcs, ffProcs, sffProcs); - lastNumPhases = numPhases; - } //otherwise, activeProcs == 0; we're done - } - - printHeartbeat(zinfo); - - //This solves a weird race in multiprocess where SIGCHLD does not always fire... - int cpid = -1; - while ((cpid = waitpid(-1, nullptr, WNOHANG)) > 0) { - eraseChild(cpid); - info("Child %d done (in-loop catch)", cpid); - } - - if (secsStalled > 120) { - warn("Deadlock detected, killing children"); - sigHandler(SIGINT); - exit(42); - } - } - - uint32_t exitCode = 0; - if (termStatus == OK) { - info("All children done, exiting"); - } else { - info("Graceful termination finished, exiting"); - exitCode = 1; - } - if (zinfo && zinfo->globalActiveProcs) warn("Unclean exit of %d children, termination stats were most likely not dumped", zinfo->globalActiveProcs); - exit(exitCode); -} - diff --git a/zsim-setup.md b/zsim-setup.md index 21bdf1a..246ded8 100644 --- a/zsim-setup.md +++ b/zsim-setup.md @@ -25,7 +25,7 @@ From now on, we assume we are at the `NDP_PROJECT/` folder, which is referred to - `mkdir zsim-env && cd zsim-env` - Install pin-3 (We use pin-3.28) - `wget https://software.intel.com/sites/landingpage/pintool/downloads/pin-3.28-98749-g6643ecee5-gcc-linux.tar.gz` - - `tar -xvf pin-3.28-98749-g6643ecee5-gcc-linux.tar.gz && mv pin-3.28-98749-g6643ecee5-gcc-linux pin-3.28` + - `tar -xf pin-3.28-98749-g6643ecee5-gcc-linux.tar.gz && mv pin-3.28-98749-g6643ecee5-gcc-linux pin-3.28` - Updata $PINPATH in .bashrc (remember to update it if you change the path to pin) - ` echo "export PINPATH=$(pwd)/pin-3.28" >> ~/.bashrc` - `source ~/.bashrc` @@ -49,7 +49,7 @@ From now on, we assume we are at the `NDP_PROJECT/` folder, which is referred to - get the original hdf5 code - `wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.12/hdf5-1.12.0/src/hdf5-1.12.0.tar.gz` - - `tar -xvf hdf5-1.12.0.tar.gz && cd hdf5-1.12.0` + - `tar -xf hdf5-1.12.0.tar.gz && cd hdf5-1.12.0` - `../run_configure.sh` - `make -j16 && make install` - `cd ../../` @@ -57,7 +57,7 @@ From now on, we assume we are at the `NDP_PROJECT/` folder, which is referred to - `cd libconfig` - get the original libconfig code - `wget https://hyperrealm.github.io/libconfig/dist/libconfig-1.7.3.tar.gz` - - `tar -xvf libconfig-1.7.3.tar.gz && cd libconfig-1.7.3` + - `tar -xf libconfig-1.7.3.tar.gz && cd libconfig-1.7.3` - `../run_configure.sh` - `make -j8 && make install` - `cd ../../`