Squashed 'externals/xbyak/' changes from 671fc805..4a6fac8a

4a6fac8a update version to 5.77
801cf3fd cosmetic change of getNumCores
d397e824 fix number of cores that share LLC cache
a669e092 support non-intel-cpu visual studio
af5f422e Merge branch 'fenghaitao-guard_x86' into develop
9b98dc17 Guard x86 specific codes with "#if defined(__i386__) || defined(__x86_64__)"
dd4173e1 move some member variables into private
f72646a7 update version
4612528f format change
4b95e862 Merge branch 'shelleygoel-master'
4c262fa6 add functionality to get num of cores using x2APIC ID
bc70e7e1 recover Xbyak::CastTo
d09a230f unlink Label when LabelManager is destroyed
973e8597 update version
afdb9fe9 Xbyak::CastTo is removed
b011aca4 add RegRip +/- int
acae93cd increase max temp regs for StackFrame
ea4e3562 util::StackFrame uses push/pop instead of mov
42462ef9 use evex encoding for vpslld/vpslldq/vpsraw/...(reg, mem, imm);
da9117a9 update version of readme.md
d35f4fb7 fix the encoding of vinsertps for disp8N
1de435ed bf uses Label class
613922bd add Label L() for convenience
43e15583 fix typo
93579ee6 add protect-re.cpp
60004b5c fix url of protect-re.cpp
348b2709 fix typo of doc
f34f6ed5 update manual
232110be update test
82b78bf0 add setProtectMode
dd8b290f put warning message if pageSize != 4096
64775ca2 a little refactoring
7c3e7b85 fix wrong VSIB encoding with idx >= 16

git-subtree-dir: externals/xbyak
git-subtree-split: 4a6fac8ade404f667b94170f713367fe7da2a852
MerryMage 2020-04-22 20:59:14 +01:00
parent dbb1f8cf37
commit 080b4b3aff
17 changed files with 994 additions and 489 deletions
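
The headline additions in this range (DontSetProtectRWE + setProtectMode, Label L(), RegRip +/- int) can be exercised roughly as below. This is a minimal usage sketch against the 5.77 headers that follow, assuming a 64-bit build; the Sample struct is an illustrative name, not part of the commit.

    #include <xbyak/xbyak.h>

    struct Sample : Xbyak::CodeGenerator {
        Sample()
            : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) // new: keep the buffer read/write while generating
        {
            Xbyak::Label lp = L();         // new: Label L() defines a label at the current position
            lea(rax, ptr[rip + lp + 8]);   // new: RegRip +/- int displacement
            ret();
            setProtectModeRE();            // new: flip the buffer to read/exec before calling into it
        }
    };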

xbyak/xbyak.h

@ -40,6 +40,8 @@
// This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft.
#if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\
((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__)))
#include <unordered_set>
#define XBYAK_STD_UNORDERED_SET std::unordered_set
#include <unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap
@ -49,16 +51,22 @@
libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version).
*/
#elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || defined(__llvm__)
#include <tr1/unordered_set>
#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
#include <tr1/unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
#elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600)
#include <unordered_set>
#define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set
#include <unordered_map>
#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
#else
#include <set>
#define XBYAK_STD_UNORDERED_SET std::set
#include <map>
#define XBYAK_STD_UNORDERED_MAP std::map
#define XBYAK_STD_UNORDERED_MULTIMAP std::multimap
@ -105,7 +113,7 @@ namespace Xbyak {
enum {
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x5670 /* 0xABCD = A.BC(D) */
VERSION = 0x5770 /* 0xABCD = A.BC(D) */
};
#ifndef MIE_INTEGER_TYPE_DEFINED
@ -178,7 +186,8 @@ enum {
ERR_INVALID_ZERO,
ERR_INVALID_RIP_IN_AUTO_GROW,
ERR_INVALID_MIB_ADDRESS,
ERR_INTERNAL
ERR_INTERNAL,
ERR_X2APIC_IS_NOT_SUPPORTED
};
class Error : public std::exception {
@ -240,6 +249,7 @@ public:
"invalid rip in AutoGrow",
"invalid mib address",
"internal error",
"x2APIC is not supported"
};
assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
return errTbl[err_];
@ -617,6 +627,12 @@ struct RegRip {
const Label* label_;
bool isAddr_;
explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {}
friend const RegRip operator+(const RegRip& r, int disp) {
return RegRip(r.disp_ + disp, r.label_, r.isAddr_);
}
friend const RegRip operator-(const RegRip& r, int disp) {
return RegRip(r.disp_ - disp, r.label_, r.isAddr_);
}
friend const RegRip operator+(const RegRip& r, sint64 disp) {
return RegRip(r.disp_ + disp, r.label_, r.isAddr_);
}
@ -786,6 +802,7 @@ inline RegExp operator-(const RegExp& e, size_t disp)
// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
void *const AutoGrow = (void*)1; //-V566
void *const DontSetProtectRWE = (void*)2; //-V566
class CodeArray {
enum Type {
@ -825,6 +842,7 @@ protected:
size_t size_;
bool isCalledCalcJmpAddress_;
bool useProtect() const { return alloc_->useProtect(); }
/*
allocate new memory and copy old data to the new area
*/
@ -848,7 +866,6 @@ protected:
uint64 disp = i->getVal(top_);
rewrite(i->codeOffset, disp, i->jmpSize);
}
if (alloc_->useProtect() && !protect(top_, size_, PROTECT_RWE)) throw Error(ERR_CANT_PROTECT);
isCalledCalcJmpAddress_ = true;
}
public:
@ -858,7 +875,7 @@ public:
PROTECT_RE = 2 // read/exec
};
explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0)
: type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF)
: type_(userPtr == AutoGrow ? AUTO_GROW : (userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF)
, alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_)
, maxSize_(maxSize)
, top_(type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1)))
@ -866,7 +883,7 @@ public:
, isCalledCalcJmpAddress_(false)
{
if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC);
if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, PROTECT_RWE)) {
if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) {
alloc_->free(top_);
throw Error(ERR_CANT_PROTECT);
}
@ -874,10 +891,19 @@ public:
virtual ~CodeArray()
{
if (isAllocType()) {
if (alloc_->useProtect()) protect(top_, maxSize_, PROTECT_RW);
if (useProtect()) setProtectModeRW(false);
alloc_->free(top_);
}
}
bool setProtectMode(ProtectMode mode, bool throwException = true)
{
bool isOK = protect(top_, maxSize_, mode);
if (isOK) return true;
if (throwException) throw Error(ERR_CANT_PROTECT);
return false;
}
bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); }
bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); }
void resetSize()
{
size_ = 0;
@ -909,10 +935,10 @@ public:
void dq(uint64 code) { db(code, 8); }
const uint8 *getCode() const { return top_; }
template<class F>
const F getCode() const { return CastTo<F>(top_); }
const F getCode() const { return reinterpret_cast<F>(top_); }
const uint8 *getCurr() const { return &top_[size_]; }
template<class F>
const F getCurr() const { return CastTo<F>(&top_[size_]); }
const F getCurr() const { return reinterpret_cast<F>(&top_[size_]); }
size_t getSize() const { return size_; }
void setSize(size_t size)
{
@ -995,6 +1021,9 @@ public:
size_t pageSize = sysconf(_SC_PAGESIZE);
size_t iaddr = reinterpret_cast<size_t>(addr);
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
#ifndef NDEBUG
if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. not tested enough.\n", pageSize);
#endif
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
#else
return true;
@ -1115,6 +1144,7 @@ public:
Label(const Label& rhs);
Label& operator=(const Label& rhs);
~Label();
void clear() { mgr = 0; id = 0; }
int getId() const { return id; }
const uint8 *getAddress() const;
@ -1153,6 +1183,7 @@ class LabelManager {
};
typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList;
typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList;
typedef XBYAK_STD_UNORDERED_SET<Label*> LabelPtrList;
CodeArray *base_;
// global : stateList_.front(), local : stateList_.back()
@ -1160,6 +1191,7 @@ class LabelManager {
mutable int labelId_;
ClabelDefList clabelDefList_;
ClabelUndefList clabelUndefList_;
LabelPtrList labelPtrList_;
int getId(const Label& label) const
{
@ -1208,9 +1240,14 @@ class LabelManager {
return true;
}
friend class Label;
void incRefCount(int id) { clabelDefList_[id].refCount++; }
void decRefCount(int id)
void incRefCount(int id, Label *label)
{
clabelDefList_[id].refCount++;
labelPtrList_.insert(label);
}
void decRefCount(int id, Label *label)
{
labelPtrList_.erase(label);
ClabelDefList::iterator i = clabelDefList_.find(id);
if (i == clabelDefList_.end()) return;
if (i->second.refCount == 1) {
@ -1229,11 +1266,23 @@ class LabelManager {
#endif
return !list.empty();
}
// detach all labels linked to LabelManager
void resetLabelPtrList()
{
for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) {
(*i)->clear();
}
labelPtrList_.clear();
}
public:
LabelManager()
{
reset();
}
~LabelManager()
{
resetLabelPtrList();
}
void reset()
{
base_ = 0;
@ -1243,6 +1292,7 @@ public:
stateList_.push_back(SlabelState());
clabelDefList_.clear();
clabelUndefList_.clear();
resetLabelPtrList();
}
void enterLocal()
{
@ -1275,10 +1325,11 @@ public:
SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
define_inner(st.defList, st.undefList, label, base_->getSize());
}
void defineClabel(const Label& label)
void defineClabel(Label& label)
{
define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize());
label.mgr = this;
labelPtrList_.insert(&label);
}
void assign(Label& dst, const Label& src)
{
@ -1286,6 +1337,7 @@ public:
if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L);
define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset);
dst.mgr = this;
labelPtrList_.insert(&dst);
}
bool getOffset(size_t *offset, std::string& label) const
{
@ -1333,19 +1385,19 @@ inline Label::Label(const Label& rhs)
{
id = rhs.id;
mgr = rhs.mgr;
if (mgr) mgr->incRefCount(id);
if (mgr) mgr->incRefCount(id, this);
}
inline Label& Label::operator=(const Label& rhs)
{
if (id) throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L);
id = rhs.id;
mgr = rhs.mgr;
if (mgr) mgr->incRefCount(id);
if (mgr) mgr->incRefCount(id, this);
return *this;
}
inline Label::~Label()
{
if (id && mgr) mgr->decRefCount(id);
if (id && mgr) mgr->decRefCount(id, this);
}
inline const uint8* Label::getAddress() const
{
@ -1463,6 +1515,7 @@ private:
T_B64 = 1 << 27, // m64bcst
T_M_K = 1 << 28, // mem{k}
T_VSIB = 1 << 29,
T_MEM_EVEX = 1 << 30, // use evex if mem
T_XXX
};
void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false)
@ -1500,7 +1553,7 @@ private:
if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err);
return v;
}
int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0)
int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0, bool Hi16Vidx = false)
{
if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID);
int w = (type & T_EW1) ? 1 : 0;
@ -1543,7 +1596,7 @@ private:
}
}
}
bool Vp = !(v ? v->isExtIdx2() : 0);
bool Vp = !((v ? v->isExtIdx2() : 0) | Hi16Vidx);
bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false);
if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET);
db(0x62);
@ -1935,10 +1988,11 @@ private:
const Address& addr = op2.getAddress();
const RegExp& regExp = addr.getRegExp();
const Reg& base = regExp.getBase();
const Reg& index = regExp.getIndex();
if (BIT == 64 && addr.is32bit()) db(0x67);
int disp8N = 0;
bool x = regExp.getIndex().isExtIdx();
if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
bool x = index.isExtIdx();
if ((type & (T_MUST_EVEX|T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) {
int aaa = addr.getOpmaskIdx();
if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY);
bool b = false;
@ -1946,8 +2000,8 @@ private:
if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST);
b = true;
}
int VL = regExp.isVsib() ? regExp.getIndex().getBit() : 0;
disp8N = evex(r, base, p1, type, code, x, b, aaa, VL);
int VL = regExp.isVsib() ? index.getBit() : 0;
disp8N = evex(r, base, p1, type, code, x, b, aaa, VL, index.isExtIdx2());
} else {
vex(r, base, p1, type, code, x);
}
@ -2147,7 +2201,8 @@ public:
const Segment es, cs, ss, ds, fs, gs;
#endif
void L(const std::string& label) { labelMgr_.defineSlabel(label); }
void L(const Label& label) { labelMgr_.defineClabel(label); }
void L(Label& label) { labelMgr_.defineClabel(label); }
Label L() { Label label; L(label); return label; }
void inLocalLabel() { labelMgr_.enterLocal(); }
void outLocalLabel() { labelMgr_.leaveLocal(); }
/*
@ -2178,7 +2233,7 @@ public:
// call(function pointer)
#ifdef XBYAK_VARIADIC_TEMPLATE
template<class Ret, class... Params>
void call(Ret(*func)(Params...)) { call(CastTo<const void*>(func)); }
void call(Ret(*func)(Params...)) { call(reinterpret_cast<const void*>(func)); }
#endif
void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); }
@ -2436,11 +2491,16 @@ public:
MUST call ready() to complete generating code if you use AutoGrow mode.
It is not necessary for the other mode if hasUndefinedLabel() is true.
*/
void ready()
void ready(ProtectMode mode = PROTECT_RWE)
{
if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND);
if (isAutoGrow()) calcJmpAddress();
if (isAutoGrow()) {
calcJmpAddress();
if (useProtect()) setProtectMode(mode);
}
}
// set read/exec
void readyRE() { return ready(PROTECT_RE); }
#ifdef XBYAK_TEST
void dump(bool doClear = true)
{
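
For AutoGrow users, ready() now takes a protect mode (default PROTECT_RWE) and readyRE() is shorthand for ready(PROTECT_RE); getCode<F>()/getCurr<F>() cast with reinterpret_cast, as the hunks above show. A rough sketch of the intended flow, assuming a 64-bit build; GrowCode and callGenerated are illustrative names only.

    struct GrowCode : Xbyak::CodeGenerator {
        GrowCode() : Xbyak::CodeGenerator(16, Xbyak::AutoGrow) {
            mov(eax, 42);
            ret();
        }
    };

    int callGenerated()
    {
        GrowCode c;
        c.readyRE();                      // resolve jump addresses, then set the buffer read/exec
        return c.getCode<int (*)()>()();  // returns 42
    }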

xbyak/xbyak_mnemonic.h

@ -1,4 +1,4 @@
const char *getVersionString() const { return "5.67"; }
const char *getVersionString() const { return "5.77"; }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); }
@ -1023,7 +1023,7 @@ void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D); }
void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm); }
void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm); }
void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); }
void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); }
void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); }
void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); }
void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); }
@ -1206,28 +1206,28 @@ void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm,
void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); }
void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); }
void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); }
void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); }
void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2); }
void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); }
void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); }
void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3); }
void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47); }
void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47); }
void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); }
void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1); }
void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); }
void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2); }
void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46); }
void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); }
void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1); }
void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); }
void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); }
void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2); }
void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); }
void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); }
void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3); }
void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45); }
void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45); }
void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); }
void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); }
void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1); }
void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8); }
void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA); }
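
If I read the new T_MEM_EVEX flag correctly, the (reg, mem, imm) forms of the shift group above now always pick the EVEX prefix when the source is memory, so compressed disp8*N applies and the emitted bytes expect AVX-512 support at run time; vinsertps additionally gained T_N4, so under EVEX its compressed displacement scales by 4. A hedged sketch, only meant to show the affected forms (ShiftGen is an illustrative name; 64-bit build and AVX-512 assumed):

    struct ShiftGen : Xbyak::CodeGenerator {
        ShiftGen() {
            vpslld(xmm5, ptr[rax + 64], 3);          // reg, mem, imm form: now EVEX-encoded, disp 64 compresses to disp8
            vinsertps(xmm17, xmm2, ptr[rax + 4], 0); // xmm17 forces EVEX; with T_N4 the disp8 is scaled by 4
            ret();
        }
    };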

xbyak/xbyak_util.h

@ -9,6 +9,11 @@
*/
#include "xbyak.h"
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define XBYAK_INTEL_CPU_SPECIFIC
#endif
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
#if (_MSC_VER < 1400) && defined(XBYAK32)
static inline __declspec(naked) void __cpuid(int[4], int)
@ -47,14 +52,30 @@
#endif
#endif
#endif
#endif
namespace Xbyak { namespace util {
typedef enum {
SmtLevel = 1,
CoreLevel = 2
} IntelCpuTopologyLevel;
/**
CPU detection class
*/
class Cpu {
uint64 type_;
//system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2;
unsigned int numCores_[maxTopologyLevels];
static const unsigned int maxNumberCacheLevels = 10;
unsigned int dataCacheSize_[maxNumberCacheLevels];
unsigned int coresSharignDataCache_[maxNumberCacheLevels];
unsigned int dataCacheLevels_;
unsigned int get32bitAsBE(const char *x) const
{
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
@ -65,7 +86,7 @@ class Cpu {
}
void setFamily()
{
unsigned int data[4];
unsigned int data[4] = {};
getCpuid(1, data);
stepping = data[0] & mask(4);
model = (data[0] >> 4) & mask(4);
@ -88,6 +109,39 @@ class Cpu {
{
return (val >> base) & ((1u << (end - base)) - 1);
}
void setNumCores()
{
if ((type_ & tINTEL) == 0) return;
unsigned int data[4] = {};
/* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
/*
if leaf 11 exists(x2APIC is supported),
we use it to get the number of smt cores and cores on socket
leaf 0xB can be zeroed-out by a hypervisor
*/
x2APIC_supported_ = true;
for (unsigned int i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data);
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) {
numCores_[level - 1] = extractBit(data[1], 0, 15);
}
}
} else {
/*
Failed to determine num of cores without x2APIC support.
TODO: USE initial APIC ID to determine ncores.
*/
numCores_[SmtLevel - 1] = 0;
numCores_[CoreLevel - 1] = 0;
}
}
void setCacheHierarchy()
{
if ((type_ & tINTEL) == 0) return;
@ -96,21 +150,12 @@ class Cpu {
// const unsigned int INSTRUCTION_CACHE = 2;
const unsigned int UNIFIED_CACHE = 3;
unsigned int smt_width = 0;
unsigned int n_cores = 0;
unsigned int data[4];
unsigned int logical_cores = 0;
unsigned int data[4] = {};
/*
if leaf 11 exists, we use it to get the number of smt cores and cores on socket
If x2APIC is supported, these are the only correct numbers.
leaf 0xB can be zeroed-out by a hypervisor
*/
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) {
getCpuidEx(0xB, 0, data); // CPUID for SMT Level
smt_width = data[1] & 0x7FFF;
getCpuidEx(0xB, 1, data); // CPUID for CORE Level
n_cores = data[1] & 0x7FFF;
if (x2APIC_supported_) {
smt_width = numCores_[0];
logical_cores = numCores_[1];
}
/*
@ -118,29 +163,29 @@ class Cpu {
the first level of data cache is not shared (which is the
case for every existing architecture) and use this to
determine the SMT width for arch not supporting leaf 11.
when leaf 4 reports a number of core less than n_cores
when leaf 4 reports a number of core less than numCores_
on socket reported by leaf 11, then it is a correct number
of cores not an upperbound.
*/
for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) {
for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
getCpuidEx(0x4, i, data);
unsigned int cacheType = extractBit(data[0], 0, 4);
if (cacheType == NO_CACHE) break;
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1;
if (n_cores != 0) { // true only if leaf 0xB is supported and valid
nb_logical_cores = (std::min)(nb_logical_cores, n_cores);
unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
}
assert(nb_logical_cores != 0);
data_cache_size[data_cache_levels] =
assert(actual_logical_cores != 0);
dataCacheSize_[dataCacheLevels_] =
(extractBit(data[1], 22, 31) + 1)
* (extractBit(data[1], 12, 21) + 1)
* (extractBit(data[1], 0, 11) + 1)
* (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u);
data_cache_levels++;
coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
}
@ -154,22 +199,25 @@ public:
int displayFamily; // family + extFamily
int displayModel; // model + extModel
// may I move these members into private?
static const unsigned int maxNumberCacheLevels = 10;
unsigned int data_cache_size[maxNumberCacheLevels];
unsigned int cores_sharing_data_cache[maxNumberCacheLevels];
unsigned int data_cache_levels;
unsigned int getNumCores(IntelCpuTopologyLevel level) {
if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
switch (level) {
case SmtLevel: return numCores_[level - 1];
case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
default: throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
}
}
unsigned int getDataCacheLevels() const { return data_cache_levels; }
unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
unsigned int getCoresSharingDataCache(unsigned int i) const
{
if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER);
return cores_sharing_data_cache[i];
if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
return coresSharignDataCache_[i];
}
unsigned int getDataCacheSize(unsigned int i) const
{
if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER);
return data_cache_size[i];
if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER);
return dataCacheSize_[i];
}
/*
@ -177,30 +225,45 @@ public:
*/
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuid(reinterpret_cast<int*>(data), eaxIn);
#else
#else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)data;
#endif
}
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
#else
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)ecxIn;
(void)data;
#endif
}
static inline uint64 getXfeature()
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return _xgetbv(0);
#else
#else
unsigned int eax, edx;
// xgetbv is not supported on gcc 4.2
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64)edx << 32) | eax;
#endif
#else
return 0;
#endif
}
typedef uint64 Type;
@ -271,9 +334,13 @@ public:
Cpu()
: type_(NONE)
, data_cache_levels(0)
, x2APIC_supported_(false)
, numCores_()
, dataCacheSize_()
, coresSharignDataCache_()
, dataCacheLevels_(0)
{
unsigned int data[4];
unsigned int data[4] = {};
const unsigned int& EAX = data[0];
const unsigned int& EBX = data[1];
const unsigned int& ECX = data[2];
@ -363,6 +430,7 @@ public:
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
}
setFamily();
setNumCores();
setCacheHierarchy();
}
void putFamily() const
@ -381,12 +449,17 @@ class Clock {
public:
static inline uint64 getRdtsc()
{
#ifdef _MSC_VER
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _MSC_VER
return __rdtsc();
#else
#else
unsigned int eax, edx;
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
return ((uint64)edx << 32) | eax;
#endif
#else
// TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
return 0;
#endif
}
Clock()
@ -416,7 +489,7 @@ const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7;
class Pack {
static const size_t maxTblNum = 10;
static const size_t maxTblNum = 15;
const Xbyak::Reg64 *tbl_[maxTblNum];
size_t n_;
public:
@ -476,7 +549,7 @@ public:
const Xbyak::Reg64& operator[](size_t n) const
{
if (n >= n_) {
fprintf(stderr, "ERR Pack bad n=%d\n", (int)n);
fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
throw Error(ERR_BAD_PARAMETER);
}
return *tbl_[n];
@ -518,6 +591,7 @@ class StackFrame {
static const int rcxPos = 3;
static const int rdxPos = 2;
#endif
static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
Xbyak::CodeGenerator *code_;
int pNum_;
int tNum_;
@ -527,7 +601,7 @@ class StackFrame {
int P_;
bool makeEpilog_;
Xbyak::Reg64 pTbl_[4];
Xbyak::Reg64 tTbl_[10];
Xbyak::Reg64 tTbl_[maxRegNum];
Pack p_;
Pack t_;
StackFrame(const StackFrame&);
@ -539,7 +613,7 @@ public:
make stack frame
@param sf [in] this
@param pNum [in] num of function parameter(0 <= pNum <= 4)
@param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX)
@param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
@param stackSizeByte [in] local stack size
@param makeEpilog [in] automatically call close() if true
@ -566,27 +640,17 @@ public:
using namespace Xbyak;
if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM);
if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM);
const Reg64& _rsp = code->rsp;
const AddressFrame& _ptr = code->ptr;
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
const int *tbl = getOrderTbl() + noSaveNum;
P_ = saveNum_ + (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment
for (int i = 0; i < saveNum_; i++) {
code->push(Reg64(tbl[i]));
}
P_ = (stackSizeByte + 7) / 8;
if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
P_ *= 8;
if (P_ > 0) code->sub(_rsp, P_);
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
}
for (int i = 4; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#else
for (int i = 0; i < saveNum_; i++) {
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
}
#endif
int pos = 0;
for (int i = 0; i < pNum; i++) {
pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
@ -607,21 +671,11 @@ public:
{
using namespace Xbyak;
const Reg64& _rsp = code_->rsp;
const AddressFrame& _ptr = code_->ptr;
const int *tbl = getOrderTbl() + noSaveNum;
#ifdef XBYAK64_WIN
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
}
for (int i = 4; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#else
for (int i = 0; i < saveNum_; i++) {
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
}
#endif
if (P_ > 0) code_->add(_rsp, P_);
for (int i = 0; i < saveNum_; i++) {
code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
}
if (callRet) code_->ret();
}
@ -633,9 +687,6 @@ public:
} catch (std::exception& e) {
printf("ERR:StackFrame %s\n", e.what());
exit(1);
} catch (...) {
printf("ERR:StackFrame otherwise\n");
exit(1);
}
}
private:
@ -654,7 +705,7 @@ private:
}
int getRegIdx(int& pos) const
{
assert(pos < 14);
assert(pos < maxRegNum);
using namespace Xbyak;
const int *tbl = getOrderTbl();
int r = tbl[pos++];
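
The new topology API on util::Cpu can be queried as below; a minimal sketch, assuming an Intel CPU exposing CPUID leaf 0xB (getNumCores() throws ERR_X2APIC_IS_NOT_SUPPORTED otherwise). printTopology is an illustrative name, not part of the commit.

    #include <stdio.h>
    #include <xbyak/xbyak_util.h>

    void printTopology()
    {
        Xbyak::util::Cpu cpu;
        try {
            unsigned int smt   = cpu.getNumCores(Xbyak::util::SmtLevel);  // logical processors per physical core
            unsigned int cores = cpu.getNumCores(Xbyak::util::CoreLevel); // physical cores on the package
            printf("smt=%u cores=%u\n", smt, cores);
        } catch (const Xbyak::Error&) {
            printf("x2APIC topology enumeration not available\n");
        }
        for (unsigned int i = 0; i < cpu.getDataCacheLevels(); i++) {
            printf("data cache level %u: size=%u bytes, cores sharing=%u\n",
                   i + 1, cpu.getDataCacheSize(i), cpu.getCoresSharingDataCache(i));
        }
    }

Separately, util::StackFrame now saves callee-saved registers with push/pop instead of mov into the local frame, and the combined register count (pNum + tNum, plus rcx/rdx when requested) may go up to maxRegNum = 14, as the prologue/epilogue hunks above show.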