Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b6fbd02
WIP: Emit mulx for GT_MULHI
Daniel-Svensson May 30, 2025
b8d054f
* Handle containment for GT_MUL_LONG on x86
Daniel-Svensson May 31, 2025
4ae76cb
update comments
Daniel-Svensson May 31, 2025
9f84b3f
merge upstream/main
Daniel-Svensson May 31, 2025
1266195
update after merge
Daniel-Svensson May 31, 2025
8929847
* remove move instruction since it is handled by lsra
Daniel-Svensson May 31, 2025
f5d77fc
minor formatting fixes
Daniel-Svensson May 31, 2025
feeb24b
cleanup
Daniel-Svensson Jun 1, 2025
59ecc67
Ensure magic number for GT_MULHI for division with constant, is put i…
Daniel-Svensson Jun 2, 2025
0ff6c20
only swap operands for GT_MULHI and GT_MUL_LONG
Daniel-Svensson Jun 2, 2025
e56ee0a
fix formatting
Daniel-Svensson Jun 2, 2025
ce9e87b
Merge remote-tracking branch 'upstream/main' into x86_bmi_mulhi
Daniel-Svensson Jun 2, 2025
691e442
Fix operand order
Daniel-Svensson Jun 2, 2025
280022a
merge upstream/main
Daniel-Svensson Jun 12, 2025
46ee7af
Fixes after merge:
Daniel-Svensson Jun 12, 2025
7854142
fix review comment
Daniel-Svensson Jun 13, 2025
5f848c6
kill rdx register for mulx instead of specifying as fixed register fo…
Daniel-Svensson Jun 13, 2025
82ece23
fix format
Daniel-Svensson Jun 13, 2025
04450b6
remove register preference for mul, it does only make sense for exten…
Daniel-Svensson Jun 13, 2025
45625b7
fix formatting
Daniel-Svensson Jun 13, 2025
7c8dcfb
remove swap in lowering
Daniel-Svensson Jun 14, 2025
8ab4c9d
update fixed reg in lowering for division by constant
Daniel-Svensson Jun 14, 2025
cbd5824
change from isUsedFromMemory to isContained()
Daniel-Svensson Jun 16, 2025
ab4ec11
Fix review comments
Daniel-Svensson Jun 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 59 additions & 21 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -822,38 +822,76 @@ void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
// to get the high bits of the multiply, we are constrained to using the
// 1-op form: RDX:RAX = RAX * rm
// The 3-op form (Rx=Ry*Rz) does not support it.

// When BMI2 is available, we can use the MULX instruction to get the high bits
genConsumeOperands(treeNode->AsOp());

GenTree* regOp = op1;
GenTree* rmOp = op2;

// Set rmOp to the memory operand (if any)
if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->GetRegNum() == REG_RAX)))
// Lowering has ensured that op1 is never the memory operand to simplify checks here
assert(!op1->isUsedFromMemory());

if (treeNode->IsUnsigned() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
{
regOp = op2;
rmOp = op1;
}
assert(regOp->isUsedFromReg());
if (op2->isUsedFromReg() && (op2->GetRegNum() == REG_RDX))
Comment thread
Daniel-Svensson marked this conversation as resolved.
Outdated
{
regOp = op2;
rmOp = op1;
}
assert(regOp->isUsedFromReg());

// Setup targetReg when neither of the source operands was a matching register
inst_Mov(targetType, REG_RAX, regOp->GetRegNum(), /* canSkip */ true);
// Setup targetReg when neither of the source operands was a matching register
inst_Mov(targetType, REG_RDX, regOp->GetRegNum(), /* canSkip */ true);

instruction ins;
if ((treeNode->gtFlags & GTF_UNSIGNED) == 0)
{
ins = INS_imulEAX;
if (treeNode->OperGet() == GT_MULHI)
{
// emit MULX instruction, use targetReg twice to only store high result
inst_RV_RV_TT(INS_mulx, size, targetReg, targetReg, rmOp, /* isRMW */ false, INS_OPTS_NONE);
}
else
{
#if TARGET_64BIT
assert(false);
#else
assert(treeNode->OperGet() == GT_MUL_LONG);

// emit MULX instruction
regNumber hiReg = targetReg;
regNumber lowReg = treeNode->AsMultiRegOp()->GetRegByIndex(1);
inst_RV_RV_TT(INS_mulx, size, hiReg, lowReg, rmOp, /* isRMW */ false, INS_OPTS_NONE);
#endif
}
}
else
else // Generate MUL or IMUL instruction
{
ins = INS_mulEAX;
}
emit->emitInsBinary(ins, size, treeNode, rmOp);
// If op2 is already present in RAX use that as implicit operand
if (op2->isUsedFromReg() && (op2->GetRegNum() == REG_RAX))
{
regOp = op2;
rmOp = op1;
}
assert(regOp->isUsedFromReg());

// Move the result to the desired register, if necessary
if (treeNode->OperGet() == GT_MULHI)
{
inst_Mov(targetType, targetReg, REG_RDX, /* canSkip */ true);
// Setup targetReg when neither of the source operands was a matching register
inst_Mov(targetType, REG_RAX, regOp->GetRegNum(), /* canSkip */ true);

instruction ins;
if (!treeNode->IsUnsigned())
{
ins = INS_imulEAX;
}
else
{
ins = INS_mulEAX;
}
emit->emitInsBinary(ins, size, treeNode, rmOp);

// Move the result to the desired register, if necessary
if (treeNode->OperGet() == GT_MULHI)
{
assert(targetReg == REG_RDX);
inst_Mov(targetType, targetReg, REG_RDX, /* canSkip */ true);
}
}

genProduceReg(treeNode);
Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,13 @@ class Lowering final : public Phase

#endif // TARGET_XARCH

#if TARGET_X86
if (parentNode->OperIs(GT_MUL_LONG))
{
return genTypeSize(childNode->TypeGet()) == operatorSize / 2;
}
#endif // TARGET_X86

return genTypeSize(childNode->TypeGet()) == operatorSize;
}

Expand Down
24 changes: 20 additions & 4 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7950,14 +7950,15 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
bool isSafeToContainOp1 = true;
bool isSafeToContainOp2 = true;

bool isUnsignedMultiply = ((node->gtFlags & GTF_UNSIGNED) != 0);
bool isUnsignedMultiply = node->IsUnsigned();
bool requiresOverflowCheck = node->gtOverflowEx();
bool useLeaEncoding = false;
GenTree* memOp = nullptr;

bool hasImpliedFirstOperand = false;
GenTreeIntConCommon* imm = nullptr;
GenTree* other = nullptr;
var_types nodeType = node->TypeGet();

// Multiply should never be using small types
assert(!varTypeIsSmall(node->TypeGet()));
Expand All @@ -7977,6 +7978,16 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
else if (node->OperGet() == GT_MUL_LONG)
{
hasImpliedFirstOperand = true;
// GT_MUL_LONG produce ULONG or LONG but work on UINT or INT
if (nodeType == TYP_LONG)
{
nodeType = TYP_INT;
}
else
{
assert(nodeType == TYP_ULONG);
nodeType = TYP_UINT;
}
Comment thread
Daniel-Svensson marked this conversation as resolved.
Outdated
}
#endif
else if (IsContainableImmed(node, op2) || IsContainableImmed(node, op1))
Expand Down Expand Up @@ -8013,7 +8024,7 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
//
if (memOp == nullptr)
{
if ((op2->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op2))
if ((op2->TypeGet() == nodeType) && IsContainableMemoryOp(op2))
{
isSafeToContainOp2 = IsSafeToContainMem(node, op2);
if (isSafeToContainOp2)
Expand All @@ -8022,7 +8033,7 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
}
}

if ((memOp == nullptr) && (op1->TypeGet() == node->TypeGet()) && IsContainableMemoryOp(op1))
if ((memOp == nullptr) && (op1->TypeGet() == nodeType) && IsContainableMemoryOp(op1))
{
isSafeToContainOp1 = IsSafeToContainMem(node, op1);
if (isSafeToContainOp1)
Expand All @@ -8033,7 +8044,7 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
}
else
{
if ((memOp->TypeGet() != node->TypeGet()))
if ((memOp->TypeGet() != nodeType))
{
memOp = nullptr;
}
Expand All @@ -8058,6 +8069,11 @@ void Lowering::ContainCheckMul(GenTreeOp* node)
if (memOp != nullptr)
{
MakeSrcContained(node, memOp);
// Swap the operands so that the contained memory operand is always op2.
if (memOp == op1)
Comment thread
Daniel-Svensson marked this conversation as resolved.
Outdated
{
std::swap(node->gtOp1, node->gtOp2);
}
}
else
{
Expand Down
15 changes: 14 additions & 1 deletion src/coreclr/jit/lsrabuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,20 @@ regMaskTP LinearScan::getKillSetForMul(GenTreeOp* mulNode)
regMaskTP killMask = RBM_NONE;
#ifdef TARGET_XARCH
assert(mulNode->OperIsMul());
if (!mulNode->OperIs(GT_MUL) || (((mulNode->gtFlags & GTF_UNSIGNED) != 0) && mulNode->gtOverflowEx()))
if (!mulNode->OperIs(GT_MUL))
{
// If we can use the mulx instruction, we don't need to kill RAX
if (mulNode->IsUnsigned() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
{
// For mulx we force one arg to RDX, but we do not modify it
// keep killMask set to RBM_NONE
}
else
{
killMask = RBM_RAX | RBM_RDX;
}
}
else if (mulNode->IsUnsigned() && mulNode->gtOverflowEx())
{
killMask = RBM_RAX | RBM_RDX;
}
Expand Down
65 changes: 54 additions & 11 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,14 @@ bool LinearScan::isRMWRegOper(GenTree* tree)
}
return (!tree->gtGetOp2()->isContainedIntOrIImmed() && !tree->gtGetOp1()->isContainedIntOrIImmed());
}
#ifdef TARGET_X86
case GT_MUL_LONG:
#endif
case GT_MULHI:
{
// MUL and IMUL are RMW, but mulx (used for unsigned operands when BMI2 is available) is not
return !(tree->IsUnsigned() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2));
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking about extracting a helper method for determining if a multiply node should emit mulx since

tree->OperGet() != GT_MUL && isUnsignedMultiply && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) is used in a few places, but I did not know where to place such a helper, so I did not do it.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to look at using mulx when the operands are signed but proven to be non-negative?

If so, where would it make sense to place such a helper, and do you have a suggestion for a name, such as shouldEmitMulxForMultiplication?

}

#ifdef FEATURE_HW_INTRINSICS
case GT_HWINTRINSIC:
Expand Down Expand Up @@ -3213,18 +3221,47 @@ int LinearScan::BuildMul(GenTree* tree)
return BuildSimple(tree);
}

// ToDo-APX : imul currently doesn't have rex2 support. So, cannot use R16-R31.
int srcCount = BuildBinaryUses(tree->AsOp());
bool isUnsignedMultiply = tree->IsUnsigned();
bool requiresOverflowCheck = tree->gtOverflowEx();
bool useMulx = tree->OperGet() != GT_MUL && isUnsignedMultiply &&
compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2);

// ToDo-APX : imul currently doesn't have rex2 support. So, cannot use R16-R31.
int srcCount = 0;
int dstCount = 1;
SingleTypeRegSet dstCandidates = RBM_NONE;

bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0);
bool requiresOverflowCheck = tree->gtOverflowEx();
// Lowering has ensured that op1 is never the memory operand
assert(!op1->isUsedFromMemory());

// There are three forms of x86 multiply:
// Start with building the uses, ensuring that one of the operands is in the implicit register (RAX or RDX)
// Place first operand in implicit register, unless:
// * it is a memory address
// * or the second operand is already in the register
if (useMulx)
{
// In lowering, we place any memory operand in op2 so we default to placing op1 in RDX
Comment thread
Daniel-Svensson marked this conversation as resolved.
Outdated
// By selecting RDX here we don't have to kill it
srcCount = BuildOperandUses(op1, SRBM_RDX);
srcCount += BuildOperandUses(op2, RBM_NONE);
Copy link
Copy Markdown
Contributor Author

@Daniel-Svensson Daniel-Svensson Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is heavily inspired by how MultiplyNoFlags is implemented.

Is it safe not to kill RDX if SRBM_RDX is specified as the register here?

I hope this can produce slightly better code than always killing RDX and specifying any register instead (since RDX can then be reused).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just found #10196
and the code comment in

regMaskTP LinearScan::getKillSetForHWIntrinsic(GenTreeHWIntrinsic* node)
{
regMaskTP killMask = RBM_NONE;
#ifdef TARGET_XARCH
switch (node->GetHWIntrinsicId())
{
case NI_X86Base_MaskMove:
// maskmovdqu uses edi as the implicit address register.
// Although it is set as the srcCandidate on the address, if there is also a fixed
// assignment for the definition of the address, resolveConflictingDefAndUse() may
// change the register assignment on the def or use of a tree temp (SDSU) when there
// is a conflict, and the FixedRef on edi won't be sufficient to ensure that another
// Interval will not be allocated there.
// Issue #17674 tracks this.
killMask = RBM_EDI;
break;

Is that still an issue? (NI_AVX2_MultiplyNoFlags does not do anything similar and still seems to work)

}
else
{
SingleTypeRegSet srcCandidates1 = RBM_NONE;
// If op2 is memory then tell allocator to pass op1 in RAX
if (op2->isUsedFromMemory())
{
srcCandidates1 = SRBM_RAX;
}
srcCount = BuildRMWUses(tree, op1, op2, srcCandidates1, RBM_NONE);
}

// There are three forms of x86 multiply in base instruction set
// one-op form: RDX:RAX = RAX * r/m
// two-op form: reg *= r/m
// three-op form: reg = r/m * imm
// If the BMI2 instruction set is supported there is an additional unsigned multiply
// mulx reg1:reg2 = RDX * reg3/m

// This special widening 32x32->64 MUL is not used on x64
#if defined(TARGET_X86)
Expand All @@ -3248,16 +3285,22 @@ int LinearScan::BuildMul(GenTree* tree)
}
else if (tree->OperGet() == GT_MULHI)
{
// Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
// upper 32 bits of the result set the destination candidate to REG_RDX.
dstCandidates = SRBM_RDX;
if (!useMulx)
{
// Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
// upper 32 bits of the result set the destination candidate to REG_RDX.
dstCandidates = SRBM_RDX;
}
}
#if defined(TARGET_X86)
else if (tree->OperGet() == GT_MUL_LONG)
{
// have to use the encoding:RDX:RAX = RAX * rm
dstCandidates = SRBM_RAX | SRBM_RDX;
dstCount = 2;
dstCount = 2;
if (!useMulx)
{
// We have to use the encoding:RDX:RAX = RAX * rm
dstCandidates = SRBM_RAX | SRBM_RDX;
}
}
#endif
GenTree* containedMemOp = nullptr;
Expand Down
Loading