Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,10 @@ class CodeGen final : public CodeGenInterface

void genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed);

#if defined(TARGET_ARM64)
// Emits the code that sets REG_UNKBASE and lowers SP to make room for the
// UnknownSizeFrame. Called at the end of the ARM64 path of
// genPushCalleeSavedRegisters when compUsesUnknownSizeFrame is set.
void genUnknownSizeFrame();
#endif

#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
bool genInstrWithConstant(instruction ins,
emitAttr attr,
Expand Down
38 changes: 38 additions & 0 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4876,9 +4876,47 @@ void CodeGen::genPushCalleeSavedRegisters()
m_compiler->compFrameInfo.calleeSaveSpOffset = calleeSaveSpOffset;
m_compiler->compFrameInfo.calleeSaveSpDelta = calleeSaveSpDelta;
m_compiler->compFrameInfo.offsetSpToSavedFp = offsetSpToSavedFp;

if (m_compiler->compUsesUnknownSizeFrame)
{
genUnknownSizeFrame();
}
#endif // TARGET_ARM64
}

#if defined(TARGET_ARM64)
// See Compiler::UnknownSizeFrame for implementation details.
void CodeGen::genUnknownSizeFrame()
Comment thread
snickolls-arm marked this conversation as resolved.
{
assert(m_compiler->compLocallocUsed && m_compiler->compUsesUnknownSizeFrame);
assert(m_compiler->unkSizeFrame.isFinalized);
unsigned totalVectorCount = m_compiler->unkSizeFrame.FrameSizeInVectors();

// We reserve REG_UNKBASE for addressing SVE locals. This will always point at the top of
// of the UnknownSizeFrame and we index into it.
// TODO-SVE: We may want this to point into the middle of the frame to reduce address
// computations (we have a signed 9-bit indexing immediate).
inst_Mov(TYP_I_IMPL, REG_UNKBASE, REG_SP, false);

if (0 < totalVectorCount && totalVectorCount <= 32)
{
GetEmitter()->emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, REG_SP, REG_SP, -(ssize_t)totalVectorCount);
}
else
{
// Generate `sp = sp - totalVectorCount * VL`
assert(totalVectorCount != 0);
regNumber rsvd = rsGetRsvdReg();
// mov rsvd, #totalVectorCount
// rdvl scratch, #1
// msub sp, rsvd, scratch, sp
instGen_Set_Reg_To_Imm(EA_8BYTE, rsvd, totalVectorCount);
GetEmitter()->emitIns_R_I(INS_sve_rdvl, EA_8BYTE, REG_SCRATCH, 1);
GetEmitter()->emitIns_R_R_R_R(INS_msub, EA_8BYTE, REG_SP, rsvd, REG_SCRATCH, REG_SP);
}
}
#endif

/*****************************************************************************
*
* Generates code for a function epilog.
Expand Down
16 changes: 16 additions & 0 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3644,6 +3644,11 @@ void CodeGen::genCheckUseBlockInit()
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
continue;
}

if (m_compiler->fgVarIsNeverZeroInitializedInProlog(varNum))
{
varDsc->lvMustInit = 0;
Expand Down Expand Up @@ -4001,6 +4006,12 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,

noway_assert(varDsc->lvOnFrame);

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
// This local will belong on the UnknownSizeFrame, which will handle zeroing instead.
continue;
}

// lvMustInit can only be set for GC types or TYP_STRUCT types
// or when compInitMem is true
// or when in debug code
Expand Down Expand Up @@ -5067,6 +5078,11 @@ void CodeGen::genFnProlog()
continue;
}

if (m_compiler->lvaIsUnknownSizeLocal(varNum))
{
continue;
}

signed int loOffs = varDsc->GetStackOffset();
signed int hiOffs = varDsc->GetStackOffset() + m_compiler->lvaLclStackHomeSize(varNum);

Expand Down
170 changes: 170 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -4124,6 +4124,176 @@ class Compiler

int lvaOSRLocalTier0FrameOffset(unsigned varNum);

//------------------------- UnknownSizeFrame ---------------------------------

// NOTE(review): the definitions of these two functions are not visible here.
// Presumably lvaInitUnknownSizeFrame prepares the UnknownSizeFrame state and
// lvaAllocUnknownSizeLocal assigns local `varNum` a slot on it; confirm
// against the implementation.
void lvaInitUnknownSizeFrame();
void lvaAllocUnknownSizeLocal(unsigned varNum);

// Checked by the ARM64 prolog code (end of genPushCalleeSavedRegisters) to
// decide whether CodeGen::genUnknownSizeFrame() must be called.
bool compUsesUnknownSizeFrame;

#if defined(FEATURE_SIMD) && defined(TARGET_ARM64)
// For ARM64, the UnknownSizeFrame lives at the end of the statically
// allocated stack space. This means it belongs to the 'alloca' space on the
// frame, and it is essentially the first dynamically allocated stack
// variable.
//
// Currently, the only locals with unknown size are SIMD types supporting
// Vector<T>, TYP_SIMD and TYP_MASK. We do not know the size of these types
// at compile time, so we need to execute the rdvl/addvl instruction to
// learn this size and allocate the UnknownSizeFrame.
//
// We reserve the x19 register to point to the top of the UnknownSizeFrame
// and use this as the base address for local variables with unknown size.
// Reserving a register is simpler than using fp/sp, as fp may point
// to different locations depending on various properties of the frame, and
// the value of sp may change at runtime.
//
// Typically, a vector is loaded using a base address and some index which
// the instruction will scale by VL, for example: `ldr z0, [x19, #3 MUL VL]`.
// A mask is loaded with `ldr p0, [x19, #3 MUL VL]`, but in this case the
// `MUL VL` indicates we are scaling with the length of the predicate
// register rather than the vector. A predicate register is defined to have
// 1/8th the length of a vector register.
//
// We know that sizeof(TYP_SIMD) and sizeof(TYP_MASK) are invariant despite
// being unknown at compile time, so we allocate them in single homogeneous
// blocks per type. An individual local can be referenced from the start of
// its block by an index into the block.
//
// The difference in addressing-mode index scaling means we have to be
// careful where we place the mask locals block with respect to the vector
// locals block. If we place the mask locals after the vector locals, we'll
// need to offset the load index by (8 * nVector) to account for the vector
// locals.
//
// Instead, we choose to pad the mask locals block to VL and place it at the
// beginning of the frame (closest to fp). This way we'll need to offset
// vector load indices by `roundUp(nMask, 8) / 8`. This is less likely to
// put pressure on the immediate encoding range and result in requiring an
// address computation.
//
// The maximum wasted space from the padding is 7/8ths VL (224 bytes with
// the architectural maximum 256 byte vectors), which occurs when 1 mask
// local is spilled to the frame. Alternatively this is 28 bytes for 32 byte
// vectors, for an example closer to today's implementations.
//
// The padding also makes it simple to allocate the UnknownSizeFrame since
// the UnknownSizeFrame will be aligned to VL. The total number of vectors
// to allocate is `(roundUp(nMask, 8) / 8) + nVector`. The stack pointer
// can be adjusted with a single instruction `addvl sp, sp, #totalVectors`.
//
// See the diagram below for a visual representation of this scheme.
//
// ...
// | static space |
// | (totalFrameSize) |
// +----------------------------------+ x19, begin UnknownSizeFrame
// | mask locals block | ^
// | (nMask * VL/8) | |
// +----------------------------------+ |
// | padding to VL alignment | |
// +----------------------------------+ (roundUp(nMask, 8)/8 + nVector)*VL
// | | |
// | vector locals block | |
// | (nVector * VL) | |
// | | v
// +----------------------------------+ end UnknownSizeFrame
// | |
// | rest of alloca space |
// ... sp
// Allocation state for the UnknownSizeFrame: two bump allocators (one for
// mask locals, one for vector locals) plus the helpers that turn an allocated
// index into a frame-base-relative offset. All sizes are expressed in units of
// the vector length (VL), or VL/8 for masks, never in bytes.
struct UnknownSizeFrame
{
    // Number of allocated vectors/masks. These also represent the end of
    // the allocation space for each block. The allocator for each block is
    // a simple bump allocator.
    unsigned nVector = 0;
    unsigned nMask   = 0;

#ifdef DEBUG
    // Set by Finalize(). Only ever read from asserts, so it exists in DEBUG
    // builds only.
    bool isFinalized = false;
#endif

    // Returns the size of the mask block in number of vector lengths.
    // Eight masks fit in one vector length, so the block is padded up to a
    // whole number of vectors.
    unsigned MaskBlockSizeInVectors() const
    {
        // An empty mask block must not contribute any padding.
        assert(roundUp(0U, 8U) == 0);
        return roundUp(nMask, 8U) / 8;
    }

    // Returns the size of the vector block in number of vector lengths.
    unsigned VectorBlockSize() const
    {
        return nVector;
    }

    // Returns the size of the total UnknownSizeFrame in number of vector
    // lengths.
    unsigned FrameSizeInVectors() const
    {
        return MaskBlockSizeInVectors() + VectorBlockSize();
    }

    // Allocate a mask, returning an index of the mask in the mask block.
    unsigned AllocMask()
    {
        assert(!isFinalized);
        return nMask++;
    }

    // Allocate a vector, returning an index of the vector in the vector
    // block.
    unsigned AllocVector()
    {
        assert(!isFinalized);
        return nVector++;
    }

    // Returns a negative offset relative to the base of the UnknownSizeFrame
    // for addressing an allocated vector or mask local.
    //
    // Arguments:
    //    index  - slot index previously returned by AllocMask() (when
    //             `isMask` is true) or AllocVector() (otherwise).
    //    isMask - selects which block `index` refers to.
    //
    // Return Value:
    //    For a mask, the offset is measured in units of VL/8. For a vector,
    //    it is measured in units of VL and already skips over the padded
    //    mask block.
    int GetOffset(unsigned index, bool isMask = false) const
    {
        // We can't compute addresses if we haven't finished allocating.
        assert(isFinalized);

        unsigned slot;
        if (isMask)
        {
            assert(index < nMask);
            slot = index;
        }
        else
        {
            assert(index < nVector);
            slot = MaskBlockSizeInVectors() + index;
        }

        // Slot 0 occupies the first element below the frame base, so every
        // slot is addressed at -(slot + 1) from the base.
        return -(int)(slot + 1);
    }

    // This system ensures we don't try and generate an address on the frame
    // without finishing all allocations.
    void Finalize()
    {
#ifdef DEBUG
        isFinalized = true;
#endif
    }

} unkSizeFrame;
#endif

//------------------------ For splitting types ----------------------------

void lvaInitTypeRef();
Expand Down
14 changes: 12 additions & 2 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2744,7 +2744,7 @@ inline
#endif // !TARGET_AMD64
}

FPbased = varDsc->lvFramePointerBased;
FPbased = varDsc->lvFramePointerBased && !lvaIsUnknownSizeLocal(varNum);

#ifdef DEBUG
#if FEATURE_FIXED_OUT_ARGS
Expand All @@ -2765,7 +2765,17 @@ inline
}
#endif // DEBUG

varOffset = varDsc->GetStackOffset();
#ifdef TARGET_ARM64
if (lvaIsUnknownSizeLocal(varNum) && !varDsc->lvIsStructField)
{
assert(!FPbased);
varOffset = unkSizeFrame.GetOffset(varDsc->GetStackOffset(), varDsc->TypeIs(TYP_MASK));
}
else
#endif
Comment thread
snickolls-arm marked this conversation as resolved.
Outdated
{
varOffset = varDsc->GetStackOffset();
}
}
else // Its a spill-temp
{
Expand Down
Loading
Loading