1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
@@ -6229,6 +6229,7 @@ class Compiler
void fgConvertBBToThrowBB(BasicBlock* block);

bool fgCastNeeded(GenTree* tree, var_types toType);
bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false);

void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB);
void fgLoopCallMark();
19 changes: 10 additions & 9 deletions src/coreclr/jit/decomposelongs.cpp
@@ -138,6 +138,8 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
// On x86, long->floating casts are implemented in DecomposeCast.
// Those nodes, plus any nodes that produce a long, will be examined.
if (!tree->TypeIs(TYP_LONG) &&
!(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
#else
@@ -159,6 +161,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
// Here we do a conservative check for specific cases where it is certain the load/store
// can be contained. In those cases, we can skip decomposition.
//
// We also look for longs consumed directly by a long->floating cast. These can skip
// decomposition because the cast is implemented using HWIntrinsics.

GenTree* user = use.User();

@@ -589,21 +594,17 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
// The sequence this creates is simply:
// AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()

NamedIntrinsic intrinsicId = NI_Illegal;
GenTree* srcOp = cast->CastOp();
var_types dstType = cast->CastToType();
var_types baseFloatingType = (dstType == TYP_FLOAT) ? TYP_FLOAT : TYP_DOUBLE;
var_types baseIntegralType = cast->IsUnsigned() ? TYP_ULONG : TYP_LONG;
NamedIntrinsic intrinsicId = NI_Illegal;
GenTree* srcOp = cast->CastOp();

assert(!cast->gtOverflow());
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));

intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;

GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
GenTree* convert =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcType, 16);
GenTree* convert = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, srcType, 16);
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, dstType, 16);

Range().InsertAfter(cast, createScalar, convert, toScalar);
Range().Remove(cast);
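For reference, here is a standalone sketch (not the JIT's actual codegen) of the scalar sequence the comment above describes for long->double on x86 when AVX-512 is available, written with C++ intrinsics; _mm_cvtepi64_pd stands in for AVX512DQ.VL.ConvertToVector128Double, and the helper name LongToDouble is made up for illustration.

// Sketch only: mirrors Vector128.CreateScalarUnsafe(LONG) -> ConvertToVector128Double -> ToScalar().
// Requires AVX512DQ+AVX512VL (e.g. compile with -mavx512dq -mavx512vl).
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static double LongToDouble(int64_t value)
{
    __m128i vec = _mm_loadl_epi64((const __m128i*)&value); // CreateScalarUnsafe(LONG)
    __m128d cvt = _mm_cvtepi64_pd(vec);                    // AVX512DQ.VL.ConvertToVector128Double
    return _mm_cvtsd_f64(cvt);                             // ToScalar()
}

int main()
{
    printf("%f\n", LongToDouble((int64_t)1 << 40)); // 1099511627776.0
    return 0;
}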
38 changes: 38 additions & 0 deletions src/coreclr/jit/flowgraph.cpp
@@ -1271,6 +1271,44 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType)
return true;
}

//-------------------------------------------------------------------------------------
// fgCastRequiresHelper: Check whether a given cast must be converted to a helper call.
//
// Arguments:
// fromType - The source type of the cast.
// toType - The target type of the cast.
// overflow - True if the cast has the GTF_OVERFLOW flag set.
//
// Return Value:
// True if the cast requires a helper call, otherwise false.
//
bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */)
{
if (overflow && varTypeIsFloating(fromType))
{
assert(varTypeIsIntegral(toType));
return true;
}

#if !defined(TARGET_64BIT)
if (varTypeIsFloating(fromType) && varTypeIsLong(toType))
{
return true;
}

if (varTypeIsLong(fromType) && varTypeIsFloating(toType))
{
#if defined(TARGET_X86)
return !compOpportunisticallyDependsOn(InstructionSet_AVX512);
#else
return true;
#endif // TARGET_X86
}
#endif // !TARGET_64BIT

return false;
}

GenTree* Compiler::fgGetCritSectOfStaticMethod()
{
noway_assert(!compIsForInlining());
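To make the new predicate concrete, below is a minimal standalone sketch (not JIT code) that mirrors the decision logic above; the VarType enum, the is64Bit/hasAvx512 flags, and castRequiresHelper are illustrative stand-ins for var_types, TARGET_64BIT/TARGET_X86, and compOpportunisticallyDependsOn(InstructionSet_AVX512). The sketch folds the TARGET_X86 check into hasAvx512, so that flag should only ever be true for x86 targets.

#include <cstdio>

enum VarType { T_INT, T_LONG, T_FLOAT, T_DOUBLE };

static bool isFloating(VarType t) { return (t == T_FLOAT) || (t == T_DOUBLE); }
static bool isLong(VarType t)     { return t == T_LONG; }

// Mirrors fgCastRequiresHelper: overflow-checked floating->integral casts always need a helper;
// on 32-bit targets, floating<->long casts need one too, except long->floating on x86 with AVX-512.
static bool castRequiresHelper(VarType from, VarType to, bool overflow, bool is64Bit, bool hasAvx512)
{
    if (overflow && isFloating(from))
    {
        return true;
    }
    if (!is64Bit)
    {
        if (isFloating(from) && isLong(to))
        {
            return true;
        }
        if (isLong(from) && isFloating(to))
        {
            return !hasAvx512; // x86 with AVX-512 handles this in DecomposeCast instead
        }
    }
    return false;
}

int main()
{
    printf("%d\n", castRequiresHelper(T_LONG, T_DOUBLE, false, false, false)); // 1: helper on plain x86
    printf("%d\n", castRequiresHelper(T_LONG, T_DOUBLE, false, false, true));  // 0: lowered via SIMD
    printf("%d\n", castRequiresHelper(T_DOUBLE, T_INT, true, true, true));     // 1: overflow check
    return 0;
}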
24 changes: 2 additions & 22 deletions src/coreclr/jit/importer.cpp
@@ -8167,28 +8167,8 @@ void Compiler::impImportBlockCode(BasicBlock* block)
goto _CONV;

_CONV:
// only converts from FLOAT or DOUBLE to an integer type
// and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls

if (varTypeIsFloating(lclTyp))
{
callNode = varTypeIsLong(impStackTop().val) ||
uns // uint->dbl gets turned into uint->long->dbl
#ifdef TARGET_64BIT
// TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK?
// TYP_BYREF could be used as TYP_I_IMPL which is long.
// TODO-CQ: remove this when we lower casts long/ulong --> float/double
// and generate SSE2 code instead of going through helper calls.
|| impStackTop().val->TypeIs(TYP_BYREF)
#endif
;
}
else
{
callNode = varTypeIsFloating(impStackTop().val->TypeGet());
}

op1 = impPopStack().val;
op1 = impPopStack().val;
callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl);

impBashVarAddrsToI(op1);

94 changes: 89 additions & 5 deletions src/coreclr/jit/lowerxarch.cpp
@@ -827,7 +827,7 @@ void Lowering::LowerCast(GenTree* tree)

GenTree* castOp = tree->AsCast()->CastOp();
var_types dstType = tree->CastToType();
var_types srcType = castOp->TypeGet();
var_types srcType = genActualType(castOp);

// force the srcType to unsigned if GT_UNSIGNED flag is set
if (tree->IsUnsigned())
@@ -844,12 +844,96 @@
// Long types should have been handled by helper call or in DecomposeLongs on x86.
assert(!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit);
}
else if (srcType == TYP_UINT)

#ifdef TARGET_X86
if ((srcType == TYP_UINT) && varTypeIsFloating(dstType) &&
!m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
{
// uint->float casts should have an intermediate cast to long unless
// we have the EVEX unsigned conversion instructions available.
assert(dstType != TYP_FLOAT || m_compiler->canUseEvexEncodingDebugOnly());
// Pre-AVX-512, there was no conversion instruction for uint->floating, so we emulate it
// using signed int conversion. This is necessary only on 32-bit, because x64 simply casts
// the uint up to a signed long before conversion.
//
// This logic depends on the fact that conversion from int to double is lossless. When
// converting to float, we use a double intermediate, and convert to float only after the
// double result is fixed up. This ensures the floating result is rounded correctly.

LABELEDDISPTREERANGE("LowerCast before", BlockRange(), tree);

LIR::Range castRange = LIR::EmptyRange();

// This creates the equivalent of the following C# code:
// var castResult = Sse2.ConvertScalarToVector128Double(Vector128<double>.Zero, (int)castOp);

GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
GenTree* castResult =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, castOp, NI_X86Base_ConvertScalarToVector128Double,
TYP_INT, 16);

castRange.InsertAtEnd(zero);
castRange.InsertAtEnd(castResult);

// We will use the conversion result multiple times, so replace it with a lclVar.
LIR::Use resUse;
LIR::Use::MakeDummyUse(castRange, castResult, &resUse);
resUse.ReplaceWithLclVar(m_compiler);
castResult = resUse.Def();

// If the input had the MSB set, it will have converted as a negative, so we must wrap the
// result back around to positive by adding 2^32. `blendvpd` uses only the MSB of the mask
// element.
//
// This creates the equivalent of the following C# code:
// var addRes = Sse2.AddScalar(castResult, Vector128.CreateScalar(4294967296.0));
// castResult = Sse41.BlendVariable(castResult, addRes, castResult);

GenTreeVecCon* addCns = m_compiler->gtNewVconNode(TYP_SIMD16);
addCns->gtSimdVal.f64[0] = 4294967296.0;

GenTree* addRes =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult, addCns, NI_X86Base_AddScalar, TYP_DOUBLE, 16);

castRange.InsertAtEnd(addCns);
castRange.InsertAtEnd(addRes);

GenTree* resClone1 = m_compiler->gtClone(castResult);
GenTree* resClone2 = m_compiler->gtClone(castResult);
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, resClone1, addRes, resClone2,
NI_X86Base_BlendVariable, TYP_DOUBLE, 16);
castRange.InsertAtEnd(resClone1);
castRange.InsertAtEnd(resClone2);
castRange.InsertAtEnd(castResult);

// Convert to float if necessary, then ToScalar() the result out.
if (dstType == TYP_FLOAT)
{
castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult,
NI_X86Base_ConvertToVector128Single, TYP_DOUBLE, 16);
castRange.InsertAtEnd(castResult);
}

GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, castResult, dstType, 16);
castRange.InsertAtEnd(toScalar);

LIR::ReadOnlyRange lowerRange(castRange.FirstNode(), castRange.LastNode());
BlockRange().InsertBefore(tree, std::move(castRange));

LABELEDDISPTREERANGE("LowerCast after", BlockRange(), toScalar);

LIR::Use castUse;
if (BlockRange().TryGetUse(tree, &castUse))
{
castUse.ReplaceWith(toScalar);
}
else
{
toScalar->SetUnusedValue();
}

BlockRange().Remove(tree);
LowerRange(lowerRange);
return;
}
#endif // TARGET_X86

if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType))
{
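As a standalone illustration of the emulation the new lowering builds for uint->floating when AVX-512 is unavailable, here is a sketch using C++ SSE2/SSE4.1 intrinsics; UIntToDouble is a made-up helper name, and _mm_cvtsi32_sd, _mm_add_sd, and _mm_blendv_pd stand in for the ConvertScalarToVector128Double, AddScalar, and BlendVariable HWIntrinsic nodes created above.

// Sketch only: convert the uint as a *signed* int32 (lossless into double), then add 2^32 back
// when the sign bit was set. blendvpd keys off the MSB of the mask element, as noted above.
// Compile with -msse4.1.
#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

static double UIntToDouble(uint32_t value)
{
    __m128d zero   = _mm_setzero_pd();
    __m128d conv   = _mm_cvtsi32_sd(zero, (int32_t)value); // signed int32 -> double
    __m128d twoP32 = _mm_set_sd(4294967296.0);             // 2^32
    __m128d addRes = _mm_add_sd(conv, twoP32);             // conv + 2^32
    // Select addRes where conv's sign bit is set (input had its MSB set), else keep conv.
    __m128d fixedUp = _mm_blendv_pd(conv, addRes, conv);
    return _mm_cvtsd_f64(fixedUp);
}

int main()
{
    printf("%f\n", UIntToDouble(0x80000000u)); // 2147483648.0, not -2147483648.0
    printf("%f\n", UIntToDouble(123u));        // 123.0
    return 0;
}

A float destination would add one final double->float conversion on the fixed-up value, matching the comment above about rounding to float only after the fixup.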