Commit 4df5d8e6 authored by fftzeng's avatar fftzeng Committed by gbsbuild

1. Add dot(N, L) early out support.

2. Separate early-out control for CS (compute shaders) and PS (pixel shaders)

Change-Id: If3afd574db92be102f082fd3e2a9c10a0a411de2
parent 44826958
......@@ -1340,6 +1340,10 @@ void OptimizeIR(CodeGenContext* pContext)
mpm.add(llvm::createDeadCodeEliminationPass());
mpm.add(llvm::createEarlyCSEPass());
if (pContext->type == ShaderType::COMPUTE_SHADER)
{
mpm.add(CreateEarlyOutPatternsPass());
}
if(pContext->type == ShaderType::PIXEL_SHADER)
{
// insert early output in case sampleC returns 0
......@@ -1360,9 +1364,12 @@ void OptimizeIR(CodeGenContext* pContext)
mpm.add(createIGCIndirectICBPropagaionPass());
}
if(pContext->type == ShaderType::PIXEL_SHADER)
if(pContext->type == ShaderType::PIXEL_SHADER || pContext->type == ShaderType::COMPUTE_SHADER)
{
mpm.add(CreateEarlyOutPatternsPass());
}
if(pContext->type == ShaderType::PIXEL_SHADER)
{
mpm.add(createBlendToDiscardPass());
}
......
......@@ -1441,4 +1441,47 @@ void appendToUsed(llvm::Module &M, ArrayRef<GlobalValue *> Values)
GV->setSection("llvm.metadata");
}
// Recursive worker for safeScheduleUp().
//
// Unlike the public entry point, the visited set is taken by reference so that
// an instruction reached through several operand paths (a shared
// sub-expression) is processed exactly once. Passing the set by value at every
// recursion level discards the bookkeeping of sibling sub-trees, which makes
// the "already scheduled" check ineffective and lets the walk revisit shared
// operands once per path.
static bool safeScheduleUpImpl(llvm::BasicBlock *BB, llvm::Value *V, llvm::Instruction *&InsertPos, llvm::DenseSet<llvm::Instruction *> &Scheduled)
{
    llvm::Instruction *I = llvm::dyn_cast<llvm::Instruction>(V);
    // Constants, arguments, etc. need no scheduling.
    if (!I)
        return false;
    // Skip values defined in other BBs.
    if (I->getParent() != BB)
        return false;
    // Skip phi-nodes as they are eventually defined in other BBs.
    if (llvm::isa<llvm::PHINode>(I))
        return false;
    // Don't re-schedule an instruction; only make sure the insertion position
    // does not end up in front of it.
    if (Scheduled.count(I)) {
        if (InsertPos && !isInstPrecede(I, InsertPos))
            InsertPos = I;
        return false;
    }

    bool Changed = false;
    // Schedule all operands first so that definitions stay ahead of this use.
    for (auto OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI)
        Changed |= safeScheduleUpImpl(BB, OI->get(), InsertPos, Scheduled);
    // Mark this instruction `visited`.
    Scheduled.insert(I);

    // Nothing to move if the instruction is not after the insertion position.
    if (InsertPos && isInstPrecede(I, InsertPos))
        return Changed;

    // Move the instruction right behind the current insertion position. When
    // no position has been established yet, the instruction stays in place.
    if (InsertPos) {
        I->removeFromParent();
        I->insertAfter(InsertPos);
    }
    // In both cases this instruction becomes the new insertion position.
    InsertPos = I;
    return true;
}

// Hoists V and, recursively, every instruction it depends on inside BB so that
// they are placed behind InsertPos.
//
//   BB        - block whose instructions may be moved; operands defined in
//               other blocks and PHI nodes are left untouched.
//   V         - root value of the dependency chain to hoist.
//   InsertPos - in/out: instruction after which moved instructions are
//               inserted. It is advanced to the last instruction placed. If it
//               is null on entry, the first eligible instruction reached
//               becomes the position without being moved.
//   Scheduled - instructions that must be treated as already scheduled. It is
//               taken by value (matching the declaration), so the caller's set
//               is not updated.
//
// Returns true if an instruction was moved or selected as the new insertion
// position.
bool safeScheduleUp(llvm::BasicBlock *BB, llvm::Value *V, llvm::Instruction *&InsertPos, llvm::DenseSet<llvm::Instruction *> Scheduled)
{
    // Share one visited set across the whole recursion.
    return safeScheduleUpImpl(BB, V, InsertPos, Scheduled);
}
} // namespace IGC
......@@ -40,6 +40,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/PassManager.h>
#include <llvm/ADT/SmallSet.h>
#include <llvm/ADT/DenseSet.h>
#include "common/LLVMWarningsPop.hpp"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
......@@ -324,6 +325,8 @@ inline float GetThreadOccupancyPerSubslice(SIMDMode simdMode, unsigned threadGro
// Global can now be any pointer type that uses addrspace
void appendToUsed(llvm::Module &M, llvm::ArrayRef<llvm::GlobalValue *> Values);
// Moves V and, recursively, the non-PHI instructions of BB that V depends on so
// that they follow InsertPos; InsertPos is updated to the last instruction
// placed. Values defined outside BB are skipped. Scheduled holds instructions
// to treat as already handled; it is passed by value, so the caller's set is
// not modified. Returns true if an instruction was moved or became the new
// insertion position.
bool safeScheduleUp(llvm::BasicBlock *BB, llvm::Value *V, llvm::Instruction *&InsertPos, llvm::DenseSet<llvm::Instruction *> Scheduled);
inline unsigned GetHwThreadsPerWG(const IGC::CPlatform& platform)
{
unsigned hwThreadPerWorkgroup = platform.getMaxNumberThreadPerSubslice();
......
......@@ -2339,7 +2339,8 @@ class EarlyOutPatterns : public FunctionPass
public:
static char ID;
EarlyOutPatterns() : FunctionPass(ID)
EarlyOutPatterns() : FunctionPass(ID),
m_ctx(nullptr)
{
}
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const
......@@ -2348,15 +2349,16 @@ public:
}
virtual bool runOnFunction(Function &F);
virtual bool processBlock(BasicBlock* BB);
virtual llvm::StringRef getPassName() const
{
return "EarlyOutPatterns";
}
private:
static bool processBlock(BasicBlock* BB);
static bool canOptimizeSampleInst(SmallVector<Instruction*, 4> &Channels, GenIntrinsicInst *GII);
static bool canOptimizeDotProduct(SmallVector<Instruction*, 4> &Values, Instruction *I);
static bool canOptimizeNdotL(SmallVector<Instruction*, 4> &Values, FCmpInst *FC);
static bool DotProductMatch(const Instruction *I);
static bool DotProductSourceMatch(const Instruction *I);
static BasicBlock* tryFoldAndSplit(
......@@ -2389,6 +2391,7 @@ private:
Instruction *Root,
ArrayRef<Instruction*> Values,
const DenseSet<const Value*> &FoldedVals);
CodeGenContext *m_ctx;
};
char EarlyOutPatterns::ID = 0;
......@@ -2400,9 +2403,9 @@ FunctionPass* IGC::CreateEarlyOutPatternsPass()
bool EarlyOutPatterns::runOnFunction(Function &F)
{
auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
m_ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
if (IGC_IS_FLAG_ENABLED(DisableEarlyOutPatterns) ||
pCtx->m_DriverInfo.WaNOSNotResolved())
m_ctx->m_DriverInfo.WaNOSNotResolved())
{
return false;
}
......@@ -2502,6 +2505,13 @@ bool EarlyOutPatterns::FoldsToZero(const Instruction* inst, const Value* use, co
return false;
}
}
else if (auto* inst = dyn_cast<Instruction>(use))
{
if (inst->getOpcode() == Instruction::SExt)
{
return true;
}
}
return false;
}
......@@ -2732,7 +2742,6 @@ bool EarlyOutPatterns::DotProductMatch(const Instruction *I)
Value *Z2 = nullptr;
// dp3
return match(I,
m_FAdd(
m_FMul(m_Value(Z1), m_Value(Z2)),
......@@ -2750,6 +2759,113 @@ bool EarlyOutPatterns::DotProductSourceMatch(const Instruction *I)
return false;
}
// Checks whether FC is the comparison of a lighting-style "dot(N, L) > 0"
// pattern and, if so, hoists the computation of FC towards the top of its
// basic block.
//
//   Values - not read or written by this function; the parameter is kept so
//            the signature is unchanged.
//   FC     - candidate floating-point compare.
//
// Returns true when the pattern matches. NOTE: on a match this function also
// modifies the IR (it reorders instructions inside FC's block through
// safeScheduleUp), even though its name suggests a pure query.
bool EarlyOutPatterns::canOptimizeNdotL(SmallVector<Instruction*, 4> &Values, FCmpInst *FC)
{
    // this function checks the lighting pattern -
    // in short, the shader has a dot(N, L), multiply it with color, and max with 0.
    // if so, we might benefit from checking the dot(N, L) > 0 before calculating the color
    // LLVM example:
    // %319 = from dp3
    // %res_s231 = fcmp fast ogt float %319, 0.000000e+00 -> function input parameter FC
    // %selResi32_s232 = sext i1 %res_s231 to i32
    // %res_s246 = and i32 %339, %selResi32_s232
    // %res_s247 = and i32 %340, %selResi32_s232
    // %res_s248 = and i32 %341, %selResi32_s232
    // %342 = bitcast i32 %res_s246 to float
    // %343 = bitcast i32 %res_s247 to float
    // %344 = bitcast i32 %res_s248 to float
    // %res_s249 = fmul fast float %res_s224, %342
    // %res_s250 = fmul fast float %res_s225, %343
    // %res_s251 = fmul fast float %res_s226, %344
    // %345 = call fast float @llvm.maxnum.f32(float %res_s249, float 0.000000e+00)
    // %346 = call fast float @llvm.maxnum.f32(float %res_s250, float 0.000000e+00)
    // %347 = call fast float @llvm.maxnum.f32(float %res_s251, float 0.000000e+00)
    // ========================================================================================

    // FC must be "fcmp ogt <x>, 0.0" with a single user.
    // %res_s231 = fcmp fast ogt float %319, 0.000000e+00, !dbg !278
    ConstantFP* src1 = dyn_cast<ConstantFP>(FC->getOperand(1));
    if (FC->getPredicate() != FCmpInst::FCMP_OGT || !FC->hasOneUse() || !src1 || !src1->isZero())
    {
        return false;
    }

    // The compared value must come from a dp3.
    Instruction *src0 = dyn_cast<Instruction>(FC->getOperand(0));
    if (!src0 || !DotProductMatch(src0))
    {
        return false;
    }

    // FC must be followed by and+mul+max.
    // A sext is needed between "fcmp" and "and", and its result should have
    // exactly 3 uses - x,y,z component of the light ray.
    Instruction *sextInst = dyn_cast<Instruction>(*FC->user_begin());
    if (!sextInst || sextInst->getOpcode() != Instruction::SExt || !sextInst->hasNUses(3))
    {
        return false;
    }

    for (auto iter = sextInst->user_begin(); iter != sextInst->user_end(); ++iter)
    {
        // %res_s246 = and i32 %339, %selResi32_s232
        BinaryOperator* andInst = dyn_cast<BinaryOperator>(*iter);
        if (!andInst || andInst->getOpcode() != BinaryOperator::And || !andInst->hasOneUse())
        {
            return false;
        }

        // %342 = bitcast i32 %res_s246 to float
        BitCastInst *bitCastInst = dyn_cast<BitCastInst>(*andInst->user_begin());
        if (!bitCastInst || !bitCastInst->hasOneUse())
        {
            return false;
        }

        // Follow the chain of single-use fmuls below the bitcast until a
        // max-with-zero is reached. The channel only matches if that max is
        // actually found: a chain that stops early (for example at an fmul
        // with several users) must not be accepted silently.
        bool foundMaxWithZero = false;
        Instruction* tempInst = dyn_cast<Instruction>(*bitCastInst->user_begin());
        while (tempInst)
        {
            // %345 = call fast float @llvm.maxnum.f32(float %res_s249, float 0.000000e+00)
            CallInst* CI = dyn_cast<CallInst>(tempInst);
            if (CI && GetOpCode(CI) == llvm_max)
            {
                ConstantFP* maxSrc1 = dyn_cast<ConstantFP>(CI->getOperand(1));
                foundMaxWithZero = maxSrc1 && maxSrc1->isZero();
                break;
            }

            // Only a single-use fmul lets us keep walking down; user_begin()
            // is only meaningful as "the" user when there is exactly one.
            if (tempInst->getOpcode() != BinaryOperator::FMul || !tempInst->hasOneUse())
            {
                break;
            }
            tempInst = dyn_cast<Instruction>(*tempInst->user_begin());
        }

        if (!foundMaxWithZero)
        {
            // not max with 0 at the end of the mul chain, skip the optimization
            return false;
        }
    }

    // find all instructions contribute to FC and safely move them up to skip as many instructions as possible after early out
    DenseSet<llvm::Instruction *> Scheduled;
    BasicBlock *BB = FC->getParent();
    Instruction *InsertPos = &*BB->getFirstInsertionPt();
    safeScheduleUp(BB, FC, InsertPos, Scheduled);

    return true;
}
bool EarlyOutPatterns::canOptimizeSampleInst(SmallVector<Instruction*, 4> &Channels, GenIntrinsicInst *GII)
{
auto ID = GII->getIntrinsicID();
......@@ -2817,15 +2933,27 @@ bool EarlyOutPatterns::processBlock(BasicBlock* BB)
{
bool Changed = false;
bool BBSplit = true;
bool SamplePatternEnable = 0;
bool DPMaxPatternEnable = 0;
bool DPFSatPatternEnable = 0;
bool NdotLPatternEnable = 0;
// Each pattern below is given a bit to toggle on/off
// to isolate the performance for each individual pattern.
const bool SamplePatternEnable =
(IGC_GET_FLAG_VALUE(EarlyOutPatternSelect) & 0x1) != 0;
const bool DPMaxPatternEnable =
(IGC_GET_FLAG_VALUE(EarlyOutPatternSelect) & 0x2) != 0;
const bool DPFSatPatternEnable =
(IGC_GET_FLAG_VALUE(EarlyOutPatternSelect) & 0x4) != 0;
if (m_ctx->type == ShaderType::COMPUTE_SHADER)
{
SamplePatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectCS) & 0x1) != 0;
DPMaxPatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectCS) & 0x2) != 0;
DPFSatPatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectCS) & 0x4) != 0;
NdotLPatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectCS) & 0x8) != 0;
}
else if (m_ctx->type == ShaderType::PIXEL_SHADER)
{
SamplePatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectPS) & 0x1) != 0;
DPMaxPatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectPS) & 0x2) != 0;
DPFSatPatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectPS) & 0x4) != 0;
NdotLPatternEnable = (IGC_GET_FLAG_VALUE(EarlyOutPatternSelectPS) & 0x8) != 0;
}
while (BBSplit)
{
......@@ -2871,6 +2999,12 @@ bool EarlyOutPatterns::processBlock(BasicBlock* BB)
break;
}
}
else if (auto *FC = dyn_cast<FCmpInst>(&II))
{
OptCandidate = NdotLPatternEnable &&
canOptimizeNdotL(Values, FC) && canOptimizeDotProduct(Values, &II);
}
if (OptCandidate)
{
......
......@@ -222,7 +222,8 @@ DECLARE_IGC_REGKEY(bool, EnablePreRARematFlag, true, "Enable PreRA Rem
DECLARE_IGC_REGKEY(bool, EnableGASResolver, true, "Enable GAS Resolver")
DECLARE_IGC_REGKEY(bool, DisableRecompilation, false, "Disable recompilation")
DECLARE_IGC_REGKEY(bool, DisableEarlyOutPatterns, false, "Disable optimization trying to create an early out after sampleC messages")
DECLARE_IGC_REGKEY(DWORD, EarlyOutPatternSelect, 0xf, "Each bit selects a pattern match to enable/disable. All on by default.")
// Bit assignments as consumed by EarlyOutPatterns::processBlock:
//   0x1 -> SamplePatternEnable, 0x2 -> DPMaxPatternEnable,
//   0x4 -> DPFSatPatternEnable, 0x8 -> NdotLPatternEnable.
// The PS key is read for pixel shaders, the CS key for compute shaders.
DECLARE_IGC_REGKEY(DWORD, EarlyOutPatternSelectPS, 0x7, "Each bit selects a pattern match to enable/disable.")
DECLARE_IGC_REGKEY(DWORD, EarlyOutPatternSelectCS, 0x8, "Each bit selects a pattern match to enable/disable.")
DECLARE_IGC_REGKEY(bool, EnableReasso, false, "Enable reassociation")
DECLARE_IGC_REGKEY(bool, EnableOCLScratchPrivateMemory, true, "Enable the use of scratch space for private memory [OCL only]")
DECLARE_IGC_REGKEY(bool, Enable64BitEmulation, false, "Enable 64-bit emulation")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment