From d59c5e4d9ef4d0662feabefd2a850fe652f35afc Mon Sep 17 00:00:00 2001 From: Rahil Shah Date: Tue, 13 Mar 2018 14:16:43 -0400 Subject: [PATCH] Add jProfilingRecompilation test in loop When jProfiling in compilation is enabled, add a test after asynccheck node in loop to calculate the frequency and trip up recompilation of the method if reached certain threshold. Signed-off-by: Rahil Shah --- runtime/compiler/build/files/common.mk | 1 + .../optimizer/J9OptimizationManager.cpp | 5 +- runtime/compiler/optimizer/J9Optimizer.cpp | 8 + .../compiler/optimizer/J9TransformUtil.cpp | 16 ++ .../compiler/optimizer/J9TransformUtil.hpp | 1 + .../compiler/optimizer/JProfilingBlock.cpp | 90 ++----- .../compiler/optimizer/JProfilingBlock.hpp | 4 +- .../optimizer/JProfilingRecompLoopTest.cpp | 249 ++++++++++++++++++ .../optimizer/JProfilingRecompLoopTest.hpp | 59 +++++ runtime/compiler/optimizer/Optimizations.enum | 3 +- runtime/compiler/runtime/J9Profiler.cpp | 109 ++++++++ runtime/compiler/runtime/J9Profiler.hpp | 4 +- 12 files changed, 472 insertions(+), 77 deletions(-) create mode 100644 runtime/compiler/optimizer/JProfilingRecompLoopTest.cpp create mode 100644 runtime/compiler/optimizer/JProfilingRecompLoopTest.hpp diff --git a/runtime/compiler/build/files/common.mk b/runtime/compiler/build/files/common.mk index e8e53bd81d3..268afbea89e 100644 --- a/runtime/compiler/build/files/common.mk +++ b/runtime/compiler/build/files/common.mk @@ -43,6 +43,7 @@ JIT_PRODUCT_BACKEND_SOURCES+=\ compiler/optimizer/JitProfiler.cpp \ compiler/optimizer/JProfilingBlock.cpp \ compiler/optimizer/JProfilingValue.cpp \ + compiler/optimizer/JProfilingRecompLoopTest.cpp \ compiler/optimizer/LiveVariablesForGC.cpp \ compiler/optimizer/LoopAliasRefiner.cpp \ compiler/optimizer/MonitorElimination.cpp \ diff --git a/runtime/compiler/optimizer/J9OptimizationManager.cpp b/runtime/compiler/optimizer/J9OptimizationManager.cpp index 74cfa10c3cd..75bb418ed5d 100644 ---
a/runtime/compiler/optimizer/J9OptimizationManager.cpp +++ b/runtime/compiler/optimizer/J9OptimizationManager.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2000, 2017 IBM Corp. and others + * Copyright (c) 2000, 2018 IBM Corp. and others * * This program and the accompanying materials are made available under * the terms of the Eclipse Public License 2.0 which accompanies this @@ -118,6 +118,9 @@ J9::OptimizationManager::OptimizationManager(TR::Optimizer *o, OptimizationFacto case OMR::jProfilingBlock: _flags.set(doesNotRequireAliasSets); break; + case OMR::jProfilingRecompLoopTest: + _flags.set(requiresStructure); + break; default: // do nothing break; diff --git a/runtime/compiler/optimizer/J9Optimizer.cpp b/runtime/compiler/optimizer/J9Optimizer.cpp index b95bcae978c..70d5339fa56 100644 --- a/runtime/compiler/optimizer/J9Optimizer.cpp +++ b/runtime/compiler/optimizer/J9Optimizer.cpp @@ -72,6 +72,7 @@ #include "optimizer/OSRGuardRemoval.hpp" #include "optimizer/JProfilingBlock.hpp" #include "optimizer/JProfilingValue.hpp" +#include "optimizer/JProfilingRecompLoopTest.hpp" #include "runtime/J9Profiler.hpp" #include "optimizer/UnsafeFastPath.hpp" #include "optimizer/VarHandleTransformer.hpp" @@ -279,6 +280,7 @@ static const OptimizationStrategy coldStrategyOpts[] = { OMR::rematerialization }, { OMR::compactNullChecks, OMR::IfEnabled }, { OMR::signExtendLoadsGroup, OMR::IfEnabled }, + { OMR::jProfilingRecompLoopTest, OMR::IfLoops }, { OMR::jProfilingValue, OMR::MustBeDone }, { OMR::trivialDeadTreeRemoval, }, { OMR::cheapTacticalGlobalRegisterAllocatorGroup, OMR::IfAOTAndEnabled }, @@ -364,6 +366,7 @@ static const OptimizationStrategy warmStrategyOpts[] = { OMR::globalDeadStoreElimination, OMR::IfVoluntaryOSR }, { OMR::arraysetStoreElimination }, { OMR::checkcastAndProfiledGuardCoalescer }, + { OMR::jProfilingRecompLoopTest, OMR::IfLoops }, { OMR::jProfilingValue, OMR::MustBeDone }, { 
OMR::cheapTacticalGlobalRegisterAllocatorGroup, OMR::IfEnabled }, { OMR::globalDeadStoreGroup, }, @@ -398,6 +401,7 @@ static const OptimizationStrategy reducedWarmStrategyOpts[] = { OMR::localCSE }, { OMR::treeSimplification, OMR::MarkLastRun }, { OMR::deadTreesElimination, OMR::IfEnabled }, // cleanup at the end + { OMR::jProfilingRecompLoopTest, OMR::IfLoops }, { OMR::jProfilingValue, OMR::MustBeDone }, { OMR::cheapTacticalGlobalRegisterAllocatorGroup, OMR::IfEnabled }, { OMR::endOpts } @@ -459,6 +463,7 @@ const OptimizationStrategy hotStrategyOpts[] = { OMR::localValuePropagation, OMR::MarkLastRun }, { OMR::arraycopyTransformation }, { OMR::checkcastAndProfiledGuardCoalescer }, + { OMR::jProfilingRecompLoopTest, OMR::IfLoops }, { OMR::jProfilingValue, OMR::MustBeDone }, { OMR::tacticalGlobalRegisterAllocatorGroup, OMR::IfEnabled }, { OMR::globalDeadStoreElimination, OMR::IfMoreThanOneBlock }, // global dead store removal @@ -708,6 +713,7 @@ static const OptimizationStrategy cheapWarmStrategyOpts[] = { OMR::deadTreesElimination, OMR::IfEnabled }, // cleanup at the end { OMR::treeSimplification, OMR::IfEnabledMarkLastRun }, // Simplify non-normalized address computations introduced by prefetch insertion { OMR::trivialDeadTreeRemoval, OMR::IfEnabled }, // final cleanup before opcode expansion + { OMR::jProfilingRecompLoopTest, OMR::IfLoops }, { OMR::jProfilingValue, OMR::MustBeDone }, { OMR::cheapTacticalGlobalRegisterAllocatorGroup, OMR::IfEnabled }, { OMR::globalDeadStoreGroup, }, @@ -811,6 +817,8 @@ J9::Optimizer::Optimizer(TR::Compilation *comp, TR::ResolvedMethodSymbol *method new (comp->allocator()) TR::OptimizationManager(self(), TR_OSRGuardRemoval::create, OMR::osrGuardRemoval); _opts[OMR::jProfilingBlock] = new (comp->allocator()) TR::OptimizationManager(self(), TR_JProfilingBlock::create, OMR::jProfilingBlock); + _opts[OMR::jProfilingRecompLoopTest] = + new (comp->allocator()) TR::OptimizationManager(self(), TR_JProfilingRecompLoopTest::create, 
OMR::jProfilingRecompLoopTest); _opts[OMR::jProfilingValue] = new (comp->allocator()) TR::OptimizationManager(self(), TR_JProfilingValue::create, OMR::jProfilingValue); // NOTE: Please add new J9 optimizations here! diff --git a/runtime/compiler/optimizer/J9TransformUtil.cpp b/runtime/compiler/optimizer/J9TransformUtil.cpp index 82cc0bc5e5b..78a447f4e86 100644 --- a/runtime/compiler/optimizer/J9TransformUtil.cpp +++ b/runtime/compiler/optimizer/J9TransformUtil.cpp @@ -39,6 +39,22 @@ #include "ras/DebugCounter.hpp" #include "j9.h" #include "optimizer/OMROptimization_inlines.hpp" +#include "optimizer/Structure.hpp" + +/** + * Walks the TR_RegionStructure counting loops to get the nesting depth of the block + */ +int32_t J9::TransformUtil::getLoopNestingDepth(TR::Compilation *comp, TR::Block *block) + { + TR_RegionStructure *region = block->getParentStructureIfExists(comp->getFlowGraph()); + int32_t nestingDepth = 0; + while (region && region->isNaturalLoop()) + { + nestingDepth++; + region = region->getParent(); + } + return nestingDepth; + } /* * Generate trees for call to jitRetranslateCallerWithPrep to trigger recompilation from JIT-Compiled code. 
diff --git a/runtime/compiler/optimizer/J9TransformUtil.hpp b/runtime/compiler/optimizer/J9TransformUtil.hpp index 923e001780a..96753f16de1 100644 --- a/runtime/compiler/optimizer/J9TransformUtil.hpp +++ b/runtime/compiler/optimizer/J9TransformUtil.hpp @@ -47,6 +47,7 @@ class OMR_EXTENSIBLE TransformUtil : public OMR::TransformUtilConnector { public: static TR::TreeTop *generateRetranslateCallerWithPrepTrees(TR::Node *node, TR_PersistentMethodInfo::InfoBits reason, TR::Compilation *comp); + static int32_t getLoopNestingDepth(TR::Compilation *comp, TR::Block *block); static bool foldFinalFieldsIn(TR_OpaqueClassBlock *clazz, char *className, int32_t classNameLength, bool isStatic, TR::Compilation *comp); static TR::Node *generateArrayElementShiftAmountTrees( diff --git a/runtime/compiler/optimizer/JProfilingBlock.cpp b/runtime/compiler/optimizer/JProfilingBlock.cpp index ec8272ab8a7..d7087da96bb 100644 --- a/runtime/compiler/optimizer/JProfilingBlock.cpp +++ b/runtime/compiler/optimizer/JProfilingBlock.cpp @@ -41,7 +41,6 @@ int32_t TR_JProfilingBlock::nestedLoopRecompileThreshold = 10; int32_t TR_JProfilingBlock::loopRecompileThreshold = 250; int32_t TR_JProfilingBlock::recompileThreshold = 500; -int32_t TR_JProfilingBlock::profilingCompileThreshold = 2; /** * Prim's algorithm to compute a Minimum Spanning Tree traverses the edges of the tree @@ -831,9 +830,9 @@ void TR_JProfilingBlock::dumpCounterDependencies(TR_BitVector **componentCounter * appropriate number of method entries has occurred as determined by the raw block * count of the first block of the method. 
*/ -void TR_JProfilingBlock::addRecompilationTests(TR_BlockFrequencyInfo *blockFrequencyInfo, TR_BitVector **componentCounters) +void TR_JProfilingBlock::addRecompilationTests(TR_BlockFrequencyInfo *blockFrequencyInfo) { - // add invocation check to the top of the method + // add invocation check to the top of the method int32_t *thresholdLocation = NULL; if (comp()->getMethodSymbol()->mayHaveNestedLoops()) thresholdLocation = &nestedLoopRecompileThreshold; @@ -842,97 +841,47 @@ void TR_JProfilingBlock::addRecompilationTests(TR_BlockFrequencyInfo *blockFrequ else thresholdLocation = &recompileThreshold; - // Profiling compilations have a lower threshold, so that less time is - // spent running the high overhead implementation - if (comp()->isProfilingCompilation()) - thresholdLocation = &profilingCompileThreshold; - int32_t startBlockNumber = comp()->getStartBlock()->getNumber(); blockFrequencyInfo->setEntryBlockNumber(startBlockNumber); - TR::Node *node = comp()->getMethodSymbol()->getFirstTreeTop()->getNode(); - - if (componentCounters[startBlockNumber * 2] && (((uintptr_t)componentCounters[startBlockNumber * 2]) & 0x1 == 1 || !componentCounters[startBlockNumber * 2]->isEmpty())) + TR::Node *root = blockFrequencyInfo->generateBlockRawCountCalculationSubTree(comp(), startBlockNumber, node); + bool isProfilingCompilation = comp()->isProfilingCompilation(); + if (root != NULL) { - TR::DebugCounter::incStaticDebugCounter(comp(), TR::DebugCounter::debugCounterName(comp(), "jprofiling.instrument/success/(%s)", comp()->signature())); - comp()->getFlowGraph()->setStructure(NULL); - // add the positive counters - TR::Node *addRoot = NULL; - if (((uintptr_t)componentCounters[startBlockNumber * 2]) & 0x1 == 1) - { - TR::SymbolReference *symRef = comp()->getSymRefTab()->createKnownStaticDataSymbolRef(blockFrequencyInfo->getFrequencyForBlock(((uintptr_t)componentCounters[startBlockNumber * 2]) >> 1), TR::Int32); - addRoot = TR::Node::createWithSymRef(node, TR::iload, 0, 
symRef); - } - else - { - TR_BitVectorIterator addBVI(*(componentCounters[startBlockNumber * 2])); - while (addBVI.hasMoreElements()) - { - TR::SymbolReference *symRef = comp()->getSymRefTab()->createKnownStaticDataSymbolRef(blockFrequencyInfo->getFrequencyForBlock(addBVI.getNextElement()), TR::Int32); - TR::Node *counterLoad = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); - if (addRoot) - addRoot = TR::Node::create(node, TR::iadd, 2, addRoot, counterLoad); - else - addRoot = counterLoad; - } - } - TR::Node *subRoot = NULL; - if (componentCounters[startBlockNumber * 2 + 1] != NULL) - { - if (((uintptr_t)componentCounters[startBlockNumber * 2 + 1]) & 0x1 == 1) - { - TR::SymbolReference *symRef = comp()->getSymRefTab()->createKnownStaticDataSymbolRef(blockFrequencyInfo->getFrequencyForBlock(((uintptr_t)componentCounters[startBlockNumber * 2 + 1]) >> 1), TR::Int32); - subRoot = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); - } - else - { - TR_BitVectorIterator subBVI(*(componentCounters[startBlockNumber * 2 + 1])); - while (subBVI.hasMoreElements()) - { - TR::SymbolReference *symRef = comp()->getSymRefTab()->createKnownStaticDataSymbolRef(blockFrequencyInfo->getFrequencyForBlock(subBVI.getNextElement()), TR::Int32); - TR::Node *counterLoad = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); - if (subRoot) - { - subRoot = TR::Node::create(node, TR::isub, 2, subRoot, counterLoad); - } - else - { - subRoot = counterLoad; - } - } - } - } - TR::Node *root = addRoot; - if (subRoot) - { - root = TR::Node::create(node, TR::isub, 2, root, subRoot); - } - TR::Block * originalFirstBlock = comp()->getStartBlock(); TR::Block *guardBlock1 = TR::Block::createEmptyBlock(node, comp(), originalFirstBlock->getFrequency()); { - TR::SymbolReference *symRef = comp()->getSymRefTab()->createKnownStaticDataSymbolRef(blockFrequencyInfo->getEnableJProfilingRecompilation(), TR::Int32); + // If this is profiling compilation we do not need to check if jProfiling is 
enabled or not at runtime, + // In this case we only check if we have queued for recompilation before comparing against method invocation count. + int32_t *loadAddress = isProfilingCompilation ? blockFrequencyInfo->getIsQueuedForRecompilation() : blockFrequencyInfo->getEnableJProfilingRecompilation(); + TR::SymbolReference *symRef = comp()->getSymRefTab()->createKnownStaticDataSymbolRef(loadAddress, TR::Int32); TR::Node *enableLoad = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); - TR::Node *enableTest = TR::Node::createif(TR::ificmpeq, enableLoad, TR::Node::iconst(node, 0), originalFirstBlock->getEntry()); + TR::Node *enableTest = TR::Node::createif(TR::ificmpeq, enableLoad, TR::Node::iconst(node, -1), originalFirstBlock->getEntry()); TR::TreeTop *enableTree = TR::TreeTop::create(comp(), enableTest); enableTest->setIsProfilingCode(); guardBlock1->append(enableTree); } + static int32_t jProfilingCompileThreshold = comp()->getOptions()->getJProfilingMethodRecompThreshold(); + if (trace()) + traceMsg(comp(),"Profiling Compile Threshold for method = %d\n",isProfilingCompilation ? jProfilingCompileThreshold : *thresholdLocation); TR::Block *guardBlock2 = TR::Block::createEmptyBlock(node, comp(), originalFirstBlock->getFrequency()); - TR::Node *recompThreshold = TR::Node::createWithSymRef(node, TR::iload, 0, comp()->getSymRefTab()->createKnownStaticDataSymbolRef(thresholdLocation, TR::Int32)); + TR::Node *recompThreshold = isProfilingCompilation ? 
TR::Node::iconst(node, jProfilingCompileThreshold) : TR::Node::createWithSymRef(node, TR::iload, 0, comp()->getSymRefTab()->createKnownStaticDataSymbolRef(thresholdLocation, TR::Int32)); TR::Node *cmpFlagNode = TR::Node::createif(TR::ificmplt, root, recompThreshold, originalFirstBlock->getEntry()); TR::TreeTop *cmpFlag = TR::TreeTop::create(comp(), cmpFlagNode); cmpFlagNode->setIsProfilingCode(); guardBlock2->append(cmpFlag); // construct call block + const char * const dc1 = TR::DebugCounter::debugCounterName(comp(), + "methodRecomp/(%s)", comp()->signature()); TR::Block *callRecompileBlock = TR::Block::createEmptyBlock(node, comp(), UNKNOWN_COLD_BLOCK_COUNT); callRecompileBlock->setIsCold(true); TR::TreeTop *callTree = TR::TransformUtil::generateRetranslateCallerWithPrepTrees(node, TR_PersistentMethodInfo::RecompDueToJProfiling, comp()); callTree->getNode()->setIsProfilingCode(); callRecompileBlock->append(callTree); + TR::DebugCounter::prependDebugCounter(comp(), dc1, callTree); comp()->getRecompilationInfo()->getJittedBodyInfo()->setUsesJProfiling(); TR::CFG *cfg = comp()->getFlowGraph(); @@ -1156,10 +1105,9 @@ int32_t TR_JProfilingBlock::perform() // dump counter dependency information if (trace()) - dumpCounterDependencies(componentCounters); - + dumpCounterDependencies(componentCounters); // modify the method to add tests to trigger recompilation at runtime - addRecompilationTests(blockFrequencyInfo, componentCounters); + addRecompilationTests(blockFrequencyInfo); return 1; } diff --git a/runtime/compiler/optimizer/JProfilingBlock.hpp b/runtime/compiler/optimizer/JProfilingBlock.hpp index 8bf8d19d98f..7a2b19d327b 100644 --- a/runtime/compiler/optimizer/JProfilingBlock.hpp +++ b/runtime/compiler/optimizer/JProfilingBlock.hpp @@ -51,7 +51,6 @@ class TR_JProfilingBlock : public TR::Optimization static int32_t nestedLoopRecompileThreshold; static int32_t loopRecompileThreshold; static int32_t recompileThreshold; - static int32_t profilingCompileThreshold; 
TR_JProfilingBlock(TR::OptimizationManager *manager) : TR::Optimization(manager) {} @@ -62,13 +61,12 @@ class TR_JProfilingBlock : public TR::Optimization virtual int32_t perform(); virtual const char * optDetailString() const throw(); - protected: void computeMinimumSpanningTree(BlockParents &parents, BlockPriorityQueue &Q, TR::StackMemoryRegion &stackMemoryRegion); int32_t processCFGForCounting(BlockParents &parent, TR::BlockChecklist &countedBlocks, TR::CFGEdge &loopBack); TR_BlockFrequencyInfo *initRecompDataStructures(); void dumpCounterDependencies(TR_BitVector **componentCounters); - void addRecompilationTests(TR_BlockFrequencyInfo *blockFrequencyInfo, TR_BitVector **componentCounters); + void addRecompilationTests(TR_BlockFrequencyInfo *blockFrequencyInfo); }; #endif diff --git a/runtime/compiler/optimizer/JProfilingRecompLoopTest.cpp b/runtime/compiler/optimizer/JProfilingRecompLoopTest.cpp new file mode 100644 index 00000000000..47ca021017e --- /dev/null +++ b/runtime/compiler/optimizer/JProfilingRecompLoopTest.cpp @@ -0,0 +1,249 @@ +/******************************************************************************* + * Copyright (c) 2018, 2018 IBM Corp. and others + * + * This program and the accompanying materials are made available under + * the terms of the Eclipse Public License 2.0 which accompanies this + * distribution and is available at https://www.eclipse.org/legal/epl-2.0/ + * or the Apache License, Version 2.0 which accompanies this distribution and + * is available at https://www.apache.org/licenses/LICENSE-2.0. + * + * This Source Code may also be made available under the following + * Secondary Licenses when the conditions for such availability set + * forth in the Eclipse Public License, v. 2.0 are satisfied: GNU + * General Public License, version 2 with the GNU Classpath + * Exception [1] and GNU General Public License, version 2 with the + * OpenJDK Assembly Exception [2]. 
+ * + * [1] https://www.gnu.org/software/classpath/license.html + * [2] http://openjdk.java.net/legal/assembly-exception.html + * + * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception + *******************************************************************************/ +#include "JProfilingRecompLoopTest.hpp" +#include "il/Block.hpp" +#include "infra/Cfg.hpp" +#include "infra/TRCfgEdge.hpp" +#include "infra/Checklist.hpp" +#include "infra/ILWalk.hpp" +#include "infra/List.hpp" +#include "optimizer/Optimization_inlines.hpp" +#include "il/Node_inlines.hpp" +#include "infra/Checklist.hpp" // for TR::NodeChecklist +#include "ras/DebugCounter.hpp" +#include "control/Recompilation.hpp" // for TR_Recompilation, etc +#include "control/RecompilationInfo.hpp" // for TR_Recompilation, etc +#include "codegen/CodeGenerator.hpp" +#include "optimizer/TransformUtil.hpp" // for TransformUtil +#include "optimizer/JProfilingBlock.hpp" + +int32_t TR_JProfilingRecompLoopTest::maxLoopRecompilationThreshold = 10000; +/** + * A utility function that iterates through the TR::list of byte code info and checks if passed bytecode info exists in list + * by checking bytecode index and caller index of the bytecode info + */ +bool +TR_JProfilingRecompLoopTest::isByteCodeInfoInCurrentTestLocationList(TR_ByteCodeInfo &bci, TR::list<TR_ByteCodeInfo> &addedLocationBCIList) + { + for (auto iter = addedLocationBCIList.begin(), listEnd = addedLocationBCIList.end(); iter != listEnd; ++iter) + { + TR_ByteCodeInfo iterBCI = *iter; + if (iterBCI.getByteCodeIndex() == bci.getByteCodeIndex() && iterBCI.getCallerIndex() == bci.getCallerIndex()) + return true; + } + return false; + } + +int32_t +TR_JProfilingRecompLoopTest::perform() + { + if (comp()->getProfilingMode() != JProfiling) + { + if (trace()) + traceMsg(comp(), "JProfiling for profiling compilations has not been enabled, skip JProfilingRecompLoopTest\n"); + return 0; + } + 
RecompilationTestLocationsInfo testLocations(RecompilationTestLocationInfoAllocator(comp()->trMemory()->currentStackRegion())); + TR::TreeTop *cursor = comp()->getStartTree(); + TR::CFG *cfg = comp()->getFlowGraph(); + TR::Block *currentBlock = NULL; + TR::list<TR_ByteCodeInfo> addedLocationBCIList(comp()->trMemory()->currentStackRegion()); + while (cursor) + { + if (cursor->getNode()->getOpCodeValue() == TR::BBStart) + { + currentBlock = cursor->getNode()->getBlock(); + /** + * As we are already walking down the tree tops, we can get the enclosing block by tracking TR::BBStart nodes and currentBlock contains + * this information. + * We also keep local list of ByteCodeInfo for each test locations (asyncchecknode) in the extended basic blocks. + * This can avoid adding multiple tests for multiple locations with same byte code info in extended basic block. + * As soon as we encounter a block which is not extension of previous block, we clear the list of byte code info. + */ + if (!currentBlock->isExtensionOfPreviousBlock() && !addedLocationBCIList.empty()) + addedLocationBCIList.clear(); + } + else if (cursor->getNode()->getOpCodeValue() == TR::asynccheck) + { + TR_ASSERT_FATAL(currentBlock != NULL,"We should have encountered BBStart before and should have the enclosing block"); + if (currentBlock->getStructureOf()->getContainingLoop() != NULL) + { + TR_ByteCodeInfo bci = cursor->getNode()->getByteCodeInfo(); + // If the list of TR_ByteCodeInfo used to track the tree tops in extended basic blocks is empty or the bytecode info + // of current asynccheck node does not match with any tracked bytecode info, we will add the test otherwise skip it. + // We can have list for addedLocationBCIList as in extended basic blocks, there will be very few test locations so + // This list won't be too large. 
+ if (addedLocationBCIList.empty() || !isByteCodeInfoInCurrentTestLocationList(bci, addedLocationBCIList)) + { + addedLocationBCIList.push_back(bci); + int32_t loopDepth = TR::TransformUtil::getLoopNestingDepth(comp(), currentBlock); + testLocations.push_back(std::make_pair(std::make_pair(cursor, currentBlock),loopDepth)); + } + } + } + cursor = cursor->getNextTreeTop(); + } + if (!testLocations.empty()) + addRecompilationTests(comp(), testLocations); + return 1; + } + +/** \brief Adds trees and control flow to check the loop raw frequency and trip recompilation of the method + * if it has spent enough time in the loop. + * \details Iterates a list of recompilation test locations passed by callee and adds the following trees and blocks. + * ----------------- + * |... | <-- originalBlock + * |testLocationTT | + * |... | + * ----------------- + * Test location block shown above becomes, + * ---------------------------------------------- + * |... | <--- originalBlock + * |testLocationTT | + * |ifcmple goto remainingCodeBlock |--------------------------------------- + * | iload isMethodQueuedForRecompilation | | + * | iconst -1 | | + * ---------------------------------------------- | + * | | + * | | + * V | + * ----------------------------------------------- | + * |ifcmple goto remainingCodeBlock | <---- CalculateCheckLoopFreqBlock | + * | rawFrequencyOfTheLoop | | + * | AdjustedLoopRecompilationThreshold | | + * ----------------------------------------------- | + * | | + * | | + * V | + * ----------------------------------------------- | + * |call TR_jitRetranslateCallerWithPrep | <--- callRecompilation block | + * | loadaddr startPC | | + * | loadaddr | | + * ----------------------------------------------- | + * | | + * | | + * V | + * -------------------------------- | + * |treeTops after insertion Point| <--- remainingCodeBlock | + * |from original block | | + * |... 
|<---------------------------------------------- + -------------------------------- + \param comp Current compilation object + \param testLocations RecompilationTestLocation list containing TreeTops after which test are required to be added + And corresponding to that location, a loop nesting depth + */ + +void +TR_JProfilingRecompLoopTest::addRecompilationTests(TR::Compilation *comp, RecompilationTestLocationsInfo &testLocations) + { + TR_PersistentProfileInfo *profileInfo = comp->getRecompilationInfo()->findOrCreateProfileInfo(); + TR_BlockFrequencyInfo *bfi = TR_BlockFrequencyInfo::get(profileInfo); + TR::CFG *cfg = comp->getFlowGraph(); + // TODO: We should do experiment with fixing the structure instead of invalidating and do compile time + // experiment to see which is better. + cfg->invalidateStructure(); + + // Following environment sets up the base recompilation threshold for the loop. + // This base recompile threshold in conjunction with the depth in loop is compared with the raw count of the + // loop to decide if we have run this loop enough time to trip method recompilation. + static int32_t recompileThreshold = comp->getOptions()->getJProfilingLoopRecompThreshold(); + if (trace()) + traceMsg(comp, "Loop Recompilation Base Threshold = %d\n",recompileThreshold); + // Iterating backwards to avoid losing original block associated with the test location tree tops in case we have found multiple + // recompilation test location in same block. 
+ for (auto testLocationIter = testLocations.rbegin(), testLocationEnd = testLocations.rend(); testLocationIter != testLocationEnd; ++testLocationIter) + { + TR::TreeTop *asyncCheckTreeTop = testLocationIter->first.first; + TR::Block *originalBlock = testLocationIter->first.second; + TR::Node *node = asyncCheckTreeTop->getNode(); + int32_t depth = testLocationIter->second; + if (trace()) + traceMsg(comp, "block_%d, n%dn, depth = %d\n",originalBlock->getNumber(), asyncCheckTreeTop->getNode()->getGlobalIndex(), depth); + TR_ByteCodeInfo bci = asyncCheckTreeTop->getNode()->getByteCodeInfo(); + + TR::Node *root = bfi->generateBlockRawCountCalculationSubTree(comp, node, trace()); + + // If we got a bad counters/ bad block frequency info, above API would return NULL, if that is the case we can not generate a recompilation test for that location. + if (!root) + { + TR::DebugCounter::incStaticDebugCounter(comp, TR::DebugCounter::debugCounterName(comp, "jprofiling.instrument/badcounters/(%s)", comp->signature())); + continue; + } + dumpOptDetails(comp, "%s Add recompilation test after asyncCheck node n%dn\n", optDetailString(), node->getGlobalIndex()); + + //Splitting the original block from AsyncCheckTT + TR::Block *remainingCodeBlock = originalBlock->split(asyncCheckTreeTop->getNextTreeTop(), cfg, true, true); + TR::Block *callRecompileBlock = TR::Block::createEmptyBlock(node, comp, UNKNOWN_COLD_BLOCK_COUNT); + callRecompileBlock->setIsCold(true); + + // jitRetranslateCallerWithPrep Helper call + TR::TreeTop *callTree = TR::TransformUtil::generateRetranslateCallerWithPrepTrees(node, TR_PersistentMethodInfo::RecompDueToJProfiling, comp); + callTree->getNode()->setIsProfilingCode(); + callRecompileBlock->append(callTree); + cfg->addNode(callRecompileBlock); + const char * const name = TR::DebugCounter::debugCounterName(comp, + "recompilationHelper/(%s)/%d", + comp->signature(),depth); + TR::DebugCounter::prependDebugCounter(comp, name, callTree); + + // Code to calculate 
the raw frequency of loop from block counters and comparing with the adjusted recompilation threshold. + // threshold for this particular test location is calculated from the base recompile threshold and the nesting depth of the loop + // Putting a higher threshold limit to 10K currently to prohibit running profiling body for too long. + int32_t threshold = recompileThreshold << (depth-1); + // It is very unlikely that we have a very large depth of the loop which causes above value to become negative + // To safeguard this scenario, we also check if threshold is negative or zero we set this threshold to maxLoopRecompilationThreshold + TR::Node *cmpNode = TR::Node::createif(TR::ificmple, root, TR::Node::iconst(node, (threshold > 0 && threshold <= maxLoopRecompilationThreshold) ? threshold : maxLoopRecompilationThreshold), remainingCodeBlock->getEntry()); + TR::TreeTop *cmpFlag = TR::TreeTop::create(comp, cmpNode); + cmpFlag->getNode()->setIsProfilingCode(); + remainingCodeBlock->getEntry()->insertTreeTopsBeforeMe(callRecompileBlock->getEntry(), callRecompileBlock->getExit()); + static bool generateJProfilingRecompQueueTest = (feGetEnv("TR_DontGenerateJProfilingRecompQueueTest") == NULL); + if (generateJProfilingRecompQueueTest) + { + TR::Block *calculateLoopRawFreq = TR::Block::createEmptyBlock(node, comp, remainingCodeBlock->getFrequency()); + TR::SymbolReference *symRef = comp->getSymRefTab()->createKnownStaticDataSymbolRef(bfi->getIsQueuedForRecompilation(), TR::Int32); + TR::Node *loadIsQueuedForRecompilation = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); + TR::Node *checkIfQueueForRecompilation = TR::Node::createif(TR::ificmpeq, loadIsQueuedForRecompilation, TR::Node::iconst(node, -1), remainingCodeBlock->getEntry()); + TR::TreeTop *checkIfNeededRecompilationTestTT = TR::TreeTop::create(comp, originalBlock->getLastRealTreeTop(), checkIfQueueForRecompilation); + calculateLoopRawFreq->append(cmpFlag); + cfg->addNode(calculateLoopRawFreq); + 
callRecompileBlock->getEntry()->insertTreeTopsBeforeMe(calculateLoopRawFreq->getEntry(), calculateLoopRawFreq->getExit()); + cfg->addEdge(TR::CFGEdge::createEdge(originalBlock, calculateLoopRawFreq, comp->trMemory())); + cfg->addEdge(TR::CFGEdge::createEdge(calculateLoopRawFreq, callRecompileBlock, comp->trMemory())); + cfg->addEdge(TR::CFGEdge::createEdge(calculateLoopRawFreq, remainingCodeBlock, comp->trMemory())); + } + else + { + originalBlock->append(cmpFlag); + cfg->addEdge(TR::CFGEdge::createEdge(originalBlock, callRecompileBlock, comp->trMemory())); + } + cfg->addEdge(TR::CFGEdge::createEdge(callRecompileBlock, remainingCodeBlock, comp->trMemory())); + if (trace()) + traceMsg(comp,"\t\t Newly created recompilation Test : Threshold comparison Node n%dn\n\t\tRecompilation Call in block_%d\n", + cmpNode->getGlobalIndex(), callRecompileBlock->getNumber()); + } + } + +const char * +TR_JProfilingRecompLoopTest::optDetailString() const throw() + { + return "O^O JPROFILER RECOMP TEST: "; + } diff --git a/runtime/compiler/optimizer/JProfilingRecompLoopTest.hpp b/runtime/compiler/optimizer/JProfilingRecompLoopTest.hpp new file mode 100644 index 00000000000..73010949201 --- /dev/null +++ b/runtime/compiler/optimizer/JProfilingRecompLoopTest.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2018, 2018 IBM Corp. and others + * + * This program and the accompanying materials are made available under + * the terms of the Eclipse Public License 2.0 which accompanies this + * distribution and is available at https://www.eclipse.org/legal/epl-2.0/ + * or the Apache License, Version 2.0 which accompanies this distribution and + * is available at https://www.apache.org/licenses/LICENSE-2.0. + * + * This Source Code may also be made available under the following + * Secondary Licenses when the conditions for such availability set + * forth in the Eclipse Public License, v. 
2.0 are satisfied: GNU + * General Public License, version 2 with the GNU Classpath + * Exception [1] and GNU General Public License, version 2 with the + * OpenJDK Assembly Exception [2]. + * + * [1] https://www.gnu.org/software/classpath/license.html + * [2] http://openjdk.java.net/legal/assembly-exception.html + * + * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 OR LicenseRef-GPL-2.0 WITH Assembly-exception + *******************************************************************************/ +#include <stdint.h> // for int32_t +#include "optimizer/Optimization.hpp" // for Optimization +#include "optimizer/OptimizationManager.hpp" // for OptimizationManager +#include "infra/Checklist.hpp" // for NodeChecklist +#include "runtime/J9Profiler.hpp" +#include "optimizer/Structure.hpp" +/** + * Class TR_JProfilingRecompLoopTest + * =================================== + * + * When jProfiling in compilations are enabled, we have only one recompilation + * test that counts the method invocation to trip up recompilation. For work loads + * where we are spending a lot of time in loops, we are hitting a risk where we + * are running into compiled method body with profiling trees if we just rely on + * method invocations. This optimization scans the method body and puts + * recompilation test after asynccheck node to consider number of times a loop + * is running. 
+ */ +class TR_JProfilingRecompLoopTest : public TR::Optimization + { + public: + // While doing the first walk over treetop, we collect TreeTop after which we put recompilation test, corresponding Block and loop nesting depth + // Following data structures are used to keep this information + typedef TR::typed_allocator<std::pair<std::pair<TR::TreeTop *, TR::Block *>, int32_t>, TR::Region &> RecompilationTestLocationInfoAllocator; + typedef std::deque<std::pair<std::pair<TR::TreeTop *, TR::Block *>, int32_t>, RecompilationTestLocationInfoAllocator> RecompilationTestLocationsInfo; + TR_JProfilingRecompLoopTest(TR::OptimizationManager *manager) + : TR::Optimization(manager) + {} + static TR::Optimization *create(TR::OptimizationManager *manager) + { + return new (manager->allocator()) TR_JProfilingRecompLoopTest(manager); + } + virtual int32_t perform(); + virtual const char *optDetailString() const throw(); + void addRecompilationTests(TR::Compilation *comp, RecompilationTestLocationsInfo &testLocations); + bool isByteCodeInfoInCurrentTestLocationList(TR_ByteCodeInfo &bci, TR::list<TR_ByteCodeInfo *> &addedLocationBCIList); + static int32_t maxLoopRecompilationThreshold; + }; diff --git a/runtime/compiler/optimizer/Optimizations.enum index f3da6b54697..fa7dd52fe74 100644 --- a/runtime/compiler/optimizer/Optimizations.enum +++ b/runtime/compiler/optimizer/Optimizations.enum @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2000, 2017 IBM Corp.
and others * * This program and the accompanying materials are made available under * the terms of the Eclipse Public License 2.0 which accompanies this @@ -33,3 +33,4 @@ OPTIMIZATION(osrGuardRemoval) OPTIMIZATION(jProfilingBlock) OPTIMIZATION(jProfilingValue) + OPTIMIZATION(jProfilingRecompLoopTest) diff --git a/runtime/compiler/runtime/J9Profiler.cpp index 32d07bd8844..d8d7dde3fa6 100644 --- a/runtime/compiler/runtime/J9Profiler.cpp +++ b/runtime/compiler/runtime/J9Profiler.cpp @@ -1933,6 +1933,115 @@ TR_BlockFrequencyInfo::getRawCount(TR::ResolvedMethodSymbol *resolvedMethod, TR_ return frequency; } +/** \brief Returns block number of the original block in which current byteCodeInfo was created. + * \details + * It finds the byte code info of the start of the block in which passed bci was originally + * created and returns the block number of that block from the stored information. + * It is useful when we are trying to associate the profiling information with the node which might + * have moved to different blocks with different bci + * \param bci TR_ByteCodeInfo for which original block number is searched for + * \param comp Current compilation object + * \return block number of the original block bci belongs to. + * WARNING: If consumer of this API uses this to get the profiled data in later compilation and + * requested BCI was not inlined before, it returns -1. + */ +int32_t +TR_BlockFrequencyInfo::getOriginalBlockNumberToGetRawCount(TR_ByteCodeInfo &bci, TR::Compilation *comp, bool trace) + { + int32_t callerIndex = bci.getCallerIndex(); + TR::ResolvedMethodSymbol *resolvedMethod = callerIndex < 0 ?
comp->getMethodSymbol() : comp->getInlinedResolvedMethodSymbol(callerIndex); + int32_t byteCodeToSearch = resolvedMethod->getProfilingByteCodeIndex(bci.getByteCodeIndex()); + TR_ByteCodeInfo searchBCI = bci; + searchBCI.setByteCodeIndex(byteCodeToSearch); + bool currentCallSiteInfo = TR_CallSiteInfo::getCurrent(comp) == _callSiteInfo; + for (auto i=0; i < _numBlocks; ++i) + { + if (currentCallSiteInfo && _callSiteInfo->hasSameBytecodeInfo(_blocks[i], searchBCI, comp) || + (!currentCallSiteInfo && _blocks[i].getCallerIndex() == searchBCI.getCallerIndex() && _blocks[i].getByteCodeIndex() == searchBCI.getByteCodeIndex())) + { + if (trace) + traceMsg(comp, "Get frequency from original block_%d\n", i); + return i; + } + } + return -1; + } +/** \brief Using stored static block frequency counters creates a node that calculates the raw count of block in which passed node belongs to + * \param comp Current compilation object + * \return root A node that loads/adds/subtracts the static block counter to calculate raw frequency of corresponding block + */ +TR::Node* +TR_BlockFrequencyInfo::generateBlockRawCountCalculationSubTree(TR::Compilation *comp, TR::Node *node, bool trace) + { + return generateBlockRawCountCalculationSubTree(comp, getOriginalBlockNumberToGetRawCount(node->getByteCodeInfo(), comp, trace), node); + } +/** \brief Creates a node that calculates the raw count of passed block from the stored static block frequency counters + * \param comp Current compilation object + * \param blockNumber Number of a block for which we need to generate frequency calculation node + * \return root A node that loads/adds/subtracts the static block counter to calculate raw frequency of corresponding block + */ +TR::Node* +TR_BlockFrequencyInfo::generateBlockRawCountCalculationSubTree(TR::Compilation *comp, int32_t blockNumber, TR::Node *node) + { + TR::Node *root = NULL; + if (blockNumber > -1 && (_counterDerivationInfo[blockNumber * 2] + &&
((((uintptr_t)_counterDerivationInfo[blockNumber * 2]) & 0x1 == 1) + || !_counterDerivationInfo[blockNumber * 2]->isEmpty()))) + { + TR::Node *addRoot = NULL; + if (((uintptr_t)_counterDerivationInfo[blockNumber * 2]) & 0x1 == 1) + { + TR::SymbolReference *symRef = comp->getSymRefTab()->createKnownStaticDataSymbolRef(getFrequencyForBlock(((uintptr_t)_counterDerivationInfo[blockNumber * 2]) >> 1), TR::Int32); + addRoot = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); + } + else + { + TR_BitVectorIterator addBVI(*(_counterDerivationInfo[blockNumber * 2])); + while (addBVI.hasMoreElements()) + { + TR::SymbolReference *symRef = comp->getSymRefTab()->createKnownStaticDataSymbolRef(getFrequencyForBlock(addBVI.getNextElement()), TR::Int32); + TR::Node *counterLoad = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); + if (addRoot) + addRoot = TR::Node::create(node, TR::iadd, 2, addRoot, counterLoad); + else + addRoot = counterLoad; + } + } + TR::Node *subRoot = NULL; + if (_counterDerivationInfo[blockNumber * 2 +1] != NULL) + { + if (((uintptr_t)_counterDerivationInfo[blockNumber *2 + 1]) & 0x1 == 1) + { + TR::SymbolReference *symRef = comp->getSymRefTab()->createKnownStaticDataSymbolRef(getFrequencyForBlock(((uintptr_t)_counterDerivationInfo[blockNumber * 2 + 1]) >> 1), TR::Int32); + subRoot = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); + } + else + { + TR_BitVectorIterator subBVI(*(_counterDerivationInfo[blockNumber * 2 + 1])); + while (subBVI.hasMoreElements()) + { + TR::SymbolReference *symRef = comp->getSymRefTab()->createKnownStaticDataSymbolRef(getFrequencyForBlock(subBVI.getNextElement()), TR::Int32); + TR::Node *counterLoad = TR::Node::createWithSymRef(node, TR::iload, 0, symRef); + if (subRoot) + { + subRoot = TR::Node::create(node, TR::isub, 2, subRoot, counterLoad); + } + else + { + subRoot = counterLoad; + } + } + } + } + root = addRoot; + if (subRoot) + { + root = TR::Node::create(node, TR::isub, 2, root, subRoot); + } + } + return 
root; + } + int32_t TR_BlockFrequencyInfo::getRawCount(TR_ByteCodeInfo &bci, TR_CallSiteInfo *callSiteInfo, int64_t maxCount, TR::Compilation *comp) { diff --git a/runtime/compiler/runtime/J9Profiler.hpp b/runtime/compiler/runtime/J9Profiler.hpp index 52bb5b3e5e4..8dd0e13bcb7 100644 --- a/runtime/compiler/runtime/J9Profiler.hpp +++ b/runtime/compiler/runtime/J9Profiler.hpp @@ -640,15 +640,17 @@ class TR_BlockFrequencyInfo void setIsQueuedForRecompilation() { _isQueuedForRecompilation = -1; } int32_t *getIsQueuedForRecompilation() { return &_isQueuedForRecompilation; } + TR::Node* generateBlockRawCountCalculationSubTree(TR::Compilation *comp, int32_t blockNumber, TR::Node *node); + TR::Node* generateBlockRawCountCalculationSubTree(TR::Compilation *comp, TR::Node *node, bool trace); void dumpInfo(TR::FILE *); int32_t getCallCount(); int32_t getMaxRawCount(int32_t callerIndex); int32_t getMaxRawCount(); - private: int32_t getRawCount(TR::ResolvedMethodSymbol *resolvedMethod, TR_ByteCodeInfo &bci, TR_CallSiteInfo *callSiteInfo, int64_t maxCount, TR::Compilation *comp); int32_t getRawCount(TR_ByteCodeInfo &bci, TR_CallSiteInfo *callSiteInfo, int64_t maxCount, TR::Compilation *comp); + int32_t getOriginalBlockNumberToGetRawCount(TR_ByteCodeInfo &bci, TR::Compilation *comp, bool trace); TR_CallSiteInfo * _callSiteInfo; int32_t const _numBlocks;