From 738976e06e1c4ef124b9436ef88bb6c8485cb608 Mon Sep 17 00:00:00 2001 From: sevdokim Date: Thu, 21 Apr 2022 12:20:27 +0200 Subject: [PATCH] CPV: mute raw decoder error reporting for 10 minutes if it reports more than 10 errors per minute --- .../include/CPVReconstruction/RawDecoder.h | 13 +++-- .../CPV/reconstruction/src/RawDecoder.cxx | 58 ++++++++++++------- .../CPVWorkflow/RawToDigitConverterSpec.h | 20 ++++--- .../workflow/src/RawToDigitConverterSpec.cxx | 58 +++++++++++++++++-- 4 files changed, 110 insertions(+), 39 deletions(-) diff --git a/Detectors/CPV/reconstruction/include/CPVReconstruction/RawDecoder.h b/Detectors/CPV/reconstruction/include/CPVReconstruction/RawDecoder.h index 322cce079e0d5..822d6bb08323d 100644 --- a/Detectors/CPV/reconstruction/include/CPVReconstruction/RawDecoder.h +++ b/Detectors/CPV/reconstruction/include/CPVReconstruction/RawDecoder.h @@ -24,9 +24,7 @@ namespace o2 namespace cpv { -class RawDecoderError -{ - public: +struct RawDecoderError { RawDecoderError() = default; //Constructors for vector::emplace_back methods RawDecoderError(short c, short d, short g, short p, RawErrorType_t e) : ccId(c), dil(d), gas(g), pad(p), errortype(e) {} RawDecoderError(const RawDecoderError& e) = default; @@ -37,7 +35,6 @@ class RawDecoderError short gas; short pad; RawErrorType_t errortype; - ClassDefNV(RawDecoderError, 1); }; union AddressCharge { @@ -99,6 +96,9 @@ class RawDecoder /// \return Reference to the list of decoding errors const std::vector& getErrors() const { return mErrors; } + /// \brief mute error logging; decoding errors are still stored in the error list + void muteErrors() { mIsMuteErrors = true; } + protected: /// \brief Read channels for the current event in the raw buffer RawErrorType_t readChannels(); @@ -111,9 +111,10 @@ class RawDecoder std::vector mDigits; ///< vector of channels and BCs in the raw stream std::vector mBCRecords; ///< vector of bc references to digits std::vector mErrors; ///< vector of decoding errors - bool mChannelsInitialized = false; ///< check 
whether the channels are initialized + bool mChannelsInitialized; ///< check whether the channels are initialized + bool mIsMuteErrors; ///< if true, error log messages are suppressed (errors are still stored in mErrors) - ClassDefNV(RawDecoder, 2); + ClassDefNV(RawDecoder, 3); }; } // namespace cpv diff --git a/Detectors/CPV/reconstruction/src/RawDecoder.cxx b/Detectors/CPV/reconstruction/src/RawDecoder.cxx index b83eeb3b10dec..948212e559e22 100644 --- a/Detectors/CPV/reconstruction/src/RawDecoder.cxx +++ b/Detectors/CPV/reconstruction/src/RawDecoder.cxx @@ -19,13 +19,13 @@ using namespace o2::cpv; RawDecoder::RawDecoder(RawReaderMemory& reader) : mRawReader(reader), - mChannelsInitialized(false) + mChannelsInitialized(false), + mIsMuteErrors(false) { } RawErrorType_t RawDecoder::decode() { - auto& rdh = mRawReader.getRawHeader(); short linkID = o2::raw::RDHUtils::getLinkID(rdh); mDigits.clear(); @@ -42,6 +42,12 @@ RawErrorType_t RawDecoder::decode() RawErrorType_t RawDecoder::readChannels() { mChannelsInitialized = false; + // // test error + // if (!mIsMuteErrors) { + // LOG(error) << "RawDecoder::readChannels() : " + // << "test error"; + // } + // mErrors.emplace_back(-1, 0, 0, 0, kOK); //-1 is the non-existing ccId used for general errors auto& payloadWords = mRawReader.getPayload(); uint32_t wordCountFromLastHeader = 1; //header word is included @@ -59,20 +65,24 @@ RawErrorType_t RawDecoder::readChannels() << "I read cpv header for orbit = " << header.orbit() << " and BC = " << header.bc(); if (!isHeaderExpected) { //actually, header was not expected - LOG(error) << "RawDecoder::readChannels() : " - << "header was not expected"; + if (!mIsMuteErrors) { + LOG(error) << "RawDecoder::readChannels() : " + << "header was not expected"; + } removeLastNDigits(nDigitsAddedFromLastHeader); //remove previously added digits as they are bad - mErrors.emplace_back(5, 0, 0, 0, kNO_CPVTRAILER); + mErrors.emplace_back(-1, 0, 0, 0, kNO_CPVTRAILER); } skipUntilNextHeader = false; currentBC = header.bc(); wordCountFromLastHeader = 0; 
nDigitsAddedFromLastHeader = 0; if (currentOrbit != header.orbit()) { //bad cpvheader - LOG(error) << "RawDecoder::readChannels() : " - << "currentOrbit(=" << currentOrbit - << ") != header.orbit()(=" << header.orbit() << ")"; - mErrors.emplace_back(5, 0, 0, 0, kCPVHEADER_INVALID); //5 is non-existing link with general errors + if (!mIsMuteErrors) { + LOG(error) << "RawDecoder::readChannels() : " + << "currentOrbit(=" << currentOrbit + << ") != header.orbit()(=" << header.orbit() << ")"; + } + mErrors.emplace_back(-1, 0, 0, 0, kCPVHEADER_INVALID); //-1 is the non-existing ccId used for general errors skipUntilNextHeader = true; } } else { @@ -89,8 +99,10 @@ RawErrorType_t RawDecoder::readChannels() if (addDigit(pw.mDataWord, word.ccId(), currentBC)) { nDigitsAddedFromLastHeader++; } else { - LOG(debug) << "RawDecoder::readChannels() : " - << "read pad word with non-valid pad address"; + if (!mIsMuteErrors) { + LOG(debug) << "RawDecoder::readChannels() : " + << "read pad word with non-valid pad address"; + } unsigned int dil = pw.dil, gas = pw.gas, address = pw.address; mErrors.emplace_back(word.ccId(), dil, gas, address, kPadAddress); } @@ -103,18 +115,22 @@ RawErrorType_t RawDecoder::readChannels() if (diffInCount > 1 || diffInCount < -1) { //some words lost? 
- LOG(error) << "RawDecoder::readChannels() : " - << "Read " << wordCountFromLastHeader << " words, expected " << trailer.wordCounter(); - mErrors.emplace_back(5, 0, 0, 0, kCPVTRAILER_INVALID); + if (!mIsMuteErrors) { + LOG(error) << "RawDecoder::readChannels() : " + << "Read " << wordCountFromLastHeader << " words, expected " << trailer.wordCounter(); + } + mErrors.emplace_back(-1, 0, 0, 0, kCPVTRAILER_INVALID); //throw all previous data and go to next header removeLastNDigits(nDigitsAddedFromLastHeader); skipUntilNextHeader = true; } if (trailer.bc() != currentBC) { //trailer does not fit header - LOG(error) << "RawDecoder::readChannels() : " - << "CPVHeader BC is " << currentBC << " but CPVTrailer BC is " << trailer.bc(); - mErrors.emplace_back(5, 0, 0, 0, kCPVTRAILER_INVALID); + if (!mIsMuteErrors) { + LOG(error) << "RawDecoder::readChannels() : " + << "CPVHeader BC(" << currentBC << ") != CPVTrailer BC(" << trailer.bc() << ")"; + } + mErrors.emplace_back(-1, 0, 0, 0, kCPVTRAILER_INVALID); removeLastNDigits(nDigitsAddedFromLastHeader); skipUntilNextHeader = true; } @@ -122,9 +138,11 @@ RawErrorType_t RawDecoder::readChannels() } else { wordCountFromLastHeader++; //error - LOG(error) << "RawDecoder::readChannels() : " - << "Read unknown word"; - mErrors.emplace_back(5, 0, 0, 0, kUNKNOWN_WORD); //add error for non-existing row + if (!mIsMuteErrors) { + LOG(error) << "RawDecoder::readChannels() : " + << "Read unknown word"; + } + mErrors.emplace_back(-1, 0, 0, 0, kUNKNOWN_WORD); //add error for non-existing row //what to do? } } diff --git a/Detectors/CPV/workflow/include/CPVWorkflow/RawToDigitConverterSpec.h b/Detectors/CPV/workflow/include/CPVWorkflow/RawToDigitConverterSpec.h index 9c7cdb1001981..1145bd5541bb5 100644 --- a/Detectors/CPV/workflow/include/CPVWorkflow/RawToDigitConverterSpec.h +++ b/Detectors/CPV/workflow/include/CPVWorkflow/RawToDigitConverterSpec.h @@ -10,7 +10,7 @@ // or submit itself to any jurisdiction. 
#include - +#include #include "Framework/DataProcessorSpec.h" #include "Framework/Task.h" #include "Framework/ConcreteDataMatcher.h" @@ -68,12 +68,18 @@ class RawToDigitConverterSpec : public framework::Task char CheckHWAddress(short ddl, short hwAddress, short& fee); private: - bool mIsUsingGainCalibration; ///< Use gain calibration from CCDB - bool mIsUsingBadMap; ///< Use BadChannelMap to mask bad channels - bool mIsPedestalData; ///< Do not subtract pedestals if true - std::vector mOutputDigits; ///< Container with output cells - std::vector mOutputTriggerRecords; ///< Container with output cells - std::vector mOutputHWErrors; ///< Errors occured in reading data + bool mIsUsingGainCalibration; ///< Use gain calibration from CCDB + bool mIsUsingBadMap; ///< Use BadChannelMap to mask bad channels + bool mIsPedestalData; ///< Do not subtract pedestals if true + std::vector mOutputDigits; ///< Container with output cells + std::vector mOutputTriggerRecords; ///< Container with output trigger records + std::vector mOutputHWErrors; ///< Errors occurred in reading data + bool mIsMuteDecoderErrors = false; ///< true while decoder error logging is muted (unmuted after 10 minutes) + int mDecoderErrorsCounterWhenMuted = 0; ///< errors counter while errors are muted + int mDecoderErrorsPerMinute = 0; ///< raw format errors (ccId == -1) counted in the current minute + int mMinutesPassed = 0; ///< runtime duration in minutes + std::chrono::time_point mStartTime; ///< Time of start of decoding + std::chrono::time_point mTimeWhenMuted; ///< Time when error reporting was muted }; /// \brief Creating DataProcessorSpec for the CPV Digit Converter Spec diff --git a/Detectors/CPV/workflow/src/RawToDigitConverterSpec.cxx b/Detectors/CPV/workflow/src/RawToDigitConverterSpec.cxx index 22740c178e96f..58f84f5f4b424 100644 --- a/Detectors/CPV/workflow/src/RawToDigitConverterSpec.cxx +++ b/Detectors/CPV/workflow/src/RawToDigitConverterSpec.cxx @@ -35,6 +35,10 @@ using Lifetime = o2::framework::Lifetime; void RawToDigitConverterSpec::init(framework::InitContext& ctx) { + mStartTime = 
std::chrono::system_clock::now(); + mDecoderErrorsPerMinute = 0; + mIsMuteDecoderErrors = false; + LOG(debug) << "Initializing RawToDigitConverterSpec..."; // Pedestal flag true/false LOG(info) << "Pedestal run: " << (mIsPedestalData ? "YES" : "NO"); @@ -56,6 +60,23 @@ void RawToDigitConverterSpec::init(framework::InitContext& ctx) void RawToDigitConverterSpec::run(framework::ProcessingContext& ctx) { + // check timers if we need mute/unmute error reporting + auto now = std::chrono::system_clock::now(); + if (mIsMuteDecoderErrors) { // check if 10-minutes muting period passed + if (((now - mTimeWhenMuted) / std::chrono::minutes(1)) >= 10) { + mIsMuteDecoderErrors = false; //unmute + if (mDecoderErrorsCounterWhenMuted) { + LOG(error) << "RawToDigitConverterSpec::run() : " << mDecoderErrorsCounterWhenMuted << " errors happened while it was muted (("; + } + mDecoderErrorsCounterWhenMuted = 0; + } + } + if (((now - mStartTime) / std::chrono::minutes(1)) > mMinutesPassed) { + mMinutesPassed = (now - mStartTime) / std::chrono::minutes(1); + LOG(debug) << "minutes passed: " << mMinutesPassed; + mDecoderErrorsPerMinute = 0; + } + // Cache digits from bunch crossings as the component reads timeframes from many links consecutively std::map>> digitBuffer; // Internal digit buffer int firstEntry = 0; @@ -120,11 +141,13 @@ void RawToDigitConverterSpec::run(framework::ProcessingContext& ctx) try { rawreader.next(); } catch (RawErrorType_t e) { - LOG(error) << "Raw decoding error " << (int)e; + if (!mIsMuteDecoderErrors) { + LOG(error) << "Raw decoding error " << (int)e; + } //add error list //RawErrorType_t is defined in O2/Detectors/CPV/reconstruction/include/CPVReconstruction/RawReaderMemory.h //RawDecoderError(short c, short d, short g, short p, RawErrorType_t e) - mOutputHWErrors.emplace_back(25, 0, 0, 0, e); //Put general errors to non-existing ccId 25 + mOutputHWErrors.emplace_back(-1, 0, 0, 0, e); //Put general errors to non-existing ccId -1 //if problem in header, 
abandon this page if (e == RawErrorType_t::kRDH_DECODING) { LOG(error) << "RDH decoding error. Skipping this TF"; @@ -139,17 +162,40 @@ void RawToDigitConverterSpec::run(framework::ProcessingContext& ctx) auto mod = o2::raw::RDHUtils::getLinkID(rdh) + 2; //link=0,1,2 -> mod=2,3,4 //for now all modules are written to one LinkID if (mod > o2::cpv::Geometry::kNMod || mod < 2) { //only 3 correct modules:2,3,4 - LOG(error) << "module=" << mod << "do not exist"; - mOutputHWErrors.emplace_back(25, mod, 0, 0, kRDH_INVALID); //Add non-existing modules to non-existing ccId 25 and dilogic = mod - continue; //skip STU mod + if (!mIsMuteDecoderErrors) { + LOG(error) << "RDH linkId corresponds to module " << mod << " which does not exist"; + } + mOutputHWErrors.emplace_back(-1, mod, 0, 0, kRDH_INVALID); //Add non-existing modules to non-existing ccId -1 and dilogic = mod + continue; } o2::cpv::RawDecoder decoder(rawreader); + if (mIsMuteDecoderErrors) { + decoder.muteErrors(); + } RawErrorType_t err = decoder.decode(); + int decoderErrors = 0; + for (auto errs : decoder.getErrors()) { + if (errs.ccId == -1) { // error related to wrong data format + decoderErrors++; + } + } + mDecoderErrorsPerMinute += decoderErrors; + // LOG(debug) << "RawDecoder found " << decoderErrors << " raw format errors"; + // LOG(debug) << "Now I have " << mDecoderErrorsPerMinute << " errors for current minute"; + if (mIsMuteDecoderErrors) { + mDecoderErrorsCounterWhenMuted += decoder.getErrors().size(); + } else { + if (mDecoderErrorsPerMinute > 10) { // mute error reporting for 10 minutes + LOG(warning) << "> 10 raw decoder error messages per minute, muting it for 10 minutes"; + mIsMuteDecoderErrors = true; + mTimeWhenMuted = std::chrono::system_clock::now(); + } + } if (!(err == kOK || err == kOK_NO_PAYLOAD)) { //TODO handle severe errors //TODO: probably careful conversion of decoder errors to Fitter errors? 
- mOutputHWErrors.emplace_back(25, mod, 0, 0, err); //assign general RDH errors to non-existing ccId 25 and dilogic = mod + mOutputHWErrors.emplace_back(-1, mod, 0, 0, err); //assign general RDH errors to non-existing ccId -1 and dilogic = mod } std::shared_ptr> currentDigitContainer;