Skip to content

Commit

Permalink
Setting a 46h soft timeout for PromptReco jobs (#4600)
Browse files Browse the repository at this point in the history
  • Loading branch information
germanfgv authored Oct 27, 2021
1 parent b66377c commit 2f62671
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 17 deletions.
11 changes: 3 additions & 8 deletions bin/00_deploy_prod.sh
Original file line number Diff line number Diff line change
Expand Up @@ -203,11 +203,6 @@ echo 'config.TaskArchiver.useWorkQueue = False' >> ./config/tier0/config.py
echo "config.AgentStatusWatcher.enabled = False" >> ./config/tier0/config.py
echo "config.AgentStatusWatcher.onlySSB = False" >> ./config/tier0/config.py

#
# Increase ErrorHandler maxFailTime
#
echo "config.ErrorHandler.maxFailTime = 604800" >> ./config/tier0/config.py

#
# JobAccountant Repack Error Dataset settings
#
Expand Down Expand Up @@ -258,8 +253,8 @@ echo "config.AgentStatusWatcher.runningRepackPercent = 10" >> ./config/tier0/con

# Configurable number of retries for failing jobs before they are paused
echo "config.RetryManager.PauseAlgo.section_('Express')" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Express.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Express.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 50664: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.section_('Processing')" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Processing.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Processing.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 50664: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.section_('Repack')" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Repack.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Repack.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 50664: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
11 changes: 3 additions & 8 deletions bin/00_deploy_replay.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,6 @@ echo 'config.TaskArchiver.useWorkQueue = False' >> ./config/tier0/config.py
echo "config.AgentStatusWatcher.enabled = False" >> ./config/tier0/config.py
echo "config.AgentStatusWatcher.onlySSB = False" >> ./config/tier0/config.py

#
# Increase ErrorHandler maxFailTime
#
echo "config.ErrorHandler.maxFailTime = 604800" >> ./config/tier0/config.py

#
# JobAccountant Repack Error Dataset settings
#
Expand Down Expand Up @@ -251,11 +246,11 @@ echo "config.AgentStatusWatcher.runningRepackPercent = 10" >> ./config/tier0/con

# Configurable number of retries for failing jobs before they are paused
echo "config.RetryManager.PauseAlgo.section_('Express')" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Express.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Express.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 50664: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.section_('Processing')" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Processing.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Processing.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 50664: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.section_('Repack')" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Repack.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py
echo "config.RetryManager.PauseAlgo.Repack.retryErrorCodes = { 8001: 0, 70: 0, 50513: 0, 50660: 0, 50661: 0, 50664: 0, 71304: 0, 99109: 0, 99303: 0, 99400: 0, 8001: 0, 50115: 0 }" >> ./config/tier0/config.py

# Override the RetryManager plugins so that LogCollect and Cleanup jobs are paused instead of failing automatically
sed -i "s/config.RetryManager.plugins.*/config.RetryManager.plugins={'default': 'PauseAlgo', 'Cleanup': 'PauseAlgo', 'LogCollect': 'PauseAlgo'}/g" ./config/tier0/config.py
Expand Down
2 changes: 1 addition & 1 deletion src/python/T0/RunConfig/RunConfigAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ def releasePromptReco(tier0Config, specDirectory, dqmUploadProxy):
'SiteBlacklist': [],
'TrustSitelists': "True",
'BlockCloseMaxWaitTime': datasetConfig.BlockCloseDelay,
'SoftTimeout': 604800, #7 days, effectively disabled
'SoftTimeout': 165600, #46 hours
'GracePeriod': 3600,
'Dashboard': "t0" } )

Expand Down

0 comments on commit 2f62671

Please sign in to comment.