-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Many jobs based on the same agent template produce many failed deployments #102
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -131,7 +131,7 @@ public Collection<NodeProvisioner.PlannedNode> provision(CloudState cloudState, | |
return agent; | ||
} catch (Exception e) { | ||
LOGGER.log(Level.WARNING, "AciCloud: Provision agent {0} failed: {1}", | ||
new Object[] {agent.getNodeName(), e.getMessage()}); | ||
new Object[] {agent.getNodeName(), e}); | ||
|
||
agent.terminate(); | ||
|
||
|
@@ -177,8 +177,22 @@ public AciContainerTemplate getFirstTemplate(Label label) { | |
public void addIpEnv(AciAgent agent) throws Exception { | ||
AzureResourceManager azureResourceManager = getAzureClient(); | ||
|
||
String ip = azureResourceManager.containerGroups() | ||
.getByResourceGroup(resourceGroup, agent.getNodeName()).ipAddress(); | ||
// Workaround for https://github.com/Azure/azure-sdk-for-java/issues/27083 | ||
String ip = null; | ||
boolean nullIsThrown; | ||
do { | ||
try { | ||
ip = azureResourceManager.containerGroups() | ||
.getByResourceGroup(resourceGroup, agent.getNodeName()).ipAddress(); | ||
nullIsThrown = false; | ||
} catch (NullPointerException e) { | ||
LOGGER.log(Level.WARNING, "During asking for IP address of Agent {0} NullPointerException is thrown," | ||
+ "but it is ignored.", agent.getNodeName()); | ||
nullIsThrown = true; | ||
final int retryInterval = 5 * 1000; | ||
Thread.sleep(retryInterval); | ||
} | ||
} while (nullIsThrown); | ||
|
||
EnvironmentVariablesNodeProperty ipEnv = new EnvironmentVariablesNodeProperty( | ||
new EnvironmentVariablesNodeProperty.Entry("IP", ip) | ||
|
@@ -215,19 +229,25 @@ private void waitToOnline(AciAgent agent, int startupTimeout, StopWatch stopWatc | |
if (computer == null) { | ||
throw new IllegalStateException("Agent node has been deleted"); | ||
} | ||
ContainerGroup containerGroup = | ||
azureResourceManager.containerGroups().getByResourceGroup(resourceGroup, agent.getNodeName()); | ||
|
||
if (containerGroup.containers().containsKey(agent.getNodeName()) | ||
&& containerGroup.containers().get(agent.getNodeName()).instanceView().currentState().state() | ||
.equals("Terminated")) { | ||
|
||
// there doesn't seem to be anyway to get debug information with the current API version in the SDK | ||
// logs and events just return nothing | ||
// while debugging with the CLI the best way I could find was 'attaching' to the container | ||
// see https://github.com/Azure/azure-libraries-for-java/issues/1379 | ||
throw new IllegalStateException("ACI container terminated, see the Azure portal / " | ||
+ "CLI for more information"); | ||
try { | ||
ContainerGroup containerGroup = | ||
azureResourceManager.containerGroups().getByResourceGroup(resourceGroup, agent.getNodeName()); | ||
|
||
if (containerGroup.containers().containsKey(agent.getNodeName()) | ||
&& containerGroup.containers().get(agent.getNodeName()).instanceView().currentState().state() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we check the container for null and then retry, rather than catching a NullPointerException? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The NullPointerException is thrown internally by the Resource Manager, so a null check on the container would not prevent it. |
||
.equals("Terminated")) { | ||
|
||
// there doesn't seem to be anyway to get debug information with the current API version in the SDK | ||
// logs and events just return nothing | ||
// while debugging with the CLI the best way I could find was 'attaching' to the container | ||
// see https://github.com/Azure/azure-libraries-for-java/issues/1379 | ||
throw new IllegalStateException("ACI container terminated, see the Azure portal / " | ||
+ "CLI for more information"); | ||
} | ||
} catch (NullPointerException e) { | ||
// workaround for https://github.com/Azure/azure-sdk-for-java/issues/27083 | ||
LOGGER.log(Level.WARNING, "Waiting for Agent {0} produces a NullPointerException, " | ||
+ "but it is ignored.", agent.getNodeName()); | ||
} | ||
|
||
if (computer.isOnline()) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we check the IP address for null and then retry, rather than catching a NullPointerException?
Is it possible that the IP hasn't been allocated yet? That seems quite odd, though.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The NullPointerException is thrown internally by the Resource Manager, so a null check on the returned IP address would not prevent it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.