From fe34533cf5dc09488cf458e1d5fd54d427b2a9fa Mon Sep 17 00:00:00 2001 From: Tom Denham Date: Mon, 4 Jan 2016 16:05:46 -0800 Subject: [PATCH] Don't fail under load Allow cleanup code in _claim_block_affinity to "fail" if it's safe to do so. Fixes #66 --- calico_containers/pycalico/ipam.py | 9 +++-- calico_containers/tests/unit/ipam_test.py | 44 ++++++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/calico_containers/pycalico/ipam.py b/calico_containers/pycalico/ipam.py index e148a6ea..874a9c77 100644 --- a/calico_containers/pycalico/ipam.py +++ b/calico_containers/pycalico/ipam.py @@ -185,11 +185,16 @@ def _claim_block_affinity(self, host, block_cidr): return # Some other host beat us to claiming this block. Clean up. - self.etcd_client.delete(key) + try: + self.etcd_client.delete(key) + except EtcdKeyNotFound: + # A race exists where another process on the same host could + # have already deleted the key. This is fine as long as the key + # no longer exists. + pass # Throw a key error to let the caller know the block wasn't free # after all. - raise HostAffinityClaimedError("Block %s already claimed by %s", block_id, block.host_affinity) # successfully created the block. Done. diff --git a/calico_containers/tests/unit/ipam_test.py b/calico_containers/tests/unit/ipam_test.py index fdc41faa..f28c8d4b 100644 --- a/calico_containers/tests/unit/ipam_test.py +++ b/calico_containers/tests/unit/ipam_test.py @@ -21,7 +21,7 @@ from pycalico.ipam import (IPAMClient, BlockHandleReaderWriter, CASError, NoFreeBlocksError, _block_datastore_key, - _handle_datastore_key) + _handle_datastore_key, HostAffinityClaimedError) from pycalico.datastore_errors import PoolNotFound from pycalico.block import AllocationBlock, AddressNotAssignedError, BLOCK_SIZE from pycalico.handle import AllocationHandle, AddressCountTooLow @@ -1312,6 +1312,48 @@ def test_claim_block_affinity_already_owned(self): prevExist=False)]) self.m_etcd_client.read.assert_called_once_with(key, quorum=True) + def test_claim_block_affinity_already_deleted(self): + """ + Test _claim_block_affinity() when another process deletes the host + affinity under our feet. + This can occur when the block gets claimed by one host and a second + host has two competing processes trying to claim the block. They can + both try to clean up at the same time. + + Order of events + 1 Write host affinity + 2 Try to write the new block, but this fails + 3 Re-read the block, discover another host owns it + 4 Delete key from 1 but find it's already removed. + """ + + block = _test_block_empty_v4() + m_result0 = Mock(spec=EtcdResult) + m_result0.value = block.to_json() + + block_our_host = AllocationBlock(BLOCK_V4_1, "test_host2") + + # Reads at 3 + self.m_etcd_client.read.return_value = m_result0 + + # Write at 1, 2 + self.m_etcd_client.write.side_effect = [None, EtcdAlreadyExist()] + + # Delete at 4 + self.m_etcd_client.delete.side_effect = [EtcdKeyNotFound()] + + with self.assertRaises(HostAffinityClaimedError): + self.client._claim_block_affinity(block_our_host.host_affinity, + block.cidr) + + key = _block_datastore_key(block_our_host.cidr) + value = block_our_host.to_json() + self.m_etcd_client.write.assert_has_calls([call(ANY, ""), + call(key, value, + prevExist=False)]) + self.m_etcd_client.read.assert_called_once_with(key, quorum=True) + self.m_etcd_client.delete.assert_called_once_with(ANY) + def test_new_affine_block_race(self): """ Test _new_affine_block when another host claims it between reading