From ca98878028118371d667f402129b1e024b5a0a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 1 Aug 2020 15:28:36 +0200 Subject: [PATCH 001/195] do not force --- pytorch_lightning/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0a4c9c349fba4..31998e9cf793d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1324,7 +1324,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port(force=True) + self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1349,7 +1349,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port(force=True) + self.set_random_port() self.testing = True self.model = model results = self.fit(model) From b0dbc283004a39af0d55676f6bb8012b0e8bba8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:19:40 +0200 Subject: [PATCH 002/195] debug --- pl_examples/basic_examples/gpu_template.py | 6 +++++- pytorch_lightning/core/lightning.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index ced4525d4db66..d4e62b2faf7a6 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -20,11 +20,15 @@ def main(args): # ------------------------ # 2 INIT TRAINER # ------------------------ - trainer = Trainer.from_argparse_args(args) + trainer = Trainer.from_argparse_args( + args, + distributed_backend='ddp' + ) # ------------------------ # 3 START TRAINING # ------------------------ + trainer.test(model) trainer.fit(model) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 5ff64156e1b33..8a1cd8b4e33ab 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -953,6 +953,8 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi f"is not equal to the computed world size ({world_size}). Ignored." 
) + print('MASTER PORT', os.environ['MASTER_PORT']) + torch_backend = "nccl" if self.trainer.on_gpu else "gloo" log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) From c9f91e02be6644505195d62d3dc867b9f2843280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:23:35 +0200 Subject: [PATCH 003/195] debug --- pytorch_lightning/trainer/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 31998e9cf793d..af49bf8cfc4ae 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1324,7 +1324,8 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port() + self.set_random_port(force=True) + # self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1349,7 +1350,8 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port() + self.set_random_port(force=True) + # self.set_random_port() self.testing = True self.model = model results = self.fit(model) From e64a56f2cbbb0dd146afac56c1e5def2fcf815cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:28:20 +0200 Subject: [PATCH 004/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 3ca5f6ffa68f3..aaf5b9562e96e 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -179,8 +179,8 @@ def train_fx(trial_hparams, cluster_manager, _): else: XLA_AVAILABLE = True -PID = os.getpid() -RNG1 = np.random.RandomState(PID) +#PID = os.getpid() +RNG1 = np.random.RandomState(0) RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) From 602809f87205852e97dd97bb291b8ed58f0653b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:31:19 +0200 Subject: [PATCH 005/195] debug --- pytorch_lightning/trainer/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index af49bf8cfc4ae..848c29006c32d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1338,6 +1338,8 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model_ref = self.get_model() model_ref.teardown('test') + torch_distrib.destroy_process_group() + return results def __test_given_model(self, model, test_dataloaders): @@ -1361,6 +1363,8 @@ def __test_given_model(self, model, test_dataloaders): if self.is_function_implemented('teardown'): model.teardown('test') + torch_distrib.destroy_process_group() + return results def barrier(self, name): From fb2e0c811104bc66f6c88d1f3979aa31e3f4da02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:45:03 +0200 Subject: [PATCH 006/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 1 + pytorch_lightning/trainer/trainer.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py 
b/pytorch_lightning/trainer/distrib_data_parallel.py index aaf5b9562e96e..052c675931b4d 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -407,6 +407,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') + @rank_zero_only def set_random_port(self, force=False): """ When running DDP NOT managed by SLURM, the ports might collide diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 848c29006c32d..e33532fb31b52 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1338,8 +1338,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model_ref = self.get_model() model_ref.teardown('test') - torch_distrib.destroy_process_group() - return results def __test_given_model(self, model, test_dataloaders): From f238dc3898ed7366c096b47ed14856a0bf6d72dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:46:35 +0200 Subject: [PATCH 007/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 052c675931b4d..ce201c3a2b486 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -179,8 +179,10 @@ def train_fx(trial_hparams, cluster_manager, _): else: XLA_AVAILABLE = True -#PID = os.getpid() -RNG1 = np.random.RandomState(0) + +print('rank at import ', rank_zero_only.rank) +PID = os.getpid() +RNG1 = np.random.RandomState(PID) RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) From 81c5255d6fd06ed13be35fd9dca42f12dd8c9124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:51:03 +0200 Subject: [PATCH 008/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index ce201c3a2b486..3b20433a7893b 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -409,7 +409,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... 
this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') - @rank_zero_only + # @rank_zero_only def set_random_port(self, force=False): """ When running DDP NOT managed by SLURM, the ports might collide From d48e1477fbd177a5a488f30a374c68b1729661ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 19:53:38 +0200 Subject: [PATCH 009/195] debug --- pytorch_lightning/trainer/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e33532fb31b52..3da6b51b110d4 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1338,6 +1338,8 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model_ref = self.get_model() model_ref.teardown('test') + torch_distrib.destroy_process_group() + return results def __test_given_model(self, model, test_dataloaders): From 885c1d772a8f383bbe205e3d835f2038bd6d1fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:17:29 +0200 Subject: [PATCH 010/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 9 +++------ pytorch_lightning/trainer/trainer.py | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 3b20433a7893b..08490ae780b3f 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -429,17 +429,14 @@ def set_random_port(self, force=False): def spawn_ddp_children(self, model): port = os.environ['MASTER_PORT'] - master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR'] + master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') os.environ['MASTER_PORT'] = f'{port}' os.environ['MASTER_ADDR'] = f'{master_address}' # allow the user to pass the node rank node_rank = '0' - if 'NODE_RANK' in os.environ: - node_rank = os.environ['NODE_RANK'] - if 'GROUP_RANK' in os.environ: - node_rank = os.environ['GROUP_RANK'] - + node_rank = os.environ.get('NODE_RANK', node_rank) + node_rank = os.environ.get('GROUP_RANK', node_rank) os.environ['NODE_RANK'] = node_rank os.environ['LOCAL_RANK'] = '0' diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 3da6b51b110d4..19abc862af158 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1008,7 +1008,7 @@ def fit( results = self.accelerator_backend.teardown(model) elif self.distributed_backend == 'ddp': - self.set_random_port() + self.set_random_port(force=True) results = self.spawn_ddp_children(model) elif self.use_dp: @@ -1324,7 +1324,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port(force=True) + #self.set_random_port(force=True) # self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' @@ -1352,7 +1352,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port(force=True) + #self.set_random_port(force=True) # self.set_random_port() self.testing = True self.model = model From c1da18b8f3371753cc9040cde979ea4bc8a24928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:24:21 +0200 Subject: [PATCH 011/195] debug --- 
pytorch_lightning/trainer/distrib_data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 08490ae780b3f..87fc5c96c1903 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -183,7 +183,7 @@ def train_fx(trial_hparams, cluster_manager, _): print('rank at import ', rank_zero_only.rank) PID = os.getpid() RNG1 = np.random.RandomState(PID) -RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) +RANDOM_PORTS = list(range(10000, 20000)) class TrainerDDPMixin(ABC): From 98cb6bb40133099023fec9dd386fc36ec76cfed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:27:12 +0200 Subject: [PATCH 012/195] debug --- pytorch_lightning/trainer/trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 19abc862af158..b6e23b47b4631 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1008,7 +1008,7 @@ def fit( results = self.accelerator_backend.teardown(model) elif self.distributed_backend == 'ddp': - self.set_random_port(force=True) + self.set_random_port() results = self.spawn_ddp_children(model) elif self.use_dp: @@ -1324,7 +1324,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - #self.set_random_port(force=True) + self.set_random_port(force=True) # self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' @@ -1338,7 +1338,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model_ref = self.get_model() model_ref.teardown('test') - torch_distrib.destroy_process_group() + #torch_distrib.destroy_process_group() return results @@ -1352,7 +1352,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - #self.set_random_port(force=True) + self.set_random_port(force=True) # self.set_random_port() self.testing = True self.model = model @@ -1363,7 +1363,7 @@ def __test_given_model(self, model, test_dataloaders): if self.is_function_implemented('teardown'): model.teardown('test') - torch_distrib.destroy_process_group() + #torch_distrib.destroy_process_group() return results From 87e4a78062d4f6ef9a6108f05788809ff51115d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:37:39 +0200 Subject: [PATCH 013/195] debug --- pytorch_lightning/core/lightning.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 8a1cd8b4e33ab..4113bef3f4eff 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -33,6 +33,18 @@ XLA_AVAILABLE = True + +def run_once(fn): + def wrapper(*args, **kwargs): + if not wrapper.has_run: + wrapper.has_run = True + fn(*args, **kwargs) + wrapper.has_run = False + return wrapper + + + + class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks, Module): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -921,6 +933,7 @@ def _init_slurm_connection(self) -> None: root_node = self.trainer.resolve_root_node_address(root_node) os.environ['MASTER_ADDR'] = root_node + @run_once def init_ddp_connection(self, global_rank: int, world_size: int, 
is_slurm_managing_tasks: bool = True) -> None: """ Override to define your custom way of setting up a distributed environment. From 827944961de1bbca0111b73244645c8b7e9d5ffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:39:46 +0200 Subject: [PATCH 014/195] debug --- pytorch_lightning/core/lightning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 4113bef3f4eff..dee7456b2e10c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -24,6 +24,7 @@ from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args +from pytorch_lightning.utilities import rank_zero_only try: import torch_xla.core.xla_model as xm @@ -38,6 +39,7 @@ def run_once(fn): def wrapper(*args, **kwargs): if not wrapper.has_run: wrapper.has_run = True + print('running it once on ', rank_zero_only.rank) fn(*args, **kwargs) wrapper.has_run = False return wrapper From 3fbdf763e7fbbae977d3fb5c9c6e916bc8749f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:41:02 +0200 Subject: [PATCH 015/195] debug --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index dee7456b2e10c..d1e286708651a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -39,7 +39,7 @@ def run_once(fn): def wrapper(*args, **kwargs): if not wrapper.has_run: wrapper.has_run = True - print('running it once on ', rank_zero_only.rank) + # print('running it once on ', rank_zero_only.rank) fn(*args, **kwargs) wrapper.has_run = False return wrapper From 34ac16b462ac4dfabf01933e78852f32f1db1fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:42:56 +0200 Subject: [PATCH 016/195] debug --- pytorch_lightning/core/lightning.py | 2 -- pytorch_lightning/trainer/trainer.py | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d1e286708651a..4113bef3f4eff 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -24,7 +24,6 @@ from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args -from pytorch_lightning.utilities import rank_zero_only try: import torch_xla.core.xla_model as xm @@ -39,7 +38,6 @@ def run_once(fn): def wrapper(*args, **kwargs): if not wrapper.has_run: wrapper.has_run = True - # print('running it once on ', rank_zero_only.rank) fn(*args, **kwargs) wrapper.has_run = False return wrapper diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b6e23b47b4631..1d88bc65943d3 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1324,7 +1324,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port(force=True) + self.set_random_port() # self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' @@ -1352,8 
+1352,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port(force=True) - # self.set_random_port() + self.set_random_port() self.testing = True self.model = model results = self.fit(model) From 24fb056d271d18871884b788e0477fae9c8c4a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:44:22 +0200 Subject: [PATCH 017/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 87fc5c96c1903..c6450d593490f 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -183,7 +183,8 @@ def train_fx(trial_hparams, cluster_manager, _): print('rank at import ', rank_zero_only.rank) PID = os.getpid() RNG1 = np.random.RandomState(PID) -RANDOM_PORTS = list(range(10000, 20000)) +#RANDOM_PORTS = list(range(10000, 20000)) +RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) class TrainerDDPMixin(ABC): From 9fce42149d1d3bcf5d11b9fb95c76ce6c555b555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:54:31 +0200 Subject: [PATCH 018/195] merge --- pytorch_lightning/accelerator_backends/ddp_backend.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 0b90a83474612..e7d053c81acd2 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -58,17 +58,14 @@ def train(self, model): def spawn_ddp_children(self, model): port = os.environ['MASTER_PORT'] - master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR'] + master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') os.environ['MASTER_PORT'] = f'{port}' os.environ['MASTER_ADDR'] = f'{master_address}' # allow the user to pass the node rank node_rank = '0' - if 'NODE_RANK' in os.environ: - node_rank = os.environ['NODE_RANK'] - if 'GROUP_RANK' in os.environ: - node_rank = os.environ['GROUP_RANK'] - + node_rank = os.environ.get('NODE_RANK', node_rank) + node_rank = os.environ.get('GROUP_RANK', node_rank) os.environ['NODE_RANK'] = node_rank os.environ['LOCAL_RANK'] = '0' From 7b42a0fb9a2c9b043787c513442c1e5c22121dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 3 Aug 2020 23:57:48 +0200 Subject: [PATCH 019/195] debug --- pytorch_lightning/trainer/trainer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e2a3281fec3db..f217a4524b838 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1361,8 +1361,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): model_ref = self.get_model() model_ref.teardown('test') - #torch_distrib.destroy_process_group() - return results def __test_given_model(self, model, test_dataloaders): @@ -1383,8 +1381,6 @@ def __test_given_model(self, model, test_dataloaders): if self.is_function_implemented('teardown'): model.teardown('test') - #torch_distrib.destroy_process_group() - return results def barrier(self, name): From 223136bacf60b5aa888706924125196df2009978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= 
Date: Tue, 4 Aug 2020 00:03:25 +0200 Subject: [PATCH 020/195] debug --- pytorch_lightning/core/decorators.py | 17 +++++++++++++++++ pytorch_lightning/core/lightning.py | 15 +-------------- .../trainer/distrib_data_parallel.py | 1 - pytorch_lightning/trainer/trainer.py | 1 - 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 97eba56ea2464..1828e5a398416 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -48,3 +48,20 @@ def auto_transfer_args(self, *args, **kwargs): return fn(self, *args, **kwargs) return auto_transfer_args + + +def run_once(fn): + """ + Decorate a function or method to make it run only once. + Subsequent calls will result in a no-operation. + """ + @wraps(fn) + def wrapper(*args, **kwargs): + if not wrapper.has_run: + wrapper.has_run = True + fn(*args, **kwargs) + + wrapper.has_run = False + return wrapper + + diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 46f50c10a29f1..475478081bc2c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -16,6 +16,7 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log +from pytorch_lightning.core.decorators import run_once from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import ModelHooks from pytorch_lightning.core.memory import ModelSummary @@ -33,18 +34,6 @@ XLA_AVAILABLE = True - -def run_once(fn): - def wrapper(*args, **kwargs): - if not wrapper.has_run: - wrapper.has_run = True - fn(*args, **kwargs) - wrapper.has_run = False - return wrapper - - - - class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks, Module): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -966,8 +955,6 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi f"is not equal to the computed world size ({world_size}). Ignored." ) - print('MASTER PORT', os.environ['MASTER_PORT']) - torch_backend = "nccl" if self.trainer.on_gpu else "gloo" log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 457825454d87c..e82951abdec7a 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -408,7 +408,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... 
this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') - # @rank_zero_only def set_random_port(self, force=False): """ When running DDP NOT managed by SLURM, the ports might collide diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f217a4524b838..24af21e771c47 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1348,7 +1348,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path self.set_random_port() - # self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model From 463dfbbe3322d9aa408363483fa9f4622f008b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 00:23:01 +0200 Subject: [PATCH 021/195] debug --- pytorch_lightning/core/decorators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 1828e5a398416..097e1fa2100d7 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,8 +1,6 @@ from functools import wraps from typing import Callable -from pytorch_lightning.core.lightning import LightningModule - def auto_move_data(fn: Callable) -> Callable: """ @@ -40,6 +38,9 @@ def forward(self, x): """ @wraps(fn) def auto_transfer_args(self, *args, **kwargs): + # local import to prevent circular import issue + from pytorch_lightning.core.lightning import LightningModule + if not isinstance(self, LightningModule): return fn(self, *args, **kwargs) From 8395e149441f82d2513fc7e314a54fc237e0653b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 00:24:07 +0200 Subject: [PATCH 022/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index e82951abdec7a..d6ce609d2b966 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -172,10 +172,8 @@ def train_fx(trial_hparams, cluster_manager, _): XLA_AVAILABLE = True -print('rank at import ', rank_zero_only.rank) PID = os.getpid() RNG1 = np.random.RandomState(PID) -#RANDOM_PORTS = list(range(10000, 20000)) RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) From 3453ee21405b1fd99f69da0ec3fa591e8318a0ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 02:43:15 +0200 Subject: [PATCH 023/195] debug --- pl_examples/basic_examples/gpu_template.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index d4e62b2faf7a6..ae18a47335ec4 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -22,7 +22,10 @@ def main(args): # ------------------------ trainer = Trainer.from_argparse_args( args, - distributed_backend='ddp' + distributed_backend='ddp', + limit_train_batches=10, + limit_val_batches=10, + max_epochs=1, ) # ------------------------ @@ -30,6 +33,8 @@ def main(args): # ------------------------ trainer.test(model) trainer.fit(model) + trainer.test(model) + trainer.fit(model) def run_cli(): From 69804b7f16071592e7e0c32b8954dbd9a39d4b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= 
Date: Tue, 4 Aug 2020 03:06:13 +0200 Subject: [PATCH 024/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 5 +++++ pytorch_lightning/trainer/training_loop.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index e7d053c81acd2..82be0680e3164 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -222,5 +222,10 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # clean up memory torch.cuda.empty_cache() + # clean up dist group + #if self.use_ddp or self.use_ddp2: + import torch.distributed as torch_distrib + torch_distrib.destroy_process_group() + if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e0a7b43a872aa..79b02a6d1fdff 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1021,8 +1021,8 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - if self.use_ddp or self.use_ddp2: - torch_distrib.destroy_process_group() + # if self.use_ddp or self.use_ddp2: + # torch_distrib.destroy_process_group() # clear mem if self.on_gpu: From 60241b5ce6f5017c1e7fb2b655fcaebfaaeae825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:11:26 +0200 Subject: [PATCH 025/195] debug --- pytorch_lightning/core/lightning.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 475478081bc2c..0e926987948cb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -922,7 +922,7 @@ def _init_slurm_connection(self) -> None: root_node = self.trainer.resolve_root_node_address(root_node) os.environ['MASTER_ADDR'] = root_node - @run_once + #@run_once def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: """ Override to define your custom way of setting up a distributed environment. @@ -936,6 +936,10 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi is_slurm_managing_tasks: is cluster managed by SLURM. 
""" + from torch.distributed.distributed_c10d import is_initialized + if is_initialized(): + return + if is_slurm_managing_tasks: self._init_slurm_connection() From 700d881119baa696b5d6691d0a23cf43f599ae91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:14:31 +0200 Subject: [PATCH 026/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 4 ++-- pytorch_lightning/trainer/training_loop.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 82be0680e3164..a0baaa0fc2768 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -224,8 +224,8 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # clean up dist group #if self.use_ddp or self.use_ddp2: - import torch.distributed as torch_distrib - torch_distrib.destroy_process_group() + # import torch.distributed as torch_distrib + # torch_distrib.destroy_process_group() if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 79b02a6d1fdff..e0a7b43a872aa 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1021,8 +1021,8 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - # if self.use_ddp or self.use_ddp2: - # torch_distrib.destroy_process_group() + if self.use_ddp or self.use_ddp2: + torch_distrib.destroy_process_group() # clear mem if self.on_gpu: From 99281485bce41eb023139d6e5b8f5217006d066b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:16:29 +0200 Subject: [PATCH 027/195] debug --- pytorch_lightning/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 24af21e771c47..c30f78e640baa 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1347,7 +1347,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port() + self.set_random_port(force=True) self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1370,7 +1370,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port() + self.set_random_port(force=True) self.testing = True self.model = model results = self.fit(model) From f3c44044a142e4a81c746d61b934e5dabb614204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:18:22 +0200 Subject: [PATCH 028/195] debug --- pytorch_lightning/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c30f78e640baa..c62b9053a34af 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1347,7 +1347,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port(force=True) + #self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = 
model @@ -1370,7 +1370,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port(force=True) + #self.set_random_port() self.testing = True self.model = model results = self.fit(model) From d95cc46d9831c5818ca86fc39d652d179fdeb63c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:23:32 +0200 Subject: [PATCH 029/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index a0baaa0fc2768..62c1f9ef07bf5 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -56,6 +56,8 @@ def train(self, model): self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) def spawn_ddp_children(self, model): + + self.trainer.set_random_port() port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') From 50ab31ea662697ad98c782e8cd9a5ff32aa726ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:38:58 +0200 Subject: [PATCH 030/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 62c1f9ef07bf5..535ef831d1f6f 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -56,7 +56,7 @@ def train(self, model): self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) def spawn_ddp_children(self, model): - + print('setting port on ', self.trainer.global_rank) self.trainer.set_random_port() port = os.environ['MASTER_PORT'] From 43f2d652e70fed21b65fcebcdceca82a257f7941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:56:11 +0200 Subject: [PATCH 031/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index d6ce609d2b966..9d213adaee0d7 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -172,9 +172,10 @@ def train_fx(trial_hparams, cluster_manager, _): XLA_AVAILABLE = True -PID = os.getpid() -RNG1 = np.random.RandomState(PID) -RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) +#PID = os.getpid() +#RNG1 = np.random.RandomState(PID) +#RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) +RANDOM_PORTS = list(range(10000, 20000)) class TrainerDDPMixin(ABC): From 752dbf19cda6b72cacd61481a0ca66bea9b6d43b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 03:57:06 +0200 Subject: [PATCH 032/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 535ef831d1f6f..abdb6404a1f2f 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -57,7 +57,7 @@ def train(self, model): def spawn_ddp_children(self, model): print('setting 
port on ', self.trainer.global_rank) - self.trainer.set_random_port() + self.trainer.set_random_port(force=True) port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') From fc15ea772fc1619df14882ade7215dc81f373f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 04:03:45 +0200 Subject: [PATCH 033/195] debug --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 0e926987948cb..73633db770ca9 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -958,7 +958,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi f"WORLD_SIZE environment variable ({os.environ['WORLD_SIZE']}) " f"is not equal to the computed world size ({world_size}). Ignored." ) - + print('master port init', os.environ['MASTER_PORT']) torch_backend = "nccl" if self.trainer.on_gpu else "gloo" log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) From 61e90f580a1166e7d1acad4ba4ff6734f7531fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 04:05:42 +0200 Subject: [PATCH 034/195] debug --- pytorch_lightning/core/lightning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 73633db770ca9..3ddaeb988cc36 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -937,8 +937,8 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi """ from torch.distributed.distributed_c10d import is_initialized - if is_initialized(): - return + #if is_initialized(): + #return if is_slurm_managing_tasks: self._init_slurm_connection() From bf30a98d8c5cf32c43f2870d06f32353901a40f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 04:08:33 +0200 Subject: [PATCH 035/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 9d213adaee0d7..88ec05ee618d1 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -413,13 +413,14 @@ def set_random_port(self, force=False): """ # pick a random port first assert self.num_nodes == 1, 'random port can only be called from single node training' - global RANDOM_PORTS - default_port = RANDOM_PORTS[-1] - RANDOM_PORTS = RANDOM_PORTS[:-1] + + default_port = os.environ.get('MASTER_PORT') # when not forced, use the user port - if not force: - default_port = os.environ.get('MASTER_PORT', default_port) + if force or not default_port: + global RANDOM_PORTS + default_port = RANDOM_PORTS[-1] + RANDOM_PORTS = RANDOM_PORTS[:-1] os.environ['MASTER_PORT'] = str(default_port) From 703c1c95c4ad41508e70b07309d9ad8c20a5f2d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 04:11:04 +0200 Subject: [PATCH 036/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 4 ++-- pytorch_lightning/trainer/distrib_data_parallel.py | 1 + pytorch_lightning/trainer/trainer.py | 4 ++-- 3 files changed, 5 
insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index abdb6404a1f2f..c5a1d7075862f 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -56,8 +56,8 @@ def train(self, model): self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) def spawn_ddp_children(self, model): - print('setting port on ', self.trainer.global_rank) - self.trainer.set_random_port(force=True) + # + #self.trainer.set_random_port(force=True) port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 88ec05ee618d1..53192642222aa 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -414,6 +414,7 @@ def set_random_port(self, force=False): # pick a random port first assert self.num_nodes == 1, 'random port can only be called from single node training' + print('setting port on rank', self.global_rank) default_port = os.environ.get('MASTER_PORT') # when not forced, use the user port diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c62b9053a34af..c30f78e640baa 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1347,7 +1347,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - #self.set_random_port() + self.set_random_port(force=True) self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1370,7 +1370,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - #self.set_random_port() + self.set_random_port(force=True) self.testing = True self.model = model results = self.fit(model) From 414e6ccf14e8453332e161b5fc8773f5d9ce120b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 21:51:18 +0200 Subject: [PATCH 037/195] debug --- pl_examples/basic_examples/gpu_template2.py | 59 +++++++++++++++++++++ pytorch_lightning/core/lightning.py | 4 +- 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 pl_examples/basic_examples/gpu_template2.py diff --git a/pl_examples/basic_examples/gpu_template2.py b/pl_examples/basic_examples/gpu_template2.py new file mode 100644 index 0000000000000..56c58e8850875 --- /dev/null +++ b/pl_examples/basic_examples/gpu_template2.py @@ -0,0 +1,59 @@ +""" +Runs a model on a single node across multiple gpus. +""" +import os +from argparse import ArgumentParser + +from pytorch_lightning import Trainer, seed_everything +from pl_examples.models.lightning_template import LightningTemplateModel + +seed_everything(234) + + +def main(args): + """ Main training routine specific for this project. 
""" + # ------------------------ + # 1 INIT LIGHTNING MODEL + # ------------------------ + model = LightningTemplateModel(**vars(args)) + + # ------------------------ + # 2 INIT TRAINER + # ------------------------ + trainer = Trainer.from_argparse_args( + args, + distributed_backend='ddp', + limit_train_batches=10, + limit_val_batches=10, + max_epochs=1, + ) + + # ------------------------ + # 3 START TRAINING + # ------------------------ + trainer.fit(model) + trainer.test(model) + + +def run_cli(): + # ------------------------ + # TRAINING ARGUMENTS + # ------------------------ + # these are project-wide arguments + root_dir = os.path.dirname(os.path.realpath(__file__)) + parent_parser = ArgumentParser(add_help=False) + + # each LightningModule defines arguments relevant to it + parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) + parser = Trainer.add_argparse_args(parser) + parser.set_defaults(gpus=2) + args = parser.parse_args() + + # --------------------- + # RUN TRAINING + # --------------------- + main(args) + + +if __name__ == '__main__': + run_cli() diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3ddaeb988cc36..73633db770ca9 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -937,8 +937,8 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi """ from torch.distributed.distributed_c10d import is_initialized - #if is_initialized(): - #return + if is_initialized(): + return if is_slurm_managing_tasks: self._init_slurm_connection() From 3a75faf495ce9923055d5f985cc1923b2ec8fd2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 21:54:18 +0200 Subject: [PATCH 038/195] debug --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e0a7b43a872aa..1180a361b3a3b 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1021,8 +1021,8 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - if self.use_ddp or self.use_ddp2: - torch_distrib.destroy_process_group() + # if self.use_ddp or self.use_ddp2: + # torch_distrib.destroy_process_group() # clear mem if self.on_gpu: From 6d8cd8143278e9007ca89ee757cdb9861a5b6b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 21:59:57 +0200 Subject: [PATCH 039/195] debug --- pl_examples/basic_examples/gpu_template2.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pl_examples/basic_examples/gpu_template2.py b/pl_examples/basic_examples/gpu_template2.py index 56c58e8850875..d9817368a8fb7 100644 --- a/pl_examples/basic_examples/gpu_template2.py +++ b/pl_examples/basic_examples/gpu_template2.py @@ -4,12 +4,18 @@ import os from argparse import ArgumentParser -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import Trainer, seed_everything, Callback from pl_examples.models.lightning_template import LightningTemplateModel seed_everything(234) +class DebugCallback(Callback): + + def on_test_batch_end(self, trainer, pl_module): + print('test_batch', trainer.global_rank) + + def main(args): """ Main training routine specific for this project. 
""" # ------------------------ From 85f8929ddbbb726529ca80a6d01493cf4bab76ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:05:18 +0200 Subject: [PATCH 040/195] debug --- pytorch_lightning/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c30f78e640baa..24af21e771c47 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1347,7 +1347,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port(force=True) + self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1370,7 +1370,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port(force=True) + self.set_random_port() self.testing = True self.model = model results = self.fit(model) From f3bb93d430ddc7d575939f511da3496c1be1e62c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:10:52 +0200 Subject: [PATCH 041/195] debug --- pytorch_lightning/core/lightning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 73633db770ca9..22434ce40c39e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -938,6 +938,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi """ from torch.distributed.distributed_c10d import is_initialized if is_initialized(): + print('already initialized', os.environ['MASTER_PORT'], os.getpid()) return if is_slurm_managing_tasks: @@ -958,7 +959,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi f"WORLD_SIZE environment variable ({os.environ['WORLD_SIZE']}) " f"is not equal to the computed world size ({world_size}). Ignored." ) - print('master port init', os.environ['MASTER_PORT']) + print('master port init', os.environ['MASTER_PORT'], os.getpid()) torch_backend = "nccl" if self.trainer.on_gpu else "gloo" log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) From 53a733812d6103b0b5c692c1de02b007076a3e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:20:43 +0200 Subject: [PATCH 042/195] debug --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 22434ce40c39e..c23bf306e315c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -936,7 +936,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi is_slurm_managing_tasks: is cluster managed by SLURM. 
""" - from torch.distributed.distributed_c10d import is_initialized + from torch.distributed import is_initialized if is_initialized(): print('already initialized', os.environ['MASTER_PORT'], os.getpid()) return From 7a35761b1c262e8cf4a6bc7d6c309815f4baddbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:30:04 +0200 Subject: [PATCH 043/195] debug --- pytorch_lightning/trainer/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 24af21e771c47..95db6a7591c00 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1296,6 +1296,7 @@ def test( self.verbose_test = verbose if self.global_rank != 0: + print('testing on global rank > 0 does not work???') return # If you supply a datamodule you can't supply train_dataloader or val_dataloaders From 79358fc3c72368dbb931dd06629f41c06590a850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:54:09 +0200 Subject: [PATCH 044/195] debug --- .../accelerator_backends/ddp_backend.py | 17 ++++++++++++----- pytorch_lightning/core/lightning.py | 5 ----- pytorch_lightning/trainer/trainer.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index c5a1d7075862f..dfac33eae5f09 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -152,11 +152,18 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = self.trainer - model.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) + + from torch.distributed import is_initialized + if not is_master or not is_initialized(): + assert is_master and self.trainer.global_rank == 0 + # on rank > 0, we always need to initialize, because these are new processes + model.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + else: + print('already initialized', os.environ['MASTER_PORT'], os.getpid(), is_master) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index c23bf306e315c..a3329742d6cc7 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -936,11 +936,6 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi is_slurm_managing_tasks: is cluster managed by SLURM. 
""" - from torch.distributed import is_initialized - if is_initialized(): - print('already initialized', os.environ['MASTER_PORT'], os.getpid()) - return - if is_slurm_managing_tasks: self._init_slurm_connection() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 95db6a7591c00..7538ccf75e353 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1296,7 +1296,7 @@ def test( self.verbose_test = verbose if self.global_rank != 0: - print('testing on global rank > 0 does not work???') + # do nothing, rank 0 process will launch new processes for testing return # If you supply a datamodule you can't supply train_dataloader or val_dataloaders From f106dfb9f3f540b093df7e1ea1830cd173d014a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:55:59 +0200 Subject: [PATCH 045/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index dfac33eae5f09..d441dd504ccb3 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -155,7 +155,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 from torch.distributed import is_initialized if not is_master or not is_initialized(): - assert is_master and self.trainer.global_rank == 0 + assert not (is_master and self.trainer.global_rank > 0) # on rank > 0, we always need to initialize, because these are new processes model.init_ddp_connection( self.trainer.global_rank, From 3d016046f8352ace838aefa344d0d421f3c2e273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 22:59:34 +0200 Subject: [PATCH 046/195] debug --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 1180a361b3a3b..5ff5c092a2c29 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1021,8 +1021,8 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - # if self.use_ddp or self.use_ddp2: - # torch_distrib.destroy_process_group() + if (self.use_ddp or self.use_ddp2) and self.global_rank > 0: + torch_distrib.destroy_process_group() # clear mem if self.on_gpu: From f97a8ed88697a931504417227230e8e400b475c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 23:17:29 +0200 Subject: [PATCH 047/195] debug --- .../accelerator_backends/ddp_backend.py | 27 ++++++++++--------- pytorch_lightning/trainer/training_loop.py | 2 +- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index d441dd504ccb3..731a38376af56 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -57,7 +57,8 @@ def train(self, model): def spawn_ddp_children(self, model): # - #self.trainer.set_random_port(force=True) + assert self.trainer.global_rank == 0 + self.trainer.set_random_port(force=True) port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') @@ -153,17 +154,17 @@ def ddp_train(self, 
process_idx, mp_queue, model, is_master=False, proc_offset=0 # where to store ip_table model.trainer = self.trainer - from torch.distributed import is_initialized - if not is_master or not is_initialized(): - assert not (is_master and self.trainer.global_rank > 0) - # on rank > 0, we always need to initialize, because these are new processes - model.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - else: - print('already initialized', os.environ['MASTER_PORT'], os.getpid(), is_master) + # from torch.distributed import is_initialized + # if not is_master or not is_initialized(): + # assert not (is_master and self.trainer.global_rank > 0) + # # on rank > 0, we always need to initialize, because these are new processes + model.init_ddp_connection( + self.trainer.global_rank, + self.trainer.world_size, + self.trainer.is_slurm_managing_tasks + ) + # else: + # print('already initialized', os.environ['MASTER_PORT'], os.getpid(), is_master) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) @@ -236,5 +237,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # import torch.distributed as torch_distrib # torch_distrib.destroy_process_group() + torch.distributed.destroy_process_group() + if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5ff5c092a2c29..bb181ca12ca83 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1021,7 +1021,7 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - if (self.use_ddp or self.use_ddp2) and self.global_rank > 0: + if (self.use_ddp or self.use_ddp2): torch_distrib.destroy_process_group() # clear mem From b42625843c048436b651c4644cfe658fe2114e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 23:19:17 +0200 Subject: [PATCH 048/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- pytorch_lightning/trainer/training_loop.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 731a38376af56..d2138c375786a 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -237,7 +237,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # import torch.distributed as torch_distrib # torch_distrib.destroy_process_group() - torch.distributed.destroy_process_group() + # torch.distributed.destroy_process_group() if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index bb181ca12ca83..68c834b74fe63 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1022,6 +1022,7 @@ def run_training_teardown(self): # clean up dist group if (self.use_ddp or self.use_ddp2): + print('destroy on rank ', self.global_rank, os.getpid()) torch_distrib.destroy_process_group() # clear mem From cf0964264fcd802d1315ed19bf50145995a0ebb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 
Aug 2020 23:25:12 +0200 Subject: [PATCH 049/195] debug --- pytorch_lightning/trainer/trainer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7538ccf75e353..052e6bbf844a6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1019,7 +1019,7 @@ def fit( # ddp elif self.distributed_backend == 'ddp': - self.set_random_port() + # self.set_random_port() self.accelerator_backend = DDPBackend(self) results = self.accelerator_backend.spawn_ddp_children(model) @@ -1315,6 +1315,10 @@ def test( self.teardown('test') + if torch.distributed.is_initialized(): + print('destroy in test', self.global_rank, os.getpid()) + torch.distributed.destroy_process_group(()) + return results def __test_using_best_weights(self, ckpt_path, test_dataloaders): @@ -1348,7 +1352,7 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port() + #self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1371,7 +1375,7 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port() + #self.set_random_port() self.testing = True self.model = model results = self.fit(model) From 138c9061a4c5402f0b2d70c0d6cb10c40b2376bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 23:26:02 +0200 Subject: [PATCH 050/195] debug --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 052e6bbf844a6..26e2a1e728f3c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1317,7 +1317,7 @@ def test( if torch.distributed.is_initialized(): print('destroy in test', self.global_rank, os.getpid()) - torch.distributed.destroy_process_group(()) + torch.distributed.destroy_process_group() return results From b3665d7fce0dfb28735e57a220b072105a6f6c3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 23:31:49 +0200 Subject: [PATCH 051/195] debug --- pl_examples/basic_examples/gpu_template2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pl_examples/basic_examples/gpu_template2.py b/pl_examples/basic_examples/gpu_template2.py index d9817368a8fb7..c85043458c939 100644 --- a/pl_examples/basic_examples/gpu_template2.py +++ b/pl_examples/basic_examples/gpu_template2.py @@ -32,6 +32,7 @@ def main(args): limit_train_batches=10, limit_val_batches=10, max_epochs=1, + callbacks=[DebugCallback()], ) # ------------------------ From 47c480045f888b1d2692069456f36960ea8098f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 23:39:40 +0200 Subject: [PATCH 052/195] debug --- pytorch_lightning/trainer/evaluation_loop.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index add9bb24c672a..9c597d7538dcd 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -291,6 +291,7 @@ def _evaluate( # run validation for dataloader_idx, dataloader in enumerate(dataloaders): + print('here 1') dl_outputs = [] # on TPU we have to wrap it under the ParallelLoader @@ -303,6 +304,7 @@ def 
_evaluate( dl_max_batches = max_batches[dataloader_idx] for batch_idx, batch in enumerate(dataloader): + print('here 2') if batch is None: continue @@ -600,7 +602,7 @@ def __log_evaluation_epoch_metrics(self, eval_results, test_mode): def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False): # make dataloader_idx arg in validation_step optional args = [batch, batch_idx] - + print('here 3') if (test_mode and len(self.test_dataloaders) > 1) \ or (not test_mode and len(self.val_dataloaders) > 1): args.append(dataloader_idx) @@ -610,6 +612,8 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: output = model(*args) return output + print('here 4') + # Horovod if self.use_horovod and self.on_gpu: batch = self.transfer_batch_to_gpu(batch, hvd.local_rank()) @@ -635,4 +639,6 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: else: output = model.validation_step(*args) + print('here 5') + return output From 6a9750fb1138ff692ccb0c34cff6cb1e42435a25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 4 Aug 2020 23:42:11 +0200 Subject: [PATCH 053/195] debug --- pytorch_lightning/trainer/evaluation_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 9c597d7538dcd..a3a8102f3f204 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -609,6 +609,7 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: # handle DP, DDP forward if self.use_ddp or self.use_dp or self.use_ddp2: + # SOMETHING GOES WRONG HERE, test loop is stuck output = model(*args) return output From 4e39510da568f95183feed5dd177f2f352e74b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:33:29 +0200 Subject: [PATCH 054/195] ddptest --- tests/models/data/ddp/train_default_model.py | 54 ++++++++++++++++++++ tests/models/test_gpu.py | 23 +++++++++ 2 files changed, 77 insertions(+) create mode 100644 tests/models/data/ddp/train_default_model.py diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py new file mode 100644 index 0000000000000..0bbd493e16f5d --- /dev/null +++ b/tests/models/data/ddp/train_default_model.py @@ -0,0 +1,54 @@ +""" +Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus. 
+""" +from argparse import ArgumentParser + +from pytorch_lightning import Trainer, seed_everything +from tests.base import EvalModelTemplate + + +def variation_fit_test(trainer, model): + trainer.fit(model) + trainer.test(model) + + +def variation_test_fit(trainer, model): + trainer.test(model) + trainer.fit(model) + + +def variation_test_test(trainer, model): + trainer.test(model) + trainer.test(model) + + +def variation_test_fit_test(trainer, model): + trainer.test(model) + trainer.fit(model) + trainer.test(model) + + +def get_variations(): + variations = [v for v in locals() if v.startswith("variation")] + return variations + + +def main(): + seed_everything(1234) + parser = ArgumentParser(add_help=False) + parser = Trainer.add_argparse_args(parser) + parser.add_argument('variation', default=variation_fit_test.__name__, required=True) + parser.set_defaults(gpus=2) + parser.set_defaults(distributed_backend="ddp") + args = parser.parse_args() + + model = EvalModelTemplate() + trainer = Trainer.from_argparse_args(args) + + # run the chosen variation + run_variation = locals()[args.variation] + run_variation(trainer, model) + + +if __name__ == '__main__': + main() diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 6752e559632cb..e45a366d30abb 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -1,3 +1,6 @@ +import subprocess +import sys +from pathlib import Path from collections import namedtuple import pytest @@ -11,6 +14,9 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate from torchtext.data import Batch, Dataset, Example, Field, LabelField + +from tests.models.data.ddp import train_default_model + PRETEND_N_OF_GPUS = 16 @@ -92,6 +98,23 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') +@pytest.mark.parametrize('cli_args', [ + '--max_epochs 1 --gpus 2 --distributed_backend ddp', +]) +@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): + file = Path(train_default_model.__file__).absolute() + cli_args = cli_args.split(' ') if cli_args else [] + cli_args += ['--default_root_dir', str(tmpdir)] + cli_args += ['--variation', variation] + command = [sys.executable, file] + cli_args + + p = subprocess.Popen(command, stderr=subprocess.PIPE) + std, err = p.communicate() + assert std and not err + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): tutils.set_random_master_port() From 7d82e6b0e25eba96e112bdbe516a34603bcb8232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:39:34 +0200 Subject: [PATCH 055/195] ddptest --- tests/models/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index e45a366d30abb..a1dba7ec759e9 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -99,10 +99,10 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('cli_args', [ - '--max_epochs 1 --gpus 2 --distributed_backend ddp', + pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) @pytest.mark.parametrize('variation', train_default_model.get_variations()) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") 
+#@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_default_model.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] From 6c4e4c9611c1bf945d4c91e9130fe7ab3866e882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:41:45 +0200 Subject: [PATCH 056/195] ddptest --- tests/models/test_gpu.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index a1dba7ec759e9..826857d1d3fa4 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -99,20 +99,21 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('cli_args', [ - pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), + '--max_epochs 1 --gpus 2 --distributed_backend ddp', ]) @pytest.mark.parametrize('variation', train_default_model.get_variations()) -#@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): - file = Path(train_default_model.__file__).absolute() - cli_args = cli_args.split(' ') if cli_args else [] - cli_args += ['--default_root_dir', str(tmpdir)] - cli_args += ['--variation', variation] - command = [sys.executable, file] + cli_args - - p = subprocess.Popen(command, stderr=subprocess.PIPE) - std, err = p.communicate() - assert std and not err + assert True + # file = Path(train_default_model.__file__).absolute() + # cli_args = cli_args.split(' ') if cli_args else [] + # cli_args += ['--default_root_dir', str(tmpdir)] + # cli_args += ['--variation', variation] + # command = [sys.executable, file] + cli_args + # + # p = subprocess.Popen(command, stderr=subprocess.PIPE) + # std, err = p.communicate() + # assert std and not err @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From 111633d10f1c66176cc865b88fa6e262bc4b562e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:42:40 +0200 Subject: [PATCH 057/195] ddptest --- tests/models/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 826857d1d3fa4..581093f7750d1 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -101,8 +101,8 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('cli_args', [ '--max_epochs 1 --gpus 2 --distributed_backend ddp', ]) -@pytest.mark.parametrize('variation', train_default_model.get_variations()) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +#@pytest.mark.parametrize('variation', train_default_model.get_variations()) +#@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): assert True # file = Path(train_default_model.__file__).absolute() From 87ee6142a42acd315850de7f53b3ffb57c6f7655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:43:12 +0200 Subject: [PATCH 058/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 581093f7750d1..880995d8b4570 100644 --- 
a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -103,7 +103,7 @@ def test_multi_gpu_model_dp(tmpdir): ]) #@pytest.mark.parametrize('variation', train_default_model.get_variations()) #@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): +def test_multi_gpu_model_ddp(tmpdir, cli_args): assert True # file = Path(train_default_model.__file__).absolute() # cli_args = cli_args.split(' ') if cli_args else [] From 1a26952994370cbc9eb548da2f7ce741cad3e320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:44:09 +0200 Subject: [PATCH 059/195] ddptest --- tests/models/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 880995d8b4570..7e8423418c367 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -101,9 +101,9 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('cli_args', [ '--max_epochs 1 --gpus 2 --distributed_backend ddp', ]) -#@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.parametrize('variation', train_default_model.get_variations()) #@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp(tmpdir, cli_args): +def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): assert True # file = Path(train_default_model.__file__).absolute() # cli_args = cli_args.split(' ') if cli_args else [] From 6354b2159d0f3f17e12380aab77d8aff3edbbfa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:45:22 +0200 Subject: [PATCH 060/195] ddptest --- tests/models/test_gpu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 7e8423418c367..e9ece83806880 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -98,10 +98,12 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') +variations = train_default_model.get_variations() + @pytest.mark.parametrize('cli_args', [ '--max_epochs 1 --gpus 2 --distributed_backend ddp', ]) -@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.parametrize('variation', variations) #@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): assert True From e7b6ea415b4028e865cb3f2b19d3e6adb8c615d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:46:07 +0200 Subject: [PATCH 061/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index e9ece83806880..ef5868758fbe2 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -100,10 +100,10 @@ def test_multi_gpu_model_dp(tmpdir): variations = train_default_model.get_variations() +@pytest.mark.parametrize('variation', variations) @pytest.mark.parametrize('cli_args', [ '--max_epochs 1 --gpus 2 --distributed_backend ddp', ]) -@pytest.mark.parametrize('variation', variations) #@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): assert True From ab9410056ed540e2c23cfad6f94bec9a68fcb015 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:46:58 +0200 Subject: [PATCH 062/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index ef5868758fbe2..77cd2f6984e3a 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -102,7 +102,7 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('variation', variations) @pytest.mark.parametrize('cli_args', [ - '--max_epochs 1 --gpus 2 --distributed_backend ddp', + pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) #@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): From 18e47c7338d90c5030ddd65beb65d4b66a223ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:47:36 +0200 Subject: [PATCH 063/195] ddptest --- tests/models/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 77cd2f6984e3a..45c3f6ca041fc 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -100,8 +100,8 @@ def test_multi_gpu_model_dp(tmpdir): variations = train_default_model.get_variations() -@pytest.mark.parametrize('variation', variations) -@pytest.mark.parametrize('cli_args', [ +@pytest.mark.parametrize(['variation'], variations) +@pytest.mark.parametrize(['cli_args'], [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) #@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From d396f7f8de7a8be795735166a98e4914e9dfdd7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:49:14 +0200 Subject: [PATCH 064/195] ddptest --- tests/models/test_gpu.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 45c3f6ca041fc..9364f8d58e9b5 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -98,24 +98,20 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') -variations = train_default_model.get_variations() - -@pytest.mark.parametrize(['variation'], variations) @pytest.mark.parametrize(['cli_args'], [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) -#@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): - assert True - # file = Path(train_default_model.__file__).absolute() - # cli_args = cli_args.split(' ') if cli_args else [] - # cli_args += ['--default_root_dir', str(tmpdir)] - # cli_args += ['--variation', variation] - # command = [sys.executable, file] + cli_args - # - # p = subprocess.Popen(command, stderr=subprocess.PIPE) - # std, err = p.communicate() - # assert std and not err +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_multi_gpu_model_ddp(tmpdir, cli_args): + file = Path(train_default_model.__file__).absolute() + cli_args = cli_args.split(' ') if cli_args else [] + cli_args += ['--default_root_dir', str(tmpdir)] + + for variation in train_default_model.get_variations(): + command = [sys.executable, file, '--variation', variation] + cli_args + p = subprocess.Popen(command, stderr=subprocess.PIPE) + std, err = p.communicate() + assert std and not err 
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From 5024dcca5fa5e87ec673eea5b593f69b9eefeca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:50:13 +0200 Subject: [PATCH 065/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 9364f8d58e9b5..b518cb1bceb96 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -109,7 +109,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args): for variation in train_default_model.get_variations(): command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stderr=subprocess.PIPE) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate() assert std and not err From 26d49c801c9b171a50846472f08bd04b7362af0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:52:25 +0200 Subject: [PATCH 066/195] ddptest --- tests/models/test_gpu.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b518cb1bceb96..faf1e9f6834e8 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -112,6 +112,11 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args): p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate() assert std and not err + if p.returncode: + print(std) + print(err) + print(command) + raise RuntimeError('error') @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From bd8f762a1819313174ecdb02b0f8c568d0a77b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:53:30 +0200 Subject: [PATCH 067/195] ddptest --- tests/models/data/ddp/train_default_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py index 0bbd493e16f5d..5dd60a626dd92 100644 --- a/tests/models/data/ddp/train_default_model.py +++ b/tests/models/data/ddp/train_default_model.py @@ -29,7 +29,7 @@ def variation_test_fit_test(trainer, model): def get_variations(): - variations = [v for v in locals() if v.startswith("variation")] + variations = [v for v in locals().keys() if v.startswith("variation")] return variations From f3fe1bcdece47ae9ef78ff146717f5c7b3689add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:56:38 +0200 Subject: [PATCH 068/195] ddptest --- tests/models/data/ddp/train_default_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py index 5dd60a626dd92..da55bc037dfdb 100644 --- a/tests/models/data/ddp/train_default_model.py +++ b/tests/models/data/ddp/train_default_model.py @@ -29,7 +29,7 @@ def variation_test_fit_test(trainer, model): def get_variations(): - variations = [v for v in locals().keys() if v.startswith("variation")] + variations = [v for v in globals() if v.startswith("variation")] return variations @@ -46,7 +46,7 @@ def main(): trainer = Trainer.from_argparse_args(args) # run the chosen variation - run_variation = locals()[args.variation] + run_variation = globals()[args.variation] run_variation(trainer, model) From e4d1823fe6693668e46d79d43e60dc265b38da18 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:57:55 +0200 Subject: [PATCH 069/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index faf1e9f6834e8..eaff45e70d4f6 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -111,7 +111,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args): command = [sys.executable, file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate() - assert std and not err + #assert std and not err if p.returncode: print(std) print(err) From 924b26a4e1bdf4ae2d5c2cc163cdcec02dcad36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 22:59:11 +0200 Subject: [PATCH 070/195] ddptest --- tests/models/data/ddp/train_default_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py index da55bc037dfdb..2949750951099 100644 --- a/tests/models/data/ddp/train_default_model.py +++ b/tests/models/data/ddp/train_default_model.py @@ -37,7 +37,7 @@ def main(): seed_everything(1234) parser = ArgumentParser(add_help=False) parser = Trainer.add_argparse_args(parser) - parser.add_argument('variation', default=variation_fit_test.__name__, required=True) + parser.add_argument('--variation', default=variation_fit_test.__name__) parser.set_defaults(gpus=2) parser.set_defaults(distributed_backend="ddp") args = parser.parse_args() From 38b89d8ba9c6109c9a4e749393fb7c19e1130641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:05:30 +0200 Subject: [PATCH 071/195] ddptest --- tests/models/test_gpu.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index eaff45e70d4f6..c26b9d79b4766 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -101,22 +101,23 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize(['cli_args'], [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) +@pytest.mark.parametrize(['variation'], train_default_model.get_variations()) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp(tmpdir, cli_args): +def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_default_model.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - for variation in train_default_model.get_variations(): - command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - std, err = p.communicate() - #assert std and not err - if p.returncode: - print(std) - print(err) - print(command) - raise RuntimeError('error') + #for variation in train_default_model.get_variations(): + command = [sys.executable, file, '--variation', variation] + cli_args + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + std, err = p.communicate() + #assert std and not err + if p.returncode: + print(std) + print(err) + print(command) + raise RuntimeError('error') @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From 6bd3cec6af9b25ac70f560ed6eeda2fc8ad25613 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:07:08 +0200 Subject: [PATCH 072/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index c26b9d79b4766..b2e99fee1a283 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -101,7 +101,7 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize(['cli_args'], [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) -@pytest.mark.parametrize(['variation'], train_default_model.get_variations()) +@pytest.mark.parametrize('variation', train_default_model.get_variations()) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_default_model.__file__).absolute() From 443121387eecf67b9a759187f4109bbcccade33f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:08:40 +0200 Subject: [PATCH 073/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b2e99fee1a283..b02bcc0f0191e 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -98,7 +98,7 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') -@pytest.mark.parametrize(['cli_args'], [ +@pytest.mark.parametrize('cli_args', [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) @pytest.mark.parametrize('variation', train_default_model.get_variations()) From 28ab5cdb483d8384c05162c0701f7b4159525d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:13:05 +0200 Subject: [PATCH 074/195] ddptest --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b02bcc0f0191e..d88b4b6b27324 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -111,7 +111,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): #for variation in train_default_model.get_variations(): command = [sys.executable, file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - std, err = p.communicate() + std, err = p.communicate(timeout=60) #assert std and not err if p.returncode: print(std) From dc16a1f97b2cad46ce66fc7f4ad69854cff1063a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:32:14 +0200 Subject: [PATCH 075/195] add ddp script variations --- tests/models/data/ddp/train_default_model.py | 54 ++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tests/models/data/ddp/train_default_model.py diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py new file mode 100644 index 0000000000000..2949750951099 --- /dev/null +++ b/tests/models/data/ddp/train_default_model.py @@ -0,0 +1,54 @@ +""" +Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus. 
+""" +from argparse import ArgumentParser + +from pytorch_lightning import Trainer, seed_everything +from tests.base import EvalModelTemplate + + +def variation_fit_test(trainer, model): + trainer.fit(model) + trainer.test(model) + + +def variation_test_fit(trainer, model): + trainer.test(model) + trainer.fit(model) + + +def variation_test_test(trainer, model): + trainer.test(model) + trainer.test(model) + + +def variation_test_fit_test(trainer, model): + trainer.test(model) + trainer.fit(model) + trainer.test(model) + + +def get_variations(): + variations = [v for v in globals() if v.startswith("variation")] + return variations + + +def main(): + seed_everything(1234) + parser = ArgumentParser(add_help=False) + parser = Trainer.add_argparse_args(parser) + parser.add_argument('--variation', default=variation_fit_test.__name__) + parser.set_defaults(gpus=2) + parser.set_defaults(distributed_backend="ddp") + args = parser.parse_args() + + model = EvalModelTemplate() + trainer = Trainer.from_argparse_args(args) + + # run the chosen variation + run_variation = globals()[args.variation] + run_variation(trainer, model) + + +if __name__ == '__main__': + main() From 903155861ba2f61874f4f32d4ee46197eea2afb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:34:01 +0200 Subject: [PATCH 076/195] add ddp test --- tests/models/test_gpu.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 7497a53083612..2101496b16d2d 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -1,4 +1,7 @@ +import subprocess +import sys from collections import namedtuple +from pathlib import Path import pytest import torch @@ -11,6 +14,7 @@ from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate +from tests.models.data.ddp import train_default_model PRETEND_N_OF_GPUS = 16 @@ -93,6 +97,26 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') +@pytest.mark.parametrize('cli_args', [ + pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), +]) +@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): + file = Path(train_default_model.__file__).absolute() + cli_args = cli_args.split(' ') if cli_args else [] + cli_args += ['--default_root_dir', str(tmpdir)] + command = [sys.executable, file, '--variation', variation] + cli_args + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + std, err = p.communicate(timeout=60) + #assert std and not err + if p.returncode: + print(std) + print(err) + print(command) + raise RuntimeError('error') + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): tutils.set_random_master_port() From b5bc4d6afa43d422c1a899489a7592b49ce796bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:22:07 +0200 Subject: [PATCH 077/195] rename --- .../{train_default_model.py => train_test_variations.py} | 0 tests/models/test_gpu.py | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) rename tests/models/data/ddp/{train_default_model.py => train_test_variations.py} 
(100%) diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_test_variations.py similarity index 100% rename from tests/models/data/ddp/train_default_model.py rename to tests/models/data/ddp/train_test_variations.py diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 2101496b16d2d..890bd3f9ab2d1 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -14,7 +14,7 @@ from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -from tests.models.data.ddp import train_default_model +from tests.models.data.ddp import train_test_variations PRETEND_N_OF_GPUS = 16 @@ -100,14 +100,14 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('cli_args', [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) -@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.parametrize('variation', train_test_variations.get_variations()) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): - file = Path(train_default_model.__file__).absolute() + file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) std, err = p.communicate(timeout=60) #assert std and not err if p.returncode: From 13fc64afea9c81effa789bc2135f334548f76fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:38:53 +0200 Subject: [PATCH 078/195] shell --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 890bd3f9ab2d1..eb7e632962df3 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -107,7 +107,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate(timeout=60) #assert std and not err if p.returncode: From 3163db8cc8dd2edf4783bf5dc54eb648a99cc6e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:43:22 +0200 Subject: [PATCH 079/195] test --- tests/models/test_gpu.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index eb7e632962df3..28cd68d7fec85 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -109,12 +109,14 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): command = [sys.executable, file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate(timeout=60) - #assert std and not err + std = std.decode('utf-8').strip() + err = err.decode('utf-8').strip() + assert std and not err if 
p.returncode: print(std) print(err) print(command) - raise RuntimeError('error') + pytest.fail(err) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From bd189a913e44201a8321bdbe4b55e9536c431d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:55:54 +0200 Subject: [PATCH 080/195] test --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 28cd68d7fec85..956d325da2fb2 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -111,7 +111,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() - assert std and not err + # assert std and not err if p.returncode: print(std) print(err) From ce4274f597255405357348fde963b901a36ad08b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 03:39:02 +0200 Subject: [PATCH 081/195] try call --- tests/models/test_gpu.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 956d325da2fb2..3e1ead264e42c 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -107,16 +107,17 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - std, err = p.communicate(timeout=60) - std = std.decode('utf-8').strip() - err = err.decode('utf-8').strip() + exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + raise SystemExit(exitcode) + # std, err = p.communicate(timeout=60) + # std = std.decode('utf-8').strip() + # err = err.decode('utf-8').strip() # assert std and not err - if p.returncode: - print(std) - print(err) - print(command) - pytest.fail(err) + # if p.returncode: + # print(std) + # print(err) + # print(command) + # pytest.fail(err) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From 886ce192714f6ff043e07f203ba304e61fa67fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 15:59:04 +0200 Subject: [PATCH 082/195] try without subprocess --- tests/models/test_gpu.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 3e1ead264e42c..7ff612249d914 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -2,6 +2,7 @@ import sys from collections import namedtuple from pathlib import Path +from unittest import mock import pytest import torch @@ -106,9 +107,9 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - command = [sys.executable, file, '--variation', variation] + cli_args - exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - raise SystemExit(exitcode) + # command = [sys.executable, file, '--variation', variation] + cli_args + # exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # raise SystemExit(exitcode) # std, err = 
p.communicate(timeout=60) # std = std.decode('utf-8').strip() # err = err.decode('utf-8').strip() @@ -119,6 +120,11 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): # print(command) # pytest.fail(err) + cli_args += ['--variation', variation] + from tests.models.data.ddp.train_test_variations import main + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + main() + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): From 884e75948276449e0b52a214fb5a7858a8a636c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 16:07:59 +0200 Subject: [PATCH 083/195] test --- tests/models/test_gpu.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 7ff612249d914..1028c6e49aede 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -107,9 +107,10 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - # command = [sys.executable, file, '--variation', variation] + cli_args - # exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # raise SystemExit(exitcode) + command = [sys.executable, file, '--variation', variation] + cli_args + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + assert p.returncode == 0 # std, err = p.communicate(timeout=60) # std = std.decode('utf-8').strip() # err = err.decode('utf-8').strip() @@ -120,10 +121,10 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): # print(command) # pytest.fail(err) - cli_args += ['--variation', variation] - from tests.models.data.ddp.train_test_variations import main - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - main() + # cli_args += ['--variation', variation] + # from tests.models.data.ddp.train_test_variations import main + # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + # main() @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From 65c1cffb4b4ae34bb089a20649f9c65efbf17234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 16:54:10 +0200 Subject: [PATCH 084/195] display the error --- tests/models/test_gpu.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 1028c6e49aede..dc2d9416fd5a5 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -110,16 +110,16 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): command = [sys.executable, file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.communicate() - assert p.returncode == 0 - # std, err = p.communicate(timeout=60) - # std = std.decode('utf-8').strip() - # err = err.decode('utf-8').strip() + # assert p.returncode == 0 + std, err = p.communicate(timeout=60) + std = std.decode('utf-8').strip() + err = err.decode('utf-8').strip() # assert std and not err - # if p.returncode: - # print(std) - # print(err) - # print(command) - # pytest.fail(err) + if p.returncode: + print(std) + print(err) + print(command) + pytest.fail(err) # cli_args += ['--variation', variation] # from 
tests.models.data.ddp.train_test_variations import main From d6c57eb314d69c33a8ac484376fb37925fce8f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 8 Aug 2020 05:38:14 +0200 Subject: [PATCH 085/195] list all variations --- tests/models/data/ddp/train_test_variations.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py index 2949750951099..1ac2e110dd599 100644 --- a/tests/models/data/ddp/train_test_variations.py +++ b/tests/models/data/ddp/train_test_variations.py @@ -17,6 +17,11 @@ def variation_test_fit(trainer, model): trainer.fit(model) +def variation_fit_fit(trainer, model): + trainer.fit(model) + trainer.fit(model) + + def variation_test_test(trainer, model): trainer.test(model) trainer.test(model) @@ -29,7 +34,13 @@ def variation_test_fit_test(trainer, model): def get_variations(): - variations = [v for v in globals() if v.startswith("variation")] + variations = [ + "variation_fit_test", + "variation_test_fit", + "variation_fit_fit", + "variation_test_test", + "variation_test_fit_test", + ] return variations From 3be75baedabe93643dc6eb2dfb67d69cb70ab5c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 05:12:50 +0200 Subject: [PATCH 086/195] try string --- tests/models/test_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index dc2d9416fd5a5..0fa78b919d8cd 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -107,7 +107,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - command = [sys.executable, file, '--variation', variation] + cli_args + # command = [sys.executable, file, '--variation', variation] + cli_args + command = ['python', file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.communicate() # assert p.returncode == 0 From 25a27480ff66fdc81e8060ed93f35ddfc53d75ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 05:48:37 +0200 Subject: [PATCH 087/195] try copy env --- tests/models/test_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 0fa78b919d8cd..c21881c1857bc 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -1,3 +1,4 @@ +import os import subprocess import sys from collections import namedtuple @@ -109,7 +110,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args += ['--default_root_dir', str(tmpdir)] # command = [sys.executable, file, '--variation', variation] + cli_args command = ['python', file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) p.communicate() # assert p.returncode == 0 std, err = p.communicate(timeout=60) From 0911f31765180539b271cd4b0ee62be5900ccbab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 06:27:04 +0200 Subject: [PATCH 088/195] debug --- tests/models/test_gpu.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/models/test_gpu.py 
b/tests/models/test_gpu.py index c21881c1857bc..26dec8dd97d5b 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -110,6 +110,14 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args += ['--default_root_dir', str(tmpdir)] # command = [sys.executable, file, '--variation', variation] + cli_args command = ['python', file, '--variation', variation] + cli_args + + # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL + p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + std, err = p.communicate() + std = std.decode('utf-8') + print(std) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) p.communicate() # assert p.returncode == 0 From e700f816883f76cc7b283e8b1fd078d76ec1f8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 06:44:08 +0200 Subject: [PATCH 089/195] pythonpath --- tests/models/test_gpu.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 26dec8dd97d5b..6dacffb7c849f 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -11,6 +11,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils +import pytorch_lightning from pytorch_lightning import Trainer from pytorch_lightning.core import memory from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device @@ -112,13 +113,17 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): command = ['python', file, '--variation', variation] + cli_args # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL + p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.communicate() std, err = p.communicate() std = std.decode('utf-8') print(std) - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) + env = os.environ.copy() + env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env['PYTHONPATH'] + + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() # assert p.returncode == 0 std, err = p.communicate(timeout=60) From 83bd21367e38d146e010accb130adbd9c4bb8255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 06:55:42 +0200 Subject: [PATCH 090/195] path --- tests/models/test_gpu.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 6dacffb7c849f..e930a6f517a5a 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -109,8 +109,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - # command = [sys.executable, file, '--variation', variation] + cli_args - command = ['python', file, '--variation', variation] + cli_args + command = [sys.executable, str(file), '--variation', variation] + cli_args # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL @@ -121,8 +120,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): print(std) env = os.environ.copy() - env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env['PYTHONPATH'] - + env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') + print('python path', env['PYTHONPATH']) p = subprocess.Popen(command, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() # assert p.returncode == 0 From 1cecde9699c63011b09c9f4acb28121a0003f7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 07:07:52 +0200 Subject: [PATCH 091/195] update test --- tests/models/test_gpu.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index e930a6f517a5a..58fbce3c351ae 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -110,36 +110,20 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, str(file), '--variation', variation] + cli_args - - # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL - - p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - p.communicate() - std, err = p.communicate() - std = std.decode('utf-8') - print(std) - env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') - print('python path', env['PYTHONPATH']) p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() - # assert p.returncode == 0 std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() - # assert std and not err + assert std if p.returncode: print(std) print(err) print(command) pytest.fail(err) - # cli_args += ['--variation', variation] - # from tests.models.data.ddp.train_test_variations import main - # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - # main() - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): From 1316c553f443a79d512a49dbffec8fe2cc6746c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 07:17:20 +0200 Subject: [PATCH 092/195] change --- tests/models/test_gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 58fbce3c351ae..39137c9805437 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -114,6 +114,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() + std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() From 61a80ec76fa9285bddf0985e08e1d64220eb2013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 08:08:04 +0200 Subject: [PATCH 093/195] remove old file --- tests/models/data/ddp/train_default_model.py | 54 -------------------- 1 file changed, 54 deletions(-) delete mode 100644 tests/models/data/ddp/train_default_model.py diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py deleted file mode 100644 index 2949750951099..0000000000000 --- a/tests/models/data/ddp/train_default_model.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus. 
-""" -from argparse import ArgumentParser - -from pytorch_lightning import Trainer, seed_everything -from tests.base import EvalModelTemplate - - -def variation_fit_test(trainer, model): - trainer.fit(model) - trainer.test(model) - - -def variation_test_fit(trainer, model): - trainer.test(model) - trainer.fit(model) - - -def variation_test_test(trainer, model): - trainer.test(model) - trainer.test(model) - - -def variation_test_fit_test(trainer, model): - trainer.test(model) - trainer.fit(model) - trainer.test(model) - - -def get_variations(): - variations = [v for v in globals() if v.startswith("variation")] - return variations - - -def main(): - seed_everything(1234) - parser = ArgumentParser(add_help=False) - parser = Trainer.add_argparse_args(parser) - parser.add_argument('--variation', default=variation_fit_test.__name__) - parser.set_defaults(gpus=2) - parser.set_defaults(distributed_backend="ddp") - args = parser.parse_args() - - model = EvalModelTemplate() - trainer = Trainer.from_argparse_args(args) - - # run the chosen variation - run_variation = globals()[args.variation] - run_variation(trainer, model) - - -if __name__ == '__main__': - main() From 462776b8d60d765ec87df99c32a5204b0703e758 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 01:04:48 +0200 Subject: [PATCH 094/195] debug --- pl_examples/basic_examples/gpu_template.py | 11 +--- pl_examples/basic_examples/gpu_template2.py | 66 --------------------- pytorch_lightning/trainer/trainer.py | 6 +- 3 files changed, 4 insertions(+), 79 deletions(-) delete mode 100644 pl_examples/basic_examples/gpu_template2.py diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index ae18a47335ec4..ced4525d4db66 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -20,20 +20,11 @@ def main(args): # ------------------------ # 2 INIT TRAINER # ------------------------ - trainer = Trainer.from_argparse_args( - args, - distributed_backend='ddp', - limit_train_batches=10, - limit_val_batches=10, - max_epochs=1, - ) + trainer = Trainer.from_argparse_args(args) # ------------------------ # 3 START TRAINING # ------------------------ - trainer.test(model) - trainer.fit(model) - trainer.test(model) trainer.fit(model) diff --git a/pl_examples/basic_examples/gpu_template2.py b/pl_examples/basic_examples/gpu_template2.py deleted file mode 100644 index c85043458c939..0000000000000 --- a/pl_examples/basic_examples/gpu_template2.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -Runs a model on a single node across multiple gpus. -""" -import os -from argparse import ArgumentParser - -from pytorch_lightning import Trainer, seed_everything, Callback -from pl_examples.models.lightning_template import LightningTemplateModel - -seed_everything(234) - - -class DebugCallback(Callback): - - def on_test_batch_end(self, trainer, pl_module): - print('test_batch', trainer.global_rank) - - -def main(args): - """ Main training routine specific for this project. 
""" - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = LightningTemplateModel(**vars(args)) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = Trainer.from_argparse_args( - args, - distributed_backend='ddp', - limit_train_batches=10, - limit_val_batches=10, - max_epochs=1, - callbacks=[DebugCallback()], - ) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model) - trainer.test(model) - - -def run_cli(): - # ------------------------ - # TRAINING ARGUMENTS - # ------------------------ - # these are project-wide arguments - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # each LightningModule defines arguments relevant to it - parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) - parser = Trainer.add_argparse_args(parser) - parser.set_defaults(gpus=2) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args) - - -if __name__ == '__main__': - run_cli() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 42bda638b05b5..7156d94e8a7c8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1321,9 +1321,9 @@ def test( self.teardown('test') - if torch.distributed.is_initialized(): - print('destroy in test', self.global_rank, os.getpid()) - torch.distributed.destroy_process_group() + # if torch.distributed.is_initialized(): + # print('destroy in test', self.global_rank, os.getpid()) + # torch.distributed.destroy_process_group() return results From 764c06a550e4355035c28c1df657c508d0e0c0a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 01:35:35 +0200 Subject: [PATCH 095/195] try new --- .../accelerator_backends/ddp_backend.py | 17 +++++++---- pytorch_lightning/trainer/supporters.py | 29 +++++++++++++++++++ pytorch_lightning/trainer/trainer.py | 2 +- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index f3a8b54df9997..70a6b28f618ff 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -19,6 +19,8 @@ from time import sleep import numpy as np from os.path import abspath + +from pytorch_lightning.trainer.supporters import DistributedConnection from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning import _logger as log @@ -45,6 +47,7 @@ class DDPBackend(object): def __init__(self, trainer): self.trainer = trainer self.task_idx = None + self.distributed_connection = DistributedConnection() def slurm_setup(self): self.task_idx = int(os.environ['SLURM_LOCALID']) @@ -58,7 +61,7 @@ def train(self, model): def spawn_ddp_children(self, model): # assert self.trainer.global_rank == 0 - self.trainer.set_random_port(force=True) + # self.trainer.set_random_port(force=True) port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') @@ -158,11 +161,13 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # if not is_master or not is_initialized(): # assert not (is_master and self.trainer.global_rank > 0) # # on rank > 0, we always need to initialize, because these are new 
processes - model.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) + + self.distributed_connection.init_connection(self.trainer, model) + # model.init_ddp_connection( + # self.trainer.global_rank, + # self.trainer.world_size, + # self.trainer.is_slurm_managing_tasks + # ) # else: # print('already initialized', os.environ['MASTER_PORT'], os.getpid(), is_master) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 8853d7aaa05b0..a3dfb732b8c08 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -1,6 +1,10 @@ +import atexit from typing import Optional import torch +import torch.distributed + +from pytorch_lightning.utilities import rank_zero_info class TensorRunningAccum(object): @@ -90,3 +94,28 @@ def accumulate(self, x): def mean(self): return self.total / self.num_values + + +class DistributedConnection: + + def __init__(self): + super().__init__() + # self.world_size = world_size + # self.is_slurm_managing_tasks = is_slurm_managing_tasks + self._is_initialized = False + + def init_connection(self, trainer, model): + if self._is_initialized: + rank_zero_info("ddp connection already initialized") + return + + trainer.set_random_port() + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + self._is_initialized = True + + def exit_handler(): + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + atexit.register(exit_handler) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7156d94e8a7c8..ccad79d201734 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -43,7 +43,7 @@ from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin -from pytorch_lightning.trainer.supporters import TensorRunningAccum +from pytorch_lightning.trainer.supporters import TensorRunningAccum, DistributedConnection from pytorch_lightning.trainer.training_io import TrainerIOMixin from pytorch_lightning.trainer.training_loop import TrainerTrainLoopMixin from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin From 69fe561d64c163bfd244833a4f8117562906e64c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 01:44:26 +0200 Subject: [PATCH 096/195] port --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- pytorch_lightning/trainer/supporters.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 70a6b28f618ff..e944976dea1f8 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -47,7 +47,7 @@ class DDPBackend(object): def __init__(self, trainer): self.trainer = trainer self.task_idx = None - self.distributed_connection = DistributedConnection() + self.distributed_connection = DistributedConnection(trainer) def slurm_setup(self): self.task_idx = int(os.environ['SLURM_LOCALID']) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index a3dfb732b8c08..beb4db7698dae 100644 --- 
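# A minimal sketch of the atexit-based cleanup pattern appearing in these
# commits (the function names here are illustrative, not the project's API):
# registering a handler once ensures the default process group is destroyed on
# normal interpreter exit even when no teardown hook runs, so the connection
# does not linger past the run.
import atexit
import torch.distributed as dist

def register_ddp_cleanup() -> None:
    def _cleanup() -> None:
        if dist.is_initialized():
            dist.destroy_process_group()
    atexit.register(_cleanup)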
a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -98,11 +98,13 @@ def mean(self): class DistributedConnection: - def __init__(self): + def __init__(self, trainer): super().__init__() # self.world_size = world_size # self.is_slurm_managing_tasks = is_slurm_managing_tasks + self.trainer = trainer self._is_initialized = False + self.trainer.set_random_port() def init_connection(self, trainer, model): if self._is_initialized: @@ -110,6 +112,7 @@ def init_connection(self, trainer, model): return trainer.set_random_port() + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) self._is_initialized = True From 844f1061c3979b0635cc9fb8b3d3d7e33578a7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 01:48:28 +0200 Subject: [PATCH 097/195] debug --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 ++ pytorch_lightning/trainer/supporters.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index e944976dea1f8..02d6ac3d4e2c5 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -248,5 +248,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # torch.distributed.destroy_process_group() + self.distributed_connection.teardown() + if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index beb4db7698dae..a5c1db1722b33 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -118,7 +118,12 @@ def init_connection(self, trainer, model): def exit_handler(): if torch.distributed.is_initialized(): - torch.distributed.barrier() + # torch.distributed.barrier() torch.distributed.destroy_process_group() atexit.register(exit_handler) + + def teardown(self): + if torch.distributed.is_initialized(): + # torch.distributed.barrier() + torch.distributed.destroy_process_group() From a44b9e3465e22b4e3214a8a2a97af34f4a87078b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 01:49:23 +0200 Subject: [PATCH 098/195] debug --- pytorch_lightning/trainer/supporters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index a5c1db1722b33..5be4b707135d7 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -125,5 +125,5 @@ def exit_handler(): def teardown(self): if torch.distributed.is_initialized(): - # torch.distributed.barrier() + torch.distributed.barrier() torch.distributed.destroy_process_group() From e712eb9af9b6afeef0d8a53a4e81d0b48a28212c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 01:53:44 +0200 Subject: [PATCH 099/195] debug --- pytorch_lightning/trainer/supporters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 5be4b707135d7..dfba3aa9f7f2a 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -125,5 +125,6 @@ def exit_handler(): def teardown(self): if torch.distributed.is_initialized(): 
+ torch.cuda.empty_cache() torch.distributed.barrier() torch.distributed.destroy_process_group() From 5c21884b3270dc3ee3d4354deb5a1bf15bdf0dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:07:22 +0200 Subject: [PATCH 100/195] debug --- .../accelerator_backends/ddp_backend.py | 4 ++-- pytorch_lightning/trainer/distrib_data_parallel.py | 14 +++++++++++--- pytorch_lightning/trainer/supporters.py | 4 ++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 02d6ac3d4e2c5..aa4466bc1a2d7 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -62,10 +62,10 @@ def spawn_ddp_children(self, model): # assert self.trainer.global_rank == 0 # self.trainer.set_random_port(force=True) - port = os.environ['MASTER_PORT'] + #port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') - os.environ['MASTER_PORT'] = f'{port}' + #os.environ['MASTER_PORT'] = f'{port}' os.environ['MASTER_ADDR'] = f'{master_address}' # allow the user to pass the node rank diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 53192642222aa..44f5b7da37b15 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -419,9 +419,17 @@ def set_random_port(self, force=False): # when not forced, use the user port if force or not default_port: - global RANDOM_PORTS - default_port = RANDOM_PORTS[-1] - RANDOM_PORTS = RANDOM_PORTS[:-1] + # global RANDOM_PORTS + # default_port = RANDOM_PORTS[-1] + # RANDOM_PORTS = RANDOM_PORTS[:-1] + # def get_open_port(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + default_port = port os.environ['MASTER_PORT'] = str(default_port) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index dfba3aa9f7f2a..adf44baa94127 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -104,14 +104,14 @@ def __init__(self, trainer): # self.is_slurm_managing_tasks = is_slurm_managing_tasks self.trainer = trainer self._is_initialized = False - self.trainer.set_random_port() + # self.trainer.set_random_port() def init_connection(self, trainer, model): if self._is_initialized: rank_zero_info("ddp connection already initialized") return - trainer.set_random_port() + trainer.set_random_port(force=True) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) self._is_initialized = True From 2fe51fa00ac4e853abe7c83ec81d90a3dfc4b5cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:23:56 +0200 Subject: [PATCH 101/195] debug --- .../trainer/distrib_data_parallel.py | 1 + pytorch_lightning/trainer/supporters.py | 21 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 44f5b7da37b15..81708ff365190 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -432,6 +432,7 @@ def set_random_port(self, force=False): default_port = port os.environ['MASTER_PORT'] = 
str(default_port) + return default_port def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']: diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index adf44baa94127..b486898ac3a07 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -1,4 +1,5 @@ import atexit +import os from typing import Optional import torch @@ -103,18 +104,24 @@ def __init__(self, trainer): # self.world_size = world_size # self.is_slurm_managing_tasks = is_slurm_managing_tasks self.trainer = trainer - self._is_initialized = False - # self.trainer.set_random_port() + # self._is_initialized = False + #if self.trainer.gl + #self.trainer.set_random_port() def init_connection(self, trainer, model): - if self._is_initialized: - rank_zero_info("ddp connection already initialized") - return + if torch.distributed.is_initialized(): + rank_zero_info("ddp connection already initialized, moving to new port") - trainer.set_random_port(force=True) + if trainer.global_rank == 0: + new_port = trainer.set_random_port(force=True) + torch.distributed.broadcast(torch.tensor(new_port), src=0) + else: + new_port = torch.empty(1) + torch.distributed.broadcast(new_port, trainer.global_rank) + os.environ['MASTER_PORT'] = str(new_port.item()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - self._is_initialized = True + #self._is_initialized = True def exit_handler(): if torch.distributed.is_initialized(): From 59c01735cc7be061e79cb478dc8c54732372fd12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:27:17 +0200 Subject: [PATCH 102/195] debug --- pytorch_lightning/trainer/supporters.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index b486898ac3a07..dd1a11f12f11b 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -110,12 +110,14 @@ def __init__(self, trainer): def init_connection(self, trainer, model): if torch.distributed.is_initialized(): - rank_zero_info("ddp connection already initialized, moving to new port") + print("ddp connection already initialized, moving to new port") if trainer.global_rank == 0: + print('sending new port to others') new_port = trainer.set_random_port(force=True) torch.distributed.broadcast(torch.tensor(new_port), src=0) else: + print('receiving new port') new_port = torch.empty(1) torch.distributed.broadcast(new_port, trainer.global_rank) os.environ['MASTER_PORT'] = str(new_port.item()) @@ -131,6 +133,7 @@ def exit_handler(): atexit.register(exit_handler) def teardown(self): + return if torch.distributed.is_initialized(): torch.cuda.empty_cache() torch.distributed.barrier() From f3d019020446245e422c3f9a0e68f62ddb10c2fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:29:09 +0200 Subject: [PATCH 103/195] debug --- pytorch_lightning/trainer/supporters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index dd1a11f12f11b..0b78b195ca9be 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -115,10 +115,10 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: 
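# Sketch of the pattern this branch implements (standalone helpers, assuming
# torch.distributed is already initialized with a CUDA-capable backend, as in
# the surrounding code): rank 0 asks the OS for an unused port by binding to
# port 0, then every rank receives that value through a broadcast so all
# processes can reconnect to the same address.
import socket
import torch
import torch.distributed as dist

def pick_free_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))          # port 0 lets the OS choose a free port
        return s.getsockname()[1]

def agree_on_new_port(rank: int) -> int:
    port = torch.tensor([pick_free_port() if rank == 0 else 0],
                        dtype=torch.int, device="cuda")
    dist.broadcast(port, src=0)  # after this, every rank holds rank 0's value
    return int(port.item())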
print('sending new port to others') new_port = trainer.set_random_port(force=True) - torch.distributed.broadcast(torch.tensor(new_port), src=0) + torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) else: print('receiving new port') - new_port = torch.empty(1) + new_port = torch.empty(1, device=model.device) torch.distributed.broadcast(new_port, trainer.global_rank) os.environ['MASTER_PORT'] = str(new_port.item()) @@ -133,7 +133,7 @@ def exit_handler(): atexit.register(exit_handler) def teardown(self): - return + return if torch.distributed.is_initialized(): torch.cuda.empty_cache() torch.distributed.barrier() From 5c066798ced04e97f254fd0b67524277d484d17f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:30:47 +0200 Subject: [PATCH 104/195] debug --- pytorch_lightning/trainer/supporters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 0b78b195ca9be..21196fe25f333 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -120,7 +120,7 @@ def init_connection(self, trainer, model): print('receiving new port') new_port = torch.empty(1, device=model.device) torch.distributed.broadcast(new_port, trainer.global_rank) - os.environ['MASTER_PORT'] = str(new_port.item()) + os.environ['MASTER_PORT'] = str(int(new_port.item())) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) #self._is_initialized = True From 5ba396252b13704ab5099f4a0fa4a76e058eca53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:38:17 +0200 Subject: [PATCH 105/195] debug --- pytorch_lightning/trainer/supporters.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 21196fe25f333..c81a2b828043f 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -109,18 +109,18 @@ def __init__(self, trainer): #self.trainer.set_random_port() def init_connection(self, trainer, model): - if torch.distributed.is_initialized(): - print("ddp connection already initialized, moving to new port") - - if trainer.global_rank == 0: - print('sending new port to others') - new_port = trainer.set_random_port(force=True) - torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) - else: - print('receiving new port') - new_port = torch.empty(1, device=model.device) - torch.distributed.broadcast(new_port, trainer.global_rank) - os.environ['MASTER_PORT'] = str(int(new_port.item())) + # if torch.distributed.is_initialized(): + # print("ddp connection already initialized, moving to new port") + + if trainer.global_rank == 0: + print('sending new port to others') + new_port = trainer.set_random_port(force=True) + torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) + else: + print('receiving new port on rank=', trainer.global_rank) + new_port = torch.empty(1, device=model.device) + torch.distributed.broadcast(new_port, trainer.global_rank) + os.environ['MASTER_PORT'] = str(int(new_port.item())) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) #self._is_initialized = True From e74cb9c6facb1968cd551d7d3db31c59e350e128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 
10 Aug 2020 02:46:20 +0200 Subject: [PATCH 106/195] debug --- pytorch_lightning/trainer/supporters.py | 31 ++++++++++++++----------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index c81a2b828043f..8a52427c5fdab 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -106,30 +106,33 @@ def __init__(self, trainer): self.trainer = trainer # self._is_initialized = False #if self.trainer.gl - #self.trainer.set_random_port() + + # initial random port, before ddp connection is initialized + self.trainer.set_random_port() def init_connection(self, trainer, model): - # if torch.distributed.is_initialized(): - # print("ddp connection already initialized, moving to new port") - - if trainer.global_rank == 0: - print('sending new port to others') - new_port = trainer.set_random_port(force=True) - torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) - else: - print('receiving new port on rank=', trainer.global_rank) - new_port = torch.empty(1, device=model.device) - torch.distributed.broadcast(new_port, trainer.global_rank) - os.environ['MASTER_PORT'] = str(int(new_port.item())) + if torch.distributed.is_initialized(): + print("ddp connection already initialized, moving to new port") + + if trainer.global_rank == 0: + print('sending new port to others') + new_port = trainer.set_random_port(force=True) + torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) + else: + print('receiving new port on rank=', trainer.global_rank) + new_port = torch.empty(1, device=model.device) + torch.distributed.broadcast(new_port, trainer.global_rank) + os.environ['MASTER_PORT'] = str(int(new_port.item())) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - #self._is_initialized = True def exit_handler(): if torch.distributed.is_initialized(): # torch.distributed.barrier() torch.distributed.destroy_process_group() + print('group destroyed on ', trainer.global_rank) + atexit.register(exit_handler) def teardown(self): From a7c732d8feda14159625096f1b6c35eb40590d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:48:34 +0200 Subject: [PATCH 107/195] debug --- pytorch_lightning/trainer/supporters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 8a52427c5fdab..5806889743fa3 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -133,7 +133,7 @@ def exit_handler(): print('group destroyed on ', trainer.global_rank) - atexit.register(exit_handler) + # atexit.register(exit_handler) def teardown(self): return From fa5d177a652a91ea8c8a155ce7ecdc7c89f4f9e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:50:23 +0200 Subject: [PATCH 108/195] debug --- pytorch_lightning/trainer/supporters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 5806889743fa3..c3a739b6baf0f 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -114,6 +114,8 @@ def init_connection(self, trainer, model): if torch.distributed.is_initialized(): print("ddp connection already initialized, moving to new port") + torch.distributed.barrier() + if 
trainer.global_rank == 0: print('sending new port to others') new_port = trainer.set_random_port(force=True) From 01a8f11debe87a9bd4cd50923a9364628ddbc7d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 02:55:32 +0200 Subject: [PATCH 109/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 3 ++- pytorch_lightning/trainer/supporters.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 81708ff365190..66cf297482fca 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -414,7 +414,7 @@ def set_random_port(self, force=False): # pick a random port first assert self.num_nodes == 1, 'random port can only be called from single node training' - print('setting port on rank', self.global_rank) + default_port = os.environ.get('MASTER_PORT') # when not forced, use the user port @@ -431,6 +431,7 @@ def set_random_port(self, force=False): s.close() default_port = port + print('setting port on rank', self.global_rank, default_port) os.environ['MASTER_PORT'] = str(default_port) return default_port diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index c3a739b6baf0f..a9ae677ea52bf 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -115,7 +115,7 @@ def init_connection(self, trainer, model): print("ddp connection already initialized, moving to new port") torch.distributed.barrier() - + if trainer.global_rank == 0: print('sending new port to others') new_port = trainer.set_random_port(force=True) From 3ac56093fb6d2d3b4e6ea409329d3f5aef756f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:05:03 +0200 Subject: [PATCH 110/195] debug --- pytorch_lightning/trainer/distrib_data_parallel.py | 6 +++--- pytorch_lightning/trainer/supporters.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 66cf297482fca..01a033a69ec8c 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -407,14 +407,13 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... 
this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') - def set_random_port(self, force=False): + def set_random_port(self, force=False, overwrite=True): """ When running DDP NOT managed by SLURM, the ports might collide """ # pick a random port first assert self.num_nodes == 1, 'random port can only be called from single node training' - default_port = os.environ.get('MASTER_PORT') # when not forced, use the user port @@ -432,7 +431,8 @@ def set_random_port(self, force=False): default_port = port print('setting port on rank', self.global_rank, default_port) - os.environ['MASTER_PORT'] = str(default_port) + if overwrite: + os.environ['MASTER_PORT'] = str(default_port) return default_port def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index a9ae677ea52bf..7c40b808f6dfb 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -118,13 +118,16 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: print('sending new port to others') - new_port = trainer.set_random_port(force=True) + new_port = trainer.set_random_port(force=True, overwrite=False) torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) else: print('receiving new port on rank=', trainer.global_rank) new_port = torch.empty(1, device=model.device) torch.distributed.broadcast(new_port, trainer.global_rank) - os.environ['MASTER_PORT'] = str(int(new_port.item())) + new_port = int(new_port.item()) + + torch.distributed.destroy_process_group() + os.environ['MASTER_PORT'] = str(new_port) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 0531f118e3bfe9aa6fd7d3ea0b3847f46fbe067b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:11:51 +0200 Subject: [PATCH 111/195] debug --- pytorch_lightning/trainer/supporters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 7c40b808f6dfb..0b9b989afb7a8 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -119,12 +119,13 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: print('sending new port to others') new_port = trainer.set_random_port(force=True, overwrite=False) + print('sending new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) else: - print('receiving new port on rank=', trainer.global_rank) new_port = torch.empty(1, device=model.device) - torch.distributed.broadcast(new_port, trainer.global_rank) + torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) + print('receiving new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() os.environ['MASTER_PORT'] = str(new_port) From 24313330f1303513a78b7f1e76eecaa76b75d651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:24:03 +0200 Subject: [PATCH 112/195] debug --- pytorch_lightning/trainer/supporters.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 0b9b989afb7a8..28db3f8ea1de2 100644 --- 
a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -117,16 +117,15 @@ def init_connection(self, trainer, model): torch.distributed.barrier() if trainer.global_rank == 0: - print('sending new port to others') new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - torch.distributed.broadcast(torch.tensor(new_port, device=model.device), src=0) + new_port = torch.tensor(new_port, device=model.device) else: new_port = torch.empty(1, device=model.device) - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - print('receiving new port on rank=', trainer.global_rank, 'port', new_port) + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + print('receiving new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() os.environ['MASTER_PORT'] = str(new_port) From 7b40fc0746157b93d3ee4d674fa7891861f404ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:28:04 +0200 Subject: [PATCH 113/195] debug --- pytorch_lightning/trainer/supporters.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 28db3f8ea1de2..87fca33af0f43 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -119,9 +119,11 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor(new_port, device=model.device) + new_port = torch.tensor([new_port], device=model.device) + print(new_port) else: new_port = torch.empty(1, device=model.device) + print(new_port) torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) From a293da0cff7b7cc6d14a4820623e344737c87bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:31:39 +0200 Subject: [PATCH 114/195] debug --- pytorch_lightning/trainer/supporters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 87fca33af0f43..29a8bf9eb6657 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -119,10 +119,10 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor([new_port], device=model.device) + new_port = torch.tensor([new_port]).cuda() print(new_port) else: - new_port = torch.empty(1, device=model.device) + new_port = torch.empty(1).cuda() print(new_port) torch.distributed.broadcast(new_port, src=0) From ee393bd7c6852de38a4c6934bd6c4b4e0f58d78b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:36:17 +0200 Subject: [PATCH 115/195] debug --- pytorch_lightning/trainer/supporters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 29a8bf9eb6657..d2f193a54cfaf 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -119,10 +119,10 @@ def 
init_connection(self, trainer, model): if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor([new_port]).cuda() + new_port = torch.tensor([new_port]).cuda(0) print(new_port) else: - new_port = torch.empty(1).cuda() + new_port = torch.empty(1).cuda(0) print(new_port) torch.distributed.broadcast(new_port, src=0) From a4c546acd609525093a0722bf67332cf393d810b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 03:48:32 +0200 Subject: [PATCH 116/195] debug --- pytorch_lightning/trainer/supporters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index d2f193a54cfaf..b3bf701ac8309 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -125,7 +125,8 @@ def init_connection(self, trainer, model): new_port = torch.empty(1).cuda(0) print(new_port) - torch.distributed.broadcast(new_port, src=0) + torch.distributed.broadcast_multigpu([new_port], src=0, src_tensor=0) + #torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) print('receiving new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() From ba517bdf4fb3212da895b5b41e5ce292b2e37b70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:01:50 +0200 Subject: [PATCH 117/195] debug --- pytorch_lightning/trainer/supporters.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index b3bf701ac8309..782370e3150da 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -121,11 +121,14 @@ def init_connection(self, trainer, model): print('sending new port on rank=', trainer.global_rank, 'port', new_port) new_port = torch.tensor([new_port]).cuda(0) print(new_port) + for i in range(1, trainer.world_size): + torch.distributed.send(new_port, dst=i) else: new_port = torch.empty(1).cuda(0) + torch.distributed.recv(new_port, src=0) print(new_port) - torch.distributed.broadcast_multigpu([new_port], src=0, src_tensor=0) + #torch.distributed.broadcast_multigpu([new_port], src=0, src_tensor=0) #torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) print('receiving new port on rank=', trainer.global_rank, 'port', new_port) From 308ed14557e40099604298961226b05c220f3107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:07:53 +0200 Subject: [PATCH 118/195] debug --- pytorch_lightning/trainer/supporters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 782370e3150da..920723aff1f8c 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -121,13 +121,15 @@ def init_connection(self, trainer, model): print('sending new port on rank=', trainer.global_rank, 'port', new_port) new_port = torch.tensor([new_port]).cuda(0) print(new_port) - for i in range(1, trainer.world_size): - torch.distributed.send(new_port, dst=i) else: new_port = torch.empty(1).cuda(0) - torch.distributed.recv(new_port, src=0) print(new_port) + tensor_list = [torch.empty_like(new_port)] * trainer.world_size + 
torch.distributed.all_gather(tensor_list, new_port) + new_port = tensor_list[0] + + #torch.distributed.broadcast_multigpu([new_port], src=0, src_tensor=0) #torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) From 9f34b2cbdb6719eb81ba9a6564743b238c3e7dab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:13:31 +0200 Subject: [PATCH 119/195] debug --- pytorch_lightning/trainer/supporters.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 920723aff1f8c..3fdae727d3e27 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -112,29 +112,23 @@ def __init__(self, trainer): def init_connection(self, trainer, model): if torch.distributed.is_initialized(): - print("ddp connection already initialized, moving to new port") + print(trainer.global_rank, "ddp connection already initialized, moving to new port") torch.distributed.barrier() if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor([new_port]).cuda(0) + new_port = torch.tensor([new_port]).cuda() print(new_port) else: - new_port = torch.empty(1).cuda(0) + new_port = torch.empty(1).cuda() print(new_port) - tensor_list = [torch.empty_like(new_port)] * trainer.world_size - torch.distributed.all_gather(tensor_list, new_port) - new_port = tensor_list[0] - - - #torch.distributed.broadcast_multigpu([new_port], src=0, src_tensor=0) - #torch.distributed.broadcast(new_port, src=0) + torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) print('receiving new port on rank=', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() + torch.distributed.destroy_process_group() # destroy connections on old port os.environ['MASTER_PORT'] = str(new_port) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 49ed09d048275bcc59b6eab360c4d5ea7867985f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:25:05 +0200 Subject: [PATCH 120/195] debug --- pytorch_lightning/trainer/supporters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 3fdae727d3e27..b890b179feff6 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -120,10 +120,10 @@ def init_connection(self, trainer, model): new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) new_port = torch.tensor([new_port]).cuda() - print(new_port) + print(new_port.shape, new_port.dtype) else: new_port = torch.empty(1).cuda() - print(new_port) + print(new_port.shape, new_port.dtype) torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) From 1874b8a531b5b09fe133f8c953a22a7fe94e4af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:26:13 +0200 Subject: [PATCH 121/195] debug --- pytorch_lightning/trainer/supporters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index b890b179feff6..88aa3c5d32ba1 100644 --- 
a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -119,10 +119,10 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor([new_port]).cuda() + new_port = torch.tensor([new_port], dtype=torch.int).cuda() print(new_port.shape, new_port.dtype) else: - new_port = torch.empty(1).cuda() + new_port = torch.empty(1, dtype=torch.int).cuda() print(new_port.shape, new_port.dtype) torch.distributed.broadcast(new_port, src=0) From b22bd74042116a064123b56cfeeafcf3bcc28dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:27:26 +0200 Subject: [PATCH 122/195] debug --- pytorch_lightning/trainer/supporters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 88aa3c5d32ba1..299a6db622113 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -119,10 +119,10 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor([new_port], dtype=torch.int).cuda() + new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') print(new_port.shape, new_port.dtype) else: - new_port = torch.empty(1, dtype=torch.int).cuda() + new_port = torch.empty(1, dtype=torch.int, device='cuda') print(new_port.shape, new_port.dtype) torch.distributed.broadcast(new_port, src=0) From 46915c64b3a28233a96372b3d73a4a3820110ebb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:41:18 +0200 Subject: [PATCH 123/195] cleanup --- .../accelerator_backends/ddp_backend.py | 27 +------------------ pytorch_lightning/core/decorators.py | 21 ++------------- pytorch_lightning/core/lightning.py | 4 +-- pytorch_lightning/trainer/supporters.py | 17 +++++------- 4 files changed, 10 insertions(+), 59 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index aa4466bc1a2d7..e5437b3586153 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -59,13 +59,9 @@ def train(self, model): self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) def spawn_ddp_children(self, model): - # assert self.trainer.global_rank == 0 - # self.trainer.set_random_port(force=True) - #port = os.environ['MASTER_PORT'] master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') - #os.environ['MASTER_PORT'] = f'{port}' os.environ['MASTER_ADDR'] = f'{master_address}' # allow the user to pass the node rank @@ -157,19 +153,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # where to store ip_table model.trainer = self.trainer - # from torch.distributed import is_initialized - # if not is_master or not is_initialized(): - # assert not (is_master and self.trainer.global_rank > 0) - # # on rank > 0, we always need to initialize, because these are new processes - - self.distributed_connection.init_connection(self.trainer, model) - # model.init_ddp_connection( - # self.trainer.global_rank, - # self.trainer.world_size, - # self.trainer.is_slurm_managing_tasks 
- # ) - # else: - # print('already initialized', os.environ['MASTER_PORT'], os.getpid(), is_master) + self.distributed_connection.reset_connection(self.trainer, model) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) @@ -241,14 +225,5 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # clean up memory torch.cuda.empty_cache() - # clean up dist group - #if self.use_ddp or self.use_ddp2: - # import torch.distributed as torch_distrib - # torch_distrib.destroy_process_group() - - # torch.distributed.destroy_process_group() - - self.distributed_connection.teardown() - if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 097e1fa2100d7..3e9241ea20e2e 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,6 +1,8 @@ from functools import wraps from typing import Callable +from pytorch_lightning.core.lightning import LightningModule + def auto_move_data(fn: Callable) -> Callable: """ @@ -38,8 +40,6 @@ def forward(self, x): """ @wraps(fn) def auto_transfer_args(self, *args, **kwargs): - # local import to prevent circular import issue - from pytorch_lightning.core.lightning import LightningModule if not isinstance(self, LightningModule): return fn(self, *args, **kwargs) @@ -49,20 +49,3 @@ def auto_transfer_args(self, *args, **kwargs): return fn(self, *args, **kwargs) return auto_transfer_args - - -def run_once(fn): - """ - Decorate a function or method to make it run only once. - Subsequent calls will result in a no-operation. - """ - @wraps(fn) - def wrapper(*args, **kwargs): - if not wrapper.has_run: - wrapper.has_run = True - fn(*args, **kwargs) - - wrapper.has_run = False - return wrapper - - diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ba495b90d7003..f816726ddf1e1 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -16,7 +16,6 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.core.decorators import run_once from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import ModelHooks from pytorch_lightning.core.memory import ModelSummary @@ -922,7 +921,6 @@ def _init_slurm_connection(self) -> None: root_node = self.trainer.resolve_root_node_address(root_node) os.environ['MASTER_ADDR'] = root_node - #@run_once def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: """ Override to define your custom way of setting up a distributed environment. @@ -954,7 +952,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi f"WORLD_SIZE environment variable ({os.environ['WORLD_SIZE']}) " f"is not equal to the computed world size ({world_size}). Ignored." 
) - print('master port init', os.environ['MASTER_PORT'], os.getpid()) + torch_backend = "nccl" if self.trainer.on_gpu else "gloo" log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 299a6db622113..70fd4c64bb499 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -101,16 +101,11 @@ class DistributedConnection: def __init__(self, trainer): super().__init__() - # self.world_size = world_size - # self.is_slurm_managing_tasks = is_slurm_managing_tasks self.trainer = trainer - # self._is_initialized = False - #if self.trainer.gl - # initial random port, before ddp connection is initialized self.trainer.set_random_port() - def init_connection(self, trainer, model): + def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "ddp connection already initialized, moving to new port") @@ -118,16 +113,16 @@ def init_connection(self, trainer, model): if trainer.global_rank == 0: new_port = trainer.set_random_port(force=True, overwrite=False) - print('sending new port on rank=', trainer.global_rank, 'port', new_port) + #print('sending new port on rank=', trainer.global_rank, 'port', new_port) new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') - print(new_port.shape, new_port.dtype) + #print(new_port.shape, new_port.dtype) else: new_port = torch.empty(1, dtype=torch.int, device='cuda') - print(new_port.shape, new_port.dtype) + #print(new_port.shape, new_port.dtype) torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) - print('receiving new port on rank=', trainer.global_rank, 'port', new_port) + #print('receiving new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() # destroy connections on old port os.environ['MASTER_PORT'] = str(new_port) @@ -138,7 +133,7 @@ def exit_handler(): # torch.distributed.barrier() torch.distributed.destroy_process_group() - print('group destroyed on ', trainer.global_rank) + #print('group destroyed on ', trainer.global_rank) # atexit.register(exit_handler) From c3f9c866f0b7f280feabecbc4338202113e70f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:48:43 +0200 Subject: [PATCH 124/195] cleanup --- pytorch_lightning/core/decorators.py | 1 - .../trainer/distrib_data_parallel.py | 21 ++----------------- pytorch_lightning/trainer/evaluation_loop.py | 9 +------- pytorch_lightning/trainer/trainer.py | 9 +------- 4 files changed, 4 insertions(+), 36 deletions(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 3e9241ea20e2e..97eba56ea2464 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -40,7 +40,6 @@ def forward(self, x): """ @wraps(fn) def auto_transfer_args(self, *args, **kwargs): - if not isinstance(self, LightningModule): return fn(self, *args, **kwargs) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 01a033a69ec8c..b34384a0de223 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -131,20 +131,13 @@ def train_fx(trial_hparams, cluster_manager, _): import re from abc import ABC, 
abstractmethod from distutils.version import LooseVersion -from typing import Union, List, Optional, Callable, Tuple -import subprocess -import sys -from time import sleep -import numpy as np -from os.path import abspath -from pkg_resources import parse_version +from typing import Union, List, Optional, Tuple import torch from pytorch_lightning import _logger as log from pytorch_lightning.loggers import LightningLoggerBase -from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, rank_zero_info +from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_info from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule @@ -172,12 +165,6 @@ def train_fx(trial_hparams, cluster_manager, _): XLA_AVAILABLE = True -#PID = os.getpid() -#RNG1 = np.random.RandomState(PID) -#RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) -RANDOM_PORTS = list(range(10000, 20000)) - - class TrainerDDPMixin(ABC): # this is just a summary on variables used in this abstract class, @@ -418,10 +405,6 @@ def set_random_port(self, force=False, overwrite=True): # when not forced, use the user port if force or not default_port: - # global RANDOM_PORTS - # default_port = RANDOM_PORTS[-1] - # RANDOM_PORTS = RANDOM_PORTS[:-1] - # def get_open_port(): import socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(("", 0)) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index a3a8102f3f204..add9bb24c672a 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -291,7 +291,6 @@ def _evaluate( # run validation for dataloader_idx, dataloader in enumerate(dataloaders): - print('here 1') dl_outputs = [] # on TPU we have to wrap it under the ParallelLoader @@ -304,7 +303,6 @@ def _evaluate( dl_max_batches = max_batches[dataloader_idx] for batch_idx, batch in enumerate(dataloader): - print('here 2') if batch is None: continue @@ -602,19 +600,16 @@ def __log_evaluation_epoch_metrics(self, eval_results, test_mode): def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False): # make dataloader_idx arg in validation_step optional args = [batch, batch_idx] - print('here 3') + if (test_mode and len(self.test_dataloaders) > 1) \ or (not test_mode and len(self.val_dataloaders) > 1): args.append(dataloader_idx) # handle DP, DDP forward if self.use_ddp or self.use_dp or self.use_ddp2: - # SOMETHING GOES WRONG HERE, test loop is stuck output = model(*args) return output - print('here 4') - # Horovod if self.use_horovod and self.on_gpu: batch = self.transfer_batch_to_gpu(batch, hvd.local_rank()) @@ -640,6 +635,4 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: else: output = model.validation_step(*args) - print('here 5') - return output diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ccad79d201734..d91f026ae3451 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -43,7 +43,7 @@ from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin -from 
pytorch_lightning.trainer.supporters import TensorRunningAccum, DistributedConnection +from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.trainer.training_io import TrainerIOMixin from pytorch_lightning.trainer.training_loop import TrainerTrainLoopMixin from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin @@ -1025,7 +1025,6 @@ def fit( # ddp elif self.distributed_backend == 'ddp': - # self.set_random_port() self.accelerator_backend = DDPBackend(self) results = self.accelerator_backend.spawn_ddp_children(model) @@ -1321,10 +1320,6 @@ def test( self.teardown('test') - # if torch.distributed.is_initialized(): - # print('destroy in test', self.global_rank, os.getpid()) - # torch.distributed.destroy_process_group() - return results def __test_using_best_weights(self, ckpt_path, test_dataloaders): @@ -1358,7 +1353,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - #self.set_random_port() self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1381,7 +1375,6 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - #self.set_random_port() self.testing = True self.model = model results = self.fit(model) From 454d4cf0e94a29983ac4ddd1c6ad8497afbeb94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:52:08 +0200 Subject: [PATCH 125/195] cleanup --- pytorch_lightning/trainer/training_loop.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 651590715e2a4..993e8ccd53fd0 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1040,8 +1040,7 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - if (self.use_ddp or self.use_ddp2): - print('destroy on rank ', self.global_rank, os.getpid()) + if self.use_ddp or self.use_ddp2: torch_distrib.destroy_process_group() # clear mem From 27a815f41a8a291f96149d4016d770d9677eef2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 04:58:10 +0200 Subject: [PATCH 126/195] move class --- .../accelerator_backends/ddp_backend.py | 50 ++++++++++++++++++- pytorch_lightning/trainer/supporters.py | 48 ------------------ 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index e5437b3586153..6813c78e9ca28 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -14,13 +14,13 @@ import os import torch +import torch.distributed import subprocess import sys from time import sleep import numpy as np from os.path import abspath -from pytorch_lightning.trainer.supporters import DistributedConnection from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning import _logger as log @@ -227,3 +227,51 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results + + +class DistributedConnection: + + def __init__(self, trainer): + super().__init__() + self.trainer = trainer + # 
initial random port, before ddp connection is initialized + self.trainer.set_random_port() + + def reset_connection(self, trainer, model): + if torch.distributed.is_initialized(): + print(trainer.global_rank, "ddp connection already initialized, moving to new port") + + torch.distributed.barrier() + + if trainer.global_rank == 0: + new_port = trainer.set_random_port(force=True, overwrite=False) + #print('sending new port on rank=', trainer.global_rank, 'port', new_port) + new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') + #print(new_port.shape, new_port.dtype) + else: + new_port = torch.empty(1, dtype=torch.int, device='cuda') + #print(new_port.shape, new_port.dtype) + + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + #print('receiving new port on rank=', trainer.global_rank, 'port', new_port) + torch.distributed.destroy_process_group() # destroy connections on old port + os.environ['MASTER_PORT'] = str(new_port) + + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + + def exit_handler(): + if torch.distributed.is_initialized(): + # torch.distributed.barrier() + torch.distributed.destroy_process_group() + + #print('group destroyed on ', trainer.global_rank) + + # atexit.register(exit_handler) + + def teardown(self): + return + if torch.distributed.is_initialized(): + torch.cuda.empty_cache() + torch.distributed.barrier() + torch.distributed.destroy_process_group() diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 70fd4c64bb499..12b6e0e751683 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -95,51 +95,3 @@ def accumulate(self, x): def mean(self): return self.total / self.num_values - - -class DistributedConnection: - - def __init__(self, trainer): - super().__init__() - self.trainer = trainer - # initial random port, before ddp connection is initialized - self.trainer.set_random_port() - - def reset_connection(self, trainer, model): - if torch.distributed.is_initialized(): - print(trainer.global_rank, "ddp connection already initialized, moving to new port") - - torch.distributed.barrier() - - if trainer.global_rank == 0: - new_port = trainer.set_random_port(force=True, overwrite=False) - #print('sending new port on rank=', trainer.global_rank, 'port', new_port) - new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') - #print(new_port.shape, new_port.dtype) - else: - new_port = torch.empty(1, dtype=torch.int, device='cuda') - #print(new_port.shape, new_port.dtype) - - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - #print('receiving new port on rank=', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() # destroy connections on old port - os.environ['MASTER_PORT'] = str(new_port) - - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - - def exit_handler(): - if torch.distributed.is_initialized(): - # torch.distributed.barrier() - torch.distributed.destroy_process_group() - - #print('group destroyed on ', trainer.global_rank) - - # atexit.register(exit_handler) - - def teardown(self): - return - if torch.distributed.is_initialized(): - torch.cuda.empty_cache() - torch.distributed.barrier() - torch.distributed.destroy_process_group() From 748a96303223b13efc448e2a050088f7d2d73fb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 
2020 05:02:52 +0200 Subject: [PATCH 127/195] cleanup --- pytorch_lightning/trainer/supporters.py | 4 ---- tests/base/model_valid_epoch_ends.py | 1 - 2 files changed, 5 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 12b6e0e751683..5975fd1c95b16 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -1,12 +1,8 @@ -import atexit -import os from typing import Optional import torch import torch.distributed -from pytorch_lightning.utilities import rank_zero_info - class TensorRunningAccum(object): """Tracks a running accumulation values (min, max, mean) without graph diff --git a/tests/base/model_valid_epoch_ends.py b/tests/base/model_valid_epoch_ends.py index f09c382a38c82..a7295aa9caef0 100644 --- a/tests/base/model_valid_epoch_ends.py +++ b/tests/base/model_valid_epoch_ends.py @@ -21,7 +21,6 @@ def _mean(res, key): # recursive mean for multilevel dicts return torch.stack([x[key] if isinstance(x, dict) else _mean(x, key) for x in res]).mean() - print('in validation epoch end') val_loss_mean = _mean(outputs, 'val_loss') val_acc_mean = _mean(outputs, 'val_acc') From ce2f31e0a840b5ceb5705f20b808a77d552ebdfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 05:15:05 +0200 Subject: [PATCH 128/195] cleanup --- .../trainer/distrib_data_parallel.py | 21 +++++++++++-------- pytorch_lightning/trainer/supporters.py | 1 - 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index b34384a0de223..11e83a827e366 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -405,15 +405,8 @@ def set_random_port(self, force=False, overwrite=True): # when not forced, use the user port if force or not default_port: - import socket - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(("", 0)) - s.listen(1) - port = s.getsockname()[1] - s.close() - default_port = port - - print('setting port on rank', self.global_rank, default_port) + default_port = find_open_network_port() + if overwrite: os.environ['MASTER_PORT'] = str(default_port) return default_port @@ -520,3 +513,13 @@ def check_horovod(self): def has_horovodrun(): """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" return 'OMPI_COMM_WORLD_RANK' in os.environ or 'HOROVOD_RANK' in os.environ + + +def find_open_network_port(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port \ No newline at end of file diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 5975fd1c95b16..8853d7aaa05b0 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -1,7 +1,6 @@ from typing import Optional import torch -import torch.distributed class TensorRunningAccum(object): From 76fe75ba5b932d009a20bba14336677aedd24217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 05:55:21 +0200 Subject: [PATCH 129/195] cleanup --- .../accelerator_backends/ddp_backend.py | 31 ++++++++++++++++--- .../accelerator_backends/ddp_spawn_backend.py | 17 ++++++---- .../trainer/distrib_data_parallel.py | 27 ---------------- 3 files changed, 38 insertions(+), 37 deletions(-) diff --git 
a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 6813c78e9ca28..2972dc46f6b72 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -234,8 +234,8 @@ class DistributedConnection: def __init__(self, trainer): super().__init__() self.trainer = trainer - # initial random port, before ddp connection is initialized - self.trainer.set_random_port() + # select or set an initial port before ddp connection is initialized + self._set_master_port(port=self._get_master_port()) def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): @@ -244,7 +244,7 @@ def reset_connection(self, trainer, model): torch.distributed.barrier() if trainer.global_rank == 0: - new_port = trainer.set_random_port(force=True, overwrite=False) + new_port = find_open_network_port() #print('sending new port on rank=', trainer.global_rank, 'port', new_port) new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') #print(new_port.shape, new_port.dtype) @@ -256,7 +256,8 @@ def reset_connection(self, trainer, model): new_port = int(new_port.item()) #print('receiving new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() # destroy connections on old port - os.environ['MASTER_PORT'] = str(new_port) + self._set_master_port(port=new_port) + #os.environ['MASTER_PORT'] = str(new_port) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) @@ -269,9 +270,31 @@ def exit_handler(): # atexit.register(exit_handler) + def _get_master_port(self): + return os.environ.get('MASTER_PORT') + + # TODO: document + def _set_master_port(self, port: int = None): + """ + When running DDP NOT managed by SLURM, the ports might collide + """ + # assert self.trainer.num_nodes == 1, 'random port can only be called from single node training' + os.environ['MASTER_PORT'] = str(port or find_open_network_port()) + return port + def teardown(self): return if torch.distributed.is_initialized(): torch.cuda.empty_cache() torch.distributed.barrier() torch.distributed.destroy_process_group() + + +def find_open_network_port(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port \ No newline at end of file diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 704fc5558588a..1a4feeaef5a0f 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -15,6 +15,8 @@ import os import torch import torch.multiprocessing as mp + +from pytorch_lightning.accelerator_backends.ddp_backend import find_open_network_port, DistributedConnection from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning import _logger as log @@ -31,9 +33,11 @@ class DDPSpawnBackend(object): def __init__(self, trainer): self.trainer = trainer self.mp_queue = None + self.distributed_connection = DistributedConnection(trainer) def setup(self): - self.trainer.set_random_port() + # TODO: check + # self.trainer.set_random_port() # pass in a state q smp = mp.get_context('spawn') @@ -95,11 +99,12 @@ def ddp_train(self, process_idx, mp_queue, model): # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = 
self.trainer - model.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) + self.distributed_connection.reset_connection(self.trainer, model) + # model.init_ddp_connection( + # self.trainer.global_rank, + # self.trainer.world_size, + # self.trainer.is_slurm_managing_tasks + # ) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 11e83a827e366..99466542210c3 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -394,23 +394,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') - def set_random_port(self, force=False, overwrite=True): - """ - When running DDP NOT managed by SLURM, the ports might collide - """ - # pick a random port first - assert self.num_nodes == 1, 'random port can only be called from single node training' - - default_port = os.environ.get('MASTER_PORT') - - # when not forced, use the user port - if force or not default_port: - default_port = find_open_network_port() - - if overwrite: - os.environ['MASTER_PORT'] = str(default_port) - return default_port - def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']: return @@ -513,13 +496,3 @@ def check_horovod(self): def has_horovodrun(): """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" return 'OMPI_COMM_WORLD_RANK' in os.environ or 'HOROVOD_RANK' in os.environ - - -def find_open_network_port(): - import socket - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(("", 0)) - s.listen(1) - port = s.getsockname()[1] - s.close() - return port \ No newline at end of file From 6c45ebc9466ade2fd72099fd0f34d50488b5707f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 05:58:20 +0200 Subject: [PATCH 130/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 2972dc46f6b72..ce793f865ea67 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -268,7 +268,7 @@ def exit_handler(): #print('group destroyed on ', trainer.global_rank) - # atexit.register(exit_handler) + atexit.register(exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From 0530234cef49d958429bfb0f6a93196d446679cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 05:58:59 +0200 Subject: [PATCH 131/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index ce793f865ea67..e26828824021c 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License - +import atexit import os import torch import torch.distributed From fe596563d625ddd829b17a433c208d57d973b7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:02:04 +0200 Subject: [PATCH 132/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index e26828824021c..e6523dc729449 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -245,29 +245,21 @@ def reset_connection(self, trainer, model): if trainer.global_rank == 0: new_port = find_open_network_port() - #print('sending new port on rank=', trainer.global_rank, 'port', new_port) new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') - #print(new_port.shape, new_port.dtype) else: new_port = torch.empty(1, dtype=torch.int, device='cuda') - #print(new_port.shape, new_port.dtype) torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) - #print('receiving new port on rank=', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() # destroy connections on old port self._set_master_port(port=new_port) - #os.environ['MASTER_PORT'] = str(new_port) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): if torch.distributed.is_initialized(): - # torch.distributed.barrier() torch.distributed.destroy_process_group() - #print('group destroyed on ', trainer.global_rank) - atexit.register(exit_handler) def _get_master_port(self): From cbab0953404cfaf02e0ea97c89a417f5af9c9b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:04:37 +0200 Subject: [PATCH 133/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index e6523dc729449..010342d7ec73e 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -242,13 +242,12 @@ def reset_connection(self, trainer, model): print(trainer.global_rank, "ddp connection already initialized, moving to new port") torch.distributed.barrier() + new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: new_port = find_open_network_port() - new_port = torch.tensor([new_port], dtype=torch.int, device='cuda') - else: - new_port = torch.empty(1, dtype=torch.int, device='cuda') - + new_port[0] = new_port + torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) torch.distributed.destroy_process_group() # destroy connections on old port From 9c3dde55de19f04cb0e1b095546be0e3b772a602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:05:51 +0200 Subject: [PATCH 134/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 010342d7ec73e..06db812006baf 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ 
b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -245,9 +245,9 @@ def reset_connection(self, trainer, model): new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: - new_port = find_open_network_port() - new_port[0] = new_port - + port = find_open_network_port() + new_port[0] = port + torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) torch.distributed.destroy_process_group() # destroy connections on old port From 02a50708ff49fa886193e9b2f8bd6e4226f6f0e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:07:10 +0200 Subject: [PATCH 135/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 06db812006baf..7a23d7eaad3c6 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -241,7 +241,7 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "ddp connection already initialized, moving to new port") - torch.distributed.barrier() + # torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: From 0c5592c8574fff6a063a4dcd412a2ef6e2996159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:09:30 +0200 Subject: [PATCH 136/195] cleanup --- pytorch_lightning/accelerator_backends/ddp_backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 7a23d7eaad3c6..6280443949bc4 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -22,7 +22,7 @@ from os.path import abspath from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_debug from pytorch_lightning import _logger as log from typing import Optional @@ -239,9 +239,8 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): - print(trainer.global_rank, "ddp connection already initialized, moving to new port") + rank_zero_debug("DDP connection already initialized. 
Reinitializing on new port...") - # torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: From f1c5edcfbebd7219f15dab2b6e2d09077cf76017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:17:42 +0200 Subject: [PATCH 137/195] cleanup --- .../accelerator_backends/ddp_backend.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 6280443949bc4..6d2c727aeb006 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -234,8 +234,9 @@ class DistributedConnection: def __init__(self, trainer): super().__init__() self.trainer = trainer - # select or set an initial port before ddp connection is initialized - self._set_master_port(port=self._get_master_port()) + if trainer.num_nodes == 1: + # select or forcibly set an initial port before ddp connection is initialized + self._set_master_port(port=self._get_master_port()) def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): @@ -263,22 +264,14 @@ def exit_handler(): def _get_master_port(self): return os.environ.get('MASTER_PORT') - # TODO: document def _set_master_port(self, port: int = None): """ When running DDP NOT managed by SLURM, the ports might collide """ - # assert self.trainer.num_nodes == 1, 'random port can only be called from single node training' + assert self.trainer.num_nodes == 1, 'random port can only be called from single node training' os.environ['MASTER_PORT'] = str(port or find_open_network_port()) return port - def teardown(self): - return - if torch.distributed.is_initialized(): - torch.cuda.empty_cache() - torch.distributed.barrier() - torch.distributed.destroy_process_group() - def find_open_network_port(): import socket From 59c95acb22935e5decfa6618a37e493a3488cc48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:22:16 +0200 Subject: [PATCH 138/195] cleanup --- .../accelerator_backends/ddp_backend.py | 11 +++++++++-- .../accelerator_backends/ddp_spawn_backend.py | 12 ++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 6d2c727aeb006..369e1900a96b9 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -266,7 +266,14 @@ def _get_master_port(self): def _set_master_port(self, port: int = None): """ - When running DDP NOT managed by SLURM, the ports might collide + Sets the `MASTER_PORT` environment variable in single-node DDP training. + + Args: + port: If provided, sets the environment variable MASTER_PORT, and otherwhise + an attempt is made to find an unused open port. + + Return: + The port that was set. 
""" assert self.trainer.num_nodes == 1, 'random port can only be called from single node training' os.environ['MASTER_PORT'] = str(port or find_open_network_port()) @@ -280,4 +287,4 @@ def find_open_network_port(): s.listen(1) port = s.getsockname()[1] s.close() - return port \ No newline at end of file + return port diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 1a4feeaef5a0f..627e471cf2fce 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License -import os import torch import torch.multiprocessing as mp -from pytorch_lightning.accelerator_backends.ddp_backend import find_open_network_port, DistributedConnection +from pytorch_lightning.accelerator_backends.ddp_backend import DistributedConnection from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning import _logger as log @@ -36,9 +35,6 @@ def __init__(self, trainer): self.distributed_connection = DistributedConnection(trainer) def setup(self): - # TODO: check - # self.trainer.set_random_port() - # pass in a state q smp = mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() @@ -99,12 +95,8 @@ def ddp_train(self, process_idx, mp_queue, model): # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = self.trainer + self.distributed_connection.reset_connection(self.trainer, model) - # model.init_ddp_connection( - # self.trainer.global_rank, - # self.trainer.world_size, - # self.trainer.is_slurm_managing_tasks - # ) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) From ed4058fbc98aa64a760da17c7fab3224e7590869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:26:32 +0200 Subject: [PATCH 139/195] merge --- .../accelerator_backends/ddp_backend.py | 290 ------------------ .../accelerator_backends/ddp_spawn_backend.py | 163 ---------- pytorch_lightning/accelerators/ddp_backend.py | 105 +++++-- .../accelerators/ddp_spawn_backend.py | 24 +- 4 files changed, 94 insertions(+), 488 deletions(-) delete mode 100644 pytorch_lightning/accelerator_backends/ddp_backend.py delete mode 100644 pytorch_lightning/accelerator_backends/ddp_spawn_backend.py diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py deleted file mode 100644 index 369e1900a96b9..0000000000000 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import atexit -import os -import torch -import torch.distributed -import subprocess -import sys -from time import sleep -import numpy as np -from os.path import abspath - -from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE -from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_debug -from pytorch_lightning import _logger as log -from typing import Optional - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - -try: - from apex import amp -except ImportError: - APEX_AVAILABLE = False -else: - APEX_AVAILABLE = True - - -class DDPBackend(object): - - def __init__(self, trainer): - self.trainer = trainer - self.task_idx = None - self.distributed_connection = DistributedConnection(trainer) - - def slurm_setup(self): - self.task_idx = int(os.environ['SLURM_LOCALID']) - - def torchelastic_setup(self): - self.task_idx = int(os.environ['LOCAL_RANK']) - - def train(self, model): - self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) - - def spawn_ddp_children(self, model): - assert self.trainer.global_rank == 0 - - master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') - os.environ['MASTER_ADDR'] = f'{master_address}' - - # allow the user to pass the node rank - node_rank = '0' - node_rank = os.environ.get('NODE_RANK', node_rank) - node_rank = os.environ.get('GROUP_RANK', node_rank) - os.environ['NODE_RANK'] = node_rank - os.environ['LOCAL_RANK'] = '0' - - # when user is using hydra find the absolute path - path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path - - # pull out the commands used to run the script and resolve the abs file path - command = sys.argv - try: - full_path = path_lib(command[0]) - except Exception as e: - full_path = abspath(command[0]) - - command[0] = full_path - # use the same python interpreter and actually running - command = [sys.executable] + command - - # since this script sets the visible devices we replace the gpus flag with a number - num_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',').__len__() - - if '--gpus' in command: - gpu_flag_idx = command.index('--gpus') - command[gpu_flag_idx + 1] = f'{num_gpus}' - - os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}' - - self.trainer.interactive_ddp_procs = [] - for local_rank in range(1, self.trainer.num_processes): - env_copy = os.environ.copy() - env_copy['LOCAL_RANK'] = f'{local_rank}' - - # start process - # if hydra is available and initialized, make sure to set the cwd correctly - cwd: Optional[str] = None - if HYDRA_AVAILABLE: - if HydraConfig.initialized(): - cwd = get_original_cwd() - proc = subprocess.Popen(command, env=env_copy, cwd=cwd) - self.trainer.interactive_ddp_procs.append(proc) - - # starting all processes at once can cause issues - # with dataloaders delay between 1-10 seconds - delay = np.random.uniform(1, 5, 1)[0] - sleep(delay) - - local_rank = 0 - results = self.ddp_train(local_rank, mp_queue=None, model=model, is_master=True) - del os.environ['WORLD_SIZE'] - - return results - - def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - is_master: - proc_offset: - - Returns: - - """ - # offset the process id if requested - process_idx = 
process_idx + proc_offset - - # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - - self.distributed_connection.reset_connection(self.trainer, model) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero: - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = model.configure_sync_batchnorm(model) - - # MODEL - # copy model to each gpu - if self.trainer.on_gpu: - gpu_idx = process_idx - - # when using ddp, the master process (proc 0) continues running as the main one - # this means that the local rank will always be 0 - # (even if cuda visible devices has other visible gpus) - # this means that the master process needs to pull the 0th visible index as the device number - if is_master: - available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') - gpu_idx = int(available_gpus[self.trainer.local_rank]) - - self.trainer.root_gpu = gpu_idx - torch.cuda.set_device(self.trainer.root_gpu) - model.cuda(self.trainer.root_gpu) - - # set model properties before going into wrapper - self.trainer.copy_trainer_model_properties(model) - - # AMP - # run through amp wrapper before going to distributed DP - # TODO: remove with dropping NVIDIA AMP support - if self.trainer.use_amp and not NATIVE_AMP_AVALAIBLE: - model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) - self.trainer.optimizers = optimizers - self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) - - # DDP2 uses all GPUs on the machine - if self.trainer.distributed_backend == 'ddp' or self.trainer.distributed_backend == 'ddp_spawn': - device_ids = [self.trainer.root_gpu] - else: # includes ddp_cpu - device_ids = None - - # allow user to configure ddp - model = model.configure_ddp(model, device_ids) - - # continue training routine - results = self.trainer.run_pretrain_routine(model) - - # get original model - model = self.trainer.get_model() - - # persist info in ddp_spawn - self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - - # clean up memory - torch.cuda.empty_cache() - - if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: - return results - - -class 
DistributedConnection: - - def __init__(self, trainer): - super().__init__() - self.trainer = trainer - if trainer.num_nodes == 1: - # select or forcibly set an initial port before ddp connection is initialized - self._set_master_port(port=self._get_master_port()) - - def reset_connection(self, trainer, model): - if torch.distributed.is_initialized(): - rank_zero_debug("DDP connection already initialized. Reinitializing on new port...") - - new_port = torch.empty(1, dtype=torch.int, device='cuda') - - if trainer.global_rank == 0: - port = find_open_network_port() - new_port[0] = port - - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - torch.distributed.destroy_process_group() # destroy connections on old port - self._set_master_port(port=new_port) - - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - - def exit_handler(): - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - atexit.register(exit_handler) - - def _get_master_port(self): - return os.environ.get('MASTER_PORT') - - def _set_master_port(self, port: int = None): - """ - Sets the `MASTER_PORT` environment variable in single-node DDP training. - - Args: - port: If provided, sets the environment variable MASTER_PORT, and otherwhise - an attempt is made to find an unused open port. - - Return: - The port that was set. - """ - assert self.trainer.num_nodes == 1, 'random port can only be called from single node training' - os.environ['MASTER_PORT'] = str(port or find_open_network_port()) - return port - - -def find_open_network_port(): - import socket - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.bind(("", 0)) - s.listen(1) - port = s.getsockname()[1] - s.close() - return port diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py deleted file mode 100644 index 627e471cf2fce..0000000000000 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
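For readers following the reset_connection logic shown above (and moved verbatim between files in the surrounding patches): rank 0 picks a fresh port, the value is shared with the other ranks over the still-open process group, the old group is destroyed, and the connection is re-initialized on the new port. Below is a condensed sketch of just the broadcast step; the function name is illustrative and not used in the patch, and it assumes an NCCL process group is already initialized, which is why a CUDA tensor is used (with the gloo backend a CPU tensor would serve the same purpose).

import os
import torch
import torch.distributed as dist

def broadcast_master_port(global_rank: int, port_on_rank_zero: int) -> int:
    # Share rank 0's chosen port with every rank over the existing process group.
    buf = torch.empty(1, dtype=torch.int, device='cuda')  # NCCL collectives require CUDA tensors
    if global_rank == 0:
        buf[0] = port_on_rank_zero
    dist.broadcast(buf, src=0)
    port = int(buf.item())
    os.environ['MASTER_PORT'] = str(port)  # all ranks now agree on the new master port
    return port

After this step the old group can be destroyed and init_ddp_connection called again on the new port, which is exactly the order followed by reset_connection above.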
-# See the License for the specific language governing permissions and -# limitations under the License - -import torch -import torch.multiprocessing as mp - -from pytorch_lightning.accelerator_backends.ddp_backend import DistributedConnection -from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning import _logger as log - -try: - from apex import amp -except ImportError: - APEX_AVAILABLE = False -else: - APEX_AVAILABLE = True - - -class DDPSpawnBackend(object): - - def __init__(self, trainer): - self.trainer = trainer - self.mp_queue = None - self.distributed_connection = DistributedConnection(trainer) - - def setup(self): - # pass in a state q - smp = mp.get_context('spawn') - self.mp_queue = smp.SimpleQueue() - - def train(self, model, nprocs): - mp.spawn(self.ddp_train, nprocs=nprocs, args=(self.mp_queue, model,)) - - def teardown(self, model): - # restore main state with best weights - best_path = self.mp_queue.get() - results = self.mp_queue.get() - last_path = self.mp_queue.get() - - # transfer back the best path to the trainer - if self.trainer.checkpoint_callback: - self.trainer.checkpoint_callback.best_model_path = best_path - # todo, pass also bets score - - # load last weights - if last_path is not None and not self.trainer.testing: - ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - self.trainer.model = model - return results - - def ddp_train(self, process_idx, mp_queue, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - - """ - # show progressbar only on progress_rank 0 - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # determine which process we are and world size - if self.trainer.use_ddp: - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - elif self.trainer.use_ddp2: - self.trainer.local_rank = self.trainer.node_rank - self.trainer.global_rank = self.trainer.node_rank - self.trainer.world_size = self.trainer.num_nodes - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - - self.distributed_connection.reset_connection(self.trainer, model) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero: - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend}') - log.info(f'All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = model.configure_sync_batchnorm(model) - - # MODEL - # copy model to each gpu - if self.trainer.on_gpu: - gpu_idx = process_idx - self.trainer.root_gpu = gpu_idx - torch.cuda.set_device(self.trainer.root_gpu) - model.cuda(self.trainer.root_gpu) - - # set model properties before going into wrapper - self.trainer.copy_trainer_model_properties(model) - - # AMP - # run through amp wrapper before going to distributed DP - # TODO: remove with dropping NVIDIA AMP support - native_amp_available = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") - if self.trainer.use_amp and not native_amp_available: - model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) - self.trainer.optimizers = optimizers - self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) - - # DDP2 uses all GPUs on the machine - if self.trainer.distributed_backend == 'ddp' or self.trainer.distributed_backend == 'ddp_spawn': - device_ids = [self.trainer.root_gpu] - elif self.trainer.use_ddp2: - device_ids = self.trainer.data_parallel_device_ids - else: # includes ddp_cpu - device_ids = None - - # allow user to configure ddp - model = model.configure_ddp(model, device_ids) - - # continue training routine - results = self.trainer.run_pretrain_routine(model) - - # get original model - model = self.trainer.get_model() - - # persist info in ddp_spawn - self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - - # clean up memory - torch.cuda.empty_cache() diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index e499feda651d9..369e1900a96b9 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -11,20 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License - +import atexit import os +import torch +import torch.distributed import subprocess import sys -from os.path import abspath from time import sleep -from typing import Optional - import numpy as np -import torch +from os.path import abspath +from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE +from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_debug from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from typing import Optional try: from hydra.utils import to_absolute_path, get_original_cwd @@ -37,7 +37,9 @@ try: from apex import amp except ImportError: - amp = None + APEX_AVAILABLE = False +else: + APEX_AVAILABLE = True class DDPBackend(object): @@ -45,6 +47,7 @@ class DDPBackend(object): def __init__(self, trainer): self.trainer = trainer self.task_idx = None + self.distributed_connection = DistributedConnection(trainer) def slurm_setup(self): self.task_idx = int(os.environ['SLURM_LOCALID']) @@ -56,19 +59,15 @@ def train(self, model): self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) def spawn_ddp_children(self, model): - port = os.environ['MASTER_PORT'] + assert self.trainer.global_rank == 0 - master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR'] - os.environ['MASTER_PORT'] = f'{port}' + master_address = os.environ.get('MASTER_ADDR', '127.0.0.1') os.environ['MASTER_ADDR'] = f'{master_address}' # allow the user to pass the node rank node_rank = '0' - if 'NODE_RANK' in os.environ: - node_rank = os.environ['NODE_RANK'] - if 'GROUP_RANK' in os.environ: - node_rank = os.environ['GROUP_RANK'] - + node_rank = os.environ.get('NODE_RANK', node_rank) + node_rank = os.environ.get('GROUP_RANK', node_rank) os.environ['NODE_RANK'] = node_rank os.environ['LOCAL_RANK'] = '0' @@ -153,11 +152,8 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = self.trainer - model.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) + + self.distributed_connection.reset_connection(self.trainer, model) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) @@ -200,8 +196,10 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # set model properties before going into wrapper self.trainer.copy_trainer_model_properties(model) - # AMP - run through amp wrapper before going to distributed DP - if self.trainer.amp_type == AMPType.APEX: + # AMP + # run through amp wrapper before going to distributed DP + # TODO: remove with dropping NVIDIA AMP support + if self.trainer.use_amp and not NATIVE_AMP_AVALAIBLE: model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) self.trainer.optimizers = optimizers self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) @@ -229,3 +227,64 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results + + +class DistributedConnection: + + def __init__(self, trainer): + super().__init__() + self.trainer = trainer + if 
trainer.num_nodes == 1: + # select or forcibly set an initial port before ddp connection is initialized + self._set_master_port(port=self._get_master_port()) + + def reset_connection(self, trainer, model): + if torch.distributed.is_initialized(): + rank_zero_debug("DDP connection already initialized. Reinitializing on new port...") + + new_port = torch.empty(1, dtype=torch.int, device='cuda') + + if trainer.global_rank == 0: + port = find_open_network_port() + new_port[0] = port + + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + torch.distributed.destroy_process_group() # destroy connections on old port + self._set_master_port(port=new_port) + + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + + def exit_handler(): + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + atexit.register(exit_handler) + + def _get_master_port(self): + return os.environ.get('MASTER_PORT') + + def _set_master_port(self, port: int = None): + """ + Sets the `MASTER_PORT` environment variable in single-node DDP training. + + Args: + port: If provided, sets the environment variable MASTER_PORT, and otherwhise + an attempt is made to find an unused open port. + + Return: + The port that was set. + """ + assert self.trainer.num_nodes == 1, 'random port can only be called from single node training' + os.environ['MASTER_PORT'] = str(port or find_open_network_port()) + return port + + +def find_open_network_port(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port diff --git a/pytorch_lightning/accelerators/ddp_spawn_backend.py b/pytorch_lightning/accelerators/ddp_spawn_backend.py index 9ed68f66083ad..627e471cf2fce 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerators/ddp_spawn_backend.py @@ -15,14 +15,16 @@ import torch import torch.multiprocessing as mp -from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import AMPType +from pytorch_lightning.accelerator_backends.ddp_backend import DistributedConnection from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning import _logger as log try: from apex import amp except ImportError: - amp = None + APEX_AVAILABLE = False +else: + APEX_AVAILABLE = True class DDPSpawnBackend(object): @@ -30,10 +32,9 @@ class DDPSpawnBackend(object): def __init__(self, trainer): self.trainer = trainer self.mp_queue = None + self.distributed_connection = DistributedConnection(trainer) def setup(self): - self.trainer.set_random_port() - # pass in a state q smp = mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() @@ -94,11 +95,8 @@ def ddp_train(self, process_idx, mp_queue, model): # try to init for 20 times at max in case ports are taken # where to store ip_table model.trainer = self.trainer - model.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) + + self.distributed_connection.reset_connection(self.trainer, model) # call setup after the ddp process has connected self.trainer.call_setup_hook(model) @@ -132,9 +130,11 @@ def ddp_train(self, process_idx, mp_queue, model): # set model properties before going into wrapper self.trainer.copy_trainer_model_properties(model) - # AMP - + # AMP # run through amp wrapper before going to distributed DP - if self.trainer.amp_type == AMPType.APEX: + # 
TODO: remove with dropping NVIDIA AMP support + native_amp_available = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") + if self.trainer.use_amp and not native_amp_available: model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) self.trainer.optimizers = optimizers self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) From 1aa0591c51bd8eb9b3381500aa35a10b3bf48ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 06:27:33 +0200 Subject: [PATCH 140/195] cleanup --- pytorch_lightning/accelerators/ddp_spawn_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_spawn_backend.py b/pytorch_lightning/accelerators/ddp_spawn_backend.py index 627e471cf2fce..3eea10017dd61 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerators/ddp_spawn_backend.py @@ -15,7 +15,7 @@ import torch import torch.multiprocessing as mp -from pytorch_lightning.accelerator_backends.ddp_backend import DistributedConnection +from pytorch_lightning.accelerators.ddp_backend import DistributedConnection from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning import _logger as log From c81138f53d9214bfad764b44dd7cd6b579397ce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 16:34:13 +0200 Subject: [PATCH 141/195] cleanup --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 39137c9805437..33e66f7a1f95a 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -113,7 +113,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - p.communicate() + # p.communicate() std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() From 528381ba4dce4ac13dc35f85c40c9ec3d7b95f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 16:54:13 +0200 Subject: [PATCH 142/195] try atexit handler --- pytorch_lightning/accelerators/ddp_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 369e1900a96b9..fe4eb177a136b 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -108,6 +108,8 @@ def spawn_ddp_children(self, model): proc = subprocess.Popen(command, env=env_copy, cwd=cwd) self.trainer.interactive_ddp_procs.append(proc) + atexit.register(lambda _: proc.kill()) + # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds delay = np.random.uniform(1, 5, 1)[0] From a0dca5baeead0fb5df9008866b230c00f825a501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 17:02:17 +0200 Subject: [PATCH 143/195] cleanup --- tests/models/data/ddp/train_test_variations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py index 1ac2e110dd599..e37b71fa7ae57 100644 --- a/tests/models/data/ddp/train_test_variations.py +++ b/tests/models/data/ddp/train_test_variations.py 
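One detail worth noting about the atexit experiment a few patches above: atexit.register(func, *args) invokes func with exactly the arguments passed to register, so registering lambda _: proc.kill() registers a callable that requires one positional argument and will raise a TypeError at interpreter exit instead of killing the child. A minimal illustration follows; the subprocess command is arbitrary and chosen only for the example.

import atexit
import subprocess

proc = subprocess.Popen(['sleep', '60'])

# Registering the bound method directly means it is called as proc.kill() at exit.
atexit.register(proc.kill)

# By contrast, atexit.register(lambda _: proc.kill()) would be invoked with no
# arguments at exit, raise TypeError, and leave the child process running.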
@@ -60,6 +60,10 @@ def main(): run_variation = globals()[args.variation] run_variation(trainer, model) + # TODO + for p in trainer.interactive_ddp_procs: + p.kill() + if __name__ == '__main__': main() From 7a16c3224eb3441edd3c482fc912f950536e3cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 17:38:44 +0200 Subject: [PATCH 144/195] cleanup --- pytorch_lightning/accelerators/ddp_backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index fe4eb177a136b..369e1900a96b9 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -108,8 +108,6 @@ def spawn_ddp_children(self, model): proc = subprocess.Popen(command, env=env_copy, cwd=cwd) self.trainer.interactive_ddp_procs.append(proc) - atexit.register(lambda _: proc.kill()) - # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds delay = np.random.uniform(1, 5, 1)[0] From 473f004b8117cafd137d43b4133743cf551792cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 17:43:10 +0200 Subject: [PATCH 145/195] add note about teardown --- tests/models/data/ddp/train_test_variations.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py index e37b71fa7ae57..40088e5f74520 100644 --- a/tests/models/data/ddp/train_test_variations.py +++ b/tests/models/data/ddp/train_test_variations.py @@ -61,6 +61,9 @@ def main(): run_variation(trainer, model) # TODO + # remove this in https://github.com/PyTorchLightning/pytorch-lightning/pull/2165 + # when we have proper signal handling working + # otherwise we will see zombie processes in CI, causing tests to hang for p in trainer.interactive_ddp_procs: p.kill() From c7365fd355aa930f0c1f7d34c98865ed73c536af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:08:21 +0200 Subject: [PATCH 146/195] cleanup --- pytorch_lightning/accelerators/ddp_backend.py | 2 ++ tests/models/test_gpu.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 369e1900a96b9..3afdea8ae1666 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -253,6 +253,8 @@ def reset_connection(self, trainer, model): torch.distributed.destroy_process_group() # destroy connections on old port self._set_master_port(port=new_port) + torch.distributed.barrier() + #sleep(2) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 33e66f7a1f95a..37e5398e6daf7 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -113,7 +113,6 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - # p.communicate() std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() From d432f564fe188daece8c2a097be17529cb002c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:09:22 +0200 Subject: [PATCH 147/195] cleanup --- 
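A general note on the barrier and sleep experiments in the surrounding patches: before a process group is torn down and re-created on a new port, the ranks need to synchronize so that no rank destroys the group while a peer is still inside a collective on the old port. The sketch below shows such a guarded teardown, following the same barrier-then-destroy order used by the (since removed) DistributedConnection.teardown; the function name here is illustrative only.

import torch.distributed as dist

def teardown_process_group() -> None:
    # Only tear down if a group exists; synchronize first so all ranks arrive
    # before any of them destroys the connection.
    if dist.is_initialized():
        dist.barrier()
        dist.destroy_process_group()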
pytorch_lightning/accelerators/ddp_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 3afdea8ae1666..f875da6f61710 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -253,8 +253,8 @@ def reset_connection(self, trainer, model): torch.distributed.destroy_process_group() # destroy connections on old port self._set_master_port(port=new_port) - torch.distributed.barrier() - #sleep(2) + torch.distributed.barrier() + #sleep(2) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): From dbac94475dc24f42ba5ef2d8907a5440650638d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:13:21 +0200 Subject: [PATCH 148/195] cleanup --- pytorch_lightning/accelerators/ddp_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index f875da6f61710..5ca97dd45b2db 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -253,8 +253,8 @@ def reset_connection(self, trainer, model): torch.distributed.destroy_process_group() # destroy connections on old port self._set_master_port(port=new_port) - torch.distributed.barrier() - #sleep(2) + + sleep(2) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): From ce1de36c5f505bc2935dc4ec65975da27c005f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:21:58 +0200 Subject: [PATCH 149/195] cleanup --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 5ca97dd45b2db..7ceb0d10e621e 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,7 +241,7 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): rank_zero_debug("DDP connection already initialized. Reinitializing on new port...") - + print('rank', trainer.global_rank) new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: From f6dfab9b873a8c50158b58e51f60322214a5753b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:25:39 +0200 Subject: [PATCH 150/195] cleanup --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 7ceb0d10e621e..d149f52bdeb51 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,7 +241,7 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): rank_zero_debug("DDP connection already initialized. 
Reinitializing on new port...") - print('rank', trainer.global_rank) + assert trainer.global_rank == 0 new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: From c527ab5819fc64ed80b150cf272c204fe748511f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:37:42 +0200 Subject: [PATCH 151/195] cleanup --- pytorch_lightning/accelerators/ddp_backend.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index d149f52bdeb51..966138c1aea83 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -240,22 +240,26 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): - rank_zero_debug("DDP connection already initialized. Reinitializing on new port...") assert trainer.global_rank == 0 - new_port = torch.empty(1, dtype=torch.int, device='cuda') - - if trainer.global_rank == 0: - port = find_open_network_port() - new_port[0] = port - - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - torch.distributed.destroy_process_group() # destroy connections on old port - self._set_master_port(port=new_port) - + rank_zero_debug("DDP connection already initialized. Reinitializing on new port...") - sleep(2) - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + else: + # assert trainer.global_rank == 0 + # new_port = torch.empty(1, dtype=torch.int, device='cuda') + # + # if trainer.global_rank == 0: + # port = find_open_network_port() + # new_port[0] = port + # + # torch.distributed.broadcast(new_port, src=0) + # new_port = int(new_port.item()) + # torch.distributed.destroy_process_group() # destroy connections on old port + # self._set_master_port(port=new_port) + + + #sleep(2) + print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): if torch.distributed.is_initialized(): From 48263a84cc91804fd9639e4d08af49f222848def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:41:57 +0200 Subject: [PATCH 152/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 966138c1aea83..fab5efdf60190 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -13,6 +13,8 @@ # limitations under the License import atexit import os +import signal + import torch import torch.distributed import subprocess @@ -266,6 +268,8 @@ def exit_handler(): torch.distributed.destroy_process_group() atexit.register(exit_handler) + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From f393d469cd67db427eae6825365023cb0d03bd8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:46:04 +0200 Subject: [PATCH 153/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py 
index fab5efdf60190..99cff713026f3 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -264,7 +264,7 @@ def reset_connection(self, trainer, model): model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and trainer.global_rank > 0: torch.distributed.destroy_process_group() atexit.register(exit_handler) From d8b7d6600edd266f74f644aa0c789624420e827e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:49:49 +0200 Subject: [PATCH 154/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 99cff713026f3..86a4d0bc304bd 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -13,8 +13,6 @@ # limitations under the License import atexit import os -import signal - import torch import torch.distributed import subprocess @@ -242,34 +240,31 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): - assert trainer.global_rank == 0 - rank_zero_debug("DDP connection already initialized. Reinitializing on new port...") - - else: - # assert trainer.global_rank == 0 - # new_port = torch.empty(1, dtype=torch.int, device='cuda') - # - # if trainer.global_rank == 0: - # port = find_open_network_port() - # new_port[0] = port - # - # torch.distributed.broadcast(new_port, src=0) - # new_port = int(new_port.item()) - # torch.distributed.destroy_process_group() # destroy connections on old port - # self._set_master_port(port=new_port) - - - #sleep(2) - print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + print("DDP connection already initialized. 
Reinitializing on new port...") + + new_port = torch.empty(1, dtype=torch.int, device='cuda') + + if trainer.global_rank == 0: + port = find_open_network_port() + new_port[0] = port + + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + torch.distributed.destroy_process_group() # destroy connections on old port + print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + self._set_master_port(port=new_port) + + print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): - if torch.distributed.is_initialized() and trainer.global_rank > 0: + if torch.distributed.is_initialized(): + print('destroying on ', trainer.global_rank) torch.distributed.destroy_process_group() atexit.register(exit_handler) - signal.signal(signal.SIGINT, exit_handler) - signal.signal(signal.SIGTERM, exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From 569fe0e26eed234ea0e0d0b0caf0a17fda240ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 10 Aug 2020 21:56:06 +0200 Subject: [PATCH 155/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 86a4d0bc304bd..888ad857f1bef 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,7 +241,8 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print("DDP connection already initialized. Reinitializing on new port...") - + + torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') if trainer.global_rank == 0: From 3d66bac53768e92ff6c9ab5fe01f162d55288a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 06:52:37 +0200 Subject: [PATCH 156/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 888ad857f1bef..ace4db2a05596 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -216,6 +216,9 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # continue training routine results = self.trainer.run_pretrain_routine(model) + # in case this is the testing loop, n + self.trainer.run_training_teardown() + # get original model model = self.trainer.get_model() @@ -241,7 +244,7 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print("DDP connection already initialized. 
Reinitializing on new port...") - + torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') From ce59c5f58236cfb71f92002216423637c04bcc30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 06:55:14 +0200 Subject: [PATCH 157/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index ace4db2a05596..926ed97409848 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -216,8 +216,13 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # continue training routine results = self.trainer.run_pretrain_routine(model) - # in case this is the testing loop, n - self.trainer.run_training_teardown() + if self.trainer.global_rank == 0: + for proc in self.interactive_ddp_procs: + subprocess.Popen.kill(proc) + + # clean up dist group + if self.use_ddp or self.use_ddp2: + torch.distributed.destroy_process_group() # get original model model = self.trainer.get_model() From e53dbe06f17a2085e5b3faf290683cf65b0d7a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 06:55:46 +0200 Subject: [PATCH 158/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 926ed97409848..e238fbdd3bf56 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -217,11 +217,11 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 results = self.trainer.run_pretrain_routine(model) if self.trainer.global_rank == 0: - for proc in self.interactive_ddp_procs: + for proc in self.trainer.interactive_ddp_procs: subprocess.Popen.kill(proc) # clean up dist group - if self.use_ddp or self.use_ddp2: + if self.trainer.use_ddp or self.trainer.use_ddp2: torch.distributed.destroy_process_group() # get original model From cab2245860708702391644f973d67199a3a11716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 06:56:27 +0200 Subject: [PATCH 159/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index e238fbdd3bf56..7e4b7da65de25 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -216,14 +216,6 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # continue training routine results = self.trainer.run_pretrain_routine(model) - if self.trainer.global_rank == 0: - for proc in self.trainer.interactive_ddp_procs: - subprocess.Popen.kill(proc) - - # clean up dist group - if self.trainer.use_ddp or self.trainer.use_ddp2: - torch.distributed.destroy_process_group() - # get original model model = self.trainer.get_model() From f7fb55d5d7c4c0889126bb17a47cc5cc9ec30fdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 06:57:45 +0200 Subject: [PATCH 160/195] repair --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index ec5bd0938d15c..39ed7177657af 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1144,8 +1144,8 @@ def run_training_teardown(self): subprocess.Popen.kill(proc) # clean up dist group - if self.use_ddp or self.use_ddp2: - torch_distrib.destroy_process_group() + # if self.use_ddp or self.use_ddp2: + # torch_distrib.destroy_process_group() # clear mem if self.on_gpu: From d9bd460a0855183f2587e8c329b96c9d562383dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:01:56 +0200 Subject: [PATCH 161/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 7e4b7da65de25..abf8602ae5181 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,21 +241,21 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print("DDP connection already initialized. Reinitializing on new port...") - - torch.distributed.barrier() - new_port = torch.empty(1, dtype=torch.int, device='cuda') - - if trainer.global_rank == 0: - port = find_open_network_port() - new_port[0] = port - - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() # destroy connections on old port - print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) - print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) - self._set_master_port(port=new_port) + return + # torch.distributed.barrier() + # new_port = torch.empty(1, dtype=torch.int, device='cuda') + # + # if trainer.global_rank == 0: + # port = find_open_network_port() + # new_port[0] = port + # + # torch.distributed.broadcast(new_port, src=0) + # new_port = int(new_port.item()) + # print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + # torch.distributed.destroy_process_group() # destroy connections on old port + # print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + # print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + # self._set_master_port(port=new_port) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From d6fd24c0c0694d1e2b9e23fc39446dff6c440250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:03:12 +0200 Subject: [PATCH 162/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index abf8602ae5181..7e4b7da65de25 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,21 +241,21 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print("DDP connection already initialized. 
Reinitializing on new port...") - return - # torch.distributed.barrier() - # new_port = torch.empty(1, dtype=torch.int, device='cuda') - # - # if trainer.global_rank == 0: - # port = find_open_network_port() - # new_port[0] = port - # - # torch.distributed.broadcast(new_port, src=0) - # new_port = int(new_port.item()) - # print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - # torch.distributed.destroy_process_group() # destroy connections on old port - # print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) - # print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) - # self._set_master_port(port=new_port) + + torch.distributed.barrier() + new_port = torch.empty(1, dtype=torch.int, device='cuda') + + if trainer.global_rank == 0: + port = find_open_network_port() + new_port[0] = port + + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + torch.distributed.destroy_process_group() # destroy connections on old port + print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + self._set_master_port(port=new_port) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 4bf37065cbae2e4a670515b51e662ff1171849bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:04:13 +0200 Subject: [PATCH 163/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 7e4b7da65de25..7864432ffef58 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -252,7 +252,7 @@ def reset_connection(self, trainer, model): torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() # destroy connections on old port + # torch.distributed.destroy_process_group() # destroy connections on old port print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) self._set_master_port(port=new_port) From 72edd6a79a6eab8604823b800a05fef5d4cfe454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:13:39 +0200 Subject: [PATCH 164/195] debug --- pytorch_lightning/accelerators/ddp_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 7864432ffef58..95fe6ed77e19c 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -240,7 +240,7 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): - print("DDP connection already initialized. Reinitializing on new port...") + print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') @@ -252,7 +252,7 @@ def reset_connection(self, trainer, model): torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - # torch.distributed.destroy_process_group() # destroy connections on old port + torch.distributed.destroy_process_group() # destroy connections on old port print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) self._set_master_port(port=new_port) From d128cd508b8e3079c44f2dfe62b77ebae888a0a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:15:26 +0200 Subject: [PATCH 165/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 95fe6ed77e19c..792113fbf6ec9 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -265,7 +265,7 @@ def exit_handler(): print('destroying on ', trainer.global_rank) torch.distributed.destroy_process_group() - atexit.register(exit_handler) + # atexit.register(exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From 795de43ae285e05e46b700d1dcfd9938a9538684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:27:41 +0200 Subject: [PATCH 166/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 792113fbf6ec9..8fba93db197dd 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,7 +241,7 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") - + return torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') @@ -261,11 +261,11 @@ def reset_connection(self, trainer, model): model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) def exit_handler(): - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and trainer.global_rank > 0: print('destroying on ', trainer.global_rank) torch.distributed.destroy_process_group() - # atexit.register(exit_handler) + atexit.register(exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From 4c0550a64afacdb0febfca2785d5823ee1069849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:31:25 +0200 Subject: [PATCH 167/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 8fba93db197dd..615ea3a81b2a1 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,6 +241,9 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") + torch.distributed.destroy_process_group() + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + return torch.distributed.barrier() new_port = torch.empty(1, dtype=torch.int, device='cuda') From a2c47b19b4cbbe7de29679cb7b70d7cd55265656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:38:34 +0200 Subject: [PATCH 168/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 615ea3a81b2a1..5e2f6df85fa51 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -13,6 +13,8 @@ # limitations under the License import atexit import os +from socket import socket + import torch import torch.distributed import subprocess @@ -242,6 +244,9 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") torch.distributed.destroy_process_group() + + sleep(10) + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) return From ae201e8edfcd1ed135f897bf63fe30dbd22e5829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:51:40 +0200 Subject: [PATCH 169/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 5e2f6df85fa51..c5bb0d3831dfa 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -13,7 +13,7 @@ # limitations under the License import atexit import os -from socket import socket +import socket import torch import torch.distributed @@ -245,8 +245,11 @@ def reset_connection(self, trainer, model): print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") torch.distributed.destroy_process_group() + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((self._get_master_address(), self._get_master_port())) + s.shutdown(socket.SHUT_RDWR) + s.close() sleep(10) - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) return @@ -278,6 +281,9 @@ def exit_handler(): def _get_master_port(self): return os.environ.get('MASTER_PORT') + def _get_master_address(self): + return os.environ.get('MASTER_ADDRESS') + def _set_master_port(self, port: int = None): """ Sets the `MASTER_PORT` environment variable in single-node DDP training. @@ -295,7 +301,6 @@ def _set_master_port(self, port: int = None): def find_open_network_port(): - import socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(("", 0)) s.listen(1) From ce90830d5aba264f8e421ffb58d8d79e090aa591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:53:52 +0200 Subject: [PATCH 170/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index c5bb0d3831dfa..f340807e36d9a 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -282,7 +282,7 @@ def _get_master_port(self): return os.environ.get('MASTER_PORT') def _get_master_address(self): - return os.environ.get('MASTER_ADDRESS') + return os.environ.get('MASTER_ADDR') def _set_master_port(self, port: int = None): """ From 99fd9f648d7181947de4b51f700f2eeaca4bdfbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:54:32 +0200 Subject: [PATCH 171/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index f340807e36d9a..262ab1274800f 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -246,7 +246,7 @@ def reset_connection(self, trainer, model): torch.distributed.destroy_process_group() s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect((self._get_master_address(), self._get_master_port())) + s.connect((self._get_master_address(), int(self._get_master_port()))) s.shutdown(socket.SHUT_RDWR) s.close() sleep(10) From e5ff21f7797ad62d0a27fbcb7152e5574f438eb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:58:24 +0200 Subject: [PATCH 172/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 262ab1274800f..ae9cee3f4e5e2 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -246,8 +246,10 @@ def reset_connection(self, trainer, model): torch.distributed.destroy_process_group() s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + print('shutdown', self._get_master_address(), int(self._get_master_port())) s.connect((self._get_master_address(), int(self._get_master_port()))) - s.shutdown(socket.SHUT_RDWR) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + #s.shutdown(socket.SHUT_RDWR) s.close() sleep(10) 
model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 9e6b892a0ba4c6caec351e3dedf520e7e7999240 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 07:59:16 +0200 Subject: [PATCH 173/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index ae9cee3f4e5e2..8f998230d459c 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -249,7 +249,7 @@ def reset_connection(self, trainer, model): print('shutdown', self._get_master_address(), int(self._get_master_port())) s.connect((self._get_master_address(), int(self._get_master_port()))) s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - #s.shutdown(socket.SHUT_RDWR) + s.shutdown(socket.SHUT_RDWR) s.close() sleep(10) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From ab7ebdd1db3a6ff10c735242577e994292f108d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:02:32 +0200 Subject: [PATCH 174/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 8f998230d459c..07c208653a16b 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -272,6 +272,7 @@ def reset_connection(self, trainer, model): print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') def exit_handler(): if torch.distributed.is_initialized() and trainer.global_rank > 0: From 47712a0452f41be6625770632853aded87ae08c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:05:26 +0200 Subject: [PATCH 175/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 07c208653a16b..3c72270c8edfc 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,34 +241,38 @@ def __init__(self, trainer): self._set_master_port(port=self._get_master_port()) def reset_connection(self, trainer, model): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + print('shutdown', self._get_master_address(), int(self._get_master_port())) + s.connect((self._get_master_address(), int(self._get_master_port()))) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.shutdown(socket.SHUT_RDWR) + s.close() + sleep(10) + + + if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") + torch.distributed.destroy_process_group() - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - print('shutdown', self._get_master_address(), int(self._get_master_port())) - s.connect((self._get_master_address(), int(self._get_master_port()))) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - s.shutdown(socket.SHUT_RDWR) - s.close() - sleep(10) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - return - torch.distributed.barrier() - new_port = torch.empty(1, dtype=torch.int, device='cuda') - - if trainer.global_rank == 0: - port = find_open_network_port() - new_port[0] = port - - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() # destroy connections on old port - print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) - print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) - self._set_master_port(port=new_port) + # + # torch.distributed.barrier() + # new_port = torch.empty(1, dtype=torch.int, device='cuda') + # + # if trainer.global_rank == 0: + # port = find_open_network_port() + # new_port[0] = port + # + # torch.distributed.broadcast(new_port, src=0) + # new_port = int(new_port.item()) + # print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + # torch.distributed.destroy_process_group() # destroy connections on old port + # print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + # print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + # self._set_master_port(port=new_port) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 68a2db66ee1b39a5c569a6787e8a0b3e0fe820eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:06:59 +0200 Subject: [PATCH 176/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 3c72270c8edfc..0b12071ac76f6 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -241,13 +241,7 @@ def __init__(self, trainer): self._set_master_port(port=self._get_master_port()) def reset_connection(self, trainer, model): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - print('shutdown', self._get_master_address(), int(self._get_master_port())) - s.connect((self._get_master_address(), int(self._get_master_port()))) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - s.shutdown(socket.SHUT_RDWR) - s.close() - sleep(10) + @@ -256,7 +250,18 @@ def reset_connection(self, trainer, model): torch.distributed.destroy_process_group() - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + print('shutdown', self._get_master_address(), int(self._get_master_port())) + s.connect((self._get_master_address(), int(self._get_master_port()))) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.shutdown(socket.SHUT_RDWR) + s.close() + sleep(10) + + + + + 
#model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) # # torch.distributed.barrier() @@ -274,6 +279,8 @@ def reset_connection(self, trainer, model): # print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) # self._set_master_port(port=new_port) + torch.distributed.barrier() + print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') From 8f8c0fd0e5f3e9d5a182bf08cd5b46040f642df5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:07:42 +0200 Subject: [PATCH 177/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 0b12071ac76f6..4a46628a6384d 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -279,8 +279,6 @@ def reset_connection(self, trainer, model): # print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) # self._set_master_port(port=new_port) - torch.distributed.barrier() - print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') From 159b4c8d68c4b04090782750d7783ca994075a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:09:55 +0200 Subject: [PATCH 178/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 4a46628a6384d..c872fc60d1a03 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -242,21 +242,12 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): - - - if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") - + return torch.distributed.destroy_process_group() - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - print('shutdown', self._get_master_address(), int(self._get_master_port())) - s.connect((self._get_master_address(), int(self._get_master_port()))) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - s.shutdown(socket.SHUT_RDWR) - s.close() - sleep(10) + @@ -283,6 +274,14 @@ def reset_connection(self, trainer, model): model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + #print('shutdown', self._get_master_address(), int(self._get_master_port())) + s.connect((self._get_master_address(), int(self._get_master_port()))) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + #s.shutdown(socket.SHUT_RDWR) + s.close() + #sleep(10) + def exit_handler(): if torch.distributed.is_initialized() and trainer.global_rank > 0: print('destroying on ', trainer.global_rank) From ce4ad1e3fa92b94d47100a37471e5bd2b2c6bfbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:10:08 +0200 Subject: [PATCH 179/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index c872fc60d1a03..721fd2fda74cf 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -244,8 +244,8 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") - return - torch.distributed.destroy_process_group() + + #torch.distributed.destroy_process_group() From ce8a93ce1a57d643589b7dd81ad4e94248c699a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:10:19 +0200 Subject: [PATCH 180/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 721fd2fda74cf..affe4236e2c88 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -245,7 +245,7 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") - #torch.distributed.destroy_process_group() + torch.distributed.destroy_process_group() From 25767df0966884257848bfbada63e5e5c05aa1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:15:59 +0200 Subject: [PATCH 181/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index affe4236e2c88..da5f02c1a4701 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -244,7 +244,7 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") - + print(dir(torch.distributed.distributed_c10d._get_default_store())) torch.distributed.destroy_process_group() From 418fc90e0d19da55f7112f2daa7bba7afa1ba9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:33:53 +0200 Subject: [PATCH 182/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index da5f02c1a4701..b042076e88a59 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -244,7 +244,7 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") - print(dir(torch.distributed.distributed_c10d._get_default_store())) + return torch.distributed.destroy_process_group() @@ -287,7 +287,7 @@ def exit_handler(): print('destroying on ', trainer.global_rank) torch.distributed.destroy_process_group() - atexit.register(exit_handler) + #atexit.register(exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From 0495da80ed3c898209b5dfbf56ebfa5e3e7a7ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:41:47 +0200 Subject: [PATCH 183/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 50 ++++++++----------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index b042076e88a59..62896bc6f8176 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -244,50 +244,42 @@ def reset_connection(self, trainer, model): if torch.distributed.is_initialized(): print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") - return - torch.distributed.destroy_process_group() - - - + #model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + torch.distributed.barrier() + new_port = torch.empty(1, dtype=torch.int, device='cuda') - #model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + if trainer.global_rank == 0: + port = find_open_network_port() + new_port[0] = port - # - # torch.distributed.barrier() - # new_port = torch.empty(1, dtype=torch.int, device='cuda') - # - # if trainer.global_rank == 0: - # port = find_open_network_port() - # new_port[0] = port - # - # torch.distributed.broadcast(new_port, src=0) - # new_port = int(new_port.item()) - # print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - # torch.distributed.destroy_process_group() # destroy connections on old port - # print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) - # print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) - # self._set_master_port(port=new_port) + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + torch.distributed.destroy_process_group() # destroy connections on old port + print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + self._set_master_port(port=new_port) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - #print('shutdown', self._get_master_address(), int(self._get_master_port())) - s.connect((self._get_master_address(), int(self._get_master_port()))) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - #s.shutdown(socket.SHUT_RDWR) - s.close() - #sleep(10) + # s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # #print('shutdown', self._get_master_address(), int(self._get_master_port())) + # s.connect((self._get_master_address(), int(self._get_master_port()))) + # s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + # #s.shutdown(socket.SHUT_RDWR) + # s.close() + # #sleep(10) def exit_handler(): if torch.distributed.is_initialized() and trainer.global_rank > 0: print('destroying on ', trainer.global_rank) torch.distributed.destroy_process_group() - #atexit.register(exit_handler) + atexit.register(exit_handler) def _get_master_port(self): return os.environ.get('MASTER_PORT') From 5b267ffa48ca4411688ced1a6f907f99dd1c8fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:48:57 +0200 Subject: [PATCH 184/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 62896bc6f8176..91a79583ad701 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -242,7 +242,12 @@ def __init__(self, trainer): def reset_connection(self, trainer, model): - if torch.distributed.is_initialized(): + if not torch.distributed.is_initialized(): + print('init ddp', 'rank', 
trainer.global_rank, 'port', self._get_master_port()) + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') + + if torch.distributed.is_initialized() and trainer.global_rank > 0: print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") #model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) @@ -262,9 +267,9 @@ def reset_connection(self, trainer, model): print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) self._set_master_port(port=new_port) - print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + + # s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # #print('shutdown', self._get_master_address(), int(self._get_master_port())) From 18e75caa1c6d3e9c2775d2782a97a386df2de59d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:57:21 +0200 Subject: [PATCH 185/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 91a79583ad701..46bf59d53b37c 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -247,27 +247,28 @@ def reset_connection(self, trainer, model): model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') + new_port = torch.tensor([self._get_master_port()], dtype=torch.int, device='cuda') if torch.distributed.is_initialized() and trainer.global_rank > 0: print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") #model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - torch.distributed.barrier() - new_port = torch.empty(1, dtype=torch.int, device='cuda') + # torch.distributed.barrier() - if trainer.global_rank == 0: - port = find_open_network_port() - new_port[0] = port - torch.distributed.broadcast(new_port, src=0) - new_port = int(new_port.item()) - print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() # destroy connections on old port - print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) - print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) - self._set_master_port(port=new_port) + #if trainer.global_rank == 0: + port = find_open_network_port() + new_port[0] = port - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + torch.distributed.broadcast(new_port, src=0) + new_port = int(new_port.item()) + print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + torch.distributed.destroy_process_group() # destroy connections on old port + print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + self._set_master_port(port=new_port) + + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 6d56a7827c40e07464de22e9ed16c77a9429ca3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:57:49 +0200 Subject: [PATCH 186/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 46bf59d53b37c..099536ce3edff 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -247,7 +247,7 @@ def reset_connection(self, trainer, model): model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') - new_port = torch.tensor([self._get_master_port()], dtype=torch.int, device='cuda') + new_port = torch.tensor([int(self._get_master_port())], dtype=torch.int, device='cuda') if torch.distributed.is_initialized() and trainer.global_rank > 0: print(trainer.global_rank, "DDP connection already initialized. 
Reinitializing on new port...") From 8622c43c4d784d10b429370a91ea2b2aecb8a262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 08:59:21 +0200 Subject: [PATCH 187/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 099536ce3edff..3f36231dc1261 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -262,13 +262,15 @@ def reset_connection(self, trainer, model): torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) - print('recv new port', 'rank', trainer.global_rank, 'port', new_port) - torch.distributed.destroy_process_group() # destroy connections on old port - print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) - print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) - self._set_master_port(port=new_port) - model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + if int(self._get_master_port()) != new_port: + print('recv new port', 'rank', trainer.global_rank, 'port', new_port) + torch.distributed.destroy_process_group() # destroy connections on old port + print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) + print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) + self._set_master_port(port=new_port) + + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From 6dfec2c9a131544b201425b90a1e54bc2bfe78db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 09:01:54 +0200 Subject: [PATCH 188/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 3f36231dc1261..57f9d5376acb7 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -262,9 +262,9 @@ def reset_connection(self, trainer, model): torch.distributed.broadcast(new_port, src=0) new_port = int(new_port.item()) + print('recv new port', 'rank', trainer.global_rank, 'port', new_port) if int(self._get_master_port()) != new_port: - print('recv new port', 'rank', trainer.global_rank, 'port', new_port) torch.distributed.destroy_process_group() # destroy connections on old port print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) From b35679c64f406c8b57075f78305280729d34352c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 09:03:27 +0200 Subject: [PATCH 189/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 57f9d5376acb7..f826b16f6dd3d 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -265,6 +265,7 @@ def reset_connection(self, trainer, model): print('recv new port', 'rank', trainer.global_rank, 'port', new_port) if int(self._get_master_port()) != new_port: + print('need to update port') 
torch.distributed.destroy_process_group() # destroy connections on old port print('destroy group', 'rank', trainer.global_rank, 'port', self._get_master_port()) print('set port', 'rank', trainer.global_rank, 'port', self._get_master_port()) @@ -272,7 +273,7 @@ def reset_connection(self, trainer, model): model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) - + print('exit') # s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # #print('shutdown', self._get_master_address(), int(self._get_master_port())) From d0e6f3ba042803fb93019e49eb91ed4ae9f0e0af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 09:08:43 +0200 Subject: [PATCH 190/195] repair --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d23cde63f450e..8826b84e0d3f7 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -954,7 +954,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi ) torch_backend = "nccl" if self.trainer.on_gpu else "gloo" - log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") + log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}, ADDR: {os.environ['MASTER_ADDR']}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) def configure_sync_batchnorm(self, model: 'LightningModule') -> 'LightningModule': From 13e9236a56b383a0671761a50aa53f3929d0ba20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 13:27:06 +0200 Subject: [PATCH 191/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index f826b16f6dd3d..8cd98fd766bb4 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -174,6 +174,8 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.trainer.lr_schedulers = lr_schedulers self.trainer.optimizer_frequencies = optimizer_frequencies + print('here 1') + # call sync_bn before .cuda(), configure_apex and configure_ddp if self.trainer.sync_batchnorm: model = model.configure_sync_batchnorm(model) @@ -191,10 +193,13 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') gpu_idx = int(available_gpus[self.trainer.local_rank]) + print('here 2') self.trainer.root_gpu = gpu_idx torch.cuda.set_device(self.trainer.root_gpu) model.cuda(self.trainer.root_gpu) + print('here 3') + # set model properties before going into wrapper self.trainer.copy_trainer_model_properties(model) From f684550e8bc8a0d668141fa4dee707673b310122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 13:29:08 +0200 Subject: [PATCH 192/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 8cd98fd766bb4..ee64d9f364b44 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -217,12 +217,18 @@ def ddp_train(self, process_idx, 
mp_queue, model, is_master=False, proc_offset=0 else: # includes ddp_cpu device_ids = None + print('here 4') + # allow user to configure ddp model = model.configure_ddp(model, device_ids) + print('here 5') + # continue training routine results = self.trainer.run_pretrain_routine(model) + print('here 6') + # get original model model = self.trainer.get_model() From b5f89782dd3923820fbdb4f9d39231e8a5f8c290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 11 Aug 2020 13:49:35 +0200 Subject: [PATCH 193/195] repair --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index ee64d9f364b44..3ba21399f835f 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -259,7 +259,7 @@ def reset_connection(self, trainer, model): print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port(), 'done') new_port = torch.tensor([int(self._get_master_port())], dtype=torch.int, device='cuda') - if torch.distributed.is_initialized() and trainer.global_rank > 0: + if torch.distributed.is_initialized() and trainer.global_rank == 0: print(trainer.global_rank, "DDP connection already initialized. Reinitializing on new port...") #model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) From f9a735339cd9f915957028c6bc67d5550f9cdd27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 15 Aug 2020 19:15:30 +0200 Subject: [PATCH 194/195] simple --- pytorch_lightning/accelerators/ddp_backend.py | 5 +++++ pytorch_lightning/trainer/trainer.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 3ba21399f835f..a679e1085184a 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -252,6 +252,11 @@ def __init__(self, trainer): self._set_master_port(port=self._get_master_port()) def reset_connection(self, trainer, model): + if not torch.distributed.is_initialized(): + print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) + model.init_ddp_connection(trainer.global_rank, trainer.world_size, trainer.is_slurm_managing_tasks) + + def reset_connection_old(self, trainer, model): if not torch.distributed.is_initialized(): print('init ddp', 'rank', trainer.global_rank, 'port', self._get_master_port()) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index cac34d655ad6f..fbf8cd04c3dc0 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -14,6 +14,7 @@ import inspect import os +import subprocess import warnings from argparse import ArgumentParser, Namespace from typing import Any, Dict, Iterable, List, Optional, Tuple, Union @@ -1339,6 +1340,20 @@ def test( self.teardown('test') + if self.global_rank == 0: + for proc in self.interactive_ddp_procs: + subprocess.Popen.kill(proc) + + # clean up dist group + # if self.use_ddp or self.use_ddp2: + # torch_distrib.destroy_process_group() + + # clear mem + if self.on_gpu: + model = self.get_model() + model.cpu() + torch.cuda.empty_cache() + return results def __test_using_best_weights(self, ckpt_path, test_dataloaders): From 68ec75074cec5b3d21cafe99cc6280d402c420c4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 15 Aug 2020 19:20:06 +0200 Subject: [PATCH 195/195] mem --- pytorch_lightning/trainer/trainer.py | 8 ++++---- pytorch_lightning/trainer/training_loop.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index fbf8cd04c3dc0..ee3082734e4f6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1349,10 +1349,10 @@ def test( # torch_distrib.destroy_process_group() # clear mem - if self.on_gpu: - model = self.get_model() - model.cpu() - torch.cuda.empty_cache() + # if self.on_gpu: + # model = self.get_model() + # model.cpu() + # torch.cuda.empty_cache() return results diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 39ed7177657af..1feb58c0a76bd 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1148,10 +1148,10 @@ def run_training_teardown(self): # torch_distrib.destroy_process_group() # clear mem - if self.on_gpu: - model = self.get_model() - model.cpu() - torch.cuda.empty_cache() + # if self.on_gpu: + # model = self.get_model() + # model.cpu() + # torch.cuda.empty_cache() def training_forward(self, batch, batch_idx, opt_idx, hiddens): """
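
The patches above (186-193) keep iterating on the same hand-off: rank 0 picks a fresh port, broadcasts it to the other ranks over the still-open process group, and only then is the old group destroyed and re-initialized on the new port. The following is a minimal, self-contained sketch of that flow, not code lifted from the patch series: find_open_network_port is re-implemented here, the backend defaults to gloo (with NCCL the broadcast tensor would have to live on the GPU, as the diffs do with device='cuda'), and MASTER_ADDR is assumed to be set already.

import os
import socket

import torch
import torch.distributed as dist


def find_open_network_port() -> int:
    # Bind to port 0 and let the OS hand back a free port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def reset_ddp_port(global_rank: int, world_size: int, backend: str = "gloo") -> None:
    # Rank 0 chooses the new port; the other ranks receive it via broadcast
    # over the process group that is still open on the old port.
    new_port = torch.tensor(
        [find_open_network_port() if global_rank == 0 else 0], dtype=torch.int
    )
    dist.broadcast(new_port, src=0)
    new_port = int(new_port.item())

    if int(os.environ["MASTER_PORT"]) != new_port:
        # Drop the old connection, then re-init on the freshly agreed port.
        dist.destroy_process_group()
        os.environ["MASTER_PORT"] = str(new_port)
        dist.init_process_group(backend, rank=global_rank, world_size=world_size)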
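
Patches 194 and 195 then back away from port renegotiation: trainer.test() kills the interactively spawned DDP workers on rank 0, while the destroy-group and clear-memory steps are left commented out during debugging. Below is a rough sketch of that teardown under the assumption that spawned_procs holds the subprocess.Popen handles the trainer tracks as interactive_ddp_procs; it only illustrates the operations the diffs toggle, not the final upstream behaviour.

import torch
import torch.distributed as dist


def teardown_after_test(model: torch.nn.Module, spawned_procs, global_rank: int) -> None:
    # Only the main process owns the child worker handles.
    if global_rank == 0:
        for proc in spawned_procs:
            proc.kill()

    # Tear down the collective so a later fit()/test() can re-initialize cleanly.
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

    # Move weights off the GPU and return cached blocks to the driver.
    if torch.cuda.is_available():
        model.cpu()
        torch.cuda.empty_cache()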