
Convert subprocess test to standalone test #14101

Merged · 14 commits · Aug 10, 2022
11 changes: 9 additions & 2 deletions tests/tests_pytorch/run_standalone_tasks.sh
@@ -31,9 +31,16 @@ if [ -z "$LOGS" ]; then
   exit 1
 fi
 
+cd ../../
+
 # test that a user can manually launch individual processes
 echo "Running manual ddp launch test"
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
 args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
-MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} &
-MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args}
+MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} &
+MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args}
 
+# test that ddp can be launched as a module (-m option)
+echo "Running ddp example as module"
+export PYTHONPATH="${PYTHONPATH}:$(pwd)/examples/convert_from_pt_to_pl"
+python -m examples.convert_from_pt_to_pl.image_classifier_5_lightning_datamodule ${args}
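The manual-launch test above starts one process per GPU by hand instead of letting the Trainer spawn them: rank 1 goes to the background with `&`, rank 0 runs in the foreground so the script blocks until training finishes, and both processes rendezvous through the MASTER_ADDR, MASTER_PORT, and LOCAL_RANK environment variables. As a rough illustration (not part of the PR), the rendezvous those variables drive is equivalent to stock torch.distributed env:// initialization; the backend choice below is an assumption:

import os
import torch.distributed as dist

# Each manually launched process reads its rank from the environment,
# exactly as in the shell lines above (LOCAL_RANK=0 and LOCAL_RANK=1).
local_rank = int(os.environ["LOCAL_RANK"])

# MASTER_ADDR/MASTER_PORT tell every process where to meet; the default
# init_method="env://" picks them up automatically.
dist.init_process_group("gloo", rank=local_rank, world_size=2)
print(f"rank {dist.get_rank()} of {dist.get_world_size()} is up")
dist.destroy_process_group()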
58 changes: 0 additions & 58 deletions tests/tests_pytorch/strategies/ddp_model.py

This file was deleted.
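ddp_model.py was the entry point the old subprocess tests launched. Judging only from the call sites and assertions in the removed test code below, it ran a Trainer according to a method selector and saved a result dict to <tmpdir>/ddp.result. A hedged reconstruction for orientation, not the verbatim file: the flag names, model, and rank-0 guard are assumptions; only the saved-dict contract is evidenced by the diff.

import argparse
import os

import torch
from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel

def main():
    # The old tests passed "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp"
    # plus a method selector; the exact flag names here are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument("--trainer_method", default="fit")  # "fit" | "test" | "fit_test"
    parser.add_argument("--tmpdir")
    parser.add_argument("--max_epochs", type=int, default=1)
    parser.add_argument("--accelerator", default="gpu")
    parser.add_argument("--devices", type=int, default=2)
    parser.add_argument("--strategy", default="ddp")
    args = parser.parse_args()

    model = BoringModel()
    trainer = Trainer(
        max_epochs=args.max_epochs, accelerator=args.accelerator, devices=args.devices, strategy=args.strategy
    )
    result = None
    if args.trainer_method in ("fit", "fit_test"):
        trainer.fit(model)
    if args.trainer_method in ("test", "fit_test"):
        result = trainer.test(model)

    # The removed tests torch.load this file and check "status" and "test_acc",
    # so the dict layout is the one part of this sketch grounded in the diff.
    if trainer.global_rank == 0:
        torch.save({"status": "complete", "result": result}, os.path.join(args.tmpdir, "ddp.result"))

if __name__ == "__main__":
    main()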

67 changes: 24 additions & 43 deletions tests/tests_pytorch/strategies/test_ddp.py
@@ -21,60 +21,41 @@
 from torch.nn.parallel.distributed import DistributedDataParallel
 
 import pytorch_lightning as pl
-from pytorch_lightning import Trainer
+from pytorch_lightning import seed_everything, Trainer
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.demos.boring_classes import BoringModel
 from pytorch_lightning.strategies import DDPStrategy
+from tests_pytorch.helpers.datamodules import ClassifDataModule
 from tests_pytorch.helpers.runif import RunIf
-from tests_pytorch.strategies import ddp_model
-from tests_pytorch.utilities.distributed import call_training_script
+from tests_pytorch.helpers.simple_models import ClassificationModel
 
-CLI_ARGS = "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp"
 
 
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module):
-    # call the script
-    call_training_script(ddp_model, CLI_ARGS, "fit", tmpdir, timeout=120, as_module=as_module)
-
-    # load the results of the script
-    result_path = os.path.join(tmpdir, "ddp.result")
-    result = torch.load(result_path)
-
-    # verify the file wrote the expected outputs
-    assert result["status"] == "complete"
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_fit_only(tmpdir):
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+    trainer.fit(model, datamodule=dm)
 
 
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_test_only(tmpdir, as_module):
-    # call the script
-    call_training_script(ddp_model, CLI_ARGS, "test", tmpdir, as_module=as_module)
-
-    # load the results of the script
-    result_path = os.path.join(tmpdir, "ddp.result")
-    result = torch.load(result_path)
-
-    # verify the file wrote the expected outputs
-    assert result["status"] == "complete"
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_test_only(tmpdir):
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+    trainer.test(model, datamodule=dm)
 
 
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module):
-    # call the script
-    call_training_script(ddp_model, CLI_ARGS, "fit_test", tmpdir, timeout=20, as_module=as_module)
-
-    # load the results of the script
-    result_path = os.path.join(tmpdir, "ddp.result")
-    result = torch.load(result_path)
-
-    # verify the file wrote the expected outputs
-    assert result["status"] == "complete"
-
-    model_outs = result["result"]
-    for out in model_outs:
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_fit_test(tmpdir):
+    seed_everything(4321)
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+    trainer.fit(model, datamodule=dm)
+    result = trainer.test(model, datamodule=dm)
+
+    for out in result:
         assert out["test_acc"] > 0.7


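The replacement tests rely on @RunIf(..., standalone=True): rather than each test forking its own subprocesses, the standalone runner executes every such test in a fresh interpreter, so process groups and CUDA state cannot leak between tests. A minimal sketch of how this kind of gate typically works, assuming RunIf reduces to a skipif on an environment flag set by the runner (PL_RUN_STANDALONE_TESTS is a best guess at the variable name, not confirmed by this diff):

import os
import pytest

# Skip in the normal pytest session; the standalone runner sets the flag
# and invokes one test per process so DDP init/teardown stays isolated.
standalone = pytest.mark.skipif(
    os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1",
    reason="requires the standalone test runner",
)

@standalone
def test_runs_isolated():
    assert True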
45 changes: 0 additions & 45 deletions tests/tests_pytorch/utilities/distributed.py

This file was deleted.
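tests_pytorch/utilities/distributed.py held call_training_script, the helper the old tests used to shell out. From its call sites above, it launched the given module in a subprocess (optionally via python -m), forwarded the CLI arguments plus the trainer method and tmpdir, and enforced a timeout. A hedged sketch consistent with those call sites; everything beyond the signature shown there is an assumption:

import subprocess
import sys

def call_training_script(module, cli_args, method, tmpdir, timeout=60, as_module=False):
    # Forward the test's CLI string plus the method/tmpdir the script needs
    # to know which Trainer entry point to run and where to save ddp.result.
    cli_args = cli_args.split(" ") if cli_args else []
    cli_args += ["--trainer_method", method, "--tmpdir", str(tmpdir)]
    if as_module:
        command = [sys.executable, "-m", module.__spec__.name] + cli_args
    else:
        command = [sys.executable, str(module.__file__)] + cli_args

    # Run the script in a fresh interpreter and fail loudly on a hang.
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        raise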