forked from Azure/azhpc-images
-
Notifications
You must be signed in to change notification settings - Fork 0
/
install.sh
executable file
·123 lines (89 loc) · 2.55 KB
/
install.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
# Usage: install.sh [NVIDIA|AMD]
#
# Top-level image install driver. The optional first argument selects the GPU
# vendor; it defaults to NVIDIA. The choice is exported as GPU so that the
# scripts invoked later can read it.
set -ex

#######################################
# Validate the requested GPU vendor and export it as GPU.
# Arguments: $1 - optional vendor name, "NVIDIA" (default) or "AMD".
#            Any further arguments are ignored.
# Globals:   GPU (written, exported)
# Outputs:   diagnostics on stderr when the vendor is rejected
# Returns:   exits the shell with status 1 for AMD (pathway not finished)
#            and for any unrecognised value, including an empty string.
#######################################
select_gpu() {
    # ${1-...} falls back only when no argument was passed at all, so an
    # explicitly empty argument is still rejected as invalid.
    local requested="${1-NVIDIA}"

    case "$requested" in
        NVIDIA)
            ;;
        AMD)
            echo "ERROR, the AMD pathway is not fully implemented yet." >&2
            exit 1
            ;;
        *)
            echo "Error: Invalid GPU type. Please specify 'NVIDIA' or 'AMD'." >&2
            exit 1
            ;;
    esac

    export GPU="$requested"
}

select_gpu "$@"
# --- Base system setup -------------------------------------------------------
# The ./ paths below are relative, so this script has to be started from the
# directory that contains these helper scripts.

# install pre-requisites
./install_prerequisites.sh

# set properties (sourced, so variables it defines stay in this shell)
source ./set_properties.sh

# UBUNTU_COMMON_DIR is used from here on but never assigned in this script;
# presumably set_properties.sh (or the caller's environment) provides it.
# Abort with a clear message rather than letting an empty value turn the
# commands below into "/remove_unused_packages.sh" etc.
: "${UBUNTU_COMMON_DIR:?UBUNTU_COMMON_DIR is not set}"

# remove packages requiring Ubuntu Pro for security updates
"${UBUNTU_COMMON_DIR}/remove_unused_packages.sh"

# install utils
./install_utils.sh

# install Lustre client
"${UBUNTU_COMMON_DIR}/install_lustre_client.sh"

# install DOCA OFED
"${UBUNTU_COMMON_DIR}/install_doca.sh"

# install PMIX
"${UBUNTU_COMMON_DIR}/install_pmix.sh"

# install mpi libraries
"${UBUNTU_COMMON_DIR}/install_mpis.sh"
# --- GPU-vendor-specific driver and container runtime ------------------------
# GPU holds the vendor name exported near the top of this script. Any other
# value falls through the case without doing anything, exactly as the two
# separate vendor checks did before.
case "$GPU" in
    NVIDIA)
        # install nvidia gpu driver
        ./install_nvidiagpudriver.sh
        # Install NCCL
        # (:? aborts with a clear error if the helper directory is unset/empty
        #  instead of trying to execute "/install_nccl.sh")
        "${UBUNTU_COMMON_DIR:?UBUNTU_COMMON_DIR is not set}/install_nccl.sh"
        # Install NVIDIA docker container
        "${UBUNTU_COMMON_DIR}/install_docker.sh"
        ;;
    AMD)
        # Set up docker
        apt-get install -y moby-engine
        systemctl enable docker
        systemctl restart docker
        ;;
esac
# cleanup downloaded tarballs - clear some space
# The relative globs act on the current working directory. "--" ends option
# parsing so a file whose name begins with "-" cannot be read as an rm option.
# Patterns that match nothing stay literal and are ignored because of -f.
rm -rf -- *.tgz *.bz2 *.tbz *.tar.gz *.run *.deb *_offline.sh
rm -rf -- /tmp/MLNX_OFED_LINUX* /tmp/*conf*
rm -rf -- /var/intel/ /var/cache/*
# Remove every non-hidden subdirectory of the current working directory.
rm -Rf -- */
# --- Post-cleanup installs and system configuration --------------------------
# COMMON_DIR and UBUNTU_COMMON_DIR are not assigned anywhere in this script;
# they are expected to already be defined at this point. Abort with a clear
# message rather than letting an empty value turn the commands below into
# "/install_intel_libs.sh" etc.
: "${COMMON_DIR:?COMMON_DIR is not set}"
: "${UBUNTU_COMMON_DIR:?UBUNTU_COMMON_DIR is not set}"

if [ "$GPU" = "NVIDIA" ]; then
    # Install DCGM
    "${UBUNTU_COMMON_DIR}/install_dcgm.sh"
fi

# install Intel libraries
"${COMMON_DIR}/install_intel_libs.sh"

# install diagnostic script
"${COMMON_DIR}/install_hpcdiag.sh"

# install persistent rdma naming
"${COMMON_DIR}/install_azure_persistent_rdma_naming.sh"

# optimizations
"${UBUNTU_COMMON_DIR}/hpc-tuning.sh"

# Install AZNFS Mount Helper
"${COMMON_DIR}/install_aznfs.sh"

# copy test file
"${COMMON_DIR}/copy_test_file.sh"

# install monitor tools
"${COMMON_DIR}/install_monitoring_tools.sh"

# install AMD libs (runs regardless of the selected GPU vendor)
"${COMMON_DIR}/install_amd_libs.sh"

# install Azure/NHC Health Checks
"${COMMON_DIR}/install_health_checks.sh"

# disable cloud-init
"${UBUNTU_COMMON_DIR}/disable_cloudinit.sh"

# disable auto kernel updates
"${UBUNTU_COMMON_DIR}/disable_auto_upgrade.sh"

# Disable Predictive Network interface renaming
"${UBUNTU_COMMON_DIR}/disable_predictive_interface_renaming.sh"

# SKU Customization
"${COMMON_DIR}/setup_sku_customizations.sh"
# AMD-only software stack; any other GPU value skips this step entirely.
case "$GPU" in
    AMD)
        # install rocm software stack
        ./install_rocm.sh
        # install rccl and rccl-tests
        ./install_rccl.sh
        ;;
esac
# clear history
# Uncomment the line below if you are running this on a VM
# $COMMON_DIR/clear_history.sh