-
Notifications
You must be signed in to change notification settings - Fork 332
/
deploy_kubeflow.sh
executable file
·287 lines (247 loc) · 10.6 KB
/
deploy_kubeflow.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/env bash
# Deploy, inspect, or remove a Kubeflow installation on a Kubernetes cluster.
# Every KUBEFLOW_* / KUSTOMIZE* setting below may be overridden from the
# environment before the script is started.

# Get the DeepOps root_dir and config_dir
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
ROOT_DIR="${SCRIPT_DIR}/../.."
CONFIG_DIR="${ROOT_DIR}/config"

# Source common libraries and env variables.
# The path is quoted so that a checkout directory containing spaces is not
# split into several words.
source "${ROOT_DIR}/scripts/common.sh"

# Poll for these to be available with the -w flag.
# NOTE: the override variable is KUBEFLOW_DEPLOYMENTS (space-separated names),
# not KUBEFLOW_POLL_DEPLOYMENTS.
KUBEFLOW_POLL_DEPLOYMENTS="${KUBEFLOW_DEPLOYMENTS:-profiles-deployment notebook-controller-deployment centraldashboard ml-pipeline minio mysql jupyter-web-app-deployment katib-mysql}"

# Specify how long to poll for Kubeflow to start (seconds)
export KUBEFLOW_TIMEOUT="${KUBEFLOW_TIMEOUT:-600}"
# Upper bound for the "kustomize build | kubectl apply" retry loop (seconds)
export KUBEFLOW_DEPLOY_TIMEOUT="${KUBEFLOW_DEPLOY_TIMEOUT:-1200}"

# Define Kubeflow manifests location
export KUBEFLOW_MANIFESTS_DEST="${KUBEFLOW_MANIFESTS_DEST:-${CONFIG_DIR}/kubeflow-install/manifests}"
export KUBEFLOW_MANIFESTS_URL="${KUBEFLOW_MANIFESTS_URL:-https://github.com/kubeflow/manifests}"
export KUBEFLOW_MANIFESTS_VERSION="${KUBEFLOW_MANIFESTS_VERSION:-v1.7.0}"

# Define configuration we're injecting into the manifests location
export KUBEFLOW_DEEPOPS_CONFIG_DIR="${KUBEFLOW_DEEPOPS_CONFIG_DIR:-${CONFIG_DIR}/files/kubeflow}"
export KUBEFLOW_DEEPOPS_DEX_CONFIG="${KUBEFLOW_DEEPOPS_DEX_CONFIG:-${KUBEFLOW_DEEPOPS_CONFIG_DIR}/dex-config-map.yaml}"
export KUBEFLOW_DEEPOPS_USERNS_PARAMS="${KUBEFLOW_DEEPOPS_USERNS_PARAMS:-${KUBEFLOW_DEEPOPS_CONFIG_DIR}/user-namespace-params.env}"

# Define Kustomize location (download URL and the path of the unpacked binary)
export KUSTOMIZE_URL="${KUSTOMIZE_URL:-https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.1.0/kustomize_v5.1.0_linux_amd64.tar.gz}"
export KUSTOMIZE="${KUSTOMIZE:-${CONFIG_DIR}/kustomize}"
# Print the supported command-line flags, one per line, on stdout.
# The -Z flag accepted by the option parser is deliberately not listed.
function help_me() {
  printf '%s\n' \
    "Usage:" \
    "-h This message." \
    "-p Print out the connection info for Kubeflow." \
    "-c Only clone the Kubeflow manifests repo, but do not deploy Kubeflow." \
    "-d Delete Kubeflow from your system (skipping the CRDs and istio-system namespace that may have been installed with Kubeflow." \
    "-D Deprecated, same as -d. Previously 'Fully Delete Kubeflow from your system along with all Kubeflow CRDs the istio-system namespace. WARNING, do not use this option if other components depend on istio.'" \
    "-x Deprecated, multi-user auth is now the default." \
    "-w Wait for Kubeflow homepage to respond (also polls for various Kubeflow Deployments to have an available status)."
}
# Parse command-line flags and record them in global variables.
# Arguments: the positional parameters to parse (normally the script's "$@").
# Globals written (each set to "true" when its flag is present):
#   KUBEFLOW_PRINT (-p), KUBEFLOW_WAIT (-w), KUBEFLOW_CLONE (-c),
#   KUBEFLOW_DELETE (-d, -D), KUBEFLOW_EXTRA_FULL_DELETE (-Z)
# -x is accepted and ignored (deprecated).
# Exits with status 1 after printing the help text for -h or an unknown flag.
function get_opts() {
  local option
  # getopts continues from the global OPTIND; reset it so every call parses
  # its own arguments from the first one.
  OPTIND=1
  while getopts "chpwxdDZ" option; do
    case "${option}" in
      p)
        KUBEFLOW_PRINT=true
        ;;
      w)
        KUBEFLOW_WAIT=true
        ;;
      c)
        KUBEFLOW_CLONE=true
        ;;
      x)
        # Deprecated flag, kept so old invocations do not fail.
        ;;
      d)
        KUBEFLOW_DELETE=true
        ;;
      D)
        KUBEFLOW_DELETE=true
        echo "The -D flag is deprecated, use -d instead"
        ;;
      Z)
        # This is a dangerous command and is not included in the help
        KUBEFLOW_EXTRA_FULL_DELETE=true
        ;;
      h)
        help_me
        exit 1
        ;;
      *)
        help_me
        exit 1
        ;;
    esac
  done
}
# Verify the prerequisites for a deployment and install missing download tools.
# Steps:
#   1. Install curl and wget through the distribution's package manager when
#      either one is missing (rhel*/centos* use yum, ubuntu* uses apt).
#   2. Require a StorageClass marked "(default)" in `kubectl get storageclass`.
#   3. Refuse to continue behind a proxy unless KUBEFLOW_PROXY_WORKAROUND is set.
# Globals read: http_proxy, https_proxy, no_proxy, KUBEFLOW_PROXY_WORKAROUND
# Exits with status 1 on an unsupported OS, a missing default StorageClass, or
# a detected proxy without the workaround flag.
function install_dependencies() {
  local need_tools=false

  # /etc/os-release provides ID (and ID_LIKE), used to choose the package manager
  . /etc/os-release

  # Both tools are used by this script (wget downloads Kustomize, curl polls
  # the dashboard), so checking curl alone could leave wget uninstalled.
  if ! command -v curl >/dev/null 2>&1 || ! command -v wget >/dev/null 2>&1; then
    need_tools=true
  fi

  case "${ID:-}" in
    rhel*|centos*)
      if [[ "${need_tools}" == "true" ]]; then
        sudo yum -y install curl wget
      fi
      ;;
    ubuntu*)
      if [[ "${need_tools}" == "true" ]]; then
        sudo apt -y install curl wget
      fi
      ;;
    *)
      echo "Unsupported Operating System $ID_LIKE"
      exit 1
      ;;
  esac

  # StorageClass (for volumes and MySQL DB)
  if ! kubectl get storageclass 2>&1 | grep -q "(default)"; then
    echo "No storageclass found"
    echo "To setup the nfs-client-provisioner (preferred), run: ansible-playbook playbooks/k8s-cluster/nfs-client-provisioner.yml"
    echo "To provision Ceph storage, run: ./scripts/k8s/deploy_rook.sh"
    exit 1
  fi

  # Proxies.
  # Each variable is tested on its own: the former unquoted `[ a -o b -o c ]`
  # form turned into a syntax error (and skipped this guard) whenever exactly
  # two of the three variables were set.
  if [[ -n "${http_proxy:-}" || -n "${https_proxy:-}" || -n "${no_proxy:-}" ]]; then
    echo "Proxy detected. This could cause problems with a default Kubeflow installation."
    echo "Refer to the workaround here: https://github.com/kubeflow/kfctl/issues/237"
    echo "After applying the workaround run: KUBEFLOW_PROXY_WORKAROUND=true ./${0}"
    if [[ -z "${KUBEFLOW_PROXY_WORKAROUND:-}" ]]; then
      exit 1
    fi
  fi
}
# Clone the Kubeflow manifests repository and apply the local customisations.
# Globals read: KUBEFLOW_MANIFESTS_DEST, KUBEFLOW_MANIFESTS_URL,
#   KUBEFLOW_MANIFESTS_VERSION, KUBEFLOW_DEEPOPS_DEX_CONFIG,
#   KUBEFLOW_DEEPOPS_USERNS_PARAMS
# Exits with status 1 when the destination already exists or the clone fails.
# The working directory of the caller is left unchanged.
function clone_repo() {
  if [ -d "${KUBEFLOW_MANIFESTS_DEST}" ]; then
    echo "Kubeflow manifests directory already exists at: ${KUBEFLOW_MANIFESTS_DEST}"
    echo "Exiting script! Please delete this directory before re-deploying."
    exit 1
  fi

  # Only the parent directory is created here and git is given the explicit
  # destination. This avoids two failure modes of "mkdir + pushd + clone .":
  # cloning into the current directory when the pushd fails, and leaving an
  # empty destination created by this script that blocks the next attempt.
  mkdir -p "$(dirname "${KUBEFLOW_MANIFESTS_DEST}")"
  if ! git clone -b "${KUBEFLOW_MANIFESTS_VERSION}" "${KUBEFLOW_MANIFESTS_URL}" "${KUBEFLOW_MANIFESTS_DEST}"; then
    echo "Failed to clone ${KUBEFLOW_MANIFESTS_URL} (${KUBEFLOW_MANIFESTS_VERSION}) into ${KUBEFLOW_MANIFESTS_DEST}" >&2
    exit 1
  fi

  # Inject custom dex config
  cp -v "${KUBEFLOW_DEEPOPS_DEX_CONFIG}" "${KUBEFLOW_MANIFESTS_DEST}/common/dex/base/config-map.yaml"
  cp -v "${KUBEFLOW_DEEPOPS_USERNS_PARAMS}" "${KUBEFLOW_MANIFESTS_DEST}/common/user-namespace/base/params.env"

  # BUG: https://stackoverflow.com/questions/76502195/horizontalpodautoscaler-not-found-on-minikube-when-installing-kubeflow
  sed -i 's:autoscaling/v2beta2:autoscaling/v2:' "${KUBEFLOW_MANIFESTS_DEST}/common/knative/knative-serving/base/upstream/serving-core.yaml"

  # XXX: Change the default Istio Ingress Gateway configuration to support NodePort for ease-of-use in on-prem
  sed -i 's:ClusterIP:NodePort:g' "${KUBEFLOW_MANIFESTS_DEST}/common/istio-1-16/istio-install/base/patches/service.yaml"

  # XXX: Make the Kubeflow cluster allow insecure http instead of https
  # Remove this for any production cluster and enable HTTPS suitable for the environment
  # XXX: https://github.com/kubeflow/manifests#connect-to-your-kubeflow-cluster
  sed -i 's:JWA_APP_SECURE_COOKIES=true:JWA_APP_SECURE_COOKIES=false:' "${KUBEFLOW_MANIFESTS_DEST}/apps/jupyter/jupyter-web-app/upstream/base/params.env"
  sed -i 's:VWA_APP_SECURE_COOKIES=true:VWA_APP_SECURE_COOKIES=false:' "${KUBEFLOW_MANIFESTS_DEST}/apps/volumes-web-app/upstream/base/params.env"
  sed -i 's:TWA_APP_SECURE_COOKIES=true:TWA_APP_SECURE_COOKIES=false:' "${KUBEFLOW_MANIFESTS_DEST}/apps/tensorboard/tensorboards-web-app/upstream/base/params.env"

  echo "Kubeflow manifests repo:"
  echo "- Cloned from: ${KUBEFLOW_MANIFESTS_URL}"
  echo "- Git branch or tag: ${KUBEFLOW_MANIFESTS_VERSION}"
  echo "- Local path: ${KUBEFLOW_MANIFESTS_DEST}"
}
# Download Kustomize and apply the Kubeflow "example" kustomization.
# The apply is retried every 10 seconds until it succeeds or
# KUBEFLOW_DEPLOY_TIMEOUT seconds have passed.
# Globals read: KUSTOMIZE, KUSTOMIZE_URL, CONFIG_DIR, KUBEFLOW_MANIFESTS_DEST,
#   KUBEFLOW_DEPLOY_TIMEOUT
# Exits with status 1 when the download, the unpacking, or the deployment fails.
# The working directory of the caller is left unchanged.
function stand_up() {
  local rc

  # Stop early on a broken download/unpack instead of retrying a missing
  # kustomize binary until the deploy timeout expires.
  if ! wget -O "${KUSTOMIZE}.tgz" "${KUSTOMIZE_URL}"; then
    echo "Failed to download Kustomize from ${KUSTOMIZE_URL}" >&2
    exit 1
  fi
  if ! tar -xvf "${KUSTOMIZE}.tgz" -C "${CONFIG_DIR}"; then
    echo "Failed to unpack ${KUSTOMIZE}.tgz into ${CONFIG_DIR}" >&2
    exit 1
  fi
  chmod +x "${KUSTOMIZE}"

  echo "Beginning Kubeflow deployment"
  # The binary and the kustomization directory are handed to the inner shell
  # as positional parameters ($1, $2) rather than being pasted into the command
  # string, so paths containing spaces or shell metacharacters stay intact.
  timeout "${KUBEFLOW_DEPLOY_TIMEOUT}" bash -c -- \
    'while ! "$1" build "$2" | kubectl apply -f -; do sleep 10; done' \
    _ "${KUSTOMIZE}" "${KUBEFLOW_MANIFESTS_DEST}/example"
  rc=$?
  if [ "${rc}" -eq 124 ]; then
    # 124 is the status `timeout` reports when the time limit was hit
    echo "Timed out attempt to deploy Kubeflow"
    exit 1
  elif [ "${rc}" -ne 0 ]; then
    echo "Kubeflow deployment loop failed with status ${rc}" >&2
    exit 1
  fi
}
# Modify the ns finalizers so they don't wait for async processes to complete.
# Arguments: one or more namespace names.
# For every namespace the current object is fetched, .spec is replaced with an
# empty finalizer list (via jq), and the result is PUT to the namespace's
# /finalize endpoint.
# The request is sent with `kubectl replace --raw`, so no background
# `kubectl proxy` has to be started (and later stopped), there is no race
# against the proxy becoming ready, and no temp files are left in /tmp.
# Namespaces that cannot be read (for example, already deleted) are skipped.
function fix_terminating_ns() {
  local ns ns_json
  for ns in "$@"; do
    if ! ns_json="$(kubectl get namespace "${ns}" -o json)"; then
      echo "Skipping namespace ${ns}: unable to read it (it may already be deleted)" >&2
      continue
    fi
    printf '%s\n' "${ns_json}" \
      | jq '.spec = {"finalizers":[]}' \
      | kubectl replace --raw "/api/v1/namespaces/${ns}/finalize" -f -
  done
}
# Delete the namespaces created by the Kubeflow deployment.
# Globals read: KUBEFLOW_EXTRA_FULL_DELETE ("true" additionally strips the
#   namespace finalizers through fix_terminating_ns).
# The istio-system and cert-manager namespaces are intentionally left alone.
function tear_down() {
  # TODO add a confirmation dialog
  # TODO allow limiting namespace list
  local -a namespaces=(kubeflow knative-eventing knative-serving)
  local -a delete_args=()
  local ns

  # `kubectl delete` waits for finalizers by default, so a namespace stuck in
  # Terminating would block here and the finalizer cleanup below would never
  # be reached. When that cleanup was requested, do not wait for the deletion.
  if [[ "${KUBEFLOW_EXTRA_FULL_DELETE:-}" == "true" ]]; then
    delete_args+=(--wait=false)
  fi

  echo "Tearing down Kubeflow installation!"
  echo "Removing all objects in Kubernetes namespaces: ${namespaces[*]}"
  echo
  echo "WARNING: This script does not delete the istio-system or cert-manager namespaces,"
  echo "because these are commonly used by other applications."
  echo
  echo "If you want to remove these namespaces, please do so manually by running:"
  echo " kubectl delete namespace istio-system cert-manager"
  echo

  for ns in "${namespaces[@]}"; do
    kubectl delete namespace "${ns}" "${delete_args[@]}"
  done

  # Namespaces can be left in a terminating state after deletion; removing
  # their finalizers is a bit hacky but resolves it.
  if [[ "${KUBEFLOW_EXTRA_FULL_DELETE:-}" == "true" ]]; then
    echo "Removing finalizers from all namespaces: ${namespaces[*]}"
    fix_terminating_ns "${namespaces[@]}"
  fi
}
# Wait until the Kubeflow deployments are available and the dashboard answers.
# Globals read: KUBEFLOW_TIMEOUT (seconds), KUBEFLOW_POLL_DEPLOYMENTS
#   (space-separated deployment names), kf_url (dashboard URL).
# Exits the script with status 1 when the deployments do not become available
# or the URL never answers.
function poll_url() {
  local elapsed=0
  local homepage_up=false

  # KUBEFLOW_POLL_DEPLOYMENTS is deliberately unquoted: every name in the list
  # has to become its own argument.
  # shellcheck disable=SC2086
  if ! kubectl wait --for=condition=available --timeout="${KUBEFLOW_TIMEOUT}s" -n kubeflow deployments ${KUBEFLOW_POLL_DEPLOYMENTS}; then
    echo "Kubeflow did not complete deployment within ${KUBEFLOW_TIMEOUT} seconds"
    exit 1
  fi

  # It typically takes ~5 minutes for all pods and services to start, so we
  # poll every 15 seconds until KUBEFLOW_TIMEOUT is used up.
  while [ "${elapsed}" -lt "${KUBEFLOW_TIMEOUT}" ]; do
    # XXX: This validates that the webapp is responding, it does not guarantee functionality
    if curl -s --raw -L "${kf_url}"; then
      echo "Kubeflow homepage is up"
      homepage_up=true
      break
    fi
    elapsed=$(( elapsed + 15 ))
    sleep 15
  done

  # One last attempt after the loop, only when no earlier request succeeded.
  # The failure branch is a plain `if` so that `exit` terminates the script;
  # inside `( ... )` it would only leave a subshell.
  if [[ "${homepage_up}" != "true" ]] && ! curl -s --raw -L "${kf_url}"; then
    echo "Kubeflow did not respond within ${KUBEFLOW_TIMEOUT} seconds"
    exit 1 # Fail if we didn't come up in time.
  fi
}
# Look up how the Kubeflow dashboard can be reached and publish the URLs.
# Globals written: master_ip, nodePort, secure_nodePort, lb_ip
# Globals exported: kf_url (http NodePort), secure_kf_url (https NodePort),
#   lb_url (https LoadBalancer address)
# The custom-columns expressions are single-quoted because they contain
# `*`, `?` and `[...]`, which the shell would otherwise treat as glob patterns.
function get_url() {
  # Shared prefix of the three queries against the Istio ingress gateway service
  local -a gateway_query=(kubectl get svc -n istio-system istio-ingressgateway --no-headers)

  # Get LoadBalancer and NodePorts.
  # The first comma-separated address of the first control-plane node is used.
  master_ip="$(kubectl get nodes -l node-role.kubernetes.io/control-plane= --no-headers \
    -o 'custom-columns=IP:.status.addresses.*.address' | cut -f1 -d, | head -1)"
  nodePort="$("${gateway_query[@]}" -o 'custom-columns=PORT:.spec.ports[?(@.name=="http2")].nodePort')"
  secure_nodePort="$("${gateway_query[@]}" -o 'custom-columns=PORT:.spec.ports[?(@.name=="https")].nodePort')"
  lb_ip="$("${gateway_query[@]}" -o 'custom-columns=:.status.loadBalancer.ingress[0].ip')"

  export kf_url="http://${master_ip}:${nodePort}"
  export secure_kf_url="https://${master_ip}:${secure_nodePort}"
  export lb_url="https://${lb_ip}"
}
# Print where Kubeflow was installed, how to remove it, and the dashboard URL.
# Globals read: KF_DIR (optional), KUBEFLOW_MANIFESTS_DEST, kf_url
# KF_DIR is not assigned anywhere in this script, so the manifests location is
# shown when it is unset or empty.
# NOTE(review): KF_DIR may still be provided by the sourced common.sh or the
# environment; confirm before removing it.
function print_info() {
  echo
  echo "Kubeflow app installed to: ${KF_DIR:-${KUBEFLOW_MANIFESTS_DEST}}"
  echo
  echo "It may take several minutes for all services to start. Run 'kubectl get pods -n kubeflow' to verify"
  echo
  echo "To remove (excluding CRDs, istio, auth, and cert-manager), run: ${0} -d"
  echo
  # -D is a deprecated alias of -d and no longer performs a "full uninstall",
  # so point at the manual namespace removal instead.
  echo "To also remove istio and cert-manager, run: kubectl delete namespace istio-system cert-manager"
  echo
  echo "Kubeflow Dashboard (HTTP NodePort): ${kf_url}"
  echo
}
# Entry point: parse the flags, then run exactly one mode.
# Precedence when several flags are given: print (-p), delete (-d),
# wait (-w), clone only (-c); with none of them a full deployment is done.
# "$@" is quoted so every argument reaches get_opts as a single word.
get_opts "$@"

if [[ -n "${KUBEFLOW_PRINT:-}" && -n "${KUBEFLOW_DELETE:-}" ]]; then
  echo "Cannot specify print flag and delete flag"
  exit 2
elif [[ -n "${KUBEFLOW_PRINT:-}" ]]; then
  get_url
  print_info
elif [[ -n "${KUBEFLOW_DELETE:-}" ]]; then
  tear_down
elif [[ -n "${KUBEFLOW_WAIT:-}" ]]; then
  # get_url sets kf_url, which both print_info and poll_url use
  get_url
  print_info
  poll_url
elif [[ -n "${KUBEFLOW_CLONE:-}" ]]; then
  clone_repo
else
  install_dependencies
  clone_repo
  stand_up
  # install_mpi_operator # BUG: https://github.com/NVIDIA/deepops/issues/737
  get_url
  print_info
fi