-
Notifications
You must be signed in to change notification settings - Fork 716
/
Copy pathservice.go
128 lines (110 loc) · 4 KB
/
service.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
// Copyright 2018 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package tensorflow provides a Kubernetes controller for a TFJob resource.
package tensorflow
import (
"fmt"
"strconv"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
common "github.com/kubeflow/common/job_controller/api/v1"
tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1"
"github.com/kubeflow/tf-operator/pkg/common/jobcontroller"
tflogger "github.com/kubeflow/tf-operator/pkg/logger"
)
// reconcileServices walks the services that belong to one replica type of
// tfjob and creates a service for every replica index that has none.
//
// services is the candidate list; it is narrowed to the replica type with
// FilterServicesForReplicaType and then grouped with GetServiceSlices, whose
// positions are treated here as replica indices. An index with more than one
// service is only reported with a warning; nothing is deleted.
//
// The first error from filtering or from creating a service is returned
// unchanged, so the caller can requeue the tfjob.
func (tc *TFController) reconcileServices(
	tfjob *tfv1.TFJob,
	services []*v1.Service,
	rtype tfv1.TFReplicaType,
	spec *common.ReplicaSpec) error {

	// The replica type is used in its lower-case form for filtering and logging.
	replicaType := strings.ToLower(string(rtype))
	wanted := int(*spec.Replicas)

	// Keep only the services of this replica type.
	owned, err := tc.FilterServicesForReplicaType(services, replicaType)
	if err != nil {
		return err
	}

	grouped := tc.GetServiceSlices(owned, wanted, tflogger.LoggerForReplica(tfjob, replicaType))
	for idx, bucket := range grouped {
		switch count := len(bucket); {
		case count > 1:
			tflogger.LoggerForReplica(tfjob, replicaType).Warningf("We have too many services for %s %d", replicaType, idx)
			// TODO(gaocegege): Kill some services.
		case count == 0:
			tflogger.LoggerForReplica(tfjob, replicaType).Infof("need to create new service: %s-%d", replicaType, idx)
			if err := tc.createNewService(tfjob, rtype, strconv.Itoa(idx), spec); err != nil {
				return err
			}
		}
	}

	return nil
}
// createNewService creates the service for the replica of type rtype at the
// given index of tfjob.
//
// The service is created with ClusterIP "None", is labelled and selected by
// the job labels plus the replica type and replica index labels, and exposes
// the single port returned by GetPortFromTFJob under tfv1.DefaultPortName.
// spec is accepted for signature compatibility and is not read here.
//
// A creation expectation is registered only after everything that can fail
// locally (key and port lookup) has succeeded, and it is withdrawn again when
// the create call fails with a non-timeout error. Otherwise the expectation
// would stay pending even though no service will ever be observed for it.
//
// It returns nil when the service was created or when the create call timed
// out, and the underlying error in every other case.
func (tc *TFController) createNewService(tfjob *tfv1.TFJob, rtype tfv1.TFReplicaType, index string, spec *common.ReplicaSpec) error {
	tfjobKey, err := KeyFunc(tfjob)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for tfjob object %#v: %v", tfjob, err))
		return err
	}

	// Convert TFReplicaType to lower string.
	rt := strings.ToLower(string(rtype))

	// Resolve the port before touching the expectations: if this fails no
	// service is going to be created, so no creation must be expected.
	port, err := GetPortFromTFJob(tfjob, rtype)
	if err != nil {
		return err
	}

	// Append tfReplicaTypeLabel and tfReplicaIndexLabel labels.
	labels := tc.GenLabels(tfjob.Name)
	labels[tfReplicaTypeLabel] = rt
	labels[tfReplicaIndexLabel] = index

	service := &v1.Service{
		Spec: v1.ServiceSpec{
			ClusterIP: "None",
			Selector:  labels,
			Ports: []v1.ServicePort{
				{
					Name: tfv1.DefaultPortName,
					Port: port,
				},
			},
		},
	}
	service.Name = jobcontroller.GenGeneralName(tfjob.Name, rt, index)
	service.Labels = labels

	// Create OwnerReference.
	controllerRef := tc.GenOwnerReference(tfjob)

	// Register the expected creation right before issuing the create call.
	expectationServicesKey := jobcontroller.GenExpectationServicesKey(tfjobKey, rt)
	if err := tc.Expectations.ExpectCreations(expectationServicesKey, 1); err != nil {
		return err
	}

	err = tc.ServiceControl.CreateServicesWithControllerRef(tfjob.Namespace, service, tfjob, controllerRef)
	if err == nil {
		return nil
	}
	if errors.IsTimeout(err) {
		// Service is created but its initialization has timed out.
		// If the initialization is successful eventually, the
		// controller will observe the creation via the informer.
		// If the initialization fails, or if the service keeps
		// uninitialized for a long time, the informer will not
		// receive any update, and the controller will create a new
		// service when the expectation expires.
		return nil
	}

	// The create call failed, so the informer will not report this service.
	// Mark the expected creation as observed so the pending expectation does
	// not hold back the next reconciliation until it expires.
	tc.Expectations.CreationObserved(expectationServicesKey)
	return err
}