From 0a89426d2deaa821fec4c4eb76a56050b22739b7 Mon Sep 17 00:00:00 2001 From: shaowenchen Date: Tue, 26 Nov 2024 15:52:56 +0800 Subject: [PATCH] fix collect gpu --- api/v1/pipelinerun_types.go | 10 ++++++++++ controllers/pipelinerun_controller.go | 2 ++ controllers/taskrun_controller.go | 14 ++++++-------- tasks/collect-gpu-log.yaml | 13 ++++++------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/api/v1/pipelinerun_types.go b/api/v1/pipelinerun_types.go index a29abdc9..f6683f1e 100644 --- a/api/v1/pipelinerun_types.go +++ b/api/v1/pipelinerun_types.go @@ -17,9 +17,11 @@ limitations under the License. package v1 import ( + "fmt" opsconstants "github.com/shaowenchen/ops/pkg/constants" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "time" ) // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! @@ -120,6 +122,14 @@ func (obj *PipelineRun) CopyWithOutVersion() *PipelineRun { } } +func (obj *PipelineRun) SetEnv() *PipelineRun { + if obj.Spec.Variables == nil { + obj.Spec.Variables = make(map[string]string) + } + obj.Spec.Variables["TIME"] = fmt.Sprintf("%d", time.Now().UnixMicro()) + return obj +} + func NewPipelineRun(p *Pipeline) *PipelineRun { if p == nil { return &PipelineRun{} diff --git a/controllers/pipelinerun_controller.go b/controllers/pipelinerun_controller.go index 9ddde69a..4b7823ec 100644 --- a/controllers/pipelinerun_controller.go +++ b/controllers/pipelinerun_controller.go @@ -99,6 +99,8 @@ func (r *PipelineRunReconciler) Reconcile(ctx context.Context, req ctrl.Request) if opsconstants.IsFinishedStatus(pr.Status.RunStatus) { return ctrl.Result{}, nil } + // insert env + pr.SetEnv() // if is others cluster, send and just sync status cluster := r.isOtherCluster(pr) if cluster != nil { diff --git a/controllers/taskrun_controller.go b/controllers/taskrun_controller.go index 2d5b3264..d2c273e6 100644 --- a/controllers/taskrun_controller.go +++ b/controllers/taskrun_controller.go @@ -24,7 +24,6 @@ import ( 
"time" "github.com/google/go-cmp/cmp" - "github.com/google/uuid" cron "github.com/robfig/cron/v3" opsv1 "github.com/shaowenchen/ops/api/v1" opsconstants "github.com/shaowenchen/ops/pkg/constants" @@ -219,6 +218,7 @@ func (r *TaskRunReconciler) run(logger *opslog.Logger, ctx context.Context, t *o } } else { cluster := opsv1.NewCurrentCluster() + logger.Info.Println(fmt.Sprintf("run task %s on cluster %s", t.GetUniqueKey(), cluster.Name)) err = r.runTaskOnKube(cliLogger, ctx, t, tr, &cluster) if err != nil { logger.Error.Println(err) @@ -252,7 +252,6 @@ func (r *TaskRunReconciler) runTaskOnHost(logger *opslog.Logger, ctx context.Con vars["NAMESPACE"] = tr.Namespace vars["OPSSERVER_ENDPOINT"] = r.getOpsServerEndpoint(t.Namespace) vars["EVENT_CLUSTER"] = opsconstants.GetEnvEventCluster() - vars["UUID"] = uuid.New().String() // insert host labels for k, v := range h.ObjectMeta.Labels { @@ -287,11 +286,11 @@ func (r *TaskRunReconciler) runTaskOnKube(logger *opslog.Logger, ctx context.Con return err } // if find host in cluster, and can connect - host, _ := kc.GetHost(opsconstants.OpsNamespace, tr.GetHost(t)) - if host != nil { - logger.Info.Println("use host credentials to run cluster task " + tr.Name) - return r.runTaskOnHost(logger, ctx, *kc.OpsClient, t, tr, host) - } + // host, _ := kc.GetHost(opsconstants.OpsNamespace, tr.GetHost(t)) + // if host != nil { + // logger.Info.Println("use host credentials to run cluster task " + tr.Name) + // return r.runTaskOnHost(logger, ctx, *kc.OpsClient, t, tr, host) + // } // else use pod to run task // build options hostStr := tr.GetHost(t) @@ -323,7 +322,6 @@ func (r *TaskRunReconciler) runTaskOnKube(logger *opslog.Logger, ctx context.Con vars["OPSSERVER_ENDPOINT"] = r.getOpsServerEndpoint(t.Namespace) vars["TASK"] = t.Name vars["TASKRUN"] = tr.Name - vars["UUID"] = uuid.New().String() opstask.RunTaskOnKube(logger, t, tr, kc, &node, opsoption.TaskOption{ Variables: vars, diff --git a/tasks/collect-gpu-log.yaml 
b/tasks/collect-gpu-log.yaml index e6ac7cd9..d55b7153 100644 --- a/tasks/collect-gpu-log.yaml +++ b/tasks/collect-gpu-log.yaml @@ -25,17 +25,16 @@ spec: steps: - name: run col_gpu_log.sh content: | - mkdir -p /tmp/collect-gpu-log || true - rm -rf /tmp/collect-gpu-log/* - cd /tmp/collect-gpu-log + mkdir -p /tmp/collect-gpu-log-${TIME} + cd /tmp/collect-gpu-log-${TIME} curl -sfL https://ghp.ci/https://raw.githubusercontent.com/shaowenchen/hubimage/main/ai/col_gpu_log.sh | bash - mv *.tar.gz collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz + mv *.tar.gz collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz - name: upload to s3 - remotefile: s3://collect-gpu-log/collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz - localfile: /tmp/collect-gpu-log/collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz + remotefile: s3://collect-gpu-log/collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz + localfile: /tmp/collect-gpu-log-${TIME}/collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz direction: upload - name: get 2h shared link content: | curl -sfL https://ks3util-version-update.ks3-cn-beijing.ksyuncs.com/2.6.0/ks3util-linux-amd64 -o /usr/local/bin/ks3util >/dev/null chmod +x /usr/local/bin/ks3util - /usr/local/bin/ks3util sign ks3://${bucket}/collect-gpu-log/collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz --endpoint ${endpoint} --timeout 7200 -i ${ak} -k ${sk} + /usr/local/bin/ks3util sign ks3://${bucket}/collect-gpu-log/collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz --endpoint ${endpoint} --timeout 7200 -i ${ak} -k ${sk}