Skip to content

Commit

Permalink
fix collect gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
shaowenchen committed Nov 26, 2024
1 parent aba45b6 commit 0a89426
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 15 deletions.
10 changes: 10 additions & 0 deletions api/v1/pipelinerun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ limitations under the License.
package v1

import (
"fmt"
opsconstants "github.com/shaowenchen/ops/pkg/constants"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"time"
)

// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
Expand Down Expand Up @@ -120,6 +122,14 @@ func (obj *PipelineRun) CopyWithOutVersion() *PipelineRun {
}
}

// SetEnv injects runtime-generated variables into the PipelineRun's spec.
//
// It makes sure Spec.Variables is a non-nil map and then stores the current
// wall-clock time, formatted as decimal microseconds since the Unix epoch,
// under the "TIME" key. A value already stored under "TIME" is overwritten on
// every call; all other entries are left untouched. The receiver is mutated
// in place and returned.
func (obj *PipelineRun) SetEnv() *PipelineRun {
	vars := obj.Spec.Variables
	if vars == nil {
		// Writing to a nil map panics, so allocate one lazily.
		vars = map[string]string{}
		obj.Spec.Variables = vars
	}
	vars["TIME"] = fmt.Sprint(time.Now().UnixMicro())
	return obj
}

func NewPipelineRun(p *Pipeline) *PipelineRun {
if p == nil {
return &PipelineRun{}
Expand Down
2 changes: 2 additions & 0 deletions controllers/pipelinerun_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ func (r *PipelineRunReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if opsconstants.IsFinishedStatus(pr.Status.RunStatus) {
return ctrl.Result{}, nil
}
// insert env
pr.SetEnv()
// if is others cluster, send and just sync status
cluster := r.isOtherCluster(pr)
if cluster != nil {
Expand Down
14 changes: 6 additions & 8 deletions controllers/taskrun_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/uuid"
cron "github.com/robfig/cron/v3"
opsv1 "github.com/shaowenchen/ops/api/v1"
opsconstants "github.com/shaowenchen/ops/pkg/constants"
Expand Down Expand Up @@ -219,6 +218,7 @@ func (r *TaskRunReconciler) run(logger *opslog.Logger, ctx context.Context, t *o
}
} else {
cluster := opsv1.NewCurrentCluster()
logger.Info.Println(fmt.Sprintf("run task %s on cluster %s", t.GetUniqueKey(), cluster.Name))
err = r.runTaskOnKube(cliLogger, ctx, t, tr, &cluster)
if err != nil {
logger.Error.Println(err)
Expand Down Expand Up @@ -252,7 +252,6 @@ func (r *TaskRunReconciler) runTaskOnHost(logger *opslog.Logger, ctx context.Con
vars["NAMESPACE"] = tr.Namespace
vars["OPSSERVER_ENDPOINT"] = r.getOpsServerEndpoint(t.Namespace)
vars["EVENT_CLUSTER"] = opsconstants.GetEnvEventCluster()
vars["UUID"] = uuid.New().String()

// insert host labels
for k, v := range h.ObjectMeta.Labels {
Expand Down Expand Up @@ -287,11 +286,11 @@ func (r *TaskRunReconciler) runTaskOnKube(logger *opslog.Logger, ctx context.Con
return err
}
// if find host in cluster, and can connect
host, _ := kc.GetHost(opsconstants.OpsNamespace, tr.GetHost(t))
if host != nil {
logger.Info.Println("use host credentials to run cluster task " + tr.Name)
return r.runTaskOnHost(logger, ctx, *kc.OpsClient, t, tr, host)
}
// host, _ := kc.GetHost(opsconstants.OpsNamespace, tr.GetHost(t))
// if host != nil {
// logger.Info.Println("use host credentials to run cluster task " + tr.Name)
// return r.runTaskOnHost(logger, ctx, *kc.OpsClient, t, tr, host)
// }
// else use pod to run task
// build options
hostStr := tr.GetHost(t)
Expand Down Expand Up @@ -323,7 +322,6 @@ func (r *TaskRunReconciler) runTaskOnKube(logger *opslog.Logger, ctx context.Con
vars["OPSSERVER_ENDPOINT"] = r.getOpsServerEndpoint(t.Namespace)
vars["TASK"] = t.Name
vars["TASKRUN"] = tr.Name
vars["UUID"] = uuid.New().String()
opstask.RunTaskOnKube(logger, t, tr, kc, &node,
opsoption.TaskOption{
Variables: vars,
Expand Down
13 changes: 6 additions & 7 deletions tasks/collect-gpu-log.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,16 @@ spec:
steps:
- name: run col_gpu_log.sh
content: |
mkdir -p /tmp/collect-gpu-log || true
rm -rf /tmp/collect-gpu-log/*
cd /tmp/collect-gpu-log
mkdir -p /tmp/collect-gpu-log-${TIME}
cd /tmp/collect-gpu-log-${TIME}
curl -sfL https://ghp.ci/https://raw.githubusercontent.com/shaowenchen/hubimage/main/ai/col_gpu_log.sh | bash
mv *.tar.gz collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz
mv *.tar.gz collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz
- name: upload to s3
remotefile: s3://collect-gpu-log/collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz
localfile: /tmp/collect-gpu-log/collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz
remotefile: s3://collect-gpu-log/collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz
# Must point into the per-run directory that the "run col_gpu_log.sh" step creates and writes the archive to.
localfile: /tmp/collect-gpu-log-${TIME}/collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz
direction: upload
- name: get 2h shared link
content: |
curl -sfL https://ks3util-version-update.ks3-cn-beijing.ksyuncs.com/2.6.0/ks3util-linux-amd64 -o /usr/local/bin/ks3util >/dev/null
chmod +x /usr/local/bin/ks3util
/usr/local/bin/ks3util sign ks3://${bucket}/collect-gpu-log/collect-gpu-log-${HOSTNAME}-${UUID}.tar.gz --endpoint ${endpoint} --timeout 7200 -i ${ak} -k ${sk}
/usr/local/bin/ks3util sign ks3://${bucket}/collect-gpu-log/collect-gpu-log-${HOSTNAME}-${TIME}.tar.gz --endpoint ${endpoint} --timeout 7200 -i ${ak} -k ${sk}

0 comments on commit 0a89426

Please sign in to comment.