Add signal handling #98

Closed
jt-nti opened this issue Jan 2, 2024 · 8 comments
Labels
wontfix This will not be worked on

Comments

@jt-nti
Member

jt-nti commented Jan 2, 2024

Ideally the k8s pod should be terminated if the peer is terminated. This is related to #88; however, it will not always be possible to add an owner reference.
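
For illustration, here is a minimal sketch of the kind of signal handling this issue describes: the builder's run process traps SIGTERM when the peer terminates it and deletes the chaincode pod on the way out. This is a sketch only, assuming client-go with in-cluster config; the namespace and pod name are placeholders.

package main

import (
	"context"
	"os"
	"os/signal"
	"syscall"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	// Assumption: the builder runs in-cluster; it could equally load a kubeconfig.
	config, err := rest.InClusterConfig()
	if err != nil {
		panic(err.Error())
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err.Error())
	}

	// Cancel the context when the peer sends SIGTERM/SIGINT to this process.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	// ... create the chaincode pod here, then block until the peer terminates us ...
	<-ctx.Done()

	// Best-effort cleanup: delete the chaincode pod before exiting.
	// "default" and "chaincode101" are placeholders for illustration.
	if err := clientset.CoreV1().Pods("default").Delete(
		context.Background(), "chaincode101", metav1.DeleteOptions{}); err != nil {
		os.Exit(1)
	}
}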

@jt-nti jt-nti added the good first issue Good for newcomers label Jan 2, 2024
@jt-nti jt-nti added this to the Version 1 milestone Jan 2, 2024
@y12studio

Based on the use of liveness probes in the operator-lifecycle-manager project, maybe a livenessProbe is a feasible way to shut down the chaincode container so it stops consuming resources?

https://github.com/operator-framework/operator-lifecycle-manager/blob/1bb6009089171393c94c944257ec7b4d06834551/pkg/controller/registry/reconciler/reconciler.go#L170

livenessProbe example

package main

import (
	"context"
	"flag"
	"fmt"
	"path/filepath"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/client-go/util/homedir"
)

func main() {
	// Determine the path to the kubeconfig file.
	var kubeconfig *string
	if home := homedir.HomeDir(); home != "" {
		kubeconfig = flag.String("kubeconfig", filepath.Join(home, ".kube", "config"), "(optional) absolute path to the kubeconfig file")
	} else {
		kubeconfig = flag.String("kubeconfig", "", "absolute path to the kubeconfig file")
	}
	flag.Parse()

	// Use the current context in kubeconfig.
	config, err := clientcmd.BuildConfigFromFlags("", *kubeconfig)
	if err != nil {
		panic(err.Error())
	}

	// Create a clientset for interacting with the kubernetes cluster.
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err.Error())
	}

	// ShareProcessNamespace takes a *bool, so declare a named variable.
	shareProcessNamespace := true
	// Define the pod with two containers.
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: "chaincode101",
			Labels: map[string]string{
				"app":                          "chaincode101",
				"app.kubernetes.io/name":       "fabric",
				"app.kubernetes.io/component":  "chaincode",
				"app.kubernetes.io/created-by": "fabric-builder-k8s",
				"app.kubernetes.io/managed-by": "fabric-builder-k8s",
				"fabric-builder-k8s-mspid":     "chaincodeData.MspID",
				"fabric-builder-k8s-peerid":    "peerID",
			},
			Annotations: map[string]string{
				"fabric-builder-k8s-ccid": "chaincodeData.ChaincodeID",
			},
		},
		Spec: corev1.PodSpec{
			RestartPolicy:         corev1.RestartPolicyNever,
			ShareProcessNamespace: &shareProcessNamespace,
			Containers: []corev1.Container{
				{
					Name:    "chaincode",
					Image:   "busybox",
					Command: []string{"sh", "-c", "httpd -f -v -p 3000"},
				},
				{
					Name:  "liveness",
					Image: "busybox",
					Args: []string{
						"/bin/sh",
						"-c",
						"touch /tmp/healthy; sleep 30; rm -f /tmp/healthy; kill -SIGTERM $(pgrep httpd) ;sleep 600",
					},
					LivenessProbe: &corev1.Probe{
						ProbeHandler: corev1.ProbeHandler{
							Exec: &corev1.ExecAction{
								Command: []string{"cat", "/tmp/healthy"},
							},
						},
						InitialDelaySeconds: 5,
						PeriodSeconds:       5,
					},
				},
			},
		},
	}

	// Create the pod in the default namespace.
	pod, err = clientset.CoreV1().Pods("default").Create(context.TODO(), pod, metav1.CreateOptions{})
	if err != nil {
		panic(err.Error())
	}

	fmt.Printf("Pod %s created\n", pod.Name)
}

testing result

$ go build -o create-pod 

$ ./create-pod
Pod chaincode101 created

$ k get pods
NAME           READY   STATUS    RESTARTS   AGE
chaincode101   2/2     Running   0          6s

$ k describe pods chaincode101

Name:             chaincode101
Namespace:        default
Priority:         0
Service Account:  default
Node:             k3d-mycluster-server-0/172.18.0.2
Start Time:       Wed, 31 Jan 2024 11:48:04 +0800
Labels:           app=chaincode101
                  app.kubernetes.io/component=chaincode
                  app.kubernetes.io/created-by=fabric-builder-k8s
                  app.kubernetes.io/managed-by=fabric-builder-k8s
                  app.kubernetes.io/name=fabric
                  fabric-builder-k8s-mspid=chaincodeData.MspID
                  fabric-builder-k8s-peerid=peerID
Annotations:      fabric-builder-k8s-ccid: chaincodeData.ChaincodeID
Status:           Failed
IP:               10.42.0.14
IPs:
  IP:  10.42.0.14
Containers:
  chaincode:
    Container ID:  containerd://92f69b9838a4a42a52871a139083b041940ae9c6f00d02bf9820804a1454e792
    Image:         busybox
    Image ID:      docker.io/library/busybox@sha256:6d9ac9237a84afe1516540f40a0fafdc86859b2141954b4d643af7066d598b74
    Port:          <none>
    Host Port:     <none>
    Command:
      sh
      -c
      httpd -f -v -p 3000
    State:          Terminated
      Reason:       Error
      Exit Code:    143
      Started:      Wed, 31 Jan 2024 11:48:06 +0800
      Finished:     Wed, 31 Jan 2024 11:48:38 +0800
    Ready:          False
    Restart Count:  0
    Environment:    <none>
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-2wbxm (ro)
  liveness:
    Container ID:  containerd://8c449fe46a3c782376d3b141901c071f15bac43a8c639f18a5715f7c46432869
    Image:         busybox
    Image ID:      docker.io/library/busybox@sha256:6d9ac9237a84afe1516540f40a0fafdc86859b2141954b4d643af7066d598b74
    Port:          <none>
    Host Port:     <none>
    Args:
      /bin/sh
      -c
      touch /tmp/healthy; sleep 30; rm -f /tmp/healthy; kill -SIGTERM $(pgrep httpd) ;sleep 600
    State:          Terminated
      Reason:       Error
      Exit Code:    143
      Started:      Wed, 31 Jan 2024 11:48:08 +0800
      Finished:     Wed, 31 Jan 2024 11:48:49 +0800
    Ready:          False
    Restart Count:  0
    Liveness:       exec [cat /tmp/healthy] delay=5s timeout=1s period=5s #success=1 #failure=3
    Environment:    <none>
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-2wbxm (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  kube-api-access-2wbxm:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:                   BestEffort
Node-Selectors:              <none>
Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                             node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type     Reason     Age                From               Message
  ----     ------     ----               ----               -------
  Normal   Scheduled  99s                default-scheduler  Successfully assigned default/chaincode101 to k3d-mycluster-server-0
  Normal   Pulling    100s               kubelet            Pulling image "busybox"
  Normal   Pulled     98s                kubelet            Successfully pulled image "busybox" in 1.673769354s (1.673775908s including waiting)
  Normal   Created    98s                kubelet            Created container chaincode
  Normal   Started    98s                kubelet            Started container chaincode
  Normal   Pulling    98s                kubelet            Pulling image "busybox"
  Normal   Pulled     96s                kubelet            Successfully pulled image "busybox" in 1.633507923s (1.633514538s including waiting)
  Normal   Created    96s                kubelet            Created container liveness
  Normal   Started    96s                kubelet            Started container liveness
  Warning  Unhealthy  55s (x3 over 65s)  kubelet            Liveness probe failed: cat: can't open '/tmp/healthy': No such file or directory
  Normal   Killing    55s                kubelet            Stopping container liveness

#
# One container can terminate another container in the same pod; once its
# processes are terminated, the pod itself should no longer actively consume
# any CPU/memory resources.
#
$ k get pods
NAME           READY   STATUS   RESTARTS   AGE
chaincode101   0/2     Error    0          107s

@jt-nti
Member Author

jt-nti commented Jan 31, 2024

Thanks @y12studio. If I understand correctly, the liveness probe will handle situations where the chaincode stops responding. That sounds like a really good thing to add; however, this issue is to handle the case where the chaincode is OK but the peer goes away. Would you be able to open a new issue? If not, I'm happy to do that.

@y12studio

I originally intended to use this livenessProbe to deal with the while-do problem, but it didn't seem to work, so I used another example below to illustrate the original idea.

apiVersion: v1
kind: Pod
metadata:
  name: cc-peer-demo
spec:
  restartPolicy: Never
  shareProcessNamespace: true
  containers:
  - name: chaincode-mock
    env:
    - name: CORE_PEER_ADDRESS
      value: "localhost"
    - name: CORE_CHAINCODE_ID_NAME
      value: "cc101"
    image: registry.k8s.io/etcd:3.5.1-0
    command: [ "/usr/local/bin/etcd", "--data-dir",  "/var/lib/etcd", "--listen-client-urls", "http://0.0.0.0:2379", "--advertise-client-urls", "http://127.0.0.1:2379", "--log-level", "debug"]
    ports:
    - containerPort: 2379
    #
    # Enabling the livenessProbe in this chaincode context would defeat peer-watch's kill:
    # the health checks would fail, and the kubelet would kill and restart the chaincode container.
    #
    #livenessProbe:
    #  grpc:
    #    port: 2379
    #  initialDelaySeconds: 10
  - name: peer-mock
    image: busybox
    command: 
    - sh
    - -c
    - |
      env
      httpd -f -v -p 3000 &
      sleep 20
      #
      # kill the httpd process to simulate the Fabric peer going down
      #
      kill -SIGTERM $(pgrep httpd)
      sleep 60
  - name: peer-watch
    image: busybox
    command: 
    - sh
    - -c
    - |
      # env | grep CORE_PEER_ADDRESS
      # Peer's host and port to be checked
      HOST="localhost"
      PORT=3000
      # Interval in seconds between checks
      INTERVAL=5
      while true; do
        # Use nc to check the port
        if nc -z $HOST $PORT; then
          echo "$(date): $HOST:$PORT is up."
        else
          echo "$(date): ERROR - $HOST:$PORT is down."
          #
          # kill the chaincode container
          #
          kill -SIGTERM $(pgrep etcd)
          exit
        fi
        sleep $INTERVAL
      done

testing

$ kubectl create -f pod.yaml

$ kubectl get pods
NAME           READY   STATUS    RESTARTS   AGE
cc-peer-demo   3/3     Running   0          7s

$ kubectl get pods
NAME           READY   STATUS   RESTARTS   AGE
cc-peer-demo   0/3     Error    0          5m46s

@jt-nti
Member Author

jt-nti commented Feb 1, 2024

Fabric peers do have a health endpoint, which I think would work for a liveness probe. I've opened #100 to think about what might be possible for chaincode containers.
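
For illustration, in the client-go style of the earlier example, a liveness probe against the peer's operations health endpoint might look like the sketch below. It assumes the peer's operations service is listening on its default port 9443 and serves /healthz; adjust for the actual operations.listenAddress.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	// Sketch only: probe the peer's operations service health endpoint.
	// Port 9443 and the /healthz path assume the peer's default
	// operations.listenAddress configuration.
	probe := &corev1.Probe{
		ProbeHandler: corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: "/healthz",
				Port: intstr.FromInt(9443),
			},
		},
		InitialDelaySeconds: 10,
		PeriodSeconds:       5,
	}
	fmt.Printf("%+v\n", probe)
}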

@y12studio

We could start by adding a cautionary note to the external chaincode documentation. Here's a suggested wording:

Warning: There are instances where the chaincode pod may continue to run even after its associated peer has disconnected. To conserve Kubernetes CPU and memory resources, it's advisable to remove these lingering pods. They can be identified and removed by their label: fabric-builder-k8s-peerid=peerID. For example, you can use the following command to locate these pods:

kubectl get pod -A -l app.kubernetes.io/created-by=fabric-builder-k8s,fabric-builder-k8s-peerid=peerID
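
Under the same assumption about the labels, the lingering pods could then be removed with:

kubectl delete pod -A -l app.kubernetes.io/created-by=fabric-builder-k8s,fabric-builder-k8s-peerid=peerID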

@jt-nti
Member Author

jt-nti commented Feb 2, 2024

That's a great idea. I've made a start on some documentation, so I'll add that.

@jt-nti
Member Author

jt-nti commented May 3, 2024

Hopefully this issue won't be necessary after switching to Kubernetes Jobs. See #119.

@jt-nti jt-nti added documentation Improvements or additions to documentation wontfix This will not be worked on and removed good first issue Good for newcomers documentation Improvements or additions to documentation labels May 3, 2024
@jt-nti jt-nti removed this from the Version 1 milestone May 24, 2024
@jt-nti
Member Author

jt-nti commented May 24, 2024

With #119, Kubernetes now cleans up chaincode jobs/pods when the peer is terminated.
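
For background, the owner-reference pattern mentioned at the top of the thread is how Kubernetes normally garbage-collects dependent objects; the sketch below is a hypothetical illustration of that pattern, not the approach actually adopted (for that, see #119). Owner references only work when the owner is a resource in the same cluster and namespace, which is why they are not always possible here.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Sketch only: an owner reference from the chaincode pod to its peer pod,
	// so the garbage collector removes the chaincode pod when the peer pod is
	// deleted. The UID must be the live peer pod's UID; "peer-pod-uid" and the
	// names are placeholders.
	chaincodePod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: "chaincode101",
			OwnerReferences: []metav1.OwnerReference{
				{
					APIVersion: "v1",
					Kind:       "Pod",
					Name:       "peer0",
					UID:        "peer-pod-uid",
				},
			},
		},
	}
	fmt.Println(chaincodePod.Name)
}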

@jt-nti jt-nti closed this as completed May 24, 2024