Kubernetes rolling update not working

6/13/2018

I have two Kubernetes installs for different projects that, as best as I can tell, have equivalent configuration in the areas that matter, yet the two perform rolling updates differently.

Both were installed on AWS using kops.

System 1 (k8s v1.7.0) - Killing a pod in a deployment via the k8s web GUI creates the new pod first, and only terminates the old pod once the new one is running. No downtime.

System 2 (k8s v1.8.4) - Killing a pod in a deployment via the k8s web GUI kills the old pod immediately and only then creates the new pod. Causes brief downtime.

Any suggestions or ideas as to why they behave differently, and how I can get System 2 to create the new pod before terminating the old one?

System 1 Deployment

{
  "kind": "Deployment",
  "apiVersion": "extensions/v1beta1",
  "metadata": {
    "name": "proxy-deployment",
    "namespace": "namespace",
    "selfLink": "/apis/extensions/v1beta1/namespaces/namespace/deployments/proxy-deployment",
    "uid": "d12778ba-8950-11e7-9e69-12f38e55b21a",
    "resourceVersion": "31538492",
    "generation": 7,
    "creationTimestamp": "2017-08-25T04:49:45Z",
    "labels": {
      "app": "proxy"
    },
    "annotations": {
      "deployment.kubernetes.io/revision": "6",
      "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"apps/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"name\":\"proxy-deployment\",\"namespace\":\"namespace\"},\"spec\":{\"replicas\":2,\"template\":{\"metadata\":{\"labels\":{\"app\":\"proxy\"}},\"spec\":{\"containers\":[{\"image\":\"xxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/nginx-proxy-xxxxxx:latest\",\"name\":\"proxy-ctr\",\"ports\":[{\"containerPort\":80},{\"containerPort\":8080}]}]}}}}\n"
    }
  },
  "spec": {
    "replicas": 1,
    "selector": {
      "matchLabels": {
        "app": "proxy"
      }
    },
    "template": {
      "metadata": {
        "creationTimestamp": null,
        "labels": {
          "app": "proxy",
          "date": "1522386390"
        }
      },
      "spec": {
        "containers": [
          {
            "name": "proxy-ctr",
            "image": "xxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/nginx-proxy-xxxxxx:latest",
            "ports": [
              {
                "containerPort": 80,
                "protocol": "TCP"
              },
              {
                "containerPort": 8080,
                "protocol": "TCP"
              }
            ],
            "resources": {},
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "imagePullPolicy": "Always"
          }
        ],
        "restartPolicy": "Always",
        "terminationGracePeriodSeconds": 30,
        "dnsPolicy": "ClusterFirst",
        "securityContext": {},
        "schedulerName": "default-scheduler"
      }
    },
    "strategy": {
      "type": "RollingUpdate",
      "rollingUpdate": {
        "maxUnavailable": "25%",
        "maxSurge": "25%"
      }
    },
    "revisionHistoryLimit": 2,
    "progressDeadlineSeconds": 600
  },
  "status": {
    "observedGeneration": 7,
    "replicas": 1,
    "updatedReplicas": 1,
    "readyReplicas": 1,
    "availableReplicas": 1,
    "conditions": [
      {
        "type": "Progressing",
        "status": "True",
        "lastUpdateTime": "2018-03-30T05:03:01Z",
        "lastTransitionTime": "2017-08-25T04:49:45Z",
        "reason": "NewReplicaSetAvailable",
        "message": "ReplicaSet \"proxy-deployment-1457650622\" has successfully progressed."
      },
      {
        "type": "Available",
        "status": "True",
        "lastUpdateTime": "2018-06-01T06:55:12Z",
        "lastTransitionTime": "2018-06-01T06:55:12Z",
        "reason": "MinimumReplicasAvailable",
        "message": "Deployment has minimum availability."
      }
    ]
  }
}

System 2 Deployment

{
  "kind": "Deployment",
  "apiVersion": "extensions/v1beta1",
  "metadata": {
    "name": "prodefault-deployment",
    "namespace": "namespace",
    "selfLink": "/apis/extensions/v1beta1/namespaces/namespace/deployments/prodefault-deployment",
    "uid": "a80528c8-eb79-11e7-9364-068125440f70",
    "resourceVersion": "25203392",
    "generation": 10,
    "creationTimestamp": "2017-12-28T02:49:00Z",
    "labels": {
      "app": "prodefault"
    },
    "annotations": {
      "deployment.kubernetes.io/revision": "7",
      "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"apps/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"name\":\"prodefault-deployment\",\"namespace\":\"namespace\"},\"spec\":{\"replicas\":1,\"strategy\":{\"rollingUpdate\":{\"maxSurge\":\"25%\",\"maxUnavailable\":\"25%\"},\"type\":\"RollingUpdate\"},\"template\":{\"metadata\":{\"labels\":{\"app\":\"prodefault\"}},\"spec\":{\"containers\":[{\"image\":\"xxxxxxxxxxxx.dkr.ecr.us-west-2.amazonaws.com/xxxxxxxxxxx-pro-default:latest\",\"livenessProbe\":{\"httpGet\":{\"path\":\"/healthchk\",\"port\":80},\"initialDelaySeconds\":120,\"periodSeconds\":15,\"timeoutSeconds\":1},\"name\":\"prodefault-ctr\",\"ports\":[{\"containerPort\":80}],\"readinessProbe\":{\"httpGet\":{\"path\":\"/healthchk\",\"port\":80},\"initialDelaySeconds\":5,\"periodSeconds\":2,\"timeoutSeconds\":3},\"resources\":{\"limits\":{\"cpu\":\"1\",\"memory\":\"1024Mi\"},\"requests\":{\"cpu\":\"150m\",\"memory\":\"256Mi\"}},\"volumeMounts\":[{\"mountPath\":\"/var/www/html/homes\",\"name\":\"efs-pvc\"},{\"mountPath\":\"/var/xero\",\"name\":\"xero-key\",\"readOnly\":true},{\"mountPath\":\"/var/gcal\",\"name\":\"gcal-json\",\"readOnly\":true}]}],\"volumes\":[{\"name\":\"efs-pvc\",\"persistentVolumeClaim\":{\"claimName\":\"tio-pv-claim-homes\"}},{\"name\":\"xero-key\",\"secret\":{\"secretName\":\"xero-key\"}},{\"name\":\"gcal-json\",\"secret\":{\"secretName\":\"gcaljson\"}}]}}}}\n"
    }
  },
  "spec": {
    "replicas": 1,
    "selector": {
      "matchLabels": {
        "app": "prodefault"
      }
    },
    "template": {
      "metadata": {
        "creationTimestamp": null,
        "labels": {
          "app": "prodefault"
        }
      },
      "spec": {
        "volumes": [
          {
            "name": "efs-pvc",
            "persistentVolumeClaim": {
              "claimName": "tio-pv-claim-homes"
            }
          },
          {
            "name": "xero-key",
            "secret": {
              "secretName": "xero-key",
              "defaultMode": 420
            }
          },
          {
            "name": "gcal-json",
            "secret": {
              "secretName": "gcaljson",
              "defaultMode": 420
            }
          }
        ],
        "containers": [
          {
            "name": "prodefault-ctr",
            "image": "xxxxxxxxxxxx.dkr.ecr.us-west-2.amazonaws.com/xxxxxxxxxxx-pro-default:latest",
            "ports": [
              {
                "containerPort": 80,
                "protocol": "TCP"
              }
            ],
            "resources": {
              "limits": {
                "cpu": "1",
                "memory": "1Gi"
              },
              "requests": {
                "cpu": "150m",
                "memory": "256Mi"
              }
            },
            "volumeMounts": [
              {
                "name": "efs-pvc",
                "mountPath": "/var/www/html/homes"
              },
              {
                "name": "xero-key",
                "readOnly": true,
                "mountPath": "/var/xero"
              },
              {
                "name": "gcal-json",
                "readOnly": true,
                "mountPath": "/var/gcal"
              }
            ],
            "livenessProbe": {
              "httpGet": {
                "path": "/healthchk",
                "port": 80,
                "scheme": "HTTP"
              },
              "initialDelaySeconds": 120,
              "timeoutSeconds": 1,
              "periodSeconds": 15,
              "successThreshold": 1,
              "failureThreshold": 3
            },
            "readinessProbe": {
              "httpGet": {
                "path": "/healthchk",
                "port": 80,
                "scheme": "HTTP"
              },
              "initialDelaySeconds": 5,
              "timeoutSeconds": 3,
              "periodSeconds": 2,
              "successThreshold": 1,
              "failureThreshold": 3
            },
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "imagePullPolicy": "Always"
          }
        ],
        "restartPolicy": "Always",
        "terminationGracePeriodSeconds": 30,
        "dnsPolicy": "ClusterFirst",
        "securityContext": {},
        "schedulerName": "default-scheduler"
      }
    },
    "strategy": {
      "type": "RollingUpdate",
      "rollingUpdate": {
        "maxUnavailable": "25%",
        "maxSurge": "25%"
      }
    },
    "revisionHistoryLimit": 2,
    "progressDeadlineSeconds": 600
  },
  "status": {
    "observedGeneration": 10,
    "replicas": 1,
    "updatedReplicas": 1,
    "readyReplicas": 1,
    "availableReplicas": 1,
    "conditions": [
      {
        "type": "Progressing",
        "status": "True",
        "lastUpdateTime": "2018-01-15T06:07:52Z",
        "lastTransitionTime": "2017-12-28T03:00:16Z",
        "reason": "NewReplicaSetAvailable",
        "message": "ReplicaSet \"prodefault-deployment-9685f46d4\" has successfully progressed."
      },
      {
        "type": "Available",
        "status": "True",
        "lastUpdateTime": "2018-06-13T07:12:41Z",
        "lastTransitionTime": "2018-06-13T07:12:41Z",
        "reason": "MinimumReplicasAvailable",
        "message": "Deployment has minimum availability."
      }
    ]
  }
}
-- Mark Walker
kops
kubernetes

1 Answer

6/13/2018

I noticed both deployments have the following rolling update strategy defined:

"strategy": {
  "type": "RollingUpdate",
  "rollingUpdate": {
    "maxUnavailable": "25%",
    "maxSurge": "25%"
  }
},

With this strategy, a normal rolling update triggered through 'kubectl set image' or 'kubectl apply' should only terminate the old pod after the new pod has been created.
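
For example, a rolling update on System 2 could be triggered from the command line like this (just a sketch: the deployment and container names are taken from the manifest you posted, and the image tag "new-tag" is a placeholder for whatever tag you actually push):

# Trigger a rolling update by pointing the container at a new image tag
kubectl set image deployment/prodefault-deployment prodefault-ctr=xxxxxxxxxxxx.dkr.ecr.us-west-2.amazonaws.com/xxxxxxxxxxx-pro-default:new-tag

# Or edit the manifest and re-apply it (filename is hypothetical)
kubectl apply -f prodefault-deployment.yaml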

So the different behavior between the two systems may come from the dashboard. I suspect you are running different dashboard versions in your two systems: according to the dashboard compatibility matrix, Kubernetes v1.7 is supported by dashboard 1.7, while Kubernetes v1.8 is supported by dashboard 1.8. Different dashboard versions may treat 'kill pod' as a different action, but I can't say for sure.

If you are running dashboard 1.7 on your v1.8 cluster, try upgrading the dashboard first.

And lastly, don't use 'kill pod' to perform a rolling update.
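
If you trigger the rollout with 'set image' or 'apply' instead, you can watch it to confirm the new pod becomes ready before the old one is terminated (standard kubectl commands; the deployment name and label come from the manifest above):

# Wait for the rollout to complete
kubectl rollout status deployment/prodefault-deployment

# Watch the pods surge and terminate in real time
kubectl get pods -l app=prodefault -w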

-- Kun Li
Source: StackOverflow