Single kafka pod keeps restarting

2/11/2022

Problem

Only one single pod is failing on a k8s node. Readiness and liveness probes are indicating long response times, despite the fact that the port is open and traffic between nodes is flowing. The termination error code is 137.

It's up immediately after being killed; however, port 9092 is not opened yet, and the whole recreation process of bringing the app up takes about 30 minutes.

General environment config

We have a k8s cluster consisting of 6 nodes (2 nodes per each of 3 racks, and the racks are kept in different DCs). Our Kafka is deployed using Helm, and each of its nodes is deployed on a different host, due to affinity/anti-affinity rules.

Kafka config

 - log.cleaner.min.compaction.lag.ms=0
 - offsets.topic.num.partitions=50
 - log.flush.interval.messages=9223372036854775807
 - controller.socket.timeout.ms=30000
 - principal.builder.class=null
 - log.flush.interval.ms=null
 - min.insync.replicas=1
 - num.recovery.threads.per.data.dir=1
 - sasl.mechanism.inter.broker.protocol=GSSAPI
 - fetch.purgatory.purge.interval.requests=1000
 - replica.socket.timeout.ms=30000
 - message.max.bytes=1048588
 - max.connection.creation.rate=2147483647
 - connections.max.reauth.ms=0
 - log.flush.offset.checkpoint.interval.ms=60000
 - zookeeper.clientCnxnSocket=null
 - quota.window.num=11
 - zookeeper.connect=zookeeper-service.kafka-shared-cluster.svc.cluster.local:2181/kafka
 - authorizer.class.name=
 - password.encoder.secret=null
 - num.replica.fetchers=1
 - alter.log.dirs.replication.quota.window.size.seconds=1
 - log.roll.jitter.hours=0
 - password.encoder.old.secret=null
 - log.cleaner.delete.retention.ms=86400000
 - queued.max.requests=500
 - log.cleaner.threads=1
 - sasl.kerberos.service.name=null
 - socket.request.max.bytes=104857600
 - log.message.timestamp.type=CreateTime
 - connections.max.idle.ms=600000
 - zookeeper.set.acl=false
 - delegation.token.expiry.time.ms=86400000
 - session.timeout.ms=null
 - max.connections=2147483647
 - transaction.state.log.num.partitions=50
 - listener.security.protocol.map=PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT
 - log.retention.hours=168
 - client.quota.callback.class=null
 - delete.records.purgatory.purge.interval.requests=1
 - log.roll.ms=null
 - replica.high.watermark.checkpoint.interval.ms=5000
 - replication.quota.window.size.seconds=1
 - sasl.kerberos.ticket.renew.window.factor=0.8
 - zookeeper.connection.timeout.ms=18000
 - metrics.recording.level=INFO
 - password.encoder.cipher.algorithm=AES/CBC/PKCS5Padding
 - replica.selector.class=null
 - max.connections.per.ip=2147483647
 - background.threads=10
 - quota.consumer.default=9223372036854775807
 - request.timeout.ms=30000
 - log.message.format.version=2.8-IV1
 - sasl.login.class=null
 - log.dir=/tmp/kafka-logs
 - log.segment.bytes=1073741824
 - replica.fetch.response.max.bytes=10485760
 - group.max.session.timeout.ms=1800000
 - port=9092
 - log.segment.delete.delay.ms=60000
 - log.retention.minutes=null
 - log.dirs=/kafka
 - controlled.shutdown.enable=true
 - socket.connection.setup.timeout.max.ms=30000
 - log.message.timestamp.difference.max.ms=9223372036854775807
 - password.encoder.key.length=128
 - sasl.login.refresh.min.period.seconds=60
 - transaction.abort.timed.out.transaction.cleanup.interval.ms=10000
 - sasl.kerberos.kinit.cmd=/usr/bin/kinit
 - log.cleaner.io.max.bytes.per.second=1.7976931348623157E308
 - auto.leader.rebalance.enable=true
 - leader.imbalance.check.interval.seconds=300
 - log.cleaner.min.cleanable.ratio=0.5
 - replica.lag.time.max.ms=30000
 - num.network.threads=3
 - sasl.client.callback.handler.class=null
 - metrics.num.samples=2
 - socket.send.buffer.bytes=102400
 - password.encoder.keyfactory.algorithm=null
 - socket.receive.buffer.bytes=102400
 - replica.fetch.min.bytes=1
 - broker.rack=null
 - unclean.leader.election.enable=false
 - offsets.retention.check.interval.ms=600000
 - producer.purgatory.purge.interval.requests=1000
 - metrics.sample.window.ms=30000
 - log.retention.check.interval.ms=300000
 - sasl.login.refresh.window.jitter=0.05
 - leader.imbalance.per.broker.percentage=10
 - controller.quota.window.num=11
 - advertised.host.name=null
 - metric.reporters=
 - quota.producer.default=9223372036854775807
 - auto.create.topics.enable=false
 - replica.socket.receive.buffer.bytes=65536
 - replica.fetch.wait.max.ms=500
 - password.encoder.iterations=4096
 - default.replication.factor=1
 - sasl.kerberos.principal.to.local.rules=DEFAULT
 - log.preallocate=false
 - transactional.id.expiration.ms=604800000
 - control.plane.listener.name=null
 - transaction.state.log.replication.factor=3
 - num.io.threads=8
 - sasl.login.refresh.buffer.seconds=300
 - offsets.commit.required.acks=-1
 - connection.failed.authentication.delay.ms=100
 - delete.topic.enable=true
 - quota.window.size.seconds=1
 - offsets.commit.timeout.ms=5000
 - log.cleaner.max.compaction.lag.ms=9223372036854775807
 - zookeeper.ssl.enabled.protocols=null
 - log.retention.ms=604800000
 - alter.log.dirs.replication.quota.window.num=11
 - log.cleaner.enable=true
 - offsets.load.buffer.size=5242880
 - controlled.shutdown.max.retries=3
 - offsets.topic.replication.factor=3
 - transaction.state.log.min.isr=1
 - sasl.kerberos.ticket.renew.jitter=0.05
 - zookeeper.session.timeout.ms=18000
 - log.retention.bytes=-1
 - controller.quota.window.size.seconds=1
 - sasl.jaas.config=null
 - sasl.kerberos.min.time.before.relogin=60000
 - offsets.retention.minutes=10080
 - replica.fetch.backoff.ms=1000
 - inter.broker.protocol.version=2.8-IV1
 - kafka.metrics.reporters=
 - num.partitions=1
 - socket.connection.setup.timeout.ms=10000
 - broker.id.generation.enable=true
 - listeners=PLAINTEXT://:9092,OUTSIDE://:9094
 - inter.broker.listener.name=null
 - alter.config.policy.class.name=null
 - delegation.token.expiry.check.interval.ms=3600000
 - log.flush.scheduler.interval.ms=9223372036854775807
 - zookeeper.max.in.flight.requests=10
 - log.index.size.max.bytes=10485760
 - sasl.login.callback.handler.class=null
 - replica.fetch.max.bytes=1048576
 - sasl.server.callback.handler.class=null
 - log.cleaner.dedupe.buffer.size=134217728
 - advertised.port=null
 - log.cleaner.io.buffer.size=524288
 - create.topic.policy.class.name=null
 - controlled.shutdown.retry.backoff.ms=5000
 - security.providers=null
 - log.roll.hours=168
 - log.cleanup.policy=delete
 - log.flush.start.offset.checkpoint.interval.ms=60000
 - host.name=
 - log.roll.jitter.ms=null
 - transaction.state.log.segment.bytes=104857600
 - offsets.topic.segment.bytes=104857600
 - group.initial.rebalance.delay.ms=3000
 - log.index.interval.bytes=4096
 - log.cleaner.backoff.ms=15000
 - ssl.truststore.location=null
 - offset.metadata.max.bytes=4096
 - ssl.keystore.password=null
 - zookeeper.sync.time.ms=2000
 - fetch.max.bytes=57671680
 - max.poll.interval.ms=null
 - compression.type=producer
 - max.connections.per.ip.overrides=
 - sasl.login.refresh.window.factor=0.8
 - kafka.metrics.polling.interval.secs=10
 - max.incremental.fetch.session.cache.slots=1000
 - delegation.token.master.key=null
 - reserved.broker.max.id=1000
 - transaction.remove.expired.transaction.cleanup.interval.ms=3600000
 - log.message.downconversion.enable=true
 - transaction.state.log.load.buffer.size=5242880
 - sasl.enabled.mechanisms=GSSAPI
 - num.replica.alter.log.dirs.threads=null
 - group.min.session.timeout.ms=6000
 - log.cleaner.io.buffer.load.factor=0.9
 - transaction.max.timeout.ms=900000
 - group.max.size=2147483647
 - delegation.token.max.lifetime.ms=604800000
 - broker.id=0
 - offsets.topic.compression.codec=0
 - zookeeper.ssl.endpoint.identification.algorithm=HTTPS
 - replication.quota.window.num=11
 - advertised.listeners=PLAINTEXT://:9092,OUTSIDE://kafka-0.kafka.kafka-shared-cluster.svc.cluster.local:9094
 - queued.max.request.bytes=-1

What has been verified

  • RAID I/O - no spikes before reboot
  • Zookeeper no logs indicating any connection problem
  • Ping - response time is rising periodically
  • Kafka logging level set to DEBUG - only java.io.EOFException appears, which is just a DEBUG-level log entry, not a WARNING or ERROR
  • K8 node logs - nothing significant beside readiness and liveness probes

Pods config

Containers:
  kafka:
    Image:          wurstmeister/kafka:latest
    Ports:          9092/TCP, 9094/TCP, 9999/TCP
    Host Ports:     0/TCP, 0/TCP, 0/TCP
    State:          Running
      Started:      Thu, 10 Feb 2022 16:36:48 +0100
    Last State:     Terminated
      Reason:       Error
      Exit Code:    137
      Started:      Tue, 08 Feb 2022 21:12:26 +0100
      Finished:     Thu, 10 Feb 2022 16:36:36 +0100
    Ready:          True
    Restart Count:  76
    Limits:
      cpu:     24
      memory:  64Gi
    Requests:
      cpu:      1
      memory:   2Gi
    Liveness:   tcp-socket :9092 delay=3600s timeout=5s period=10s #success=1 #failure=3
    Readiness:  tcp-socket :9092 delay=5s timeout=6s period=10s #success=1 #failure=5
    Environment:
      KAFKA_AUTO_CREATE_TOPICS_ENABLE:                 false
      ALLOW_PLAINTEXT_LISTENER:                        yes
      BROKER_ID_COMMAND:                               hostname | awk -F'-' '{print $NF}'
      HOSTNAME_COMMAND:                                hostname -f
      KAFKA_ADVERTISED_LISTENERS:                      PLAINTEXT://:9092,OUTSIDE://_{HOSTNAME_COMMAND}:9094
      KAFKA_LISTENERS:                                 PLAINTEXT://:9092,OUTSIDE://:9094
      KAFKA_ZOOKEEPER_CONNECT:                         zookeeper-service.kafka-shared-cluster.svc.cluster.local:2181/kafka
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP:            PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT
      KAFKA_LOG_RETENTION_MS:                          604800000
      KAFKA_LOG_DIRS:                                  /kafka
      KAFKA_SESSION_TIMEOUT_MS:                        10000
      KAFKA_MAX_POLL_INTERVAL_MS:                      60000
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS:          3000
      KAFKA_JMX_OPTS:                                  

run command properties

-Xmx1G -Xms1G -server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -XX:MaxInlineLevel=15 -Djava.awt.headless=true -Xloggc:/opt/kafka/bin/../logs/kafkaServer-gc.log -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCTimeStamps -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=100M

Java heap size in bytes

  • uintx ErgoHeapSizeLimit = 0 {product}
  • uintx HeapSizePerGCThread = 87241520 {product}
  • uintx InitialHeapSize := 1073741824 {product}
  • uintx LargePageHeapSizeThreshold = 134217728 {product}
  • uintx MaxHeapSize := 17179869184 {product}

Question

do you have any ideas why only this particular pod is failing, and any suggestion for further steps?

-- EDIT --

Containers:
  kafka:
    Image:          wurstmeister/kafka:latest
    Ports:          9092/TCP, 9094/TCP, 9999/TCP
    Host Ports:     0/TCP, 0/TCP, 0/TCP
    State:          Running
      Started:      Tue, 08 Mar 2022 17:50:11 +0100
    Last State:     Terminated
      Reason:       Error
      Exit Code:    137
      Started:      Tue, 08 Mar 2022 16:35:38 +0100
      Finished:     Tue, 08 Mar 2022 17:49:51 +0100
    Ready:          True
    Restart Count:  1
    Limits:
      cpu:     24
      memory:  64Gi
    Requests:
      cpu:      1
      memory:   2Gi
    Liveness:   tcp-socket :9092 delay=3600s timeout=5s period=10s #success=1 #failure=3
    Readiness:  tcp-socket :9092 delay=5s timeout=6s period=10s #success=1 #failure=5
    Environment:
      KAFKA_AUTO_CREATE_TOPICS_ENABLE:                 false
      ALLOW_PLAINTEXT_LISTENER:                        yes
      BROKER_ID_COMMAND:                               hostname | awk -F'-' '{print $NF}'
      HOSTNAME_COMMAND:                                hostname -f
      KAFKA_ADVERTISED_LISTENERS:                      PLAINTEXT://:9092,OUTSIDE://_{HOSTNAME_COMMAND}:9094
      KAFKA_LISTENERS:                                 PLAINTEXT://:9092,OUTSIDE://:9094
      KAFKA_ZOOKEEPER_CONNECT:                         zookeeper-service.kafka-shared-cluster.svc.cluster.local:2181/kafka
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP:            PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT
      KAFKA_LOG_RETENTION_MS:                          604800000
      KAFKA_LOG_DIRS:                                  /kafka
      KAFKA_SESSION_TIMEOUT_MS:                        10000
      KAFKA_MAX_POLL_INTERVAL_MS:                      60000
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS:          3000
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR:          3
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR:  3
      KAFKA_HEAP_OPTS:                                 -Xmx6G -Xms6G
      KAFKA_DEFAULT_REPLICATION_FACTOR:                3
      KAFKA_MIN_INSYNC_REPLICAS:                       2
      KAFKA_REPLICA_LAG_TIME_MAX_MS:                   80000
      KAFKA_NUM_RECOVERY_THREADS_PER_DATA_DIR:         6
Conditions:
  Type              Status
  Initialized       True
  Ready             True
  ContainersReady   True
  PodScheduled      True
Volumes:
  kafka-data:
    Type:       PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
    ClaimName:  kafka-data-kafka-0
    ReadOnly:   false
  jolokia-agent:
    Type:      ConfigMap (a volume populated by a ConfigMap)
    Name:      jolokia-agent
    Optional:  false
Volumes:
  kafka-data:
    Type:       PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
    ClaimName:  kafka-data-kafka-0
    ReadOnly:   false
  jolokia-agent:
    Type:      ConfigMap (a volume populated by a ConfigMap)
    Name:      jolokia-agent
    Optional:  false
  default-token-d57jd:
    Type:        Secret (a volume populated by a Secret)
    SecretName:  default-token-d57jd
    Optional:    false

PVC

Name:          kafka-data-kafka-0
Namespace:     kafka-shared-cluster
StorageClass:  local-storage
Status:        Bound
Volume:        pvc-1d23ba70-cb15-43c3-91b1-4febc8fd9896
Labels:        app=kafka
Annotations:   pv.kubernetes.io/bind-completed: yes
               pv.kubernetes.io/bound-by-controller: yes
               volume.beta.kubernetes.io/storage-provisioner: rancher.io/local-path
               volume.kubernetes.io/selected-node: xxx
Finalizers:    [kubernetes.io/pvc-protection]
Capacity:      10Gi
Access Modes:  RWO
VolumeMode:    Filesystem
Used By:       kafka-0
Events:        <none>
-- Lok Lokaj
apache-kafka
kubernetes

0 Answers