Only one single pod of our Kafka cluster is failing on its Kubernetes node. The readiness and liveness probes report long response times, even though the port is open and traffic between the nodes is flowing. The container is terminated with exit code 137.
The pod comes back up immediately after being killed, but port 9092 is not open yet, and the whole process of bringing the app back up takes about 30 minutes.
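Since exit code 137 means the container received SIGKILL (either from the OOM killer or from the kubelet after failed liveness probes), this is roughly how I check which of the two it is (pod and namespace names as in the describe output below):

# Look at the last terminated state and the pod events
kubectl describe pod kafka-0 -n kafka-shared-cluster | grep -A5 'Last State'
kubectl get events -n kafka-shared-cluster --field-selector involvedObject.name=kafka-0
# Logs of the previous (killed) container instance
kubectl logs kafka-0 -n kafka-shared-cluster --previous --tail=200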
We have a Kubernetes cluster consisting of 6 nodes (2 nodes in each of 3 racks, with the racks located in different data centers). Kafka is deployed with Helm, and each of its brokers runs on a different host thanks to affinity/anti-affinity rules.
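The anti-affinity itself works as intended; I verify that the brokers sit on different hosts roughly like this (the app=kafka label is an assumption based on our chart values):

kubectl get pods -n kafka-shared-cluster -l app=kafka -o wide

The full broker configuration is: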
- log.cleaner.min.compaction.lag.ms=0
- offsets.topic.num.partitions=50
- log.flush.interval.messages=9223372036854775807
- controller.socket.timeout.ms=30000
- principal.builder.class=null
- log.flush.interval.ms=null
- min.insync.replicas=1
- num.recovery.threads.per.data.dir=1
- sasl.mechanism.inter.broker.protocol=GSSAPI
- fetch.purgatory.purge.interval.requests=1000
- replica.socket.timeout.ms=30000
- message.max.bytes=1048588
- max.connection.creation.rate=2147483647
- connections.max.reauth.ms=0
- log.flush.offset.checkpoint.interval.ms=60000
- zookeeper.clientCnxnSocket=null
- quota.window.num=11
- zookeeper.connect=zookeeper-service.kafka-shared-cluster.svc.cluster.local:2181/kafka
- authorizer.class.name=
- password.encoder.secret=null
- num.replica.fetchers=1
- alter.log.dirs.replication.quota.window.size.seconds=1
- log.roll.jitter.hours=0
- password.encoder.old.secret=null
- log.cleaner.delete.retention.ms=86400000
- queued.max.requests=500
- log.cleaner.threads=1
- sasl.kerberos.service.name=null
- socket.request.max.bytes=104857600
- log.message.timestamp.type=CreateTime
- connections.max.idle.ms=600000
- zookeeper.set.acl=false
- delegation.token.expiry.time.ms=86400000
- session.timeout.ms=null
- max.connections=2147483647
- transaction.state.log.num.partitions=50
- listener.security.protocol.map=PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT
- log.retention.hours=168
- client.quota.callback.class=null
- delete.records.purgatory.purge.interval.requests=1
- log.roll.ms=null
- replica.high.watermark.checkpoint.interval.ms=5000
- replication.quota.window.size.seconds=1
- sasl.kerberos.ticket.renew.window.factor=0.8
- zookeeper.connection.timeout.ms=18000
- metrics.recording.level=INFO
- password.encoder.cipher.algorithm=AES/CBC/PKCS5Padding
- replica.selector.class=null
- max.connections.per.ip=2147483647
- background.threads=10
- quota.consumer.default=9223372036854775807
- request.timeout.ms=30000
- log.message.format.version=2.8-IV1
- sasl.login.class=null
- log.dir=/tmp/kafka-logs
- log.segment.bytes=1073741824
- replica.fetch.response.max.bytes=10485760
- group.max.session.timeout.ms=1800000
- port=9092
- log.segment.delete.delay.ms=60000
- log.retention.minutes=null
- log.dirs=/kafka
- controlled.shutdown.enable=true
- socket.connection.setup.timeout.max.ms=30000
- log.message.timestamp.difference.max.ms=9223372036854775807
- password.encoder.key.length=128
- sasl.login.refresh.min.period.seconds=60
- transaction.abort.timed.out.transaction.cleanup.interval.ms=10000
- sasl.kerberos.kinit.cmd=/usr/bin/kinit
- log.cleaner.io.max.bytes.per.second=1.7976931348623157E308
- auto.leader.rebalance.enable=true
- leader.imbalance.check.interval.seconds=300
- log.cleaner.min.cleanable.ratio=0.5
- replica.lag.time.max.ms=30000
- num.network.threads=3
- sasl.client.callback.handler.class=null
- metrics.num.samples=2
- socket.send.buffer.bytes=102400
- password.encoder.keyfactory.algorithm=null
- socket.receive.buffer.bytes=102400
- replica.fetch.min.bytes=1
- broker.rack=null
- unclean.leader.election.enable=false
- offsets.retention.check.interval.ms=600000
- producer.purgatory.purge.interval.requests=1000
- metrics.sample.window.ms=30000
- log.retention.check.interval.ms=300000
- sasl.login.refresh.window.jitter=0.05
- leader.imbalance.per.broker.percentage=10
- controller.quota.window.num=11
- advertised.host.name=null
- metric.reporters=
- quota.producer.default=9223372036854775807
- auto.create.topics.enable=false
- replica.socket.receive.buffer.bytes=65536
- replica.fetch.wait.max.ms=500
- password.encoder.iterations=4096
- default.replication.factor=1
- sasl.kerberos.principal.to.local.rules=DEFAULT
- log.preallocate=false
- transactional.id.expiration.ms=604800000
- control.plane.listener.name=null
- transaction.state.log.replication.factor=3
- num.io.threads=8
- sasl.login.refresh.buffer.seconds=300
- offsets.commit.required.acks=-1
- connection.failed.authentication.delay.ms=100
- delete.topic.enable=true
- quota.window.size.seconds=1
- offsets.commit.timeout.ms=5000
- log.cleaner.max.compaction.lag.ms=9223372036854775807
- zookeeper.ssl.enabled.protocols=null
- log.retention.ms=604800000
- alter.log.dirs.replication.quota.window.num=11
- log.cleaner.enable=true
- offsets.load.buffer.size=5242880
- controlled.shutdown.max.retries=3
- offsets.topic.replication.factor=3
- transaction.state.log.min.isr=1
- sasl.kerberos.ticket.renew.jitter=0.05
- zookeeper.session.timeout.ms=18000
- log.retention.bytes=-1
- controller.quota.window.size.seconds=1
- sasl.jaas.config=null
- sasl.kerberos.min.time.before.relogin=60000
- offsets.retention.minutes=10080
- replica.fetch.backoff.ms=1000
- inter.broker.protocol.version=2.8-IV1
- kafka.metrics.reporters=
- num.partitions=1
- socket.connection.setup.timeout.ms=10000
- broker.id.generation.enable=true
- listeners=PLAINTEXT://:9092,OUTSIDE://:9094
- inter.broker.listener.name=null
- alter.config.policy.class.name=null
- delegation.token.expiry.check.interval.ms=3600000
- log.flush.scheduler.interval.ms=9223372036854775807
- zookeeper.max.in.flight.requests=10
- log.index.size.max.bytes=10485760
- sasl.login.callback.handler.class=null
- replica.fetch.max.bytes=1048576
- sasl.server.callback.handler.class=null
- log.cleaner.dedupe.buffer.size=134217728
- advertised.port=null
- log.cleaner.io.buffer.size=524288
- create.topic.policy.class.name=null
- controlled.shutdown.retry.backoff.ms=5000
- security.providers=null
- log.roll.hours=168
- log.cleanup.policy=delete
- log.flush.start.offset.checkpoint.interval.ms=60000
- host.name=
- log.roll.jitter.ms=null
- transaction.state.log.segment.bytes=104857600
- offsets.topic.segment.bytes=104857600
- group.initial.rebalance.delay.ms=3000
- log.index.interval.bytes=4096
- log.cleaner.backoff.ms=15000
- ssl.truststore.location=null
- offset.metadata.max.bytes=4096
- ssl.keystore.password=null
- zookeeper.sync.time.ms=2000
- fetch.max.bytes=57671680
- max.poll.interval.ms=null
- compression.type=producer
- max.connections.per.ip.overrides=
- sasl.login.refresh.window.factor=0.8
- kafka.metrics.polling.interval.secs=10
- max.incremental.fetch.session.cache.slots=1000
- delegation.token.master.key=null
- reserved.broker.max.id=1000
- transaction.remove.expired.transaction.cleanup.interval.ms=3600000
- log.message.downconversion.enable=true
- transaction.state.log.load.buffer.size=5242880
- sasl.enabled.mechanisms=GSSAPI
- num.replica.alter.log.dirs.threads=null
- group.min.session.timeout.ms=6000
- log.cleaner.io.buffer.load.factor=0.9
- transaction.max.timeout.ms=900000
- group.max.size=2147483647
- delegation.token.max.lifetime.ms=604800000
- broker.id=0
- offsets.topic.compression.codec=0
- zookeeper.ssl.endpoint.identification.algorithm=HTTPS
- replication.quota.window.num=11
- advertised.listeners=PLAINTEXT://:9092,OUTSIDE://kafka-0.kafka.kafka-shared-cluster.svc.cluster.local:9094
- queued.max.request.bytes=-1
Containers:
kafka:
Image: wurstmeister/kafka:latest
Ports: 9092/TCP, 9094/TCP, 9999/TCP
Host Ports: 0/TCP, 0/TCP, 0/TCP
State: Running
Started: Thu, 10 Feb 2022 16:36:48 +0100
Last State: Terminated
Reason: Error
Exit Code: 137
Started: Tue, 08 Feb 2022 21:12:26 +0100
Finished: Thu, 10 Feb 2022 16:36:36 +0100
Ready: True
Restart Count: 76
Limits:
cpu: 24
memory: 64Gi
Requests:
cpu: 1
memory: 2Gi
Liveness: tcp-socket :9092 delay=3600s timeout=5s period=10s #success=1 #failure=3
Readiness: tcp-socket :9092 delay=5s timeout=6s period=10s #success=1 #failure=5
Environment:
KAFKA_AUTO_CREATE_TOPICS_ENABLE: false
ALLOW_PLAINTEXT_LISTENER: yes
BROKER_ID_COMMAND: hostname | awk -F'-' '{print $NF}'
HOSTNAME_COMMAND: hostname -f
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://:9092,OUTSIDE://_{HOSTNAME_COMMAND}:9094
KAFKA_LISTENERS: PLAINTEXT://:9092,OUTSIDE://:9094
KAFKA_ZOOKEEPER_CONNECT: zookeeper-service.kafka-shared-cluster.svc.cluster.local:2181/kafka
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT
KAFKA_LOG_RETENTION_MS: 604800000
KAFKA_LOG_DIRS: /kafka
KAFKA_SESSION_TIMEOUT_MS: 10000
KAFKA_MAX_POLL_INTERVAL_MS: 60000
KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
KAFKA_JMX_OPTS:
-Xmx1G -Xms1G -server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -XX:MaxInlineLevel=15 -Djava.awt.headless=true -Xloggc:/opt/kafka/bin/../logs/kafkaServer-gc.log -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCTimeStamps -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=100M
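While the broker is coming back up (which here takes about 30 minutes before port 9092 opens), I watch readiness and the log-recovery progress roughly like this (the grep pattern is approximate):

kubectl get pod kafka-0 -n kafka-shared-cluster -w
kubectl logs -f kafka-0 -n kafka-shared-cluster | grep -iE 'recover|loading logs|loaded'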
Do you have any idea why only this particular pod is failing, and any suggestions for further steps?

Edit: below is the same pod described again at a later point, after some of the broker settings (heap, replication factors, min ISR, recovery threads) had been changed:
Containers:
kafka:
Image: wurstmeister/kafka:latest
Ports: 9092/TCP, 9094/TCP, 9999/TCP
Host Ports: 0/TCP, 0/TCP, 0/TCP
State: Running
Started: Tue, 08 Mar 2022 17:50:11 +0100
Last State: Terminated
Reason: Error
Exit Code: 137
Started: Tue, 08 Mar 2022 16:35:38 +0100
Finished: Tue, 08 Mar 2022 17:49:51 +0100
Ready: True
Restart Count: 1
Limits:
cpu: 24
memory: 64Gi
Requests:
cpu: 1
memory: 2Gi
Liveness: tcp-socket :9092 delay=3600s timeout=5s period=10s #success=1 #failure=3
Readiness: tcp-socket :9092 delay=5s timeout=6s period=10s #success=1 #failure=5
Environment:
KAFKA_AUTO_CREATE_TOPICS_ENABLE: false
ALLOW_PLAINTEXT_LISTENER: yes
BROKER_ID_COMMAND: hostname | awk -F'-' '{print $NF}'
HOSTNAME_COMMAND: hostname -f
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://:9092,OUTSIDE://_{HOSTNAME_COMMAND}:9094
KAFKA_LISTENERS: PLAINTEXT://:9092,OUTSIDE://:9094
KAFKA_ZOOKEEPER_CONNECT: zookeeper-service.kafka-shared-cluster.svc.cluster.local:2181/kafka
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT
KAFKA_LOG_RETENTION_MS: 604800000
KAFKA_LOG_DIRS: /kafka
KAFKA_SESSION_TIMEOUT_MS: 10000
KAFKA_MAX_POLL_INTERVAL_MS: 60000
KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 3000
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
KAFKA_HEAP_OPTS: -Xmx6G -Xms6G
KAFKA_DEFAULT_REPLICATION_FACTOR: 3
KAFKA_MIN_INSYNC_REPLICAS: 2
KAFKA_REPLICA_LAG_TIME_MAX_MS: 80000
KAFKA_NUM_RECOVERY_THREADS_PER_DATA_DIR: 6
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
kafka-data:
Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
ClaimName: kafka-data-kafka-0
ReadOnly: false
jolokia-agent:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: jolokia-agent
Optional: false
default-token-d57jd:
Type: Secret (a volume populated by a Secret)
SecretName: default-token-d57jd
Optional: false
Name: kafka-data-kafka-0
Namespace: kafka-shared-cluster
StorageClass: local-storage
Status: Bound
Volume: pvc-1d23ba70-cb15-43c3-91b1-4febc8fd9896
Labels: app=kafka
Annotations: pv.kubernetes.io/bind-completed: yes
pv.kubernetes.io/bound-by-controller: yes
volume.beta.kubernetes.io/storage-provisioner: rancher.io/local-path
volume.kubernetes.io/selected-node: xxx
Finalizers: [kubernetes.io/pvc-protection]
Capacity: 10Gi
Access Modes: RWO
VolumeMode: Filesystem
Used By: kafka-0
Events: <none>
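Because the PVC is backed by local-path storage, the data (and therefore the pod) is pinned to a single node. This is a sketch of how I check which node the volume is bound to and how full the Kafka data dir is (assuming df is available in the image):

kubectl describe pv pvc-1d23ba70-cb15-43c3-91b1-4febc8fd9896 | grep -A3 'Node Affinity'
kubectl get pvc kafka-data-kafka-0 -n kafka-shared-cluster
# Disk usage of the Kafka data dir (/kafka = log.dirs) inside the pod
kubectl exec kafka-0 -n kafka-shared-cluster -- df -h /kafka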