I am trying to run Ceph in Kubernetes as described here: http://docs.ceph.com/docs/master/start/kube-helm/
I see the two ceph-rbd-provisioner pods crash-looping (one is currently reported as CrashLoopBackOff, the other as Running, but both show 152 restarts below):
[root@togo ~]# kubectl -n ceph get deployment,daemonset,storageclass,service,secret,pod,job,configmap
NAME                                           READY   UP-TO-DATE   AVAILABLE   AGE
deployment.extensions/ceph-mds                 0/1     1            0           13h
deployment.extensions/ceph-mgr                 0/1     1            0           13h
deployment.extensions/ceph-mon-check           0/1     1            0           13h
deployment.extensions/ceph-rbd-provisioner     1/2     2            1           13h
deployment.extensions/ceph-rgw                 0/1     1            0           13h

NAME                                    DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR                                      AGE
daemonset.extensions/ceph-mon           0         0         0       0            0           ceph-mon=enabled                                   13h
daemonset.extensions/ceph-osd-dev-sdb   3         3         0       3            0           ceph-osd-device-dev-sdb=enabled,ceph-osd=enabled   13h
daemonset.extensions/ceph-osd-dev-sdc   3         3         0       3            0           ceph-osd-device-dev-sdc=enabled,ceph-osd=enabled   13h

NAME                                   PROVISIONER    AGE
storageclass.storage.k8s.io/ceph-rbd   ceph.com/rbd   13h

NAME               TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)    AGE
service/ceph-mon   ClusterIP   None           <none>        6789/TCP   13h
service/ceph-rgw   ClusterIP   10.107.5.156   <none>        8088/TCP   13h

NAME                            TYPE                                  DATA   AGE
secret/ceph-keystone-user-rgw   Opaque                                7      13h
secret/default-token-f46vp      kubernetes.io/service-account-token   3      14h

NAME                                            READY   STATUS             RESTARTS   AGE
pod/ceph-mds-85b4fbb478-w2njk                   0/1     Pending            0          13h
pod/ceph-mds-keyring-generator-944tn            0/1     Pending            0          13h
pod/ceph-mgr-588577d89f-vgbdd                   0/1     Pending            0          13h
pod/ceph-mgr-keyring-generator-pvfvb            0/1     Pending            0          13h
pod/ceph-mon-check-549b886885-pm4pg             0/1     Pending            0          13h
pod/ceph-mon-keyring-generator-67d58            0/1     Pending            0          13h
pod/ceph-namespace-client-key-generator-4cwdt   0/1     Pending            0          13h
pod/ceph-osd-dev-sdb-dgf6g                      0/1     Init:0/3           0          13h
pod/ceph-osd-dev-sdb-fpncv                      0/1     Init:0/3           0          13h
pod/ceph-osd-dev-sdb-zmhxn                      0/1     Init:0/3           0          13h
pod/ceph-osd-dev-sdc-ddnw5                      0/1     Init:0/3           0          13h
pod/ceph-osd-dev-sdc-tsrvv                      0/1     Init:0/3           0          13h
pod/ceph-osd-dev-sdc-zgzpd                      0/1     Init:0/3           0          13h
pod/ceph-osd-keyring-generator-79xrd            0/1     Pending            0          13h
pod/ceph-rbd-provisioner-5cf47cf8d5-24w8m       0/1     CrashLoopBackOff   152        13h
pod/ceph-rbd-provisioner-5cf47cf8d5-tvxjl       1/1     Running            152        13h
pod/ceph-rgw-7b9677854f-6cmqc                   0/1     Pending            0          13h
pod/ceph-rgw-keyring-generator-6wpqn            0/1     Pending            0          13h
pod/ceph-storage-keys-generator-mmq8d           0/1     Pending            0          13h

NAME                                            COMPLETIONS   DURATION   AGE
job.batch/ceph-mds-keyring-generator            0/1           13h        13h
job.batch/ceph-mgr-keyring-generator            0/1           13h        13h
job.batch/ceph-mon-keyring-generator            0/1           13h        13h
job.batch/ceph-namespace-client-key-generator   0/1           13h        13h
job.batch/ceph-osd-keyring-generator            0/1           13h        13h
job.batch/ceph-rgw-keyring-generator            0/1           13h        13h
job.batch/ceph-storage-keys-generator           0/1           13h        13h

NAME                         DATA   AGE
configmap/ceph-bin           26     13h
configmap/ceph-bin-clients   2      13h
configmap/ceph-etc           1      13h
configmap/ceph-templates     5      13h
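To keep an eye on just the two provisioner pods, I assume a label selector can be used (the component=rbd-provisioner label shows up in the describe output further down), something like:

kubectl -n ceph get pods -l component=rbd-provisioner -o wide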
Below I describe one of the crashing pods:
[root@togo ~]# kubectl -n ceph describe pod/ceph-rbd-provisioner-5cf47cf8d5-24w8m
Name:               ceph-rbd-provisioner-5cf47cf8d5-24w8m
Namespace:          ceph
Priority:           0
PriorityClassName:  <none>
Node:               qatar.corp.sensis.com/10.93.98.36
Start Time:         Mon, 25 Mar 2019 16:19:16 -0400
Labels:             application=ceph
                    component=rbd-provisioner
                    pod-template-hash=5cf47cf8d5
                    release_group=ceph
Annotations:        <none>
Status:             Running
IP:                 10.96.1.5
Controlled By:      ReplicaSet/ceph-rbd-provisioner-5cf47cf8d5
Containers:
  ceph-rbd-provisioner:
    Container ID:  docker://d195fad685bc06f5a3b257ca747d5dcc318d52ceff8f6ef1687edfa3393ecaae
    Image:         quay.io/external_storage/rbd-provisioner:v0.1.1
    Image ID:      docker-pullable://quay.io/external_storage/rbd-provisioner@sha256:658b01875f9c6e46e7da5bfbb8009aca2629aa2be3c05b4edea05105f0644a51
    Port:          <none>
    Host Port:     <none>
    Command:
      /tmp/rbd-provisioner.sh
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    255
      Started:      Tue, 26 Mar 2019 06:09:29 -0400
      Finished:     Tue, 26 Mar 2019 06:09:59 -0400
    Ready:          False
    Restart Count:  152
    Environment:
      PROVISIONER_NAME:  ceph.com/rbd
      POD_NAME:          ceph-rbd-provisioner-5cf47cf8d5-24w8m (v1:metadata.name)
    Mounts:
      /tmp/rbd-provisioner.sh from ceph-bin (ro)
      /var/run/secrets/kubernetes.io/serviceaccount from default-token-f46vp (ro)
Conditions:
  Type              Status
  Initialized       True
  Ready             False
  ContainersReady   False
  PodScheduled      True
Volumes:
  ceph-bin:
    Type:      ConfigMap (a volume populated by a ConfigMap)
    Name:      ceph-bin
    Optional:  false
  default-token-f46vp:
    Type:        Secret (a volume populated by a Secret)
    SecretName:  default-token-f46vp
    Optional:    false
QoS Class:       BestEffort
Node-Selectors:  <none>
Tolerations:     node.kubernetes.io/not-ready:NoExecute for 300s
                 node.kubernetes.io/unreachable:NoExecute for 300s
Events:
  Type     Reason   Age                    From                            Message
  ----     ------   ----                   ----                            -------
  Warning  BackOff  7m8s (x3440 over 13h)  kubelet, qatar.corp.sensis.com  Back-off restarting failed container
  Normal   Pulled   2m6s (x152 over 13h)   kubelet, qatar.corp.sensis.com  Container image "quay.io/external_storage/rbd-provisioner:v0.1.1" already present on machine
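I assume the usual next step for a crash-looping pod would be to dump the previous container's output with something like the following (using the pod name above), but I don't know what else to look at if that comes back empty:

kubectl -n ceph logs ceph-rbd-provisioner-5cf47cf8d5-24w8m --previous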
I am running on CentOS 7. I can't display logs for the Pending pods since they have not yet gotten to a state that would produce any.
[root@togo ~]# kubectl get nodes -o wide
NAME                    STATUS   ROLES    AGE     VERSION   INTERNAL-IP     EXTERNAL-IP   OS-IMAGE                KERNEL-VERSION               CONTAINER-RUNTIME
benin.corp.sensis.com   Ready    <none>   18h     v1.13.4   10.93.97.123    <none>        CentOS Linux 7 (Core)   3.10.0-693.el7.x86_64        docker://18.9.3
chad.corp.sensis.com    Ready    <none>   5d14h   v1.13.4   10.93.98.23     <none>        CentOS Linux 7 (Core)   3.10.0-957.10.1.el7.x86_64   docker://18.9.3
qatar.corp.sensis.com   Ready    <none>   5d14h   v1.13.4   10.93.98.36     <none>        CentOS Linux 7 (Core)   3.10.0-957.10.1.el7.x86_64   docker://18.9.3
spain.corp.sensis.com   Ready    <none>   18h     v1.13.4   10.93.103.236   <none>        CentOS Linux 7 (Core)   3.10.0-693.el7.x86_64        docker://18.9.3
togo.corp.sensis.com    Ready    master   5d15h   v1.13.4   10.93.98.204    <none>        CentOS Linux 7 (Core)   3.10.0-957.5.1.el7.x86_64    docker://18.9.3
tonga.corp.sensis.com   Ready    <none>   18h     v1.13.4   10.93.97.202    <none>        CentOS Linux 7 (Core)   3.10.0-693.el7.x86_64        docker://18.9.3
I can ping the pod at 10.96.1.5
[root@togo ~]# ping 10.96.1.5
PING 10.96.1.5 (10.96.1.5) 56(84) bytes of data.
64 bytes from 10.96.1.5: icmp_seq=1 ttl=63 time=0.735 ms
64 bytes from 10.96.1.5: icmp_seq=2 ttl=63 time=0.686 ms
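Connectivity to the pod itself looks fine. Since the ceph-mon DaemonSet above shows 0 desired pods, I assume the headless ceph-mon service may have nothing behind it; presumably that can be checked with something like:

kubectl -n ceph get endpoints ceph-mon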
I've labeled 3 of the nodes to act as the OSD nodes for the Ceph cluster:
kubectl label node tonga.corp.sensis.com ceph-osd=enabled ceph-osd-device-dev-sdb=enabled ceph-osd-device-dev-sdc=enabled
kubectl label node benin.corp.sensis.com ceph-osd=enabled ceph-osd-device-dev-sdb=enabled ceph-osd-device-dev-sdc=enabled
kubectl label node spain.corp.sensis.com ceph-osd=enabled ceph-osd-device-dev-sdb=enabled ceph-osd-device-dev-sdc=enabled
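To double-check that the labels took, I assume they can be listed back with something like:

kubectl get nodes -L ceph-osd,ceph-osd-device-dev-sdb,ceph-osd-device-dev-sdc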
I attempted to create all of the Ceph resources with the following Helm command:
helm install --name=ceph local/ceph --namespace=ceph -f charts/ceph-overrides.yaml
The overrides I used are as follows:
[root@togo ~]# cat work/charts/ceph-overrides.yaml
network:
  public: 172.21.0.0/20
  cluster: 172.21.0.0/20

osd_devices:
  - name: dev-sdb
    device: /dev/sdb
    zap: "1"
  - name: dev-sdc
    device: /dev/sdc
    zap: "1"

storageclass:
  name: ceph-rbd
  pool: rbd
  user_id: k8s
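Presumably the values the release actually ended up with can be checked against this file with something like:

helm get values ceph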
I am able to run bash inside the container to see what is happening. However, the session exits with error code 137 while I am looking around:
[root@togo work]# kubectl -n ceph exec -it ceph-rbd-provisioner-5cf47cf8d5-24w8m -- /bin/bash
[root@ceph-rbd-provisioner-5cf47cf8d5-24w8m /]# ls /tmp
ks-script-CIN_1i rbd-provisioner.sh yum.log
[root@ceph-rbd-provisioner-5cf47cf8d5-24w8m /]# ps -aef
UID        PID  PPID  C STIME TTY          TIME CMD
root         1     0  1 11:45 ?        00:00:00 /usr/local/bin/rbd-provisioner -id ceph-rbd-provisioner-5cf47cf8d5-24w8m
root        18     0  0 11:45 pts/0    00:00:00 /bin/bash
root        32    18  0 11:45 pts/0    00:00:00 ps -aef
[root@ceph-rbd-provisioner-5cf47cf8d5-24w8m /]# command terminated with exit code 137
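I gather that 137 is 128 + 9, i.e. my shell gets SIGKILLed when the container is torn down for the next restart. I assume the container's own last termination record (the Exit Code 255 from the describe output) could also be pulled directly with something like:

kubectl -n ceph get pod ceph-rbd-provisioner-5cf47cf8d5-24w8m -o jsonpath='{.status.containerStatuses[0].lastState.terminated}'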
How can I debug this?