I am trying to setup a small Kubernetes cluster using RKE with 2 nodes using RKE. The 2 nodes are Ubuntu server VM's running in VirtualBox, both with a bridged connection.
ip of vm 1: xx.xx.xx.61
ip of vm 2: xx.xx.xx.67
When I launch the cluster using rke up I get the following error:
FATA[0212] [etcd] Failed to bring up Etcd Plane: [etcd] Etcd Cluster is not healthy
When I subsequently run kubectl commands such as kubectl --kubeconfig kube_config_cluster.yml version, I get the following error:
Client Version: version.Info{Major:"1", Minor:"15", GitVersion:"v1.15.1", GitCommit:"4485c6f18cee9a5d3c3b4e523bd27972b1b53892", GitTreeState:"clean", BuildDate:"2019-07-18T09:18:22Z", GoVersion:"go1.12.5", Compiler:"gc", Platform:"linux/amd64"}
The connection to the server xx.xx.xx.67:6443 was refused - did you specify the right host or port?
Not sure if these errors are caused by the same underlying issue.
What could be causing this or how could I troubleshoot this issue. Are there any particular log files that I could look into?
The is what the cluster.yml looks like:
# If you intened to deploy Kubernetes in an air-gapped environment,
# please consult the documentation on how to configure custom RKE images.
nodes:
- address: xx.xx.xx.61
port: "22"
internal_address: ""
role:
- controlplane
- worker
- etcd
hostname_override: ""
user: rke
docker_socket: /var/run/docker.sock
ssh_key: ""
ssh_key_path: ~/.ssh/id_rsa
ssh_cert: ""
ssh_cert_path: ""
labels: {}
- address: xx.xx.xx.67
port: "22"
internal_address: ""
role:
- controlplane
- worker
- etcd
hostname_override: ""
user: rke
docker_socket: /var/run/docker.sock
ssh_key: ""
ssh_key_path: ~/.ssh/id_rsa
ssh_cert: ""
ssh_cert_path: ""
labels: {}
services:
etcd:
image: ""
extra_args: {}
extra_binds: []
extra_env: []
external_urls: []
ca_cert: ""
cert: ""
key: ""
path: ""
snapshot: null
retention: ""
creation: ""
backup_config: null
kube-api:
image: ""
extra_args: {}
extra_binds: []
extra_env: []
service_cluster_ip_range: 10.43.0.0/16
service_node_port_range: ""
pod_security_policy: false
always_pull_images: false
kube-controller:
image: ""
extra_args: {}
extra_binds: []
extra_env: []
cluster_cidr: 10.42.0.0/16
service_cluster_ip_range: 10.43.0.0/16
scheduler:
image: ""
extra_args: {}
extra_binds: []
extra_env: []
kubelet:
image: ""
extra_args: {}
extra_binds: []
extra_env: []
cluster_domain: cluster.local
infra_container_image: ""
cluster_dns_server: 10.43.0.10
fail_swap_on: false
kubeproxy:
image: ""
extra_args: {}
extra_binds: []
extra_env: []
network:
plugin: canal
options: {}
authentication:
strategy: x509
sans: []
webhook: null
addons: ""
addons_include:
- https://raw.githubusercontent.com/kubernetes/dashboard/master/src/deploy/recommended/kubernetes-dashboard.yaml
- https://gist.githubusercontent.com/superseb/499f2caa2637c404af41cfb7e5f4a938/raw/930841ac00653fdff8beca61dab9a20bb8983782/k8s-dashboard-user.yml
system_images:
etcd: rancher/coreos-etcd:v3.3.10-rancher1
alpine: rancher/rke-tools:v0.1.34
nginx_proxy: rancher/rke-tools:v0.1.34
cert_downloader: rancher/rke-tools:v0.1.34
kubernetes_services_sidecar: rancher/rke-tools:v0.1.34
kubedns: rancher/k8s-dns-kube-dns:1.15.0
dnsmasq: rancher/k8s-dns-dnsmasq-nanny:1.15.0
kubedns_sidecar: rancher/k8s-dns-sidecar:1.15.0
kubedns_autoscaler: rancher/cluster-proportional-autoscaler:1.3.0
coredns: rancher/coredns-coredns:1.3.1
coredns_autoscaler: rancher/cluster-proportional-autoscaler:1.3.0
kubernetes: rancher/hyperkube:v1.14.3-rancher1
flannel: rancher/coreos-flannel:v0.10.0-rancher1
flannel_cni: rancher/flannel-cni:v0.3.0-rancher1
calico_node: rancher/calico-node:v3.4.0
calico_cni: rancher/calico-cni:v3.4.0
calico_controllers: ""
calico_ctl: rancher/calico-ctl:v2.0.0
canal_node: rancher/calico-node:v3.4.0
canal_cni: rancher/calico-cni:v3.4.0
canal_flannel: rancher/coreos-flannel:v0.10.0
weave_node: weaveworks/weave-kube:2.5.0
weave_cni: weaveworks/weave-npc:2.5.0
pod_infra_container: rancher/pause:3.1
ingress: rancher/nginx-ingress-controller:0.21.0-rancher3
ingress_backend: rancher/nginx-ingress-controller-defaultbackend:1.5-rancher1
metrics_server: rancher/metrics-server:v0.3.1
ssh_key_path: ~/.ssh/id_rsa
ssh_cert_path: ""
ssh_agent_auth: false
authorization:
mode: rbac
options: {}
ignore_docker_version: false
kubernetes_version: ""
private_registries: []
ingress:
provider: ""
options: {}
node_selector: {}
extra_args: {}
cluster_name: ""
cloud_provider:
name: ""
prefix_path: ""
addon_job_timeout: 0
bastion_host:
address: ""
port: ""
user: ""
ssh_key: ""
ssh_key_path: ""
ssh_cert: ""
ssh_cert_path: ""
monitoring:
provider: ""
options: {}
restore:
restore: false
snapshot_name: ""
dns: null
The scale for etcd deployment is generally an odd number (1, 3, 5 ...). I see that you have selected both the nodes with etcd role. It might be causing issues with quorum. Can you use etcd
role with only one of the nodes and run rke up
again?