Unable to create a namespace for AKS cluster using Terraform reports no such host

6/26/2021

I have a module definition as below:

===

providers.tf

provider "kubernetes" {
  #load_config_file = "false"
  host                   = azurerm_kubernetes_cluster.aks.kube_config.0.host
  username               = azurerm_kubernetes_cluster.aks.kube_config.0.username
  password               = azurerm_kubernetes_cluster.aks.kube_config.0.password
  client_certificate     = base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate)
  client_key             = base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.client_key)
  cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.cluster_ca_certificate)
}

outputs.tf

output "node_resource_group" {
  value       = azurerm_kubernetes_cluster.aks.node_resource_group
  description = "The name of resource group where the AKS Nodes are created"
}
output "kubeConfig" {
  value = azurerm_kubernetes_cluster.aks.kube_config_raw
  description = "Kubeconfig of AKS Cluster"
}

output "host" {
  value = azurerm_kubernetes_cluster.aks.kube_config.0.host
}

output "client_key" {
  value = azurerm_kubernetes_cluster.aks.kube_config.0.client_key
}

output "client_certificate" {
  value = azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate
}

output "kube_config" {
  value = azurerm_kubernetes_cluster.aks.kube_config_raw
}

output "cluster_ca_certificate" {
  value = azurerm_kubernetes_cluster.aks.kube_config.0.cluster_ca_certificate
}

main.tf

resource "azurerm_log_analytics_workspace" "law" {
  name                = "${var.tla}-la-${local.lookup_result}-${var.identifier}"
  location            = data.azurerm_resource_group.rg.location
  resource_group_name = data.azurerm_resource_group.rg.name
  sku                 = var.la_sku
  retention_in_days   = 30
}

resource "azurerm_kubernetes_cluster" "aks" {
  name                    = "${var.tla}-aks-${local.lookup_result}-${var.identifier}"
  location                = data.azurerm_resource_group.rg.location
  resource_group_name     = data.azurerm_resource_group.rg.name
  dns_prefix              = var.dns_prefix
  kubernetes_version      = var.kubernetes_version
  sku_tier                = var.sku_tier
  private_cluster_enabled = var.enable_private_cluster
  #api_server_authorized_ip_ranges = ""
  default_node_pool {
    name                  = "syspool001"
    orchestrator_version  = var.orchestrator_version
    availability_zones    = var.agents_availability_zones
    enable_auto_scaling   = true
    node_count            = var.default_pool_node_count
    max_count             = var.default_pool_max_node_count
    min_count             = var.default_pool_min_node_count
    max_pods              = var.default_pool_max_pod_count
    vm_size               = var.agents_size
    enable_node_public_ip = false
    os_disk_size_gb       = var.default_pool_os_disk_size_gb
    type                  = "VirtualMachineScaleSets"
    vnet_subnet_id        = var.vnet_subnet_id
    node_labels           = var.agents_labels
    tags                  = merge(local.tags, var.agents_tags)
  }

  network_profile {
    network_plugin     = var.network_plugin
    network_policy     = var.network_policy
    dns_service_ip     = var.net_profile_dns_service_ip
    docker_bridge_cidr = var.net_profile_docker_bridge_cidr
    service_cidr       = var.net_profile_service_cidr
  }

  role_based_access_control {
    enabled = true
    azure_active_directory {
      managed                = true
      admin_group_object_ids = var.rbac_aad_admin_group_object_ids
    }
  }

  identity {
    type = "SystemAssigned"
  }

  addon_profile {
    azure_policy {
      enabled = true
    }

    http_application_routing {
      enabled = false
    }

    oms_agent {
      enabled                    = true
      log_analytics_workspace_id = data.azurerm_log_analytics_workspace.log_analytics.id
    }
  }

  tags = local.tags

  lifecycle {
    ignore_changes = [
      default_node_pool
    ]
  }

}

resource "azurerm_kubernetes_cluster_node_pool" "aksnp" {
  lifecycle {
    ignore_changes = [
      node_count
    ]
  }
  for_each              = var.additional_node_pools
  kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id
  name                  = each.value.node_os == "Windows" ? substr(each.key, 0, 6) : substr(each.key, 0, 12)
  node_count            = each.value.node_count
  vm_size               = each.value.vm_size
  availability_zones    = each.value.zones
  max_pods              = each.value.max_pods
  os_disk_size_gb       = each.value.os_disk_size_gb
  os_type               = each.value.node_os
  vnet_subnet_id        = var.vnet_subnet_id
  node_taints           = each.value.taints
  enable_auto_scaling   = each.value.cluster_auto_scaling
  min_count             = each.value.cluster_auto_scaling_min_count
  max_count             = each.value.cluster_auto_scaling_max_count
}

resource "kubernetes_namespace" "aks-namespace" {
  metadata {
    name = var.namespace
  }
}

data.tf

data "azurerm_resource_group" "rg" {
  name = var.resource_group_name
}

lookups.tf

locals {

  environment_lookup = {
    dev  = "d"
    test = "t"
    int  = "i"
    prod = "p"
    prd  = "p"
    uat  = "a"
    poc  = "d"
    dr   = "r"
    lab  = "l"
  }

 
  lookup_result = lookup(local.environment_lookup, var.environment)

  tags = merge(
    data.azurerm_resource_group.rg.tags, {
      Directory      = "tectcompany.com",
      PrivateDNSZone = var.private_dns_zone,
      Immutable      = "False",
      ManagedOS      = "True",
    }
  )
}

data "azurerm_log_analytics_workspace" "log_analytics" {
  name                = "abc-az-lad2"
  resource_group_name = "abc-dev-aae"
}

variables.tf

variable "secondary_region" {
  description = "Is this resource being deployed into the secondary (pair) region?"

  default = false
  type    = bool
}

variable "override_log_analytics_workspace" {
  description = "Override the vm log analytics workspace"
  type        = string
  default     = null
}

variable "override_log_analytics_resource_group_name" {
  description = "Overrides the log analytics resource group name"
  type        = string
  default     = null
}

variable "environment" {
  description = "The name of environment for the AKS Cluster"
  type        = string
  default     = "dev"
}

variable "identifier" {
  description = "The identifier for the AKS Cluster"
  type        = number
  default     = "001"
}

variable "kubernetes_version" {
  description = "Specify which Kubernetes release to use. The default used is the latest Kubernetes version available in the region"
  type        = string
  default     = "1.19.9"
}

variable "dns_prefix" {
  description = "The dns prefix for the AKS Cluster"
  type        = string
  default     = "odessa-sandpit"
}

variable "orchestrator_version" {
  description = "Specify which Kubernetes release to use for the orchestration layer. The default used is the latest Kubernetes version available in the region"
  type        = string
  default     = null
}

variable "agents_availability_zones" {
  description = "(Optional) A list of Availability Zones across which the Node Pool should be spread. Changing this forces a new resource to be created."
  type        = list(string)
  default     = null
}

variable "agents_size" {
  default     = "Standard_D4s_v3"
  description = "The default virtual machine size for the Kubernetes agents"
  type        = string
}

variable "vnet_subnet_id" {
  description = "(Optional) The ID of a Subnet where the Kubernetes Node Pool should exist. Changing this forces a new resource to be created."
  type        = string
  default     = null
}

variable "agents_labels" {
  description = "(Optional) A map of Kubernetes labels which should be applied to nodes in the Default Node Pool. Changing this forces a new resource to be created."
  type        = map(string)
  default     = {}
}

variable "agents_tags" {
  description = "(Optional) A mapping of tags to assign to the Node Pool."
  type        = map(string)
  default     = {}
}

variable "net_profile_dns_service_ip" {
  description = "(Optional) IP address within the Kubernetes service address range that will be used by cluster service discovery (kube-dns). Changing this forces a new resource to be created."
  type        = string
  default     = null
}

variable "net_profile_docker_bridge_cidr" {
  description = "(Optional) IP address (in CIDR notation) used as the Docker bridge IP address on nodes. Changing this forces a new resource to be created."
  type        = string
  default     = null
}

variable "net_profile_service_cidr" {
  description = "(Optional) The Network Range used by the Kubernetes service. Changing this forces a new resource to be created."
  type        = string
  default     = null
}

variable "rbac_aad_admin_group_object_ids" {
  description = "Object ID of groups with admin access."
  type        = list(string)
  default     = null
}

variable "network_policy" {
  description = "(Optional) The Network Policy to be used by the network profile of Azure Kubernetes Cluster."
  type        = string
  default     = "azure"
}

variable "network_plugin" {
  description = "(Optional) The Network Plugin to be used by the network profile of Azure Kubernetes Cluster."
  type        = string
  default     = "azure"
}

variable "enable_private_cluster" {
  description = "(Optional) Set this variable to true if you want Azure Kubernetes Cluster to be private."
  default     = true
}

variable "default_pool_node_count" {
  description = "(Optional) The initial node count for the default pool of AKS Cluster"
  type        = number
  default     = 3
}

variable "default_pool_max_node_count" {
  description = "(Optional) The max node count for the default pool of AKS Cluster"
  type        = number
  default     = 6
}

variable "default_pool_min_node_count" {
  description = "(Optional) The min node count for the default pool of AKS Cluster"
  type        = number
  default     = 3
}

variable "default_pool_max_pod_count" {
  description = "(Optional) The max pod count for the default pool of AKS Cluster"
  type        = number
  default     = 13
}

variable "default_pool_os_disk_size_gb" {
  description = "(Optional) The size of os disk in gb for the nodes from default pool of AKS Cluster"
  type        = string
  default     = "64"
}

variable "additional_node_pools" {
  type = map(object({
    node_count                     = number
    max_pods                       = number
    os_disk_size_gb                = number
    vm_size                        = string
    zones                          = list(string)
    node_os                        = string
    taints                         = list(string)
    cluster_auto_scaling           = bool
    cluster_auto_scaling_min_count = number
    cluster_auto_scaling_max_count = number
  }))
}
variable "sku_tier" {
  description = "(Optional) The SKU Tier that should be used for this Kubernetes Cluster; possible values are Free or Paid"
  type        = string
  default     = "Paid"

  validation {
    condition     = contains(["Free", "Paid"], var.sku_tier)
    error_message = "SKU_TIER can only be either Paid or Free."
  }

}

variable "la_sku" {

  description = "(Optional) The SKU Tier that should be used for Log Analytics. Multiple values are possible."
  type        = string
  default     = "PerGB2018"

  validation {
    condition     = contains(["Free", "PerNode", "Premium", "Standard", "Standalone", "Unlimited", "CapacityReservation", "PerGB2018"], var.la_sku)
    error_message = "SKU_TIER for Log Analytics can only be one of Free, PerNode, Premium, Standard, Standalone, Unlimited, CapacityReservation, or PerGB2018 (the default value)."
  }

}

variable "resource_group_name" {
  description = "Resource Group for deploying AKS Cluster"
  type = string
}

variable "private_dns_zone" {
  description = "DNS prefix for AKS Cluster"
  type = string
  default = "testcluster"
}

variable "tla" {
  description = "Three Letter Acronym - three letter abbreviation for the application"
  type = string
  default = ""
  validation {
    condition     = length(var.tla) == 3
    error_message = "The TLA should be precisely three characters."
  }
}

variable "namespace" {
  description = "AKS Namespace"
  type        = string
}

Finally, I am calling my module below to create the AKS cluster, LA, and Namespace for the AKS Cluster:

provider "azurerm" {
  features {}
  #version = "~> 2.53.0"
}

module "aks-cluster1" {
  source              = "../../"
  resource_group_name = "pst-aks-sandpit-dev-1"
  tla                 = "pqr"

  additional_node_pools = {
    pool1 = {
      node_count                     = "1"
      max_pods                       = "110"
      os_disk_size_gb                = "30"
      vm_size                        = "Standard_D8s_v3"
      zones                          = ["1", "2", "3"]
      node_os                        = "Linux"
      taints                         = ["kubernetes.io/os=windows:NoSchedule"]
      cluster_auto_scaling           = true
      cluster_auto_scaling_min_count = "2"
      cluster_auto_scaling_max_count = "4"
    }
  }

  namespace = "sample-ns"
}

Problem: I get a "no such host" error when Terraform attempts to create the namespace in the cluster.

I think that it is not able to connect to the cluster, but I could be wrong. I do not know how this is handled internally.

Error: Post "https://testdns-05885a32.145f13c0-25ce-43e4-ae46-8cbef448ecf3.privatelink.australiaeast.azmk8s.io:443/api/v1/namespaces": dial tcp: lookup testdns-05885a32.145f13c0-25ce-43e4-ae46-8cbef448ecf3.privatelink.australiaeast.azmk8s.io: no such host
-- learner
azure-aks
kubernetes

3 Answers

6/26/2021

Difficult to say what the issue is since the code you posted is incomplete. For starters, you shouldn't be doing this:

provider "kubernetes" {
  config_path    = "~/.kube/config"
}

The AKS URL you posted doesn't exist, so I think the provider is pulling an old cluster default from your kube config.

-- Nicholas S. Castellano
Source: StackOverflow

7/2/2021

I'm one of the maintainers of the Terraform Kubernetes provider, and I see this particular issue pretty often. As a former devops person myself, I empathize with the struggle I keep seeing in this area. It's something I would really love to fix in the provider, if it were possible.

The issue you're facing is a limitation in Terraform core when passing an unknown value to a provider configuration block. To quote their docs:

You can use expressions in the values of these configuration arguments, 
but can only reference values that are known before the configuration is applied.

When you make a change to the underlying infrastructure, such as the AKS cluster in this case, you're passing an unknown value into the Kubernetes provider configuration block, since the full scope of the cluster infrastructure is not known until after the change has been applied to the AKS cluster.

Although I did write the initial guide to show that it can be possible to work around some of these issues, as you've found from experience, there are many edge cases that make getting the Kubernetes provider working alongside the underlying infrastructure an unreliable and unintuitive process. This is due to a long-standing limitation in Terraform that can't be fixed in any provider, but we do have plans to smooth out the bumps a little by adding better error messages upfront, which would have saved you some headache in this case.

To solve this particular type of problem, the cluster infrastructure needs to be kept in a state separate from the Kubernetes and Helm provider resources. I have an example here which builds an AKS cluster in one apply and then manages the Kubernetes/Helm resources in a second apply. You can use this approach to build the most robust configuration for your particular use case:

https://github.com/hashicorp/terraform-provider-kubernetes/tree/e058e225e621f06e393bcb6407e7737fd43817bd/_examples/aks
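
For illustration only, the second (Kubernetes-only) configuration in that approach has roughly the following shape; the literal names below are placeholders rather than values taken from the linked repo:

# Sketch of the second, separate state: the cluster already exists,
# so every provider input is known at plan time.
provider "azurerm" {
  features {}
}

data "azurerm_kubernetes_cluster" "aks" {
  name                = "pqr-aks-d-1"            # placeholder cluster name
  resource_group_name = "pst-aks-sandpit-dev-1"  # placeholder resource group
}

provider "kubernetes" {
  host                   = data.azurerm_kubernetes_cluster.aks.kube_config.0.host
  client_certificate     = base64decode(data.azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate)
  client_key             = base64decode(data.azurerm_kubernetes_cluster.aks.kube_config.0.client_key)
  cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.aks.kube_config.0.cluster_ca_certificate)
}

resource "kubernetes_namespace" "aks-namespace" {
  metadata {
    name = "sample-ns"
  }
}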

I know this two-apply approach is inconvenient, which is why we continue to try and accommodate users in single-apply scenarios, and scenarios which contain the Kubernetes and cluster resources in the same Terraform state. However, until upstream Terraform can add support for this, the single-apply workflow will remain buggy and less reliable than separating cluster infrastructure from Kubernetes resources.

Most cases can be worked around using depends_on (to ensure the cluster is created before the Kubernetes resource), or by moving the cluster infrastructure into a separate module and running terraform state rm module.kubernetes-config or terraform apply -target=module.aks-cluster. But I think encouraging this kind of work-around will cause more headaches in the long run, as it puts the user in charge of figuring out when to use special one-off apply commands, rather than setting up Terraform to behave reliably and predictably from the start. Plus it can have unintended side-effects, like orphaning cloud resources.
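
As a rough sketch of the depends_on workaround, using the resource names from your module (this only enforces ordering; it does not remove the unknown-value limitation described above):

resource "kubernetes_namespace" "aks-namespace" {
  metadata {
    name = var.namespace
  }

  # Ensure the AKS cluster is created before the provider tries to reach it.
  depends_on = [
    azurerm_kubernetes_cluster.aks
  ]
}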

-- Stef Forrester
Source: StackOverflow

6/27/2021

Thanks for the additional detail. I see a few problems here. The first one is at the heart of your immediate problem:

variable "enable_private_cluster" {
  description = "(Optional) Set this variable to true if you want Azure Kubernetes Cluster to be private."
  default     = true
}

Your cluster deployment is taking the default here, so your API endpoint is a private DNS entry in the zone privatelink.australiaeast.azmk8s.io:

Post "https://testdns-05885a32.145f13c0-25ce-43e4-ae46-8cbef448ecf3.privatelink.australiaeast.azmk8s.io:443/api/v1/namespaces"

The Terraform Kubernetes provider must be able to reach the API endpoint in order to deploy the namespace, but it cannot resolve that domain. For this to work, you will need to ensure that:

  1. The private DNS zone exists in Azure
  2. The private DNS zone is linked to the relevant virtual networks, including the network of the host where you're running Terraform (see the sketch after this list)
  3. The DNS resolver on the Terraform host can resolve the privatelink domain through the endpoint defined at https://docs.microsoft.com/en-us/azure/virtual-network/what-is-ip-address-168-63-129-16 - note that this may require forwarding the private domain if your network uses on-premises internal DNS.
  4. Your Terraform host can reach the privatelink endpoint deployed by the cluster on TCP port 443
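
For item 2, a minimal sketch of linking the cluster's private DNS zone to the Terraform host's network might look like the following; the variable names are hypothetical placeholders, not values from your configuration:

# Sketch only: links the privatelink DNS zone that AKS created (by default in
# the node resource group) to the virtual network Terraform runs from, so the
# API server FQDN can be resolved.
resource "azurerm_private_dns_zone_virtual_network_link" "terraform_host" {
  name                  = "terraform-host-link"
  resource_group_name   = azurerm_kubernetes_cluster.aks.node_resource_group
  private_dns_zone_name = var.aks_private_dns_zone_name # the *.privatelink.australiaeast.azmk8s.io zone
  virtual_network_id    = var.terraform_host_vnet_id    # hypothetical variable
}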

Azure privatelink and private DNS can be non-trivial to configure correctly, especially in a complex networking environment. So, you may encounter additional hurdles that I haven't covered here.

Alternatively, you may wish to deploy this cluster without using privatelink by setting this module option to false. This may be undesirable for security and compliance reasons, so be sure you understand what you're doing here:

  enable_private_cluster = false

The next issue I encountered is:

 Error: creating Managed Kubernetes Cluster "pqr-aks-d-1" (Resource Group "pst-aks-sandpit-dev-1"): containerservice.ManagedClustersClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: Code="InsufficientAgentPoolMaxPodsPerAgentPool" Message="The AgentPoolProfile 'syspool001' has an invalid total maxPods(maxPods per node * node count), the total maxPods(13 * 824668498368) should be larger than 30. Please refer to aka.ms/aks-min-max-pod for more detail." Target="agentPoolProfile.kubernetesConfig.kubeletConfig.maxPods"

I overcame that by setting:

  default_pool_max_pod_count = 30
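
In your root module call, that override would look roughly like this (a sketch; the other arguments stay as in your original call):

module "aks-cluster1" {
  source                     = "../../"
  resource_group_name        = "pst-aks-sandpit-dev-1"
  tla                        = "pqr"
  namespace                  = "sample-ns"
  default_pool_max_pod_count = 30
  # enable_private_cluster   = false  # only if you also choose the non-private option above
  # additional_node_pools    = { ... as in your original call ... }
}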

The last issue is that you need to configure the kubernetes provider to have sufficient privileges to deploy the namespace:

│ Error: Unauthorized
│   with module.aks-cluster1.kubernetes_namespace.aks-namespace,
│   on ../../main.tf line 103, in resource "kubernetes_namespace" "aks-namespace":
│  103: resource "kubernetes_namespace" "aks-namespace" {

One way to accomplish that is to use kube_admin_config instead of kube_config:

provider "kubernetes" {
  #load_config_file = "false"
  host                   = azurerm_kubernetes_cluster.aks.kube_admin_config.0.host
  username               = azurerm_kubernetes_cluster.aks.kube_admin_config.0.username
  password               = azurerm_kubernetes_cluster.aks.kube_admin_config.0.password
  client_certificate     = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_certificate)
  client_key             = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_key)
  cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.cluster_ca_certificate)
}
-- Nicholas S. Castellano
Source: StackOverflow