I have a module definition as below:
===
providers.tf
provider "kubernetes" {
#load_config_file = "false"
host = azurerm_kubernetes_cluster.aks.kube_config.0.host
username = azurerm_kubernetes_cluster.aks.kube_config.0.username
password = azurerm_kubernetes_cluster.aks.kube_config.0.password
client_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate)
client_key = base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.client_key)
cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.cluster_ca_certificate)
}
outputs.tf
output "node_resource_group" {
value = azurerm_kubernetes_cluster.aks.node_resource_group
description = "The name of resource group where the AKS Nodes are created"
}
output "kubeConfig" {
value = azurerm_kubernetes_cluster.aks.kube_config_raw
description = "Kubeconfig of AKS Cluster"
}
output "host" {
value = azurerm_kubernetes_cluster.aks.kube_config.0.host
}
output "client_key" {
value = azurerm_kubernetes_cluster.aks.kube_config.0.client_key
}
output "client_certificate" {
value = azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate
}
output "kube_config" {
value = azurerm_kubernetes_cluster.aks.kube_config_raw
}
output "cluster_ca_certificate" {
value = azurerm_kubernetes_cluster.aks.kube_config.0.cluster_ca_certificate
}
main.tf
resource "azurerm_log_analytics_workspace" "law" {
name = "${var.tla}-la-${local.lookup_result}-${var.identifier}"
location = data.azurerm_resource_group.rg.location
resource_group_name = data.azurerm_resource_group.rg.name
sku = var.la_sku
retention_in_days = 30
}
resource "azurerm_kubernetes_cluster" "aks" {
name = "${var.tla}-aks-${local.lookup_result}-${var.identifier}"
location = data.azurerm_resource_group.rg.location
resource_group_name = data.azurerm_resource_group.rg.name
dns_prefix = var.dns_prefix
kubernetes_version = var.kubernetes_version
sku_tier = var.sku_tier
private_cluster_enabled = var.enable_private_cluster
#api_server_authorized_ip_ranges = ""
default_node_pool {
name = "syspool001"
orchestrator_version = var.orchestrator_version
availability_zones = var.agents_availability_zones
enable_auto_scaling = true
node_count = var.default_pool_node_count
max_count = var.default_pool_max_node_count
min_count = var.default_pool_min_node_count
max_pods = var.default_pool_max_pod_count
vm_size = var.agents_size
enable_node_public_ip = false
os_disk_size_gb = var.default_pool_os_disk_size_gb
type = "VirtualMachineScaleSets"
vnet_subnet_id = var.vnet_subnet_id
node_labels = var.agents_labels
tags = merge(local.tags, var.agents_tags)
}
network_profile {
network_plugin = var.network_plugin
network_policy = var.network_policy
dns_service_ip = var.net_profile_dns_service_ip
docker_bridge_cidr = var.net_profile_docker_bridge_cidr
service_cidr = var.net_profile_service_cidr
}
role_based_access_control {
enabled = true
azure_active_directory {
managed = true
admin_group_object_ids = var.rbac_aad_admin_group_object_ids
}
}
identity {
type = "SystemAssigned"
}
addon_profile {
azure_policy {
enabled = true
}
http_application_routing {
enabled = false
}
oms_agent {
enabled = true
log_analytics_workspace_id = data.azurerm_log_analytics_workspace.log_analytics.id
}
}
tags = local.tags
lifecycle {
ignore_changes = [
default_node_pool
]
}
}
resource "azurerm_kubernetes_cluster_node_pool" "aksnp" {
lifecycle {
ignore_changes = [
node_count
]
}
for_each = var.additional_node_pools
kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id
name = each.value.node_os == "Windows" ? substr(each.key, 0, 6) : substr(each.key, 0, 12)
node_count = each.value.node_count
vm_size = each.value.vm_size
availability_zones = each.value.zones
max_pods = each.value.max_pods
os_disk_size_gb = each.value.os_disk_size_gb
os_type = each.value.node_os
vnet_subnet_id = var.vnet_subnet_id
node_taints = each.value.taints
enable_auto_scaling = each.value.cluster_auto_scaling
min_count = each.value.cluster_auto_scaling_min_count
max_count = each.value.cluster_auto_scaling_max_count
}
resource "kubernetes_namespace" "aks-namespace" {
metadata {
name = var.namespace
}
}
data.tf
data "azurerm_resource_group" "rg" {
name = var.resource_group_name
}
lookups.tf
locals {
environment_lookup = {
dev = "d"
test = "t"
int = "i"
prod = "p"
prd = "p"
uat = "a"
poc = "d"
dr = "r"
lab = "l"
}
lookup_result = lookup(local.environment_lookup, var.environment)
tags = merge(
data.azurerm_resource_group.rg.tags, {
Directory = "tectcompany.com",
PrivateDNSZone = var.private_dns_zone,
Immutable = "False",
ManagedOS = "True",
}
)
}
data "azurerm_log_analytics_workspace" "log_analytics" {
name = "abc-az-lad2"
resource_group_name = "abc-dev-aae"
}
variables.tf
variable "secondary_region" {
description = "Is this resource being deployed into the secondary (pair) region?"
default = false
type = bool
}
variable "override_log_analytics_workspace" {
description = "Override the vm log analytics workspace"
type = string
default = null
}
variable "override_log_analytics_resource_group_name" {
description = "Overrides the log analytics resource group name"
type = string
default = null
}
variable "environment" {
description = "The name of environment for the AKS Cluster"
type = string
default = "dev"
}
variable "identifier" {
description = "The identifier for the AKS Cluster"
type = number
default = "001"
}
variable "kubernetes_version" {
description = "Specify which Kubernetes release to use. The default used is the latest Kubernetes version available in the region"
type = string
default = "1.19.9"
}
variable "dns_prefix" {
description = "The dns prefix for the AKS Cluster"
type = string
default = "odessa-sandpit"
}
variable "orchestrator_version" {
description = "Specify which Kubernetes release to use for the orchestration layer. The default used is the latest Kubernetes version available in the region"
type = string
default = null
}
variable "agents_availability_zones" {
description = "(Optional) A list of Availability Zones across which the Node Pool should be spread. Changing this forces a new resource to be created."
type = list(string)
default = null
}
variable "agents_size" {
default = "Standard_D4s_v3"
description = "The default virtual machine size for the Kubernetes agents"
type = string
}
variable "vnet_subnet_id" {
description = "(Optional) The ID of a Subnet where the Kubernetes Node Pool should exist. Changing this forces a new resource to be created."
type = string
default = null
}
variable "agents_labels" {
description = "(Optional) A map of Kubernetes labels which should be applied to nodes in the Default Node Pool. Changing this forces a new resource to be created."
type = map(string)
default = {}
}
variable "agents_tags" {
description = "(Optional) A mapping of tags to assign to the Node Pool."
type = map(string)
default = {}
}
variable "net_profile_dns_service_ip" {
description = "(Optional) IP address within the Kubernetes service address range that will be used by cluster service discovery (kube-dns). Changing this forces a new resource to be created."
type = string
default = null
}
variable "net_profile_docker_bridge_cidr" {
description = "(Optional) IP address (in CIDR notation) used as the Docker bridge IP address on nodes. Changing this forces a new resource to be created."
type = string
default = null
}
variable "net_profile_service_cidr" {
description = "(Optional) The Network Range used by the Kubernetes service. Changing this forces a new resource to be created."
type = string
default = null
}
variable "rbac_aad_admin_group_object_ids" {
description = "Object ID of groups with admin access."
type = list(string)
default = null
}
variable "network_policy" {
description = "(Optional) The Network Policy to be used by the network profile of Azure Kubernetes Cluster."
type = string
default = "azure"
}
variable "network_plugin" {
description = "(Optional) The Network Plugin to be used by the network profile of Azure Kubernetes Cluster."
type = string
default = "azure"
}
variable "enable_private_cluster" {
description = "(Optional) Set this variable to true if you want Azure Kubernetes Cluster to be private."
default = true
}
variable "default_pool_node_count" {
description = "(Optional) The initial node count for the default pool of AKS Cluster"
type = number
default = 3
}
variable "default_pool_max_node_count" {
description = "(Optional) The max node count for the default pool of AKS Cluster"
type = number
default = 6
}
variable "default_pool_min_node_count" {
description = "(Optional) The min node count for the default pool of AKS Cluster"
type = number
default = 3
}
variable "default_pool_max_pod_count" {
description = "(Optional) The max pod count for the default pool of AKS Cluster"
type = number
default = 13
}
variable "default_pool_os_disk_size_gb" {
description = "(Optional) The size of os disk in gb for the nodes from default pool of AKS Cluster"
type = string
default = "64"
}
variable "additional_node_pools" {
type = map(object({
node_count = number
max_pods = number
os_disk_size_gb = number
vm_size = string
zones = list(string)
node_os = string
taints = list(string)
cluster_auto_scaling = bool
cluster_auto_scaling_min_count = number
cluster_auto_scaling_max_count = number
}))
}
variable "sku_tier" {
description = "(Optional)The SKU Tier that should be used for this Kubernetes Cluster, possible values Free or Paid"
type = string
default = "Paid"
validation {
condition = contains(["Free", "Paid"], var.sku_tier)
error_message = "SKU_TIER can only be either Paid or Free."
}
}
variable "la_sku" {
description = "(Optional)The SKU Tier that should be used for Log Analytics. Multiple values are possible."
type = string
default = "PerGB2018"
validation {
condition = contains(["Free", "PerNode", "Premium", "Standard", "Standalone", "Unlimited", "CapacityReservation", "PerGB2018"], var.la_sku)
error_message = "SKU_TIER for Log Analytics can be can only be either of Free, PerNode, Premium, Standard, Standalone, Unlimited, CapacityReservation and PerGB2018(Default Value)."
}
}
variable "resource_group_name" {
description = "Resource Group for deploying AKS Cluster"
type = string
}
variable "private_dns_zone" {
description = "DNS prefix for AKS Cluster"
type = string
default = "testcluster"
}
variable "tla" {
description = "Three Level acronym - three letter abbreviation for application"
type = string
default = ""
validation {
condition = length(var.tla) == 3
error_message = "The TLA should be precisely three characters."
}
}
variable "namespace"{
description = "AKS Namespace"
type = string
}
Finally, I am calling my module below to create the AKS cluster, LA, and Namespace for the AKS Cluster:
provider "azurerm" {
features {}
#version = "~> 2.53.0"
}
module "aks-cluster1" {
source = "../../"
resource_group_name = "pst-aks-sandpit-dev-1"
tla = "pqr"
additional_node_pools = {
pool1 = {
node_count = "1"
max_pods = "110"
os_disk_size_gb = "30"
vm_size = "Standard_D8s_v3"
zones = ["1","2","3"]
node_os = "Linux"
taints = ["kubernetes.io/os=windows:NoSchedule"]
cluster_auto_scaling = true
cluster_auto_scaling_min_count = "2"
cluster_auto_scaling_max_count = "4"
}
}
namespace = "sample-ns"
}
Problem: I get an error that no such host when terraform attempts to create the cluster.
I think that it is not able to connect to the cluster but I could be wrong. I do not know how it handles internally.
Error: Post "https://testdns-05885a32.145f13c0-25ce-43e4-ae46-8cbef448ecf3.privatelink.australiaeast.azmk8s.io:443/api/v1/namespaces": dial tcp: lookup testdns-05885a32.145f13c0-25ce-43e4-ae46-8cbef448ecf3.privatelink.australiaeast.azmk8s.io: no such host
Difficult to say what the issue is since the code you posted is incomplete. For starters, you shouldn't be doing this:
provider "kubernetes" {
config_path = "~/.kube/config"
}
The AKS URL you posted doesn't exist, so I think that's pulling and old cluster default from your kube config
I'm one of the maintainers of the Terraform Kubernetes provider, and I see this particular issue pretty often. As a former devops person myself, I empathize with the struggle I keep seeing in this area. It's something I would really love to fix in the provider, if it were possible.
The issue you're facing is a limitation in Terraform core when passing an unknown value to a provider configuration block. To quote their docs:
You can use expressions in the values of these configuration arguments,
but can only reference values that are known before the configuration is applied.
When you make a change to the underlying infrastructure, such the AKS cluster in this case, you're passing an unknown value into the Kubernetes provider configuration block, since the full scope of the cluster infrastructure is not known until after the change has been applied to the AKS cluster.
Although I did write the initial guide to show that it can be possible to work around some of these issues, as you've found from experience, there are many edge cases that make it an unreliable and unintuitive process, to get the Kubernetes provider working alongside the underlying infrastructure. This is due to a long-standing limitation in Terraform, that can't be fixed in any provider, but we do have plans to smooth out the bumps a little by adding better error messages upfront, which would have saved you some headache in this case.
To solve this particular type of problem, the cluster infrastructure needs to be kept in a state separate from the Kubernetes and Helm provider resources. I have an example here which builds an AKS cluster in one apply and then manages the Kubernetes/Helm resources in a second apply. You can use this approach to build the most robust configuration for you particular use case:
I know this two-apply approach is inconvenient, which is why we continue to try and accommodate users in single-apply scenarios, and scenarios which contain the Kubernetes and cluster resources in the same Terraform state. However, until upstream Terraform can add support for this, the single-apply workflow will remain buggy and less reliable than separating cluster infrastructure from Kubernetes resources.
Most cases can be worked around using depends_on
(to ensure the cluster is created before the Kubernetes resource), or by moving the cluster infrastructure into a separate module and running terraform state rm module.kubernetes-config
or terraform apply -target=module.aks-cluster
. But I think encouraging this kind of work-around will cause more headaches in the long run, as it puts the user in charge of figuring out when to use special one-off apply commands, rather than setting up Terraform to behave reliably and predictably from the start. Plus it can have unintended side-effects, like orphaning cloud resources.
Thanks for the additional detail. I see a few problems here. The first one is at the heart of your immediate problem:
variable "enable_private_cluster" {
description = "(Optional) Set this variable to true if you want Azure Kubernetes Cluster to be private."
default = true
}
Your cluster deployment is taking the default here, so your API endpoint is a private DNS entry in the zone privatelink.australiaeast.azmk8s.io
:
Post "https://testdns-05885a32.145f13c0-25ce-43e4-ae46-8cbef448ecf3.privatelink.australiaeast.azmk8s.io:443/api/v1/namespaces"
The terraform kubernetes provider must be able to reach the API endpoint in order to deploy the namespace. However, it is unable to resolve the domain. For this to work, you will need to ensure that:
Azure privatelink and private DNS can be non-trivial to configure correctly, especially in a complex networking environment. So, you may encounter additional hurdles that I haven't covered here.
Alternatively, you may wish to deploy this cluster without using privatelink by setting this module option to false. This may be undesirable for security and compliance reasons, so be sure you understand what you're doing here:
enable_private_cluster = false
The next issue I encountered is:
Error: creating Managed Kubernetes Cluster "pqr-aks-d-1" (Resource Group "pst-aks-sandpit-dev-1"): containerservice.ManagedClustersClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: Code="InsufficientAgentPoolMaxPodsPerAgentPool" Message="The AgentPoolProfile 'syspool001' has an invalid total maxPods(maxPods per node * node count), the total maxPods(13 * 824668498368) should be larger than 30. Please refer to aka.ms/aks-min-max-pod for more detail." Target="agentPoolProfile.kubernetesConfig.kubeletConfig.maxPods"
I overcame that by setting:
default_pool_max_pod_count = 30
The last issue is that you need to configure the kubernetes provider to have sufficient privileges to deploy the namespace:
│ Error: Unauthorized
│
│ with module.aks-cluster1.kubernetes_namespace.aks-namespace,
│ on ../../main.tf line 103, in resource "kubernetes_namespace" "aks-namespace":
│ 103: resource "kubernetes_namespace" "aks-namespace" {
One way to accomplish that is to use kube_admin_config instead of kube_config:
provider "kubernetes" {
#load_config_file = "false"
host = azurerm_kubernetes_cluster.aks.kube_admin_config.0.host
username = azurerm_kubernetes_cluster.aks.kube_admin_config.0.username
password = azurerm_kubernetes_cluster.aks.kube_admin_config.0.password
client_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_certificate)
client_key = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.client_key)
cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.aks.kube_admin_config.0.cluster_ca_certificate)
}