I've been trying to set up an Airflow environment on Kubernetes (v1.13.11-gke.14), with a MySQL DB as the metadata database and KubernetesExecutor as the core executor. I bound a serviceAccount with all privileges to the Airflow deployment and configured the same account as worker_service_account_name as well.
Now, when I trigger Airflow, the scheduler throws a KubernetesJobWatcher exception with the following logs:
[2019-12-13 04:40:40,919] {{kubernetes_executor.py:335}} ERROR - Unknown error in KubernetesJobWatcher. Failing
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 313, in recv_into
return self.connection.recv_into(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1840, in recv_into
self._raise_ssl_error(self._ssl, result)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1646, in _raise_ssl_error
raise WantReadError()
OpenSSL.SSL.WantReadError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 425, in _error_catcher
yield
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 752, in read_chunked
self._update_chunk_length()
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 682, in _update_chunk_length
line = self._fp.fp.readline()
File "/usr/lib/python3.5/socket.py", line 576, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 326, in recv_into
raise timeout("The read operation timed out")
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 333, in run
self.worker_uuid, self.kube_config)
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 357, in _run
**kwargs):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 144, in stream
for line in iter_resp_lines(resp):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 48, in iter_resp_lines
for seg in resp.read_chunked(decode_content=False):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 781, in read_chunked
self._original_response.close()
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 430, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='172.20.0.1', port=443): Read timed out.
Process KubernetesJobWatcher-1467:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 313, in recv_into
return self.connection.recv_into(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1840, in recv_into
self._raise_ssl_error(self._ssl, result)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1646, in _raise_ssl_error
raise WantReadError()
OpenSSL.SSL.WantReadError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 425, in _error_catcher
yield
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 752, in read_chunked
self._update_chunk_length()
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 682, in _update_chunk_length
line = self._fp.fp.readline()
File "/usr/lib/python3.5/socket.py", line 576, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 326, in recv_into
raise timeout("The read operation timed out")
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 333, in run
self.worker_uuid, self.kube_config)
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 357, in _run
**kwargs):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 144, in stream
for line in iter_resp_lines(resp):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 48, in iter_resp_lines
for seg in resp.read_chunked(decode_content=False):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 781, in read_chunked
self._original_response.close()
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 430, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='172.20.0.1', port=443): Read timed out.
10.186.229.193 - - [13/Dec/2019:04:40:41 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
[2019-12-13 04:40:42,838] {{kubernetes_executor.py:440}} ERROR - Error while health checking kube watcher process. Process died for unknown reasons
[2019-12-13 04:40:42,864] {{kubernetes_executor.py:344}} INFO - Event: and now my watch begins starting at resource_version: 0
10.186.229.193 - - [13/Dec/2019:04:40:48 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
[2019-12-13 04:40:50 -0600] [49] [INFO] Handling signal: ttin
[2019-12-13 04:40:50 -0600] [24285] [INFO] Booting worker with pid: 24285
10.186.229.193 - - [13/Dec/2019:04:40:51 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
[2019-12-13 04:40:51,522] {{__init__.py:51}} INFO - Using executor KubernetesExecutor
[2019-12-13 04:40:51,528] {{dagbag.py:92}} INFO - Filling up the DagBag from /usr/local/airflow/dags
[2019-12-13 04:40:52 -0600] [49] [INFO] Handling signal: ttou
[2019-12-13 04:40:52 -0600] [24251] [INFO] Worker exiting (pid: 24251)
10.186.229.193 - - [13/Dec/2019:04:40:57 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:01 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:07 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:11 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:18 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:21 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
[2019-12-13 04:41:23 -0600] [49] [INFO] Handling signal: ttin
[2019-12-13 04:41:23 -0600] [24290] [INFO] Booting worker with pid: 24290
[2019-12-13 04:41:24,024] {{__init__.py:51}} INFO - Using executor KubernetesExecutor
[2019-12-13 04:41:24,029] {{dagbag.py:92}} INFO - Filling up the DagBag from /usr/local/airflow/dags
[2019-12-13 04:41:25 -0600] [49] [INFO] Handling signal: ttou
[2019-12-13 04:41:25 -0600] [24256] [INFO] Worker exiting (pid: 24256)
10.186.229.193 - - [13/Dec/2019:04:41:27 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:31 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:37 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
10.186.229.193 - - [13/Dec/2019:04:41:41 -0600] "GET /admin/ HTTP/1.1" 200 115845 "-" "kube-probe/1.13+"
[2019-12-13 04:41:42,944] {{kubernetes_executor.py:335}} ERROR - Unknown error in KubernetesJobWatcher. Failing
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 313, in recv_into
return self.connection.recv_into(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1840, in recv_into
self._raise_ssl_error(self._ssl, result)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1646, in _raise_ssl_error
raise WantReadError()
OpenSSL.SSL.WantReadError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 425, in _error_catcher
yield
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 752, in read_chunked
self._update_chunk_length()
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 682, in _update_chunk_length
line = self._fp.fp.readline()
File "/usr/lib/python3.5/socket.py", line 576, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 326, in recv_into
raise timeout("The read operation timed out")
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 333, in run
self.worker_uuid, self.kube_config)
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 357, in _run
**kwargs):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 144, in stream
for line in iter_resp_lines(resp):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 48, in iter_resp_lines
for seg in resp.read_chunked(decode_content=False):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 781, in read_chunked
self._original_response.close()
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 430, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='172.20.0.1', port=443): Read timed out.
Process KubernetesJobWatcher-1468:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 313, in recv_into
return self.connection.recv_into(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1840, in recv_into
self._raise_ssl_error(self._ssl, result)
File "/usr/local/lib/python3.5/dist-packages/OpenSSL/SSL.py", line 1646, in _raise_ssl_error
raise WantReadError()
OpenSSL.SSL.WantReadError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 425, in _error_catcher
yield
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 752, in read_chunked
self._update_chunk_length()
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 682, in _update_chunk_length
line = self._fp.fp.readline()
File "/usr/lib/python3.5/socket.py", line 576, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.5/dist-packages/urllib3/contrib/pyopenssl.py", line 326, in recv_into
raise timeout("The read operation timed out")
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 333, in run
self.worker_uuid, self.kube_config)
File "/usr/local/lib/python3.5/dist-packages/airflow/contrib/executors/kubernetes_executor.py", line 357, in _run
**kwargs):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 144, in stream
for line in iter_resp_lines(resp):
File "/usr/local/lib/python3.5/dist-packages/kubernetes/watch/watch.py", line 48, in iter_resp_lines
for seg in resp.read_chunked(decode_content=False):
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 781, in read_chunked
self._original_response.close()
File "/usr/lib/python3.5/contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python3.5/dist-packages/urllib3/response.py", line 430, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='172.20.0.1', port=443): Read timed out.
I've tried commenting out the request_timeout setting, as some answers suggested, so that the scheduler waits indefinitely for the Kubernetes API response, but the issue still persists.
I would appreciate any suggestions on how to solve it.
Environment information:
Kubernetes versions used: v1.13.11-gke.14 and v1.16
Platform: GKE and standalone VMs
Airflow versions used: 1.10.6 / 1.10.5 / 1.10.6rc2