@@ -26,18 +26,29 @@ install_nvidia_driver_amzn2() {
2626 # Purge any nvidia driver installed from RHEL repo
2727 sudo yum remove -y nvidia-driver-latest-dkms
2828
29+ # Try to gather more information about the runner and its existing NVIDIA driver if any
30+ echo " Before installing NVIDIA driver"
31+ lspci
32+ lsmod
33+ modinfo nvidia || true
34+
2935 HAS_NVIDIA_DRIVER=0
3036 # Check if NVIDIA driver has already been installed
3137 if [ -x " $( command -v nvidia-smi) " ]; then
38+ set +e
3239 # The driver exists, check its version next
3340 INSTALLED_DRIVER_VERSION=$( nvidia-smi --query-gpu=driver_version --format=csv,noheader)
41+ NVIDIA_SMI_STATUS=$?
3442
35- if [ " $INSTALLED_DRIVER_VERSION " != " $DRIVER_VERSION " ]; then
43+ if [ " $NVIDIA_SMI_STATUS " -ne 0 ] && [ " $NVIDIA_SMI_STATUS " -ne 14 ]; then
44+ echo " Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION ). Continuing"
45+ elif [ " $INSTALLED_DRIVER_VERSION " != " $DRIVER_VERSION " ]; then
3646 echo " NVIDIA driver ($INSTALLED_DRIVER_VERSION ) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
3747 else
3848 HAS_NVIDIA_DRIVER=1
3949 echo " NVIDIA driver ($INSTALLED_DRIVER_VERSION ) has already been installed. Skipping NVIDIA driver installation"
4050 fi
51+ set -e
4152 fi
4253
4354 if [ " $HAS_NVIDIA_DRIVER " -eq 0 ]; then
@@ -51,17 +62,25 @@ install_nvidia_driver_amzn2() {
5162 sudo rm -fv /tmp/nvidia_driver
5263 fi
5364
65+ sudo modprobe nvidia || true
66+ echo " After installing NVIDIA driver"
67+ lspci
68+ lsmod
69+ modinfo nvidia || true
70+
5471 (
5572 set +e
5673 nvidia-smi
57- status=$?
74+ NVIDIA_SMI_STATUS=$?
75+
5876 # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
59- if [ $status -eq 0 ] || [ $status -eq 14 ]; then
60- echo " INFO: Ignoring allowed status ${status } "
77+ if [ " $NVIDIA_SMI_STATUS " -eq 0 ] || [ " $NVIDIA_SMI_STATUS " -eq 14 ]; then
78+ echo " INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS } "
6179 else
62- echo " ERROR: nvidia-smi exited with unresolved status ${status } "
63- exit ${status }
80+ echo " ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS } "
81+ exit ${NVIDIA_SMI_STATUS }
6482 fi
83+ set -e
6584 )
6685 )
6786}
0 commit comments