Bootstrap

CentOS 8 安装NVIDIA显卡驱动,运行机器学习代码

 官方安装文档:NVIDIA Driver Installation Quickstart Guide :: NVIDIA Tesla Documentation

1.查看默认显卡驱动 

[root@localhost ~]# lsmod |grep nouveau
nouveau              2351104  0
mxm_wmi                16384  1 nouveau
wmi                    32768  2 mxm_wmi,nouveau
video                  49152  1 nouveau
i2c_algo_bit           16384  1 nouveau
drm_kms_helper        266240  5 drm_vram_helper,bochs_drm,nouveau
drm_ttm_helper         16384  3 drm_vram_helper,bochs_drm,nouveau
ttm                    73728  3 drm_vram_helper,drm_ttm_helper,nouveau
drm                   585728  7 drm_kms_helper,drm_vram_helper,bochs_drm,drm_ttm_helper,ttm,nouveau

2.安装依赖环境

# Build tools and libraries used when building/installing the NVIDIA kernel module.
dnf install -y tar bzip2 make automake gcc gcc-c++ pciutils elfutils-libelf-devel libglvnd-devel
# Enable the EPEL repository.
dnf install -y epel-release
# Register NVIDIA's CUDA repository for this distribution and CPU architecture.
# The repo definition is fetched over HTTPS rather than plain HTTP so it cannot
# be altered in transit.
distribution=rhel8
ARCH="$(/bin/arch)"
dnf config-manager --add-repo "https://developer.download.nvidia.com/compute/cuda/repos/${distribution}/${ARCH}/cuda-${distribution}.repo"
# kernel-devel / kernel-headers are requested for exactly the running kernel
# version (uname -r).
dnf install -y "kernel-devel-$(uname -r)" "kernel-headers-$(uname -r)"
dnf install -y kernel kernel-core kernel-modules

 3.安装NVIDIA显卡驱动

# Drop dnf's cached data before installing.
dnf clean all
# Install the NVIDIA driver from the nvidia-driver module, latest-dkms stream.
dnf -y module install nvidia-driver:latest-dkms

安装之后,需要重启机器,使显卡驱动生效

[root@localhost ~]# lsmod|grep nouveau
[root@localhost ~]# lsmod|grep nvidia
nvidia_drm             69632  0
nvidia_modeset       1183744  1 nvidia_drm
nvidia_uvm           1339392  0
nvidia              55173120  2 nvidia_uvm,nvidia_modeset
drm_kms_helper        266240  5 drm_vram_helper,bochs_drm,nvidia_drm
drm                   585728  8 drm_kms_helper,drm_vram_helper,bochs_drm,nvidia,drm_ttm_helper,nvidia_drm,ttm

可以看到,默认显卡驱动已经没了,换成了NVIDIA显卡驱动

查看显卡信息

[root@localhost ~]# nvidia-smi 
Tue Oct 25 10:41:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:00:10.0 Off |                  N/A |
| 20%   40C    P0    N/A /  75W |      0MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

4.安装NVIDIA CUDA工具包(包含CUPTI),当前最新为11.8版本,大概下载3.8G

dnf install cuda-toolkit-11-8 -y

5.安装NVIDIA cuDNN包(libcudnn8,即cuDNN 8.x,与CUDA 11.8配套),需下载681M

dnf install libcudnn8 -y

6.安装Python环境进行测试

dnf install python39 python39-devel -y

 创建虚拟环境,安装tensorflow

# Create a virtual environment named py39 in the current directory.
python3.9 -m venv py39
# Activate it in the current shell (must be sourced, not executed).
source py39/bin/activate
# Install TensorFlow into the activated environment.
pip install tensorflow

通过tensorflow查看可使用设备

python -c 'import tensorflow as tf;print(tf.config.experimental.list_physical_devices())'
(py39) [root@localhost ~]# python
Python 3.9.13 (main, Jun 24 2022, 15:32:51) 
[GCC 8.5.0 20210514 (Red Hat 8.5.0-13)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
2022-10-25 11:39:46.930430: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-25 11:39:47.056206: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-25 11:39:47.602293: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-10-25 11:39:47.602366: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-10-25 11:39:47.602381: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
>>> tf.config.experimental.list_physical_devices()
2022-10-25 11:40:00.293825: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 11:40:00.307989: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 11:40:00.308201: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

测试下GPU速度

1.打开一个窗口,输入命令,实时监控GPU信息

watch -n1 'nvidia-smi'

 2.新建test.py文件,输入以下内容,然后运行

import tensorflow as tf
import timeit
 
def cpu_run():
    """Multiply two random-normal matrices inside a tf.device('/cpu:0') scope.

    Operands are freshly sampled on every call with shapes [10000, 1000] and
    [1000, 2000]; the matrix product is returned.
    """
    with tf.device('/cpu:0'):
        lhs = tf.random.normal([10000, 1000])
        rhs = tf.random.normal([1000, 2000])
        return tf.matmul(lhs, rhs)
 
 
def gpu_run():
    """Multiply two random-normal matrices inside a tf.device('/gpu:0') scope.

    Operands are freshly sampled on every call with shapes [10000, 1000] and
    [1000, 2000]; the matrix product is returned.
    """
    with tf.device('/gpu:0'):
        lhs = tf.random.normal([10000, 1000])
        rhs = tf.random.normal([1000, 2000])
        return tf.matmul(lhs, rhs)
 
 
# Warm-up: call each workload once without timing it, so that one-time
# start-up work done on first use is not counted in the measurement below.
cpu_run()
gpu_run()

# Timed runs: timeit.timeit returns the total seconds taken by `number` calls.
cpu_time = timeit.timeit(cpu_run, number=10)
gpu_time = timeit.timeit(gpu_run, number=10)
print("cpu:", cpu_time, "  gpu:", gpu_time)

运行test.py:

python test.py

 

;