컴퓨터

CentOS7 horovod 설치

돌하니 2020. 3. 18. 14:32

# OS 버전
cat /etc/centos-release
CentOS Linux release 7.7.1908 (Core)

# cuda yum repository file
cat /etc/yum.repos.d/cuda.repo
[cuda]
name=cuda
baseurl=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64
#baseurl=http://mirror.edison.re.kr/cuda/repos/rhel7/x86_64
enabled=1
gpgcheck=1
gpgkey=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64/7fa2af80.pub

# CUDA 설치 (toolkit 및 driver 포함, 10.0 버전이 필요함)

yum install cuda-10-0

yum list installed | grep cuda
cuda-10-0.x86_64                       10.0.130-1                 @cuda
cuda-command-line-tools-10-0.x86_64    10.0.130-1                 @cuda
cuda-compiler-10-0.x86_64              10.0.130-1                 @cuda
cuda-cublas-10-0.x86_64                10.0.130-1                 @cuda
cuda-cublas-dev-10-0.x86_64            10.0.130-1                 @cuda
cuda-cudart-10-0.x86_64                10.0.130-1                 @cuda
cuda-cudart-dev-10-0.x86_64            10.0.130-1                 @cuda
cuda-cufft-10-0.x86_64                 10.0.130-1                 @cuda
cuda-cufft-dev-10-0.x86_64             10.0.130-1                 @cuda
cuda-cuobjdump-10-0.x86_64             10.0.130-1                 @cuda
cuda-cupti-10-0.x86_64                 10.0.130-1                 @cuda
cuda-curand-10-0.x86_64                10.0.130-1                 @cuda
cuda-curand-dev-10-0.x86_64            10.0.130-1                 @cuda
cuda-cusolver-10-0.x86_64              10.0.130-1                 @cuda
cuda-cusolver-dev-10-0.x86_64          10.0.130-1                 @cuda
cuda-cusparse-10-0.x86_64              10.0.130-1                 @cuda
cuda-cusparse-dev-10-0.x86_64          10.0.130-1                 @cuda
cuda-demo-suite-10-0.x86_64            10.0.130-1                 @cuda
cuda-documentation-10-0.x86_64         10.0.130-1                 @cuda
cuda-driver-dev-10-0.x86_64            10.0.130-1                 @cuda
cuda-drivers.x86_64                    440.64.00-1                @cuda
cuda-gdb-10-0.x86_64                   10.0.130-1                 @cuda
cuda-gpu-library-advisor-10-0.x86_64   10.0.130-1                 @cuda
cuda-libraries-10-0.x86_64             10.0.130-1                 @cuda
cuda-libraries-dev-10-0.x86_64         10.0.130-1                 @cuda
cuda-license-10-0.x86_64               10.0.130-1                 @cuda
cuda-memcheck-10-0.x86_64              10.0.130-1                 @cuda
cuda-misc-headers-10-0.x86_64          10.0.130-1                 @cuda
cuda-npp-10-0.x86_64                   10.0.130-1                 @cuda
cuda-npp-dev-10-0.x86_64               10.0.130-1                 @cuda
cuda-nsight-10-0.x86_64                10.0.130-1                 @cuda
cuda-nsight-compute-10-0.x86_64        10.0.130-1                 @cuda
cuda-nvcc-10-0.x86_64                  10.0.130-1                 @cuda
cuda-nvdisasm-10-0.x86_64              10.0.130-1                 @cuda
cuda-nvgraph-10-0.x86_64               10.0.130-1                 @cuda
cuda-nvgraph-dev-10-0.x86_64           10.0.130-1                 @cuda
cuda-nvjpeg-10-0.x86_64                10.0.130.1-1               @cuda
cuda-nvjpeg-dev-10-0.x86_64            10.0.130.1-1               @cuda
cuda-nvml-dev-10-0.x86_64              10.0.130-1                 @cuda
cuda-nvprof-10-0.x86_64                10.0.130-1                 @cuda
cuda-nvprune-10-0.x86_64               10.0.130-1                 @cuda
cuda-nvrtc-10-0.x86_64                 10.0.130-1                 @cuda
cuda-nvrtc-dev-10-0.x86_64             10.0.130-1                 @cuda
cuda-nvtx-10-0.x86_64                  10.0.130-1                 @cuda
cuda-nvvp-10-0.x86_64                  10.0.130-1                 @cuda
cuda-runtime-10-0.x86_64               10.0.130-1                 @cuda
cuda-samples-10-0.x86_64               10.0.130-1                 @cuda
cuda-toolkit-10-0.x86_64               10.0.130-1                 @cuda
cuda-tools-10-0.x86_64                 10.0.130-1                 @cuda
cuda-visual-tools-10-0.x86_64          10.0.130-1                 @cuda
libcudnn7.x86_64                       7.6.5.33-1.cuda10.2        @/libcudnn7-7.6.5.33-1.cuda10.2.x86_64
libnccl.x86_64                         2.5.6-1+cuda10.0           @nccl-2.5.6-ga-cuda10.0
libnccl-devel.x86_64                   2.5.6-1+cuda10.0           @nccl-2.5.6-ga-cuda10.0
libnccl-static.x86_64                  2.5.6-1+cuda10.0           @nccl-2.5.6-ga-cuda10.0
nccl-repo-rhel7-2.5.6-ga-cuda10.0.x86_64
                                       1-1                        @/nccl-repo-rhel7-2.5.6-ga-cuda10.0-1-1.x86_64
nvidia-kmod.x86_64                     1:440.64.00-2.el6          @cuda
xorg-x11-drv-nvidia.x86_64             1:440.64.00-1.el6          @cuda
xorg-x11-drv-nvidia-devel.x86_64       1:440.64.00-1.el6          @cuda
xorg-x11-drv-nvidia-gl.x86_64          1:440.64.00-1.el6          @cuda
xorg-x11-drv-nvidia-libs.x86_64        1:440.64.00-1.el6          @cuda

# cudart 라이브러리 설치 위치 확인
rpm -qil cuda-cudart-10-0

/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.130


# openmpi 설치 https://www.open-mpi.org/software/ompi/v3.1/
wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.0.tar.gz
tar xzvf openmpi-3.1.0.tar.gz
cd openmpi-3.1.0
./configure --prefix=/usr/local/openmpi
make
make install

 

# /etc/bashrc 에 아래 내용을 추가

export LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
export PATH=/usr/local/openmpi/bin:$PATH

# nccl 설치
https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#rhel_centos

# 위 페이지에서 nccl-repo-rhel7-2.5.6-ga-cuda10.0-1-1.x86_64.rpm 파일을 다운로드한 뒤 아래 명령으로 설치
yum install nccl-repo-rhel7-2.5.6-ga-cuda10.0-1-1.x86_64.rpm
yum install libnccl libnccl-devel libnccl-static

 

# nccl 라이브러리 설치 위치 확인
rpm -qil libnccl-devel

/usr/include/nccl.h
/usr/lib64/libnccl.so


# Anaconda3 설치 후 conda 가상환경 생성 및 패키지 설치

conda create -n mytensorflow
conda activate mytensorflow 

conda install tensorflow-gpu==1.13.1 python keras cudnn cudatoolkit=10.0 gcc_linux-64 gxx_linux-64

# 환경변수는 pip 명령과 같은 줄(또는 export)로 지정해야 빌드 시 적용됨
# HOROVOD_CUDA_HOME 은 include, lib64 디렉토리가 있는 CUDA 최상위 경로를 지정
HOROVOD_CUDA_HOME=/usr/local/cuda-10.0 \
HOROVOD_GPU_ALLREDUCE=NCCL \
HOROVOD_WITH_TENSORFLOW=1 \
pip install --no-cache-dir horovod==0.18.2


horovodrun -np 2 -H localhost:2 python tensorflow_mnist.py

또는

mpirun -np 2 --allow-run-as-root     -bind-to none -map-by slot     -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH     -mca pml ob1 -mca btl ^openib  python tensorflow_mnist.py 

 

GPU가 2개씩 설치된 머신 2대에서 실행하려면 아래와 같이 입력
time horovodrun -np 4 -H node1:2,node2:2 python tensorflow_mnist.py 

openmpi 대신 gloo 를 사용할 수도 있음 (gloo 빌드에 cmake 가 필요함)
conda install cmake
HOROVOD_WITH_GLOO=1 pip install --no-cache-dir horovod==0.18.2
horovodrun --gloo -np 2 -H localhost:2 python tensorflow_mnist.py



참고:
KISTI 슈퍼컴 Conda 환경 기반 Horovod-tensorflow 설치방법
https://helpdesk.ksc.re.kr/jcs/hw/?jcskey=56

https://github.com/horovod/horovod#install

Tensorflow에 Horovod 사용하기
https://jinwooklim.github.io/development/Horovod_tf/

Setting up Horovod + Keras for Multi-GPU training
https://lambdalabs.com/blog/horovod-keras-for-multi-gpu-training/


http://solarisailab.com/archives/2627

 

https://raw.githubusercontent.com/horovod/horovod/master/examples/tensorflow_mnist.py


Horovod를 활용하여 Tensorflow에서 Multi GPU로 학습하기 (BERT Distributed Training 해보기)
https://y-rok.github.io/deep%20learning/2019/12/19/horovod-tensorflow.html

Open MPI 에러 발생 시 설정 방법
http://djrhee08.egloos.com/1214959