CentOS7 horovod 설치
# OS 버전
cat /etc/centos-release
CentOS Linux release 7.7.1908 (Core)
# cuda yum repository file
cat /etc/yum.repos.d/cuda.repo
[cuda]
name=cuda
baseurl=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64
#baseurl=http://mirror.edison.re.kr/cuda/repos/rhel7/x86_64
enabled=1
gpgcheck=1
gpgkey=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64/7fa2af80.pub
# cuda driver 설치 10.0 필요함
yum install cuda-10-0
yum list installed | grep cuda
cuda-10-0.x86_64 10.0.130-1 @cuda
cuda-command-line-tools-10-0.x86_64 10.0.130-1 @cuda
cuda-compiler-10-0.x86_64 10.0.130-1 @cuda
cuda-cublas-10-0.x86_64 10.0.130-1 @cuda
cuda-cublas-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-cudart-10-0.x86_64 10.0.130-1 @cuda
cuda-cudart-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-cufft-10-0.x86_64 10.0.130-1 @cuda
cuda-cufft-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-cuobjdump-10-0.x86_64 10.0.130-1 @cuda
cuda-cupti-10-0.x86_64 10.0.130-1 @cuda
cuda-curand-10-0.x86_64 10.0.130-1 @cuda
cuda-curand-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-cusolver-10-0.x86_64 10.0.130-1 @cuda
cuda-cusolver-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-cusparse-10-0.x86_64 10.0.130-1 @cuda
cuda-cusparse-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-demo-suite-10-0.x86_64 10.0.130-1 @cuda
cuda-documentation-10-0.x86_64 10.0.130-1 @cuda
cuda-driver-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-drivers.x86_64 440.64.00-1 @cuda
cuda-gdb-10-0.x86_64 10.0.130-1 @cuda
cuda-gpu-library-advisor-10-0.x86_64 10.0.130-1 @cuda
cuda-libraries-10-0.x86_64 10.0.130-1 @cuda
cuda-libraries-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-license-10-0.x86_64 10.0.130-1 @cuda
cuda-memcheck-10-0.x86_64 10.0.130-1 @cuda
cuda-misc-headers-10-0.x86_64 10.0.130-1 @cuda
cuda-npp-10-0.x86_64 10.0.130-1 @cuda
cuda-npp-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-nsight-10-0.x86_64 10.0.130-1 @cuda
cuda-nsight-compute-10-0.x86_64 10.0.130-1 @cuda
cuda-nvcc-10-0.x86_64 10.0.130-1 @cuda
cuda-nvdisasm-10-0.x86_64 10.0.130-1 @cuda
cuda-nvgraph-10-0.x86_64 10.0.130-1 @cuda
cuda-nvgraph-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-nvjpeg-10-0.x86_64 10.0.130.1-1 @cuda
cuda-nvjpeg-dev-10-0.x86_64 10.0.130.1-1 @cuda
cuda-nvml-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-nvprof-10-0.x86_64 10.0.130-1 @cuda
cuda-nvprune-10-0.x86_64 10.0.130-1 @cuda
cuda-nvrtc-10-0.x86_64 10.0.130-1 @cuda
cuda-nvrtc-dev-10-0.x86_64 10.0.130-1 @cuda
cuda-nvtx-10-0.x86_64 10.0.130-1 @cuda
cuda-nvvp-10-0.x86_64 10.0.130-1 @cuda
cuda-runtime-10-0.x86_64 10.0.130-1 @cuda
cuda-samples-10-0.x86_64 10.0.130-1 @cuda
cuda-toolkit-10-0.x86_64 10.0.130-1 @cuda
cuda-tools-10-0.x86_64 10.0.130-1 @cuda
cuda-visual-tools-10-0.x86_64 10.0.130-1 @cuda
libcudnn7.x86_64 7.6.5.33-1.cuda10.2 @/libcudnn7-7.6.5.33-1.cuda10.2.x86_64
libnccl.x86_64 2.5.6-1+cuda10.0 @nccl-2.5.6-ga-cuda10.0
libnccl-devel.x86_64 2.5.6-1+cuda10.0 @nccl-2.5.6-ga-cuda10.0
libnccl-static.x86_64 2.5.6-1+cuda10.0 @nccl-2.5.6-ga-cuda10.0
nccl-repo-rhel7-2.5.6-ga-cuda10.0.x86_64 1-1 @/nccl-repo-rhel7-2.5.6-ga-cuda10.0-1-1.x86_64
nvidia-kmod.x86_64 1:440.64.00-2.el6 @cuda
xorg-x11-drv-nvidia.x86_64 1:440.64.00-1.el6 @cuda
xorg-x11-drv-nvidia-devel.x86_64 1:440.64.00-1.el6 @cuda
xorg-x11-drv-nvidia-gl.x86_64 1:440.64.00-1.el6 @cuda
xorg-x11-drv-nvidia-libs.x86_64 1:440.64.00-1.el6 @cuda
# cudart 라이브러리 설치 위치 확인
rpm -qil cuda-cudart-10-0
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.130
# openmpi 설치 https://www.open-mpi.org/software/ompi/v3.1/
wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.0.tar.gz
tar xzvf openmpi-3.1.0.tar.gz
cd openmpi-3.1.0
./configure --prefix=/usr/local/openmpi
make
make install
# /etc/bashrc 에 아래 내용 추가 (configure 의 --prefix 경로와 일치해야 함)
export LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
export PATH=/usr/local/openmpi/bin:$PATH
# nccl 설치
https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#rhel_centos
위 페이지에서 nccl-repo-rhel7-2.5.6-ga-cuda10.0-1-1.x86_64.rpm 파일을 다운로드
yum install nccl-repo-rhel7-2.5.6-ga-cuda10.0-1-1.x86_64.rpm
yum install libnccl libnccl-devel libnccl-static
# nccl 라이브러리 설치 위치 확인
rpm -qil libnccl-devel
/usr/include/nccl.h
/usr/lib64/libnccl.so
# Anaconda3 설치 후 conda 환경 생성 및 horovod 설치
conda create -n mytensorflow
conda activate mytensorflow
conda install tensorflow-gpu==1.13.1 python keras cudnn cudatoolkit=10.0 gcc_linux-64 gxx_linux-64
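# HOROVOD_* 변수는 pip 빌드 프로세스에 전달되도록 export 해야 함
# HOROVOD_CUDA_HOME 은 include, lib64 디렉터리가 있는 CUDA 설치 루트를 지정
export HOROVOD_CUDA_HOME=/usr/local/cuda-10.0
export HOROVOD_GPU_ALLREDUCE=NCCL
export HOROVOD_WITH_TENSORFLOW=1
pip install --no-cache-dir horovod==0.18.2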
horovodrun -np 2 -H localhost:2 python tensorflow_mnist.py
또는
mpirun -np 2 --allow-run-as-root -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib python tensorflow_mnist.py
GPU가 2개씩 설치된 머신 2대에서 실행시키고자 할 경우 아래와 같이 입력
time horovodrun -np 4 -H node1:2,node2:2 python tensorflow_mnist.py
openmpi 대신 gloo 사용 가능
conda install cmake
HOROVOD_WITH_GLOO=1 pip install --no-cache-dir horovod==0.18.2
horovodrun --gloo -np 2 -H localhost:2 python tensorflow_mnist.py
참고:
KISTI 슈퍼컴 Conda 환경 기반 Horovod-tensorflow 설치방법
https://helpdesk.ksc.re.kr/jcs/hw/?jcskey=56
https://github.com/horovod/horovod#install
Tensorflow에 Horovod 사용하기
https://jinwooklim.github.io/development/Horovod_tf/
Setting up Horovod + Keras for Multi-GPU training
https://lambdalabs.com/blog/horovod-keras-for-multi-gpu-training/
http://solarisailab.com/archives/2627
https://raw.githubusercontent.com/horovod/horovod/master/examples/tensorflow_mnist.py
Horovod를 활용하여 Tensorflow에서 Multi GPU로 학습하기 (BERT Distributed Training 해보기)
https://y-rok.github.io/deep%20learning/2019/12/19/horovod-tensorflow.html
Open MPI 에러 발생 시 설정법
http://djrhee08.egloos.com/1214959