Use nvidia tensorflow image; check NUMA settings

This commit is contained in:
Erik Stambaugh 2024-03-02 08:55:39 -08:00
parent f335235b28
commit db11c15897
3 changed files with 73 additions and 25 deletions

View file

@ -7,16 +7,26 @@ up: run
# Bring the compose stack up with `docker compose up`.
# Lists `build` as a prerequisite, so the images are built before the stack starts.
run: build
docker compose up
# Build the compose images with `docker compose build`.
# Prerequisites: the Jupyter Dockerfile and the CUDA installer package jupyter/cuda.deb.
build: jupyter/Dockerfile jupyter/cuda.deb
docker compose build
# Download the CUDA 12.0.0 local-installer package for Debian 11.
# The file is written to the rule's own target path ($@ = jupyter/cuda.deb) so
# that make can see it is up to date and so that it lands inside the ./jupyter
# build context; writing it to ./cuda.deb would leave the target missing and
# trigger a fresh download on every run.
# -f makes curl fail on an HTTP error instead of saving the error page as the
# .deb; -L follows redirects.
jupyter/cuda.deb:
	curl -fL https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda-repo-debian11-12-0-local_12.0.0-525.60.13-1_amd64.deb -o $@
# Build the compose images, passing `--progress plain` to `docker compose build`.
# Prerequisite: the Jupyter Dockerfile.
build: jupyter/Dockerfile
docker compose build --progress plain
# Tear the compose stack down with `docker compose down`.
down:
docker compose down
.PHONY: default up run down build
# precheck: verify that the GPU's PCI device reports NUMA node 0.
#
# all of this assumes there's a single Nvidia GPU (only the first matching
# lspci entry is inspected) living in PCI domain 0000.
#
# The whole check runs as ONE shell command (lines joined with `\`) because
# each recipe line otherwise gets its own shell and the variables would be
# lost. Shell variables are written `$$name` so make passes a literal `$` to
# the shell; a bare `$1` would be expanded by make to an empty string and
# break the awk program. Only POSIX `[ ... ]` tests are used, so the recipe
# works with make's default /bin/sh.
#
# Exits non-zero when no NVIDIA device is found, when the numa_node file
# cannot be read, or when the reported node is not 0.
precheck:
	@device="$$(lspci | grep -i nvidia | head -n 1 | awk '{ print $$1 }')"; \
	if [ -z "$$device" ]; then \
		echo "No NVIDIA device found in lspci output." >&2; \
		exit 1; \
	fi; \
	file="/sys/bus/pci/devices/0000:$$device/numa_node"; \
	if [ ! -r "$$file" ]; then \
		echo "Cannot read $$file" >&2; \
		exit 1; \
	fi; \
	numa_state="$$(cat "$$file")"; \
	if [ "$$numa_state" -ne 0 ]; then \
		echo "NUMA is not connected to your GPU. Try:"; \
		echo " echo 0 | sudo tee $$file"; \
		exit 1; \
	fi
.PHONY: default up run down build precheck

View file

@ -6,12 +6,14 @@ services:
build:
context: ./jupyter
dockerfile: Dockerfile
privileged: true
restart: unless-stopped
ports:
- 0.0.0.0:9001:9001
- 0.0.0.0:6006:6006 # for TensorBoard
volumes:
- ./notebooks:/notebooks
- ./jupyter/overrides.json:/opt/conda/share/jupyter/lab/settings/overrides.json
- ./jupyter/overrides.json:/usr/local/share/jupyter/lab/settings/overrides.json
environment:
- JUPYTER_TOKEN=12345
devices:

View file

@ -1,35 +1,71 @@
FROM continuumio/miniconda3
FROM nvcr.io/nvidia/tensorflow:24.01-tf2-py3
# Keep apt/debconf from prompting during the package installs below.
# Written in the `ENV key=value` form; the whitespace-separated `ENV key value`
# form is legacy syntax.
ENV DEBIAN_FRONTEND=noninteractive

# get miniconda3 installed
# cribbed from https://github.com/ContinuumIO/docker-images/blob/main/miniconda3/debian/Dockerfile
#
# Install the OS-level packages in a single layer, then clean the apt cache and
# remove the package lists in that same layer so they do not bloat the image.
RUN apt-get update -q && \
    apt-get install -q -y --no-install-recommends \
        bzip2 \
        ca-certificates \
        git \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
        mercurial \
        openssh-client \
        procps \
        subversion \
        wget \
        curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Debug step: list the contents of the Python 3.10 dist-packages directory in the build output.
RUN ls /usr/local/lib/python3.10/dist-packages && true
# NOTE(review): /bin/false always exits non-zero, so the image build stops at
# this step and no later instruction runs. Presumably a temporary debugging
# stop — confirm and remove before relying on this image.
RUN /bin/false
# Put conda's bin directory first on PATH (key=value form; the
# whitespace-separated ENV form is legacy syntax).
ENV PATH=/opt/conda/bin:$PATH

# Miniconda installer release to download; can be overridden with --build-arg.
ARG CONDA_VERSION=py311_23.11.0-1

# Download and run the Miniconda installer in batch mode into /opt/conda, hook
# conda into login shells and root's ~/.bashrc, then delete static libraries,
# JS source maps and conda's caches within the same layer.
# curl -f aborts the step on an HTTP error instead of saving the error page and
# feeding it to bash; -L follows redirects.
RUN cd /tmp \
    && curl -fL "https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh" -o miniconda.sh \
    && mkdir -p /opt \
    && bash miniconda.sh -b -p /opt/conda \
    && rm miniconda.sh \
    && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \
    && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \
    && echo "conda activate base" >> ~/.bashrc \
    && find /opt/conda/ -follow -type f -name '*.a' -delete \
    && find /opt/conda/ -follow -type f -name '*.js.map' -delete \
    && /opt/conda/bin/conda clean -afy
RUN conda update -y -n base -c conda-forge conda
RUN conda create -y --name jupyter python=3.10
RUN conda create -y --name jupyter
# CUDA toolkit 12.0 is not available in conda-forge, and my host OS has 12.0
COPY cuda.deb /tmp
RUN dpkg -i /tmp/cuda.deb && rm /tmp/cuda.deb
RUN CONDA_OVERRIDE_CUDA=12.0 \
conda install -n jupyter --quiet -y -c conda-forge \
jupyterlab \
RUN conda run --no-capture-output -n jupyter \
pip3 install --user --force-reinstall --ignore-installed \
tensorflow[with-gpu] \
keras \
cudnn \
keras-tuner \
numpy \
h5py \
tensorflow=2.15 \
&& /bin/true # only added to make the installed package lines consistent
RUN conda install -n jupyter --quiet -y -c conda-forge \
RUN pip3 install \
pandas \
librosa \
matplotlib \
pyarrow \
pydot \
pillow \
keras-tuner \
&& /bin/true # as above
SHELL ["conda", "run", "-n", "jupyter", "/bin/bash", "-c"]
ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "jupyter" ]
CMD ["jupyter", "lab", "--ip", "0.0.0.0", "--port", "9001", "--no-browser", "--allow-root", "--LabApp.token=''", "--notebook-dir=/notebooks", "--ResourceUseDisplay.track_cpu_percent=True" ]
ENV LD_LIBRARY_PATH /usr/local/cuda-12.0/compat:/usr/local/cuda-12.0/targets/x86_64-linux/lib/:$LD_LIBRARY_PATH
CMD ["jupyter", "lab", "--ip", "0.0.0.0", "--port", "9001", "--no-browser", "--allow-root", "--LabApp.token=''", "--notebook-dir=/notebooks", "--ResourceUseDisplay.track_cpu_percent=True" ]