Use nvidia tensorflow image; check NUMA settings
This commit is contained in:
parent
f335235b28
commit
db11c15897
3 changed files with 73 additions and 25 deletions
24
Makefile
24
Makefile
|
|
@ -7,16 +7,26 @@ up: run
|
||||||
run: build
|
run: build
|
||||||
docker compose up
|
docker compose up
|
||||||
|
|
||||||
build: jupyter/Dockerfile jupyter/cuda.deb
|
build: jupyter/Dockerfile
|
||||||
docker compose build
|
docker compose build --progress plain
|
||||||
|
|
||||||
jupyter/cuda.deb:
|
|
||||||
curl https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda-repo-debian11-12-0-local_12.0.0-525.60.13-1_amd64.deb -o cuda.deb
|
|
||||||
|
|
||||||
down:
|
down:
|
||||||
docker compose down
|
docker compose down
|
||||||
|
|
||||||
|
# all of this assumes there's a single Nvidia GPU
|
||||||
.PHONY: default up run down build
|
precheck:
|
||||||
|
echo hihi
|
||||||
|
lspci | grep -i nvidia | head | awk '{ print $1 }'
|
||||||
|
DEVICE := $(shell lspci | grep -i nvidia | head | awk '{ print $1 }')
|
||||||
|
FILE := /sys/bus/pci/devices/0000:${DEVICE}/numa_node
|
||||||
|
NUMA_STATE := $(shell cat ${FILE})
|
||||||
|
if [[ ${NUMA_STATE} -ne 0 ]]; then \
|
||||||
|
echo "NUMA is not connected to your GPU. Try:"; \
|
||||||
|
echo " echo 0 | sudo tee ${FILE}"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: default up run down build precheck
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,12 +6,14 @@ services:
|
||||||
build:
|
build:
|
||||||
context: ./jupyter
|
context: ./jupyter
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
|
privileged: true
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- 0.0.0.0:9001:9001
|
- 0.0.0.0:9001:9001
|
||||||
|
- 0.0.0.0:6006:6006 # for TensorBoard
|
||||||
volumes:
|
volumes:
|
||||||
- ./notebooks:/notebooks
|
- ./notebooks:/notebooks
|
||||||
- ./jupyter/overrides.json:/opt/conda/share/jupyter/lab/settings/overrides.json
|
- ./jupyter/overrides.json:/usr/local/share/jupyter/lab/settings/overrides.json
|
||||||
environment:
|
environment:
|
||||||
- JUPYTER_TOKEN=12345
|
- JUPYTER_TOKEN=12345
|
||||||
devices:
|
devices:
|
||||||
|
|
|
||||||
|
|
@ -1,35 +1,71 @@
|
||||||
|
|
||||||
FROM continuumio/miniconda3
|
FROM nvcr.io/nvidia/tensorflow:24.01-tf2-py3
|
||||||
|
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND noninteractive
|
||||||
|
|
||||||
|
# get miniconda3 installed
|
||||||
|
# cribbed from https://github.com/ContinuumIO/docker-images/blob/main/miniconda3/debian/Dockerfile
|
||||||
|
RUN apt-get update -q && \
|
||||||
|
apt-get install -q -y --no-install-recommends \
|
||||||
|
bzip2 \
|
||||||
|
ca-certificates \
|
||||||
|
git \
|
||||||
|
libglib2.0-0 \
|
||||||
|
libsm6 \
|
||||||
|
libxext6 \
|
||||||
|
libxrender1 \
|
||||||
|
mercurial \
|
||||||
|
openssh-client \
|
||||||
|
procps \
|
||||||
|
subversion \
|
||||||
|
wget \
|
||||||
|
curl \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN ls /usr/local/lib/python3.10/dist-packages && true
|
||||||
|
RUN /bin/false
|
||||||
|
|
||||||
|
|
||||||
|
ENV PATH /opt/conda/bin:$PATH
|
||||||
|
ARG CONDA_VERSION=py311_23.11.0-1
|
||||||
|
|
||||||
|
RUN cd /tmp \
|
||||||
|
&& curl "https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh" -o miniconda.sh \
|
||||||
|
&& mkdir -p /opt \
|
||||||
|
&& bash miniconda.sh -b -p /opt/conda \
|
||||||
|
&& rm miniconda.sh \
|
||||||
|
&& ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \
|
||||||
|
&& echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \
|
||||||
|
&& echo "conda activate base" >> ~/.bashrc \
|
||||||
|
&& find /opt/conda/ -follow -type f -name '*.a' -delete \
|
||||||
|
&& find /opt/conda/ -follow -type f -name '*.js.map' -delete \
|
||||||
|
&& /opt/conda/bin/conda clean -afy
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
RUN conda update -y -n base -c conda-forge conda
|
RUN conda update -y -n base -c conda-forge conda
|
||||||
|
|
||||||
RUN conda create -y --name jupyter python=3.10
|
RUN conda create -y --name jupyter
|
||||||
|
|
||||||
# CUDA toolkit 12.0 is not available in conda-forge, and my host OS has 12.0
|
RUN conda run --no-capture-output -n jupyter \
|
||||||
COPY cuda.deb /tmp
|
pip3 install --user --force-reinstall --ignore-installed \
|
||||||
RUN dpkg -i /tmp/cuda.deb && rm /tmp/cuda.deb
|
tensorflow[with-gpu] \
|
||||||
|
|
||||||
RUN CONDA_OVERRIDE_CUDA=12.0 \
|
|
||||||
conda install -n jupyter --quiet -y -c conda-forge \
|
|
||||||
jupyterlab \
|
|
||||||
keras \
|
keras \
|
||||||
cudnn \
|
|
||||||
keras-tuner \
|
keras-tuner \
|
||||||
numpy \
|
numpy \
|
||||||
h5py \
|
h5py \
|
||||||
tensorflow=2.15 \
|
|
||||||
&& /bin/true # only added to make the installed package lines consistent
|
&& /bin/true # only added to make the installed package lines consistent
|
||||||
|
|
||||||
RUN conda install -n jupyter --quiet -y -c conda-forge \
|
RUN pip3 install \
|
||||||
pandas \
|
pandas \
|
||||||
librosa \
|
librosa \
|
||||||
matplotlib \
|
matplotlib \
|
||||||
pyarrow \
|
|
||||||
pydot \
|
|
||||||
pillow \
|
pillow \
|
||||||
|
keras-tuner \
|
||||||
&& /bin/true # as above
|
&& /bin/true # as above
|
||||||
|
|
||||||
SHELL ["conda", "run", "-n", "jupyter", "/bin/bash", "-c"]
|
ENV LD_LIBRARY_PATH /usr/local/cuda-12.0/compat:/usr/local/cuda-12.0/targets/x86_64-linux/lib/:$LD_LIBRARY_PATH
|
||||||
ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "jupyter" ]
|
|
||||||
CMD ["jupyter", "lab", "--ip", "0.0.0.0", "--port", "9001", "--no-browser", "--allow-root", "--LabApp.token=''", "--notebook-dir=/notebooks", "--ResourceUseDisplay.track_cpu_percent=True" ]
|
|
||||||
|
|
||||||
|
CMD ["jupyter", "lab", "--ip", "0.0.0.0", "--port", "9001", "--no-browser", "--allow-root", "--LabApp.token=''", "--notebook-dir=/notebooks", "--ResourceUseDisplay.track_cpu_percent=True" ]
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue