Skip to content

Commit

Permalink
Correctly set NUM_CPU and MEM for hq local
Browse files Browse the repository at this point in the history
  • Loading branch information
unkcpz committed Aug 19, 2024
1 parent 1e2bded commit 7254799
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 34 deletions.
14 changes: 6 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
# XXX: fix me after release aiida-hyperqueue
RUN --mount=from=uv,source=/uv,target=/bin/uv \
--mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
git clone https://github.com/aiidateam/aiida-hyperqueue && \
uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} aiida-hyperqueue
uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} \
"aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"

COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/

Expand Down Expand Up @@ -101,14 +101,12 @@ RUN --mount=from=uv,source=/uv,target=/bin/uv \
--mount=from=build_deps,source=${QE_APP_SRC},target=${QE_APP_SRC},rw \
uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC}

# TODO: this seems to need to be done twice
# ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
# # Install the aiida-hyperqueue
# # XXX: fix me after release aiida-hyperqueue
# RUN --mount=from=uv,source=/uv,target=/bin/uv \
# --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
# git clone https://github.com/aiidateam/aiida-hyperqueue && \
# uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} aiida-hyperqueue
RUN --mount=from=uv,source=/uv,target=/bin/uv \
--mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} \
"aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"

# copy hq binary
COPY --from=home_build /opt/conda/hq /usr/local/bin/
Expand Down
25 changes: 5 additions & 20 deletions before-notebook.d/41_setup-hq-computer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,11 @@ verdi daemon stop || echo "stop fail"
# Setup hyperqueue computer if needed
HQ_COMPUTER="local-hq"

# XXX: duplicate as 42_??.sh, set in one place as script and reuse?
# Compute number of cpus allocated to the container
CPU_LIMIT=$(awk '{print $1}' /sys/fs/cgroup/cpu.max)
CPU_PERIOD=$(awk '{print $2}' /sys/fs/cgroup/cpu.max)

if [ "$CPU_PERIOD" -ne 0 ]; then
CPU_NUMBER=$(echo "scale=2; $CPU_LIMIT / $CPU_PERIOD" | bc)
echo "Number of CPUs allocated: $CPU_NUMBER"

# for the HQ setting, round down to an integer number of CPUs; the rest are left for system tasks
HQ_CPU_NUMBER=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
else
# if no limit (with local OCI without setting cpu limit, use all CPUs)
HQ_CPU_NUMBER=$(nproc)
echo "No CPU limit set"
fi

computer_list=$(verdi computer list)
if echo ${computer_list} | grep -q ${HQ_COMPUTER}; then
echo "${HQ_COMPUTER} already setup"
else
# computer
# XXX: upbounded mem??
verdi computer show ${HQ_COMPUTER} || verdi computer setup \
--non-interactive \
--label "${HQ_COMPUTER}" \
Expand All @@ -39,12 +21,15 @@ else
--transport core.local \
--scheduler hyperqueue \
--work-dir /home/${NB_USER}/aiida_run/ \
--mpirun-command "mpirun -np {tot_num_mpiprocs}" \
--mpiprocs-per-machine ${HQ_CPU_NUMBER}
--mpirun-command "mpirun -np {tot_num_mpiprocs}"

verdi computer configure core.local "${HQ_COMPUTER}" \
--non-interactive \
--safe-interval 5.0

# disable the localhost which is set in base image
# XXX: this can also be done before the hq computer setup, setting the hq computer as `localhost`
verdi computer disable localhost aiida@localhost
fi

verdi daemon start || echo "start fail"
12 changes: 6 additions & 6 deletions before-notebook.d/42_start-hq.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ set -x
MEMORY_LIMIT=$(cat /sys/fs/cgroup/memory.max)

if [ "$MEMORY_LIMIT" = "max" ]; then
MEMORY_LIMIT=1024
echo "No memory limit set"
MEMORY_LIMIT=4096
echo "No memory limit set, use 4GiB"
else
MEMORY_LIMIT=$(echo "scale=2; $MEMORY_LIMIT / (1024 * 1024)" | bc)
echo "Memory Limit: ${MEMORY_LIMIT} MB"
echo "Memory Limit: ${MEMORY_LIMIT} MiB"
fi

# Compute number of cpus allocated to the container
Expand All @@ -22,14 +22,14 @@ if [ "$CPU_PERIOD" -ne 0 ]; then
echo "Number of CPUs allocated: $CPU_NUMBER"

# for the HQ setting, round down to an integer number of CPUs; the rest are left for system tasks
HQ_CPU_NUMBER=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
CPU_LIMIT=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
else
# if no limit (with local OCI without setting cpu limit, use all CPUs)
HQ_CPU_NUMBER=$(nproc)
CPU_LIMIT=$(nproc)
echo "No CPU limit set"
fi

# Start hq server with a worker
run-one-constantly hq server start 1>$HOME/.hq-stdout 2>$HOME/.hq-stderr &
run-one-constantly hq worker start --cpus=${HQ_CPU_NUMBER} --resource "mem=sum(${LOCAL_MEM})" --no-detect-resources &
run-one-constantly hq worker start --cpus=${CPU_LIMIT} --resource "mem=sum(${MEMORY_LIMIT})" --no-detect-resources &

0 comments on commit 7254799

Please sign in to comment.