From 725479987405467803287efcfdea24d85d16cfc5 Mon Sep 17 00:00:00 2001
From: Jusong Yu
Date: Mon, 19 Aug 2024 15:23:07 +0200
Subject: [PATCH] Correctly set NUM_CPU and MEM for hq local

---
 Dockerfile                                | 14 ++++++-------
 before-notebook.d/41_setup-hq-computer.sh | 25 +++++------------------
 before-notebook.d/42_start-hq.sh          | 12 +++++------
 3 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ea7f1ebf4..04d3ffe14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -64,8 +64,8 @@ ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
 # XXX: fix me after release aiida-hyperqueue
 RUN --mount=from=uv,source=/uv,target=/bin/uv \
     --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
-    git clone https://github.com/aiidateam/aiida-hyperqueue && \
-    uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} aiida-hyperqueue
+    uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} \
+    "aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"

 COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/

@@ -101,14 +101,12 @@ RUN --mount=from=uv,source=/uv,target=/bin/uv \
     --mount=from=build_deps,source=${QE_APP_SRC},target=${QE_APP_SRC},rw \
     uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC}

-# TODO: this seems need to do twice
-# ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
 # # Install the aiida-hyperqueue
 # # XXX: fix me after release aiida-hyperqueue
-# RUN --mount=from=uv,source=/uv,target=/bin/uv \
-#     --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
-#     git clone https://github.com/aiidateam/aiida-hyperqueue && \
-#     uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} aiida-hyperqueue
+RUN --mount=from=uv,source=/uv,target=/bin/uv \
+    --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
+    uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} \
+    "aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"

 # copy hq binary
 COPY --from=home_build /opt/conda/hq /usr/local/bin/

diff --git a/before-notebook.d/41_setup-hq-computer.sh b/before-notebook.d/41_setup-hq-computer.sh
index 3d6c27ad1..5f6599ff3 100755
--- a/before-notebook.d/41_setup-hq-computer.sh
+++ b/before-notebook.d/41_setup-hq-computer.sh
@@ -8,29 +8,11 @@ verdi daemon stop || echo "stop fail"
 # Setup hyperqueue computer if needed
 HQ_COMPUTER="local-hq"

-# XXX: duplicate as 42_??.sh, set in one place as script and reuse?
-# Compute number of cpus allocated to the container
-CPU_LIMIT=$(awk '{print $1}' /sys/fs/cgroup/cpu.max)
-CPU_PERIOD=$(awk '{print $2}' /sys/fs/cgroup/cpu.max)
-
-if [ "$CPU_PERIOD" -ne 0 ]; then
-  CPU_NUMBER=$(echo "scale=2; $CPU_LIMIT / $CPU_PERIOD" | bc)
-  echo "Number of CPUs allocated: $CPU_NUMBER"
-
-  # for HQ setting round to integer number of CPUs, the left are for system tasks
-  HQ_CPU_NUMBER=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
-else
-  # if no limit (with local OCI without setting cpu limit, use all CPUs)
-  HQ_CPU_NUMBER=$(nproc)
-  echo "No CPU limit set"
-fi
-
 computer_list=$(verdi computer list)
 if echo ${computer_list} | grep -q ${HQ_COMPUTER}; then
     echo "${HQ_COMPUTER} already setup"
 else
     # computer
-    # XXX: upbounded mem??
     verdi computer show ${HQ_COMPUTER} || verdi computer setup \
         --non-interactive \
         --label "${HQ_COMPUTER}" \
@@ -39,12 +21,15 @@ else
         --transport core.local \
         --scheduler hyperqueue \
         --work-dir /home/${NB_USER}/aiida_run/ \
-        --mpirun-command "mpirun -np {tot_num_mpiprocs}" \
-        --mpiprocs-per-machine ${HQ_CPU_NUMBER}
+        --mpirun-command "mpirun -np {tot_num_mpiprocs}"

     verdi computer configure core.local "${HQ_COMPUTER}" \
         --non-interactive \
         --safe-interval 5.0
+
+    # disable the localhost computer which is set up in the base image
+    # XXX: this can also be done before the hq computer setup, setting the hq computer as `localhost`
+    verdi computer disable localhost aiida@localhost
 fi

 verdi daemon start || echo "start fail"

diff --git a/before-notebook.d/42_start-hq.sh b/before-notebook.d/42_start-hq.sh
index 3a45b3162..aaa8b1839 100644
--- a/before-notebook.d/42_start-hq.sh
+++ b/before-notebook.d/42_start-hq.sh
@@ -6,11 +6,11 @@ set -x
 MEMORY_LIMIT=$(cat /sys/fs/cgroup/memory.max)

 if [ "$MEMORY_LIMIT" = "max" ]; then
-  MEMORY_LIMIT=1024
-  echo "No memory limit set"
+  MEMORY_LIMIT=4096
+  echo "No memory limit set, use 4GiB"
 else
   MEMORY_LIMIT=$(echo "scale=2; $MEMORY_LIMIT / (1024 * 1024)" | bc)
-  echo "Memory Limit: ${MEMORY_LIMIT} MB"
+  echo "Memory Limit: ${MEMORY_LIMIT} MiB"
 fi

 # Compute number of cpus allocated to the container
@@ -22,14 +22,14 @@ if [ "$CPU_PERIOD" -ne 0 ]; then
   echo "Number of CPUs allocated: $CPU_NUMBER"

   # for HQ setting round to integer number of CPUs, the left are for system tasks
-  HQ_CPU_NUMBER=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
+  CPU_LIMIT=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
 else
   # if no limit (with local OCI without setting cpu limit, use all CPUs)
-  HQ_CPU_NUMBER=$(nproc)
+  CPU_LIMIT=$(nproc)
   echo "No CPU limit set"
 fi

 # Start hq server with a worker
 run-one-constantly hq server start 1>$HOME/.hq-stdout 2>$HOME/.hq-stderr &
-run-one-constantly hq worker start --cpus=${HQ_CPU_NUMBER} --resource "mem=sum(${LOCAL_MEM})" --no-detect-resources &
+run-one-constantly hq worker start --cpus=${CPU_LIMIT} --resource "mem=sum(${MEMORY_LIMIT})" --no-detect-resources &
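
Note: the sketch below is not part of the patch; it is a minimal, self-contained view of the
resource detection that 42_start-hq.sh performs after this change, reading the container's
cgroup v2 limits and handing them to the HyperQueue worker. The CPUS/MEM variable names and
the explicit "max" check on cpu.max are illustrative additions; the 4096 MiB fallback and the
hq worker flags mirror the patch. Paths assume a cgroup v2 container runtime.

    #!/bin/bash
    set -eu

    # CPU: cpu.max holds "<quota> <period>" (or "max <period>" when unrestricted).
    CPU_QUOTA=$(awk '{print $1}' /sys/fs/cgroup/cpu.max)
    CPU_PERIOD=$(awk '{print $2}' /sys/fs/cgroup/cpu.max)
    if [ "$CPU_QUOTA" = "max" ] || [ "$CPU_PERIOD" -eq 0 ]; then
      CPUS=$(nproc)                                           # no quota set: use all CPUs
    else
      CPUS=$(echo "scale=0; $CPU_QUOTA / $CPU_PERIOD" | bc)   # round down to whole CPUs
    fi

    # Memory: memory.max is in bytes, or the literal string "max" when unrestricted.
    MEM=$(cat /sys/fs/cgroup/memory.max)
    if [ "$MEM" = "max" ]; then
      MEM=4096                                                # no limit: fall back to 4 GiB
    else
      MEM=$(echo "$MEM / (1024 * 1024)" | bc)                 # bytes -> MiB
    fi

    hq worker start --cpus="${CPUS}" --resource "mem=sum(${MEM})" --no-detect-resources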