From 16b4180b1e2b39e54221f068df9a3ef35b9a8723 Mon Sep 17 00:00:00 2001 From: Jusong Yu Date: Mon, 12 Aug 2024 17:44:51 +0200 Subject: [PATCH] Workable --- Dockerfile | 33 ++++++++++++++- .../{00_untar_home.sh => 00_untar-home.sh} | 0 ...41_setup_hq.sh => 41_setup-hq-computer.sh} | 22 +++++++--- src/aiidalab_qe/__main__.py | 7 ++-- src/aiidalab_qe/common/setup_codes.py | 40 +++++++++---------- 5 files changed, 71 insertions(+), 31 deletions(-) rename before-notebook.d/{00_untar_home.sh => 00_untar-home.sh} (100%) rename before-notebook.d/{41_setup_hq.sh => 41_setup-hq-computer.sh} (56%) diff --git a/Dockerfile b/Dockerfile index 0d8fdf05d..1c6d73981 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,7 @@ ARG FULL_STACK_VER=2024.1021 ARG UV_VER=0.2.27 ARG QE_VER=7.2 ARG QE_DIR=/opt/conda/envs/quantum-espresso-${QE_VER} +ARG HQ_VER=0.19.0 ARG UV_CACHE_DIR=/tmp/uv_cache ARG QE_APP_SRC=/tmp/quantum-espresso @@ -43,19 +44,37 @@ RUN --mount=from=uv,source=/uv,target=/bin/uv \ # STAGE 3 # - Prepare AiiDA profile and localhost computer +# - Prepare hq computer using hyperqueue as scheduler # - Install QE codes and pseudopotentials # - Archive home folder FROM build_deps AS home_build ARG QE_DIR +ARG HQ_VER + +# Install hq binary +RUN wget -c -O hq.tar.gz https://github.com/It4innovations/hyperqueue/releases/download/v${HQ_VER}/hq-v${HQ_VER}-linux-x64.tar.gz && \ + tar xf hq.tar.gz -C /opt/conda/ + ENV PSEUDO_FOLDER=/tmp/pseudo RUN mkdir -p ${PSEUDO_FOLDER} && \ python -m aiidalab_qe download-pseudos --dest ${PSEUDO_FOLDER} +ENV UV_CONSTRAINT=${PIP_CONSTRAINT} +# Install the aiida-hyperqueue +# XXX: fix me after release aiida-hyperqueue +RUN --mount=from=uv,source=/uv,target=/bin/uv \ + --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \ + git clone https://github.com/aiidateam/aiida-hyperqueue && \ + uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} aiida-hyperqueue + +COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/ + # TODO: Remove PGSQL and daemon log files, and other unneeded files RUN --mount=from=qe_conda_env,source=${QE_DIR},target=${QE_DIR} \ bash /usr/local/bin/before-notebook.d/20_start-postgresql.sh && \ bash /usr/local/bin/before-notebook.d/40_prepare-aiida.sh && \ - python -m aiidalab_qe install-qe && \ + bash /usr/local/bin/before-notebook.d/41_setup-hq-computer.sh && \ + python -m aiidalab_qe install-qe --computer local-hq && \ python -m aiidalab_qe install-pseudos --source ${PSEUDO_FOLDER} && \ verdi daemon stop && \ mamba run -n aiida-core-services pg_ctl stop && \ @@ -82,6 +101,18 @@ RUN --mount=from=uv,source=/uv,target=/bin/uv \ --mount=from=build_deps,source=${QE_APP_SRC},target=${QE_APP_SRC},rw \ uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC} +# TODO: this seems need to do twice +# ENV UV_CONSTRAINT=${PIP_CONSTRAINT} +# # Install the aiida-hyperqueue +# # XXX: fix me after release aiida-hyperqueue +# RUN --mount=from=uv,source=/uv,target=/bin/uv \ +# --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \ +# git clone https://github.com/aiidateam/aiida-hyperqueue && \ +# uv pip install --system --strict --compile-bytecode --cache-dir=${UV_CACHE_DIR} aiida-hyperqueue + +# copy hq binary +COPY --from=home_build /opt/conda/hq /usr/local/bin/ + COPY --from=qe_conda_env ${QE_DIR} ${QE_DIR} USER root diff --git a/before-notebook.d/00_untar_home.sh b/before-notebook.d/00_untar-home.sh similarity index 100% rename from before-notebook.d/00_untar_home.sh rename to before-notebook.d/00_untar-home.sh diff --git a/before-notebook.d/41_setup_hq.sh b/before-notebook.d/41_setup-hq-computer.sh similarity index 56% rename from before-notebook.d/41_setup_hq.sh rename to before-notebook.d/41_setup-hq-computer.sh index 5881d0673..043c54a99 100755 --- a/before-notebook.d/41_setup_hq.sh +++ b/before-notebook.d/41_setup-hq-computer.sh @@ -2,15 +2,21 @@ set -x +# XXX: need to make daemon start late +verdi daemon stop || echo "stop fail" + # Setup hyperqueue computer if needed HQ_COMPUTER="local-hq" -LOCALHOST_MPI_PROCS_PER_MACHINE=2 +# XXX: hardcode N_MPI_PROCES, or read from OCI runtime?? think monkey, think! +LOCAL_MPI_PROCS=2 +LOCAL_MEM=2560 -verdi show computer ${HQ_COMPUTER} -if [[ $? -eq 0 ]]; then +computer_list=$(verdi computer list) +if echo ${computer_list} | grep -q ${HQ_COMPUTER}; then echo "${HQ_COMPUTER} already setup" else # computer + # XXX: upbounded mem?? verdi computer show ${HQ_COMPUTER} || verdi computer setup \ --non-interactive \ --label "${HQ_COMPUTER}" \ @@ -19,10 +25,16 @@ else --transport core.local \ --scheduler hyperqueue \ --work-dir /home/${NB_USER}/aiida_run/ \ - --mpirun-command "mpirun -np {num_cpus}" \ - --mpiprocs-per-machine ${LOCALHOST_MPI_PROCS_PER_MACHINE} + --mpirun-command "mpirun -np {tot_num_mpiprocs}" \ + --mpiprocs-per-machine ${LOCAL_MPI_PROCS} verdi computer configure core.local "${HQ_COMPUTER}" \ --non-interactive \ --safe-interval 5.0 fi + +# Start hq server with a worker +nohup hq server start 1>$HOME/.hq-stdout 2>$HOME/.hq-stderr & +nohup hq worker start --cpus=${LOCAL_MPI_PROCS} --resource "mem=sum(${LOCAL_MEM})" --no-detect-resources & + +verdi daemon start || echo "start fail" diff --git a/src/aiidalab_qe/__main__.py b/src/aiidalab_qe/__main__.py index 8cb08cc5f..f21f5be7f 100644 --- a/src/aiidalab_qe/__main__.py +++ b/src/aiidalab_qe/__main__.py @@ -7,7 +7,7 @@ from aiida import load_profile from aiidalab_qe.common.setup_codes import codes_are_setup -from aiidalab_qe.common.setup_codes import install_and_setup as install_qe_codes +from aiidalab_qe.common.setup_codes import install_and_setup as install_and_setup_qe_codes # The default profile name of AiiDAlab container. _DEFAULT_PROFILE = "default" @@ -20,11 +20,12 @@ def cli(): @cli.command() @click.option("-f", "--force", is_flag=True) +@click.option("--computer") @click.option("-p", "--profile", default=_DEFAULT_PROFILE) -def install_qe(force, profile): +def install_qe(force, profile, computer): load_profile(profile) try: - for msg in install_qe_codes(force=force): + for msg in install_and_setup_qe_codes(target_computer=computer, force=force): click.echo(msg) assert codes_are_setup() click.secho("Codes are setup!", fg="green") diff --git a/src/aiidalab_qe/common/setup_codes.py b/src/aiidalab_qe/common/setup_codes.py index d8aa7629e..2875218c0 100644 --- a/src/aiidalab_qe/common/setup_codes.py +++ b/src/aiidalab_qe/common/setup_codes.py @@ -53,11 +53,7 @@ def get_qe_env(): def qe_installed(): env_exist = get_qe_env().exists() - proc = subprocess.run( - ["conda", "list", "-n", f"{get_qe_env()}", "qe"], - check=False, - capture_output=True, - ) + proc = subprocess.run(["conda", "list", "-n", f"{get_qe_env().name}", "qe"], check=True, capture_output=True,) # XXX: "qe" in check is not future proof if there are similar packages such as qe-tool, better solution?? JSON output?? return env_exist and "qe" in str(proc.stdout) @@ -106,13 +102,13 @@ def _generate_header_to_setup_code(): return header_code -def _generate_string_to_setup_code(code_name, computer_name="localhost"): +def _generate_string_to_setup_code(code_name, computer="localhost"): """Generate the Python string to setup an AiiDA code for a given computer. Tries to load an existing code and if not existent, generates Python code to create and store a new code setup.""" try: - load_code(f"{code_name}-{QE_VERSION}@{computer_name}") + load_code(f"{code_name}-{QE_VERSION}@{computer}") except NotExistent: label = f"{code_name}-{QE_VERSION}" description = f"{code_name}.x ({QE_VERSION}) setup by AiiDAlab." @@ -131,7 +127,7 @@ def _generate_string_to_setup_code(code_name, computer_name="localhost"): code.store() """.format( # noqa: UP032 - computer_name, + computer, label, description, filepath_executable, @@ -154,7 +150,7 @@ def setup_codes(): raise RuntimeError(f"Failed to setup codes: {error}") from None -def install_and_setup(force=False, target_computer="localhost"): +def install_and_setup(target_computer, force=False): """Install Quantum ESPRESSO and the corresponding AiiDA codes. Args: @@ -168,8 +164,8 @@ def install_and_setup(force=False, target_computer="localhost"): if not force and FN_DO_NOT_SETUP.exists(): raise RuntimeError("Installation failed in previous attempt.") - _install() - _setup(target_computer) + yield from _install() + yield from _setup(target_computer) def _install(): @@ -186,17 +182,17 @@ def _install(): "is not available." ) - if not qe_installed(): - # First, install Quantum ESPRESSO. - yield "Installing QE..." - try: - install_qe() - except subprocess.CalledProcessError as error: - raise RuntimeError( - f"Failed to create conda environment: {error}" - ) from None - else: - return + if qe_installed(): + return + + # Install Quantum ESPRESSO. + yield "Installing QE..." + try: + install_qe() + except subprocess.CalledProcessError as error: + raise RuntimeError( + f"Failed to create conda environment: {error}" + ) from None except Timeout: # Assume that the installation was triggered by a different process.