From 827436b2b2f81b3c34d8731a689d636f306e1b70 Mon Sep 17 00:00:00 2001 From: Aaron Siddhartha Mondal Date: Mon, 18 Dec 2023 23:47:07 +0100 Subject: [PATCH] Migrate integration tests to Bazel and K8s Remove the broken docker-compose workflows, rewrite the tests to be rootlessly runnable via bazel and pin infrastructure outside of Bazel's build graph in Nix. Each integration test now spins up a nativelink deployment in K8s, runs the test and removes the deployment again. All integration tests now have timeouts to provide faster feedback on failing tests. Apart from a few dynamically declared IPs the new approach is fully reproducible and can reuse containers from the existing nix workflow. The "build nativelink with nativelink" test has been removed as the LRE/Remote test already covers that usecase. --- .bazelrc | 2 + .github/workflows/integration.yaml | 55 ++++++++ .github/workflows/lre.yaml | 2 +- .github/workflows/main.yml | 91 ------------- BUILD.bazel | 10 ++ MODULE.bazel | 23 ++++ WORKSPACE.bazel | 26 ++++ deployment-examples/kubernetes/00_infra.sh | 1 + .../kubernetes/01_operations.sh | 9 +- .../kubernetes/02_application.sh | 19 +++ .../kubernetes/03_delete_application.sh | 1 + deployment-examples/kubernetes/BUILD.bazel | 35 +++++ .../kubernetes/bazel_k8s_prelude.sh | 53 ++++++++ deployment-examples/kubernetes/cas.yaml | 31 ++++- deployment-examples/kubernetes/gateway.yaml | 28 +++- deployment-examples/kubernetes/routes.yaml | 30 ++++- deployment-examples/kubernetes/scheduler.yaml | 2 +- deployment-examples/kubernetes/worker.yaml | 2 +- flake.nix | 11 +- integration_tests/BUILD.bazel | 36 +++++ integration_tests/README.md | 10 ++ integration_tests/simple_cache_test.sh | 48 +++++-- integration_tests/simple_prometheus_test.sh | 42 ++++-- .../simple_remote_execution_test.sh | 66 ++++++--- integration_tests/simple_tls_test.sh | 29 +++- run_integration_tests.sh | 127 ------------------ tools/{BUILD => BUILD.bazel} | 8 +- tools/integration_test_utils.sh | 42 ++++++ 
28 files changed, 563 insertions(+), 276 deletions(-) create mode 100644 .github/workflows/integration.yaml delete mode 100644 .github/workflows/main.yml create mode 100644 MODULE.bazel create mode 100644 deployment-examples/kubernetes/BUILD.bazel create mode 100755 deployment-examples/kubernetes/bazel_k8s_prelude.sh create mode 100644 integration_tests/BUILD.bazel create mode 100644 integration_tests/README.md mode change 100644 => 100755 integration_tests/simple_cache_test.sh mode change 100644 => 100755 integration_tests/simple_prometheus_test.sh mode change 100644 => 100755 integration_tests/simple_remote_execution_test.sh delete mode 100755 run_integration_tests.sh rename tools/{BUILD => BUILD.bazel} (74%) create mode 100644 tools/integration_test_utils.sh diff --git a/.bazelrc b/.bazelrc index b4d109ea91..0cea4879c7 100644 --- a/.bazelrc +++ b/.bazelrc @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +common --enable_bzlmod + # Use the earliest supported C++ version for protoc. build --cxxopt=-std=c++14 --host_cxxopt=-std=c++14 diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml new file mode 100644 index 0000000000..1941947236 --- /dev/null +++ b/.github/workflows/integration.yaml @@ -0,0 +1,55 @@ +--- +name: Integration tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: read-all + +jobs: + remote: + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04] + name: Integration tests / ${{ matrix.os }} + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: >- # v4.1.1 + actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 + + - name: Install Nix + uses: >- #v7 + DeterminateSystems/nix-installer-action@5620eb4af6b562c53e4d4628c0b6e4f9d9ae8612 + + - name: Cache Nix derivations + uses: >- # Custom commit, last pinned at 2023-11-17. 
+ DeterminateSystems/magic-nix-cache-action@a04e6275a6bea232cd04fc6f3cbf20d4cb02a3e1 + + - name: Start Kubernetes cluster + run: > + nix develop --impure --command + bash -c "./deployment-examples/kubernetes/00_infra.sh \ + && ./deployment-examples/kubernetes/01_operations.sh" + + - name: Run warmup tests (intentionally fail) + continue-on-error: true + run: | + nix develop --impure --command + bash -c "bazel test integration_tests \ + --platforms=@rules_nixpkgs_core//platforms:host" + + - name: Clean outer directories + run: > + nix develop --impure --command + bash -c "bazel clean" + + - name: Run integration tests + run: > + nix develop --impure --command + bash -c "bazel test integration_tests \ + --platforms=@rules_nixpkgs_core//platforms:host" diff --git a/.github/workflows/lre.yaml b/.github/workflows/lre.yaml index 738097e623..078d4ae98e 100644 --- a/.github/workflows/lre.yaml +++ b/.github/workflows/lre.yaml @@ -70,7 +70,7 @@ jobs: - name: Get gateway IPs id: gateway-ips run: | - echo "cache_ip=$(kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV" + echo "cache_ip=$(kubectl get gtw insecure-cache -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV" echo "scheduler_ip=$(kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}')" >> "$GITHUB_ENV" - name: Print cluster state diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index 9c5cd057a5..0000000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: CI - -# Controls when the workflow will run. -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - docker-compose-compiles-nativelink: - # The type of runner that the job will run on. - runs-on: ubuntu-22.04 - strategy: - matrix: - # Which OS versions we will test on. 
- os_version: [ 20.04, 22.04 ] - steps: - - uses: actions/checkout@v3.5.3 - with: - fetch-depth: 0 - - - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v4 - with: - context: . - file: ./deployment-examples/docker-compose/Dockerfile - build-args: | - OPT_LEVEL=opt - OS_VERSION=${{ matrix.os_version }} - ADDITIONAL_SETUP_WORKER_CMD=DEBIAN_FRONTEND=noninteractive apt-get install -y gcc g++ lld pkg-config python3 - load: true # This brings the build into `docker images` from buildx. - tags: trace_machina/nativelink:latest - - uses: docker/build-push-action@v4 - with: - context: . - file: ./deployment-examples/docker-compose/Dockerfile - build-args: | - OPT_LEVEL=opt - OS_VERSION=${{ matrix.os_version }} - load: true # This brings the build into `docker images` from buildx. - tags: trace_machina/nativelink:builder - target: builder - - - name: Compile Native Link with Native Link - run: | - mkdir -p ~/.cache && \ - cd deployment-examples/docker-compose && \ - docker-compose up -d && \ - cd ../../ && \ - docker run --rm --net=host -w /root/nativelink -v $PWD:/root/nativelink trace_machina/nativelink:builder sh -c ' \ - bazel clean && \ - bazel test //... \ - --remote_instance_name=main \ - --remote_cache=grpc://127.0.0.1:50051 \ - --remote_executor=grpc://127.0.0.1:50052 \ - --remote_default_exec_properties=cpu_count=1 \ - ' && \ - docker run --rm --net=host -w /root/nativelink -v $PWD:/root/nativelink trace_machina/nativelink:builder sh -c ' \ - bazel clean && \ - bazel test //... \ - --remote_instance_name=main \ - --remote_cache=grpc://127.0.0.1:50051 \ - --remote_executor=grpc://127.0.0.1:50052 \ - --remote_default_exec_properties=cpu_count=1 \ - ' 2>&1 | ( ! grep ' PASSED in ' ) # If we get PASSED without (cache) it means there's a cache issue. - - integration-tests: - runs-on: ubuntu-22.04 - strategy: - matrix: - # Which OS versions we will test on. 
- os_version: [ 20.04, 22.04 ] - steps: - - uses: actions/checkout@v3.5.3 - with: - fetch-depth: 0 - - - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v4 - with: - context: . - file: ./deployment-examples/docker-compose/Dockerfile - build-args: | - OPT_LEVEL=fastbuild - OS_VERSION=${{ matrix.os_version }} - load: true # This brings the build into `docker images` from buildx. - tags: trace_machina/nativelink:latest - - - name: Run tests - run: ./run_integration_tests.sh diff --git a/BUILD.bazel b/BUILD.bazel index 314a35b1af..6614001815 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -50,3 +50,13 @@ sh_test( name = "dummy_test", srcs = [":dummy_test_sh"], ) + +sh_library( + name = "current_tag", + srcs = ["@nativelink-current-tag//:bin/nativelink-current-tag"], + target_compatible_with = select({ + "@rules_nixpkgs_core//constraints:support_nix": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + visibility = ["//visibility:public"], +) diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..3a6d2deec5 --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,23 @@ +module( + name = "nativelink", + version = "0.0.0", +) + +bazel_dep( + name = "rules_bazel_integration_test", + version = "0.20.0", + dev_dependency = True, +) + +bazel_binaries = use_extension( + "@rules_bazel_integration_test//:extensions.bzl", + "bazel_binaries", + dev_dependency = True, +) +bazel_binaries.download(version_file = "//:.bazelversion") +use_repo( + bazel_binaries, + "bazel_binaries", + "bazel_binaries_bazelisk", + "build_bazel_bazel_.bazelversion", +) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index c6e8e5ad50..ce1ac04b69 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -85,3 +85,29 @@ http_archive( load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") protobuf_deps() + +http_archive( + name = "io_tweag_rules_nixpkgs", + sha256 = "980edfceef2e59e1122d9be6c52413bc298435f0a3d452532b8a48d7562ffd67", + strip_prefix = 
"rules_nixpkgs-0.10.0", + urls = [ + "https://github.com/tweag/rules_nixpkgs/releases/download/v0.10.0/rules_nixpkgs-0.10.0.tar.gz", + ], +) + +load( + "@io_tweag_rules_nixpkgs//nixpkgs:repositories.bzl", + "rules_nixpkgs_dependencies", +) + +rules_nixpkgs_dependencies() + +load("@io_tweag_rules_nixpkgs//nixpkgs:nixpkgs.bzl", "nixpkgs_flake_package") + +nixpkgs_flake_package( + name = "nativelink-current-tag", + fail_not_supported = False, + nix_flake_file = "//:flake.nix", + nix_flake_lock_file = "//:flake.lock", + package = "currentTag", +) diff --git a/deployment-examples/kubernetes/00_infra.sh b/deployment-examples/kubernetes/00_infra.sh index 0c70184675..172eaff014 100755 --- a/deployment-examples/kubernetes/00_infra.sh +++ b/deployment-examples/kubernetes/00_infra.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash # This script sets up a local development cluster. It's roughly equivalent to # a managed K8s setup. diff --git a/deployment-examples/kubernetes/01_operations.sh b/deployment-examples/kubernetes/01_operations.sh index 48ae17a8fe..017a8ee39a 100755 --- a/deployment-examples/kubernetes/01_operations.sh +++ b/deployment-examples/kubernetes/01_operations.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash # This script configures a cluster with a few standard deployments. 
# TODO(aaronmondal): Add Grafana, OpenTelemetry and the various other standard @@ -7,11 +8,11 @@ set -xeuo pipefail SRC_ROOT=$(git rev-parse --show-toplevel) -kubectl apply -f ${SRC_ROOT}/deployment-examples/kubernetes/gateway.yaml +kubectl apply -f "$SRC_ROOT"/deployment-examples/kubernetes/gateway.yaml IMAGE_TAG=$(nix eval .#image.imageTag --raw) -$(nix build .#image --print-build-logs --verbose) \ +nix build .#image --print-build-logs --verbose \ && ./result \ | skopeo \ copy \ @@ -21,9 +22,9 @@ $(nix build .#image --print-build-logs --verbose) \ IMAGE_TAG=$(nix eval .#lre.imageTag --raw) -echo $IMAGE_TAG +echo "$IMAGE_TAG" -$(nix build .#lre --print-build-logs --verbose) \ +nix build .#lre --print-build-logs --verbose \ && ./result \ | skopeo \ copy \ diff --git a/deployment-examples/kubernetes/02_application.sh b/deployment-examples/kubernetes/02_application.sh index a380309821..feb2ded7c4 100755 --- a/deployment-examples/kubernetes/02_application.sh +++ b/deployment-examples/kubernetes/02_application.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash # Get the nix derivation hash from the toolchain container, change the # `TOOLCHAIN_TAG` variable in the `worker.json.template` to that hash and apply # the configuration. @@ -13,3 +14,21 @@ kubectl apply -k "$KUSTOMIZE_DIR" kubectl rollout status deploy/nativelink-cas kubectl rollout status deploy/nativelink-scheduler kubectl rollout status deploy/nativelink-worker + +# Verify endpoint reachability. 
+INSECURE_CACHE=$(kubectl get gtw insecure-cache -o=jsonpath='{.status.addresses[0].value}') +SCHEDULER=$(kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}') +CACHE=$(kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}') +PROMETHEUS=$(kubectl get gtw prometheus -o=jsonpath='{.status.addresses[0].value}') + +printf " +Insecure Cache IP: $INSECURE_CACHE -> --remote_cache=grpc://$INSECURE_CACHE:50051 +Cache IP: $CACHE +Scheduler IP: $SCHEDULER -> --remote_executor=grpc://$SCHEDULER:50052 +Prometheus IP: $PROMETHEUS + +Insecure cache status: $(curl http://"$INSECURE_CACHE":50051/status) +Cache status: $(curl https://"$CACHE":50071/status) +Scheduler status: $(curl http://"$SCHEDULER":50052/status) +Prometheus status: $(curl http://"$PROMETHEUS":50061/status) +" diff --git a/deployment-examples/kubernetes/03_delete_application.sh b/deployment-examples/kubernetes/03_delete_application.sh index 9055ac480b..e2a660603e 100755 --- a/deployment-examples/kubernetes/03_delete_application.sh +++ b/deployment-examples/kubernetes/03_delete_application.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash # Get the nix derivation hash from the toolchain container, change the # `TOOLCHAIN_TAG` variable in the `worker.json.template` to that hash and delete # the configuration. diff --git a/deployment-examples/kubernetes/BUILD.bazel b/deployment-examples/kubernetes/BUILD.bazel new file mode 100644 index 0000000000..cb7b755626 --- /dev/null +++ b/deployment-examples/kubernetes/BUILD.bazel @@ -0,0 +1,35 @@ +sh_library( + name = "kustomization", + srcs = [ + "cas.json", + "cas.yaml", + "example-do-not-use-in-prod-key.pem", + "example-do-not-use-in-prod-rootca.crt", + "kustomization.yaml", + "routes.yaml", + "scheduler.json", + "scheduler.yaml", + "worker.json.template", + "worker.yaml", + ], + visibility = ["//visibility:public"], +) + +# This target is used by end-to-end tests running under k8s. 
To deploy a test +# environment, make sure to have a k8s cluster running before invoking the test +# and add this to the test script: +# +# source "$(rlocation nativelink/deployment-examples/kubernetes/bazel_k8s_prelude.sh)" +# +sh_library( + name = "bazel_k8s_prelude", + srcs = ["bazel_k8s_prelude.sh"], + data = [ + ":kustomization", + ], + visibility = ["//visibility:public"], + deps = [ + "@bazel_tools//tools/bash/runfiles", + "@nativelink//:current_tag", + ], +) diff --git a/deployment-examples/kubernetes/bazel_k8s_prelude.sh b/deployment-examples/kubernetes/bazel_k8s_prelude.sh new file mode 100755 index 0000000000..064834a52a --- /dev/null +++ b/deployment-examples/kubernetes/bazel_k8s_prelude.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Copyright 2022 The Native Link Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# --- begin runfiles.bash initialization v3 --- +# Copy-pasted from the Bazel Bash runfiles library v3. 
+set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash +source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \ + source "$0.runfiles/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e +# --- end runfiles.bash initialization v3 --- + +NATIVELINK_TAG=$(cat "$(rlocation nativelink-current-tag/bin/nativelink-current-tag)") +KUSTOMIZE_DIR=$(rlocation nativelink/deployment-examples/kubernetes) + +remove_resources() { + kubectl kustomize \ + --load-restrictor LoadRestrictionsNone \ + "$KUSTOMIZE_DIR" \ + | kubectl delete -f - \ + || echo "Resource cleanup failed. Manually verify your cluster." >&2 +} + +trap remove_resources EXIT + +sed "s/__NATIVELINK_TOOLCHAIN_TAG__/${NATIVELINK_TAG}/g" \ + "$KUSTOMIZE_DIR/worker.json.template" \ + > "$KUSTOMIZE_DIR/worker.json" + +kubectl kustomize \ + --load-restrictor LoadRestrictionsNone \ + "$KUSTOMIZE_DIR" \ + | kubectl apply -f - + +kubectl rollout status deploy/nativelink-cas +kubectl rollout status deploy/nativelink-scheduler +kubectl rollout status deploy/nativelink-worker + +# Application code will run here. diff --git a/deployment-examples/kubernetes/cas.yaml b/deployment-examples/kubernetes/cas.yaml index 96fe183311..d041bde2a7 100644 --- a/deployment-examples/kubernetes/cas.yaml +++ b/deployment-examples/kubernetes/cas.yaml @@ -38,11 +38,14 @@ spec: - name: tls-volume secret: secretName: tls-secret +# TODO(aaronmondal): These should be different ports in a single nativelink-cas +# service. 
But that's bugged, so we use multiple services: +# https://github.com/cilium/cilium/issues/29099 --- apiVersion: v1 kind: Service metadata: - name: nativelink-cas + name: nativelink-insecure-cas spec: selector: app: nativelink-cas @@ -51,11 +54,29 @@ spec: protocol: TCP port: 50051 targetPort: 50051 - - name: metrics - protocol: TCP - port: 50061 - targetPort: 50061 +--- +apiVersion: v1 +kind: Service +metadata: + name: nativelink-cas +spec: + selector: + app: nativelink-cas + ports: - name: https protocol: TCP port: 50071 targetPort: 50071 +--- +apiVersion: v1 +kind: Service +metadata: + name: nativelink-prometheus +spec: + selector: + app: nativelink-cas + ports: + - name: metrics + protocol: TCP + port: 50061 + targetPort: 50061 diff --git a/deployment-examples/kubernetes/gateway.yaml b/deployment-examples/kubernetes/gateway.yaml index bc6bf5450a..701f8e0526 100644 --- a/deployment-examples/kubernetes/gateway.yaml +++ b/deployment-examples/kubernetes/gateway.yaml @@ -3,16 +3,40 @@ --- apiVersion: gateway.networking.k8s.io/v1 kind: Gateway +metadata: + name: insecure-cache +spec: + gatewayClassName: cilium + listeners: + - name: insecure-cache + protocol: HTTP + port: 50051 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway metadata: name: cache spec: gatewayClassName: cilium listeners: - name: cache + protocol: TLS + port: 50071 + tls: + mode: Passthrough +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: prometheus +spec: + gatewayClassName: cilium + listeners: + - name: prometheus protocol: HTTP - port: 50051 + port: 50061 --- -apiVersion: gateway.networking.k8s.io/v1beta1 +apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: name: scheduler diff --git a/deployment-examples/kubernetes/routes.yaml b/deployment-examples/kubernetes/routes.yaml index e094f9d020..96d9cc2af5 100644 --- a/deployment-examples/kubernetes/routes.yaml +++ b/deployment-examples/kubernetes/routes.yaml @@ -2,6 +2,20 @@ apiVersion: 
gateway.networking.k8s.io/v1 kind: HTTPRoute # TODO(aaronmondal): Use GRPCRoute after resolution of # https://github.com/TraceMachina/nativelink/issues/481 +metadata: + name: insecure-cache-route +spec: + parentRefs: + - sectionName: insecure-cache + name: insecure-cache + rules: + - backendRefs: + - name: nativelink-insecure-cas + port: 50051 +--- +apiVersion: gateway.networking.k8s.io/v1alpha2 +kind: TLSRoute # TODO(aaronmondal): Use GRPCRoute after resolution of + # https://github.com/TraceMachina/nativelink/issues/481 metadata: name: cache-route spec: @@ -11,7 +25,21 @@ spec: rules: - backendRefs: - name: nativelink-cas - port: 50051 + port: 50071 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute # TODO(aaronmondal): Pure GRPC is unstable here. Find out why + # and migrate to a GRPCRoute. +metadata: + name: prometheus-route +spec: + parentRefs: + - sectionName: prometheus + name: prometheus + rules: + - backendRefs: + - name: nativelink-prometheus + port: 50061 --- apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute # TODO(aaronmondal): Pure GRPC is unstable here. 
Find out why diff --git a/deployment-examples/kubernetes/scheduler.yaml b/deployment-examples/kubernetes/scheduler.yaml index 02a6892acb..4831a8f8fb 100644 --- a/deployment-examples/kubernetes/scheduler.yaml +++ b/deployment-examples/kubernetes/scheduler.yaml @@ -20,7 +20,7 @@ spec: - name: RUST_LOG value: info - name: CAS_ENDPOINT - value: nativelink-cas + value: nativelink-insecure-cas ports: - containerPort: 50052 volumeMounts: diff --git a/deployment-examples/kubernetes/worker.yaml b/deployment-examples/kubernetes/worker.yaml index 66397a377b..2b3726eb2f 100644 --- a/deployment-examples/kubernetes/worker.yaml +++ b/deployment-examples/kubernetes/worker.yaml @@ -20,7 +20,7 @@ spec: - name: RUST_LOG value: info - name: CAS_ENDPOINT - value: nativelink-cas + value: nativelink-insecure-cas - name: SCHEDULER_ENDPOINT value: nativelink-scheduler volumeMounts: diff --git a/flake.nix b/flake.nix index d44eb520cb..c66386950c 100644 --- a/flake.nix +++ b/flake.nix @@ -85,7 +85,7 @@ program = "${nativelink}/bin/cas"; }; }; - packages = { + packages = rec { inherit publish-ghcr local-image-test; default = nativelink; lre = import ./local-remote-execution/image.nix { inherit pkgs nativelink; }; @@ -108,6 +108,12 @@ }; }; }; + # This "package" contains just the tag of the "image" package. This + # allows us to import the derivation hash into bazel integration + # test scripts. + currentTag = pkgs.writeScriptBin "nativelink-current-tag" '' + ${lre.imageTag} + ''; }; checks = { # TODO(aaronmondal): Fix the tests. @@ -134,6 +140,9 @@ pkgs.kubectl pkgs.kubernetes-helm pkgs.cilium-cli + pkgs.jq + pkgs.curl + pkgs.kind # Additional tools from within our development environment. 
local-image-test diff --git a/integration_tests/BUILD.bazel b/integration_tests/BUILD.bazel new file mode 100644 index 0000000000..aa8c50c822 --- /dev/null +++ b/integration_tests/BUILD.bazel @@ -0,0 +1,36 @@ +load("@bazel_binaries//:defs.bzl", "bazel_binaries") +load( + "@rules_bazel_integration_test//bazel_integration_test:defs.bzl", + "integration_test_utils", + "script_test", +) + +SIMPLE_TESTS = [ + "simple_cache_test", + "simple_remote_execution_test", + # "simple_prometheus_test", TODO(aaronmondal): Broken. + "simple_tls_test", +] + +[ + script_test( + name = "{}".format(testname), + timeout = "moderate", + srcs = ["{}.sh".format(testname)], + bazel_binaries = bazel_binaries, + bazel_version = bazel_binaries.versions.current, + deps = [ + "//deployment-examples/kubernetes:bazel_k8s_prelude", + "//tools:integration_test_utils", + "@bazel_tools//tools/bash/runfiles", + ], + ) + for testname in SIMPLE_TESTS +] + +test_suite( + name = "integration_tests", + # This tag causes the target to be ignored during `bazel test //...`. + tags = integration_test_utils.DEFAULT_INTEGRATION_TEST_TAGS, + tests = [":{}".format(testname) for testname in SIMPLE_TESTS], +) diff --git a/integration_tests/README.md b/integration_tests/README.md new file mode 100644 index 0000000000..ae1b9b2e95 --- /dev/null +++ b/integration_tests/README.md @@ -0,0 +1,10 @@ +# Integration tests + +These tests run in Kubernetes and are gated behind platform constraints: + +```bash +./deployment-examples/kubernetes/00_infra.sh +./deployment-examples/kubernetes/01_operations.sh + +bazel test integration_tests --platforms=@rules_nixpkgs_core//platforms:host +``` diff --git a/integration_tests/simple_cache_test.sh b/integration_tests/simple_cache_test.sh old mode 100644 new mode 100755 index e0997004f9..25a20ea83e --- a/integration_tests/simple_cache_test.sh +++ b/integration_tests/simple_cache_test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2022 The Native Link Authors. 
All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,14 +15,36 @@ # This is a sanity check test to ensure we are caching test results. -if [[ $UNDER_TEST_RUNNER -ne 1 ]]; then - echo "This script should be run under run_integration_tests.sh" - exit 1 -fi -set -x +# --- begin runfiles.bash initialization v3 --- +# Copy-pasted from the Bazel Bash runfiles library v3. +set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash +source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \ + source "$0.runfiles/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e +# --- end runfiles.bash initialization v3 --- + +set -xeuo pipefail + +source "$(rlocation nativelink/deployment-examples/kubernetes/bazel_k8s_prelude.sh)" +source "$(rlocation nativelink/tools/integration_test_utils.sh)" + +BAZEL_CACHE_DIR=$(temporary_cache)/bazel +CACHE_IP=$(kubernetes_insecure_cache_ip) +SCHEDULER_IP=$(kubernetes_scheduler_ip) # First run our test under bazel. It should not be cached. -OUTPUT=$(bazel --output_base="$BAZEL_CACHE_DIR" test --config self_test //:dummy_test) +OUTPUT=$("${BIT_BAZEL_BINARY:-}" \ + --output_base="$BAZEL_CACHE_DIR" \ + test \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://"$CACHE_IP":50051 \ + --remote_executor=grpc://"$SCHEDULER_IP":50052 \ + //:dummy_test) + if [[ "$OUTPUT" =~ .*'(cached)'.* ]]; then echo "Expected first bazel run to not have test cached." echo "STDOUT:" @@ -31,10 +53,18 @@ if [[ "$OUTPUT" =~ .*'(cached)'.* ]]; then fi # Clean our local cache. 
-bazel --output_base="$BAZEL_CACHE_DIR" clean +"${BIT_BAZEL_BINARY:-}" --output_base="$BAZEL_CACHE_DIR" clean # Now run it under bazel again. This time the remote cache should have it. -OUTPUT=$(bazel --output_base="$BAZEL_CACHE_DIR" test --config self_test //:dummy_test) +OUTPUT=$("${BIT_BAZEL_BINARY:-}" \ + --output_base="$BAZEL_CACHE_DIR" \ + test \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://"$CACHE_IP":50051 \ + --remote_executor=grpc://"$SCHEDULER_IP":50052 \ + //:dummy_test) + if [[ ! "$OUTPUT" =~ .*'(cached)'.* ]]; then echo "Expected second bazel run to have test cached." echo "STDOUT:" diff --git a/integration_tests/simple_prometheus_test.sh b/integration_tests/simple_prometheus_test.sh old mode 100644 new mode 100755 index f588b46976..dd3e8c99ef --- a/integration_tests/simple_prometheus_test.sh +++ b/integration_tests/simple_prometheus_test.sh @@ -15,18 +15,44 @@ # This is a sanity check test to ensure we are caching test results. -if [[ $UNDER_TEST_RUNNER -ne 1 ]]; then - echo "This script should be run under run_integration_tests.sh" - exit 1 -fi +# --- begin runfiles.bash initialization v3 --- +# Copy-pasted from the Bazel Bash runfiles library v3. 
+set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash +source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \ + source "$0.runfiles/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e +# --- end runfiles.bash initialization v3 --- + +set -xeuo pipefail + +source "$(rlocation nativelink/deployment-examples/kubernetes/bazel_k8s_prelude.sh)" +source "$(rlocation nativelink/tools/integration_test_utils.sh)" -set -euo pipefail +BAZEL_CACHE_DIR=$(temporary_cache)/bazel +CACHE_IP=$(kubernetes_insecure_cache_ip) +SCHEDULER_IP=$(kubernetes_scheduler_ip) +PROMETHEUS_IP=$(kubernetes_prometheus_ip) # Run bazel to populate some of the metrics. -bazel --output_base="$BAZEL_CACHE_DIR" test --config self_test //:dummy_test +"${BIT_BAZEL_BINARY:-}" \ + --output_base="$BAZEL_CACHE_DIR" \ + build \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://"$CACHE_IP":50051 \ + --remote_executor=grpc://"$SCHEDULER_IP":50052 \ + //local-remote-execution/examples:hello_lre # Our service may take a few seconds to get started, so retry a few times. -all_contents="$(curl --retry 5 --retry-delay 0 --retry-max-time 30 http://127.0.0.1:50061/metrics)" +all_contents=$(curl \ + --retry 5 \ + --retry-delay 0 \ + --retry-max-time 30 \ + http://"$PROMETHEUS_IP":50061/metrics +) echo "$all_contents" @@ -40,7 +66,7 @@ echo 'Checking: nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000 grep -q 'nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' <<< "$all_contents" # Ensure our store metrics are only published once. 
-count=$(grep 'nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' <<< "$all_contents" | wc -l) +count=$(grep -c 'nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' <<< "$all_contents") if [[ $count -ne 1 ]]; then echo "Expected to find 1 instance of CAS_MAIN_STORE, but found $count" exit 1 diff --git a/integration_tests/simple_remote_execution_test.sh b/integration_tests/simple_remote_execution_test.sh old mode 100644 new mode 100755 index 70565d4590..5dbd6a3843 --- a/integration_tests/simple_remote_execution_test.sh +++ b/integration_tests/simple_remote_execution_test.sh @@ -13,23 +13,42 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This test is to ensure we can run the same job two times on a single node and no errors occur -# and the results are said to be executed remotely by bazel. +# This test is to ensure we can run the same job two times on a single node and +# no errors occur and the results are said to be executed remotely by bazel. # This test is also here to ensure GrpcStore is being used properly. -if [[ $UNDER_TEST_RUNNER -ne 1 ]]; then - echo "This script should be run under run_integration_tests.sh" - exit 1 -fi -set -xo pipefail +# --- begin runfiles.bash initialization v3 --- +# Copy-pasted from the Bazel Bash runfiles library v3. 
+set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash +source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \ + source "$0.runfiles/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e +# --- end runfiles.bash initialization v3 --- + +set -xeuo pipefail + +source "$(rlocation nativelink/deployment-examples/kubernetes/bazel_k8s_prelude.sh)" +source "$(rlocation nativelink/tools/integration_test_utils.sh)" -rm -rf "$CACHE_DIR/build_events.json" +CACHE_DIR=$(temporary_cache) +BAZEL_CACHE_DIR=$CACHE_DIR/bazel +CACHE_IP=$(kubernetes_insecure_cache_ip) +SCHEDULER_IP=$(kubernetes_scheduler_ip) # First run our test under bazel. It should not be cached. -OUTPUT=$( - bazel --output_base="$BAZEL_CACHE_DIR" \ - test --config self_test --config self_execute \ - //:dummy_test --nocache_test_results --build_event_json_file="$CACHE_DIR/build_events.json" +OUTPUT=$("${BIT_BAZEL_BINARY:-}" \ + --output_base="$BAZEL_CACHE_DIR" \ + test \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://"$CACHE_IP":50051 \ + --remote_executor=grpc://"$SCHEDULER_IP":50052 \ + //:dummy_test \ + --nocache_test_results \ + --build_event_json_file="$CACHE_DIR/build_events.json" ) STRATEGY=$(jq --slurp -r '.[] | select(.id.testResult.label=="//:dummy_test") | .testResult.executionInfo.strategy' "$CACHE_DIR/build_events.json") if [[ "$STRATEGY" != "remote" ]]; then @@ -40,16 +59,25 @@ if [[ "$STRATEGY" != "remote" ]]; then fi # Clean our local cache. 
-bazel --output_base="$BAZEL_CACHE_DIR" clean -rm -rf "$CACHE_DIR/build_events.json" +"${BIT_BAZEL_BINARY:-}" --output_base="$BAZEL_CACHE_DIR" clean +rm "$CACHE_DIR/build_events.json" # Now run it under bazel again. This time the remote cache should have it. -OUTPUT=$( - bazel --output_base="$BAZEL_CACHE_DIR" \ - test --config self_test --config self_execute \ - //:dummy_test --nocache_test_results --build_event_json_file="$CACHE_DIR/build_events.json" +OUTPUT=$("${BIT_BAZEL_BINARY:-}" \ + --output_base="$BAZEL_CACHE_DIR" \ + test \ + --config=lre \ + --remote_instance_name=main \ + --remote_cache=grpc://"$CACHE_IP":50051 \ + --remote_executor=grpc://"$SCHEDULER_IP":50052 \ + //:dummy_test \ + --nocache_test_results \ + --build_event_json_file="$CACHE_DIR/build_events.json" +) +STRATEGY=$(jq --slurp -r \ + '.[] | select(.id.testResult.label=="//:dummy_test") | .testResult.executionInfo.strategy' \ + "$CACHE_DIR/build_events.json" ) -STRATEGY=$(jq --slurp -r '.[] | select(.id.testResult.label=="//:dummy_test") | .testResult.executionInfo.strategy' "$CACHE_DIR/build_events.json") if [[ "$STRATEGY" != "remote" ]]; then echo "$OUTPUT" echo "" diff --git a/integration_tests/simple_tls_test.sh b/integration_tests/simple_tls_test.sh index cd134b48b7..1788bd01fe 100755 --- a/integration_tests/simple_tls_test.sh +++ b/integration_tests/simple_tls_test.sh @@ -13,12 +13,31 @@ # See the License for the specific language governing permissions and # limitations under the License. -if [[ $UNDER_TEST_RUNNER -ne 1 ]]; then - echo "This script should be run under run_integration_tests.sh" - exit 1 -fi +# --- begin runfiles.bash initialization v3 --- +# Copy-pasted from the Bazel Bash runfiles library v3. 
+set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash +source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \ + source "$0.runfiles/$f" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \ + { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e +# --- end runfiles.bash initialization v3 --- + +set -xeuo pipefail -RESULTS=$(curl --retry 5 --insecure --cacert ./example-do-not-use-in-prod-rootca.crt --key ./example-do-not-use-in-prod-key1.pem https://127.0.0.1:50071/status 2>&1) +source "$(rlocation nativelink/deployment-examples/kubernetes/bazel_k8s_prelude.sh)" +source "$(rlocation nativelink/tools/integration_test_utils.sh)" + +CACHE_IP=$(kubernetes_cache_ip) + +# TODO(aaronmondal): This doesn't validate any certificates such as the one used +# when deploying nativelink. It just checks that we have +# *some* form of TLS encryption. Properly set up certificates +# so that we can use the command below instead: +# +# RESULTS=$(curl --cacert mycert.crt https://"$CACHE_IP":50071/status 2>&1) +RESULTS="$(curl --retry 5 --insecure https://"$CACHE_IP":50071/status 2>&1)" echo "Results from curl: $RESULTS" diff --git a/run_integration_tests.sh b/run_integration_tests.sh deleted file mode 100755 index 3f0d05a87f..0000000000 --- a/run_integration_tests.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# Copyright 2023 The Native Link Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -euo pipefail - -if [[ $EUID -eq 0 ]]; then - echo "This script should not be run as root." - exit 1 -fi - -TEST_PATTERNS=() - -while [[ $# -gt 0 ]]; do - case $1 in - --help) - echo <<'EOT' -Runner for integration tests - -Usage: - run_integration_tests.sh [TEST_PATTERNS...] - -TEST_PATTERNS: Name of test you wish to execute. Wildcard (*) supported. - Default: '*' -EOT - ;; - -*|--*) - echo "Unknown option $1" - exit 1 - ;; - *) - TEST_PATTERNS+=("$1") - shift # past argument - ;; - esac -done - -if ! docker --version; then - echo "This script must be run as root due to docker permission issues (try with 'sudo')" - exit 1 -fi - -if [[ "${#TEST_PATTERNS[@]}" -eq 0 ]]; then - TEST_PATTERNS=("*") -fi - -SELF_DIR=$(realpath $(dirname $0)) -cd "$SELF_DIR/deployment-examples/docker-compose" - -export UNDER_TEST_RUNNER=1 - -# Ensure our cache locations are empty. -sudo rm -rf ~/.cache/nativelink -mkdir -p ~/.cache/nativelink - -# Ensure our docker compose is not running. -sudo docker-compose rm --stop -f - -export TMPDIR=$HOME/.cache/nativelink/ -mkdir -p "$TMPDIR" - -if [[ "$OSTYPE" == "darwin"* ]]; then - export CACHE_DIR=$(mktemp -d "${TMPDIR}nativelink-integration-test") -else - echo "Assumes Linux/WSL" - export CACHE_DIR=$(mktemp -d --tmpdir="$TMPDIR" --suffix="-nativelink-integration-test") -fi - -export BAZEL_CACHE_DIR="$CACHE_DIR/bazel" -trap "sudo rm -rf $CACHE_DIR; sudo docker-compose rm --stop -f" EXIT - -echo "" # New line. 
- -DID_FAIL=0 - -export NATIVELINK_DIR="$CACHE_DIR/nativelink" -mkdir -p "$NATIVELINK_DIR" - -for pattern in "${TEST_PATTERNS[@]}"; do - find "$SELF_DIR/integration_tests/" -name "$pattern" -type f -print0 | while IFS= read -r -d $'\0' fullpath; do - # Cleanup. - echo "Cleaning up cache directories TURBOC_CACHE_DIR: $NATIVELINK_DIR" - echo "Checking for existince of the NATIVELINK_DIR" - if [ -d "$NATIVELINK_DIR" ]; then - sudo find "$NATIVELINK_DIR" -delete - else - echo "Directory $NATIVELINK_DIR does not exist." - fi - - bazel --output_base="$BAZEL_CACHE_DIR" clean - FILENAME=$(basename $fullpath) - echo "Running test $FILENAME" - sudo docker-compose up -d - if perl -e 'alarm shift; exec @ARGV' 30 bash -c 'until sudo docker-compose logs | grep -q "Ready, listening on"; do sleep 1; done' - then - echo "String 'Ready, listening on' found in the logs." - else - echo "String 'Ready, listening on' not found in the logs within the given time." - fi - set +e - bash -euo pipefail "$fullpath" - EXIT_CODE="$?" - set -e - if [[ $EXIT_CODE -eq 0 ]]; then - echo "$FILENAME passed" - else - echo "$FILENAME failed with exit code $EXIT_CODE" - sudo docker-compose logs - exit $EXIT_CODE - fi - sudo docker-compose rm --stop -f - echo "" # New line. - done -done - -echo "All tests passed!" diff --git a/tools/BUILD b/tools/BUILD.bazel similarity index 74% rename from tools/BUILD rename to tools/BUILD.bazel index 315c102c92..7630e0f142 100644 --- a/tools/BUILD +++ b/tools/BUILD.bazel @@ -1,4 +1,4 @@ -# Copyright 2022 The Native Link Authors. All rights reserved. +# Copyright 2023 The Native Link Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,3 +13,9 @@ # limitations under the License. 
exports_files(["tsan.sh"]) + +sh_library( name = "integration_test_utils", srcs = ["integration_test_utils.sh"], visibility = ["//visibility:public"], +)
diff --git a/tools/integration_test_utils.sh b/tools/integration_test_utils.sh
new file mode 100644
index 0000000000..51bd54d6bd
--- /dev/null
+++ b/tools/integration_test_utils.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Copyright 2023 The Native Link Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Print the first status address of the `gtw` resource named `insecure-cache`.
+kubernetes_insecure_cache_ip() {
+  kubectl get gtw insecure-cache -o=jsonpath='{.status.addresses[0].value}'
+}
+
+# Print the first status address of the `gtw` resource named `cache`.
+kubernetes_cache_ip() {
+  kubectl get gtw cache -o=jsonpath='{.status.addresses[0].value}'
+}
+
+# Print the first status address of the `gtw` resource named `scheduler`.
+kubernetes_scheduler_ip() {
+  kubectl get gtw scheduler -o=jsonpath='{.status.addresses[0].value}'
+}
+
+# Print the first status address of the `gtw` resource named `prometheus`.
+kubernetes_prometheus_ip() {
+  kubectl get gtw prometheus -o=jsonpath='{.status.addresses[0].value}'
+}
+
+# Create a fresh temporary directory and print its path on stdout.
+#
+# Tests capture the path with `$(temporary_cache)`, which runs this function in
+# a subshell. An EXIT trap installed there fires as soon as the substitution
+# finishes and deletes the directory before the caller can use it, so the
+# cleanup trap is only installed when running in the top-level shell. Callers
+# that use command substitution are responsible for removing the directory.
+temporary_cache() {
+  if [[ "$OSTYPE" == "darwin"* ]]; then
+    cache_dir=$(mktemp -d -t "nativelink-integration-test") || return 1
+  else
+    echo "Assumes Linux/WSL" >&2
+    cache_dir=$(mktemp -d --suffix="-nativelink-integration-test") || return 1
+  fi
+
+  echo "$cache_dir"
+
+  # BASH_SUBSHELL is 0 only in the top-level shell.
+  if (( BASH_SUBSHELL == 0 )); then
+    trap 'rm -rf "$cache_dir";' EXIT
+  fi
+}