From c82c7ff921940f2530b3c04f266dac3272149cc9 Mon Sep 17 00:00:00 2001 From: ludamad Date: Sat, 20 Apr 2024 18:25:31 +0000 Subject: [PATCH] chore(ci): better docker prune --- .github/ci-setup-action/action.yml | 3 -- .github/workflows/ci-arm.yml | 1 - .github/workflows/ci.yml | 52 +++++++----------- .github/workflows/setup-runner.yml | 4 +- .github/workflows/start-spot.yml | 24 ++++++--- ci.py | 86 ++++++++++++++++++------------ 6 files changed, 90 insertions(+), 80 deletions(-) diff --git a/.github/ci-setup-action/action.yml b/.github/ci-setup-action/action.yml index e96dfd29a7c..4b1d7da6dbb 100644 --- a/.github/ci-setup-action/action.yml +++ b/.github/ci-setup-action/action.yml @@ -9,9 +9,6 @@ inputs: concurrency_key: required: false description: 'Concurrency key for locking jobs' - concurrency_token: - required: false - description: 'TODO unused' runs: # define an action, runs in OS of caller using: composite diff --git a/.github/workflows/ci-arm.yml b/.github/workflows/ci-arm.yml index 84e4cc7dda1..aa1d3f26ea7 100644 --- a/.github/workflows/ci-arm.yml +++ b/.github/workflows/ci-arm.yml @@ -36,7 +36,6 @@ jobs: - uses: ./.github/ci-setup-action with: dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}" - concurrency_token: "${{ secrets.AZTEC_GITHUB_TOKEN }}" # must be globally unique for build x runner concurrency_key: build-master-arm # prepare images locally, tagged by commit hash diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b5f2c532c0..bad2c7a5a53 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,13 +5,12 @@ on: pull_request: {} workflow_dispatch: inputs: + username: + description: 'Defaults to GitHub Actor' + required: false runner_action: description: "The action to take with the self-hosted runner (start, stop, restart)." required: false - just_start_spot: - description: "Should we just run spots?" - type: boolean - required: false concurrency: # force parallelism in master group: ci-${{ github.ref_name == 'master' && github.run_id || github.ref_name }} @@ -20,10 +19,10 @@ jobs: setup: uses: ./.github/workflows/setup-runner.yml with: - runner_label: ${{ github.actor }}-x86 + runner_label: ${{ inputs.username || github.actor }}-x86 ebs_cache_size_gb: 256 runner_concurrency: 20 - subaction: ${{ github.event.inputs.runner_action || 'start' }} + subaction: ${{ inputs.runner_action || 'start' }} ec2_instance_type: m6a.32xlarge ec2_ami_id: ami-04d8422a9ba4de80f ec2_instance_ttl: 40 # refreshed by jobs @@ -31,8 +30,7 @@ jobs: build: needs: setup - runs-on: ${{ github.actor }}-x86 - if: ${{ github.event.inputs.just_start_spot != 'true' }} + runs-on: ${{ inputs.username || github.actor }}-x86 outputs: e2e_list: ${{ steps.e2e_list.outputs.list }} steps: @@ -40,7 +38,7 @@ jobs: - uses: ./.github/ci-setup-action with: dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}" - concurrency_key: build-${{ github.actor }}-x86 + concurrency_key: build-${{ inputs.username || github.actor }}-x86 # prepare images locally, tagged by commit hash - name: "Build E2E Image" timeout-minutes: 40 @@ -54,7 +52,7 @@ jobs: # all the end-to-end integration tests for aztec e2e: needs: build - runs-on: ${{ github.actor }}-x86 + runs-on: ${{ inputs.username || github.actor }}-x86 strategy: fail-fast: false matrix: @@ -65,7 +63,7 @@ jobs: with: dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}" # must be globally unique for build x runner - concurrency_key: e2e-${{ github.actor }}-x86-${{ matrix.test }} + concurrency_key: e2e-${{ inputs.username || github.actor }}-x86-${{ matrix.test }} - name: Test working-directory: ./yarn-project/end-to-end/ timeout-minutes: 25 @@ -78,7 +76,7 @@ jobs: # only ran on x86 for resource reasons (memory intensive) bb-native-tests: needs: setup - runs-on: ${{ github.actor }}-x86 + runs-on: ${{ inputs.username || github.actor }}-x86 strategy: fail-fast: false steps: @@ -88,7 +86,7 @@ jobs: with: dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}" # must be globally unique for build x runner - concurrency_key: bb-native-tests-${{ github.actor }}-x86 + concurrency_key: bb-native-tests-${{ inputs.username || github.actor }}-x86 - name: "Native Prover Tests" working-directory: ./barretenberg/cpp/ timeout-minutes: 25 @@ -98,15 +96,14 @@ jobs: # push benchmarking binaries to dockerhub registry bb-bench-binaries: needs: setup - runs-on: ${{ github.actor }}-x86 + runs-on: ${{ inputs.username || github.actor }}-x86 steps: - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}} - uses: ./.github/ci-setup-action with: dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}" - concurrency_key: bb-bench-binaries-${{ github.actor }}-x86 + concurrency_key: bb-bench-binaries-${{ inputs.username || github.actor }}-x86 - name: Build and Push Binaries - if: ${{ github.event.inputs.just_start_spot != 'true' }} timeout-minutes: 15 working-directory: ./barretenberg/cpp/ run: earthly-ci --push +bench-binaries @@ -115,24 +112,24 @@ jobs: uses: ./.github/workflows/setup-runner.yml needs: bb-bench-binaries with: - runner_label: ${{ github.actor }}-bench-x86 + runner_label: ${{ inputs.username || github.actor }}-bench-x86 ebs_cache_size_gb: 64 runner_concurrency: 1 - subaction: ${{ github.event.inputs.runner_action || 'start' }} + subaction: ${{ inputs.runner_action || 'start' }} ec2_instance_type: m6a.4xlarge ec2_ami_id: ami-04d8422a9ba4de80f ec2_instance_ttl: 15 # refreshed by jobs secrets: inherit bb-bench: - runs-on: ${{ github.actor }}-bench-x86 + runs-on: ${{ inputs.username || github.actor }}-bench-x86 needs: setup-bench steps: - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}} - uses: ./.github/ci-setup-action with: dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}" - concurrency_key: bb-bench-${{ github.actor }}-bench-x86 + concurrency_key: bb-bench-${{ inputs.username || github.actor }}-bench-x86 # Use bench_mode=cache to read the pushed build above - name: Client IVC Bench working-directory: ./barretenberg/cpp/ @@ -145,23 +142,14 @@ jobs: run: earthly-ci --no-output +bench-ultra-honk --bench_mode=cache merge-check: - runs-on: ubuntu-latest + runs-on: ${{ inputs.username || github.actor }}-x86 needs: [e2e, bb-native-tests, bb-bench] - if: always() # Ensures this job runs regardless of the success or failure of dependencies. steps: - - run: | - echo "E2E Test Status: ${{ needs.e2e.result }}" - echo "Native Tests Status: ${{ needs.bb-native-tests.result }}" - echo "Bench Tests Status: ${{ needs.bb-bench.result }}" - if [[ "${{ needs.e2e.result }}" != 'success' || "${{ needs.bb-native-tests.result }}" != 'success' || "${{ needs.bb-bench.result }}" != 'success' ]]; then - echo "Pull request merging not allowed due to failures." - exit 1 - fi - echo "Pull request merging now allowed." + - run: echo Pull request merging now allowed. notify: - runs-on: ubuntu-latest needs: [e2e, bb-native-tests, bb-bench] + runs-on: ubuntu-latest if: ${{ github.ref == 'refs/heads/master' && failure() }} steps: - name: Send notification to aztec3-ci channel if workflow failed on master diff --git a/.github/workflows/setup-runner.yml b/.github/workflows/setup-runner.yml index 0fe34e0cb73..fdde1373f94 100644 --- a/.github/workflows/setup-runner.yml +++ b/.github/workflows/setup-runner.yml @@ -58,7 +58,7 @@ jobs: group: start-builder-${{ inputs.runner_label }} steps: - name: Start EC2 runner - uses: AztecProtocol/ec2-action-builder@v0.15 + uses: AztecProtocol/ec2-action-builder@v0.14e with: github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} @@ -117,7 +117,7 @@ jobs: - name: Run Docker Prune # helps to not overuse space - run: docker system prune -f || true + run: docker system prune -f -a || true - name: Run Earthly Bootstrap run: earthly bootstrap diff --git a/.github/workflows/start-spot.yml b/.github/workflows/start-spot.yml index f3d84c1f557..539567f08cb 100644 --- a/.github/workflows/start-spot.yml +++ b/.github/workflows/start-spot.yml @@ -1,27 +1,35 @@ # Useful if the spot runners are in a bad state -name: Start Personal Spot +name: Start/Stop Personal Spot on: - workflow_dispatch: {} + workflow_dispatch: + inputs: + username: + description: 'Defaults to GitHub Actor' + required: false + action: + description: 'Can also be stop or restart, defaults to start' + required: false + default: 'start' jobs: - stop-build-x86: + start-build: uses: ./.github/workflows/setup-runner.yml with: - runner_label: ${{ github.actor }}-x86 + runner_label: ${{ inputs.username || github.actor }}-x86 ebs_cache_size_gb: 256 runner_concurrency: 20 - subaction: start + subaction: ${{ inputs.action }} ec2_instance_type: m6a.32xlarge ec2_ami_id: ami-04d8422a9ba4de80f ec2_instance_ttl: 40 # refreshed by jobs secrets: inherit - stop-bench: + start-bench: uses: ./.github/workflows/setup-runner.yml with: - runner_label: ${{ github.actor }}-bench-x86 + runner_label: ${{ inputs.username || github.actor }}-bench-x86 ebs_cache_size_gb: 64 runner_concurrency: 1 - subaction: start + subaction: ${{ inputs.action }} ec2_instance_type: m6a.4xlarge ec2_ami_id: ami-04d8422a9ba4de80f ec2_instance_ttl: 15 # refreshed by jobs diff --git a/ci.py b/ci.py index f5127ec0759..21caf8c9480 100755 --- a/ci.py +++ b/ci.py @@ -1,27 +1,32 @@ #!/usr/bin/env python3 # ubuntu: apt install python3-blessed from blessed import Terminal -import os, json, subprocess, sys +import os, json, subprocess, sys, time term = Terminal() if 'GITHUB_ACTOR' not in os.environ: print("Make sure you have GITHUB_ACTOR in your environment variables e.g. .zshrc") sys.exit(1) GITHUB_ACTOR = os.environ['GITHUB_ACTOR'] +BRANCH = subprocess.run("git rev-parse --abbrev-ref HEAD", shell=True, text=True, capture_output=True).stdout.strip() def main(): selection = -1 - with term.fullscreen(), term.cbreak(): - print(term.home + term.clear) - while selection not in ('1', '2', '3', '4', 'q'): - print(term.move_y(1) + "Please select an option:") - print("1. SSH into build machine") - print("2. SSH into bench machine") - print("3. Start/Stop spot machines") - print("4. Manage Running Jobs") - print("q. Quit") - with term.location(0, term.height - 1): - selection = term.inkey() + if len(sys.argv) >= 2: + selection = sys.argv[1] + else: + with term.fullscreen(), term.cbreak(): + print(term.home + term.clear) + while selection not in ('1', '2', '3', '4', '5', 'q'): + print(term.move_y(1) + "Please select an option:") + print("1. SSH into build machine") + print("2. SSH into bench machine") + print("3. Start/Stop spot machines") + print("4. Manage Running Jobs") + print("5. Run ci.yml manually") + print("q. Quit") + with term.location(0, term.height - 1): + selection = term.inkey() if selection == '1': ssh_into_machine('x86') @@ -31,42 +36,55 @@ def main(): manage_spot_instances() elif selection == '4': manage_ci_workflows() + elif selection == '5': + call_ci_workflow() def ssh_into_machine(suffix): - GITHUB_ACTOR = os.getenv('GITHUB_ACTOR', 'default_actor') ssh_key_path = os.path.expanduser('~/.ssh/build_instance_key') if not os.path.exists(ssh_key_path): print("SSH key does not exist.") return - # Command to get the instance information - cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2' - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - if result.returncode != 0: - print("Failed to get AWS instances:", result.stderr) - return - # Parse the output to find the public IP address - try: - instances_data = json.loads(result.stdout) - instance = instances_data['Reservations'][0]['Instances'][0] - instance_ip = instance['PublicIpAddress'] - except (KeyError, IndexError, json.JSONDecodeError) as e: - print("Error parsing AWS CLI output:", e) - return + for i in range(10): + # Command to get the instance information + cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2' + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + if result.returncode != 0: + print("Failed to get AWS instances:", result.stderr) + return + try: + instances_data = json.loads(result.stdout) + instance = instances_data['Reservations'][0]['Instances'][0] + instance_ip = instance['PublicIpAddress'] + break + except (KeyError, IndexError, json.JSONDecodeError) as e: + print("Error parsing AWS CLI output, trying again:", e) + if i == 0: + print("Couldn't find spot, starting spot, and looping until we can find it") + call_spot_workflow('start') + elif i == 9: + print("Couldn't find spot even after creating it!") + sys.exit(1) + time.sleep(10) # SSH command using the public IP ssh_cmd = f"ssh -o StrictHostKeychecking=no -i {ssh_key_path} ubuntu@{instance_ip}" - print(f"Connecting to {instance_ip}. Consider delaying the impeding shutdown.") + print(f"Connecting to {instance_ip}. Consider delaying the impending shutdown and running a process called Runner.Worker to fool the reaper (automation TODO).") ssh_process = subprocess.Popen(ssh_cmd, shell=True) ssh_process.wait() # Wait for the SSH session to complete +def call_spot_workflow(action): + subprocess.run(f'gh workflow run start-spot.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}" --field action="{action}"', shell=True) + +def call_ci_workflow(): + print( + "NOTE: This is mostly useful if impersonating a GITHUB_ACTOR. Usually you rather do Manage Running Jobs and retry." + ) + subprocess.run(f'gh workflow run ci.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}"', shell=True) + def manage_spot_instances(): - action = input("Enter 'start' to run or 'stop' to stop spot instances: ") - if action == 'start': - subprocess.run('gh workflow run start-spot.yml', shell=True) - elif action == 'stop': - subprocess.run('gh workflow run stop-spot.yml', shell=True) + call_spot_workflow(input("Enter one of 'start', 'stop', 'restart':")) def manage_ci_workflows(): # Retrieve the most recent workflow run @@ -86,7 +104,7 @@ def manage_ci_workflows(): subprocess.run(f"gh run cancel {run_id}", shell=True) if action.lower() == 'rerun': # needed so the spot runners still work - subprocess.run('gh workflow run start-spot.yml', shell=True) + call_spot_workflow('start') subprocess.run(f"gh run rerun {run_id} --failed", shell=True) elif action.lower() == 'rerun-all': subprocess.run(f"gh run rerun {run_id}", shell=True)