chore(ci): better docker prune #5889

Merged
merged 1 commit on Apr 22, 2024
3 changes: 0 additions & 3 deletions .github/ci-setup-action/action.yml

@@ -9,9 +9,6 @@ inputs:
   concurrency_key:
     required: false
     description: 'Concurrency key for locking jobs'
-  concurrency_token:
-    required: false
-    description: 'TODO unused'
 runs:
   # define an action, runs in OS of caller
   using: composite

1 change: 0 additions & 1 deletion .github/workflows/ci-arm.yml

@@ -36,7 +36,6 @@ jobs:
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_token: "${{ secrets.AZTEC_GITHUB_TOKEN }}"
           # must be globally unique for build x runner
           concurrency_key: build-master-arm
       # prepare images locally, tagged by commit hash

52 changes: 20 additions & 32 deletions .github/workflows/ci.yml
@@ -5,13 +5,12 @@ on:
   pull_request: {}
   workflow_dispatch:
     inputs:
+      username:
+        description: 'Defaults to GitHub Actor'
+        required: false
       runner_action:
         description: "The action to take with the self-hosted runner (start, stop, restart)."
         required: false
-      just_start_spot:
-        description: "Should we just run spots?"
-        type: boolean
-        required: false
 concurrency:
   # force parallelism in master
   group: ci-${{ github.ref_name == 'master' && github.run_id || github.ref_name }}
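
Note: the new username input lets a manually dispatched run target another user's runner pool. A minimal invocation sketch, matching the gh CLI call this PR adds to ci.py below (the branch name here is a placeholder):

    gh workflow run ci.yml --ref my-branch --field username="$GITHUB_ACTOR"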
@@ -20,27 +19,26 @@ jobs:
   setup:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-x86
+      runner_label: ${{ inputs.username || github.actor }}-x86
       ebs_cache_size_gb: 256
       runner_concurrency: 20
-      subaction: ${{ github.event.inputs.runner_action || 'start' }}
+      subaction: ${{ inputs.runner_action || 'start' }}
       ec2_instance_type: m6a.32xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 40 # refreshed by jobs
     secrets: inherit

   build:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
-    if: ${{ github.event.inputs.just_start_spot != 'true' }}
+    runs-on: ${{ inputs.username || github.actor }}-x86
     outputs:
       e2e_list: ${{ steps.e2e_list.outputs.list }}
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: build-${{ github.actor }}-x86
+          concurrency_key: build-${{ inputs.username || github.actor }}-x86
       # prepare images locally, tagged by commit hash
       - name: "Build E2E Image"
         timeout-minutes: 40
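
Note: the recurring inputs.username || github.actor fallback works because inputs.username evaluates to an empty string outside workflow_dispatch events, and the Actions expression operator || returns its right operand when the left is falsy. So, under that semantics:

    runs-on: ${{ inputs.username || github.actor }}-x86  # dispatch with a hypothetical username=alice runs on alice-x86; a plain PR run falls back to <actor>-x86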
@@ -54,7 +52,7 @@
   # all the end-to-end integration tests for aztec
   e2e:
     needs: build
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     strategy:
       fail-fast: false
       matrix:
@@ -65,7 +63,7 @@
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
           # must be globally unique for build x runner
-          concurrency_key: e2e-${{ github.actor }}-x86-${{ matrix.test }}
+          concurrency_key: e2e-${{ inputs.username || github.actor }}-x86-${{ matrix.test }}
       - name: Test
         working-directory: ./yarn-project/end-to-end/
         timeout-minutes: 25
@@ -78,7 +76,7 @@
   # only ran on x86 for resource reasons (memory intensive)
   bb-native-tests:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     strategy:
       fail-fast: false
     steps:
@@ -88,7 +86,7 @@
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
           # must be globally unique for build x runner
-          concurrency_key: bb-native-tests-${{ github.actor }}-x86
+          concurrency_key: bb-native-tests-${{ inputs.username || github.actor }}-x86
       - name: "Native Prover Tests"
         working-directory: ./barretenberg/cpp/
         timeout-minutes: 25
@@ -98,15 +96,14 @@
   # push benchmarking binaries to dockerhub registry
   bb-bench-binaries:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: bb-bench-binaries-${{ github.actor }}-x86
+          concurrency_key: bb-bench-binaries-${{ inputs.username || github.actor }}-x86
       - name: Build and Push Binaries
-        if: ${{ github.event.inputs.just_start_spot != 'true' }}
         timeout-minutes: 15
         working-directory: ./barretenberg/cpp/
         run: earthly-ci --push +bench-binaries
@@ -115,24 +112,24 @@
     uses: ./.github/workflows/setup-runner.yml
     needs: bb-bench-binaries
     with:
-      runner_label: ${{ github.actor }}-bench-x86
+      runner_label: ${{ inputs.username || github.actor }}-bench-x86
       ebs_cache_size_gb: 64
       runner_concurrency: 1
-      subaction: ${{ github.event.inputs.runner_action || 'start' }}
+      subaction: ${{ inputs.runner_action || 'start' }}
       ec2_instance_type: m6a.4xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 15 # refreshed by jobs
     secrets: inherit

   bb-bench:
-    runs-on: ${{ github.actor }}-bench-x86
+    runs-on: ${{ inputs.username || github.actor }}-bench-x86
     needs: setup-bench
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: bb-bench-${{ github.actor }}-bench-x86
+          concurrency_key: bb-bench-${{ inputs.username || github.actor }}-bench-x86
       # Use bench_mode=cache to read the pushed build above
       - name: Client IVC Bench
         working-directory: ./barretenberg/cpp/
@@ -145,23 +142,14 @@
       run: earthly-ci --no-output +bench-ultra-honk --bench_mode=cache

   merge-check:
-    runs-on: ubuntu-latest
+    runs-on: ${{ inputs.username || github.actor }}-x86
     needs: [e2e, bb-native-tests, bb-bench]
-    if: always() # Ensures this job runs regardless of the success or failure of dependencies.
     steps:
-      - run: |
-          echo "E2E Test Status: ${{ needs.e2e.result }}"
-          echo "Native Tests Status: ${{ needs.bb-native-tests.result }}"
-          echo "Bench Tests Status: ${{ needs.bb-bench.result }}"
-          if [[ "${{ needs.e2e.result }}" != 'success' || "${{ needs.bb-native-tests.result }}" != 'success' || "${{ needs.bb-bench.result }}" != 'success' ]]; then
-            echo "Pull request merging not allowed due to failures."
-            exit 1
-          fi
-          echo "Pull request merging now allowed."
+      - run: echo Pull request merging now allowed.

   notify:
-    runs-on: ubuntu-latest
     needs: [e2e, bb-native-tests, bb-bench]
+    runs-on: ubuntu-latest
     if: ${{ github.ref == 'refs/heads/master' && failure() }}
     steps:
       - name: Send notification to aztec3-ci channel if workflow failed on master

4 changes: 2 additions & 2 deletions .github/workflows/setup-runner.yml

@@ -58,7 +58,7 @@
       group: start-builder-${{ inputs.runner_label }}
     steps:
       - name: Start EC2 runner
-        uses: AztecProtocol/ec2-action-builder@v0.15
+        uses: AztecProtocol/ec2-action-builder@v0.14e
        with:
          github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -117,7 +117,7 @@

       - name: Run Docker Prune
         # helps to not overuse space
-        run: docker system prune -f || true
+        run: docker system prune -f -a || true

       - name: Run Earthly Bootstrap
         run: earthly bootstrap
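
Note on the prune change that gives this PR its title: docker system prune -f removes only stopped containers, unused networks, dangling images, and dangling build cache. Adding -a/--all also removes every image not referenced by a container, which is what actually reclaims disk on long-lived runners; || true keeps the step from failing the workflow if the Docker daemon is unavailable. A long-form sketch of the equivalent command:

    # force (no confirmation prompt), prune ALL unused images rather than just dangling ones
    docker system prune --force --all || true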

24 changes: 16 additions & 8 deletions .github/workflows/start-spot.yml

@@ -1,27 +1,35 @@
 # Useful if the spot runners are in a bad state
-name: Start Personal Spot
+name: Start/Stop Personal Spot
 on:
-  workflow_dispatch: {}
+  workflow_dispatch:
+    inputs:
+      username:
+        description: 'Defaults to GitHub Actor'
+        required: false
+      action:
+        description: 'Can also be stop or restart, defaults to start'
+        required: false
+        default: 'start'
 jobs:
-  stop-build-x86:
+  start-build:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-x86
+      runner_label: ${{ inputs.username || github.actor }}-x86
       ebs_cache_size_gb: 256
       runner_concurrency: 20
-      subaction: start
+      subaction: ${{ inputs.action }}
       ec2_instance_type: m6a.32xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 40 # refreshed by jobs
     secrets: inherit

-  stop-bench:
+  start-bench:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-bench-x86
+      runner_label: ${{ inputs.username || github.actor }}-bench-x86
       ebs_cache_size_gb: 64
       runner_concurrency: 1
-      subaction: start
+      subaction: ${{ inputs.action }}
       ec2_instance_type: m6a.4xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 15 # refreshed by jobs
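
For reference, the renamed workflow can now be driven from the CLI with both inputs, mirroring the call_spot_workflow helper added to ci.py below (the branch name is a placeholder):

    gh workflow run start-spot.yml --ref my-branch --field username="$GITHUB_ACTOR" --field action=stop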

86 changes: 52 additions & 34 deletions ci.py
@@ -1,27 +1,32 @@
 #!/usr/bin/env python3
 # ubuntu: apt install python3-blessed
 from blessed import Terminal
-import os, json, subprocess, sys
+import os, json, subprocess, sys, time

 term = Terminal()
 if 'GITHUB_ACTOR' not in os.environ:
     print("Make sure you have GITHUB_ACTOR in your environment variables e.g. .zshrc")
     sys.exit(1)
+GITHUB_ACTOR = os.environ['GITHUB_ACTOR']
+BRANCH = subprocess.run("git rev-parse --abbrev-ref HEAD", shell=True, text=True, capture_output=True).stdout.strip()

 def main():
     selection = -1
-    with term.fullscreen(), term.cbreak():
-        print(term.home + term.clear)
-        while selection not in ('1', '2', '3', '4', 'q'):
-            print(term.move_y(1) + "Please select an option:")
-            print("1. SSH into build machine")
-            print("2. SSH into bench machine")
-            print("3. Start/Stop spot machines")
-            print("4. Manage Running Jobs")
-            print("q. Quit")
-            with term.location(0, term.height - 1):
-                selection = term.inkey()
+    if len(sys.argv) >= 2:
+        selection = sys.argv[1]
+    else:
+        with term.fullscreen(), term.cbreak():
+            print(term.home + term.clear)
+            while selection not in ('1', '2', '3', '4', '5', 'q'):
+                print(term.move_y(1) + "Please select an option:")
+                print("1. SSH into build machine")
+                print("2. SSH into bench machine")
+                print("3. Start/Stop spot machines")
+                print("4. Manage Running Jobs")
+                print("5. Run ci.yml manually")
+                print("q. Quit")
+                with term.location(0, term.height - 1):
+                    selection = term.inkey()

     if selection == '1':
         ssh_into_machine('x86')
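
Usage note: with the argv handling added above, the blessed menu can be bypassed and the selection passed directly, e.g.:

    ./ci.py 5   # jump straight to "Run ci.yml manually"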
@@ -31,42 +36,55 @@ def main():
         manage_spot_instances()
     elif selection == '4':
         manage_ci_workflows()
+    elif selection == '5':
+        call_ci_workflow()

 def ssh_into_machine(suffix):
-    GITHUB_ACTOR = os.getenv('GITHUB_ACTOR', 'default_actor')
     ssh_key_path = os.path.expanduser('~/.ssh/build_instance_key')
     if not os.path.exists(ssh_key_path):
         print("SSH key does not exist.")
         return

-    # Command to get the instance information
-    cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2'
-    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-    if result.returncode != 0:
-        print("Failed to get AWS instances:", result.stderr)
-        return
-
-    # Parse the output to find the public IP address
-    try:
-        instances_data = json.loads(result.stdout)
-        instance = instances_data['Reservations'][0]['Instances'][0]
-        instance_ip = instance['PublicIpAddress']
-    except (KeyError, IndexError, json.JSONDecodeError) as e:
-        print("Error parsing AWS CLI output:", e)
-        return
+    for i in range(10):
+        # Command to get the instance information
+        cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2'
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to get AWS instances:", result.stderr)
+            return
+        try:
+            instances_data = json.loads(result.stdout)
+            instance = instances_data['Reservations'][0]['Instances'][0]
+            instance_ip = instance['PublicIpAddress']
+            break
+        except (KeyError, IndexError, json.JSONDecodeError) as e:
+            print("Error parsing AWS CLI output, trying again:", e)
+            if i == 0:
+                print("Couldn't find spot, starting spot, and looping until we can find it")
+                call_spot_workflow('start')
+            elif i == 9:
+                print("Couldn't find spot even after creating it!")
+                sys.exit(1)
+            time.sleep(10)

     # SSH command using the public IP
     ssh_cmd = f"ssh -o StrictHostKeychecking=no -i {ssh_key_path} ubuntu@{instance_ip}"
-    print(f"Connecting to {instance_ip}. Consider delaying the impeding shutdown.")
+    print(f"Connecting to {instance_ip}. Consider delaying the impending shutdown and running a process called Runner.Worker to fool the reaper (automation TODO).")
     ssh_process = subprocess.Popen(ssh_cmd, shell=True)
     ssh_process.wait()  # Wait for the SSH session to complete

+def call_spot_workflow(action):
+    subprocess.run(f'gh workflow run start-spot.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}" --field action="{action}"', shell=True)
+
+def call_ci_workflow():
+    print(
+        "NOTE: This is mostly useful if impersonating a GITHUB_ACTOR. Usually you rather do Manage Running Jobs and retry."
+    )
+    subprocess.run(f'gh workflow run ci.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}"', shell=True)
+
 def manage_spot_instances():
-    action = input("Enter 'start' to run or 'stop' to stop spot instances: ")
-    if action == 'start':
-        subprocess.run('gh workflow run start-spot.yml', shell=True)
-    elif action == 'stop':
-        subprocess.run('gh workflow run stop-spot.yml', shell=True)
+    call_spot_workflow(input("Enter one of 'start', 'stop', 'restart':"))

 def manage_ci_workflows():
     # Retrieve the most recent workflow run
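
Note: the retry loop added to ssh_into_machine polls aws ec2 describe-instances up to 10 times with a 10-second sleep between attempts (roughly 100 seconds total); on the first miss it dispatches start-spot.yml via call_spot_workflow('start'), giving a stopped spot instance time to boot before the SSH attempt.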
@@ -86,7 +104,7 @@ def manage_ci_workflows():
        subprocess.run(f"gh run cancel {run_id}", shell=True)
    if action.lower() == 'rerun':
        # needed so the spot runners still work
-        subprocess.run('gh workflow run start-spot.yml', shell=True)
+        call_spot_workflow('start')
        subprocess.run(f"gh run rerun {run_id} --failed", shell=True)
    elif action.lower() == 'rerun-all':
        subprocess.run(f"gh run rerun {run_id}", shell=True)