Skip to content

Commit

Permalink
Merge pull request shakenfist#2385 from shakenfist/expose-cluster-res…
Browse files Browse the repository at this point in the history
…ources

Expose cluster resources
  • Loading branch information
mikalstill authored May 1, 2024
2 parents 651ea3c + a73f14a commit c75bd6a
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 4 deletions.
12 changes: 10 additions & 2 deletions docs/release_notes/v07-v08.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,14 @@

## Major changes

## REST API

* There is now an API call (`GET /admin/resources`) which exposes the resource
utilization of the cluster to admin users.

## Supported distributions

* Debian 12 is now supported as a host OS.
* Ubuntu 22.04 is now supported as a host OS.
* Fedora 34, 38, and 39, as well as Ubuntu 22.04, now have canned guest images.
* Rocky 8 and 9 now have canned guest images.
Expand Down Expand Up @@ -78,7 +84,8 @@
* We now use gRPC calls to compact etcd, instead of relying on a python client
wrapper. This means we can now update our gRPC and protobuf dependencies to
much more recent versions.
* etcd traffic levels are now monitored in CI.
* etcd traffic levels are now monitored in CI, and we attempt to hold fewer
cluster-level locks for local operations.

## Minor changes

Expand All @@ -96,4 +103,5 @@
longer required, and is therefore more reliable.
* The `qemu` commands generated now vary based on the version of `qemu` installed
on the machine. This was required to support the newer `qemu` version in
Ubuntu 22.04.
Ubuntu 22.04.
* The ansible modules have been rewritten to be more reliable.
20 changes: 20 additions & 0 deletions shakenfist/external_api/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from shakenfist import etcd
from shakenfist.external_api import base as api_base
from shakenfist import scheduler


admin_locks_get_example = """{
Expand Down Expand Up @@ -61,3 +62,22 @@ def get(self):
resp = flask.Response(cacert, mimetype='text/plain')
resp.status_code = 200
return resp


# Example response body attached to the 200 response of
# AdminREsourcesEndpoint.get in the generated API documentation.
# TODO: the body is currently only a "..." placeholder; replace it with a
# representative sample of the summarize_resources() output.
admin_resources_get_example = """{
...
}"""


# REST resource which reports cluster resource usage and availability to
# administrators. The decorators on get() require a valid token and an admin
# caller, and log the use of the token.
#
# NOTE(review): the "REsources" capitalisation looks like a typo, but the name
# is this class's public identifier and is presumably referenced where the API
# routes are registered, so it is deliberately left unchanged here.
class AdminREsourcesEndpoint(sf_api.Resource):
    @swag_from(api_base.swagger_helper(
        'admin', 'List resources currently available in the cluster.', [],
        [(200, 'A summary of resource usage and availability in the cluster.',
          admin_resources_get_example)],
        requires_admin=True))
    @api_base.verify_token
    @api_base.caller_is_admin
    @api_base.log_token_use
    def get(self):
        # A new Scheduler is built for every request and its summary dict is
        # returned directly as the response body.
        return scheduler.Scheduler().summarize_resources()
2 changes: 1 addition & 1 deletion shakenfist/external_api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def get(self):
'<body><h1>Shaken Fist REST API service</h1>'
'<p>You might be interested in the <a href="/apidocs">apidocs</a>.</p>'
'<p>Machine searchable API capabilities:</p><ul>'
'<li>admin: cluster-cacert</li>'
'<li>admin: cluster-cacert, cluster-resources</li>'
'<li>agent-operations: agentoperations-crud, instance-agentoperations, '
'instance-agentoperations-all</li>'
'<li>artifacts: artifact-metadata, artifact-upload-types</li>'
Expand Down
72 changes: 71 additions & 1 deletion shakenfist/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
# the cache is populated by the first caller.
CACHED_NETWORK_NODE = None

# A node whose 'node_queue_waiting' metric is greater than this value is
# considered to have an unreasonable queue: _has_reasonable_queue_state()
# tests against it, and summarize_resources() leaves such nodes out.
UNREASONABLE_QUEUE_LENGTH = 20


def get_network_node():
global CACHED_NETWORK_NODE
Expand Down Expand Up @@ -82,7 +84,7 @@ def refresh_metrics(self):

def _has_reasonable_queue_state(self, log_ctx, node):
waiting = self.metrics[node].get('node_queue_waiting', 0)
if waiting > 20:
if waiting > UNREASONABLE_QUEUE_LENGTH:
log_ctx.with_fields({
'node': node,
'node_queue_waiting': waiting
Expand Down Expand Up @@ -339,3 +341,71 @@ def find_candidates(self, inst, network, candidates=None):
inst.add_event(EVENT_TYPE_AUDIT, 'schedule final candidates',
extra={'candidates': candidates})
return candidates

def summarize_resources(self):
    """Summarize spare capacity on the cluster's usable hypervisors.

    Cached node metrics are refreshed first if they are older than
    config.SCHEDULER_CACHE_TIMEOUT or if no metrics are held at all.

    Nodes which are not hypervisors, and nodes with more than
    UNREASONABLE_QUEUE_LENGTH waiting queue items, are omitted from both
    the per node details and the totals.

    Returns a dict of the form:

        {
            'total': {'cpu_available': ..., 'ram_available': ...},
            'per_node': {<node>: {...}, ...}
        }

    Values are in whatever units the underlying node metrics use.
    """
    # Use the cached metrics unless they are stale or were never loaded.
    cache_age = time.time() - self.metrics_updated
    if cache_age > config.SCHEDULER_CACHE_TIMEOUT or len(self.metrics) == 0:
        self.refresh_metrics()

    total_cpu_available = 0
    total_ram_available = 0
    per_node = {}

    for node, node_metrics in self.metrics.items():
        # Only hypervisors with reasonable queue lengths are reported.
        if not node_metrics.get('is_hypervisor', False):
            continue
        if (node_metrics.get('node_queue_waiting', 0) >
                UNREASONABLE_QUEUE_LENGTH):
            continue

        # CPU: the overcommitted vCPU ceiling less the vCPUs which are
        # already allocated to instances.
        cpu_ceiling = node_metrics.get('cpu_max', 0) * config.CPU_OVERCOMMIT_RATIO
        cpu_available = (
            cpu_ceiling - node_metrics.get('cpu_total_instance_vcpus', 0))

        # Memory: a single instance is limited by what is free right now
        # less the system reservation, while the node as a whole is limited
        # by the overcommitted maximum less what instances actually use.
        ram_max_per_instance = (
            node_metrics.get('memory_available', 0) -
            (config.RAM_SYSTEM_RESERVATION * 1024))
        ram_max = node_metrics.get('memory_max', 0) * config.RAM_OVERCOMMIT_RATIO
        ram_available = (
            ram_max - node_metrics.get('memory_total_instance_actual', 0))

        # Disk: free instance storage scaled by GiB, less the minimum we
        # insist on keeping free.
        disk_available = (
            int(node_metrics.get('disk_free_instances', '0')) / GiB -
            config.MINIMUM_FREE_DISK)

        per_node[node] = {
            'cpu_max_per_instance': node_metrics.get('cpu_max_per_instance', 0),
            'cpu_available': cpu_available,
            'cpu_load_1': node_metrics.get('cpu_load_1', 0),
            'cpu_load_5': node_metrics.get('cpu_load_5', 0),
            'cpu_load_15': node_metrics.get('cpu_load_15', 0),
            'ram_max_per_instance': ram_max_per_instance,
            'ram_max': ram_max,
            'ram_available': ram_available,
            'disk_available': disk_available,
            'instances_total': node_metrics.get('instances_total', 0),
            'instances_active': node_metrics.get('instances_active', 0)
        }

        total_cpu_available += cpu_available
        total_ram_available += ram_available

    return {
        'total': {
            'cpu_available': total_cpu_available,
            'ram_available': total_ram_available
        },
        'per_node': per_node
    }

0 comments on commit c75bd6a

Please sign in to comment.