diff --git a/docs/release_notes/v07-v08.md b/docs/release_notes/v07-v08.md
index b7cf29bcd..52434e921 100644
--- a/docs/release_notes/v07-v08.md
+++ b/docs/release_notes/v07-v08.md
@@ -2,8 +2,14 @@
## Major changes
+## REST API
+
+* There is now an API call (`GET /admin/resources`) which exposes the resource
+ utilization of the cluster to admin users.
+
## Supported distributions
+* Debian 12 is now supported as a host OS.
* Ubuntu 22.04 is now supported as a host OS.
* Fedora 34, 38, and 29, as well as Ubuntu 22.04 now have canned guest images.
* Rocky 8 and 9 now have canned guest images.
@@ -78,7 +84,8 @@
* We now use gRPC calls to compact etcd, instead of relying on a python client
wrapper. This means we can now update our gRPC and protobuf dependencies to
much more recent versions.
-* etcd traffic levels are now monitored in CI.
+* etcd traffic levels are now monitored in CI and we attempt to hold fewer
+  cluster-level locks for local operations.
## Minor changes
@@ -96,4 +103,5 @@
longer required, and is therefore more reliable.
* The `qemu` commands generated now vary based on the version of `qemu` installed
on the machine. This was required to support the newer `qemu` version in
- Ubuntu 22.04.
\ No newline at end of file
+ Ubuntu 22.04.
+* The Ansible modules have been rewritten to be more reliable.
\ No newline at end of file
diff --git a/shakenfist/external_api/admin.py b/shakenfist/external_api/admin.py
index f44c11901..533e3d449 100644
--- a/shakenfist/external_api/admin.py
+++ b/shakenfist/external_api/admin.py
@@ -14,6 +14,7 @@
from shakenfist import etcd
from shakenfist.external_api import base as api_base
+from shakenfist import scheduler
admin_locks_get_example = """{
@@ -61,3 +62,22 @@ def get(self):
resp = flask.Response(cacert, mimetype='text/plain')
resp.status_code = 200
return resp
+
+
+admin_resources_get_example = """{
+ ...
+}"""
+
+
+class AdminResourcesEndpoint(sf_api.Resource):
+ @swag_from(api_base.swagger_helper(
+ 'admin', 'List resources currently available in the cluster.', [],
+ [(200, 'A summary of resource usage and availability in the cluster.',
+ admin_resources_get_example)],
+ requires_admin=True))
+ @api_base.verify_token
+ @api_base.caller_is_admin
+ @api_base.log_token_use
+ def get(self):
+ s = scheduler.Scheduler()
+ return s.summarize_resources()
diff --git a/shakenfist/external_api/app.py b/shakenfist/external_api/app.py
index 9b34fe3be..1139271e7 100644
--- a/shakenfist/external_api/app.py
+++ b/shakenfist/external_api/app.py
@@ -121,7 +121,7 @@ def get(self):
             '\nShaken Fist REST API service\n'
             'You might be interested in the apidocs.\n'
             'Machine searchable API capabilities:\n'
-            '- admin: cluster-cacert\n'
+            '- admin: cluster-cacert, cluster-resources\n'
             '- agent-operations: agentoperations-crud, instance-agentoperations, '
             'instance-agentoperations-all\n'
             '- artifacts: artifact-metadata, artifact-upload-types\n'
diff --git a/shakenfist/scheduler.py b/shakenfist/scheduler.py
index acaddeda2..29bc76780 100644
--- a/shakenfist/scheduler.py
+++ b/shakenfist/scheduler.py
@@ -26,6 +26,8 @@
# the cache is populated by the first caller.
CACHED_NETWORK_NODE = None
+UNREASONABLE_QUEUE_LENGTH = 20
+
def get_network_node():
global CACHED_NETWORK_NODE
@@ -82,7 +84,7 @@ def refresh_metrics(self):
def _has_reasonable_queue_state(self, log_ctx, node):
waiting = self.metrics[node].get('node_queue_waiting', 0)
- if waiting > 20:
+ if waiting > UNREASONABLE_QUEUE_LENGTH:
log_ctx.with_fields({
'node': node,
'node_queue_waiting': waiting
@@ -339,3 +341,71 @@ def find_candidates(self, inst, network, candidates=None):
inst.add_event(EVENT_TYPE_AUDIT, 'schedule final candidates',
extra={'candidates': candidates})
return candidates
+
+ def summarize_resources(self):
+ # Refresh metrics if it's too old, or there are no nodes.
+ diff = time.time() - self.metrics_updated
+ if diff > config.SCHEDULER_CACHE_TIMEOUT or len(self.metrics) == 0:
+ self.refresh_metrics()
+
+ # Only hypervisors with reasonable queue lengths are candidates
+ resources = {
+ 'total': {
+ 'cpu_available': 0,
+ 'ram_available': 0
+ },
+ 'per_node': {}
+ }
+
+ for n in self.metrics.keys():
+ if not self.metrics[n].get('is_hypervisor', False):
+ continue
+
+ if (self.metrics[n].get('node_queue_waiting', 0) >
+ UNREASONABLE_QUEUE_LENGTH):
+ continue
+
+ resources['per_node'][n] = {}
+
+ # CPU
+ resources['per_node'][n]['cpu_max_per_instance'] = \
+ self.metrics[n].get('cpu_max_per_instance', 0)
+
+ hard_max_cpus = (self.metrics[n].get(
+ 'cpu_max', 0) * config.CPU_OVERCOMMIT_RATIO)
+ current_cpu = self.metrics[n].get('cpu_total_instance_vcpus', 0)
+ resources['per_node'][n]['cpu_available'] = hard_max_cpus - current_cpu
+ resources['total']['cpu_available'] += resources['per_node'][n]['cpu_available']
+
+ resources['per_node'][n]['cpu_load_1'] = self.metrics[n].get(
+ 'cpu_load_1', 0)
+ resources['per_node'][n]['cpu_load_5'] = self.metrics[n].get(
+ 'cpu_load_5', 0)
+ resources['per_node'][n]['cpu_load_15'] = self.metrics[n].get(
+ 'cpu_load_15', 0)
+
+ # Memory
+ resources['per_node'][n]['ram_max_per_instance'] = \
+ (self.metrics[n].get('memory_available', 0) -
+ (config.RAM_SYSTEM_RESERVATION * 1024))
+ resources['per_node'][n]['ram_max'] = \
+ self.metrics[n].get('memory_max', 0) * \
+ config.RAM_OVERCOMMIT_RATIO
+ resources['per_node'][n]['ram_available'] = \
+ (self.metrics[n].get('memory_max', 0) * config.RAM_OVERCOMMIT_RATIO -
+ self.metrics[n].get('memory_total_instance_actual', 0))
+ resources['total']['ram_available'] += resources['per_node'][n]['ram_available']
+
+ # Disk
+ disk_free = int(self.metrics[n].get(
+ 'disk_free_instances', '0')) / GiB
+ disk_free -= config.MINIMUM_FREE_DISK
+ resources['per_node'][n]['disk_available'] = disk_free
+
+ # Instance count
+ resources['per_node'][n]['instances_total'] = self.metrics[n].get(
+ 'instances_total', 0)
+ resources['per_node'][n]['instances_active'] = self.metrics[n].get(
+ 'instances_active', 0)
+
+ return resources